终于该好了,该字典树能够实现对于数据的模糊匹配。代码如下:
#!/usr/bin/env python
#-*- coding: UTF-8 -*-
class Node:
def __init__(self):
self.map = {}
self.indexnum = 0
self.endflag = False
def contain(self,key):
return self.map.__contains__(key)
def __getitem__(self,key):
return self.map[key]
def __setitem__(self,key,value):
self.map[key] =value
class TrieTree:
def __init__(self):
self.subNum = 0
self.indexNum = 0
self.subNode = Node()
def add(self,key,trieTree):
self.subNum += 1
self.subNode[key] = trieTree
def __chinese(self,char):
char = unicode(char,"utf8")
buf = []
for word in char:
if word >= u'\u4e00'and word <= u'\u9fa5':
buf.append(word.encode('utf-8'))
elif word == '\n':
#在这里对索引进行标记
self.indexNum+=1
return buf
def search(self,buf):
buf = self.__chinese(buf)
current = self
for i in range(len(buf)):
#转化成每个汉字
#print buf[i]
if current.subNode.contain(buf[i]):
if current.subNode[buf[i]].subNode.endflag == True:
return current.subNode.contain(buf[i])
else:
current = current.subNode[buf[i]]
else:
#如果没有匹配上,直接进入下一层
continue
def load(self,filename):
try:
sock = open(filename,'r')
buf = sock.read().split('\n')
sock.close()
except IOError:
return None
#读取每个词语
for i in range(len(buf)):
buftmp = self.__chinese(buf[i])
tree = self
#读取每个汉字
current = tree
for j in range(len(buftmp)):
if current.subNode.contain(buftmp[j]):
current = current.subNode[buftmp[j]]
else:
sub = TrieTree()
current.add(buftmp[j],sub)
current = sub
if j == len(buftmp) - 1:
current.subNode.endflag = True
current.subNode.indexnum = self.indexNum
if __name__=='__main__':
s = TrieTree()
s.load('citynames')
print s.search('我你')
# s.printSelf()