#!/usr/bin/python
#encoding=gbk
import sys
# Longest dictionary word length, counted in decoded characters. Starts at 5
# and is raised by initdct() whenever a longer word is loaded; the matching
# functions use it as the largest window they try.
dictmaxlength = 5
# Segmentation dictionary: word (first tab-separated column of a dictionary
# line) -> the whole stripped line it came from.
dctdict = {}
# Codec used to decode input text and to encode candidate words for lookup.
encoding='gbk'
# Initialize the dictionary and the maximum word length.
def initdct(dct):
    """Load the segmentation dictionary from the file at path ``dct``.

    Each non-blank line holds a word, optionally followed by a tab and
    further columns. The word (first column) becomes a key of the
    module-level ``dctdict`` and maps to the whole stripped line.
    ``dictmaxlength`` is raised to the character length of the longest word
    seen; it is never lowered.

    The file is read as bytes, so keys and values are byte strings in the
    module-level ``encoding``.

    Raises:
        IOError/OSError: if ``dct`` cannot be opened.
        UnicodeDecodeError: if a word is not valid in ``encoding``.
    """
    global dictmaxlength
    # Binary mode yields byte strings on Python 2 and Python 3 alike, and
    # "with" closes the file even when decoding a malformed word raises.
    with open(dct, "rb") as dctobj:
        for line in dctobj:
            line = line.strip()
            if not line:
                # A blank line would otherwise register "" as a word.
                continue
            word = line.split(b"\t")[0].strip()
            dctdict[word] = line
            # Length is measured in characters, not bytes.
            dictmaxlength = max(dictmaxlength, len(word.decode(encoding)))
# Forward maximum matching algorithm.
def maximunmathching(sent):
    """Segment ``sent`` by forward (left-to-right) maximum matching.

    At each position the longest window of at most ``dictmaxlength``
    characters that is a key of ``dctdict`` is taken as a word. When no
    multi-character window matches, a single character is emitted whether or
    not it is in the dictionary. A single space character is consumed but
    not emitted.

    Args:
        sent: byte string encoded with the module-level ``encoding``.

    Returns:
        The words, each encoded with ``encoding``, joined by single spaces.
    """
    text = sent.strip().decode(encoding)
    total = len(text)
    space = b" "
    words = []
    start = 0
    while start < total:
        # Never try a window longer than the remaining text; that would only
        # repeat the same lookup.
        longest = min(dictmaxlength, total - start)
        for size in range(longest, 1, -1):
            candidate = text[start:start + size].encode(encoding)
            if candidate in dctdict:
                break
        else:
            # No multi-character word starts here: fall back to one char.
            size = 1
            candidate = text[start:start + 1].encode(encoding)
        start += size
        if candidate != space:
            words.append(candidate)
    return space.join(words).strip()
# Reverse maximum matching algorithm.
def reversemaximunmathching(sent):
    """Segment ``sent`` by reverse (right-to-left) maximum matching.

    Starting from the end of the text, the longest window of at most
    ``dictmaxlength`` characters that is a key of ``dctdict`` is taken as a
    word. When no multi-character window matches, a single character is
    emitted whether or not it is in the dictionary. A single space character
    is consumed but not emitted.

    Args:
        sent: byte string encoded with the module-level ``encoding``.

    Returns:
        The words in reading order, each encoded with ``encoding``, joined by
        single spaces.
    """
    text = sent.strip().decode(encoding)
    space = b" "
    words = []
    end = len(text)
    # Stop once everything is consumed; looping at end == 0 would add an
    # empty token.
    while end > 0:
        # Never try a window longer than the text left before ``end``.
        longest = min(dictmaxlength, end)
        for size in range(longest, 1, -1):
            candidate = text[end - size:end].encode(encoding)
            if candidate in dctdict:
                break
        else:
            # No multi-character word ends here: fall back to one char.
            size = 1
            candidate = text[end - 1:end].encode(encoding)
        end -= size
        if candidate != space:
            words.append(candidate)
    # Words were collected back-to-front.
    words.reverse()
    return space.join(words)
# Fewer out-of-vocabulary words, fewer single-character words and fewer total words are better.
def segmenter(sent):
    """Segment ``sent`` with both matching directions and return one result.

    Runs forward and reverse maximum matching on ``sent``. If the two
    stripped results are identical that result is returned; otherwise the
    choice is delegated to ``bmmresult``.
    """
    forward = maximunmathching(sent).strip()
    backward = reversemaximunmathching(sent).strip()
    if forward != backward:
        # The directions disagree: let the heuristic pick one.
        return bmmresult(forward, backward)
    return forward
# Fewer out-of-vocabulary words, fewer single-character words and fewer total words are better.
def _segmentation_stats(words):
    """Return (out-of-dictionary count, single-character count, word count)."""
    oov = sum(1 for word in words if word not in dctdict)
    singles = sum(1 for word in words if len(word.decode(encoding)) == 1)
    return oov, singles, len(words)


def bmmresult(mm, rmm):
    """Pick the better of a forward and a reverse segmentation.

    Both segmentations are scored on three counts, where lower is better:
    words missing from ``dctdict``, single-character words, and total words.
    The segmentation that is strictly lower on more of the three counts
    wins; on a tie the reverse result is returned.

    Args:
        mm: forward result, space-separated byte string in ``encoding``.
        rmm: reverse result, same format.

    Returns:
        ``mm`` or ``rmm`` unchanged.
    """
    mm_stats = _segmentation_stats(mm.split(b" "))
    rmm_stats = _segmentation_stats(rmm.split(b" "))
    pairs = list(zip(mm_stats, rmm_stats))
    mm_wins = sum(1 for ours, theirs in pairs if ours < theirs)
    rmm_wins = sum(1 for ours, theirs in pairs if ours > theirs)
    # Compare the two tallies against each other; comparing a tally with
    # itself is always false and would make the forward result unreachable.
    if mm_wins > rmm_wins:
        return mm
    return rmm
def handlefile(input, output):
    """Segment every line of the file ``input`` and write results to ``output``.

    Each input line is stripped, its ASCII letters are lower-cased, and the
    line is passed to ``segmenter``; the stripped result is written followed
    by a newline. The running line number is printed every 100000 lines.

    Both files are opened in binary mode, so lines stay byte strings in the
    module-level ``encoding`` and newlines are written as ``\\n`` on every
    platform.

    Args:
        input: path of the text file to segment.
        output: path of the file to create or overwrite.
    """
    # Lower-case on decoded text, not on raw bytes: in a multi-byte encoding
    # such as GBK a trail byte can fall in the range of ASCII "A"-"Z", and
    # lower-casing bytes would turn it into a different character. Only
    # ASCII letters are mapped, so every result can still be re-encoded.
    ascii_lower = dict(
        (code, code + 32) for code in range(ord("A"), ord("Z") + 1)
    )
    with open(input, "rb") as inputobj, open(output, "wb") as outputobj:
        for index, line in enumerate(inputobj, 1):
            if index % 100000 == 0:
                print(str(index) + "\r")
            text = line.strip().decode(encoding).translate(ascii_lower)
            seg = segmenter(text.encode(encoding))
            outputobj.write(seg.strip() + b"\n")
if __name__ == '__main__':
    # Command line: <script> dict[in] infile[in] outfile[out]
    if len(sys.argv) != 4:
        print("usage %s dict[in] infile[in] outfile[out]." % sys.argv[0])
        sys.exit(-1)
    dict_path, in_path, out_path = sys.argv[1:4]
    # Load the dictionary first; segmentation reads the globals it fills.
    initdct(dict_path)
    handlefile(in_path, out_path)
基於規則的雙向最大匹配演算法的分詞
雙向最大匹配演算法(bi-direction matching method)是將最大匹配法得到的分詞結果和逆向最大匹配法得到的結果通過雙向最大匹配演算法的規則進行篩選而得到。coding utf 8 project exuding nlp all author texuding time 20...
最大匹配演算法
最大匹配法是最簡單的分詞方法,他完全使用詞典進行分詞,如果詞典好,則分詞的效果好 正向,即從左往右進行匹配 maximum match method 最大匹配法 class mm def init self self.window size 4 def cut self,text result in...
Python 最大逆向匹配演算法
第三次重新寫這個演算法,每次寫都有新的體會。這次最大的感受是把訪問資料夾的包都熟悉了一下,os和shutil。後者用來刪除整個檔案,這種破壞力還是慎用吧。def mk new dir filename 新建乙個資料夾,如果存在,則刪除並重建。if os.path.exists filename is...