__author__ = 'wanghuafeng'
#coding:utf-8
import os
import sys
import codecs
from collections import deque
try:
    # Directory containing this module; used to locate the bundled
    # vocabulary file under ./data/.
    path = os.path.dirname(os.path.abspath(__file__))
except NameError:
    # __file__ is undefined in some embedded/interactive contexts
    # (e.g. exec, frozen apps) -- fall back to the working directory.
    # A bare ``except:`` here would also hide unrelated errors.
    path = os.getcwd()
class slicerbase(object):
    """Maximum-matching word segmenter backed by a base vocabulary file.

    NOTE(review): class name kept lowercase for backward compatibility
    with existing callers; PEP 8 would prefer ``SlicerBase``.
    """

    def __init__(self, options=None):
        """Load the base vocabulary.

        :param options: optional dict; ``options['vocab_file']`` overrides
            the default vocabulary path under ``<module dir>/data/``.
        """
        # ``None`` default instead of a shared mutable dict.
        if not options:
            options = {}
        self.options = options
        # ``dict.has_key()`` was removed in Python 3 -- test the value
        # directly instead.
        if not self.options.get('vocab_file'):
            self.options['vocab_file'] = os.path.join(
                path, 'data', 'cizu_and_singleword_komoxo95k.txt')
        self.total_base_word_set = self._load_base_wordlist()

    def _load_base_wordlist(self):
        """Return the set of base words (first TAB field of each line)."""
        with codecs.open(self.options['vocab_file'], encoding='utf-8') as f:
            # Stream line by line instead of readlines(); keep only the
            # word column of each ``word\tfreq`` row.
            return {line.split('\t')[0] for line in f}

    def to_unicode(self, sentence):
        """Strip *sentence* and return it as text.

        Bytes input is decoded as UTF-8 first, then GBK; if both fail a
        ValueError is raised.  (Python 3: only ``bytes`` has ``decode``.)
        """
        sentence = sentence.strip()
        if isinstance(sentence, bytes):
            try:
                sentence = sentence.decode('utf-8')
            except UnicodeDecodeError:
                try:
                    sentence = sentence.decode('gbk')
                except UnicodeDecodeError:
                    raise ValueError('unknown coding...')
        return sentence

    def cut_forward(self, complexwords):
        """Forward maximum matching (MM): scan left to right, always
        taking the longest vocabulary word; an unknown single character
        is emitted as-is.

        :param complexwords: sentence as text or bytes.
        :returns: list of segmented words in original order.
        """
        max_len = 8  # longest word length present in the base vocabulary
        complexwords = self.to_unicode(complexwords)
        total_len = len(complexwords)
        start = 0
        end = min(max_len, total_len)
        segments = []
        while end - start >= 1:
            candidate = complexwords[start:end]
            if candidate in self.total_base_word_set:
                # Longest match found: emit it and restart the window
                # immediately after the match.
                segments.append(candidate)
                start = end
                end = min(start + max_len, total_len)
                continue
            if end == start + 1:
                # Single character not in the vocabulary: emit it alone
                # and advance one position.
                segments.append(candidate)
                start += 1
                end = min(start + max_len, total_len)
                continue
            # No match at this length: shrink the window from the right.
            end -= 1
        return segments

    def cut_backwords(self, complexwords):
        """Backward maximum matching (RMM): scan right to left, always
        taking the longest vocabulary word; an unknown single character
        is emitted as-is.

        :param complexwords: sentence as text or bytes.
        :returns: ``collections.deque`` of words in original order
            (words are prepended with ``appendleft`` as the scan moves
            right to left).
        """
        max_len = 8  # longest word length present in the base vocabulary
        complexwords = self.to_unicode(complexwords)
        end = len(complexwords)
        # Clamp to 0: a negative slice start would wrap around the string
        # for inputs shorter than max_len.
        start = max(end - max_len, 0)
        segments = deque()
        while end - start >= 1:
            candidate = complexwords[start:end]
            if candidate in self.total_base_word_set:
                # Longest match: prepend it and move the window to the
                # text left of the match.
                segments.appendleft(candidate)
                end = start
                start = max(end - max_len, 0)
                continue
            if start + 1 == end:
                # Single character not in the vocabulary: emit it as-is.
                segments.appendleft(candidate)
                end -= 1
                start = max(end - max_len, 0)
                continue
            # No match at this length: shrink the window from the left.
            start += 1
        return segments
if __name__ == '__main__':
    # Demo: segment a sample sentence with backward maximum matching.
    # (Python 2 print statement replaced with the print() function;
    # guarded so importing this module has no side effects.)
    cp = slicerbase()
    print(' '.join(cp.cut_backwords('對雜湊索引表的演算法行封裝')))
輸出:
其中前列為:new_position ;後列為:point_position
4 12
5 12
6 12
7 12
8 12
9 12
10 12
2 10
3 10
4 10
5 10
6 10
7 10
8 10
9 10
1 9
2 9
3 9
4 9
5 9
6 9
7 9
0 7
1 7
2 7
3 7
4 7
5 7
0 5
1 5
2 5
3 5
0 3
1 3
0 1
對 雜湊 索引 表的 演算法 行 封裝
語料切詞演算法研究
b123y5 15元,一路順風 等。語音索引號 語音內容00 11。9910 十11百12 千13萬14 年15月16 日17天18 歡迎光臨 19一路平安 20此卡 21已過期 22無效 23有效 24有效期 37a38 b。61y 62z63京 64黑。96學97 警。101請繳費 102點 1...
python反素數演算法優化 合數
python中判斷乙個數是不是質數2020 12 18 16 57 27 首先說明,內容結合了其他人的程式設計想法 和其他學習平台學習的思路,本人只是將解法記錄下來方便自己之後查閱。由於本人學藝不精,如果有錯誤還請大家見諒並指出,謝謝。質數是除了1和它本身再無其他的因數,例如5。在數學上與質數相對的...
分詞演算法的python實現(正向最大匹配法)
正向最大匹配法又稱mm法,其基本思想是 假設分詞詞典中的最長詞由i個漢字字元組成,則 用被處理文件的當前字串中前i個字作為匹配 字段查詢詞典。若詞典中存在這樣乙個字詞,則 匹配成功,匹配字段作為乙個詞被切分出來,否則 匹配失敗。應將匹配欄位中的最後乙個字去掉,對剩下的字串重新進行匹配處理。如此進行下...