gensim 是一個通過衡量片語(或更高階結構,如整句或文件)模式來挖掘文件語義結構的工具。
from gensim import corpora
import jieba
# Sample corpus fed to the segmentation / bag-of-words example.
# NOTE(review): this listing was cut off after the first sentence (the sample
# output printed further down contains two documents); the missing entries
# must be restored by hand. The literal is closed here so the snippet parses.
documents = ['工業網際網路平台的核心技術是什麼']
def word_cut(doc):
    """Segment every string in *doc* into a list of tokens.

    Args:
        doc: Iterable of raw text strings, one per document.

    Returns:
        A list with one token list per input string, as produced by
        ``jieba.lcut``. An empty input yields an empty list.
    """
    return [jieba.lcut(sentence) for sentence in doc]
# Tokenize every sample document.
texts = word_cut(documents)
# Map each distinct token to an integer id. The class is `Dictionary`;
# lowercase `corpora.dictionary` is the module that defines it and is not
# callable.
dictionary = corpora.Dictionary(texts)
# Turn each token list into a sparse bag-of-words: (token_id, count) pairs.
bow_corpus = [dictionary.doc2bow(text) for text in texts]
每個元組的第一項對應詞典中符號的 id,第二項對應該符號出現的次數。輸出如下:
[[(0, 1), (1, 1), (2, 1), (3, 1), (4, 1), (5, 1), (6, 1)],
[(2, 1), (7, 1), (8, 1), (9, 1), (10, 1), (11, 1), (12, 1), (13, 1)]]
分詞工具
from gensim import models
# Train the TF-IDF model on the bag-of-words corpus. The class is
# `TfidfModel`; lowercase `models.tfidfmodel` is the module that defines it
# and is not callable.
tfidf = models.TfidfModel(bow_corpus)
原始資料會存在一些【空或重複的語句】,須過濾掉這些【無價值且影響效率】的語句。使用計算機自動地對中文文字進行詞語切分的過程稱為中文分詞(Chinese Word Segmentation),亦即讓中文句子中的詞與詞之間有空格標識。若要對一個句子進行分析,就需要將其切分成詞的序列,然後以詞為單位進行句子的分析,故中文分詞是中文自然語言處理中最基本的一個環節。
# 2. Part-of-speech categories filtered out after jieba segmentation:
# punctuation, conjunctions, particles, adverbs, prepositions, time
# morphemes, '的', numerals, locatives and pronouns.
def stopwordslist(filepath):
    """Load a stop-word list from a UTF-8 text file, one entry per line.

    Args:
        filepath: Path of the stop-word file.

    Returns:
        A list with every line of the file, stripped of surrounding
        whitespace. Blank lines are kept as empty strings.
    """
    # The context manager closes the handle, which the bare open() call
    # used previously never did.
    with open(filepath, 'r', encoding='utf8') as handle:
        return [line.strip() for line in handle]
# Part-of-speech flags whose tokens are discarded after segmentation.
stop_flag = [
    'x',   # punctuation
    'c',   # conjunction
    'u',   # particle
    'd',   # adverb
    'p',   # preposition
    't',   # time morpheme
    'uj',  # the particle '的'
    'm',   # numeral
    'f',   # locative
    'r',   # pronoun
]
def seg_sentence(sentence, stop_words):
    """Segment *sentence* with jieba and drop stop words.

    Args:
        sentence: Raw text; surrounding whitespace is stripped before
            segmentation.
        stop_words: Container of tokens to discard.

    Returns:
        The surviving tokens, in their original order, as a list of strings.
    """
    # Collect the tokens directly. The previous approach joined them with
    # ' ' and split the string again, which always left a trailing '' token
    # (and extra ones for whitespace tokens) in the result.
    return [
        word
        for word in jieba.cut(sentence.strip())
        # word.strip() rejects tab/space-only tokens.
        if word.strip() and word not in stop_words
    ]
# Tokenize the corpus file line by line; the context manager closes the
# handle once every line has been segmented.
with open(tpath, 'r', encoding='utf8') as corpus_file:
    texts = [seg_sentence(seg, stop_words) for seg in corpus_file]
# gensim's classes are CamelCase (Dictionary, TfidfModel,
# SparseMatrixSimilarity); the lowercase spellings are not callable classes.
dictionary = corpora.Dictionary(texts)
# Number of distinct tokens, i.e. the dimensionality of the sparse vectors.
feature_cnt = len(dictionary.token2id)
corpus = [dictionary.doc2bow(text) for text in texts]
tfidf = models.TfidfModel(corpus)
# Project the search keyword into the same bag-of-words space.
kw_vector = dictionary.doc2bow(seg_sentence(keyword, stop_words))
index = similarities.SparseMatrixSimilarity(tfidf[corpus], num_features=feature_cnt)
# One similarity score per corpus line.
sim = index[tfidf[kw_vector]]
Python 文字相似度計算
import jieba
import jieba.posseg as pseg
from gensim import corpora, models, similarities
def stopwordslist(filepath):
    """Load a stop-word list from a UTF-8 text file, one entry per line.

    Args:
        filepath: Path of the stop-word file.

    Returns:
        A list with every line of the file, stripped of surrounding
        whitespace. Blank lines are kept as empty strings.
    """
    # The context manager closes the handle, which the bare open() call
    # used previously never did.
    with open(filepath, 'r', encoding='utf8') as handle:
        return [line.strip() for line in handle]
def seg_sentence(sentence, stop_words):
    """Segment *sentence* with POS tagging and drop unwanted tokens.

    A token is discarded when it is listed in *stop_words* or when its
    part-of-speech flag is one of the filtered flags.

    Args:
        sentence: Raw text to segment.
        stop_words: Container of tokens to discard.

    Returns:
        The surviving tokens, in their original order, as a list of strings.
    """
    # 'm' (numerals) is deliberately absent from this set so that numbers
    # are kept; add it back to filter them out as well.
    stop_flag = {'x', 'c', 'u', 'd', 'p', 't', 'uj', 'f', 'r'}
    # The accumulator initialisation and the append were missing, leaving a
    # dangling `outstr =` and an empty `if`; build the token list directly.
    return [
        word
        for word, flag in pseg.cut(sentence)
        if word not in stop_words and flag not in stop_flag
    ]
# Script entry point: score every line of test.txt against a search keyword
# and print the similarity of each line.
# NOTE(review): the statements below have lost their indentation in this copy
# and the final `if` has no body, so the block looks truncated — confirm
# against the original script before running.
if __name__ == '__main__':
sppath = 'stopwords.txt'
tpath = 'test.txt'
stop_words = stopwordslist(sppath)
keyword = '吃雞'
# 1. Turn the text collection into a list of token lists (one per line).
texts = [seg_sentence(seg, stop_words) for seg in open(tpath, 'r', encoding='utf8').readlines()]
# Keep the raw lines alongside the tokenized ones.
orig_txt = [seg for seg in open(tpath, 'r', encoding='utf8').readlines()]
# 2. Build the dictionary from the tokenized documents and take its size as
#    the feature count.
# NOTE(review): gensim appears to expose this class as `corpora.Dictionary`;
# the lowercase name here is presumably an extraction artifact — confirm.
dictionary = corpora.dictionary(texts)
feature_cnt = len(dictionary.token2id.keys())
# 3. Using the dictionary, convert the token lists into sparse vectors,
#    i.e. the corpus.
corpus = [dictionary.doc2bow(text) for text in texts]
# 4. Process the corpus with the TF-IDF model.
# NOTE(review): presumably `models.TfidfModel` — confirm.
tfidf = models.tfidfmodel(corpus)
# 5. Likewise, convert the search keyword into a sparse vector with the same
#    dictionary.
kw_vector = dictionary.doc2bow(seg_sentence(keyword, stop_words))
# 6. Build a similarity index over the sparse vectors.
# NOTE(review): presumably `similarities.SparseMatrixSimilarity` — confirm.
index = similarities.sparsematrixsimilarity(tfidf[corpus], num_features=feature_cnt)
# 7. Compute the similarity of the keyword against every corpus line.
sim = index[tfidf[kw_vector]]
# NOTE(review): the right-hand side of this assignment is missing in this
# copy (likely an empty list) — confirm.
result_list =
for i in range(len(sim)):
print('keyword 與 text%d 相似度為:%.2f' % (i + 1, sim[i]))
# NOTE(review): the body of this threshold check is not visible here.
if sim[i] > 0.4:
