)#降噪,避免大小寫的干擾
#用空格替換特殊符號
for ch in
'!"#$%&()*+,-./:;<=>?@[\\]^_『,《》~』'
: txt = txt.replace(ch,
" ")
return txt
# Word-frequency count for Hamlet: fetch the normalised text, split it on
# whitespace and tally how many times each word occurs.
hamletxt = gettext()
words = hamletxt.split()
counts = {}
for word in words:
    # dict.get supplies 0 the first time a word is seen.
    counts[word] = counts.get(word, 0) + 1
items = list(counts.items())
# list.sort orders in place; reverse=True puts the largest counts first.
# The key picks the second element of each (word, count) pair.
items.sort(key=lambda x: x[1], reverse=True)
("三國演義.txt"
增加排除詞庫,並將同一人物的別名統一為同一個名字。
注意:排除詞庫需要反覆執行程式、檢視結果後逐步補充,此處列出的並不完整。
import jieba
# Count how often each character name appears in the novel and print the
# 15 most frequent names.

# Read the whole novel; the context manager closes the file handle.
with open("三國演義.txt", "r", encoding="gb18030") as f:
    txt = f.read()

# Exclusion list: frequent tokens that are not character names.
# NOTE(review): the original entries are missing from this copy of the
# script; refill the set by running the program, inspecting the output and
# adding the non-name words that show up.
excludes = set()

# Alternative names and "<name> said" tokens, mapped to one canonical name
# so that all mentions of a character are counted together.
aliases = {
    "諸葛亮": "孔明",
    "孔明曰": "孔明",
    "關公": "關羽",
    "雲長": "關羽",
    "玄德": "劉備",
    "玄德曰": "劉備",
    "孟德": "曹操",
    "丞相": "曹操",
}

words = jieba.lcut(txt)
counts = {}
for word in words:
    # Single-character tokens are skipped entirely.
    if len(word) == 1:
        continue
    rword = aliases.get(word, word)
    counts[rword] = counts.get(rword, 0) + 1

# pop() with a default tolerates excluded words that never occurred.
for word in excludes:
    counts.pop(word, None)

items = list(counts.items())
# Sort by count (second element of each pair), largest first.
items.sort(key=lambda x: x[1], reverse=True)

# Slicing avoids an IndexError when fewer than 15 distinct names remain.
for word, count in items[:15]:
    # Name left-aligned in 10 columns, count right-aligned in 5.
    print("{0:<10}{1:>5}".format(word, count))
通過不斷補充排除詞庫並重新執行程式,可得到《三國演義》人物出場次數排名前20(上面的程式只輸出前15名,如需前20名請將輸出數量改為20)。
Python例項 文字詞頻統計
最近在mooc跟著北京理工大學的嵩天老師學習python 受益匪淺,老師所講的通俗易懂,推薦給大家。在此記點筆記和注釋,備忘。今天所記得是文字詞頻統計 hamlet文字詞頻統計。直接上源 calhamletv1.py def gettext txt open e hamlet.txt r read ...
Python 文字詞頻統計
hamlettxt gettext words hemlettxt.split counts for word in words counts word counts.get word,0 1這是一段遍歷hamlet.txt檔案的一段 s.split 函式返回的是列表list 我有一些困惑 1.最後...
python詞頻統計例項
詞頻統計 import jieba 分詞庫包 import snownlp 情感分析 words 非常時尚鞋子,非常非常非常時尚的一款鞋子,設計好看,設計設計做活動買的,超超超超超超超超超划算。滿意。設計好看!words list list jieba.cut words words frequen...