def gettext(path="hamlet.txt"):
    """Read the text file at *path*, lowercase it, and blank out punctuation.

    Parameters:
        path: file to read; defaults to "hamlet.txt" so existing
              no-argument callers keep working.

    Returns:
        The normalized text as one string, ready for str.split().
    """
    # "with" guarantees the handle is closed even if read() raises
    # (the original left the file open).
    with open(path, "r") as f:
        txt = f.read()
    txt = txt.lower()  # fold case so "The" and "the" count as one word
    # Replace each special character with a space so split() separates words.
    for ch in '|"#$%&^()*+,-./:;<>=?@\\_『{}~':
        txt = txt.replace(ch, " ")
    return txt
# --- Top-10 word frequencies for hamlet.txt --------------------------------
hamlettext = gettext()
words = hamlettext.split()  # one flat list of words

counts = {}  # word -> occurrence count (fix: dict literal lost in extraction)
for word in words:
    counts[word] = counts.get(word, 0) + 1

# Sort (word, count) pairs by count, most frequent first.
# (fix: "true" is not a Python name; the constant is True)
items = list(counts.items())
items.sort(key=lambda x: x[1], reverse=True)

# Print the 10 most frequent words: left-aligned word, right-aligned count.
# (fix: the original format string was empty and printed blank lines;
#  slicing with [:10] also avoids an IndexError on texts with <10 words)
for word, count in items[:10]:
    print("{0:<10}{1:>5}".format(word, count))
import jieba
# --- Top-15 word frequencies for threekingdoms.txt -------------------------
# "with" closes the file handle the original leaked.
with open("threekingdoms.txt", "r", encoding="utf-8") as f:
    txt = f.read()
words = jieba.lcut(txt)  # jieba segments Chinese text into a word list

counts = {}  # word -> occurrence count (fix: dict literal lost in extraction)
for word in words:
    if len(word) == 1:  # skip single characters (mostly particles/punctuation)
        continue
    counts[word] = counts.get(word, 0) + 1

# Sort by frequency, descending. (fix: "true" -> True)
items = list(counts.items())
items.sort(key=lambda x: x[1], reverse=True)

# Print the 15 most frequent words.
# (fix: empty format string printed nothing; [:15] tolerates short inputs)
for word, count in items[:15]:
    print("{0:<10}{1:>5}".format(word, count))
import jieba
# --- Top-15 character-name frequencies for threekingdoms.txt ---------------
with open("threekingdoms.txt", "r", encoding="utf-8") as f:
    txt = f.read()

# Non-name words to remove from the final tally.
# NOTE(review): the original set literal was lost in extraction; populate it
# with noise words (e.g. "將軍", "卻說", ...) as they show up in the output.
excludes = set()

words = jieba.lcut(txt)  # jieba segments Chinese text into a word list

counts = {}  # canonical name -> occurrence count
for word in words:
    if len(word) == 1:
        continue  # single characters are almost never names
    # Merge aliases of the same person under one canonical name.
    elif word == "諸葛亮" or word == "孔明曰":
        rword = "孔明"
    elif word == "關公" or word == "雲長":
        rword = "關羽"
    elif word == "玄德" or word == "玄德曰":
        rword = "劉備"
    else:
        rword = word  # fix: original typo "reword" left rword unbound here
    # fix: original read counts.get(word, 0), which reset the tally every
    # time an alias was merged; the lookup must use the canonical rword.
    counts[rword] = counts.get(rword, 0) + 1

# Drop excluded words; pop() avoids a KeyError when a word never occurred,
# where the original "del counts[word]" would crash.
for word in excludes:
    counts.pop(word, None)

# Sort by frequency, descending. (fix: "true" -> True)
items = list(counts.items())
items.sort(key=lambda x: x[1], reverse=True)

# Print the 15 most frequent names.
# (fix: empty format string printed nothing; [:15] tolerates short inputs)
for word, count in items[:15]:
    print("{0:<10}{1:>5}".format(word, count))
Python進行詞頻統計
1.測試文字 test.txt 2.測試文字內容 this is just for test 這只是用來測試的 this is just for test 這只是用來測試的 3.程式碼及解釋如下 import jieba def doc2matrix doc x open doc,r y x.read 讀...
Python進行詞頻統計
基礎python統計詞頻,未考慮到刪除停用詞 詞頻統計 defgettext 處理檔案 txt open english.txt r read txt txt.lower 將英文全部變為小寫 for ch in txt txt.replace ch,return txt mytxt gettext ...
用python做詞頻統計
假設有乙個本地的txt檔案,想對其進行詞頻統計,可以這樣寫 import time path c users zhangxiaomei desktop walden.txt with open path,r as text words text.read split print words forw...