python 詞頻統計中英文

# calhamletv1.py - word-frequency count for an English text
def gettext(path="hamlet.txt", encoding=None):
    """Return the contents of a text file, lower-cased and with punctuation blanked.

    Every punctuation character is replaced by a single space (not removed),
    so the returned string has the same length as the file's text and can be
    tokenised with a plain ``str.split()``.

    Args:
        path: File to read. Defaults to ``"hamlet.txt"``, the file the
            original script always read.
        encoding: Text encoding passed to ``open``. ``None`` keeps the
            platform default, as the original call did.

    Returns:
        The normalised text as one string.

    Raises:
        OSError: If the file cannot be opened or read.
    """
    # NOTE(review): the original literal contained a stray "『" where the
    # backtick and "{|}" characters appear to have been lost; the ASCII
    # characters are restored here and "『" is kept so nothing that used to
    # be blanked is now let through.
    punctuation = '!"#$%&()*+,-./:;<=>?@[\\]^_`{|}~『'
    # One translate() pass replaces every listed character at once.
    blank_out = str.maketrans(punctuation, " " * len(punctuation))
    # The context manager closes the handle even if read() raises.
    with open(path, "r", encoding=encoding) as source:
        txt = source.read()
    return txt.lower().translate(blank_out)
# Count how often each word occurs in the normalised Hamlet text and print
# the ten most frequent words.
hamlettxt = gettext()
# split() with no argument splits on runs of whitespace, so the blanks that
# replaced punctuation never produce empty tokens.
words = hamlettxt.split()
counts = {}  # word -> number of occurrences
for word in words:
    # get() supplies 0 the first time a word is seen.
    counts[word] = counts.get(word, 0) + 1
# Order the (word, count) pairs by count, largest first.
items = sorted(counts.items(), key=lambda pair: pair[1], reverse=True)
# Slicing instead of indexing range(10) avoids an IndexError when the text
# has fewer than ten distinct words.
for word, count in items[:10]:
    # NOTE(review): the format string was empty in the original listing, so
    # nothing would have been printed; this column layout (word left-aligned
    # in 10 columns, count right-aligned in 5) is a reconstruction - confirm.
    print("{0:<10}{1:>5}".format(word, count))

#calthreekingdomsv1.py
import jieba
# Count how often each multi-character token occurs in threekingdoms.txt and
# print the fifteen most frequent ones.
# The context manager closes the file handle once the text has been read.
with open("threekingdoms.txt", "r", encoding='utf-8') as source:
    txt = source.read()
words = jieba.lcut(txt)  # segment the text into a list of tokens
counts = {}  # token -> number of occurrences
for word in words:
    if len(word) == 1:
        # Single-character tokens are skipped.
        continue
    counts[word] = counts.get(word, 0) + 1
# Order the (token, count) pairs by count, largest first.
items = sorted(counts.items(), key=lambda pair: pair[1], reverse=True)
# Slicing instead of indexing range(15) avoids an IndexError when fewer than
# fifteen distinct tokens were found.
for word, count in items[:15]:
    # NOTE(review): the format string was empty in the original listing, so
    # nothing would have been printed; this column layout is a
    # reconstruction - confirm.
    print("{0:<10}{1:>5}".format(word, count))

#calthreekingdomsv2.py
import jieba    #匯入jieba第三方庫（需要提前用在cmd中用pip install jieba命令安裝）
# Count character-name mentions in threekingdoms.txt: merge aliases of the
# same person, drop frequent non-name words, and print the top ten.

# Frequent tokens that are not personal names and should be left out.
# NOTE(review): the right-hand side of this assignment was missing from the
# original listing (a syntax error); this set is a reconstruction - confirm
# the intended words, and confirm that every literal in this script uses the
# same character set as the text file.
excludes = {"將軍", "卻說", "荊州", "二人", "不可", "不能", "如此"}

# Alternative tokens that refer to one person, mapped to a canonical name.
aliases = {
    "諸葛亮": "孔明",
    "孔明曰": "孔明",
    "關公": "關羽",
    "雲長": "關羽",
    "玄德": "劉備",
    "玄德曰": "劉備",
    "孟德": "曹操",
    "丞相": "曹操",
}

# The context manager closes the file handle once the text has been read.
with open("threekingdoms.txt", "r", encoding='utf-8') as source:
    txt = source.read()
words = jieba.lcut(txt)  # segment the text into a list of tokens
counts = {}  # canonical name / token -> number of occurrences
for word in words:
    if len(word) == 1:
        # A single character cannot be a name here; skip it.
        continue
    # Fall back to the token itself when it has no alias entry.
    rword = aliases.get(word, word)
    counts[rword] = counts.get(rword, 0) + 1
for word in excludes:
    # pop() with a default does not raise when an excluded word never
    # occurred in the text, unlike `del counts[word]`.
    counts.pop(word, None)
# Order the (name, count) pairs by count, largest first.
items = sorted(counts.items(), key=lambda pair: pair[1], reverse=True)
# Slicing instead of indexing range(10) avoids an IndexError when fewer than
# ten distinct tokens remain.
for word, count in items[:10]:
    # NOTE(review): the format string was empty in the original listing, so
    # nothing would have been printed; this column layout is a
    # reconstruction - confirm.
    print("{0:<10}{1:>5}".format(word, count))

Python 文字詞頻統計中英文

統計一段英文中出現次數最多的幾個單詞 def get text text open eng.txt r read text text.lower 所有單詞都替換成小寫 for ch in 去噪，歸一化處理，把所有特殊符號替換為空格 text text.replace ch,return text ...

python 中英文分離中英文分離

由於沒有安裝 numpy，根據部落格提示，成功安裝了 numpy。執行之後沒有錯誤，可是嘛，我看不到結果，也就隨它去了。主要有兩個問題，一個是執行的時候出現的 ValueError: need more than 0 values to unpack，對於空行就會報錯。不機智。於是加了個判斷，讓它一直走...

python中的中英文字元統計

英語字元和中文字元的區別在於大小寫字元和字元個數中文中是乙個詞語統計英語字元 def gettext txt open halmet.txt r read txt txt.lower 文中所有英語小寫 for ch in txt txt.replace ch,return txt halmet...

python 詞頻統計 中英文

Python 文字詞頻統計中英文

python 中英文 分離 中英文分離

python中的中英文字元統計

相關推薦

python 詞頻統計中英文

python 中英文分離中英文分離