import sqlite3
from collections import Counter

import jieba
import numpy as np
import pandas as pd
from pyecharts import WordCloud
class mywordcloud:
    """Build a word cloud from text stored in an SQLite database.

    Rows are read from *table_name* inside the SQLite file *db_path*; the
    text in column *field_name* is segmented with jieba, stop words are
    removed, and the result is rendered to ``wordcloud.html`` with pyecharts.
    """

    def __init__(self, db_path, table_name, stop_word_file_path, field_name):
        """Connect to SQLite and load the rows to analyse.

        :param db_path: path of the SQLite database file
        :param table_name: table whose rows supply the text
        :param stop_word_file_path: path of the stop-word file, one entry per line
        :param field_name: name of the column containing the text to segment
        """
        self.sqlit3_db = sqlite3.connect(db_path)  # SQLite connection
        sql = 'select * from {}'.format(table_name)
        self.content_from_db = pd.read_sql(sql, self.sqlit3_db)  # table -> DataFrame
        # Stop words (particles, pronouns, ...) to drop from the result;
        # populated by stop_word_file(). A set gives O(1) membership tests.
        self.stop_words = set()
        self.stop_word_path = stop_word_file_path  # stop-word file path
        self.field_filter = field_name  # column to analyse
        # Register a user dictionary of terms jieba must not split apart.
        # load_userdict() returns None; the attribute merely records the call.
        self.user_dic = jieba.load_userdict('user_dic.txt')

    def stop_word_file(self):
        """Load the stop-word file (one entry per line) into ``self.stop_words``.

        Prints a warning instead when no stop-word file path was given.
        """
        if self.stop_word_path:
            with open(self.stop_word_path, encoding='utf-8') as f:
                self.stop_words = {line.strip() for line in f if line.strip()}
        else:
            print('未新增詞云停止詞庫,請新增停止詞庫檔案或者開啟filter_more為true')

    def mk_word_list(self, record_count=None, filter_more_bol=True):
        """Segment the selected column into a flat list of words.

        :param record_count: number of database rows to analyse (None = all)
        :param filter_more_bol: additionally drop non-alphabetic and
            single-character tokens via :meth:`filter_more`
        :return: list of segmented words with stop words removed
        """
        word_list = []
        if record_count is not None:
            content_need = self.content_from_db[self.field_filter].head(record_count)
        else:
            content_need = self.content_from_db[self.field_filter]
        for record_one in content_need:
            words = jieba.cut(record_one)
            if filter_more_bol:
                words = self.filter_more(words)
            for word in words:
                if word not in self.stop_words:
                    word_list.append(word)
        return word_list

    def filter_more(self, words):
        """Keep only alphabetic (Chinese or English) tokens longer than one char.

        :param words: iterable of tokens produced by jieba
        :return: filtered list of tokens
        """
        return [word for word in words if word.isalpha() and len(word) > 1]

    def words_cloud(self, words_list):
        """Count word frequencies and render them to ``wordcloud.html``.

        :param words_list: words the word cloud is built from
        """
        content = pd.Series(words_list).value_counts()
        words_show = content.index
        words_count = content.values
        wd = WordCloud(width=1300, height=620)
        wd.add('', words_show, words_count, word_size_range=(20, 100))
        wd.render('wordcloud.html')

    def start(self, record_count=10):
        """Convenience entry point: load stop words, segment, render.

        :param record_count: number of database rows to analyse (default 10)
        """
        self.stop_word_file()
        word_list = self.mk_word_list(record_count=record_count)
        self.words_cloud(word_list)
if __name__ == '__main__':
    # Build a word cloud from the 'job_detail' column of the 'recruit' table.
    # Local variable renamed: the original name `wordcloud` shadowed the
    # pyecharts import of the same name.
    cloud = mywordcloud('recruit.db', 'recruit', 'stopword.txt', 'job_detail')
    cloud.start()
# Three files are required alongside this script:
#   recruit.db   - SQLite database file providing the data
#   stopword.txt - stop words to filter out, one character/word per line
#   user_dic.txt - user dictionary of terms jieba must not split apart
# NOTE(review): the lines below were trailing scrape residue (truncated
# snippets of unrelated "Python word cloud" articles) that would be a syntax
# error in a .py file; preserved here as a comment for reference:
# - matplotlib + wordcloud + jieba example (truncated)
# - wordcloud + matplotlib + PIL mask example (truncated)
# - notes on WordCloud parameters: font_path, width/height, prefer_horizontal,
#   mask, scale, min_font_size (truncated)