Generating a word cloud with jieba word segmentation


import sqlite3

import pandas as pd
import numpy as np
import jieba
from collections import Counter
from pyecharts import WordCloud  # pyecharts 0.5.x API


class MyWordCloud:
    '''Custom word-cloud builder based on jieba segmentation.'''

    def __init__(self, db_path, table_name, stop_word_file_path, field_name):
        self.sqlite3_db = sqlite3.connect(db_path)  # connect to sqlite3
        sql = 'select * from {}'.format(table_name)
        self.content_from_db = pd.read_sql(sql, self.sqlite3_db)  # load the data from sqlite3
        self.stop_words = []  # stop words, i.e. words we do not want, e.g. 的, 得, 你, 我, 他
        self.stop_word_path = stop_word_file_path  # path of the stop-word file
        self.field_filter = field_name  # field (column) to segment
        self.user_dic = jieba.load_userdict('user_dic.txt')  # custom dictionary of words jieba must not split

    def stop_word_file(self):
        '''Load the stop-word list, mainly for Chinese text.

        :return:
        '''
        if self.stop_word_path:
            with open(self.stop_word_path, encoding='utf-8') as f:
                self.stop_words = [line.strip() for line in f if line.strip()]
        else:
            print('No stop-word file was provided; add one, or enable filter_more_bol=True')

    def mk_word_list(self, record_count=None, filter_more_bol=True):
        '''Build the word list needed for the word cloud.

        :param record_count: number of sqlite3 records to segment; None means all of them
        :param filter_more_bol: apply the extra, stricter filtering of unwanted words
        :return:
        '''
        word_list = []
        if record_count is not None:
            content_need = self.content_from_db[self.field_filter].head(record_count)
        else:
            content_need = self.content_from_db[self.field_filter]
        for record_one in content_need:
            words = jieba.cut(record_one)
            if filter_more_bol:
                words = self.filter_more(words)
            for word in words:
                if word not in self.stop_words:
                    word_list.append(word)
        return word_list

    def filter_more(self, words):
        '''Keep only Chinese and English words and drop single characters.

        :param words: the sequence of words produced by jieba
        :return:
        '''
        word_list = []
        for word in words:
            if word.isalpha() and len(word) > 1:
                word_list.append(word)
        return word_list

    def words_cloud(self, words_list):
        '''Render the word-cloud file.

        :param words_list: the word list for the word cloud
        :return:
        '''
        content = pd.Series(words_list).value_counts()
        words_show = content.index
        words_count = content.values
        wd = WordCloud(width=1300, height=620)
        wd.add('', words_show, words_count, word_size_range=(20, 100))
        wd.render('wordcloud.html')

    def start(self, record_count=10):
        '''Build the word-cloud file in one call; the individual steps can also be called one by one.

        :param record_count: number of sqlite3 records to use, 10 by default
        :return:
        '''
        self.stop_word_file()
        word_list = self.mk_word_list(record_count=record_count)
        self.words_cloud(word_list)


if __name__ == '__main__':
    wordcloud = MyWordCloud('recruit.db', 'recruit', 'stopword.txt', 'job_detail')
    wordcloud.start()
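One caveat: the `from pyecharts import WordCloud` import and the `add(name, words, counts, word_size_range=...)` call above follow the pyecharts 0.5.x API. If pyecharts 1.x or newer is installed, the chart classes live under `pyecharts.charts` and `add()` expects a list of (word, count) pairs instead. A rough sketch of the equivalent rendering step under that assumption (the helper name `words_cloud_v1` is made up here):

from collections import Counter
from pyecharts.charts import WordCloud  # pyecharts 1.x+ package layout (assumed installed)

def words_cloud_v1(words_list):
    # Count word frequencies and hand (word, count) pairs to the 1.x add() signature.
    data_pair = list(Counter(words_list).items())
    wd = WordCloud()
    wd.add('', data_pair, word_size_range=[20, 100])
    wd.render('wordcloud.html')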

The three files required (a sketch for preparing test versions of them follows below):

recruit.db: the sqlite3 database file that supplies the data

stopword.txt: the stop-word file; write every character or phrase you want filtered out into this file, one entry per line

user_dic.txt: a custom dictionary of characters or phrases that jieba should not split
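For a quick test the three input files can be prepared by hand. A minimal sketch, in which the recruit table, the job_detail column and the sample rows are hypothetical stand-ins for real scraped data:

import sqlite3

# Build a tiny recruit.db with a recruit table and a job_detail column (hypothetical sample rows).
conn = sqlite3.connect('recruit.db')
conn.execute('create table if not exists recruit (job_detail text)')
conn.executemany(
    'insert into recruit (job_detail) values (?)',
    [('熟悉Python和SQL, 負責資料分析與報表',),
     ('負責爬蟲開發, 熟悉jieba分詞與詞云展示',)],
)
conn.commit()
conn.close()

# stopword.txt: one unwanted word or phrase per line.
with open('stopword.txt', 'w', encoding='utf-8') as f:
    f.write('的\n了\n負責\n')

# user_dic.txt: one entry per line, words jieba should keep whole.
with open('user_dic.txt', 'w', encoding='utf-8') as f:
    f.write('資料分析\n詞云\n')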
