A function I wrote to generate word clouds.
This is the version without a custom mask image.
#coding:utf-8
__author__ = '英俊'
import warnings
warnings.filterwarnings("ignore")
# codecs provides an open() that lets you specify the file's encoding;
# text is decoded to unicode automatically on read
import codecs
# word segmentation package
import jieba
# data handling / statistics
import pandas as pd
# numerical computing package
import numpy
# visualization
import matplotlib.pyplot as plt
# render figures inline in the notebook
%matplotlib inline
import matplotlib
# set the figure size
matplotlib.rcParams['figure.figsize'] = (10.0, 5.0)
from wordcloud import WordCloud  # word cloud package
def createwordcloud(text_path):
    # e.g. "./data/entertainment_news.csv"
    df = pd.read_csv(text_path, encoding='utf-8')
    # drop empty rows
    df = df.dropna()
    # df.head()
    # turn the column into a list
    content = df.content.values.tolist()
    segment = []
    for line in content:
        try:
            # cut the line into a list of tokens
            segs = jieba.lcut(line)
            for seg in segs:
                # keep tokens longer than one character that are not line breaks
                if len(seg) > 1 and seg != '\r\n':
                    segment.append(seg)
        except:
            print(line)
            continue
    words_df = pd.DataFrame({'segment': segment})
    # words_df.head()
    stopwords = pd.read_csv("data/stopwords.txt", index_col=False, quoting=3, sep="\t", names=['stopword'], encoding='utf-8')  # quoting=3: no quoting at all
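    # quoting=3 corresponds to csv.QUOTE_NONE: quote characters in the file are
    # treated as ordinary text, which avoids parse errors in raw news data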
    # stopwords.head()
    # pick out the tokens that appear in the stopword list, then drop them
    words_df = words_df[~words_df.segment.isin(stopwords.stopword)]
    # words_df.head()
    # the tricky part: word-frequency counting
    words_stat = words_df.groupby('segment').agg(計數=pd.NamedAgg(column='segment', aggfunc='size')).reset_index().sort_values(by='計數', ascending=False)
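    # a shorter equivalent (an aside, not in the original code):
    # words_df['segment'].value_counts() yields the same counts, already sorted descending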
    # words_stat.head()
    # font_path: Chinese text needs an explicit font file; background_color sets the
    # background; max_font_size caps the largest word
    wordcloud = WordCloud(font_path="data/simhei.ttf", background_color="white", max_font_size=80)
    # build a dict: x[0] is the word itself, x[1] is its count
    word_frequence = {x[0]: x[1] for x in words_stat.values}
    # generate the word cloud
    wordcloud = wordcloud.fit_words(word_frequence)
    plt.imshow(wordcloud)
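    # when running as a plain script (without %matplotlib inline), call plt.show() here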
Call the function:
createwordcloud("./data/entertainment_news.csv")
The generated word cloud is displayed inline.
Next, the version with a custom mask image added.
from scipy.misc import imread  # note: removed in SciPy 1.2+; imageio.imread is a drop-in replacement
# make the word cloud figure larger
matplotlib.rcParams['figure.figsize'] = (15.0, 15.0)
from wordcloud import WordCloud, ImageColorGenerator
def createsuperwordcloud(text_path, image_path):
    # e.g. "./data/entertainment_news.csv"
    df = pd.read_csv(text_path, encoding='utf-8')
    # drop empty rows
    df = df.dropna()
    # df.head()
    # turn the column into a list
    content = df.content.values.tolist()
    segment = []
    for line in content:
        try:
            # cut the line into a list of tokens
            segs = jieba.lcut(line)
            for seg in segs:
                # keep tokens longer than one character that are not line breaks
                if len(seg) > 1 and seg != '\r\n':
                    segment.append(seg)
        except:
            print(line)
            continue
    words_df = pd.DataFrame({'segment': segment})
    # words_df.head()
    stopwords = pd.read_csv("data/stopwords.txt", index_col=False, quoting=3, sep="\t", names=['stopword'], encoding='utf-8')  # quoting=3: no quoting at all
    # stopwords.head()
    # pick out the tokens that appear in the stopword list, then drop them
    words_df = words_df[~words_df.segment.isin(stopwords.stopword)]
    # words_df.head()
    # the tricky part: word-frequency counting
    words_stat = words_df.groupby('segment').agg(計數=pd.NamedAgg(column='segment', aggfunc='size')).reset_index().sort_values(by='計數', ascending=False)
    # read the image that will serve as the background / mask
    # e.g. 'image/entertainment.jpeg'
    bimg = imread(image_path)
    # generate the word cloud
    wordcloud = WordCloud(background_color="white", mask=bimg, font_path='data/simhei.ttf', max_font_size=200)
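    # how the mask works: pure-white pixels of bimg are masked out, so words are
    # drawn only inside the non-white shape of the image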
    # build the word frequencies
    word_frequence = {x[0]: x[1] for x in words_stat.values}
    wordcloud = wordcloud.fit_words(word_frequence)
    # re-color the words from the source image
    bimgcolors = ImageColorGenerator(bimg)
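    # ImageColorGenerator picks each word's color from the average color of the
    # image region that the word covers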
    # hide the axes
    plt.axis("off")
    # repaint the word cloud with the image's colors
    plt.imshow(wordcloud.recolor(color_func=bimgcolors))
Call the function:
createsuperwordcloud("./data/entertainment_news.csv", 'image/entertainment.jpeg')
The generated word cloud is displayed inline.
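As a small extension (not part of the original post), the WordCloud object can also be saved to disk with its to_file method; inside either function, after fit_words, a line like:

wordcloud.to_file("entertainment_wordcloud.png")  # example output path, not from the original

would write the rendered cloud to an image file.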