非結構化資料的分析與挖掘

# Word-frequency analysis and word-cloud rendering for a Chinese text file.
#
# Pipeline: read article1.txt -> strip punctuation/whitespace -> segment with
# jieba -> drop stop-words -> count frequencies -> draw a masked word cloud.

# Import libraries
import collections  # word-frequency counting
import re  # regular expressions

import jieba  # Chinese word segmentation
import matplotlib.pyplot as plt  # figure display
import numpy as np  # converts the mask image into an array
import wordcloud  # word-cloud rendering
from PIL import Image  # image loading (package name is case-sensitive)

# Read the text file
with open('article1.txt') as fn:
    string_data = fn.read()  # read the whole text in one call

# Text preprocessing
# Raw string so that regex escapes such as `\.` reach the regex engine
# unchanged instead of being treated as (invalid) Python string escapes.
# NOTE(review): the pattern also strips the CJK character '一' — confirm this
# is intended and not a stand-in for a dash.
pattern = re.compile(r'\t|\n|\.|-|一|:|;|\)|\(|\?|"')
string_data = pattern.sub('', string_data)  # delete every match

# Word segmentation
seg_list_exact = jieba.cut(string_data, cut_all=False)  # exact mode (default)

# Custom stop-word collection; a set gives O(1) membership tests.
# NOTE(review): '乙個' looks like a conversion artifact of '一個' — confirm.
remove_words = {
    '的', '，', '和', '是', '隨著', '對於', ' ', '對', '等', '能', '都', '。',
    '、', '中', '與', '在', '其', '了', '可以', '進行', '有', '更', '需要',
    '提供', '多', '能力', '通過', '會', '不同', '乙個', '這個', '我們', '將',
    '並', '同時', '看', '如果', '但', '到', '非常', '—', '如何', '包括', '這',
}
# To compare against keyword extraction, use an empty stop-word collection
# instead (i.e. filter nothing).

# Keep only the tokens that are not stop-words
object_list = [token for token in seg_list_exact if token not in remove_words]

# Word-frequency statistics
word_counts = collections.Counter(object_list)  # token -> occurrence count
word_counts_top5 = word_counts.most_common(5)  # the 5 most frequent tokens
for w, c in word_counts_top5:  # each token with its occurrence count
    print(w, c)

# Word-cloud display
# Background image: it shapes the cloud and supplies the colour scheme.
# NOTE(review): the statement defining `mask` was missing from this script;
# the file name below is an assumption — confirm the actual image path.
mask = np.array(Image.open('wordcloud.jpg'))

wc = wordcloud.WordCloud(
    font_path='c:/windows/fonts/simhei.ttf',  # CJK font; without one Chinese text does not render
    mask=mask,  # background image
    max_words=200,  # maximum number of words shown
    max_font_size=100,  # largest font size
)
wc.generate_from_frequencies(word_counts)  # build the cloud from the counts
image_colors = wordcloud.ImageColorGenerator(mask)  # colour scheme from the image
wc.recolor(color_func=image_colors)  # apply the image colours to the cloud
plt.imshow(wc)  # draw the word cloud
plt.axis('off')  # hide the axes
plt.show()  # needed for the figure to appear when run as a script
# Part-of-speech tagging with jieba, summarised with pandas.

# Import libraries
import jieba.posseg as pseg
import pandas as pd

# Read the text file
with open('article1.txt') as fn:
    string_data = fn.read()  # read the whole text in one call

# Segmentation + part-of-speech tagging
words = pseg.cut(string_data)  # yields (word, flag) pairs
# Unpack each pair explicitly so the frame is built from plain tuples.
words_pd = pd.DataFrame(
    [(word, flag) for word, flag in words],
    columns=['word', 'type'],
)
print(words_pd.head(4))  # first 4 rows
# Sample output:
#         word type
# 0      adobe  eng
# 1               x
# 2  analytics  eng
# 3          和    c

# Summary by part of speech and word (two grouping columns)
words_gb = words_pd.groupby(['type', 'word'])['word'].count()
print(words_gb.head(4))
# Sample output:
# type  word
# a     不同      14
#       不足       2
#       不通       1
#       嚴謹       2
# Name: word, dtype: int64

# Summary by part of speech only (one grouping column), most frequent first
words_gb2 = words_pd.groupby('type').count()
words_gb2 = words_gb2.sort_values(by='word', ascending=False)
print(words_gb2.head(4))
# Sample output:
#       word
# type
# x      994
# n      981
# v      834
# eng    295

# Show only words whose tag is in the selected set
words_pd_index = words_pd['type'].isin(['n', 'eng'])  # boolean row mask
words_pd_select = words_pd[words_pd_index]
print(words_pd_select.head(4))
# Sample output:
#         word type
# 0      adobe  eng
# 2  analytics  eng
# 4   webtrekk  eng
# 9         領域    n

# Import libraries
import jieba.analyse  # keyword-extraction module
import pandas as pd  # pandas

# Read the text data
with open('article1.txt') as fn:
    string_data = fn.read()  # read the whole text in one call

# Keyword extraction
# jieba's keyword arguments are case-sensitive: topK, withWeight, allowPOS,
# withFlag. Both flags are enabled, and the next statement relies on each
# result being a (pair, weight) tuple whose pair exposes .word and .flag.
tags_pairs = jieba.analyse.extract_tags(
    string_data,
    topK=5,
    withWeight=True,
    allowPOS=['ns', 'n', 'vn', 'v', 'nr'],
    withFlag=True,
)
# Flatten each result into a (word, flag, weight) row
tags_list = [(pair.word, pair.flag, weight) for pair, weight in tags_pairs]
# Build the result frame (this assignment must run, or the print below fails)
tags_pd = pd.DataFrame(tags_list, columns=['word', 'flag', 'weight'])
print(tags_pd)
# Sample output:
#   word flag    weight
# 0   資料    n  0.313395
# 1   報表    n  0.163367
# 2   功能    n  0.150263
# 3   分析   vn  0.134857
# 4   使用者    n  0.126633
				結構化資料和非結構化資料的區別 非結構化資料
據 IDC 預測，2018 年到 2025 年之間，全球產生的資料量將會從 33 ZB 增長到 175 ZB，複合增長率達到 27%，其中超過 80% 的資料都會是處理難度較大的非結構化資料。預計到 2030 年全球資料總量將達到 35,000 EB。由於非結構化資料的資訊量和資訊的重要程度很難被界定，如何對其進...
				結構化資料 半結構化資料 非結構化資料
結構化資料（即行資料，儲存在資料庫裡，可以用二維表結構來邏輯表達實現的資料）。所謂半結構化資料，就是介於完全結構化資料（如關係型資料庫、物件導向資料庫中的資料）和完全無結構的資料（如聲音、影象檔案等）之間的資料，HTML 文件就屬於半結構化資料。它一般是自描述的，資料的結構和內容混在一起，沒有明顯的區分...
				結構化資料，非結構化資料，半結構化資料的區別
1. 結構化資料（先有結構，再有資料）是指由二維表結構來邏輯表達和實現的資料，嚴格地遵循資料格式與長度規範，主要通過關係型資料庫進行儲存和管理。也稱作行資料，一般特點是：資料以行為單位，一行資料表示一個實體的資訊，每一行資料的屬性是相同的。2. 非結構化資料（先有資料，再有結構）半結構化資料，是結構化資...
非結構化資料的分析與挖掘

結構化資料和非結構化資料的區別非結構化資料

結構化資料半結構化資料非結構化資料

結構化資料，非結構化資料，半結構化資料的區別

非結構化資料的分析與挖掘

結構化資料和非結構化資料的區別 非結構化資料

結構化資料 半結構化資料 非結構化資料

結構化資料，非結構化資料，半結構化資料的區別

相關推薦

結構化資料和非結構化資料的區別非結構化資料

結構化資料半結構化資料非結構化資料