import json
import requests
import sqlite3

base_url = ''   # comment endpoint; the request may also need faked browser headers
headers = {}    # fill in before running (e.g. User-Agent / Referer)
params = {}     # query-string parameters for the comment endpoint
# (editor tip: column-edit mode is Alt+Shift)
# Connect to the database; the file is created automatically if it does not exist
connect = sqlite3.connect('./jingdongsqlite.db')
# Create a cursor from the connection, roughly like a cursor in Excel
cursor = connect.cursor()
# execute(sql)
cursor.execute("""create table if not exists comment(
    cid integer primary key,
    content text,
    creation_time text,
    product_color text,
    product_size text
);""")

for i in range(1, 60):
    params['page'] = i
    resp = requests.get(base_url, headers=headers, params=params)
    status_code = resp.status_code
    comments_json = resp.text
    print(comments_json)
    # Option 1: Python slicing; option 2: a regex; option 3: this endpoint is special and already returns plain JSON
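    # Hypothetical sketch (not in the original script): if the endpoint returned a JSONP-wrapped
    # body such as 'fetchJSON_comment98({...});' instead of plain JSON, option 2 (a regex) could
    # strip the wrapper before json.loads, e.g.:
    #   import re
    #   comments_json = re.sub(r'^[^(]*\(', '', comments_json).rstrip(' \n;)')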
    comments_obj = json.loads(comments_json)
    print(comments_obj)
    comments = comments_obj['comments']
    for c in comments:
        cid = c['id']
        content = c['content']
        creation_time = c['creationTime']
        product_color = c['productColor']
        product_size = c['productSize']
        print('-' * 100)
        print(cid, content)
        cursor.execute(
            """insert or ignore into comment (cid, content, creation_time, product_color, product_size) values (?,?,?,?,?);""",
            [cid, content, creation_time, product_color, product_size]
        )

# Commit to confirm the inserts and updates
connect.commit()

cursor.execute("""select * from comment;""")
# Fetch the query results
rs = cursor.fetchall()
print(rs)
# Close the cursor
cursor.close()
# Close the database connection
connect.close()
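
The base_url, headers, and params above are left blank in the original. A minimal sketch of how they might be filled in is below; the endpoint, parameter names, and header values are assumptions for illustration (JD's interface changes over time and may also require cookies or other anti-bot measures), not values taken from this post.

base_url = 'https://club.jd.com/comment/productPageComments.action'  # assumed comment endpoint
headers = {
    # pretend to be an ordinary browser
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64)',
    'Referer': 'https://item.jd.com/',
}
params = {
    'productId': '100012043978',  # hypothetical product id
    'score': 0,                   # 0 = all comments
    'sortType': 5,                # newest first
    'pageSize': 10,
    'page': 1,                    # overwritten inside the loop
}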

import sqlite3
import jieba
import numpy as np
from PIL import Image
from wordcloud import WordCloud

connect = sqlite3.connect('../l05/jingdongsqlite.db')
cursor = connect.cursor()
# Take the most recent comments (at most 499 rows)
cursor.execute("""select * from comment order by creation_time desc limit 0,499;""")
comments_rs = cursor.fetchall()
# Column 1 of each row is the comment text
comments = [c[1] for c in comments_rs]
comments = ''.join(comments)

# Segment the Chinese text with jieba in precise (non-full) mode
words = jieba.cut(comments, cut_all=False)
comment_words_list = list(words)

# Load the stop-word list, one word per line
with open('../l05/dict/stop_words_zh.txt', mode='r', encoding='utf-8') as f:
    stop_words = f.read().splitlines()
    # print(stop_words)

# Drop stop words before building the word cloud
filtered_comment_word_list = []
for word in comment_words_list:
    if word not in stop_words:
        filtered_comment_word_list.append(word)
# print(filtered_comment_word_list)

comment_words_str = ' '.join(filtered_comment_word_list)
print(comment_words_str)

wc = WordCloud(
    font_path='./問藏書房.ttf',
    background_color='black',
    mask=np.array(Image.open('./三角形.jpg')),
    width=1000,
    height=800,
    max_words=500,
    relative_scaling=0.3,
    min_font_size=50,
).generate(comment_words_str)
# The output filename was left blank in the original; any writable path works
wc.to_file('./jd_comment_wordcloud.png')
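
To preview the result without opening the saved image, the word cloud can also be shown with matplotlib. This display step is an addition for illustration and is not part of the original script.

import matplotlib.pyplot as plt

plt.imshow(wc, interpolation='bilinear')  # wc is the WordCloud object built above
plt.axis('off')
plt.show()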