很簡單,朋友需要就簡單寫了一下。
只針對天貓,其他沒有測試。
# -*- coding: utf-8 -*-
"""
-------------------------------------------------
   File Name:     tianmao.py
   Date:          19-1-23
   Author:        hebel
-------------------------------------------------
   Description:
   Note:
-------------------------------------------------
"""
import requests
import re
import json
from urllib import parse
# Reviews per result page; the total page count is derived from this value.
_PAGE_SIZE = 20

# Seller id used when the product URL carries no ``user_id`` parameter.
# The original author's comment identifies 725677994 as the Tmall Supermarket
# user_id.
_DEFAULT_SELLER_ID = "725677994"

# NOTE(review): this template is an empty string in the reviewed source; it
# looks like the literal was lost when the code was published. It is formatted
# with the keyword arguments ``id``, ``user_id`` and ``page_number``, so the
# real value must contain those three placeholders. Restore it before use.
_RATE_URL_TEMPLATE = ""

# NOTE(review): the headers literal is missing from the reviewed source as
# well; an empty dict keeps the request call well-formed. Restore the original
# headers (presumably a User-Agent/Referer pair -- confirm) before use.
_REQUEST_HEADERS = {}

# The endpoint answers with JSONP, ``jsonp<digits>(<json>)``; capture the JSON.
_JSONP_PAYLOAD = re.compile(r'jsonp\d+\((.*)\)')


def _first_present(mapping, *keys):
    """Return the first non-None value found in *mapping* under any of *keys*."""
    for key in keys:
        value = mapping.get(key)
        if value is not None:
            return value
    return None


def _fetch_page(page_url, cookies):
    """GET *page_url* and return the decoded body, or None on a non-200 status."""
    resp = requests.get(url=page_url, headers=_REQUEST_HEADERS,
                        cookies=cookies, timeout=6)
    if resp.status_code != 200:
        return None
    return resp.content.decode("utf8")


def _extract_rate_detail(html):
    """Return the rate-detail dict from a JSONP body, or None when absent."""
    match = _JSONP_PAYLOAD.search(html)
    if match is None:
        return None
    payload = json.loads(match.group(1))
    # NOTE(review): the reviewed source spells this key "ratedetail", but every
    # identifier in it was lowercased (e.g. ``none``, ``valueerror``), so the
    # camelCase spelling is tried first and the source spelling is kept as a
    # fallback. Confirm against a live response.
    return _first_present(payload, "rateDetail", "ratedetail")


def collect_tianmao_goods_comments(goods_url, cookies, fied_page_number=0):
    """Collect the review entries of one Tmall product.

    The product id (``id``) and seller id (``user_id``) are read from the
    query string of *goods_url*. The first result page is fetched to learn the
    total number of reviews, then the remaining pages are fetched in order.

    :param goods_url: product page URL; its query string must contain ``id``
        and may contain ``user_id`` (a default seller id is used otherwise).
    :param cookies: dict of cookies sent with every request; it must contain
        an ``x5sec`` entry.
    :param fied_page_number: maximum number of pages to fetch. Zero or a
        negative value means "all pages"; values above the real page count are
        clamped to it.
    :return: list of review entries, or ``None`` when nothing was collected.
    :raises TypeError: if *cookies* is not a dict.
    :raises ValueError: if ``x5sec`` is missing from *cookies* or the product
        id cannot be found in *goods_url*.
    :raises IOError: if the first page request does not return HTTP 200.
    """
    # Explicit raises instead of ``assert``: asserts vanish under ``python -O``.
    if not isinstance(cookies, dict):
        raise TypeError("cookies must be a dict")
    if cookies.get("x5sec") is None:
        raise ValueError("無效的cookies,缺失 x5sec 值")

    query = parse.parse_qs(parse.urlparse(goods_url).query)
    goods_id = (query.get("id") or [None])[0]  # product id
    user_id = (query.get("user_id") or [_DEFAULT_SELLER_ID])[0]  # seller id
    if not goods_id or not user_id:
        raise ValueError("goods_url must carry an 'id' query parameter")

    start_url = _RATE_URL_TEMPLATE.format(
        id=goods_id, user_id=user_id, page_number=1)
    print(start_url)

    first_html = _fetch_page(start_url, cookies)
    if first_html is None:
        raise IOError("請求失敗")

    first_detail = _extract_rate_detail(first_html)
    if not first_detail:
        # No JSONP payload / no rate detail: nothing to collect.
        return None

    paginator = first_detail.get("paginator") or {}
    items = paginator.get("items") or 0
    # NOTE(review): the message text of this print was lost in the reviewed
    # source (only ``.format(count=items)`` survives); the bare count is
    # printed until the original wording is restored.
    print("{count}".format(count=items))

    page_number_all = -(-items // _PAGE_SIZE)  # ceiling division
    print("當前總頁數", page_number_all)

    if fied_page_number <= 0:
        page_number = page_number_all
    else:
        page_number = min(fied_page_number, page_number_all)

    comments_all_list = []
    for pn in range(1, page_number + 1):
        if pn == 1:
            # Page 1 was already downloaded to read the paginator; reuse it.
            detail = first_detail
        else:
            page_url = _RATE_URL_TEMPLATE.format(
                id=goods_id, user_id=user_id, page_number=pn)
            html = _fetch_page(page_url, cookies)
            detail = _extract_rate_detail(html) if html is not None else None
        if not detail:
            # Best effort, as in the original: a failed page is skipped.
            continue
        # Same casing caveat as for "rateDetail" above.
        comments_all_list.extend(
            _first_present(detail, "rateList", "ratelist") or [])

    return comments_all_list or None
if __name__ == '__main__':
    # Product page URL; its query string must contain the product ``id``.
    # NOTE(review): empty in the reviewed source -- fill in before running.
    goods_url = ""
    # Cookies sent with every request; the collector requires an ``x5sec``
    # entry. NOTE(review): the literal is missing from the reviewed source, so
    # an empty dict stands in for it -- fill in before running.
    cookies = {}
    # Fetch at most this many pages (0 or less means all pages).
    fied_page_number = 7

    comments_data = collect_tianmao_goods_comments(
        goods_url=goods_url, cookies=cookies, fied_page_number=fied_page_number)
    if comments_data is not None:
        print(len(comments_data))
        # To dump every collected review, iterate over ``comments_data`` and
        # print each entry.
用 puppeteer 獲取 jd 商品評論
由於很簡單,直接看注釋 const puppeteer require puppeteer const autoscroll require autoscroll const url async function crawler 開一個 tab 頁 let page await browser.n...
python json 爬京東商品評論
1 我用的是qq瀏覽器,右擊檢查,在network下選擇js,在搜尋框裡輸入productpagecomments 如果出不來記得f5重新整理一下 如圖 2 雙擊productpagecomments會得到以下頁面 二 找到想要的東西就要寫 啦 上 coding utf 8 importurllib...
爬取亞馬遜評論 亞馬遜商品評論分析
1 原始資料 2 資料清洗 由於資料量較小且清洗過程簡單,直接利用excel進行處理。最終得到的negative txt包含1013條資料,positive txt包含3198條資料。二 模型構建 1 分詞 1.1 讀取停用詞 stopwords def stopword filename glob...