天貓商品評論獲取

2021-09-30 19:24:22 字數 4134 閱讀 6434

很簡單,朋友需要就簡單寫了一下。

只針對天貓,其他沒有測試

# -*- coding: utf-8 -*-
"""
-------------------------------------------------
   File Name:     tianmao.py
   Date:          19-1-23
   Author:        hebel
-------------------------------------------------
   Description:
   Note:
-------------------------------------------------
"""

import requests

import re

import json

from urllib import parse

def collect_tianmao_goods_comments(goods_url, cookies, fied_page_number=0):
    """Collect buyer reviews for one Tmall product.

    The product id (and optionally the seller id) is read from the query
    string of ``goods_url``. Review pages are then requested one at a time
    and the review records found in each response are concatenated.

    :param goods_url: product page URL; its query string must contain ``id``
        and may contain ``user_id`` (the seller id).
    :param cookies: dict of cookies sent with every request; it must contain
        an ``x5sec`` entry.
    :param fied_page_number: maximum number of review pages to fetch. Zero or
        a negative value fetches every page; a value larger than the real
        page count is clamped to it.
    :return: list of review records taken from the responses, or ``None``
        when no record could be collected.
    :raises TypeError: if ``cookies`` is not a dict.
    :raises ValueError: if ``cookies`` has no ``x5sec`` entry or ``goods_url``
        has no ``id`` query parameter.
    :raises IOError: if the first review page does not answer with HTTP 200.
    """
    # Explicit raises instead of ``assert``: assertions vanish under ``-O``.
    if not isinstance(cookies, dict):
        raise TypeError("cookies must be a dict")
    if cookies.get("x5sec") is None:
        raise ValueError("無效的cookies,缺失 x5sec 值")

    query = parse.parse_qs(parse.urlparse(goods_url).query)
    goods_id = query.get("id", [None])[0]  # product id
    # Seller id; 725677994 is the user_id of Tmall Supermarket and is used
    # when the URL does not carry one.
    seller_id = query.get("user_id", ["725677994"])[0]
    if not goods_id or not seller_id:
        raise ValueError("goods_url has no 'id' query parameter")

    # NOTE(review): the URL template and the request headers were empty in
    # the file as received (their literals appear to have been stripped).
    # The values below are a reconstruction -- confirm them against the real
    # endpoint before relying on this function. The callback name must keep
    # the ``jsonp<digits>`` shape, because the response parser looks for it.
    url = (
        "https://rate.tmall.com/list_detail_rate.htm"
        "?itemId={id}&sellerId={user_id}&currentPage={page_number}"
        "&callback=jsonp128"
    )
    headers = {
        "User-Agent": (
            "Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
            "AppleWebKit/537.36 (KHTML, like Gecko) "
            "Chrome/90.0 Safari/537.36"
        ),
        "Referer": goods_url,
    }
    page_size = 20  # reviews per page, as assumed by the page-count maths

    first_url = url.format(id=goods_id, user_id=seller_id, page_number=1)
    print(first_url)
    status, first_detail = _fetch_review_page(first_url, headers, cookies)
    if status != 200:
        raise IOError("請求失敗")
    if not isinstance(first_detail, dict):
        # Nothing parseable on page 1 (e.g. no JSONP payload): no reviews.
        return None

    paginator = first_detail.get("paginator") or {}
    total_items = int(paginator.get("items") or 0)
    # NOTE(review): the original message text was lost; wording reconstructed.
    print("評論總數: {count}".format(count=total_items))
    total_pages = (total_items + page_size - 1) // page_size  # ceil division
    print("當前總頁數", total_pages)

    if fied_page_number <= 0:
        wanted_pages = total_pages
    else:
        wanted_pages = min(fied_page_number, total_pages)

    comments_all_list = []
    for page_no in range(1, wanted_pages + 1):
        if page_no == 1:
            # Page 1 is already in hand; do not request it a second time.
            detail = first_detail
        else:
            page_url = url.format(
                id=goods_id, user_id=seller_id, page_number=page_no)
            _, detail = _fetch_review_page(page_url, headers, cookies)
        # Pages that failed or carried no payload are skipped, not fatal.
        if isinstance(detail, dict):
            comments_all_list.extend(
                _lookup(detail, "rateList", "ratelist") or [])

    return comments_all_list or None


def _fetch_review_page(page_url, headers, cookies):
    """Request one review page; return ``(status_code, detail_or_None)``."""
    resp = requests.get(
        url=page_url, headers=headers, cookies=cookies, timeout=6)
    if resp.status_code != 200:
        return resp.status_code, None
    return resp.status_code, _extract_rate_detail(resp.content.decode("utf8"))


def _extract_rate_detail(html):
    """Return the rate-detail object inside a JSONP response, else ``None``."""
    match = re.search(r'jsonp\d+\((.*)\)', html)
    if match is None:
        return None
    payload = json.loads(match.group(1))
    if not isinstance(payload, dict):
        return None
    # NOTE(review): identifiers in the received file were lowercased
    # (``none``, ``valueerror``), so the key was probably camelCase in the
    # original; both spellings are accepted until that is confirmed.
    return _lookup(payload, "rateDetail", "ratedetail")


def _lookup(mapping, *keys):
    """Return the first non-``None`` value stored under any of ``keys``."""
    for key in keys:
        value = mapping.get(key)
        if value is not None:
            return value
    return None

if __name__ == '__main__':
    # Product page URL; its query string must carry the product ``id``.
    goods_url = ""
    # NOTE(review): the cookie value was missing in the file as received.
    # Fill in the cookies of a logged-in session; the collector requires an
    # ``x5sec`` entry and raises ValueError while this placeholder is empty.
    cookies = {}
    # Upper bound on the number of review pages to fetch.
    fied_page_number = 7

    comments_data = collect_tianmao_goods_comments(
        goods_url=goods_url,
        cookies=cookies,
        fied_page_number=fied_page_number,
    )
    if comments_data is not None:
        print(len(comments_data))
        # Uncomment to dump every review record:
        # for comments in comments_data:
        #     print(comments)

用 puppeteer 獲取 jd 商品評論

由於很簡單,直接看 注釋 const puppeteer require puppeteer const autoscroll require autoscroll const url async function crawler 開乙個 tab 頁 let page await browser.n...

python json 爬京東商品評論

1 我用的是qq瀏覽器,右擊檢查,在network下選擇js,在搜尋框裡輸入productpagecomments 如果出不來記得f5重新整理一下 如圖 2 雙擊productpagecomments會得到以下頁面 二 找到想要的東西就要寫 啦 上 coding utf 8 importurllib...

爬取亞馬遜評論 亞馬遜商品評論分析

1 原始資料 2 資料清洗 由於資料量較小且清洗過程簡單,直接利用excel進行處理。最終得到的negative txt包含1013條資料,positive txt包含3198條資料。二 模型構建 1 分詞 1.1 讀取停用詞 stopwords def stopword filename glob...