看的別人的**:爬取某部影片的影評;因為沒有模擬登入,只能爬6頁。
# -*- encoding:utf-8 -*-
import io
import random
import re
import sys
import time

import requests
from bs4 import BeautifulSoup
# Use a session to keep login information across requests
s = requests.session()
def get_ip_list(url, headers):
    """Fetch a proxy-listing page and collect the proxies it advertises.

    The pool is used to rotate the outgoing IP so the scraper is less
    likely to be banned.

    Args:
        url: Address of the HTML page that lists proxies in a table.
        headers: HTTP headers sent with the request.

    Returns:
        A list of ``"ip:port"`` strings, one per usable table row (empty
        when the page contains no such rows).
    """
    # A timeout keeps the script from hanging forever on a dead listing site.
    web_data = requests.get(url, headers=headers, timeout=10)
    soup = BeautifulSoup(web_data.text, 'lxml')
    rows = soup.find_all('tr')
    ip_list = []
    # The first <tr> is skipped, as in the original loop (presumably the
    # table header).
    for row in rows[1:]:
        tds = row.find_all('td')
        # NOTE(review): the statement that filled ip_list was lost from the
        # source; this assumes column 1 holds the IP and column 2 the port.
        # Confirm against the actual proxy page before relying on it.
        if len(tds) > 2:
            ip_list.append(
                tds[1].get_text(strip=True) + ':' + tds[2].get_text(strip=True)
            )
    return ip_list
def get_random_ip(ip_list):
    """Pick one proxy at random from the pool.

    Args:
        ip_list: Iterable of ``"ip:port"`` strings.

    Returns:
        A proxies mapping suitable for the ``proxies=`` argument of
        ``requests`` (``{'http': 'http://ip:port'}``), or an empty dict
        when the pool is empty so the caller falls back to a direct
        connection instead of crashing in ``random.choice``.
    """
    proxy_list = ['http://' + ip for ip in ip_list]
    if not proxy_list:
        return {}
    proxy_ip = random.choice(proxy_list)
    # NOTE(review): requests selects a proxy by URL scheme, so with only an
    # 'http' key, https:// requests are not routed through this proxy.
    proxies = {'http': proxy_ip}
    return proxies
def get_data(html):
    """Extract the comments and the next-page link from one listing page.

    Args:
        html: Markup of the page (text or bytes).

    Returns:
        A ``(comment_list, next_page)`` tuple. ``comment_list`` holds the
        ``<p>`` tags found directly under elements with class ``comment``;
        ``next_page`` is the ``href`` of the first element with class
        ``next``, or ``None`` when the page has no such element or it
        carries no ``href``.
    """
    soup = BeautifulSoup(html, "lxml")
    comment_list = soup.select('.comment > p')
    next_links = soup.select('.next')
    # Indexing [0] unconditionally raised IndexError on a page with no
    # "next" element; report "no next page" instead.
    next_page = next_links[0].get('href') if next_links else None
    return comment_list, next_page
if __name__ == "__main__":
    # Script entry point: walk the comment pages, collect every comment
    # node, then append the comment texts to a local file.

    # TODO: the base URL of the comment listing was blank in the source;
    # fill it in before running.
    absolute = ''
    # NOTE(review): the original headers dict was lost; this minimal
    # User-Agent is a placeholder. Replace with the real headers.
    headers = {'User-Agent': 'Mozilla/5.0'}

    # Build the proxy pool.
    # TODO: the proxy-listing URL was blank in the source; fill it in.
    url = ''
    ip_list = get_ip_list(url, headers=headers)
    proxies = get_random_ip(ip_list)

    current_page = absolute
    comment_list = []
    # Only six pages are collected, matching the original cut-off. The old
    # `ans is 7` check compared an int by identity and discarded page 7
    # after fetching it.
    max_pages = 6
    num = 0
    for ans in range(1, max_pages + 1):
        print("爬取第" + str(ans) + "頁")
        # Throttle requests to reduce the chance of being blocked.
        time.sleep(5)
        html = s.get(
            current_page, headers=headers, proxies=proxies, timeout=10
        ).content
        temp_list, next_page = get_data(html)
        comment_list.extend(temp_list)
        num = num + 1
        # Switch to another proxy every 20 requests.
        if num % 20 == 0:
            proxies = get_random_ip(ip_list)
        # Stop early when the page offers no link to a following page.
        if not next_page:
            break
        current_page = absolute + next_page
        print(current_page)

    # Raw string: "\c" is not a valid escape sequence. An explicit UTF-8
    # encoding keeps non-ASCII comments from failing on platforms whose
    # default encoding cannot represent them. `with` closes the file.
    with open(r"f:\comments.txt", 'a', encoding="utf-8") as f:
        ark = 0
        for node in comment_list:
            comment = node.get_text().strip().replace("\n", "")
            f.write(comment + "\n")
            ark += 1
        print("寫了" + str(ark) + "個")
nodejs爬取豆瓣影評
爬取豆瓣心靈奇旅影評,包括使用者主頁頭像 let request require request let fs require fs const path require path var startnum 0 起始爬取位置 傳送請求 function reqdata url else 請求處理 a...
豆瓣影評爬取 中國機長
10月大火的中國機長相信大家都看過了吧 悄悄的說,我還有二刷?超級超級超級超級喜歡袁泉姐姐,溫柔又堅定,真誠又勇敢!import requests from bs4 import beautifulsoup import time import pandas as pd import os def ...
python爬蟲實戰 爬取豆瓣影評資料
爬取豆瓣影評資料步驟 1 獲取網頁請求 2 解析獲取的網頁 3 提速資料 4 儲存檔案 1 匯入需要的庫 import urllib.request from bs4 import beautifulsoup 隨機數的庫 import random 時間庫 import time 庫 import ...