import requests
from bs4 import BeautifulSoup
import re
import random
import time
# Main crawler function
def mm(url):
    # Build the request for the target url with requests
    # NOTE: the header contents are missing from the original text; a generic
    # User-Agent is assumed here as a placeholder
    header = {"User-Agent": "Mozilla/5.0"}
    req0 = requests.get(url=url, headers=header)
    req0.encoding = "gb18030"  # fix garbled characters (mojibake)
    html0 = req0.text
    # Parse the HTML with BeautifulSoup and store the result as soup0
    soup0 = BeautifulSoup(html0, "html.parser")
    # Get the number of the last page (see the previous subsection on locating the last page)
    total_page = int(soup0.find("div", class_="pagers").find_all("a")[-2].get_text())
    myfile = open("aika_qc_gn_1_1_1.txt", "a", encoding='gb18030', errors='ignore')  # avoid encoding errors
    print("user", " region", " useful", " type", " comment")
    name = "user" + " region" + " useful" + " type" + " comment"
    myfile.write(name + "\n")
    for i in list(range(1, total_page + 1)):
        # Random pause between requests
        stop = random.uniform(1, 3)
        # NOTE: the base URL is missing from the original text and is left empty here
        url = "" + str(i) + ".htm"
        req = requests.get(url=url, headers=header)
        req.encoding = "gb18030"  # fix garbled characters
        html = req.text
        soup = BeautifulSoup(html, "html.parser")
        contents = soup.find('div', class_="review_comments").find_all("dl")
        l = len(contents)
        for content in contents:
            tiaoshu = contents.index(content)
            try:
                print("crawling item %d of %d on page %d" % (tiaoshu + 1, l, i))  # progress of the current item
                try:
                    # Review angle
                    comment_jiaodu = content.find("dt").find("em").find("a").get_text() \
                        .strip().replace("\n", "").replace("\t", "").replace("\r", "")
                except:
                    comment_jiaodu = "sunny"
                try:
                    # Review type
                    comment_type0 = content.find("dt").get_text() \
                        .strip().replace("\n", "").replace("\t", "").replace("\r", "")
                    comment_type1 = comment_type0.split("【")[1]
                    comment_type = comment_type1.split("】")[0]
                except:
                    comment_type = "sunny"
                # Number of people who found this review useful
                try:
                    useful = int(content.find("dd").find("div", class_="useful").find("i")
                                 .find("span").get_text().strip()
                                 .replace("\n", "").replace("\t", "").replace("\r", ""))
                except:
                    useful = "sunny"
                try:
                    # Review region
                    comment_region = content.find("dd").find("p").find("a").get_text() \
                        .strip().replace("\n", "").replace("\t", "").replace("\r", "")
                except:
                    comment_region = "sunny"
                try:
                    user = content.find("dd").find("p").get_text().strip() \
                        .replace("\n", "").replace("\t", "").replace("\r", "").split(":")[-1]
                except:
                    user = "sunny"
                try:
                    # Follow the link to the full review page
                    comment_url = content.find('dt').find_all('a')[-1]['href']
                    urlc = comment_url
                    # NOTE: the header contents are missing from the original text;
                    # the same placeholder User-Agent is reused here
                    headerc = {"User-Agent": "Mozilla/5.0"}
                    reqc = requests.get(urlc, headers=headerc)
                    htmlc = reqc.text
                    soupc = BeautifulSoup(htmlc, "html.parser")
                    comment0 = soupc.find('div', id='mainnew').find('div', class_='maintable') \
                        .find_all('form')[1].find('table', class_='t_msg').find_all('tr')[1]
                    try:
                        comment = comment0.find('font').get_text().strip().replace("\n", "").replace("\t", "")
                    except:
                        comment = "sunny"
                    try:
                        comment_time = soupc.find('div', id='mainnew').find('div', class_='maintable') \
                            .find_all('form')[1].find('table', class_='t_msg') \
                            .find('div', style='padding-top: 4px;float:left').get_text() \
                            .strip().replace("\n", "").replace("\t", "")[4:]
                    except:
                        comment_time = "sunny"
                except:
                    try:
                        comment = content.find("dd").get_text().split("\n")[-1].split('\r')[-1].strip() \
                            .replace("\n", "").replace("\t", "").replace("\r", "").split(":")[-1]
                    except:
                        comment = "sunny"
                time.sleep(stop)
                print(user, comment_region, useful, comment_type, comment)
                tt = user + " " + comment_region + " " + str(useful) + " " + comment_type + " " + comment
                myfile.write(tt + "\n")
            except Exception as e:
                print(e)
                pass
    myfile.close()
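
# The field-cleaning steps above repeat .strip().replace("\n", "").replace("\t", "").replace("\r", "")
# for every field. A small helper like the hypothetical clean_text() below (not part of the
# original script, only a sketch) would express that cleanup once and keep the parsing lines shorter.
def clean_text(s):
    # Drop newlines, tabs and carriage returns, then trim surrounding whitespace
    for ch in ("\n", "\t", "\r"):
        s = s.replace(ch, "")
    return s.strip()
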
def fenxi():
    myfile = open("aika_qc_gn_1_1_1.txt", "r", encoding='gb18030', errors='ignore')
    good = 0
    middle = 0
    bad = 0
    nn = 0
    for line in myfile:
        commit = line.split(" ")[3]
        if commit == "好評":      # positive review
            good = good + 1
        elif commit == "中評":    # neutral review
            middle = middle + 1
        elif commit == "差評":    # negative review
            bad = bad + 1
        else:
            nn = nn + 1
    count = good + middle + bad + nn
    g = round(good / (count - nn) * 100, 2)
    m = round(middle / (count - nn) * 100, 2)
    b = round(bad / (count - nn) * 100, 2)
    n = round(nn / (count - nn) * 100, 2)
    print("positive share:", g)
    print("neutral share:", m)
    print("negative share:", b)
url = ""
mm(url)
fenxi()
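
The repeated counters in fenxi() can also be written with collections.Counter. The sketch below is only an illustrative alternative, not part of the original script; the file name, the space-separated field order, and the gb18030 encoding are assumptions carried over from the code above.

from collections import Counter

def fenxi_counter(path="aika_qc_gn_1_1_1.txt"):
    # Tally the review-type field (the 4th space-separated column) of every line
    with open(path, "r", encoding="gb18030", errors="ignore") as f:
        types = Counter(line.split(" ")[3] for line in f if line.count(" ") >= 4)
    rated = types["好評"] + types["中評"] + types["差評"]
    if rated:
        for label in ("好評", "中評", "差評"):
            print(label, round(types[label] / rated * 100, 2), "%")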