2. Set up proxies (build an IP pool): many experienced answers recommend using proxies, and it is indeed a good approach. Being on a tight budget, I first decided to scrape IPs from a free proxy site (the source code for that is attached further down). But free proxies are unstable and need constant replacement, so after trying them for a while I switched to paid IPs, which are not expensive anyway. By wrapping the local IP in a proxy, the target site can be crawled conveniently.
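Before the full script, here is a minimal sketch of what "wrapping" a request in a proxy looks like with requests (the proxy address below is a placeholder, not a working server):

import requests

# placeholder proxy address; substitute a working (e.g. paid) proxy here
proxies = {
    'http': 'http://127.0.0.1:8080',
    'https': 'http://127.0.0.1:8080',
}
# every request issued with proxies= goes through the proxy, so the
# target site sees the proxy's IP rather than the local one
resp = requests.get('https://movie.douban.com/top250', proxies=proxies, timeout=10)
print(resp.status_code)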
The full source code is as follows:
# -*- coding: utf-8 -*-
# @Time    : 2018/1/8 14:05
# @Author  : rkgg
# @File    : reptile_douban_top250_comments.py
# @Software: PyCharm
import requests
from bs4 import BeautifulSoup
import codecs
import time
import random
'''
1. Fetch each Top 250 movie page
2. Grab the hottest short comment from its comments page
3. Write the results to a file
'''
# NOTE: URL strings and dict literals were stripped when the post was
# published; the values below are reconstructions, not the originals.
host_url = 'https://movie.douban.com/top250'
headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64)'}
# for a paid proxy this would be something like
# {'http': 'http://user:pass@host:port'}; left empty here
proxies = {}
def getproxies():
    proxies = {}
    for page in range(1, 5):
        # the free-proxy list URL was stripped in the original post
        url = '' + str(page)
        html = requests.get(url, headers=headers).text
        soup = BeautifulSoup(html, 'html.parser')
        # the attrs filters were stripped in the original; adjust the
        # selectors to whatever markup the proxy site actually uses
        ip_list = soup.find('table')
        for ip_tr in ip_list.find_all('tr'):
            ip = ip_tr.find_all('td')[1].get_text()
            port = ip_tr.find_all('td')[2].get_text()
            protocol = ip_tr.find_all('td')[5].get_text()
            proxies[protocol.lower()] = protocol.lower() + '://' + ip + ':' + port
        # print('page--->' + str(page))
        # print(proxies)
    return proxies
def gethtml(url):
    # route the request through the proxy pool; the original passed
    # params=proxies, which would send the pool as query parameters --
    # proxies= is what actually proxies the request
    data = requests.get(url, headers=headers, proxies=proxies)
    return data.text
def parse_html(html):
    soup = BeautifulSoup(html, 'html.parser')
    # the class names below were stripped in the original; these match
    # Douban's Top 250 markup ("grid_view" list, "hd" header block)
    response = soup.find('ol', attrs={'class': 'grid_view'})
    movies_list = []
    for movies_li in response.find_all('li'):
        detail = movies_li.find('div', attrs={'class': 'hd'})
        movie_name = detail.find('span', attrs={'class': 'title'}).get_text()
        # fetch the individual movie's page
        movies_url = detail.find('a')['href']
        hot_list = []
        movies_comments_url = movies_url + 'comments?sort=new_score&status=P'
        comment_data = gethtml(movies_comments_url)
        comment_soup = BeautifulSoup(comment_data, 'html.parser')
        head_info = comment_soup.find('head').find('title').get_text()
        if head_info != '頁面不存在':  # i.e. the page still exists
            comments_list = comment_soup.find('div', attrs={'id': 'comments'})
            hot_comments = comments_list.find('div', attrs={'class': 'comment'}).find('p').get_text()
            hot_list.append(hot_comments)
            movie_detail = movie_name + ':' + hot_list[0]
            movies_list.append(movie_detail)
            # random pause so the crawler looks less like a bot
            time.sleep(random.randint(1, 10))
            print(hot_list[0])
        else:
            print('Page unavailable!')
    # follow the "next page" link at the bottom of the list
    next_page = soup.find('span', attrs={'class': 'next'}).find('a')
    if next_page:
        return movies_list, host_url + next_page['href']
    return movies_list, None
if __name__ == '__main__':
    url = host_url
    with codecs.open('movies_hot_comments.txt', 'w', encoding='utf-8') as fp:
        while url:
            html = gethtml(url)
            # print(html)
            movies, url = parse_html(html)
            fp.write(u'\n'.join(movies))
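Because free proxies are unstable, it is worth checking each candidate before trusting it. A minimal sketch, using a check_proxy helper of my own naming (not part of the original code):

import requests

def check_proxy(proxy_url, test_url='https://www.baidu.com', timeout=5):
    # a proxy counts as alive if it answers a simple GET within the timeout
    try:
        resp = requests.get(test_url,
                            proxies={'http': proxy_url, 'https': proxy_url},
                            timeout=timeout)
        return resp.status_code == 200
    except requests.RequestException:
        return False

# keep only the proxies that currently work
live = {k: v for k, v in getproxies().items() if check_proxy(v)}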
Below is the source code for scraping the free proxies:
import requests
from bs4 import BeautifulSoup
import time
import random

headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64)'}

def getproxies():
    proxies = {}
    # pick one random page of the proxy list instead of walking pages 1-5;
    # the list-site URL was stripped in the original post
    url = '' + str(random.randint(1, 5))
    html = requests.get(url, headers=headers).text
    soup = BeautifulSoup(html, 'html.parser')
    # the attrs filters were stripped in the original; adjust the
    # selectors to the proxy site's actual markup
    ip_list = soup.find('table')
    for ip_tr in ip_list.find_all('tr'):
        ip = ip_tr.find_all('td')[1].get_text()
        port = ip_tr.find_all('td')[2].get_text()
        protocol = ip_tr.find_all('td')[5].get_text()
        proxies[protocol.lower()] = protocol.lower() + '://' + ip + ':' + port
    print(proxies)
    time.sleep(random.randint(0, 3))
    return proxies
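One possible way to put getproxies() to work is to rotate to a fresh proxy whenever a request fails; the retry count and back-off below are arbitrary choices, not from the original post:

import requests
import time

def fetch_with_rotation(url, retries=3):
    # try a fresh free proxy on each attempt, rotating on failure
    for attempt in range(retries):
        try:
            return requests.get(url, headers=headers,
                                proxies=getproxies(), timeout=10).text
        except requests.RequestException:
            time.sleep(1)  # brief back-off before grabbing a new proxy
    # fall back to a direct request if every proxy failed
    return requests.get(url, headers=headers, timeout=10).text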