#!/usr/bin/env python3
# coding: utf8
import re
import requests
import sys
import getopt
from bs4 import BeautifulSoup
from urllib.parse import quote
from time import sleep
import time
import random
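# Third-party dependencies (assumed to be installed from PyPI):
#   pip install requests beautifulsoup4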
class crawler:
    '''Crawler for 360 search result pages'''
    url = ''
    urls = set()  # set used to deduplicate result URLs
    html = ''
    total_pages = 5
    current_page = 0
    next_page_url = ''
    timeout = 60
    p1 = 0  # progress increment per page, in percent
    i1 = 1  # pages crawled so far, used for the progress display
    # The dict body was missing from the source; a minimal assumed
    # User-Agent header is supplied here
    headersparameters = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) '
                      'AppleWebKit/537.36 (KHTML, like Gecko) Chrome/96.0 Safari/537.36'
    }
    def __init__(self, keyword):
        print("Fetching result links......")
        # The base URL was missing from the source; www.so.com (360 search)
        # is assumed here, matching the /s?q= pattern parsed in get_urls
        self.url = u'https://www.so.com/s?q=' + quote(keyword)
    def set_timeout(self, time):
        '''Set the request timeout, in seconds'''
        try:
            self.timeout = int(time)
        except ValueError:
            pass
    def set_total_pages(self, num):
        '''Set the total number of pages to crawl'''
        try:
            self.total_pages = int(num)
            self.p1 = int(100 / self.total_pages)
        except (ValueError, ZeroDivisionError):
            pass
    def set_current_url(self, url):
        '''Set the current URL'''
        self.url = url
    def switch_url(self):
        '''Move on to the next results page; if there is none, print what has
        been collected so far and exit'''
        if self.next_page_url == '':
            self.print_urls()
            sys.exit()
        else:
            self.set_current_url(self.next_page_url)
    def is_finish(self):
        '''Return True once the requested number of pages has been crawled'''
        return self.current_page >= self.total_pages
    def get_html(self):
        '''Fetch the page at the current URL and store its body in html'''
        # Send the request; on connection failure, wait 5 seconds and
        # retry indefinitely
        success = False
        while not success:
            try:
                r = requests.get(self.url, timeout=self.timeout,
                                 headers=self.headersparameters)
            except requests.exceptions.ConnectionError:
                sleep(5)
            else:
                success = True
        if r.status_code == 200:
            self.html = r.text
            self.current_page += 1
        else:
            self.html = u''
            print('[ERROR]', self.url,
                  u'GET on this URL returned a non-200 HTTP status code')
    def get_urls(self):
        '''Parse the result URLs out of the current html and add them to urls'''
        bsobj = BeautifulSoup(self.html, "html.parser")
        list_h3 = bsobj.find_all("h3", "res-title ")
        for h3 in list_h3:
            if "data-url" in h3.a.attrs:
                self.urls.add(h3.a.attrs["data-url"])
            else:
                self.urls.add(h3.a.attrs["href"])
        # The last /s?q=... link on the page points to the next results page;
        # the host prefix is assumed to be www.so.com (see __init__)
        next_links = re.findall(r' href="(/s\?q=[\w\d%&=_-]*?)"', self.html)
        if len(next_links) > 0:
            self.next_page_url = 'https://www.so.com' + next_links[-1]
        else:
            self.next_page_url = ''
    def print_urls(self):
        '''Print every URL collected in urls'''
        for url in self.urls:
            print(url)
    def run(self):
        while not self.is_finish():
            self.get_html()
            self.get_urls()
            self.switch_url()
            print(str(self.p1 * self.i1) + " %")
            if not self.is_finish():
                # Random delay between pages to avoid being rate-limited
                time.sleep(random.randint(6, 20))
                self.i1 += 1
        # int(100 / total_pages) truncates, so top up the progress display
        if self.p1 * self.i1 < 100:
            print("100 %")
        self.print_urls()
        print("Done......")
if __name__ == '__main__':
    help = '360_crawler.py -k <keyword> [-t <timeout> -p <total_pages>]'
    keyword = None
    timeout = None
    totalpages = None
    try:
        opts, args = getopt.getopt(sys.argv[1:], "hk:t:p:",
                                   ["keyword=", "timeout=", "totalpages="])
    except getopt.GetoptError:
        print(help)
        sys.exit(2)
    # Parse the command-line arguments
    for opt, arg in opts:
        if opt == '-h':
            print(help)
            sys.exit()
        elif opt in ("-k", "--keyword"):
            keyword = arg
        elif opt in ("-t", "--timeout"):
            timeout = arg
        elif opt in ("-p", "--totalpages"):
            totalpages = arg
    if keyword is None:
        print(help)
        sys.exit()
    c = crawler(keyword)
    if timeout is not None:
        c.set_timeout(timeout)
    if totalpages is not None:
        print('Fetching ' + totalpages + ' search result pages')
        c.set_total_pages(totalpages)
    print("0 %")
    c.run()
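For reference, a minimal sketch of driving the crawler class directly instead of going through getopt. The keyword and option values below are illustrative, and this assumes the class is available in the current file or namespace (360_crawler.py starts with a digit, so it cannot be imported by a plain import statement):

# Equivalent to: python3 360_crawler.py -k python -t 30 -p 3
c = crawler('python')   # the keyword is URL-quoted inside __init__
c.set_timeout(30)       # per-request timeout, in seconds
c.set_total_pages(3)    # crawl three result pages
c.run()                 # prints progress, then the collected URLs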