Python 3獲取360搜尋結果的鏈結

2021-09-08 15:57:58 字數 3779 閱讀 9902

#!/usr/bin/python

# coding:utf8

import getopt
import random
import re
import sys
import time
from time import sleep
from urllib.parse import quote

import requests
from bs4 import BeautifulSoup

class crawler:
    """Crawler that scrapes result links from 360 Search (so.com).

    Usage: construct with a keyword, optionally tune ``timeout`` /
    ``total_pages``, then call :meth:`run`.  Collected result URLs are
    accumulated (deduplicated) in ``urls`` and printed at the end.
    """

    url = ''
    # De-duplicated set of scraped result links.  NOTE: class-level
    # attribute, so it is shared by all instances (matches the original
    # design, which only ever creates one crawler).
    urls = set()
    html = ''            # raw HTML of the page fetched last
    total_pages = 5      # how many result pages to crawl in total
    current_page = 0     # how many pages have been fetched so far
    next_page_url = ''   # link to the next result page ('' when none)
    timeout = 60         # per-request timeout, seconds
    p1 = 0               # progress step: percent per page
    i1 = 1               # progress counter (pages reported)
    # Browser-mimicking request headers.  The original dict value was lost
    # when the code was pasted into the blog; a typical set is restored
    # here -- TODO confirm against the original script.
    headersparameters = {
        'Connection': 'Keep-Alive',
        'Accept': 'text/html, application/xhtml+xml, */*',
        'Accept-Language': 'en-US,en;q=0.8,zh-Hans-CN;q=0.5,zh-Hans;q=0.3',
        'User-Agent': ('Mozilla/5.0 (Windows NT 10.0; WOW64) '
                       'AppleWebKit/537.36 (KHTML, like Gecko) '
                       'Chrome/64.0.3282.140 Safari/537.36'),
    }

    def __init__(self, keyword):
        print("正在獲取網頁鏈結中......")
        # Base search URL; the host part was stripped by the blog paste and
        # is restored here -- TODO confirm query parameters.
        self.url = u'https://www.so.com/s?q=' + quote(keyword)

    def set_timeout(self, time):
        """Set the per-request timeout in seconds (silently keeps the old
        value when *time* is not a valid integer)."""
        try:
            self.timeout = int(time)
        except (TypeError, ValueError):
            pass

    def set_total_pages(self, num):
        """Set how many result pages to crawl; also derives the progress
        step ``p1`` (percent per page).  Invalid input is ignored."""
        try:
            self.total_pages = int(num)
            self.p1 = int(100 / self.total_pages)
        except (TypeError, ValueError, ZeroDivisionError):
            pass

    def set_current_url(self, url):
        """Set the URL that the next fetch will hit."""
        self.url = url

    def switch_url(self):
        """Advance to the next result page, or exit when there is none."""
        if self.next_page_url == '':
            sys.exit()
        else:
            self.set_current_url(self.next_page_url)

    def is_finish(self):
        """Return True once the requested number of pages has been fetched."""
        return self.current_page >= self.total_pages

    def get_html(self):
        """Fetch the current URL into ``self.html``.

        On connection failure, sleeps 5 seconds and retries indefinitely.
        A non-200 response leaves ``self.html`` empty and logs an error.
        """
        success = False
        while not success:
            try:
                r = requests.get(self.url, timeout=self.timeout,
                                 headers=self.headersparameters)
            except requests.exceptions.ConnectionError:
                sleep(5)
            else:
                success = True
        if r.status_code == 200:
            self.html = r.text
            self.current_page += 1
        else:
            self.html = u''
            print('[error]', self.url, u'get此url返回的http狀態碼不是200')

    def get_urls(self):
        """Parse ``self.html``: collect result links into ``self.urls`` and
        find the link to the next result page."""
        bsobj = BeautifulSoup(self.html, "html.parser")
        # Result entries are <h3 class="res-title "> elements; prefer the
        # real target stored in data-url over the redirect href.
        list_h3 = bsobj.find_all("h3", "res-title ")
        for h3 in list_h3:
            if "data-url" in h3.a.attrs:
                self.urls.add(h3.a.attrs["data-url"])
            else:
                self.urls.add(h3.a.attrs["href"])
        # The pager links look like href="/s?q=..."; the last one on the
        # page is the "next page" link.
        next = re.findall(r' href="(/s\?q=[\w\d%&=_-]*?)"', self.html)
        if len(next) > 0:
            # Pager hrefs are site-relative; prefix the host (restored,
            # it was stripped by the blog paste -- TODO confirm).
            self.next_page_url = 'https://www.so.com' + next[-1]
        else:
            self.next_page_url = ''

    def print_urls(self):
        """Print every collected result URL, one per line."""
        for url in self.urls:
            print(url)

    def run(self):
        """Main loop: fetch/parse pages until done, reporting progress and
        pausing a random 6-20 s between pages to look less like a bot."""
        while not self.is_finish():
            self.get_html()
            self.get_urls()
            self.switch_url()
            print(str(self.p1 * self.i1) + " %")
            if not self.is_finish():
                time.sleep(random.randint(6, 20))
            self.i1 += 1
        # int(100 / total_pages) * pages may fall short of 100; top it up.
        if self.p1 * self.i1 < 100:
            print("100 %")
        self.print_urls()
        print("完畢......")

if __name__ == '__main__':
    # Placeholders <keyword>/<timeout>/<totalpages> were eaten by the
    # blog's HTML stripping; restored here -- TODO confirm exact wording.
    help = '360_crawler.py -k <keyword> [-t <timeout> -p <totalpages>]'
    keyword = None
    timeout = None
    totalpages = None
    try:
        opts, args = getopt.getopt(sys.argv[1:], "hk:t:p:")
    except getopt.GetoptError:
        print(help)
        sys.exit(2)
    # Parse command-line options.
    for opt, arg in opts:
        if opt == '-h':
            print(help)
            sys.exit()
        elif opt in ("-k", "--keyword"):
            keyword = arg
        elif opt in ("-t", "--timeout"):
            timeout = arg
        elif opt in ("-p", "--totalpages"):
            totalpages = arg
    # The keyword is mandatory; everything else has a default.
    if keyword is None:
        print(help)
        sys.exit()
    c = crawler(keyword)
    if timeout is not None:
        c.set_timeout(timeout)
    if totalpages is not None:
        print('獲取' + totalpages + '個搜尋結果頁面')
        c.set_total_pages(totalpages)
    print("0 %")
    c.run()

python3獲取請求cookie

github鏈結 使用python庫直接獲取cookie defget cookie url param url 請求連線應保證伺服器有set cookie寫入操作 return 請求負載 data 請求頭 headers try 宣告乙個cookiejar物件例項來儲存cookie cookie ...

Python3 獲取ajax 返回內容

使用工具 瀏覽器chrome 或者firefox python3 什麼樣的頁面是使用了ajax技術?留給你回答.其中method有post和get兩種方法。區別就是get,那麼request url就是這個鏈結的返回值,使用json格式輸出 如果是post,則request url返回一般是空,aj...

Python3爬蟲連續獲取Cookies的方法

第一次獲取cookies headers url response requests.get url cookies response.cookies.get dict print cookies 第二次獲取cookies 跳過ssl驗證證書 import ssl 設定忽略ssl驗證 宣告乙個coo...