Crawling the Whole Douban Site with Coroutines


The idea behind crawling Douban: fetch the seed tag-index page, push the extracted tag (list) page URLs onto one queue, have workers pull list pages off that queue and push detail-page URLs and pagination links onto a second queue, and let a bounded pool of gevent coroutines drain both queues concurrently. The full script (Python 2):

# coding: utf8
from gevent import monkey
# patch the standard library so blocking socket calls (and hence requests)
# yield to other greenlets instead of blocking the whole process
monkey.patch_all()

# random.choice(list) returns one element of the list at random;
# used below to rotate the request User-Agent
import random

# urljoin(base_url, result) combines a base URL with a (possibly relative)
# reference to produce a new absolute URL -- see the short demo after the listing
from urlparse import urljoin

import requests

# used to parse the fetched HTML and extract the information we want
from lxml import etree

from gevent.pool import Pool

# Queue provides the communication channel between coroutines
# (a stripped-down demo of this queue/pool pattern follows the sample output)
from gevent.queue import Queue

# base_url is combined (via urljoin) with each extracted result to build new URLs.
# NOTE: the published post lost this value; 'https://book.douban.com' is an
# assumption consistent with the book titles in the sample output below.
base_url = 'https://book.douban.com'

# seed URL (the domain was likewise stripped from the post, so we re-attach it)
start_url = urljoin(base_url, '/tag/?view=type&icn=index-sorttags-all')

# parsing rules
# NOTE: the original XPath values were lost when the post was scraped;
# the rules below are plausible reconstructions for the Douban Books pages.
rules = {
    'list_urls': "//table[@class='tagCol']//td/a/@href",       # tag (list) pages
    'detail_urls': "//li[@class='subject-item']//h2/a/@href",  # book detail pages
    'page_urls': "//div[@class='paginator']/a/@href",          # pagination links
    'title': "//div[@id='wrapper']/h1/span/text()",            # book title
}

# queues
# channel carrying list-page URLs between coroutines
list_queue = Queue()
# channel carrying detail-page URLs between coroutines
detail_queue = Queue()

# coroutine pool
pool = Pool(size=10)

# User-Agent pool
user_agent_list = [
    'Mozilla/5.0 (Windows NT 6.1; WOW64; rv:40.0) Gecko/20100101 Firefox/40.1',
    'Mozilla/5.0 (Windows NT 6.1; WOW64; Trident/7.0; AS; rv:11.0) like Gecko',
]

def fetch(url):
    """Issue an HTTP request and return the page body."""
    user_agent = random.choice(user_agent_list)
    headers = {'User-Agent': user_agent}
    html = requests.get(url, headers=headers).text
    return html

def parse(html, rule):
    """Parse a page with the given XPath rule."""
    return etree.HTML(html).xpath(rule)

def crawl(url):
    """Crawl the index (seed) page."""
    html = fetch(url)
    list_urls = parse(html, rules['list_urls'])
    for list_url in list_urls:
        # join each extracted list_url (the "result") with base_url
        # and push the absolute URL onto the queue
        list_queue.put(urljoin(base_url, list_url))

def list_loop():
    """Consume list pages."""
    while True:
        # pull the next URL off the queue (get() blocks while it is empty)
        list_url = list_queue.get()
        # spawn(func, arg) schedules func(arg) on the pool, i.e. this keeps
        # calling crawl_list_page(list_url) as URLs arrive
        pool.spawn(crawl_list_page, list_url)

def detail_loop():
    """Consume detail pages."""
    while True:
        detail_url = detail_queue.get()
        pool.spawn(crawl_detail_page, detail_url)

def crawl_list_page(list_url):
    """Crawl one list page."""
    html = fetch(list_url)
    # detail pages
    detail_urls = parse(html, rules['detail_urls'])
    for detail_url in detail_urls:
        detail_queue.put(urljoin(base_url, detail_url))
    # pagination: feed follow-up list pages back into the list queue
    list_urls = parse(html, rules['page_urls'])
    for list_url in list_urls:
        list_queue.put(urljoin(base_url, list_url))

def crawl_detail_page(detail_url):
    """Crawl one detail page."""
    html = fetch(detail_url)
    title = parse(html, rules['title'])[0]
    print title

def run():
    """Wire the stages together and start crawling."""
    # 1. index page: seed the list queue
    crawl(start_url)
    # 2. list pages
    pool.spawn(list_loop)
    # 3. detail pages
    pool.spawn(detail_loop)
    # wait on the pool (the loops above never return, so this blocks
    # until the process is interrupted)
    pool.join()

if __name__ == '__main__':
    run()
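A quick aside on urljoin, since the comment in the listing glosses it as base_url + result: it actually resolves a (possibly relative) reference against a base URL. A minimal sketch with illustrative values (the URLs below are examples, not output from the crawler):

from urlparse import urljoin  # urllib.parse on Python 3

base_url = 'https://book.douban.com'
# a root-relative path, as extracted from the tag index
print(urljoin(base_url, '/tag/?view=type'))
# -> https://book.douban.com/tag/?view=type
# an already-absolute reference is returned unchanged
print(urljoin(base_url, 'https://book.douban.com/subject/1084336/'))
# -> https://book.douban.com/subject/1084336/

On Python 3 the script itself needs only two matching changes: import urljoin from urllib.parse and call print(title) as a function; the gevent, requests, and lxml calls are the same on both versions.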

Running the full script produces output like:

C:\Python27\python.exe "C:/study  wang/spider/spiderdome.py"
圍城
活著
平凡的世界(全三部)
沉默的大多數
文學回憶錄(全2冊)
送你一顆子彈
白鹿原
繁花
北鳶
**時代
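The core of the design is just a producer/consumer pipeline: Queue carries work between stages and Pool bounds how many greenlets run at once. A minimal, self-contained sketch of that pattern, with made-up work items standing in for URLs:

from gevent.pool import Pool
from gevent.queue import Queue, Empty

pool = Pool(size=3)
work_queue = Queue()

def worker(item):
    print('handled %s' % item)

def loop():
    # mirrors list_loop/detail_loop: block on get(), hand each item to the pool;
    # unlike the crawler we pass a timeout so the demo can finish on its own
    while True:
        try:
            item = work_queue.get(timeout=1)
        except Empty:
            return
        pool.spawn(worker, item)

for i in range(5):   # producer side, like crawl() seeding list_queue
    work_queue.put(i)
pool.spawn(loop)
pool.join()

In the full crawler the loops call get() with no timeout, so they never return once the queues drain and pool.join() keeps the process alive until it is interrupted; passing a timeout as above is one way to let such a loop exit cleanly.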
