Approach for crawling the Douban site:
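Before the full script, here is a minimal sketch of the core pattern it relies on: a gevent coroutine Pool plus Queues used as a producer/consumer pipeline. All names in this sketch are illustrative, not from the original script.

# Minimal sketch of the gevent Pool + Queue producer/consumer pattern (illustrative only)
from gevent import monkey
monkey.patch_all()
from gevent.pool import Pool
from gevent.queue import Queue

q = Queue()
pool = Pool(size=3)

def producer():
    for i in range(5):
        q.put(i)              # hand work items to the consumer

def consumer():
    for _ in range(5):
        print q.get()         # get() blocks, yielding to other greenlets

pool.spawn(producer)
pool.spawn(consumer)
pool.join()                   # wait for both greenlets to finish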
# coding: utf8
from gevent import monkey
monkey.patch_all()
# Used to pick a random request header; random.choice(list) returns one random element from the list
import random
# urljoin(base_url, result) resolves result against base_url and returns the combined URL
from urlparse import urljoin
import requests
# Used to parse the fetched HTML and extract the information we want
from lxml import etree
from gevent.pool import Pool
# Queues are used for communication between coroutines
from gevent.queue import Queue

# base_url is joined via urljoin with the results we extract to form new URLs
# (the value was stripped during extraction; inferred from the seed URL and the output below)
base_url = 'https://book.douban.com'
# Seed URL
start_url = '/tag/?view=type&icn=index-sorttags-all'
# Parsing rules (the original XPath values were lost in extraction; these are
# plausible reconstructions for Douban's book pages and may need adjusting)
rules = {
    'list_urls': '//table[@class="tagCol"]//a/@href',
    'detail_urls': '//li[@class="subject-item"]//h2/a/@href',
    'page_urls': '//div[@class="paginator"]//a/@href',
    'title': '//h1/span/text()',
}
# Define the queues
# Channel carrying extracted list-page (tag) URLs
list_queue = Queue()
# Channel carrying extracted detail-page URLs
detail_queue = Queue()
# Define the coroutine pool
pool = Pool(size=10)
# Define the User-Agents
user_agent_list = [
    'Mozilla/5.0 (Windows NT 6.1; WOW64; rv:40.0) Gecko/20100101 Firefox/40.1',
    'Mozilla/5.0 (Windows NT 6.1; WOW64; Trident/7.0; AS; rv:11.0) like Gecko',
]
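A quick illustration of what urljoin actually does: it performs relative-reference resolution, not plain string concatenation (the example values are made up):

from urlparse import urljoin

print urljoin('https://book.douban.com', '/tag/?view=type')
# -> https://book.douban.com/tag/?view=type
print urljoin('https://book.douban.com/tag/', 'xiaoshuo')
# -> https://book.douban.com/tag/xiaoshuo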
def fetch(url):
    """Send an HTTP request and fetch the page HTML."""
    user_agent = random.choice(user_agent_list)
    headers = {'User-Agent': user_agent}
    html = requests.get(url, headers=headers).text
    return html
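As written, fetch() has no timeout and silently parses error pages. A more defensive variant (a sketch, not part of the original) might look like:

def fetch_safe(url, timeout=10):
    """Like fetch(), but with a timeout and a status check (illustrative)."""
    headers = {'User-Agent': random.choice(user_agent_list)}
    resp = requests.get(url, headers=headers, timeout=timeout)
    resp.raise_for_status()   # raise on 4xx/5xx instead of parsing an error page
    return resp.text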
def parse(html, rule):
    """Parse the page with an XPath rule."""
    return etree.HTML(html).xpath(rule)
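A quick sanity check of parse() against an inline snippet (an assumed usage example, mirroring the title rule above):

sample = '<html><body><h1><span>Fortress Besieged</span></h1></body></html>'
print parse(sample, '//h1/span/text()')[0]   # -> Fortress Besieged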
def crawl(url):
    """Crawl the home page."""
    html = fetch(url)
    list_urls = parse(html, rules['list_urls'])
    for list_url in list_urls:
        # Join each extracted list_url (the "result" mentioned above) with base_url
        # and put the new URL on the queue
        list_queue.put(urljoin(base_url, list_url))
def list_loop():
    """Consume list-page URLs."""
    while True:
        # Take the next URL off the queue; get() blocks until one is available
        list_url = list_queue.get()
        # spawn(func, arg) schedules func(arg) on the pool,
        # i.e. it keeps calling crawl_list_page(list_url)
        pool.spawn(crawl_list_page, list_url)

def detail_loop():
    """Consume detail-page URLs."""
    while True:
        detail_url = detail_queue.get()
        pool.spawn(crawl_detail_page, detail_url)
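Because both loops run forever, the script never exits on its own. One common refinement (an assumption, not in the original) is a sentinel value that tells a loop to stop:

STOP = object()   # hypothetical shutdown sentinel

def list_loop_with_stop():
    while True:
        list_url = list_queue.get()
        if list_url is STOP:   # a producer put STOP on the queue: shut down
            break
        pool.spawn(crawl_list_page, list_url)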
def crawl_list_page(list_url):
    """Crawl a list page."""
    html = fetch(list_url)
    # Detail pages
    detail_urls = parse(html, rules['detail_urls'])
    for detail_url in detail_urls:
        detail_queue.put(urljoin(base_url, detail_url))
    # Pagination links go back onto the list queue
    list_urls = parse(html, rules['page_urls'])
    for list_url in list_urls:
        list_queue.put(urljoin(base_url, list_url))
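Note that pagination links are re-queued from every list page, so the same URL can be crawled many times. A seen-set helper (illustrative, not in the original) avoids that:

seen = set()   # hypothetical helper to avoid re-crawling the same URL

def enqueue_once(queue, url):
    if url not in seen:
        seen.add(url)
        queue.put(url)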
def crawl_detail_page(detail_url):
    """Crawl a detail page."""
    html = fetch(detail_url)
    title = parse(html, rules['title'])[0]
    print title
def run():
    """Entry point."""
    # 1. Home page
    crawl(start_url)
    # 2. List pages
    pool.spawn(list_loop)
    # 3. Detail pages
    pool.spawn(detail_loop)
    # Start crawling
    pool.join()

if __name__ == '__main__':
    run()
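The script targets Python 2 (print statement, urlparse). Under Python 3, the only import that moves is urljoin (a compatibility note, not from the original):

# Python 3 equivalent of the Python 2 import used above:
from urllib.parse import urljoin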
Run output:
c:\python27\python.exe "c:/study wang/spider/spiderdome.py"
圍城
活著
平凡的世界(全三部)
沉默的大多數
文學回憶錄(全2冊)
送你一顆子彈
白鹿原
繁花
北鳶
**時代