Define the decorator function

import queue
import threading
import requests
from lxml import etree
from requests.exceptions import RequestException

def run_forever(func):
    # Wrap a worker method so it runs in an endless loop on the same instance.
    def wrapper(obj):
        while True:
            func(obj)
    return wrapper
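To see the decorator in isolation, here is a minimal sketch; the Demo class and the three-call cut-off are invented for illustration and are not part of the crawler:

class Demo:
    def __init__(self):
        self.n = 0

    @run_forever
    def tick(self):
        self.n += 1
        print('tick', self.n)
        if self.n == 3:
            raise SystemExit  # only here so the demo terminates

Demo().tick()  # prints tick 1..3, then exits

A method wrapped this way never returns on its own, which is exactly what a long-lived worker thread wants.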
Class initialization

class get_qiushibaike:
    def __init__(self, page):
        self.max_page = page
        # The base URL string was lost in the original post; this value is an
        # assumption based on the site being crawled (Qiushibaike).
        self.url_head = 'https://www.qiushibaike.com'
        self.url_mid = 'text/page/'
        self.url_detail = '/'
        self.count = 0
        self.url_queue = queue.Queue()              # queue of listing-page URLs
        self.get_url_content_queue = queue.Queue()  # queue of individual article URLs
        self.url_queue_all = queue.Queue()
        self.page_url_list = []
Define the class methods

    def add_url_to_queue(self):
        # Enqueue every listing page; range(1, max_page) covers pages 1 to max_page - 1.
        for i in range(1, self.max_page):
            self.url_queue.put(self.url_head + self.url_detail + self.url_mid + str(i) + self.url_detail)
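For i = 1 this builds url_head + '/' + 'text/page/' + '1' + '/', i.e. https://www.qiushibaike.com/text/page/1/ under the base URL assumed above.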
    @run_forever
    def get_page_url_to_list(self):
        url = self.url_queue.get()
        response = requests.get(url)
        if response.status_code != 200:
            # The fetch failed, so push the URL back for another attempt.
            self.url_queue.put(url)
            print('url {} failed, re-queued'.format(url))
        else:
            html = etree.HTML(response.text)
            url_list = html.xpath('//a[@class="contentherf"]/@href')
            for url in url_list:
                self.url_queue_all.put(self.url_head + url)
        self.url_queue.task_done()
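To sanity-check the XPath above, a standalone sketch; the sample markup and href are invented, and the class name contentherf is copied from the post as-is, apparent typo included:

from lxml import etree

sample = '<div><a class="contentherf" href="/article/123">a joke</a></div>'
html = etree.HTML(sample)
print(html.xpath('//a[@class="contentherf"]/@href'))  # ['/article/123']

Because the extracted hrefs are site-relative, the method prefixes them with url_head before queuing.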
    @run_forever
    def get_url_to_content_queue(self):
        # Hand article URLs from the discovery queue to the fetch queue.
        url = self.url_queue_all.get()
        print(url)
        self.get_url_content_queue.put(url)
        self.url_queue_all.task_done()
    @run_forever
    def get_content(self):
        url = self.get_url_content_queue.get()
        try:
            response = requests.get(url, timeout=1)
            if response.status_code != 200:
                self.get_url_content_queue.put(url)
            else:
                html = etree.HTML(response.text)
                title = html.xpath('//h1[@class="article-title"]/text()')
                contents = html.xpath('//div[@class="content"]/text()')
                # Append the article's title and body to a local text file.
                with open('qiushi.txt', 'a', encoding='utf8') as p:
                    for x in title:
                        p.write('title: ' + x)
                        p.write('\n')
                    for i in contents:
                        p.write(i + '\n')
                    p.write('\n')
                response.close()
                self.count += 1
                # The original progress message was lost in extraction.
                print('saved article {}'.format(self.count))
            self.get_url_content_queue.task_done()
        except RequestException:
            print('url trouble: {}'.format(url))
            self.get_url_content_queue.put(url)
            self.get_url_content_queue.task_done()  # balance the get() so join() can finish
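One subtlety: every get() from a queue must eventually be matched by a task_done(), otherwise the join() calls in run() below block forever. That is why the except branch marks the task done even as it re-queues the URL; the fresh put() keeps the retry pending.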
    def run_sue_more_task(self, func, count=1):
        # Spawn `count` daemon threads, each running `func` (which loops forever).
        for i in range(0, count):
            t = threading.Thread(target=func)
            t.daemon = True
            t.start()
    def run(self):
        self.add_url_to_queue()
        self.run_sue_more_task(self.get_page_url_to_list, 3)
        self.run_sue_more_task(self.get_url_to_content_queue, 3)
        self.run_sue_more_task(self.get_content, 5)
        self.url_queue.join()
        self.get_url_content_queue.join()
        self.url_queue_all.join()
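The termination logic rests on a standard pairing of daemon threads and Queue.join(): every put() increments a queue's unfinished-task counter, every task_done() decrements it, and join() blocks until it reaches zero, after which the main thread exits and takes the daemon workers with it. A minimal standalone sketch of that pattern (names are illustrative):

import queue
import threading

q = queue.Queue()
for i in range(3):
    q.put(i)  # each put() bumps the unfinished-task counter

def worker():
    while True:
        item = q.get()
        print('processed', item)
        q.task_done()  # each task_done() decrements the counter

t = threading.Thread(target=worker)
t.daemon = True  # dies with the main thread, like the crawler's workers
t.start()
q.join()  # returns once the counter is zero; the program then exits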
Create an instance and call its methods

if __name__ == '__main__':
    # ps: crawling carries risk, so mind the IP ban; threads are a moment of
    # joy, but a banned account sends you straight to the crematorium.
    qbs = get_qiushibaike(12)
    qbs.run()
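Run as-is, get_qiushibaike(12) enqueues listing pages 1 through 11 (range stops short of max_page), and every article found on them is appended to qiushi.txt in the working directory.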