基礎篇 7:多執行緒、物件導向實現爬蟲

2021-09-25 17:07:30 字數 2302 閱讀 7390

import os
import random
import re
import threading
import time
from queue import Empty, Queue  # thread-safe FIFO queue shared by the worker threads
from urllib import request

import requests
from lxml import etree

"""多執行緒的queue就是執行緒安全的,所有我們不用考慮鎖的問題

"""class procuder(threading.thread):

"""生成者繼承threading.thread需要實現__init__方法和run()

"""headers =

def __init__(self,url_queue,img_queue,*args,**kwargs):

""":param url_queue: url位址佇列

:param img_queue: 位址佇列

:param args: threading.thread類的元組引數

:param kwargs: threading.thread的字典引數

"""super(procuder,self).__init__(*args,**kwargs)

self.url_queue =url_queue

self.img_queue =img_queue

def run(self):

while true:

if self.url_queue.empty():

break

url =self.url_queue.get() # 從佇列中獲取乙個url

self.parse_page(url)

def parse_page(self,url):

response = requests.get(url,headers=self.headers)

if response.status_code==200:

text = response.text

html = etree.html(text)

imgs = html.xpath("//div[@class='page-content text-center']//img[@class!='gif']")

for img in imgs:

img_url = img.get("data-original")

alt = img.get("alt")

alt = re.sub(r'[\.\*\?。?!!,,]',"",alt)

suffix = os.path.splitext(img_url)[1]

file_name = alt +suffix

self.img_queue.put((img_url,file_name))

class consumer(threading.Thread):
    """Consumer thread: download queued images into a local directory.

    Each instance repeatedly takes an ``(img_url, file_name)`` tuple from
    ``img_queue`` and saves the image under ``save_dir``. The thread stops
    when no image has arrived for ``poll_timeout`` seconds and ``url_queue``
    is empty. This is a heuristic: a page that takes longer than
    ``poll_timeout`` to be parsed after the last URL was handed out can
    still be missed.
    """

    # Directory the images are written to (created on first download).
    save_dir = "images"

    # Seconds to wait for a new image before re-checking whether the page
    # queue has been drained.
    poll_timeout = 3

    def __init__(self, url_queue, img_queue, *args, **kwargs):
        """Create a consumer bound to the shared queues.

        :param url_queue: queue of page URLs; only inspected to decide when
            to stop
        :param img_queue: queue supplying ``(img_url, file_name)`` tuples
        :param args: positional arguments forwarded to ``threading.Thread``
        :param kwargs: keyword arguments forwarded to ``threading.Thread``
        """
        super().__init__(*args, **kwargs)
        self.url_queue = url_queue
        self.img_queue = img_queue

    def run(self):
        """Download images until both queues have run dry."""
        while True:
            # A timed get() replaces "empty() then get()": with the plain
            # blocking get() a thread that lost the race for the last item
            # would wait forever and keep the process alive.
            try:
                img_url, filename = self.img_queue.get(timeout=self.poll_timeout)
            except Empty:
                if self.url_queue.empty():
                    break
                continue
            self._download(img_url, filename)

    def _download(self, img_url, filename):
        """Save one image, reporting (not raising) download failures."""
        os.makedirs(self.save_dir, exist_ok=True)
        target = os.path.join(self.save_dir, filename)
        try:
            request.urlretrieve(img_url, target)
        except (OSError, ValueError) as exc:
            # URLError/HTTPError are OSError subclasses; ValueError covers
            # URLs urllib cannot interpret. One bad image must not stop the
            # thread.
            print("failed to download {}: {}".format(img_url, exc))

def main(url_template="", first_page=1, last_page=100,
         producer_count=5, consumer_count=10):
    """Queue the listing pages and run the producer/consumer threads.

    :param url_template: ``str.format`` template for a listing page URL; it
        is formatted with the page number as its only positional argument.
        NOTE(review): the URL literal in this file is empty, so the real
        address has to be supplied by the caller.
    :param first_page: number of the first page to crawl (inclusive)
    :param last_page: number of the last page to crawl (inclusive)
    :param producer_count: how many page-parsing threads to start
    :param consumer_count: how many image-downloading threads to start
    :raises ValueError: if ``url_template`` is empty
    """
    if not url_template:
        # Fail fast: with an empty URL every request would fail inside the
        # worker threads and the consumers would never receive any work.
        raise ValueError("url_template must be a non-empty page URL template")

    # Unbounded, so that filling it before any worker runs can never block.
    url_queue = Queue()
    img_queue = Queue(1000)

    for page in range(first_page, last_page + 1):
        url_queue.put(url_template.format(page))

    workers = [procuder(url_queue, img_queue) for _ in range(producer_count)]
    workers += [consumer(url_queue, img_queue) for _ in range(consumer_count)]

    for worker in workers:
        worker.start()
    # Wait for every thread so callers know the crawl is finished on return.
    for worker in workers:
        worker.join()

if __name__ == "__main__":
    # Start the crawler only when this file is executed as a script,
    # not when it is imported.
    main()

python 多執行緒偽 python3對多執行緒處理

importthreadingimporttimedeft1 job print t1 start for i in range 10 print begin sleep 0.1s time.sleep 0.1 print t1 finish deft2 job print t2 start pri...

C++11 多執行緒 future

std::promise 類介紹:promise 物件可以儲存某一型別 T 的值,該值可被 future 物件讀取(可能在另外一個執行緒中),因此 promise 也提供了一種執行緒同步的手段。在 promise 物件構造時可以和一個共享狀態(通常是 std::future)相關聯,並可以在相關聯的共享狀態...

python捕捉執行緒錯誤 python 多執行緒錯誤

我想用多執行緒查詢資料庫,然後進行資料操作。list range 19999,100000 pool threadpool 10 results pool.map main,list pool.close pool.join def main i print i query id,link,keyw...