多執行緒抓取英雄聯盟全皮膚並儲存

2021-10-01 04:20:36 字數 1972 閱讀 4858

import os
import threading
import time
from queue import Queue

import requests
import wget
from lxml import etree

# Work queue of listing-page URLs shared by every worker thread.
urlqueue = Queue()

# Number of worker threads to start.
threads_num = 100

# Started worker threads, kept so they can be joined at shutdown.
threads = []

# Receives the generated listing-page URLs.
urlist = []

# Image URLs.
# NOTE(review): appears unused at module level — confirm before removing.
skin = []

def geturl(urllist, base_url="", first_page=1, last_page=42):
    """Append the listing-page URLs to *urllist* and return it.

    One URL of the form ``base_url + str(page) + ".shtml"`` is generated for
    every page number from *first_page* to *last_page*, inclusive.

    NOTE(review): ``base_url`` defaults to an empty string because no site
    prefix is present in this listing; supply the real prefix before crawling.

    Args:
        urllist: List that receives the generated URLs (mutated in place).
        base_url: Text placed in front of the page number.
        first_page: First page number to generate.
        last_page: Last page number to generate (inclusive).

    Returns:
        The same *urllist* object, extended with the page URLs.
    """
    urllist.extend(
        base_url + str(page) + ".shtml"
        for page in range(first_page, last_page + 1)
    )
    return urllist

def download(url, timeout=10):
    """Fetch *url* and return the response body as text.

    Args:
        url: Address of the page to fetch.
        timeout: Seconds to wait for the server before giving up.

    Returns:
        The decoded response text, or ``None`` when the request fails or the
        server answers with an error status.
    """
    try:
        r = requests.get(url, timeout=timeout)
        # Turn 4xx/5xx answers into exceptions so they are reported below.
        r.raise_for_status()
    except requests.RequestException as exc:
        print("抓取失敗", url, exc)
        return None
    print(r.url)
    return r.text


def tree_parser(html):
    """Extract the image addresses from a listing page.

    Args:
        html: Page markup as returned by ``download``; may be ``None`` or
            empty when the fetch failed.

    Returns:
        A list with the ``src`` attribute of every image matched by the
        ``//li[@class="boxshadow"]/a/img`` XPath; an empty list when there is
        nothing to parse or parsing fails.
    """
    if not html:
        return []
    try:
        tre = etree.HTML(html)
        if tre is None:
            return []
        return tre.xpath('//li[@class="boxshadow"]/a/img/@src')
    except (etree.LxmlError, ValueError, TypeError) as exc:
        print("分析失敗", exc)
        return []


def downloads(save_dir="d:\\skin"):
    """Worker loop: take page URLs off ``urlqueue`` and save their images.

    Runs until it receives ``None`` from the queue. For every other item it
    downloads the page, extracts the image addresses and stores each image in
    *save_dir* under the last path segment of its address.

    Args:
        save_dir: Directory the images are written to.
    """
    try:
        os.makedirs(save_dir, exist_ok=True)
    except OSError as exc:
        # Keep the worker alive so the queue is still drained; the individual
        # saves below will report their own failures.
        print("儲存失敗", save_dir, exc)

    while True:
        url = urlqueue.get()  # take one URL
        if url is None:
            # Shutdown sentinel.
            break
        try:
            for src in tree_parser(download(url)):
                filename = src.split("/")[-1]
                try:
                    wget.download(src, os.path.join(save_dir, filename))
                except Exception as exc:
                    # Thread boundary: one bad image must not stop the worker.
                    print("儲存失敗", src, exc)
        finally:
            # Always acknowledge the item, otherwise urlqueue.join() in the
            # main thread would wait forever after an unexpected error.
            urlqueue.task_done()
        print("剩餘:", urlqueue.qsize())

def main():
    """Queue every listing page, run the worker threads, then shut them down.

    Fills ``urlqueue`` with the URLs produced by ``geturl``, starts
    ``threads_num`` threads running ``downloads``, waits until every queued
    page has been acknowledged, and finally sends one ``None`` per worker so
    each loop exits before the threads are joined.
    """
    # Collect all page links and queue them.
    for page_url in geturl(urlist):
        urlqueue.put(page_url)

    # Start the workers and remember them so they can be joined later.
    for _ in range(threads_num):
        worker = threading.Thread(target=downloads)
        worker.start()
        threads.append(worker)

    # Block until every queued page has been marked done.
    urlqueue.join()

    # Send one None sentinel per worker to end its loop.
    for _ in range(threads_num):
        urlqueue.put(None)
    for worker in threads:
        worker.join()
    print("結束程式")


if __name__ == "__main__":
    starttime = time.time()
    main()
    times = time.time() - starttime
    print(times)

Python爬英雄聯盟lol全英雄皮膚

1.匯入所需模組 import requests import os 建立lol資料夾 os.mkdir lol 2.讀取js檔案,獲取英雄id hero id url response requests.get url,headers headers json list response.json...

python爬蟲 爬取英雄聯盟全英雄皮膚

import requests import re 1 分析目標網頁,確定爬取的url路徑,headers引數 base url headers 2 傳送請求 response requests.get base url,headers headers base data response.json...

LOL全英雄皮膚爬蟲

coding utf 8 import requests import re import os class lolspider def init self 定義一個 user agent,偽裝成瀏覽器 self.headers def getresponse self,url 傳送請求,獲取響應 ur...