I suddenly felt like testing how much multithreading speeds up a crawler, so I wrote a multithreaded scraper for Zhaopin (智聯). Since the amount of data crawled is small, the speedup is not very noticeable. Everything is commented, so it should be easy to follow.
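Note that this excerpt does not show the script's import section. Based on the calls that appear in the code (requests, RequestException, json, csv, tqdm, Queue, Thread, datetime), the imports below are a reconstruction of what the script needs, not necessarily the author's exact header.

# Reconstructed import block (the original post's header is not shown here);
# inferred from the calls used in the code below.
import csv
import json
import datetime

import requests
from requests.exceptions import RequestException
from tqdm import tqdm
from queue import Queue
from threading import Thread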
def get_one_page(city, keyword, region, page):
    '''Fetch one result page and return its body as text.'''
    # NOTE: the original post built `url` (a query string from city/keyword/region/page)
    # and `headers` right here; that part is missing from this excerpt.
    try:
        response = requests.get(url, headers=headers)
        # use the status code to decide whether the request succeeded
        if response.status_code == 200:
            return response.text
        return None
    except RequestException:
        return None
def parse_one_page(html):
    '''Parse the response, extract the useful fields and yield one row per job.'''
    # the response body is JSON, so it is parsed with json.loads
    # (the original comment mentioned regular expressions, but no regex is used here)
    # print(html)
    items = json.loads(html)["data"]["results"]
    if len(items):
        for item in items:
            job_name = item['jobname']
            # keep only positions whose title mentions "爬蟲" or "python"
            if u"爬蟲" in job_name or u"python" in job_name:
                company = item['company']['name']
                # website = item['company']['url']
                salary = item['salary']
                empltype = item['empltype']
                city = item['city']['display']
                edulevel = item['edulevel']['name']
                company_size = item['company']['size']['name']
                company_type = item['company']['type']['name']
                workingexp = item['workingexp']['name']
                # yield one row keyed by the CSV headers defined in main()
                yield {
                    u'職位名稱': job_name,
                    u'公司': company,
                    u'工資': salary,
                    u'工作型別': empltype,
                    u'工作城市': city,
                    u"學歷": edulevel,
                    u"工作經驗": workingexp,
                    u'公司型別': company_type,
                    u'公司人數': company_size,
                }
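One thing worth noting: get_one_page returns None when a request fails, and json.loads(None) would then raise a TypeError inside parse_one_page. A small defensive wrapper, which is my own addition and not part of the original post, could look like this:

# Defensive wrapper (not in the original post): returns an empty list when the
# download failed, so json.loads(None) is never attempted.
def parse_page_safely(html):
    if html is None:
        return []
    return list(parse_one_page(html))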
def write_csv_file(path, headers, rows):
    '''Write the header row and the data rows to a CSV file.'''
    # encoding='gb18030' avoids errors when writing Chinese text
    # newline='' prevents an extra blank line after every row
    with open(path, 'a', encoding='gb18030', newline='') as f:
        f_csv = csv.DictWriter(f, headers)
        f_csv.writeheader()
        f_csv.writerows(rows)

def write_csv_headers(path, headers):
    '''Write only the header row.'''
    with open(path, 'a', encoding='gb18030', newline='') as f:
        f_csv = csv.DictWriter(f, headers)
        f_csv.writeheader()

def write_csv_rows(path, headers, rows):
    '''Write only the data rows.'''
    with open(path, 'a', encoding='gb18030', newline='') as f:
        f_csv = csv.DictWriter(f, headers)
        f_csv.writerows(rows)
def main(city, keyword, region, pages):
    '''Main routine: crawl `pages` result pages and write them to a CSV file.'''
    filename = 'zl_' + city + '_' + keyword + '.csv'
    headers = [u'職位名稱', u'公司', u'工資', u'工作型別', u'工作城市',
               u"學歷", u"工作經驗", u'公司型別', u'公司人數']
    write_csv_headers(filename, headers)
    for i in tqdm(range(1, pages)):
        # collect every position on this page, then append the batch to the CSV file
        jobs = []
        html = get_one_page(city, keyword, region, i)
        items = parse_one_page(html)
        for item in items:
            jobs.append(item)
        write_csv_rows(filename, headers, jobs)
# city_lsit is the task list (one dict per city/keyword/pages combination)
city_lsit = [
    # the task dicts were stripped from this excerpt; each one looks like
    # {"city": ..., "keyword": ..., "pages": ...}
]

# num is the total number of worker threads
q = Queue()
num = 5

# the actual handler: processes a single task
def do_somthing_using(arguments):
    main(arguments["city"], arguments["keyword"], 2005, int(arguments["pages"]))

# worker thread: keeps taking tasks from the queue and processing them
def working():
    while True:
        arguments = q.get()
        do_somthing_using(arguments)
        # sleep(1)
        q.task_done()

# start num worker threads waiting on the queue
print("開始", datetime.datetime.now())
for i in range(num):
    t = Thread(target=working)
    t.setDaemon(True)
    t.start()

# put every entry of city_lsit into the queue
for i in city_lsit:
    q.put(i)

# wait for all jobs to finish
q.join()
print("結束", datetime.datetime.now())