**Crawling 我愛我家 (5i5j) rental listings**:

The script builds a queue of paginated listing URLs, then three worker threads pull pages off the queue, parse each listing with lxml XPath, and append the results to information.txt.

**Complete code**:
```python
import math
import re
import threading
import time
from queue import Queue

import requests
from lxml import etree


def request_url(url):
    # Most of the headers dict was stripped when the post was published; the
    # surviving fragment shows a value built from the current timestamp
    # (the 'Cookie' key here is a guess).
    headers = {
        'Cookie': '{}'.format(str(int(time.time()))),
    }
    response = requests.get(url=url, headers=headers)
    content = response.content.decode('utf-8')
    times = 4
    while times > 0:
        if '' in content:  # marker string stripped in the source
            break
        else:
            pattern = re.compile(r'')  # redirect href pattern, stripped in the source
            href = pattern.findall(content)[0]
            # print(href)
            content = request_url(href)
            times -= 1
            print(times)
    return content
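# Design note: the retry loop above apparently handles an anti-crawler
# verification page that carries only a redirect link. request_url() checks a
# marker string to detect a real listing page; otherwise it pulls the redirect
# href out of the page with a regex and fetches that URL, giving up after 4
# attempts. (Both string literals were stripped in the source.)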
class thread_crawl(threading.Thread):
    def __init__(self, name, page_queue):
        # Parameter names are usually kept short, hence "page"
        threading.Thread.__init__(self)
        # Hold a reference to the shared task queue
        self.page_queue = page_queue
        self.name = name

    def run(self):
        # # start time
        # time_start = time.time()
        while True:
            # Stop condition: the queue has been drained
            if self.page_queue.empty():
                break
            else:
                page = self.page_queue.get()
                print(self.name, 'taking task from the queue:', page)
                content = request_url(page)
                self.get_data(content, page)  # pass the page URL so failures can be logged
                print(self.name, 'finished task', page)
        # time_end = time.time()
        # print(self.name, 'finish time:', time_end)
        # print(self.name, 'elapsed:', time_end - time_start)
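    # Caveat: empty() followed by get() is not atomic. With several workers a
    # thread can pass the empty() check and then block forever in get() once
    # another worker drains the queue; get_nowait() with a queue.Empty catch
    # (see the sketch after the listing) avoids that window.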
    def get_data(self, content, page):
        try:
            # print(content)
            tree = etree.HTML(content)
            li_list = tree.xpath('//div[@class="list-con-box"]/ul[@class="plist"]/li')
            print('listings on page:', len(li_list))
            for li in li_list:
                title = li.xpath('.//h3//text()')[1]
                # print('title:', title)
                type = li.xpath('.//div//text()')[13]
                # print('layout:', type)
                apartment = li.xpath('.//div//text()')[15]
                # print('community:', apartment)
                follows_info = li.xpath('.//div//text()')[17]
                # print('follow info:', follows_info)
                price = li.xpath('.//div//text()')[20]
                # print(price)
                # The dict literal was stripped in the source; rebuilt from the
                # fields collected above.
                info_dic = {'title': title, 'type': type, 'apartment': apartment,
                            'follows_info': follows_info, 'price': price}
                print(info_dic)
                with open('information.txt', 'a', encoding='utf-8') as fp:
                    fp.write(str(info_dic) + '\n')
        except Exception:
            # Log the URL of the page that failed to parse
            with open('error.txt', 'a', encoding='utf-8') as fp:
                fp.write(page + '\n')
        # with open('tecent_job.txt', 'a', encoding='utf-8') as fp:
        #     fp.write(info)
if __name__ == '__main__':
    # Task start time
    t_start = time.time()
    url = ''  # listing index URL, stripped in the source
    # 1. Build the task queue holding every page URL
    content = request_url(url)
    tree = etree.HTML(content)
    # num = tree.xpath('//div[@class="total-box nobor"]/span/text()')
    # print(num[0])
    a_list = tree.xpath('//ul[@class="new_di_tab stab"]/a')[1:]
    # print(len(a_list))
    page_queue = Queue()
    for a in a_list:
        link = a.xpath('./@href')[0]
        url_old = '' + link  # site base URL, stripped in the source
        content = request_url(url_old)
        tree = etree.HTML(content)
        num = tree.xpath('//div[@class="total-box nobor"]/span/text()')
        # 30 listings per page, so round up: e.g. 95 listings -> ceil(95/30) = 4 pages
        pages = math.ceil(int(num[0]) / 30)
        for i in range(1, pages + 1):
            url = url_old + 'n' + str(i)
            page_queue.put(url)
    # 2. Spawn the worker threads:
    crawl_name = ['c1', 'c2', 'c3']
    crawl_thread = []
    for name in crawl_name:
        crawl = thread_crawl(name, page_queue)
        crawl.start()
        crawl_thread.append(crawl)
        # join() must not be called inside this loop: that would serialize the
        # workers (c2 would only start once c1 had finished)
    # Block until every worker has finished before the main thread moves on
    for thread in crawl_thread:
        thread.join()
    # Main program end time
    t_end = time.time()
    print(t_end)
    print('Elapsed:', t_end - t_start)
```
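The core technique in the listing is the queue-of-tasks worker pattern: queue.Queue is thread-safe, so several threads can pull URLs from it without explicit locking. Below is a minimal self-contained sketch of just that pattern; the URLs and the do-nothing handler are placeholders, not part of the original script. It also swaps the empty()/get() pair for get_nowait(), which closes the race window noted in the comments above:

```python
import threading
from queue import Queue, Empty


def worker(name, tasks):
    # Pull URLs until the queue is drained; get_nowait() raises queue.Empty
    # instead of blocking, so the exit check and the take are one atomic step.
    while True:
        try:
            page = tasks.get_nowait()
        except Empty:
            break
        print(name, 'processing', page)  # fetch/parse would happen here


tasks = Queue()
for i in range(1, 6):
    tasks.put('https://example.com/zufang/n{}'.format(i))  # placeholder URLs

threads = [threading.Thread(target=worker, args=('c{}'.format(n), tasks))
           for n in range(1, 4)]
for t in threads:
    t.start()
for t in threads:
    t.join()
print('all workers done')
```

With this shape, three workers drain five tasks and exit cleanly even if the queue empties between one worker's check and another's take.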