import requests
from lxml import etree
import json
import csv
class danke(object):
    """Scrape rental-room detail pages and append one CSV row per room.

    Workflow (see ``run``): fetch the paginated listing pages, collect the
    detail-page links found on them, fetch every detail page, extract the
    room fields with XPath and append them to ``room_info1.csv``.

    NOTE(review): this copy of the file had lost several literals (the
    request-header dict, the listing URL prefix and the keys of
    ``dict_new``).  The header dict and URL prefix are therefore exposed as
    optional constructor arguments, and the ``dict_new`` keys below are a
    reconstruction -- confirm them against the original script.
    """

    # Seconds to wait for each HTTP response; without a timeout a stalled
    # connection would block the crawl forever.
    request_timeout = 10

    # XPath expressions, kept byte-for-byte from the original script.
    _LISTING_LINKS = "//div[@class='r_ls_box']/div[@class='r_lbx']/div[@class='r_lbx_cen']/div[@class='r_lbx_cena']/a/@href"
    _PRICE_NUM = "//div[@class='room-price-num']/text()"
    _PRICE_SALE = "//div[@class='room-price-sale']/text()"
    _PLACES = "//div[@class='room-list-box']/div[2]/div[@class='room-list'][3]//label/div/a/text()"
    _ROOM_AREA = "//div[@class='room-detail-box'][1]/div[@class='room-list'][1]/label/text()"
    _ROOM_NUM = "//div[@class='room-detail-box'][1]/div[@class='room-list'][2]/label/text()"
    _ROOM_STYLE = "//div[@class='room-detail-box'][1]/div[@class='room-list'][3]/label/text()"
    _PAY_METHOD = "//div[@class='room-detail-box'][1]/div[@class='room-list'][4]/label/a/text()"
    _FLOOR = "//div[@class='room-list-box']/div[2]/div[@class='room-list'][2]/label/text()"

    def __init__(self, num, base_url='', headers=None):
        """Store the crawl configuration.

        Args:
            num: Number of listing pages to crawl (pages 1..num).
            base_url: Listing URL prefix; the page number is appended to it.
                Defaults to the empty string found in the source.
            headers: Optional dict of HTTP request headers sent with every
                request; defaults to no extra headers.
        """
        self.headers = dict(headers) if headers else {}
        self.base_url = base_url
        self.num = num

    def _text_at(self, query, index=0):
        """Return result number ``index`` of ``query`` on the current page.

        Returns None when the page has fewer than ``index + 1`` matches, so
        a missing field does not abort the whole crawl.
        """
        matches = self.html_doc.xpath(query)
        if index < len(matches):
            return matches[index]
        return None

    def get_url_list(self):
        """Fetch the listing pages and return the detail-page links.

        Returns:
            A list of href strings in first-seen order, without duplicates.
        """
        detail_href_list = []
        seen = set()
        # The entry point asks for the *number* of pages, so the range must
        # include page ``num`` itself.
        for page in range(1, self.num + 1):
            url = self.base_url + str(page)
            response = requests.get(
                url=url, headers=self.headers, timeout=self.request_timeout)
            self.html_doc = etree.HTML(response.text)
            # Collect every detail-page link on this listing page.
            for href in self.html_doc.xpath(self._LISTING_LINKS):
                if href not in seen:
                    seen.add(href)
                    detail_href_list.append(href)
        return detail_href_list

    def room_info(self, href):
        """Extract the room fields from ``self.html_doc``.

        ``self.html_doc`` must already hold the parsed detail page for
        ``href`` (``run`` sets it before calling this method).  Every field
        is stored as an attribute; a field that is absent from the page is
        stored as None.

        Args:
            href: URL of the detail page being processed.

        Returns:
            ``self.dict_new``, a dict of the extracted fields.
        """
        self.href = href
        print(self.href)

        # Monthly rent.
        self.price_num = self._text_at(self._PRICE_NUM)
        if self.price_num is not None:
            print(self.price_num)

        sale = self._text_at(self._PRICE_SALE)
        self.price_sale = None if sale is None else sale.strip() + ' 元/月'
        if self.price_sale is not None:
            print(self.price_sale)

        # Address: up to three parts, joined with spaces.  Missing parts are
        # skipped rather than concatenated (str + None raises TypeError).
        self.places1 = self._text_at(self._PLACES, 0)
        self.places2 = self._text_at(self._PLACES, 1)
        self.places3 = self._text_at(self._PLACES, 2)
        parts = [p for p in (self.places1, self.places2, self.places3)
                 if p is not None]
        self.places = ' '.join(parts) or None
        print(self.places)

        # Floor area.
        self.room_area = self._text_at(self._ROOM_AREA)
        print(self.room_area)

        # Room number.
        self.room_num = self._text_at(self._ROOM_NUM)
        print(self.room_num)

        # Layout.
        style = self._text_at(self._ROOM_STYLE)
        self.room_style = None if style is None else style.strip()
        print(self.room_style)

        # Payment method.
        self.pay_method = self._text_at(self._PAY_METHOD)
        print(self.pay_method)

        # Floor: the label text is "<name>:<value>"; keep the value part.
        floor_label = self._text_at(self._FLOOR)
        floor_parts = [] if floor_label is None else floor_label.split(':')
        self.floor_num = str(floor_parts[1]) if len(floor_parts) > 1 else None
        print(self.floor_num)

        # NOTE(review): reconstructed -- the original dict literal is missing
        # from the source, so these keys and their order are an assumption.
        self.dict_new = {
            'href': self.href,
            'price_num': self.price_num,
            'price_sale': self.price_sale,
            'places': self.places,
            'room_area': self.room_area,
            'room_num': self.room_num,
            'room_style': self.room_style,
            'pay_method': self.pay_method,
            'floor_num': self.floor_num,
        }
        return self.dict_new

    def run(self):
        """Crawl all detail pages and append their fields to the CSV file."""
        # 1. Build the list of detail-page URLs.
        detail_href_list = self.get_url_list()
        if not detail_href_list:
            return

        # 2. Fetch each page, extract its fields and save them.  The file is
        # opened once and closed by the context manager; newline='' is what
        # the csv module requires to avoid blank rows on Windows.  The
        # encoding is left at the platform default so rows appended to an
        # existing file keep that file's encoding.
        with open('room_info1.csv', 'a+', newline='') as csv_file:
            csv_writer = csv.writer(csv_file)
            for href in detail_href_list:
                response = requests.get(
                    url=href, headers=self.headers,
                    timeout=self.request_timeout).text
                self.html_doc = etree.HTML(response)
                self.room_info(href)
                print('=' * 50)
                csv_writer.writerow(self.dict_new.values())
if __name__ == '__main__':
    # Ask how many listing pages to crawl, then run the scraper.  The
    # instance gets its own name so the ``danke`` class is not rebound.
    num = int(input('請輸入爬取頁面數量:'))
    spider = danke(num)
    spider.run()
爬蟲爬取趕集網租房資訊
如下 示例 import scrapy import numpy as np import pandas as pd import matplotlib.pyplot as plt 如下 示例 terminal 終端實現 cd 跳轉到上一層目錄 scrapy startproject booktop...
python爬蟲 爬取小豬網的租房資訊
PyCharm簡介：PyCharm是一種Python IDE，帶有一整套可以幫助使用者在使用Python語言開發時提高其效率的工具，比如除錯、語法高亮、Project管理、程式碼跳轉、智慧提示、自動完成、單元測試、版本控制。此外，該IDE提供了一些高階功能，以用於支援Django框架下的專業Web開發。...
使用多執行緒實現我愛我家租房資訊的爬取
我愛我家的租房 完整 import math import requests from lxml import etree import re from queue import queue import threading import time def request url url heade...