# coding=utf-8
import requests
from fake_useragent import UserAgent
from bs4 import BeautifulSoup
import json
import csv
import time

# Build the request headers with a random User-Agent
ua = UserAgent()
headers = {'User-Agent': ua.random}

# A list that collects one dict per listing
data_list = []
def start_spider(page):
    # Allow up to 15 retries on failed connections
    requests.adapters.DEFAULT_RETRIES = 15
    s = requests.Session()
    # Disable keep-alive so each request opens a fresh connection
    s.keep_alive = False
    # URL to scrape; by default this targets Lianjia's second-hand listings
    # for Nanjing (the original post leaves the URL blank; the pattern below
    # is Lianjia's standard paged-listing URL, filled in here as an assumption)
    url = 'https://nj.lianjia.com/ershoufang/pg{}/'.format(page)
    # Request the page through the session configured above
    resp = s.get(url, headers=headers, timeout=10)
    # Parse the response body with BeautifulSoup
    soup = BeautifulSoup(resp.content, 'lxml')
    # Select every listing <li> from the result list
    sell_list_content = soup.select('.sellListContent li.LogClickData')
    # Iterate over the listings
    for sell in sell_list_content:
        try:
            # Listing title
            title = sell.select('div.title a')[0].string
            # Grab the whole houseInfo div first, then extract each field
            house_info = list(sell.select('div.houseInfo')[0].stripped_strings)
            # Estate (xiaoqu) name -- the first text node in the div
            loupan = house_info[0]
            # The second text node holds the '|'-separated details
            info = house_info[1].split('|')
            # Layout, e.g. 2室1廳 (info[0] is the empty piece before the first '|')
            house_type = info[1].strip()
            # Floor area
            area = info[2].strip()
            # Orientation
            toward = info[3].strip()
            # Renovation type
            renovation = info[4].strip()
            # Address / position info
            positioninfo = ''.join(list(sell.select('div.positionInfo')[0].stripped_strings))
            # Total price
            totalprice = ''.join(list(sell.select('div.totalPrice')[0].stripped_strings))
            # Price per square metre
            unitprice = list(sell.select('div.unitPrice')[0].stripped_strings)[0]
            # Store the listing in a dict
            data_dict = {}
            data_dict['title'] = title
            data_dict['loupan'] = loupan
            data_dict['house_type'] = house_type
            data_dict['area'] = area
            data_dict['toward'] = toward
            data_dict['renovation'] = renovation
            data_dict['positioninfo'] = positioninfo
            data_dict['totalprice'] = totalprice
            data_dict['unitprice'] = unitprice
            # Collect the dict so main() can write it out later
            data_list.append(data_dict)
        except Exception as e:
            print(e)
            continue
def main():
    # Scrape the first pages only (range(1, 10) covers pages 1-9)
    for page in range(1, 10):
        start_spider(page)
        time.sleep(3)
    # Write the data to a JSON file
    with open('data_json.json', 'a+', encoding='utf-8') as f:
        json.dump(data_list, f, ensure_ascii=False, indent=4)
    print('JSON file written')
    # Write the data to a CSV file
    with open('./data_csv.csv', 'w', encoding='utf-8', newline='') as f:
        print(data_list)
        # Header row taken from the keys of the first dict
        title = data_list[0].keys()
        # Create the writer object
        writer = csv.DictWriter(f, title)
        # Write the header
        writer.writeheader()
        # Write every row in one call
        writer.writerows(data_list)
    print('CSV file written')

if __name__ == '__main__':
    main()
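
Why the code splits house_info[1] on '|' is easier to see by running stripped_strings on a houseInfo div in isolation. The fragment below is a hand-written imitation of Lianjia's old listing markup, so the exact tags, estate name, and figures are assumptions for illustration only:

from bs4 import BeautifulSoup

# Hypothetical fragment imitating one listing's houseInfo div
html = '<div class="houseInfo"><a href="#">某某小區</a> | 2室1廳 | 75平米 | 南 | 精裝</div>'
soup = BeautifulSoup(html, 'lxml')

# stripped_strings yields each text node with surrounding whitespace removed
house_info = list(soup.select('div.houseInfo')[0].stripped_strings)
print(house_info)  # ['某某小區', '| 2室1廳 | 75平米 | 南 | 精裝']

# Splitting the second string on '|' gives the individual fields;
# index 0 is the empty piece before the leading '|'
info = house_info[1].split('|')
print([part.strip() for part in info])  # ['', '2室1廳', '75平米', '南', '精裝']

The first text node is the estate name alone, which is why loupan comes from house_info[0] while the layout, area, orientation, and renovation fields come from indices 1 through 4 of the split.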
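The script configures retries through the global requests.adapters.DEFAULT_RETRIES and turns keep-alive off to avoid exhausting connections. If you prefer per-session control, an alternative is to mount an HTTPAdapter carrying a urllib3 Retry policy. The sketch below is a minimal example; the retry count, backoff factor, and status list are arbitrary choices rather than values from the original post:

import requests
from requests.adapters import HTTPAdapter
from urllib3.util.retry import Retry

# Retry up to 15 times with exponential backoff, also retrying on
# transient HTTP status codes such as 429 and 5xx
retry = Retry(total=15, backoff_factor=0.5,
              status_forcelist=[429, 500, 502, 503, 504])

s = requests.Session()
s.mount('https://', HTTPAdapter(max_retries=retry))
s.mount('http://', HTTPAdapter(max_retries=retry))

# This session could then replace the one in start_spider, e.g.:
# resp = s.get(url, headers=headers, timeout=10)

Mounting the adapter scopes the retry behaviour to this one session instead of mutating module-level state, which matters if other code in the same process also uses requests.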