# Key points: multithreading, writing CSV, XPath (知識點: 多執行緒, csv, xpath)
import csv
import json
import threading

import lxml
import lxml.etree
import requests

# Reentrant lock serializing CSV writes across the worker threads below.
# Original had `threading.rlock()`, which does not exist — the factory is RLock().
rlock = threading.RLock()

# Browser-like headers so the site does not reject the requests.
# NOTE(review): the original header value was lost when this code was scraped
# from the blog post (`headers =` had no right-hand side) — confirm/replace.
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) '
                  'AppleWebKit/537.36 (KHTML, like Gecko) '
                  'Chrome/99.0.4844.51 Safari/537.36',
}
#獲取區域
def getarealist(url):
    """Fetch the main listing page and return a dict of {district_name: district_url}.

    :param url: absolute URL of the city's second-hand-housing index page
    :return: dict mapping each district's display name to its absolute URL
    """
    html = requests.get(url, headers=headers).text
    # The parser factory is lxml.etree.HTML (uppercase); .html does not exist.
    mytree = lxml.etree.HTML(html)
    arealist = mytree.xpath('//div[@data-role="ershoufang"]//a')
    areadict = {}
    for area in arealist:
        # District display name, e.g. "天河".
        areaname = area.xpath('./text()')[0]
        # hrefs on the page are site-relative, so prepend the site root.
        # NOTE(review): the base URL was stripped to '' in the scraped source;
        # restored as the Guangzhou Lianjia root — confirm against the site.
        areaurl = 'https://gz.lianjia.com' + area.xpath('./@href')[0]
        areadict[areaname] = areaurl
        print(areaname, areaurl)
    return areadict
#獲取區域頁數
def getareapage(areaurl, areaname):
    """Discover how many listing pages a district has, then scrape them all.

    Runs as a thread target: reads the pager metadata from the district's
    first page and hands off to gethourseinfo().

    :param areaurl: absolute URL of the district's listing index
    :param areaname: district display name (used as the CSV file name)
    """
    html = requests.get(areaurl, headers=headers).text
    # lxml.etree.HTML (uppercase) is the parser; .html does not exist.
    mytree = lxml.etree.HTML(html)
    # The page count is a JSON blob on the pager div, e.g. {"totalPage": 100, ...}.
    pagedata = mytree.xpath('//div[@class="page-box house-lst-page-box"]/@page-data')[0]
    # NOTE(review): the live site uses camelCase "totalPage"; the scraped copy of
    # this code was lowercased wholesale — confirm against the real markup.
    pagenum = int(json.loads(pagedata)["totalPage"])
    gethourseinfo(areaurl, areaname, pagenum)
#獲取房子資訊
def gethourseinfo(areaurl, areaname, pagenum):
    """Scrape every listing page of one district and append one CSV row per house.

    :param areaurl: absolute URL of the district's listing index
    :param areaname: district display name; rows go to ./data/<areaname>.csv
    :param pagenum: total number of listing pages for this district
    """
    # range() is exclusive at the top; original used range(1, pagenum) and
    # silently dropped the final page.
    for page in range(1, pagenum + 1):
        # Page URLs are formed by appending e.g. "pg3" to the district URL.
        url = areaurl + 'pg%d' % page
        html = requests.get(url, headers=headers).text
        mytree = lxml.etree.HTML(html)
        # NOTE(review): class names are camelCase on the live site; the scraped
        # source was lowercased — confirm against the real markup.
        hourselist = mytree.xpath('//ul[@class="sellListContent"]/li')
        for hourse in hourselist:
            # Listing title and detail-page URL.
            hoursetitle = hourse.xpath('.//div[@class="title"]/a/text()')[0]
            hourseurl = hourse.xpath('.//div[@class="title"]/a/@href')[0]
            # Address: first two text fragments (estate name + layout info).
            hourseaddress = hourse.xpath('.//div[@class="houseInfo"]//text()')
            hourseaddress = hourseaddress[0] + hourseaddress[1]
            # Position: first two text fragments (floor info + district).
            positioninfo = hourse.xpath('.//div[@class="positionInfo"]//text()')
            positioninfo = positioninfo[0] + positioninfo[1]
            # Total price. In the scraped source this xpath call was glued onto
            # a comment line, leaving `totalprice` undefined (NameError); restored.
            totalprice = hourse.xpath('.//div[@class="totalPrice"]//text()')
            totalprice = totalprice[0] + totalprice[1]
            # Unit price (per square meter).
            unitprice = hourse.xpath('.//div[@class="unitPrice"]//text()')[0]
            data = [hoursetitle, hourseaddress, positioninfo, totalprice,
                    unitprice, hourseurl]
            print(data)
            # Serialize writes across threads; newline='' stops csv from
            # emitting blank rows on Windows, utf-8 keeps Chinese text intact.
            with rlock:
                with open('./data/' + areaname + '.csv', 'a+',
                          newline='', encoding='utf-8') as f:
                    csv.writer(f).writerow(data)
if __name__ == '__main__':
    # Guangzhou second-hand housing index page.
    # NOTE(review): the base URL was stripped to '/ershoufang/' in the scraped
    # source; restored as the Guangzhou Lianjia root — confirm against the site.
    mainurl = 'https://gz.lianjia.com/ershoufang/'
    # Map of district name -> district URL.
    areadict = getarealist(mainurl)
    threadlist = []  # original `threadlist=` had no RHS (SyntaxError)
    for areaname, areaurl in areadict.items():
        # One worker thread per district; each discovers its own page count
        # and scrapes all of its listing pages.
        # (threading.Thread, capitalized — threading.thread does not exist.)
        t = threading.Thread(target=getareapage, args=(areaurl, areaname))
        t.start()
        # Original never appended, so the join loop below was a no-op and the
        # main thread could exit before the workers finished.
        threadlist.append(t)
    # Block until every worker thread has finished.
    for t in threadlist:
        t.join()
# --- Scrape residue: "related articles" text appended by the blog scraper; ---
# --- commented out so the file stays valid Python. Content preserved below. ---
# Python鏈家廣州二手房的資料爬取 資料爬取
# 讀取原始資料 注意選擇gbk編碼方式 很簡單 就不做上傳 try soup beautifulsoup html,html.parser except exception return 1 house info div soup.find all div attrs 獲取整個標題塊 獲取到之後的資料...
# Python爬取鏈家二手房資訊
# 2 資料庫表結構 使用物件導向的方式,搭建專案框架 import requests from bs4 import beautifulsoup import pymysql class lianjiaspider mydb pymysql.connect localhost root 123456 ...
# python爬蟲爬取鏈家二手房資訊
# 問題一 鏈家 也有反爬蟲策略和robots限制,robots限制忽略 不然沒法爬 另外頻繁爬取會直接導致被ban,需要隔天才會解禁止。防止被ban的方法有多種,1.禁止cookie 2.設定header 3.加大爬取間隔 4.使用 我只用了前三種方法,具體可以在settings.py 和middle...