Scraping Xiamen second-hand housing data from the Lianjia site with Python:

Set cookies in the request headers to avoid getting the IP banned.
import re
import time
from functools import reduce
from multiprocessing import Pool

import pandas as pd
import requests
from bs4 import BeautifulSoup


def url_open(url):
    """ url open """
    # The concrete header values (User-Agent, Cookie) did not survive
    # extraction of this post; supply your own here.
    headers = {}
    while True:
        try:
            req = requests.get(url=url, headers=headers, timeout=3)
            break
        except requests.exceptions.RequestException:
            print('timeout')
            time.sleep(1)
    html = req.content
    return html
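The headers dict itself was lost when this post was scraped. As a rough sketch (every value below is a placeholder, not from the original), a Lianjia-friendly headers dict usually carries a browser User-Agent plus session cookies copied from your browser:

headers = {
    # Hypothetical values -- copy real ones from your browser's dev tools.
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64)',
    'Cookie': 'lianjia_uuid=...; lianjia_ssid=...',
}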
Collect the listing links from every page and deduplicate them.
def get_links(url):
    """ get link """
    print(url)
    html = url_open(url)
    # parse the html
    soup = BeautifulSoup(html, 'html.parser')
    # locate the parent divs; the class selector was stripped from the
    # original post -- 'info clear' matches Lianjia's listing markup but
    # may need adjusting
    divs = soup('div', class_='info clear')
    # pull the house link out of each child div
    house_urls = [div.find('a')['href'] for div in divs]
    return house_urls
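A side note on the calling style used throughout: a BeautifulSoup object is callable, and calling it delegates to find_all. A minimal check (the HTML snippet here is made up for illustration):

from bs4 import BeautifulSoup

demo = BeautifulSoup('<div class="info clear"><a href="/house/1">x</a></div>',
                     'html.parser')
# soup(...) and soup.find_all(...) return the same result set
assert demo('div', class_='info clear') == demo.find_all('div', class_='info clear')
print(demo('div', class_='info clear')[0].find('a')['href'])  # /house/1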
Scrape each listing's details and return them as a string.

Returning the dict directly raised errors under multiprocessing; why? (A hedged guess follows the function below.)
def get_infos(url):
    """ house information """
    print(url)
    html = url_open(url)
    soup = BeautifulSoup(html, 'html.parser')
    house_info = {}
    # Every class_= selector below was stripped from the original post; the
    # values used here follow Lianjia's detail-page markup and are guesses.
    soup_area = soup('div', class_='areaName')[0]('a')
    house_info['轄區'] = soup_area[0].string
    house_info['區域'] = soup_area[1].string
    house_info['小區'] = soup('a', class_='info')[0].string
    house_info['建造年份'] = soup('div', class_='area')[0](
        'div', class_='subInfo')[0].string.split('/')[0]
    position = re.findall("resblockPosition:'(.+),(.+)'",
                          html.decode('utf-8'))[0]
    house_info['經度'] = position[0]
    house_info['緯度'] = position[1]
    house_info['總價'] = soup('span', class_='total')[0].string
    house_info['均價'] = soup('span', class_='unitPriceValue')[0].contents[0]
    for name in ['base', 'transaction']:
        for li in soup('div', class_=name)[0]('li'):
            contents = [content.string.strip()
                        for content in li.contents if content.string.strip()]
            house_info[contents[0]] = contents[1]
    # for name in ['tags clear', 'baseattribute clear']:
    #     soup_baseinforms = soup('div', class_=name)
    #     for soup_baseinform in soup_baseinforms:
    #         key = soup_baseinform('div', class_='name')[0].string
    #         value = soup_baseinform('div', class_='content')[0].get_text()
    #         house_info[key] = value.strip()
    # soup_rows = soup('div', class_='layout')[0]('div', class_='row')
    # for soup_row in soup_rows:
    #     layout = [s.string for s in soup_row('div')]
    #     for i, name in enumerate(['面積', '朝向', '窗戶']):
    #         house_info[layout[0] + name] = layout[i + 1]
    return str(house_info)
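On the "why?" above, a hedged guess: the failure comes from the values, not the dict itself. Fields like soup_area[0].string are bs4 NavigableString objects, which keep references into the parse tree, so pickling them (which Pool.map must do to return results) can fail or drag the whole page along. Casting every value to plain str and round-tripping through json is safer than the str()/eval() detour; get_infos_json below is a hypothetical variant, not part of the original script:

import json

def get_infos_json(url):
    """Hypothetical variant of get_infos: plain-str values, json transport."""
    info = {'總價': '500'}                         # stand-in for scraped fields
    info = {k: str(v) for k, v in info.items()}   # drop bs4 wrapper types
    return json.dumps(info, ensure_ascii=False)   # safer than str() + eval()

record = json.loads(get_infos_json('https://example.com'))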
Merge the per-page link lists and deduplicate, then scrape each listing's details with a multiprocessing pool.
def download_house():
    """ download house """
    # links: the page-URL template was stripped from the original post;
    # this is Lianjia's usual Xiamen pattern and may need verifying
    urls = ['https://xm.lianjia.com/ershoufang/pg%d/' % page
            for page in range(1, 101)]
    pool = Pool()
    house_links = pool.map(get_links, urls)
    # pool.close()
    # pool.join()
    # infos
    urls = set(reduce(lambda x, y: x + y, house_links))
    # pool = Pool()
    house_infos = pool.map(get_infos, urls)
    pool.close()
    pool.join()
    house_data = [eval(info) for info in house_infos]
    house_df = pd.DataFrame(house_data)
    excel_name = r"house.xls"
    house_df.to_excel(excel_name, index=False)


if __name__ == '__main__':
    download_house()
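One design note on the dedup step: reduce with list + copies the accumulator on every page, which is quadratic in the page count. itertools.chain.from_iterable does the same flatten in one pass; a minimal sketch with made-up links:

from itertools import chain

house_links = [['/house/1', '/house/2'], ['/house/2', '/house/3']]
urls = set(chain.from_iterable(house_links))
print(urls)  # {'/house/1', '/house/2', '/house/3'}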