廈門房價鏈家爬蟲

2021-08-15 09:02:46 字數 3721 閱讀 8773

python爬取鏈家官網上廈門二手房資料:

防止被封 IP:為請求設定 headers 與 Cookies。

def url_open(url):
    """Fetch *url* and return the raw response body as bytes.

    Retries indefinitely on request failures, sleeping 1 s between
    attempts; the 3 s timeout keeps a stalled connection from hanging
    a worker process forever.
    """
    # NOTE(review): the headers dict was censored out of the source
    # text; a real User-Agent / Cookie must be restored here for the
    # anti-ban measure mentioned in the article to work.
    headers = {}
    while True:  # source had `while1:`, a SyntaxError
        try:
            req = requests.get(url=url, headers=headers, timeout=3)
            break
        # Narrowed from a bare `except:` so KeyboardInterrupt and
        # SystemExit are not swallowed inside the retry loop.
        except requests.exceptions.RequestException:
            print('timeout')
            time.sleep(1)
    html = req.content
    return html

彙總各個頁面的房源連結,去重(censored 的 `**` 原文推測為「房源」)。

def get_links(url):
    """Return the list of house-detail URLs found on one listing page.

    De-duplication happens later in download_house(); this function
    only extracts the links from a single page.
    """
    print(url)
    html = url_open(url)
    # Parse the listing page.  Source had lowercase `beautifulsoup`,
    # which is a NameError; the class is BeautifulSoup.
    soup = BeautifulSoup(html, 'html.parser')
    # Locate the parent containers.  The class filter was censored in
    # the source; restore it before running, e.g.
    # soup('div', class_='info clear').
    divs = soup('div', )
    # The first <a> inside each container is the house link.
    house_urls = [div.find('a')['href'] for div in divs]
    return house_urls

房源資訊爬取,返回字串。

以字典形式直接返回在多程序下會報錯——推測是部分值為 bs4 的 NavigableString,無法被 pickle 序列化;先轉成字串返回,再由呼叫方解析即可繞過。

def get_infos(url):
    """Scrape one house-detail page and return its fields as str(dict).

    A plain dict is NOT returned because passing one back through
    multiprocessing crashed for the author -- presumably some values
    are bs4 NavigableString objects that do not pickle; str() flattens
    everything to plain text.  The caller parses the result with
    ast.literal_eval (see download_house).
    """
    print(url)
    html = url_open(url)
    # Source had lowercase `beautifulsoup` (NameError).
    soup = BeautifulSoup(html, 'html.parser')
    house_info = {}  # the source dropped the `{}` literal entirely
    # District / sub-district breadcrumb links.  Every `soup('div', )`
    # call below lost its class filter to the source's censoring; each
    # needs its class_= argument restored before this can run.
    soup_area = soup('div', )[0]('a')
    house_info['轄區'] = soup_area[0].string
    house_info['區域'] = soup_area[1].string
    house_info['小區'] = soup('a', )[0].string
    house_info['建造年份'] = soup('div', )[0]('div', )[0].string.split('/')[0]
    # Coordinates live in an inline <script>.  The page key is
    # camel-cased `resblockPosition`; the source's all-lowercase
    # pattern could never match, so findall()[0] would raise IndexError.
    position = re.findall(r"resblockPosition:'(.+),(.+)'",
                          html.decode('utf-8'))[0]
    house_info['經度'] = position[0]
    house_info['緯度'] = position[1]
    house_info['總價'] = soup('span', )[0].string
    house_info['均價'] = soup('span', )[0].contents[0]
    # The base/transaction tables are flat <li> "label value" pairs.
    for name in ['base', 'transaction']:
        # NOTE(review): `name` is unused here only because the censored
        # class filter dropped it; it was presumably something like
        # soup('div', class_=name)[0]('li') -- confirm against the site.
        for li in soup('div', )[0]('li'):
            # Guard .string against None (nested tags have no .string);
            # the original unconditional .strip() would raise
            # AttributeError on such nodes.
            contents = [c.string.strip() for c in li.contents
                        if c.string and c.string.strip()]
            house_info[contents[0]] = contents[1]
    return str(house_info)

彙總房源連結列表並去重;再以多程序爬取各房源的詳細資訊(censored 的 `**` 原文推測為「房源」)。

def download_house():
    """Crawl all listing pages, then every house page, and save to Excel.

    Pipeline: 100 listing pages -> flatten + de-duplicate detail links
    -> scrape each detail page in a process pool -> DataFrame -> .xls.
    """
    import ast  # local import: safe parser for the str(dict) results

    # Listing pages 1..100.  The URL template was censored out of the
    # source ('' % page raises TypeError as-is); restore it, e.g.
    # 'https://xm.lianjia.com/ershoufang/pg%d/'.
    urls = ['' % page for page in range(1, 101)]
    pool = Pool()  # source had lowercase `pool()` (NameError)
    house_links = pool.map(get_links, urls)
    # Flatten the per-page link lists and de-duplicate.
    urls = set(reduce(lambda x, y: x + y, house_links))
    house_infos = pool.map(get_infos, urls)
    pool.close()
    pool.join()
    # get_infos returns str(dict).  ast.literal_eval replaces the
    # original eval(): it only accepts Python literals, so a malicious
    # scraped page cannot execute arbitrary code here.
    house_data = [ast.literal_eval(info) for info in house_infos]
    house_df = pd.DataFrame(house_data)  # source: `pd.dataframe` (NameError)
    excel_name = r"house.xls"
    house_df.to_excel(excel_name, index=False)  # source: `false` (NameError)

# Entry-point guard: required because the script uses
# multiprocessing.Pool -- on spawn-based platforms each worker
# re-imports this module, and without the guard every worker would
# restart the whole crawl.
if __name__ == '__main__':
    download_house()

Python爬蟲實戰之爬取鏈家廣州房價 03儲存

系列目錄 python爬蟲實戰之爬取鏈家廣州房價 01簡單的單頁爬蟲 python爬蟲實戰之爬取鏈家廣州房價 02把小爬蟲變大 這一小節主要講一下前面一直沒有實現的儲存,儲存主要分兩大類 檔案和資料庫。結合這次爬蟲的資料量及後期分析的需要,這次主要介紹sqlite。通過對sqlite資料庫的封裝,處...

Python爬取鏈家房價資訊

房子問題近些年來越來越受到大家的關注,要了解近些年的房價,首先就要獲取網上的房價資訊,我們以鏈家網上 的房價資訊為例,將資料爬取下來並儲存起來。這次資訊的爬取我們依然採取requests beautiful soup的線路來爬取鏈家網上的 房的資訊。需要安裝好anaconda,並保證系統中已經有re...

爬取鏈家網房價資料

感覺最近做的東西好菜 隨便了。d 鏈家房價資料.csv wt newline encoding utf8 writer csv.writer fp writer.writerow 樓盤名 位址 房間格式 房間面積 起價 優點 defget html url try response requests...