# Module 1: scrape all the channel links
from bs4 import BeautifulSoup
import requests

start_url = ''  # URL elided in the source
url_host = ''   # URL elided in the source

def get_index_url(url):
    wb_data = requests.get(url)
    soup = BeautifulSoup(wb_data.text, 'lxml')
    links = soup.select('ul.ym-submnu > li > b > a')
    for link in links:
        page_url = url_host + link.get('href')
        print(page_url)

get_index_url(start_url)
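Module 3 below imports a `channel_list` from this module, but its definition never appears in the source. In crawlers of this style, the channel URLs printed by `get_index_url` are usually pasted back into a triple-quoted string constant that `channel_list.split()` can later turn into a list. A minimal sketch, with placeholder URLs rather than real 58.com channels:

channel_list = '''
    http://example.com/channel-a/
    http://example.com/channel-b/
'''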
# Module 2: scrape all the item links and their detail data
from bs4 import BeautifulSoup
import requests
import time
import pymongo

client = pymongo.MongoClient('localhost', 27017)
ceshi = client['ceshi']
url_list = ceshi['url_list4']
item_info = ceshi['item_info4']
# The name on the left is the Python object; the string is the collection's
# name inside the database.

# spider 1
def get_links_from(channel, pages):
    # If the page has no td.t cell there are no listings, so stop paginating.
    list_view = '{}/pn{}/'.format(channel, str(pages))
    wb_data = requests.get(list_view)
    time.sleep(1)
    soup = BeautifulSoup(wb_data.text, 'lxml')
    if soup.find('td', 't'):
        for link in soup.select('td.t a.t'):
            item_link = link.get('href').split('?')[0]
            if item_link != '':
                url_list.insert({'url': item_link})
        # return urls
    else:
        # it's the last page!
        pass
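A quick way to exercise spider 1 by hand is to walk the first few list pages of a single channel; the channel URL here is a placeholder, not a real 58.com channel:

for page in range(1, 4):
    get_links_from('http://example.com/some-channel', page)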
# spider 2
def get_item_info(url):
    wb_data = requests.get(url)
    soup = BeautifulSoup(wb_data.text, 'lxml')
    if url[:25] == '':  # the URL prefix compared here was elided in the source
        data = {}       # field extraction elided in the source
        item_info.insert(data)
    else:
        data = {}       # field extraction elided in the source
        item_info.insert(data)
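The two `data` dictionaries above were elided from the source, so the fields actually stored are unknown. Purely for illustration, one branch might look like the hypothetical sketch below; the keys and selectors are assumptions, not the original code:

def get_item_info_example(url):
    # Hypothetical sketch: the real field extraction was elided in the source.
    wb_data = requests.get(url)
    soup = BeautifulSoup(wb_data.text, 'lxml')
    data = {
        'url': url,
        # Fall back to the page <title>, since the real selectors are unknown.
        'title': soup.title.text.strip() if soup.title else None,
    }
    item_info.insert(data)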
# Module 3: the main file that kicks off the crawl
from multiprocessing import Pool
from pages_parsing import get_item_info, url_list, item_info, get_links_from
from channel_extact import channel_list

item_url = (item['url'] for item in url_list.find())
index_urls0 = (item['url'] for item in item_info.find())
x = set(item_url)      # every listing URL spider 1 has collected
y = set(index_urls0)   # every URL spider 2 has already parsed
rest_of_urls = x - y   # URLs still waiting for detail parsing

def get_all_links_from(channel):
    for i in range(1, 100):
        get_links_from(channel, i)
    return rest_of_urls

if __name__ == '__main__':
    pool = Pool()
    # pool = Pool(processes=6)
    # pool.map(get_all_links_from, channel_list.split())
    pool.map(get_item_info, rest_of_urls)
    # count = 0
    # for url in rest_of_urls:
    #     print(url)
    #     count += 1
    # print(count)
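The set difference above is what makes the crawl resumable: x holds every listing URL spider 1 has queued, y holds every URL spider 2 has already parsed, so x - y is exactly the work remaining after an interruption. A toy illustration:

x = {'http://a', 'http://b', 'http://c'}  # queued by spider 1
y = {'http://a'}                          # already parsed by spider 2
rest = x - y                              # {'http://b', 'http://c'} still to do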
# Module 4: watch the data flowing in
import time
from pages_parsing import url_list

while True:
    print(url_list.find().count())
    time.sleep(5)
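Note that `Cursor.count()` as used above was deprecated in pymongo 3.7 and removed in 4.0; on newer releases, an equivalent monitor loop over the same `url_list` collection would be:

while True:
    print(url_list.count_documents({}))
    time.sleep(5)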