from bs4 import BeautifulSoup
import requests
import time

def get_item_info(url):  # what to scrape from a detail page
    #url = ''
    wb_data = requests.get(url)
    soup = BeautifulSoup(wb_data.text, 'lxml')
    time.sleep(2)  # be polite: pause between requests
    cate = soup.select('span.crb_i > a')[-1].text.strip()
    title = soup.select('h1.info_titile')[0].text
    view = soup.select('span.look_time')[0].text
    price = soup.select('span.price_now')[0].text
    zone = soup.select('div.palce_li > span > i')[0].text
    # collect the scraped fields (the dict literal was lost when the post was published)
    data = {'cate': cate, 'title': title, 'view': view, 'price': price, 'zone': zone}
    print(data)

def get_all_items_info(url):  # how to collect the detail-page links from one list page
    #url = ''
    wb_data = requests.get(url)
    soup = BeautifulSoup(wb_data.text, 'lxml')
    hrefs_list = soup.select('a.t')
    for href in hrefs_list:
        link = href.get('href')
        if link and 'zhuanzhuan' in link:  # guard against anchors with no href
            get_item_info(link)

def get_page_link(page_number):  # how to build the link for each list page
    for each_number in range(1, page_number):
        # the base URL was elided in the original; only the page suffix remains
        full_url = '0/pn{}/'.format(each_number)
        get_all_items_info(full_url)

get_page_link(3)
Summary: what really matters is the logical thread, the relationships between the functions. Once those relationships are clear, the programming itself becomes simple.
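The fields are only printed above, so a natural next step is to persist them. Below is a minimal sketch of appending each scraped record to a CSV file, assuming the same five field names; the save_item helper and the items.csv path are hypothetical, not part of the original script:

import csv

# Hypothetical helper: append one scraped record (the dict built in
# get_item_info) to a CSV file. 'items.csv' is an assumed output path.
def save_item(data, path='items.csv'):
    with open(path, 'a', newline='', encoding='utf-8') as f:
        writer = csv.DictWriter(f, fieldnames=['cate', 'title', 'view', 'price', 'zone'])
        if f.tell() == 0:  # empty file: write the header row first
            writer.writeheader()
        writer.writerow(data)

Calling save_item(data) in place of print(data) inside get_item_info would be enough to collect results across all pages.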
from bs4 import BeautifulSoup
import requests
import time

def get_item_info(url):  # 3: what to scrape from a detail page
    wb_data = requests.get(url)
    soup = BeautifulSoup(wb_data.text, 'lxml')
    time.sleep(2)
    cate = soup.select('span.crb_i > a')[-1].text.strip()
    title = soup.select('h1.info_titile')[0].text
    view = soup.select('span.look_time')[0].text
    price = soup.select('span.price_now')[0].text
    zone = soup.select('div.palce_li > span > i')[0].text
    data = {'cate': cate, 'title': title, 'view': view, 'price': price, 'zone': zone}
    print(data)

'''def get_all_items_info(url):  # 2: collect the detail-page links from one list page
    wb_data = requests.get(url)
    soup = BeautifulSoup(wb_data.text, 'lxml')
    hrefs_list = soup.select('a.t')
    for href in hrefs_list:
        link = href.get('href')
        if 'zhuanzhuan' in link:
            get_item_info(link)
'''

def get_page_link(page_number):  # 1: build the link for each list page
    for each_number in range(1, page_number):
        full_url = '0/pn{}/'.format(each_number)
        #get_all_items_info(full_url)
        # the commented-out helper above is inlined here instead
        wb_data = requests.get(full_url)
        soup = BeautifulSoup(wb_data.text, 'lxml')
        hrefs_list = soup.select('a.t')
        for href in hrefs_list:
            link = href.get('href')
            if link and 'zhuanzhuan' in link:
                get_item_info(link)

get_page_link(2)
This works too: the link-collecting step is simply inlined into get_page_link rather than kept as its own function.
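Folding the link collection into get_page_link trades one function boundary for a shorter script; keeping get_all_items_info separate just makes each step testable on its own. One caveat in either version: href values scraped from a list page can be relative, and the '0/pn{}/' format string above lost its domain prefix when the post was published. A small sketch of normalizing links before following them; BASE_URL is a placeholder assumption, not the real site address:

from urllib.parse import urljoin

# Placeholder: the original post elided the real site address.
BASE_URL = 'http://example.com/'

def normalize(link):
    # urljoin resolves relative links against BASE_URL and leaves
    # absolute links (e.g. zhuanzhuan detail URLs) untouched
    return urljoin(BASE_URL, link)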