from bs4 import BeautifulSoup
import requests
import ip_proxy
from urllib import parse

# Request headers sent with every request. The original header values are not
# shown in the source, so a browser-like User-Agent is used as a placeholder.
headers = {'User-Agent': 'Mozilla/5.0'}
def get_boss_info(my_ip, detailed_url):
    """Fetch one job-detail page and print its key fields."""
    # Proxies dict built from the ip_proxy helper; ip_proxy_str is assumed to
    # hold an "ip:port" string (the original assignment is elided in the source).
    proxy = {'http': 'http://' + my_ip.ip_proxy_str}
    response = requests.get(detailed_url, headers=headers, proxies=proxy, timeout=5)
    soup = BeautifulSoup(response.text, 'lxml')
    title = soup.find('h1').text
    # div_ele = soup.find('div', class_="name")
    # print(div_ele)
    salary = soup.find('span', class_="badge").text.replace('\n', '').strip()
    print(title)
    print(salary)
    # General info (location, experience, education) from the first info-primary paragraph
    gezhong_info = soup.select('div.info-primary > p')[0].text.replace('\n', '').strip()
    print(gezhong_info)
    # Full job-description text
    gangwei_info = soup.select('div.text')[0].text
    print(gangwei_info)
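For reference, here is a standalone sketch of how the selectors above behave, run against a minimal, hypothetical piece of markup rather than the real Boss Zhipin page structure; it can be executed on its own to sanity-check the parsing logic.

from bs4 import BeautifulSoup

# Hypothetical markup mimicking the classes the scraper looks for.
html = '''
<div class="info-primary">
  <h1>Python Engineer</h1>
  <span class="badge">15-25K</span>
  <p>Beijing / 3-5 years / Bachelor</p>
</div>
<div class="text">Job description goes here.</div>
'''

soup = BeautifulSoup(html, 'lxml')
print(soup.find('h1').text)                          # Python Engineer
print(soup.find('span', class_='badge').text)        # 15-25K
print(soup.select('div.info-primary > p')[0].text)   # Beijing / 3-5 years / Bachelor
print(soup.select('div.text')[0].text)                # Job description goes here.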
def get_detail_url(my_ip, url):
    """Fetch one listing page and walk through the detail-page links on it."""
    # Proxies dict, built the same way as in get_boss_info (assumed attribute).
    proxy = {'http': 'http://' + my_ip.ip_proxy_str}
    response = requests.get(url, headers=headers, proxies=proxy, timeout=5)
    soup = BeautifulSoup(response.text, 'lxml')
    # a_ele_list = soup.select('h3.name > a')
    a_ele_list = soup.select('div.job-list > ul > li div.info-primary > h3 > a')
    for a_ele in a_ele_list:
        # Attribute values can be read dict-style from a tag
        a_href = a_ele['href']
        # Join the relative href into an absolute detail-page link
        href = parse.urljoin(url, a_href)
        print('detail page href: ' + href)
        # Retry up to three times with a fresh proxy; skip the link if all three attempts fail
        for i in range(0, 3):
            try:
                # Fetch the detail-page info
                get_boss_info(my_ip, href)
                break
            except Exception as e:
                print(e)
                my_ip.update_ip_proxy_str()
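The listing page returns relative hrefs, so parse.urljoin resolves each one against the listing-page URL before the detail page is requested. A quick illustration with made-up URLs:

from urllib import parse

# A relative href from a listing page is resolved against that page's URL.
print(parse.urljoin('https://www.zhipin.com/c101010100/', '/job_detail/abc123.html'))
# -> https://www.zhipin.com/job_detail/abc123.html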
def get_all_info(my_ip):
    # Listing URL template; the original value is elided in the source. It is a
    # Boss Zhipin search URL with two page-number placeholders and must be
    # filled in before running.
    base_url = '...'
    for page in range(1, 4):
        # URL of each results page
        url = base_url % (page, page)
        # If the proxy stops working, switch to another; skip the page after four failed attempts
        for i in range(0, 4):
            try:
                # Request one Boss Zhipin listing page
                get_detail_url(my_ip, url)
                break
            except Exception as e:
                print(e)
                my_ip.update_ip_proxy_str()
if __name__ == '__main__':
    # Get a proxy-IP helper
    my_ip = ip_proxy.ip_getter()
    # proxy_str = '36.27.143.72:21450'
    # print(proxy_str)
    # Scrape all the Boss Zhipin job info
    get_all_info(my_ip)
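The script depends on a local ip_proxy module that is not shown here. Based on how it is used above, a minimal sketch of the interface it would need to expose could look like the following; the proxy list and the ip_proxy_str attribute name are assumptions, not the original implementation.

# ip_proxy.py -- minimal sketch of the assumed helper interface (not the original module)
import random

class ip_getter:
    def __init__(self):
        # In the real module this list would come from a proxy pool or an API;
        # the single address below is only a placeholder taken from the source's comments.
        self._candidates = ['36.27.143.72:21450']
        self.ip_proxy_str = random.choice(self._candidates)

    def update_ip_proxy_str(self):
        # Switch to another proxy when the current one fails.
        self.ip_proxy_str = random.choice(self._candidates)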