import json
import math
import urllib
from urllib import parse, request

# Request headers -- Lagou typically refuses requests that do not look like a browser,
# so fill in at least User-Agent (and usually Referer / Cookie) here
headers = {}
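As a rough guide (these are example values, not the headers from the original post), a browser-like header set for lagou.com usually looks like the following; the Cookie string in particular has to be copied from your own browser session:

headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64)',   # example UA string
    'Referer': 'https://www.lagou.com/jobs/list_python',          # assumed search-page referer
    # 'Cookie': 'paste the cookie string from your own browser session here',
}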
# Get the number of result pages for a keyword
def getpagenum(kw):
    # URL of the position search API (the route without the query keyword); fill in the real address
    url = ''
    # form data -- the usual Lagou query parameters (first / pn / kd); adjust if needed
    data = {'first': 'true', 'pn': 1, 'kd': kw}
    # URL-encode the form data
    data = urllib.parse.urlencode(data).encode('utf-8')
    # Build the request; attaching data makes it a POST request
    req = urllib.request.Request(url,
                                 data=data,
                                 headers=headers)
    # Read and decode the response
    response = urllib.request.urlopen(req).read().decode('utf-8')
    # Parse the JSON body
    data = json.loads(response)
    # Total number of positions
    jobnum = data['content']['positionResult']['totalCount']
    print(jobnum)
    # Number of positions per page
    pagesize = data['content']['pageSize']
    print(pagesize)
    # Number of pages
    totalpage = math.ceil(jobnum / pagesize)
    print(totalpage)
    return int(totalpage)
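A quick sanity check of the paging arithmetic (the totals below are made-up numbers, not real Lagou data):

import math
jobnum, pagesize = 322, 15           # hypothetical totalCount / pageSize values
print(math.ceil(jobnum / pagesize))  # 22 pages; the last page is only partly filled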
# Get job details for every result page
def getjobinfo(kw, pagenum):
    # URL of the position search API (the route without the query keyword); fill in the real address
    url = ''
    for i in range(1, pagenum + 1):
        # form data for page i -- the usual Lagou query parameters (first / pn / kd)
        data = {'first': 'false', 'pn': i, 'kd': kw}
        data = urllib.parse.urlencode(data).encode('utf-8')
        req = urllib.request.Request(url,
                                     data=data,
                                     headers=headers)  # POST request
        response = urllib.request.urlopen(req).read().decode('utf-8')
        data = json.loads(response)
        joblist = data['content']['positionResult']['result']
        # print(joblist)
        for job in joblist:
            city = job['city']
            companyfullname = job['companyFullName']
            companylabellist = job['companyLabelList']
            companyshortname = job['companyShortName']
            companysize = job['companySize']
            district = job['district']
            education = job['education']
            firsttype = job['firstType']
            hitags = job['hiTags']
            positionadvantage = job['positionAdvantage']
            positionlables = job['positionLables']
            print(city, companyfullname, companylabellist, companysize, district, education, firsttype, hitags,
                  positionadvantage, positionlables)
            # Append the scraped record to pythonjob.txt
            with open('pythonjob.txt', 'a+', encoding='utf-8', errors='ignore') as f:
                f.write(
                    str((city, companyfullname, companylabellist, companysize, district, education, firsttype, hitags,
                         positionadvantage, positionlables)) + '\n')
                # Flush the write buffer so the record is on disk immediately
                f.flush()
if __name__ == '__main__':
    totalpage = getpagenum('python')
    getjobinfo('python', totalpage)
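Lagou is known to answer direct POSTs to this endpoint with an "operation too frequent" message unless the request carries the cookies from a normal page visit. Below is a minimal sketch of that workaround using requests instead of urllib; the URLs and parameter names are the commonly cited ones for Lagou's search API and are assumptions, not taken from the script above:

import requests

def fetch_page(kw, page):
    # Assumed URLs/parameters -- the ones usually cited for Lagou's search API
    list_url = 'https://www.lagou.com/jobs/list_' + kw
    api_url = 'https://www.lagou.com/jobs/positionAjax.json?needAddtionalResult=false'
    form = {'first': 'true' if page == 1 else 'false', 'pn': page, 'kd': kw}
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64)',
        'Referer': list_url,
    }
    with requests.Session() as s:
        s.headers.update(headers)
        s.get(list_url)                    # visit the search page first so the session picks up its cookies
        resp = s.post(api_url, data=form)  # then POST to the JSON endpoint with those cookies attached
        return resp.json()

# Example: print the total position count for "python"
# print(fetch_page('python', 1)['content']['positionResult']['totalCount'])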