如何使用爬蟲
import requests,json,time,random
class tencentspider(object):
    """Scrape job postings (title, duties, requirements) from the Tencent
    careers JSON API, paging through the first 10 listing pages.

    NOTE(review): the scheme/host part of both URL templates was lost in
    the original paste; they are reconstructed below from the query-string
    tails that survived — confirm against the live API before running.
    """

    def __init__(self):
        # NOTE(review): the original headers value was lost in the paste;
        # a desktop User-Agent is the conventional choice for this API —
        # confirm against the original article.
        self.headers = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64)'
        }
        # Level-1 URL: one page of job listings; {} is the page index.
        # Bug fix: the paste dropped the '{}' after 'pageindex=' that
        # .format(index) in main() is meant to fill.
        self.one_url = (
            'https://careers.tencent.com/tencentcareer/api/post/Query'
            '?timestamp=1563044858612&countryid'
            '=&cityid=&bgids=&productid=&categoryid=&parentcategoryid='
            '&attrid=&keyword=&pageindex={}&pagesize=10&language=zh-cn&area=cn'
        )
        # Level-2 URL: detail for one posting; {} is the post id.
        self.two_url = (
            'https://careers.tencent.com/tencentcareer/api/post/ByPostId'
            '?timestamp=1563044858612&postid={}'
            '&language=zh-cn'
        )

    # Request helper (both page levels go through here).
    def get_page(self, url):
        """GET *url* and return the decoded JSON body as Python data."""
        # timeout added so a stalled connection cannot hang the crawl
        res = requests.get(url, headers=self.headers, timeout=10)
        res.encoding = 'utf-8'
        # json.loads() converts the response text into Python objects
        return json.loads(res.text)

    # Extract the data we want (title, duties, requirements).
    def get_data(self, html):
        """Walk the postings of one level-1 page and print each job dict."""
        # Bug fix: the paste left job_info with no value; it must start as
        # an empty dict because keys are assigned to it below.
        job_info = {}
        # Iterate the (up to 10) postings, then build each detail URL
        # from the posting's id.
        for job in html['data']['posts']:
            # job title
            job_info['job_name'] = job['recruitpostname']
            # postid is interpolated into the level-2 (detail) URL
            post_id = job['postid']
            two_url = self.two_url.format(post_id)
            # fetch the detail page, pull out duties + requirements
            job_info['job_duty'], job_info['require'] = \
                self.parse_two_page(two_url)
            print(job_info)

    # Parse the level-2 (detail) page: duties and requirements.
    def parse_two_page(self, two_url):
        """Fetch one detail page; return (responsibility, requirement)."""
        two_html = self.get_page(two_url)
        duty = two_html['data']['responsibility']
        require = two_html['data']['requirement']
        return duty, require

    def main(self):
        """Crawl listing pages 1..10, pausing politely between requests."""
        for index in range(1, 11):
            url = self.one_url.format(index)
            # level-1 response for this page index
            one_html = self.get_page(url)
            self.get_data(one_html)
            # random delay to avoid hammering the server
            time.sleep(random.uniform(0.5, 2))


if __name__ == '__main__':
    spider = tencentspider()
    spider.main()
Scrapy實踐:爬取騰訊社會招聘資訊(文字爬取)
注:爬取後的資訊將以 JSON 格式儲存,並將檔案命名為 recruit.json,可用 Notepad 開啟。coding utf 8 import scrapy class txhritem scrapy.item 職位名稱 positionname scrapy.field 職位類別 positio...
python爬蟲爬取騰訊網招聘資訊
話不多說,直接上 from bs4 import beautifulsoup import urllib2 import json 使用了json格式儲存 deftengxun detail,num url detail position.php?start 0 a request urllib2....
python爬蟲爬取騰訊招聘資訊 (靜態爬蟲)
環境 windows7,python3.4 親測可正常執行 1 import requests 2from bs4 import beautifulsoup 3from math import ceil 45 header 78 9 獲取崗位頁數 10def getjobpage url 11 re...