環境:
windows7, python3.4
說明:(親測可正常執行)
import requests
# bs4 exposes the parser class as "BeautifulSoup" (CamelCase); the pasted
# source's lowercase "beautifulsoup" does not exist in the package.
from bs4 import BeautifulSoup
from math import ceil

# Backward-compat alias for any call site still using the lowercase name.
beautifulsoup = BeautifulSoup

# HTTP request headers sent with every page fetch.
# NOTE(review): the original dict body was lost in the paste ("header =78");
# a browser User-Agent is the customary value for this tutorial — confirm
# against the original article before relying on it.
header = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 '
                  '(KHTML, like Gecko) Chrome/55.0.2883.87 Safari/537.36',
}
# Get the number of result pages for a search-listing URL.
def getjobpage(url):
    """Fetch *url* and return the page count of the job listing.

    The page shows the total hit count in
    ``<span class="lightblue total">512</span>``; with 10 jobs per page the
    page count is ``ceil(total / 10)``.
    """
    ret = requests.get(url, headers=header)
    ret.encoding = "utf-8"  # force UTF-8 to fix mojibake in the response body
    soup = BeautifulSoup(ret.text, 'html.parser')
    # Total number of jobs, e.g. <span class="lightblue total">512</span>
    totaljob = soup.select('span[class="lightblue total"]')[0].text
    # 10 jobs per listing page
    jobpage = ceil(int(totaljob) / 10)
    return jobpage
# Scrape one job-detail page for its duties and requirements.
def getjoborder(url):
    """Fetch the job-detail page *url* and return ``(duties, requirements)``.

    Both values come from the two ``<ul class="squareli">`` lists on the
    detail page: index 0 is the job duties, index 1 the job requirements.
    Raises IndexError if the page does not contain both lists.
    """
    ret = requests.get(url, headers=header)
    ret.encoding = "utf-8"  # force UTF-8 to fix mojibake in the response body
    soup = BeautifulSoup(ret.text, 'html.parser')
    bullets = soup.select('ul[class="squareli"]')
    jobrequests = bullets[0].text  # job duties (工作職責)
    joborder = bullets[1].text     # job requirements (工作要求)
    return jobrequests, joborder
# Scrape every job row on one listing page and append it to tencent_job.txt.
def getjobinfo(url):
    """Fetch listing page *url*, follow each job's detail page, and append
    one line per job (name, url, location, headcount, date, duties,
    requirements) to ``tencent_job.txt``.
    """
    # gb18030 + errors='ignore' sidesteps characters the codec cannot map;
    # 'with' guarantees the file is closed (the original leaked the handle).
    with open("tencent_job.txt", "a", encoding='gb18030',
              errors='ignore') as myfile:
        ret = requests.get(url, headers=header)
        ret.encoding = "utf-8"  # force UTF-8 to fix mojibake
        soup = BeautifulSoup(ret.text, 'html.parser')
        # Each job row is a <tr> whose class is "even" or "odd"
        joblist = soup.find_all('tr', class_=['even', 'odd'])
        for job in joblist:
            link = job.select('td:nth-of-type(1) > a')[0]
            # NOTE(review): the site prefix string was lost in the paste
            # (`joburl = "" + ...`); the tutorial targets hr.tencent.com,
            # so that host is assumed here — confirm before running.
            joburl = "https://hr.tencent.com/" + link['href']
            jobname = link.text                                  # job title
            jobpeople = job.select('td:nth-of-type(3)')[0].text  # headcount
            jobaddre = job.select('td:nth-of-type(4)')[0].text   # location
            jobtime = job.select('td:nth-of-type(5)')[0].text    # publish date
            # One detail-page request per job — the original fetched the
            # same detail page twice ([0] and [1] via two separate calls).
            jobrequests, joborder = getjoborder(joburl)
            # NOTE(review): the field separators were lost in the paste;
            # a single space is assumed — adjust if the original used tabs.
            tt = " ".join([jobname, joburl, jobaddre, jobpeople, jobtime,
                           jobrequests, joborder])
            myfile.write(tt + "\n")
if __name__ == '__main__':
    # NOTE(review): the host prefix was lost in the paste — the relative
    # paths 'position.php?...' cannot be fetched on their own. The tutorial
    # targets https://hr.tencent.com/; confirm these URLs before running.
    base = 'https://hr.tencent.com/'
    mainurl = base + 'position.php?keywords=python'
    jobpage = getjobpage(mainurl)
    print(jobpage)
    for page in range(jobpage):
        # 10 results per page; 'start' is the zero-based result offset
        pageurl = (base + 'position.php?keywords=python&start='
                   + str(page * 10) + '#a')
        print("第" + str(page + 1) + "頁")  # progress: "page N"
        getjobinfo(pageurl)
python爬蟲爬取騰訊網招聘資訊
話不多說,直接上 from bs4 import beautifulsoup import urllib2 import json 使用了json格式儲存 deftengxun detail,num url detail position.php?start 0 a request urllib2....
爬蟲 爬取騰訊熱點
1.了解ajax載入 2.通過chrome的開發者工具,監控網路請求,並分析 3.用selenium完成爬蟲 4.實現 用selenium爬取 的熱點精選,熱點精選至少爬50個出來,儲存成 csv 每一行如下 標號 從1開始 標題,鏈結,前三個為必做,後面內容可以自己加 import time fr...
python3 scrapy 爬取騰訊招聘
安裝scrapy不再贅述,在控制台中輸入scrapy startproject tencent 建立爬蟲專案名字為 tencent 接著cd tencent 用pycharm開啟tencent專案 構建item檔案 coding utf 8 define here the models for yo...