Using Selenium to page through the job listings and collect the job links, then parsing each link.
Some pages come back as empty lists; it feels like the network is too slow, but even after adding time.sleep() the empty lists still show up.
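A likely cause (my reading; not confirmed in the post): driver.page_source is read right after get() or after clicking "next", before the Ajax-rendered job list has loaded, so a fixed time.sleep() is a race you sometimes lose. Below is a minimal sketch of an explicit wait instead, reusing the position_link class from the code further down; get_links_when_ready is a hypothetical helper, not part of the original:

from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC


def get_links_when_ready(driver, timeout=10):
    # block until at least one job link is actually rendered (raises
    # TimeoutException if the list never appears within `timeout` seconds)
    WebDriverWait(driver, timeout).until(
        EC.presence_of_all_elements_located((By.CLASS_NAME, 'position_link')))
    return [a.get_attribute('href')
            for a in driver.find_elements_by_xpath('//a[@class="position_link"]')]

Calling a helper like this in run() instead of reading page_source immediately should make the link collection independent of network speed.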
from selenium import webdriver
import requests
import re
from lxml import etree
import time
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.by import By


class LagouSpider(object):
    def __init__(self):
        opt = webdriver.ChromeOptions()
        # run Chrome in headless (no-UI) mode
        opt.set_headless()
        self.driver = webdriver.Chrome(options=opt)
        # the scheme/host of the listing URL were stripped when the post was
        # published; only the keyword and query string survived
        self.url = '爬蟲?px=default&city=北京'
        # the contents of the headers dict were also lost in the original post
        self.headers = {}

    def run(self):
        self.driver.get(self.url)
        while True:
            html = self.driver.page_source
            links = self.get_one_page_links(html)
            for link in links:
                print('\n' + link + '\n')
                self.parse_detail_page(link)

            # wait until the "next page" button is present
            WebDriverWait(self.driver, 10).until(
                EC.presence_of_element_located((By.CLASS_NAME, 'pager_next')))
            next_page_btn = self.driver.find_element_by_class_name('pager_next')

            # the button carries pager_next_disabled on the last page: stop
            # there, otherwise click through to the next page
            if 'pager_next_disabled' in next_page_btn.get_attribute('class'):
                break
            else:
                next_page_btn.click()
            time.sleep(1)

    def get_one_page_links(self, html):
        # note: the html argument is unused; links are read from the live driver
        links = []
        hrefs = self.driver.find_elements_by_xpath('//a[@class="position_link"]')
        for href in hrefs:
            links.append(href.get_attribute('href'))
        return links

    def parse_detail_page(self, url):
        job_information = {}
        response = requests.get(url, headers=self.headers)
        time.sleep(2)
        html = response.text
        html_element = etree.HTML(html)
        job_name = html_element.xpath('//div[@class="job-name"]/@title')
        job_description = html_element.xpath('//dd[@class="job_bt"]//p//text()')
        for index, i in enumerate(job_description):
            job_description[index] = re.sub('\xa0', '', i)
        job_address = html_element.xpath('//div[@class="work_addr"]/a/text()')
        job_salary = html_element.xpath('//span[@class="salary"]/text()')

        # string cleanup: strip the "view map" (檢視地圖) link text and any
        # empty entries it leaves behind
        for index, i in enumerate(job_address):
            job_address[index] = re.sub('檢視地圖', '', i)
        while '' in job_address:
            job_address.remove('')

        job_information['job_name'] = job_name
        job_information['job_description'] = job_description
        job_information['job_address'] = job_address
        job_information['job_salary'] = job_salary
        print(job_information)


def main():
    spider = LagouSpider()
    spider.run()


if __name__ == '__main__':
    main()
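One more note beyond the original post: opt.set_headless() and the find_element(s)_by_* helpers are Selenium 3 APIs; they were deprecated and later removed in Selenium 4. A sketch of the equivalent calls, assuming a current Selenium 4 install:

from selenium import webdriver
from selenium.webdriver.common.by import By

opt = webdriver.ChromeOptions()
opt.add_argument('--headless')  # replaces opt.set_headless()
driver = webdriver.Chrome(options=opt)

# after driver.get(...) on the listing page, as in the spider above:
next_page_btn = driver.find_element(By.CLASS_NAME, 'pager_next')
hrefs = driver.find_elements(By.XPATH, '//a[@class="position_link"]')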