#!/usr/bin/env python
# encoding: utf-8
"""@description: Scrape job data from Lagou (拉勾網) with Selenium."""
from selenium import webdriver
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.by import By
from lxml import etree
import time
import re


# Wrap the crawler in a class
class LagouSpider(object):
    driver_path = "/usr/local/bin/chromedriver"

    def __init__(self):
        # Initialize the Chrome driver
        self.driver = webdriver.Chrome(executable_path=LagouSpider.driver_path)
        # Base URL of the Lagou job-list page (left blank in the original post)
        self.base_url = ''
        # Collected position data
        self.positions = []
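
    # Note (not in the original post): Selenium 4 removed the executable_path
    # keyword; a rough equivalent there would be
    #   from selenium.webdriver.chrome.service import Service
    #   webdriver.Chrome(service=Service(LagouSpider.driver_path))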

    def run(self):
        """
        Crawl the job-list pages with Selenium.
        :return:
        """
        # 1. Open the base_url
        self.driver.get(self.base_url)
        while True:
            # 2. Wait until the pager has rendered, then grab the page source
            #    (works the same for page 1, page 2, page 3, ...)
            WebDriverWait(self.driver, timeout=20).until(
                EC.presence_of_element_located(
                    (By.XPATH, "//div[@class='pager_container']/span[last()]"))
            )
            source = self.driver.page_source
            # 3. Parse the current list page
            self.parse_list_page(source)
            # 4. Locate the "next page" button
            next_btn = self.driver.find_element_by_xpath(
                "//div[@class='pager_container']/span[last()]"
            )
            # 5. Click it, unless we are already on the last page
            if "pager_next_disabled" in next_btn.get_attribute('class'):
                # Last page reached: quit the browser and stop
                self.driver.quit()
                break
            else:
                next_btn.click()
            # 6. Rest for one second after each page
            time.sleep(1)
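
    # Note (not in the original post): find_element_by_xpath() was removed in
    # Selenium 4; the equivalent call there is
    #   self.driver.find_element(By.XPATH,
    #                            "//div[@class='pager_container']/span[last()]")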

    def parse_list_page(self, source):
        """
        Parse one list page and extract the detail-page links.
        :param source: page source of the list page
        :return:
        """
        # Each position links out via an <a> tag with class='position_link'
        html = etree.HTML(source)
        links = html.xpath('//a[@class="position_link"]/@href')
        # Visit the detail page of every position
        for link in links:
            self.request_detail_page(link)
            # Rest for one second after each detail page
            time.sleep(1)

    def request_detail_page(self, detail_url):
        """
        Open a detail page.
        :param detail_url: URL of the detail page
        :return:
        """
        # Open the detail page in a new window instead of replacing the list
        # page: keep exactly two windows, window 0 = list page, window 1 = detail page
        # self.driver.get(detail_url)
        self.driver.execute_script("window.open('%s')" % detail_url)
        self.driver.switch_to.window(self.driver.window_handles[1])
        # Grab the content of the detail page
        detail_page_source = self.driver.page_source
        # Parse the detail page
        self.parse_detail_page(detail_page_source)
        # Close the detail window (the current one) and switch the driver
        # handle back to the list page
        self.driver.close()
        self.driver.switch_to.window(self.driver.window_handles[0])
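
    # Note (not in the original post): window_handles[1] assumes the new window
    # is always listed last, which the WebDriver spec does not guarantee; a more
    # defensive variant remembers the list-page handle first:
    #   list_handle = self.driver.current_window_handle
    #   detail_handle = [h for h in self.driver.window_handles
    #                    if h != list_handle][0]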

    def parse_detail_page(self, detail_page_source):
        """
        Parse a detail page.
        :param detail_page_source:
        :return:
        """
        html_element = etree.HTML(detail_page_source)
        # 1. Extract the fields with XPath
        # [field] position name
        position_name = html_element.xpath(
            "//div[@class='job-name']/span/text()")[0]
        job_request_spans = html_element.xpath(
            "//dd[@class='job_request']//span")
        # [field] salary
        salary = job_request_spans[0].xpath('./text()')[0].strip()
        # [field] city
        # (use a regex to drop the '/' separators and whitespace)
        city_pre = job_request_spans[1].xpath('./text()')[0].strip()
        city = re.sub(r'[\s/]', '', city_pre)
        # [field] years of experience
        work_years_pre = job_request_spans[2].xpath('./text()')[0].strip()
        work_years = re.sub(r'[\s/]', '', work_years_pre)
        # [field] education (strip whitespace and the '/' separators)
        education_pre = job_request_spans[3].xpath('./text()')[0].strip()
        education = re.sub(r'[\s/]', '', education_pre)
        # [field] full-time
        full_time = job_request_spans[4].xpath('./text()')[0].strip()
        # [field] job description, returned as a list of text nodes
        desc_pre = html_element.xpath('//dd[@class="job_bt"]//text()')
        # Join the list into a single string and strip surrounding whitespace
        desc = ''.join(desc_pre).strip()
        # [field] company name
        company_name = html_element.xpath(
            '//h2[@class="fl"]/text()')[0].strip()
        # Collect the fields into one record (the dict literal was truncated in
        # the source, so these key names are an assumption)
        position = {
            'name': position_name,
            'salary': salary,
            'city': city,
            'work_years': work_years,
            'education': education,
            'full_time': full_time,
            'desc': desc,
            'company_name': company_name,
        }
        self.positions.append(position)
        print('==' * 30)
        print('Scraped one position successfully')
        print(position)
        print('==' * 30)


if __name__ == '__main__':
    # 1. Create the spider instance
    spider = LagouSpider()
    # 2. Start crawling
    spider.run()
    # 3. Inspect the collected data
    print(spider.positions)
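
# A quick way to smoke-test the parsing alone (assumption: you have saved a
# detail page to disk beforehand) is to feed parse_detail_page() directly:
#   spider = LagouSpider()
#   with open('detail.html', encoding='utf-8') as f:
#       spider.parse_detail_page(f.read())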