第一頁職位資訊
from selenium import webdriver
from lxml import etree
import re
import time
''''''
class lagouspider(object):
    """Scrape Python job postings from the first Lagou listing page.

    Uses a real Chrome browser (Selenium) so that JavaScript-rendered
    listings are present in ``page_source``, then parses the HTML with lxml.
    """

    def __init__(self):
        # NOTE: webdriver.Chrome (capital C) — requires chromedriver on PATH.
        self.driver = webdriver.Chrome()
        # Listing URL for Python positions (fill in before running).
        self.url = ''
        # One dict per parsed job position, appended by parse_detail_page().
        self.positions = []

    def run(self):
        """Open the listing page and parse every job link on it."""
        self.driver.get(self.url)
        source = self.driver.page_source
        self.parse_list_page(source)

    def parse_list_page(self, source):
        """Extract every job's detail-page URL from a listing page and visit each."""
        html = etree.HTML(source)
        # Detail URLs of all positions on this page.
        links = html.xpath("//a[@class='position_link']/@href")
        for link in links:
            self.request_detail_page(link)
            # Throttle requests to reduce the chance of anti-crawler blocking.
            time.sleep(1)

    def request_detail_page(self, url):
        """Load one job detail page and hand its rendered HTML to the parser."""
        self.driver.get(url)
        # Rendered source of the job detail page.
        source = self.driver.page_source
        self.parse_detail_page(source)

    def parse_detail_page(self, source):
        """Parse one job detail page, store and print the extracted fields."""
        html = etree.HTML(source)
        position_name = html.xpath("//span[@class='name']/text()")[0]
        job_request_spans = html.xpath("//dd[@class='job_request']//span")
        salary = job_request_spans[0].xpath('.//text()')[0].strip()
        city = job_request_spans[1].xpath('.//text()')[0].strip()
        # Strip whitespace and the "/" separators Lagou puts around values.
        city = re.sub(r"[\s/]", "", city)
        work_years = job_request_spans[2].xpath('.//text()')[0].strip()
        work_years = re.sub(r"[\s/]", "", work_years)
        education = job_request_spans[3].xpath('.//text()')[0].strip()
        education = re.sub(r"[\s/]", "", education)
        desc = "".join(html.xpath("//dd[@class='job_bt']//text()")).strip()
        position = {
            'name': position_name,
            'salary': salary,
            'city': city,
            'work_years': work_years,
            'education': education,
            'desc': desc,
        }
        self.positions.append(position)
        print(position)
        print('-' * 200)
# Script entry point: build the spider and start crawling.
if __name__ == '__main__':
    lagouspider().run()
import re
import time

from lxml import etree
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.ui import WebDriverWait
''''''
class lagouspider(object):
    """Scrape Python job postings from every Lagou listing page.

    Walks the paginated listing with Selenium, opens each job detail in a
    new browser tab, parses it with lxml, then returns to the listing tab.
    """

    def __init__(self):
        # NOTE: webdriver.Chrome (capital C) — requires chromedriver on PATH.
        self.driver = webdriver.Chrome()
        # Listing URL for Python positions (fill in before running).
        self.url = ''
        # One dict per parsed job position, appended by parse_detail_page().
        self.positions = []

    def run(self):
        """Crawl listing pages until the "next" button is disabled."""
        self.driver.get(self.url)
        while True:
            source = self.driver.page_source
            # Wait until the pager is rendered, i.e. the listing finished loading.
            WebDriverWait(driver=self.driver, timeout=20).until(
                EC.presence_of_element_located(
                    (By.XPATH, "//div[@class='pager_container']/span[last()]"))
            )
            self.parse_list_page(source)
            next_btn = self.driver.find_element(
                By.XPATH, "//div[@class='pager_container']/span[last()]")
            if "pager_next_disabled" in next_btn.get_attribute("class"):
                # Last page reached — the "next" button is disabled.
                break
            else:
                next_btn.click()
            # Throttle paging to reduce the chance of anti-crawler blocking.
            time.sleep(1)

    def parse_list_page(self, source):
        """Extract every job's detail-page URL from a listing page and visit each."""
        html = etree.HTML(source)
        # Detail URLs of all positions on this page.
        links = html.xpath("//a[@class='position_link']/@href")
        for link in links:
            self.request_detail_page(link)
            time.sleep(1)

    def request_detail_page(self, url):
        """Open a detail page in a new tab, parse it, then return to the listing."""
        # New tab keeps the listing page (and its pagination state) intact.
        self.driver.execute_script("window.open('%s')" % url)
        self.driver.switch_to.window(self.driver.window_handles[1])
        # Wait for the job title element so the page is fully rendered.
        WebDriverWait(driver=self.driver, timeout=20).until(
            EC.presence_of_element_located(
                (By.XPATH, "//div[@class='job-name']/span[@class='name']"))
        )
        # Rendered source of the job detail page.
        source = self.driver.page_source
        self.parse_detail_page(source)
        # Close the detail tab and switch back to the listing tab.
        self.driver.close()
        self.driver.switch_to.window(self.driver.window_handles[0])

    def parse_detail_page(self, source):
        """Parse one job detail page, store and print the extracted fields."""
        html = etree.HTML(source)
        position_name = html.xpath("//span[@class='name']/text()")[0]
        job_request_spans = html.xpath("//dd[@class='job_request']//span")
        salary = job_request_spans[0].xpath('.//text()')[0].strip()
        city = job_request_spans[1].xpath('.//text()')[0].strip()
        # Strip whitespace and the "/" separators Lagou puts around values.
        city = re.sub(r"[\s/]", "", city)
        work_years = job_request_spans[2].xpath('.//text()')[0].strip()
        work_years = re.sub(r"[\s/]", "", work_years)
        education = job_request_spans[3].xpath('.//text()')[0].strip()
        education = re.sub(r"[\s/]", "", education)
        desc = "".join(html.xpath("//dd[@class='job_bt']//text()")).strip()
        company_name = html.xpath("//h2[@class='fl']/text()")[0].strip()
        position = {
            'name': position_name,
            'company_name': company_name,
            'salary': salary,
            'city': city,
            'work_years': work_years,
            'education': education,
            'desc': desc,
        }
        self.positions.append(position)
        print(position)
        print('-' * 200)
# Script entry point: build the spider and start crawling.
if __name__ == '__main__':
    lagouspider().run()
Python 爬蟲利器 Selenium
前面幾節,我們學習了用 requests 構造頁面請求來爬取靜態網頁中的資訊以及通過 requests 構造 ajax 請求直接獲取返回的 json 資訊。還記得前幾節,我們在構造請求時會給請求加上瀏覽器 headers,目的就是為了讓我們的請求模擬瀏覽器的行為,防止被網站的反爬蟲策略限制。今天要介紹...
Python 爬蟲利器 Selenium
前面幾節,我們學習了用 requests 構造頁面請求來爬取靜態網頁中的資訊以及通過 requests 構造 ajax 請求直接獲取返回的 json 資訊。還記得前幾節,我們在構造請求時會給請求加上瀏覽器 headers,目的就是為了讓我們的請求模擬瀏覽器的行為,防止被網站的反爬蟲策略限制。今天要介紹...
Python核心丨匿名函式
描述 匿名函式格式 lambda argument1,argument2,argumentn expression匿名函式的關鍵字是lambda,之後是一系列的引數,然後用冒號隔開,最後則是由這些引數組成的表示式。square lambda x x 2square 3 9寫成常規函式 def squ...