爬取51崗位(xpath的運用)

2021-08-21 06:01:33 字數 2742 閱讀 1647

# coding:utf-8

import os

import re

import requests

import lxml

from lxml import etree

# Request headers sent with every HTTP call made by the functions below.
# A browser-like User-Agent is supplied because the functions pass this
# module-level dict to requests.get(); without this definition they fail
# with NameError.
headers = {
    'User-Agent': (
        'Mozilla/5.0 (Windows NT 10.0; Win64; x64) '
        'AppleWebKit/537.36 (KHTML, like Gecko) '
        'Chrome/92.0.4515.159 Safari/537.36'
    ),
}

# Get the list of cities
def getcitylist(url):
    """Return a mapping of city name to that city's listing URL.

    Downloads ``url`` with the module-level ``headers``, decodes the body
    as GBK and collects every ``<a>`` found under
    ``//div[@class="e e4"][1]//div[@class="lkst"]``.

    Args:
        url: Address of the page that lists the cities.

    Returns:
        dict mapping the link text (city name) to its ``href``.

    Raises:
        IndexError: if a matched ``<a>`` has no text node or no ``href``.
        UnicodeDecodeError: if the response is not valid GBK (the decode
            here is strict).
    """
    html = requests.get(url, headers=headers).content.decode('gbk')

    # The lxml parser entry point is etree.HTML (upper case); there is no
    # ``etree.html`` attribute.
    mytree = etree.HTML(html)

    citylist = mytree.xpath('//div[@class="e e4"][1]//div[@class="lkst"]/a')

    citydict = {}
    for city in citylist:
        cityname = city.xpath('./text()')[0]
        cityurl = city.xpath('./@href')[0]
        citydict[cityname] = cityurl

    return citydict

# Get the number of pages
def getpagenum(url):
    """Return the total number of listing pages for one city.

    Downloads ``url`` with the module-level ``headers``, decodes it as
    GB2312 (undecodable bytes are dropped), reads the text of the first
    ``<span>`` under the element with id ``cppageno`` and extracts the
    number that follows the character "共". The number is also printed.

    Args:
        url: Address of a city's job-listing page.

    Returns:
        The page count as an ``int``.

    Raises:
        IndexError: if the pager ``<span>`` is missing or its text does not
            contain a page count.
    """
    html = requests.get(url, headers=headers).content.decode('gb2312', 'ignore')

    # The lxml parser entry point is etree.HTML (upper case).
    mytree = etree.HTML(html)

    pagertext = mytree.xpath('//*[@id="cppageno"]/span[1]/text()')[0]

    # Raw string so that ``\d`` is a regex escape rather than an invalid
    # string escape. Both the Simplified (页) and Traditional (頁) forms of
    # "page" are accepted: the body is decoded as GB2312, a Simplified
    # Chinese charset, so the Traditional form alone is not expected to
    # ever match.
    pat = r'共(\d+)[页頁]'
    pagenum = re.findall(pat, pagertext)[0]
    print(pagenum)

    return int(pagenum)

# Get the details of every job posting
def getjobinfo(cityurl, pagenum, cityname):
    """Scrape all listing pages of one city and append the jobs to a file.

    For each page ``1..pagenum`` the URL ``cityurl + 'p<page>'`` is fetched
    with the module-level ``headers`` and decoded as GBK (undecodable bytes
    are dropped). Every child ``<div>`` of ``div.detlist.gbox`` is treated
    as one job. For each job a tuple
    ``(name, company, address, salary, requirements, description, url)``
    is built, printed, and its ``str()`` appended to
    ``./data/<cityname>.txt`` (UTF-8).

    Args:
        cityurl: Base URL of the city's listing; the page suffix is
            appended to it.
        pagenum: Number of pages to fetch.
        cityname: City name, used as the output file name.

    Returns:
        None.

    Raises:
        IndexError: if a job entry lacks a title, link, company name,
            location or description attribute.
    """
    # Create the data directory up front: open() in append mode creates the
    # file but not its parent directory.
    os.makedirs('./data', exist_ok=True)
    outpath = './data/' + cityname + '.txt'

    for page in range(1, pagenum + 1):
        print('第' + str(page) + '頁')

        pageurl = cityurl + 'p%d' % page
        html = requests.get(pageurl, headers=headers).content.decode('gbk', 'ignore')

        # The lxml parser entry point is etree.HTML (upper case).
        mytree = etree.HTML(html)

        # Job list
        joblist = mytree.xpath('//div[@class="detlist gbox"]/div')

        for job in joblist:
            # Job title
            jobname = job.xpath('.//span[@class="title"]/a/@title')[0]
            # Link to the posting
            joburl = job.xpath('.//span[@class="title"]/a/@href')[0]
            # Company name
            companyname = job.xpath('.//p[@class="info"]/a/@title')[0]
            # Work location
            jobaddress = job.xpath('.//span[@class="location name"]/text()')[0]

            # Salary. In the original this assignment had been swallowed by
            # the comment on the same line, leaving ``jobmoney`` undefined.
            moneynodes = job.xpath('.//span[@class="location"]/text()')
            jobmoney = moneynodes[0] if moneynodes else '面談'

            # Requirements: concatenation of the stripped text fragments.
            orderlist = job.xpath('.//p[@class="order"]/text()')
            joborder = ''.join(order.strip() for order in orderlist)

            # Job description
            jobcontent = job.xpath('.//p[@class="text"]/@title')[0]

            content = str((jobname, companyname, jobaddress, jobmoney,
                           joborder, jobcontent, joburl))
            print(content)

            # Write to file. Leaving the ``with`` block flushes and closes
            # the file, so no explicit flush is needed.
            # NOTE(review): no separator is written between records, so
            # they run together on one line - confirm whether that is
            # intended before changing the output format.
            with open(outpath, 'a+', encoding='utf-8', errors='ignore') as f:
                f.write(content)

if __name__ == '__main__':
    # NOTE(review): the start URL is empty in this copy of the script; it
    # must be set to the page that lists the cities before running.
    url = ''

    # Walk every city returned by the city list and scrape all its pages.
    for cityname, cityurl in getcitylist(url).items():
        # For a quick test run, replace the computed count with a small
        # fixed value (the original suggests pagenum=20).
        pagenum = getpagenum(cityurl)
        getjobinfo(cityurl, pagenum, cityname)

xpath 語法運用例項 爬取boos

一 url的處理 import urllib.request from lxml import etree def bo url url headers for bo in bo list bo dict 職位型別 bo.xpath li h3 div class job title text bo...

datawhale爬蟲(xpath爬取丁香網評論)

1.xpath基礎學習 前面我們介紹了 beautifulsoup 的用法,這個已經是非常強大的庫了,不過還有一些比較流行的解析庫,例如 lxml,使用的是 xpath 語法,同樣是效率比較高的解析方法。如果大家對 beautifulsoup 使用不太習慣的話,可以嘗試下 xpath。xpath 是...

使用xpath解析爬取鏈家

from urllib import request from time import sleep from lxml import etree import csv import redis import re 1 資料抓取 定義一個函式,用於將頁碼,城市等資訊轉化為一個request物件 def...