#!/usr/bin/env python
# coding=utf-8
# @author: holley
# @file: getlegaldata.py
# @datetime: 20/11/2018 22:16
import re
import csv
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.wait import WebDriverWait
from selenium.common.exceptions import NoSuchElementException

class DownLegalData(object):
    def __init__(self):
        self.id = 'mymenuid'
        self.reexp1 = re.compile(r'var mymenu =\n.*?\[(null, \'201.*?)]]', re.S)
        self.reexp2 = re.compile(r'var mymenu =\n.*?0%>(.*)nbsp', re.S)
        self.csvfilename = "legaldata.csv"
        self.csvdata = []  # rows to be written out by write_csv
        self.servers = {}  # court-category name -> url, filled by get_urls

    def get_source(self, url):
        """Use Selenium to drive the browser to the given url and return the page source.

        :param url: page to fetch
        :return: the rendered page source
        """
        browser = webdriver.Chrome()
        try:
            browser.get(url)
            wait = WebDriverWait(browser, 10)
            wait.until(EC.presence_of_element_located((By.ID, self.id)))
            # url = browser.current_url
            return browser.page_source
        except NoSuchElementException:
            print('no element')
        finally:
            browser.close()
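
    # Note on the wait above: WebDriverWait(browser, 10) polls the DOM until
    # EC.presence_of_element_located finds an element whose id equals self.id,
    # and gives up after 10 seconds. wait.until() raises
    # selenium.common.exceptions.TimeoutException on failure rather than
    # NoSuchElementException, so a caller may want to catch that exception too.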

    def get_urls(self, source):
        """Get the urls of all court categories, plus the sub-category urls
        of the first court category.

        :param source: page source of the entry page
        :return: dict mapping sub-category name -> url for the first category
        """
        soup = BeautifulSoup(source, 'lxml')
        # the attribute filter was lost from the original; the menu <div> is
        # assumed to carry the same id that get_source waits for
        table = soup.find('div', id=self.id)
        result = table.find_all('a')  # every link together with its name
        urls = result[12::2]  # links of the other court categories
        for i in urls:
            url = i['href']
            name = i.text
            self.servers[name] = url  # court-category url
        suburls = result[:11:2]  # links of the first court and its sub-categories
        sub_urls = {}
        for i in range(1, len(suburls)):
            value = suburls[i]['href']
            key = suburls[i].text
            sub_urls[key] = value  # sub-category urls of the first court category
        return sub_urls

    def get_secondurls(self, source):
        """Get the per-year urls under a sub-category.

        :param source: page source of a sub-category page
        :return: dict mapping year label -> url
        """
        pattern = self.reexp1
        table = re.search(pattern, source).group(1)  # str
        result = table.split('],[')  # list of menu entries
        years = {}  # renamed from `dict` to avoid shadowing the built-in
        for i in result:
            key = i.split(', ')[1].split('<')[0]
            value = i.split(', ')[2].replace('amp;', '')
            years[key] = value
        return years
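
    # Illustration (hypothetical input, not taken from the real site): after
    # reexp1 matches, `table` looks roughly like
    #   null, '2018<b>, url2018],[null, '2017<b>, url2017
    # so splitting on '],[' yields one entry per year, field 1 holds the
    # label up to the first '<', and field 2 holds the url.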

    def get_thirdurls(self, source):
        """Get the lowest-level urls, document numbers and dates.

        :param source: page source of a year page
        :return: dict mapping 'date: number' -> url
        """
        pattern = self.reexp2
        table = re.search(pattern, source).group(1)  # str
        # print(table)
        # NOTE: several lines of the original method were lost; the split
        # delimiter, the loop, and the url pattern below are reconstructed
        # by analogy with get_secondurls and are assumptions.
        firstresult = table.split('],[')
        datas = {}
        for secondresult in firstresult:
            url = re.search(r"href='(.*?)'", secondresult).group(1)  # assumed pattern
            num = re.search(r'(.*?)<', secondresult).group(1)
            date = re.search(r'33>(.*?)<', secondresult).group(1)
            datas[date + ': ' + num] = url
        return datas

    def write_csv(self):
        # 'w' + newline='' is the Python 3 csv idiom; the original used 'wb' (Python 2)
        with open(self.csvfilename, 'w', newline='', encoding='utf-8') as csvfile:
            spamwriter = csv.writer(csvfile, dialect='excel')
            # write the header row
            spamwriter.writerow(["court category", "sub-category", "year", "date & number", "content"])
            # write every row collected in csvdata to csvfilename
            for item in self.csvdata:
                spamwriter.writerow(item)
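
    # Each item appended to self.csvdata is expected to be a five-element
    # sequence matching the header above, e.g. (hypothetical values):
    #   ["basic court", "civil division", "2018", "2018-11-20: no.123", "..."]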

if __name__ == '__main__':
    server = ''  # entry url, left blank in the original
    dld = DownLegalData()
    html1 = dld.get_source(server)
    secondurls = dld.get_urls(html1)
    for k2, v2 in secondurls.items():
        html2 = dld.get_source(v2)
        thirdurls = dld.get_secondurls(html2)
        dic = {}
        dic[k2] = thirdurls.keys()
        print(dic)
        # for k3, v3 in thirdurls.items():
        #     html3 = dld.get_source(v3)
        #     lasturls = dld.get_thirdurls(html3)
        #     print(lasturls)
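
Because get_source launches a new, visible Chrome window for every page it fetches, the crawl is slow. Below is a minimal sketch of a headless variant that reuses one browser, assuming a reasonably recent Selenium and chromedriver; the url is a placeholder, not part of the original script:

# sketch: one headless Chrome instance shared across requests
from selenium import webdriver
from selenium.webdriver.chrome.options import Options

options = Options()
options.add_argument('--headless')  # run Chrome without opening a window
browser = webdriver.Chrome(options=options)
try:
    browser.get('http://example.com')  # placeholder url
    print(len(browser.page_source))
finally:
    browser.quit()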