Scraping a table generated dynamically by JavaScript


```python
#!/usr/bin/env python
# coding=utf-8
# @author: holley
# @file: getlegaldata.py
# @datetime: 20/11/2018 22:16

import re
import csv

from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.wait import WebDriverWait
from selenium.common.exceptions import NoSuchElementException


class DownLegalData(object):

    def __init__(self):
        self.id = 'mymenuid'
        self.reexp1 = re.compile(r'var mymenu =\n.*?\[(null, \'201.*?)]]', re.S)
        self.reexp2 = re.compile(r'var mymenu =\n.*?0%>(.*)nbsp', re.S)
        self.csvfilename = "legaldata.csv"
        self.csvdata = []   # rows to be written out by write_csv()
        self.servers = {}   # court-category urls (initialisation lost in the original post)

    def get_source(self, url):
        """Drive the browser to url via Selenium and wait until the
        JS-generated menu element is present, so that page_source
        contains the dynamically built table.
        :return: the rendered page source
        """
        browser = webdriver.Chrome()
        try:
            browser.get(url)
            wait = WebDriverWait(browser, 10)
            wait.until(EC.presence_of_element_located((By.ID, self.id)))
            # url = browser.current_url
            return browser.page_source
        except NoSuchElementException:
            print('no element')
        finally:
            browser.close()
```
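The class above uses the Selenium 3-era API (`webdriver.Chrome()` with no options, `browser.close()`). As a reference point, here is a minimal self-contained sketch of the same explicit-wait pattern against a current Selenium 4 install; the URL is a placeholder and the headless flag is an addition, not something the original code used:

```python
# Minimal sketch of the wait-then-read-source pattern (assumed Selenium 4 API;
# the URL and the headless flag are placeholders, not from the article).
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.wait import WebDriverWait

options = Options()
options.add_argument('--headless=new')   # no visible browser window
browser = webdriver.Chrome(options=options)
try:
    browser.get('https://example.com/')  # placeholder URL
    WebDriverWait(browser, 10).until(
        EC.presence_of_element_located((By.ID, 'mymenuid'))  # same id as self.id
    )
    html = browser.page_source           # source captured *after* the JS has run
finally:
    browser.quit()
```

`presence_of_element_located` fires as soon as the element exists in the DOM, which is why `page_source` read afterwards contains the dynamically generated menu.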

```python
    def get_urls(self, source):
        """Collect the urls of every court category, plus the sub-category
        urls of the first court category.
        :param source: rendered page source
        :return: dict mapping sub-category name -> url
        """
        soup = BeautifulSoup(source, 'lxml')
        # the attribute filter was lost in the original post; id=self.id is a
        # guess based on the element get_source() waits for
        table = soup.find('div', id=self.id)
        result = table.find_all('a')   # every link together with its text
        urls = result[12::2]           # links of the other court categories
        for i in urls:
            url = i['href']
            name = i.text
            self.servers[name] = url   # court-category urls
        suburls = result[:11:2]        # the first court category and its sub-categories
        sub_urls = {}
        for i in range(1, len(suburls)):
            value = suburls[i]['href']
            key = suburls[i].text
            sub_urls[key] = value      # sub-category urls of the first court category
        return sub_urls
```
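The hard-coded slices (`result[12::2]`, `result[:11:2]`) are tied to the exact layout of that page's menu. A toy demonstration on made-up HTML shows the general find_all-then-slice idea:

```python
# Toy demonstration (fabricated HTML): pick every other <a> out of a flat
# list of links, as get_urls() does with its result[12::2] / result[:11:2] slices.
from bs4 import BeautifulSoup

html = ('<div id="mymenuid">'
        '<a href="/a">court A</a><a href="/a1">sub A1</a>'
        '<a href="/b">court B</a><a href="/b1">sub B1</a>'
        '</div>')
soup = BeautifulSoup(html, 'lxml')
links = soup.find('div', id='mymenuid').find_all('a')
tops = {a.text: a['href'] for a in links[::2]}   # every other link
print(tops)   # {'court A': '/a', 'court B': '/b'}
```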

```python
    def get_secondurls(self, source):
        """Extract the per-year urls under a sub-category.
        :param source: rendered page source
        :return: dict mapping year label -> url
        """
        pattern = self.reexp1
        table = re.search(pattern, source).group(1)   # str
        result = table.split('],[')                   # list of menu rows
        years = {}   # renamed from `dict` to avoid shadowing the builtin
        for i in result:
            key = i.split(', ')[1].split('<')[0]
            value = i.split(', ')[2].replace('amp;', '')
            years[key] = value
        return years
```
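Because the year menu is emitted as a JavaScript array literal (`var mymenu = ...`) rather than as HTML, the method above runs a regex over the raw source. An illustration on fabricated data (the real `mymenu` rows differ) makes the splitting steps concrete:

```python
# Fabricated sample of the embedded JS array; the regex and split logic mirror
# get_secondurls(), with extra strip("'") cleanup for this standalone demo.
import re

source = ("var mymenu =\n"
          "[[null, '2018<b>', 'list.php?year=2018&amp;id=1'],"
          "[null, '2017<b>', 'list.php?year=2017&amp;id=2']]")
pattern = re.compile(r"var mymenu =\n.*?\[(null, '201.*?)]]", re.S)
table = re.search(pattern, source).group(1)
years = {}
for row in table.split('],['):
    parts = row.split(', ')
    key = parts[1].split('<')[0].strip("'")                # year label
    years[key] = parts[2].replace('amp;', '').strip("'")   # unescaped url
print(years)   # {'2018': 'list.php?year=2018&id=1', '2017': 'list.php?year=2017&id=2'}
```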

```python
    def get_thirdurls(self, source):
        """Extract the lowest-level urls together with document number and date.
        :param source: rendered page source
        :return: dict mapping 'date: number' -> url
        """
        pattern = self.reexp2
        table = re.search(pattern, source).group(1)   # str
        # print(table)
        # NOTE: several lines of the original post were lost here; the loop
        # below is a plausible reconstruction from the surviving fragments,
        # reusing the same row-splitting scheme as get_secondurls()
        firstresult = table.split('],[')
        datas = {}
        for i in firstresult:
            secondresult = i.split(', ')[1]              # fragment holding number and date (guess)
            url = i.split(', ')[2].replace('amp;', '')   # cleaned link (guess)
            num = re.search(r'(.*?)<', secondresult).group(1)
            date = re.search(r'33>(.*?)<', secondresult).group(1)
            datas[date + ': ' + num] = url
        return datas

    def write_csv(self):
        # 'w' with newline='' is the Python 3 csv idiom; the original opened
        # the file in 'wb' mode, which only works under Python 2
        with open(self.csvfilename, 'w', newline='', encoding='utf-8') as csvfile:
            spamwriter = csv.writer(csvfile, dialect='excel')
            # header row: court category, sub-category, year, date/number, content
            spamwriter.writerow(["法院分類", "子分類", "年份", "日期編號", "內容"])
            # write every row collected in csvdata
            for item in self.csvdata:
                spamwriter.writerow(item)
```
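A short usage sketch, assuming the class above is defined in the same file; the row values are made-up placeholders matching the five header columns:

```python
# Hypothetical rows only; real rows would come from the scraping methods above.
dld = DownLegalData()
dld.csvdata.append(['高級法院', '刑事', '2018', '2018-01-02: 001', '...'])
dld.write_csv()   # writes legaldata.csv: header row plus the collected rows
```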

```python
if __name__ == '__main__':
    server = ''   # the start url was blanked in the original post
    dld = DownLegalData()
    html1 = dld.get_source(server)
    secondurls = dld.get_urls(html1)
    for k2, v2 in secondurls.items():
        html2 = dld.get_source(v2)
        thirdurls = dld.get_secondurls(html2)
        dic = {}
        dic[k2] = list(thirdurls.keys())
        print(dic)
        # for k3, v3 in thirdurls.items():
        #     html3 = dld.get_source(v3)
        #     lasturls = dld.get_thirdurls(html3)
        #     print(lasturls)
```
