巧用selenium爬取巨潮資訊公司資料

2021-10-09 16:44:18 字數 4368 閱讀 6362

立項背景:在做深度學習的過程中利用python進行建模,需要資料來訓練模型。

專案流程圖

原始碼

# Scrape listed-company data from the cninfo (巨潮資訊) data platform with Selenium.
#
# Flow: open the site, search the industry tree for "製造業" (manufacturing),
# pick two sub-industries, tick the report fields to export, then page through
# the result table and append every row to a per-industry data.csv.

from selenium.webdriver import Chrome            # browser driver (class, capital C)
from selenium.webdriver.common.keys import Keys  # keyboard constants (Keys.ENTER)
import time      # fixed delays while the page lazy-loads
import requests  # present in the original source; unused below
import os
import csv
import codecs    # present in the original source; unused below

data_index = 1          # running row number, written into column 0 of each CSV row
data = [1, 2, 3, 4, 5]  # reusable 5-slot row buffer: [row number, col1..col4]
temp_index1 = 1
temp_index2 = 1

browser = Chrome()  # launch the browser
# NOTE(review): the target URL was censored out of the original post ("**");
# fill in the cninfo data-browse page URL before running.
browser.get("")
browser.find_element_by_id('btn2').click()  # enter the data platform
time.sleep(3)

# Scroll far down so the lazy-loaded widgets render.
# (DOM property is camelCase: document.documentElement.scrollTop.)
js = "var q=document.documentElement.scrollTop=10000"
browser.execute_script(js)
time.sleep(3)

# Type the industry name into the search box and press Enter.
browser.find_element_by_xpath(
    '//div[@class="detail-cont-top"]//*[@type="text"]'
).send_keys("製造業", Keys.ENTER)

browser.find_element_by_xpath(
    '//*[@id="root"]/div/div[3]/div/div/div/div[1]/div/div[2]/div[1]/div[2]/div[1]/div[1]/div/i'
).click()

# Each <li> is one sub-industry in the result tree; [11:13] limits this run
# to two sub-industries.
lis = browser.find_elements_by_xpath(
    '//*[@id="result_a"]/ul//li[@class="tree-empty tree-opened"]'
)
for li in lis[11:13]:
    src_name = li.find_element_by_xpath('.//a').get_attribute("data-name")
    print(src_name)

    # One output directory per sub-industry: f:\pang\<sub-industry name>.
    path = 'f:\\pang' + '\\' + src_name
    is_exists = os.path.exists(path)
    if not is_exists:
        os.makedirs(path)

    # --- second panel: select the sub-industry and the fields to export ---
    li.find_element_by_xpath('.//a').click()
    browser.find_element_by_xpath(
        '//*[@id="root"]/div/div[3]/div/div/div/div[1]/div/div[2]/div[1]/div[2]/div[2]/div[1]/div[1]/label/i'
    ).click()
    browser.find_element_by_xpath(
        '//div[@class="arrow-btns"]/button[1]'
    ).click()
    print("手點")  # manual step: the author clicks something by hand here
    time.sleep(10)

    # Tick the field checkboxes (positions fixed by the page layout),
    # confirm the selection, then press the query button.
    browser.find_element_by_xpath(
        '//*[@id="root"]/div/div[3]/div/div/div/div[1]/div/div[2]/div[1]/div[2]/div[2]/div[3]/div[1]/label/i'
    ).click()
    browser.find_element_by_xpath(
        '//*[@id="root"]/div/div[3]/div/div/div/div[1]/div/div[2]/div[2]/div[2]/div[1]/div[3]/ul/li[1]/label/i'
    ).click()
    browser.find_element_by_xpath(
        '//*[@id="root"]/div/div[3]/div/div/div/div[1]/div/div[2]/div[2]/div[2]/div[1]/div[3]/ul/li[3]/label/i'
    ).click()
    browser.find_element_by_xpath(
        '//*[@id="root"]/div/div[3]/div/div/div/div[1]/div/div[2]/div[2]/div[2]/div[1]/div[3]/ul/li[15]/label/i'
    ).click()
    browser.find_element_by_xpath(
        '//*[@id="root"]/div/div[3]/div/div/div/div[1]/div/div[2]/div[2]/div[2]/div[1]/div[3]/ul/li[16]/label/i'
    ).click()
    browser.find_element_by_xpath(
        '//*[@id="root"]/div/div[3]/div/div/div/div[1]/div/div[2]/div[2]/div[2]/div[2]/button[1]'
    ).click()
    browser.find_element_by_xpath(
        '//*[@id="root"]/div/div[3]/div/div/div/div[1]/div/div[2]/div[2]/div[2]/div[3]/div[1]/label/i'
    ).click()
    browser.find_element_by_xpath(
        '//*[@id="root"]/div/div[3]/div/div/div/div[1]/div/button'
    ).click()
    # The result table is now rendered.

    # Walk up to 200 pages; each page shows 10 rows of 4 columns.
    for page_index in range(200):
        for i in range(10):
            for j in range(4):
                # Cell XPaths in the table are 1-based: tr[i+1]/td[j+1].
                temp = ('//*[@id="contenttable"]/tbody/tr[' + str(i + 1)
                        + ']/td[' + str(j + 1) + ']')
                browser.execute_script(js)  # keep the table scrolled into view
                time.sleep(3)
                # The full cell text lives in the title attribute.
                data_1 = browser.find_element_by_xpath(temp).get_attribute('title')
                data[0] = data_index
                data[j + 1] = data_1
            data_index += 1
            print(data)
            # Append each completed row to <industry dir>\data.csv.
            path_new = path + '\\data.csv'
            with open(path_new, 'a', encoding='utf-8', newline='') as csvfile:
                writer = csv.writer(csvfile)
                writer.writerow(data)
        print(page_index + 1)
        # Advance to the next result page and wait for it to load.
        browser.find_element_by_xpath(
            '//li[@class="page-next"]'
        ).click()
        time.sleep(10)

專案執行效果

selenium 爬取拉勾

用到的知識點 用selenium爬取拉勾 from lxml import etree from selenium import webdriver from selenium.webdriver.support import expected conditions as ec from selen...

selenium爬取拉勾

用到的知識點 用selenium爬取拉勾 from lxml import etree from selenium import webdriver from selenium.webdriver.support import expected conditions as ec from selen...

selenium爬取京東商品

from selenium import webdriver import time import pandas as pd url browser webdriver.chrome browser.get url 找到頁面中的搜尋框,然後輸入想找的商品 browser.find element b...