此爬蟲使用了以下庫:
selenium + chromedriver
BeautifulSoup (bs4)
requests
代碼如下:
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from bs4 import BeautifulSoup
import requests_download
import time

# Map of file name -> download link, collected from the target page.
files = {}
# NOTE(review): the original post lost the target URL when it was scraped;
# fill in req_url before running.
req_url = ''

chrome_options = Options()
chrome_options.add_argument('--headless')  # headless mode: no visible browser window
browser = webdriver.Chrome(chrome_options=chrome_options)
try:
    browser.get(req_url)
    soup = BeautifulSoup(browser.page_source, 'html.parser')
    # The page keeps its download links inside <ul> lists under the first <th>.
    th = soup.find('th')
    for ul in th.find_all('ul'):
        for anchor in ul.find_all('a'):
            # Anchor text is the file name; href is the download link.
            files[anchor.get_text()] = anchor.get('href')
finally:
    # quit() shuts down the driver and all windows; without it the
    # chromedriver process would leak if scraping raised.
    browser.quit()

for name in files:
    requests_download.download(name, files[name])
    time.sleep(2)  # throttle: be polite to the server between downloads
import requests


def download(file_name, url_file):
    """Stream-download *url_file* and save it as .\\downloads\\<file_name>.xls.

    Parameters:
        file_name: base name (without extension) for the saved file.
        url_file: URL path of the file to fetch.
    """
    # NOTE(review): the base-URL prefix was lost when the post was scraped;
    # restore it here before use.
    url_file = '' + url_file
    # stream=True defers body download so it can be written chunk by chunk.
    with requests.get(url_file, stream=True) as r:
        path = r'.\downloads' + '\\' + file_name + '.xls'
        # 'with' guarantees the file is closed even on error (the original
        # never closed it).
        with open(path, "wb+") as f:
            for chunk in r.iter_content(chunk_size=512):
                if chunk:  # skip keep-alive chunks
                    f.write(chunk)
部分結果如下:
新手上路,不足之處請多指教
selenium 爬取拉勾
用到的知識點 用selenium爬取拉勾 from lxml import etree from selenium import webdriver from selenium.webdriver.support import expected conditions as ec from selen...
selenium爬取拉勾
用到的知識點 用selenium爬取拉勾 from lxml import etree from selenium import webdriver from selenium.webdriver.support import expected conditions as ec from selen...
selenium爬取京東商品
from selenium import webdriver import time import pandas as pd url browser webdriver.chrome browser.get url 找到頁面中的搜尋框,然後輸入想找的商品 browser.find element b...