爬蟲 百度貼吧相簿

2021-10-09 11:43:41 字數 2481 閱讀 4391

import requests

from lxml import etree

from fake_useragent import useragent

import os

from selenium import webdriver

# Module-level state shared between get_urls() and the download loop at the
# bottom of the script: `urls` holds the album hrefs, `name` the album titles,
# filled in parallel by get_urls() via `global`.
# NOTE(review): the original right-hand sides were lost when this article was
# scraped; empty lists are the only initial values consistent with the later
# `urls = html_useful.xpath(...)` reassignment and the `for num in urls:` loop.
urls = []
name = []

def get_urls(input):
    """Fetch the album-list page for one Tieba user and fill the module
    globals ``urls`` (album hrefs) and ``name`` (album titles).

    NOTE(review): every string literal in this function (request URL,
    headers dict, proxy address, both XPath expressions) was stripped when
    the article was scraped.  Each lost literal is marked ``TODO`` below
    and must be restored from the original post before this can run.

    :param input: Tieba keyword/user to crawl (e.g. ``'陳奕迅'``).
                  The name shadows the ``input`` builtin; kept to preserve
                  the original signature.
    """
    # fake_useragent's class is ``UserAgent`` — the scraper lowercased it.
    from fake_useragent import UserAgent
    ua = UserAgent()
    headers = {}                 # TODO: headers literal lost — presumably {'User-Agent': ua.random}
    proxy = ''                   # TODO: your own proxy address (original comment: 自己的**位址)
    proxies = {}                 # TODO: proxies literal lost — presumably {'http': proxy, 'https': proxy}
    url = f''                    # TODO: album-list URL (built from `input`) lost in scraping
    response = requests.get(url, headers=headers, proxies=proxies).text
    # The server returns most of the useful markup inside HTML comments, so
    # XPath finds nothing until the comment markers are stripped (this is
    # what the original author's note explains).
    html = response.replace(r'', ' ').replace('#', '"')  # TODO: first replace() target lost
    global urls
    global name
    # lxml's parser function is ``etree.HTML`` — the scraper lowercased it.
    html_useful = etree.HTML(html)
    urls = html_useful.xpath('')   # TODO: album-href XPath lost in scraping
    name = html_useful.xpath('')   # TODO: album-title XPath lost in scraping
    global path_fore
    # Builds a per-album save path; only the last value survives the loop,
    # which matches the original code (presumably each album's directory is
    # created elsewhere — verify against the lost download loop).
    for j in range(len(name)):
        path_fore = os.path.join('chenyixun', name[j])
    print(urls)
    print(name)

def image_urls(num, xiang_name):
    """Open one album page with Selenium and collect its image URLs.

    NOTE(review): the request URL, the chromedriver path, the headers dict,
    the proxy address, and the trailing download loop (which used
    ``image_url``/``xiang_name``) were all lost when the article was
    scraped — marked ``TODO`` below.

    :param num: album identifier/href used to build the album URL.
    :param xiang_name: album title, used for the save path in the lost
                       download code.
    """
    url = f''  # TODO: album URL (相簿位址) lost; presumably built from `num`
    # Selenium's class is ``ChromeOptions`` — the scraper lowercased it.
    options = webdriver.ChromeOptions()
    # options.add_experimental_option("prefs", {...})  # skip loading images to speed up access (literal lost)
    # Hide the automation banner so the site is less likely to detect
    # Selenium (original comment: 設定為開發者模式, 防止被識別出來是selenium).
    # The switch name is camelCase 'excludeSwitches' — the scraper lowercased it.
    options.add_experimental_option('excludeSwitches', ['enable-automation'])
    # options.add_argument('--headless')  # keep the browser window from popping up
    browser = webdriver.Chrome(executable_path='',  # TODO: chromedriver path lost
                               options=options)
    browser.get(url)
    html = browser.page_source
    browser.close()
    html_useful = etree.HTML(html)
    image_url = html_useful.xpath(
        '//div[@class="ag_container"]//div[@class="ag_main"]//a[@class="ag_ele_a"]//img/@src'
    )
    from fake_useragent import UserAgent  # correct class name; see get_urls()
    ua = UserAgent()
    headers = {}  # TODO: headers literal lost — presumably {'User-Agent': ua.random}
    proxy = ''    # TODO: your own proxy address (original comment: 自己的**位址)
    # NOTE(review): the loop that downloaded each entry of `image_url` into
    # the xiang_name directory was lost in scraping and must be restored.

# Script entry: crawl Eason Chan's ('陳奕迅') Tieba albums, then visit each
# album and collect its images.  `urls` and `name` are filled as a side
# effect of get_urls() via `global`.
get_urls('陳奕迅')
for i, num in enumerate(urls):
    xiang_name = name[i]
    print(name[i] + '')  # TODO: progress-message suffix literal lost in scraping
    image_urls(num, xiang_name)
    # BUG FIX: the original did `i += 1` and then printed `name[i] + ...`,
    # which raises IndexError on the final album; guard the lookahead print.
    if i + 1 < len(name):
        print(name[i + 1] + '')  # TODO: message literal lost in scraping

百度貼吧爬蟲

encoding utf 8 import urllib.request import urllib.parse import time import random def load page url 通過url來獲取網頁內容jfa param url 待獲取的頁面 return url對應的網頁內...

3 百度貼吧爬蟲

被寫檔案坑了一晚上,因為自己寫了writefile 但是呼叫的是writefile 剛好python裡面有writefile 所以剛好不報錯!coding utf 8 created on 2018 7月12號 author sss 型別 get請求 from pip.vendor.distlib....

百度貼吧爬蟲練習

在互動平台列印貼吧內的的鏈結位址 1 coding utf 823 importre4 import urllib 導入庫56 defgethtml url 7 page urllib.urlopen url 開啟鏈結的頁面 8 html page.read 讀取鏈結的原始碼 正則 13 imgre...