import requests
from lxml import etree
from fake_useragent import UserAgent
import os
from selenium import webdriver
urls = []
name = []

def get_urls(input):
    ua = UserAgent()
    headers = {'User-Agent': ua.random}
    proxy = ''  # your own proxy address goes here
    proxies = {'http': proxy, 'https': proxy}
    url = f''  # list-page URL (elided in the original)
    response = requests.get(url, headers=headers, proxies=proxies).text
    html = response.replace(r'', ' ').replace('#', '"')  # first pattern string elided in the original
    # Note: XPath cannot parse the raw page here, mainly because most of the
    # markup the server returns is wrapped in HTML comments.
    global urls
    global name
    html_useful = etree.HTML(html)
    urls = html_useful.xpath()  # XPath expression elided in the original
    name = html_useful.xpath()  # XPath expression elided in the original
    # print(response)
    global path_fore
    for j in range(len(name)):
        path_fore = os.path.join('chenyixun', name[j])
    print(urls)
    print(name)
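The note inside get_urls about commented-out markup refers to a common anti-scraping trick: the server ships the real listing inside HTML comments, so lxml's XPath sees an empty tree until the comment markers are stripped. The exact replace arguments were lost from the listing above, so the marker strings in this small demo are an assumption, not the author's original code:

# Hedged demo: un-hide markup that the server wrapped in HTML comments.
raw = '<div><!--<a href="/album/1">album 1</a>--></div>'
visible = raw.replace('<!--', '').replace('-->', '')
print(etree.HTML(visible).xpath('//a/@href'))  # ['/album/1']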

def image_urls(num, xiang_name):
    url = f''  # album URL (elided in the original)
    options = webdriver.ChromeOptions()
    # options.add_experimental_option("prefs", ...)  # skip loading images to speed up access
    options.add_experimental_option('excludeSwitches', ['enable-automation'])
    # Developer mode, so the site cannot detect that this is Selenium.
    # options.add_argument('--headless')  # keeps the browser window from popping up
    browser = webdriver.Chrome(executable_path='',  # chromedriver path (elided in the original)
                               options=options)
    browser.get(url)
    html = browser.page_source
    browser.close()
    html_useful = etree.HTML(html)
    image_url = html_useful.xpath(
        '//div[@class="ag_container"]//div[@class="ag_main"]//a[@class="ag_ele_a"]//img/@src'
    )
    ua = UserAgent()
    headers = {'User-Agent': ua.random}
    proxy = ''  # your own proxy address goes here
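    # The post breaks off at this point: the loop that actually downloads the
    # pictures in image_url never made it onto the page. What follows is a
    # hedged sketch of that step, not the author's code; save_dir and the
    # numbered file names are assumptions.
    save_dir = os.path.join('chenyixun', xiang_name)
    os.makedirs(save_dir, exist_ok=True)
    for k, src in enumerate(image_url):
        img = requests.get(src, headers=headers, timeout=10)
        with open(os.path.join(save_dir, f'{k}.jpg'), 'wb') as f:
            f.write(img.content)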

get_urls('陳奕迅')
i = 0
for num in urls:
    xiang_name = name[i]
    print(name[i])  # the original appended a status string here (elided)
    image_urls(num, xiang_name)
    i += 1
    print(name[i])  # as above; note this indexes past the end on the last album
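One portability note: the executable_path keyword used above was removed in Selenium 4. If you run this against a current Selenium, the chromedriver path is passed through a Service object instead. A minimal sketch, where the driver path is a placeholder and not something from the original post:

from selenium.webdriver.chrome.service import Service

options = webdriver.ChromeOptions()
service = Service('/path/to/chromedriver')  # assumption: your local driver path
browser = webdriver.Chrome(service=service, options=options)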