import requests
# bs4 exports `BeautifulSoup`; the original `from bs4 import beautifulsoup`
# raises ImportError. Alias it so every existing call site keeps working.
from bs4 import BeautifulSoup as beautifulsoup
import os
import time
import random
import useragent  # project-local module with a pool of User-Agent strings

interval = 3  # politeness delay (seconds) between two image downloads
firstdir = 'd:/netbian'  # root directory for all downloaded wallpapers
# NOTE(review): the site root URL was censored out of the scraped source;
# `index` is used throughout as `index + href` — confirm the real host.
index = 'http://www.netbian.com'
classificationdict = {}  # category name -> {'url': ..., 'path': ...}
# Fetch a page and return the elements matching a CSS selector.
def screen(url, select):
    """GET `url` with a random User-Agent and return ``soup.select(select)``.

    The site serves GBK-encoded pages, so the response encoding is forced
    before parsing with the lxml backend.
    """
    # NOTE(review): the original `headers =` line was truncated in extraction;
    # it presumably drew a random User-Agent from the project-local
    # `useragent` module — confirm the attribute name. Falls back to a
    # static UA so the crawler still runs if the attribute is absent.
    ua_pool = getattr(useragent, 'agents', None) or [
        'Mozilla/5.0 (Windows NT 10.0; Win64; x64)',
    ]
    headers = {'User-Agent': random.choice(ua_pool)}
    response = requests.get(url=url, headers=headers, timeout=30)
    response.encoding = 'gbk'  # site pages are GBK, not UTF-8
    soup = beautifulsoup(response.text, 'lxml')
    return soup.select(select)
# NOTE(review): extraction mangled this region. The surviving body
# (filename de-duplication + chunked write) is the tail of the `download`
# helper that `handleimgs` calls as download(src, name, path); its header
# and the response fetch are reconstructed here — confirm against the
# original article.
def download(src, name, path):
    """Download image `src` into directory `path` as ``<name>.jpg``.

    If the target filename already exists, a random digit suffix is
    appended until a free name is found (original behavior preserved).
    """
    response = requests.get(src, timeout=30)
    path = path + '/' + name + '.jpg'
    while (os.path.exists(path)):  # avoid clobbering an existing file
        path = path.split(".")[0] + str(random.randint(2, 17)) + '.' + path.split(".")[1]
    with open(path, 'wb') as pic:
        # stream to disk in small chunks instead of buffering the whole image
        for chunk in response.iter_content(128):
            pic.write(chunk)


# Get the last page number of a category listing.
# NOTE(review): `select_classification` calls screenpage(url, select), but
# that helper was lost in extraction; this minimal reconstruction returns
# the text node that follows the matched "..." span — confirm.
def screenpage(url, select):
    """Return the text of the node following the first match of `select`."""
    return screen(url, select)[0].next_sibling.text
# Resolve each list-page link down to the 1920x1080 image and download it.
def handleimgs(links, path):
    """Follow every thumbnail anchor in `links`, locate its full-size
    image through two page hops, and save it into directory `path`,
    sleeping `interval` seconds between downloads.
    """
    for link in links:
        href = link.get('href')
        if not href:  # skip ads / empty anchors (original: href == '')
            continue
        # First hop: the wallpaper detail page.
        # NOTE(review): the condition of this branch was lost in extraction;
        # absolute URLs are used as-is, relative ones are joined onto the
        # site root — confirm against the original article.
        if href.startswith('http'):
            url = href
        else:
            url = index + href
        # Anchor pointing at the 1920x1080 resolution download page.
        select = 'div#main div.endpage div.pic div.pic-down a'
        link = screen(url, select)
        if not link:  # original compared against an empty list
            print(url + ' 無此,爬取失敗')
            continue
        href = link[0].get('href')
        # Second hop: the page that embeds the actual image.
        url = index + href
        select = 'div#main table a img'
        link = screen(url, select)
        if not link:
            print(url + " 該需要登入才能爬取,爬取失敗")
            continue
        # Strip characters that are illegal in Windows filenames
        # (one translate pass instead of nine chained .replace calls).
        name = link[0].get('alt').translate(
            {ord(ch): None for ch in '\t|:\\/*?"<>'})
        src = link[0].get('src')
        if requests.get(src).status_code == 404:
            # NOTE(review): the arguments of these two prints were lost in
            # extraction; the messages below are reconstructed.
            print(src + ' 404,爬取失敗')
            continue
        print('正在爬取:' + name)
        download(src, name, path)
        time.sleep(interval)
# Crawl every list page of one category and download its wallpapers.
def select_classification(choice):
    """Walk all pages of the category `choice` and hand each page's
    thumbnail links to handleimgs().
    """
    banner = '---------------------------'
    print(banner)
    print('--------------' + choice + '-------------')
    print(banner)
    entry = classificationdict[choice]
    secondurl = entry['url']
    seconddir = entry['path']
    if not os.path.exists(seconddir):
        os.mkdir(seconddir)  # per-category output directory
    # The span.slh ("...") element precedes the last page number.
    lastpagenum = int(screenpage(secondurl, '#main > div.page > span.slh'))
    for page in range(1, lastpagenum + 1):
        # Page 1 is index.htm; later pages are index_<n>.htm.
        url = secondurl if page == 1 else secondurl + 'index_%d.htm' % page
        print('--------------' + choice + ': ' + str(page) + '-------------')
        links = screen(url, 'div#main div.list ul li a')
        handleimgs(links, seconddir)
# Simple interactive menu: list the categories and crawl the chosen one.
def ui():
    """Prompt for a category name; '全部' crawls every category, an
    unknown name prints an error and re-prompts recursively.
    """
    print('--------------netbian-------------')
    print('全部', end=' ')
    for name in classificationdict:
        print(name, end=' ')
    print()
    choice = input('請輸入分類名:')
    if choice == '全部':
        for name in classificationdict:
            select_classification(name)
        return
    if choice in classificationdict:
        select_classification(choice)
        return
    print("輸入錯誤,請重新輸入!")
    print('----')
    ui()  # retry until a valid category is entered
# Discover the category sub-pages and record them in classificationdict.
def init_classification():
    """Scrape the site header menu and fill `classificationdict` with
    ``{category name: {'url': absolute url, 'path': local directory}}``.
    """
    # #header       -> element with id="header"
    # div.head      -> <div class="head">
    # li:nth-child(1) -> first child category under its parent
    select = '#header > div.head > ul > li:nth-child(1) > div > a'
    classifications = screen(index, select)
    for c in classifications:
        href = c.get('href')  # relative address of the category page
        text = c.string       # category display name
        if text == '4k桌布':  # permission-restricted category, skip it
            continue
        seconddir = firstdir + '/' + text  # per-category directory
        url = index + href                 # absolute category URL
        # NOTE(review): the dict literal was lost in extraction; the keys
        # 'url' and 'path' are reconstructed from the reads in
        # select_classification — confirm against the original article.
        classificationdict[text] = {'url': url, 'path': seconddir}
# Program entry point.
def main():
    """Create the root directory if needed, discover the categories,
    then start the interactive menu.
    """
    root_missing = not os.path.exists(firstdir)
    if root_missing:
        os.mkdir(firstdir)  # root download directory
    init_classification()
    ui()


if __name__ == '__main__':
    main()
python爬取彼岸桌面桌布
1.目標站點分析 進入 經過f12分析,url都儲存在 2.選擇爬取工具,這裡網頁比較簡單,就採用requests庫和正則.import requests import osimport reimport time 主頁 main urls headers ifnot os.path.exists ...
Python 爬取高畫質桌面桌布
今天寫了一個指令碼用來爬取zol桌面桌布 的高畫質 如下 coding utf 8 import urllib import re import time class spider baseurl pic index 0 itemgrouppic def init self,page count t...
框架 MFC 修改桌面 桌布
功能:使用 iactivedesktop 介面獲取、設定和重新整理桌面背景(桌布)。開發環境 vc vs2005 vs2008 vs2010 vs2012 vs2013 新建專案 mfc應用程式 基於對話方塊 #include <shlobj.h> #include <shlwapi.h> 包含了對檔案判別的a...