1. Proxy IP quality
Harvested proxy IPs can go stale, so check proxy quality more than once and record the URLs that fail so they can be re-requested.
Check at fetch time:

    # collect the usable proxy IPs
    print(self.urlproxylist)
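The fetch-time check survives only as a single print in the original post. A minimal sketch of what such a check might look like, assuming the scraped proxies arrive as 'ip:port' strings (build_proxy_list, candidates, and check_url are hypothetical names; the result is what self.urlproxylist would hold):

    import requests

    def build_proxy_list(candidates, check_url):
        # keep only proxies that answer a cheap check URL within 2 seconds
        usable = []
        for addr in candidates:
            proxy = {'http': 'http://' + addr, 'https': 'http://' + addr}
            try:
                requests.get(check_url, proxies=proxy, timeout=2)
            except requests.RequestException:
                continue  # dead or slow proxy: drop it
            usable.append(proxy)
        return usable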
Check again at use time:

    def getusefulproxy(self):
        proxy = random.choice(self.urlproxylist)
        # the original header value was elided; a generic browser UA is assumed
        header = {'User-Agent': 'Mozilla/5.0'}
        try:
            r = requests.get(self.check_url, headers=header, proxies=proxy, timeout=2)
        except requests.RequestException:
            # this proxy failed the check: recurse and try another one
            return self.getusefulproxy()
        else:
            return proxy
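For reference, each entry in self.urlproxylist is expected to be a proxy mapping that works both for requests' proxies= argument here and for urllib's ProxyHandler used later; a hypothetical entry:

    proxy = {'http': 'http://1.2.3.4:8080', 'https': 'http://1.2.3.4:8080'}
    r = requests.get('http://example.com', proxies=proxy, timeout=2)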
2. Without a User-Agent disguise or proxy IPs, making many requests, or spacing requests too closely, will get your IP spotted and banned by some sites; a sketch of both mitigations follows.
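A minimal sketch of the two mitigations, rotating the User-Agent and spacing requests with a randomized delay (the UA strings and delay bounds are illustrative, not from the original):

    import random
    import time
    import requests

    USER_AGENTS = [
        'Mozilla/5.0 (Windows NT 10.0; Win64; x64)',
        'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7)',
    ]

    def polite_get(url):
        # pick a different browser identity per request and pause 1-3 s
        header = {'User-Agent': random.choice(USER_AGENTS)}
        time.sleep(random.uniform(1.0, 3.0))
        return requests.get(url, headers=header, timeout=5)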
3. When crawling with multiple threads, be careful about several threads operating on the same file; see the lock sketch below.
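One common fix is to serialize writes with a lock. A minimal sketch, assuming several worker threads append lines to one shared results file (file_lock, append_result, and the path are illustrative names, not from the original code):

    import threading

    file_lock = threading.Lock()

    def append_result(path, line):
        # only one thread may write at a time; without the lock,
        # interleaved writes from different threads can corrupt the file
        with file_lock:
            with open(path, 'a') as f:
                f.write(line + '\n')

The downloader below sidesteps the issue a different way: each report gets its own file, since file_name is derived from the report URL.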
4. You can combine proxy and non-proxy requests.
If the data must be complete and proxies will inevitably fail some of the time, catch the exception and fall back to a proxy-free method, and have the thread wait 1-2 s between requests so you don't get flagged and blocked by the site's security team. The thread-pool and fallback methods below implement this.
    from concurrent.futures import ThreadPoolExecutor, wait, ALL_COMPLETED

    def getreportbythreadpool(self):
        print('get reports start...')
        executor = ThreadPoolExecutor(max_workers=4)
        all_task = [executor.submit(self.getreport, reporturl)
                    for reporturl in self.reportlist]
        # wait for every task in the thread pool to finish
        wait(all_task, return_when=ALL_COMPLETED)
        print('Finished fetching the list of annual reports of the listed companies')
"""獲取年報列表中的所有年報並寫入txt
"""def getreport(self,reporturl):
try:
# 設定get請求的user-agent,用於偽裝瀏覽器ua
header =
time.sleep(1.5)
response = request.request(reporturl, headers=header)
# 使用proxyhandler方法生成處理器物件
proxy = random.choice(self.urlproxylist)
proxy_handler = request.proxyhandler(proxy)
#建立**ip的opener例項
opener = request.build_opener(proxy_handler)
req = opener.open(response,timeout=2)
#req = request.urlopen(response,timeout=3)
html = req.read().decode("gb18030")
bf = beautifulsoup(html, 'html.parser')
texts = bf.find_all('div', id='content')
file_name = reporturl[reporturl.index('stockid=')+8:reporturl.index('&id=')]
txt_path = '/users/wecash/desktop/report/'+file_name+'.txt'
txt_before = '
' txt_after = '
' with open(txt_path, 'w') as f:
f.write(txt_before)
f.write(str(texts[0]))
f.write(txt_after)
print('使用**獲取年報txt成功')
text2 = bf.find_all('a')
for i in text2:
if i.string == '公告原文':
self.pdfurl = i.get('href')
download_dir = '/users/wecash/desktop/report/'+file_name+'.pdf'
r = requests.get(self.pdfurl, stream=true)
with open(download_dir, 'wb') as f:
for chunk in r.iter_content(chunk_size=1024):
if chunk:
f.write(chunk)
f.flush()
print('使用**獲取年報pdf成功')
return
except baseexception as e:
print('使用**獲取年報失敗:'+str(e))
self.getreportwithoutproxy(reporturl)
else:
print('獲取年報成功')
    def getreportwithoutproxy(self, reporturl):
        try:
            # the original header value was elided; a generic UA is assumed
            header = {'User-Agent': 'Mozilla/5.0'}
            req = request.Request(reporturl, headers=header)
            time.sleep(1.5)
            resp = request.urlopen(req, timeout=3)
            html = resp.read().decode('gb18030')
            bf = BeautifulSoup(html, 'html.parser')
            texts = bf.find_all('div', id='content')
            file_name = reporturl[reporturl.index('stockid=') + 8:reporturl.index('&id=')]
            txt_path = '/users/wecash/desktop/report/' + file_name + '.txt'
            # txt_before / txt_after were truncated in the original post
            txt_before = ''
            txt_after = ''
            with open(txt_path, 'w') as f:
                f.write(txt_before)
                f.write(str(texts[0]))
                f.write(txt_after)
            print('Fetched annual report txt without a proxy')
            text2 = bf.find_all('a')
            for i in text2:
                if i.string == '公告原文':
                    self.pdfurl = i.get('href')
            download_dir = '/users/wecash/desktop/report/' + file_name + '.pdf'
            r = requests.get(self.pdfurl, stream=True)
            with open(download_dir, 'wb') as f:
                for chunk in r.iter_content(chunk_size=1024):
                    if chunk:
                        f.write(chunk)
                        f.flush()
            print('Fetched annual report pdf without a proxy')
        except BaseException as e:
            print('Failed to fetch the report: ' + str(e))
        else:
            print('Report fetched successfully')