1. Proxy IP quality
Harvested proxy IPs can go stale, so check proxy quality more than once and record the URLs that fail so they can be re-requested.
Check at fetch time:

    # collect the usable proxy IPs
    print(self.urlproxylist)
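The fetch-time check survives only as a single print in the original post. A minimal sketch of what such a check might look like, assuming the scraped proxies arrive as 'ip:port' strings (build_proxy_list, candidates, and check_url are hypothetical names; the result is what self.urlproxylist would hold):

    import requests

    def build_proxy_list(candidates, check_url):
        # keep only proxies that answer a cheap check URL within 2 seconds
        usable = []
        for addr in candidates:
            proxy = {'http': 'http://' + addr, 'https': 'http://' + addr}
            try:
                requests.get(check_url, proxies=proxy, timeout=2)
            except requests.RequestException:
                continue  # dead or slow proxy: drop it
            usable.append(proxy)
        return usable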
Check again at use time:

    def getusefulproxy(self):
        proxy = random.choice(self.urlproxylist)
        # the original header value was elided; a generic browser UA is assumed
        header = {'User-Agent': 'Mozilla/5.0'}
        try:
            r = requests.get(self.check_url, headers=header, proxies=proxy, timeout=2)
        except requests.RequestException:
            # this proxy failed the check: recurse and try another one
            return self.getusefulproxy()
        else:
            return proxy
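For reference, each entry in self.urlproxylist is expected to be a proxy mapping that works both for requests' proxies= argument here and for urllib's ProxyHandler used later; a hypothetical entry:

    proxy = {'http': 'http://1.2.3.4:8080', 'https': 'http://1.2.3.4:8080'}
    r = requests.get('http://example.com', proxies=proxy, timeout=2)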
2. Without a User-Agent disguise or proxy IPs, making many requests, or spacing requests too closely, will get your IP spotted and banned by some sites; a sketch of both mitigations follows.
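A minimal sketch of the two mitigations, rotating the User-Agent and spacing requests with a randomized delay (the UA strings and delay bounds are illustrative, not from the original):

    import random
    import time
    import requests

    USER_AGENTS = [
        'Mozilla/5.0 (Windows NT 10.0; Win64; x64)',
        'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7)',
    ]

    def polite_get(url):
        # pick a different browser identity per request and pause 1-3 s
        header = {'User-Agent': random.choice(USER_AGENTS)}
        time.sleep(random.uniform(1.0, 3.0))
        return requests.get(url, headers=header, timeout=5)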
3. When crawling with multiple threads, be careful about several threads operating on the same file; see the lock sketch below.
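One common fix is to serialize writes with a lock. A minimal sketch, assuming several worker threads append lines to one shared results file (file_lock, append_result, and the path are illustrative names, not from the original code):

    import threading

    file_lock = threading.Lock()

    def append_result(path, line):
        # only one thread may write at a time; without the lock,
        # interleaved writes from different threads can corrupt the file
        with file_lock:
            with open(path, 'a') as f:
                f.write(line + '\n')

The downloader below sidesteps the issue a different way: each report gets its own file, since file_name is derived from the report URL.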
4. You can combine proxy and non-proxy requests.
If the data must be complete and proxies will inevitably fail some of the time, catch the exception and fall back to a proxy-free method, and have the thread wait 1-2 s between requests so you don't get flagged and blocked by the site's security team. The thread-pool and fallback methods below implement this.
    from concurrent.futures import ThreadPoolExecutor, wait, ALL_COMPLETED

    def getreportbythreadpool(self):
        print('get reports start...')
        executor = ThreadPoolExecutor(max_workers=4)
        all_task = [executor.submit(self.getreport, reporturl)
                    for reporturl in self.reportlist]
        # wait for every task in the thread pool to finish
        wait(all_task, return_when=ALL_COMPLETED)
        print('Finished fetching the list of annual reports of the listed companies')
"""獲取年報列表中的所有年報並寫入txt
"""def getreport(self,reporturl):
try:
# 設定get請求的user-agent,用於偽裝瀏覽器ua
header =
time.sleep(1.5)
response = request.request(reporturl, headers=header)
# 使用proxyhandler方法生成處理器物件
proxy = random.choice(self.urlproxylist)
proxy_handler = request.proxyhandler(proxy)
#建立**ip的opener例項
opener = request.build_opener(proxy_handler)
req = opener.open(response,timeout=2)
#req = request.urlopen(response,timeout=3)
html = req.read().decode("gb18030")
bf = beautifulsoup(html, 'html.parser')
texts = bf.find_all('div', id='content')
file_name = reporturl[reporturl.index('stockid=')+8:reporturl.index('&id=')]
txt_path = '/users/wecash/desktop/report/'+file_name+'.txt'
txt_before = '
' txt_after = '
' with open(txt_path, 'w') as f:
f.write(txt_before)
f.write(str(texts[0]))
f.write(txt_after)
print('使用**獲取年報txt成功')
text2 = bf.find_all('a')
for i in text2:
if i.string == '公告原文':
self.pdfurl = i.get('href')
download_dir = '/users/wecash/desktop/report/'+file_name+'.pdf'
r = requests.get(self.pdfurl, stream=true)
with open(download_dir, 'wb') as f:
for chunk in r.iter_content(chunk_size=1024):
if chunk:
f.write(chunk)
f.flush()
print('使用**獲取年報pdf成功')
return
except baseexception as e:
print('使用**獲取年報失敗:'+str(e))
self.getreportwithoutproxy(reporturl)
else:
print('獲取年報成功')
    def getreportwithoutproxy(self, reporturl):
        try:
            # the original header value was elided; a generic UA is assumed
            header = {'User-Agent': 'Mozilla/5.0'}
            req = request.Request(reporturl, headers=header)
            time.sleep(1.5)
            resp = request.urlopen(req, timeout=3)
            html = resp.read().decode('gb18030')
            bf = BeautifulSoup(html, 'html.parser')
            texts = bf.find_all('div', id='content')
            file_name = reporturl[reporturl.index('stockid=') + 8:reporturl.index('&id=')]
            txt_path = '/users/wecash/desktop/report/' + file_name + '.txt'
            # txt_before / txt_after were truncated in the original post
            txt_before = ''
            txt_after = ''
            with open(txt_path, 'w') as f:
                f.write(txt_before)
                f.write(str(texts[0]))
                f.write(txt_after)
            print('Fetched annual report txt without a proxy')
            text2 = bf.find_all('a')
            for i in text2:
                if i.string == '公告原文':
                    self.pdfurl = i.get('href')
            download_dir = '/users/wecash/desktop/report/' + file_name + '.pdf'
            r = requests.get(self.pdfurl, stream=True)
            with open(download_dir, 'wb') as f:
                for chunk in r.iter_content(chunk_size=1024):
                    if chunk:
                        f.write(chunk)
                        f.flush()
            print('Fetched annual report pdf without a proxy')
        except BaseException as e:
            print('Failed to fetch the report: ' + str(e))
        else:
            print('Report fetched successfully')