# -*- coding:utf-8 -*-
import re
import time
from urllib import request

from tools import tools,dbmanager
class qsbkspider(object):
    """Crawl a paginated listing, store every parsed record, follow "next" links.

    The spider downloads ``self.url`` into ``self.html``, extracts records with
    ``ITEM_PATTERN``, hands each record to ``dbmanager.insert_data`` and then
    follows the link that precedes the ``class="next"`` marker until no such
    marker is present or a page cannot be downloaded.

    Args:
        url: Address of the first page to download.
        headers: Optional mapping of HTTP request headers.
        base_url: Prefix prepended to the (relative) href of each "next" link.
        max_attempts: Total download attempts per page before giving up.
        retry_delay: Seconds to wait between two attempts.

    NOTE(review): the start URL, the header values and the link prefix are
    blank in the source as received, so they default to empty values here and
    must be supplied by the caller for a real crawl.
    """

    # NOTE(review): this expression is reproduced as received; the HTML tags it
    # originally contained appear to have been stripped, so it has to be
    # restored against the real page markup before it extracts useful fields.
    ITEM_PATTERN = re.compile(
        '.*?"articlegender womenicon">(.*?).*?(.*?).*?(.*?).*?(.*?).*?',
        re.S,
    )
    # Captures the target of an anchor tag.
    NEXT_HREF_PATTERN = re.compile('href="(.*?)"')
    # Text that marks the "next page" element in the listing.
    NEXT_MARKER = 'class="next"'
    # How many characters before the marker are searched for the href.
    NEXT_WINDOW = 90

    def __init__(self, url='', headers=None, base_url='',
                 max_attempts=5, retry_delay=1):
        self.url = url
        # Copy so that later changes never leak back into the caller's dict.
        self.headers = dict(headers) if headers else {}
        self.base_url = base_url
        self.max_attempts = max_attempts
        self.retry_delay = retry_delay
        self.html = ''

    def get_html(self):
        """Download ``self.url`` into ``self.html``, retrying on failure.

        Returns:
            True when the page was downloaded and decoded; False when every
            attempt failed, in which case ``self.html`` is left untouched.
        """
        for attempt in range(1, self.max_attempts + 1):
            try:
                req = request.Request(url=self.url, headers=self.headers)
                # The context manager closes the connection in every case.
                with request.urlopen(req) as response:
                    self.html = response.read().decode('utf-8')
            except Exception as e:
                # Deliberately broad: any failure of one attempt (bad URL,
                # network error, undecodable body) only triggers a retry.
                if attempt >= self.max_attempts:
                    # Message: "failed to fetch the current page".
                    print('當前頁資料獲取失敗!')
                    return False
                # Message: "<error>, fetch failed, trying connection number N".
                print('%s,獲取資料錯誤,正在嘗試第%s次連線...' % (e, attempt + 1))
                time.sleep(self.retry_delay)
            else:
                return True
        return False

    def parse_data(self):
        """Store the records of the current page and of every following page.

        Pages are walked in a loop rather than by recursion, so the number of
        pages is not limited by the interpreter's recursion depth. The walk
        ends when no "next" link is found or when a page cannot be downloaded
        (which avoids parsing the previous page again).
        """
        while True:
            self._store_items(self.html)
            href = self._next_href(self.html)
            if href is None:
                # NOTE(review): the end-of-crawl message text is missing in
                # the source as received, so an empty line is printed.
                print('')
                return
            # The page number is the second-to-last path segment; fall back to
            # the whole href when the link has no such segment.
            parts = href.split('/')
            page = parts[-2] if len(parts) >= 2 else href
            # Message: "crawling page N".
            print('正在爬取第{}頁'.format(page))
            self.url = self.base_url + href
            if not self.get_html():
                return

    def start(self):
        """Download the first page and, if that succeeds, crawl from there."""
        if self.get_html():
            self.parse_data()

    def _store_items(self, html):
        """Insert every record matched by ``ITEM_PATTERN`` in *html*."""
        for rs in self.ITEM_PATTERN.findall(html):
            data = list(rs)
            # Fields 1 and 3 are cleaned with the project's strip helper.
            data[1] = tools.strip_char(rs[1])
            data[3] = tools.strip_char(rs[3])
            dbmanager.insert_data(data)

    @classmethod
    def _next_href(cls, html):
        """Return the href found just before the "next" marker, or None."""
        index = html.find(cls.NEXT_MARKER)
        if index == -1:
            return None
        # Clamp the start: a negative start would be counted from the end of
        # the string and yield the wrong (usually empty) window.
        window = html[max(0, index - cls.NEXT_WINDOW):index]
        match = cls.NEXT_HREF_PATTERN.search(window)
        return match.group(1) if match else None
if __name__ == '__main__':
    # Open the database connection before crawling.
    dbmanager.connect_db()
    try:
        # Build the spider and run the crawl.
        qsbk = qsbkspider()
        qsbk.start()
    finally:
        # Close the database even when the crawl raises.
        dbmanager.close_db()
python 爬取糗事百科
step 1 構建一個提取糗事百科笑話的函式 import urllib2 import urllib import re import thread import time import sys reload sys sys.setdefaultencoding utf 8 def getpage p...
Python爬取糗事百科
一 引入模組 因為urlopen功能比較簡單,所以設定 ip需引入proxyhandler和build opener模組,ip的獲取可以上西祠 查詢 import re from urllib.request import request,build opener,proxyhandler base...
Python 爬取糗事百科
coding utf 8 import urllib2 import urllib import re class qiushi def init self self.page 1 從網頁獲取糗事 def getqiushis self,page url page 偽裝瀏覽器 user agent ...