Simple Scraping of Qiushibaike

import requests
from lxml import etree


class qiubaispider(object):
    def __init__(self):
        # URL template of the pages to crawl (the actual address was elided in the original)
        self.url_temp = ''
        # request headers (elided in the original; a User-Agent header normally goes here)
        self.headers = {}

    def get_url_list(self, page_num):
        # build the list-page URLs for pages 1 .. page_num - 1
        return [self.url_temp.format(i) for i in range(1, page_num)]

    def paser_one_url(self, url1):
        # send the request and return the decoded response body
        response = requests.get(url1, headers=self.headers)
        return response.content.decode()

    def paser_two_url(self, url2):
        response = requests.get(url2, headers=self.headers)
        return response.content

    def get_page_url(self, html_str):
        num = 1
        html = etree.HTML(html_str)
        self.div_list = html.xpath('//div[@id="content-left"]/div/a[1]/@href')
        # visit every detail page linked from the list page
        for i in self.div_list:
            # record file of already-saved URLs; the filename was elided in the
            # original, so 'visited_urls.txt' is a placeholder. The guard below
            # covers the first run, before the file exists.
            try:
                with open('visited_urls.txt', 'r') as f:
                    a = f.readlines()
            except FileNotFoundError:
                a = []
            if i + '\n' not in a:
                # join the full page address (the site's base URL was elided in the original)
                url2 = '' + i
                url2_html = self.paser_two_url(url2)
                html2 = etree.HTML(url2_html)
                data_list = html2.xpath('//div[@id="single-next-link"]/div[@class="content"]/text()')
                data = ''.join(data_list)
                print('-' * 1000)
                print('%s.' % num)
                print(data)
                # save the data
                with open('糗事百科.txt', 'a', encoding='utf-8') as f:
                    f.write(str(num) + '.')
                    f.write(data)
                    f.write('\n\n\n')
                num += 1
                print('儲存成功')
                print('-' * 1000)
                # remember this URL so it is not saved again
                with open('visited_urls.txt', 'a', encoding='utf-8') as f:
                    f.write(i)
                    f.write('\n')
            else:
                print()  # the skip message was elided in the original

    def run(self):
        # 1. build url_list for every page to crawl
        page_num = int(input('請輸入要爬取的頁數:'))
        url_list = self.get_url_list(page_num + 1)
        print(url_list)
        # 2. iterate over the pages, send requests, get responses
        num = 1
        for url1 in url_list:
            print('第%s頁' % num)
            html_str = self.paser_one_url(url1)
            get_data = self.get_page_url(html_str)
            # the original listing is cut off here; a plausible completion
            # appends a page separator to the output file and moves on
            with open('糗事百科.txt', 'a', encoding='utf-8') as f:
                f.write('第%s頁\n' % num)
            num += 1
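The post never shows how the spider is launched; a minimal entry point, assuming the class above, would be:

if __name__ == '__main__':
    spider = qiubaispider()
    spider.run()

Note that self.url_temp, self.headers, and the base URL joined in get_page_url were elided from the original listing and must be filled in before the spider will actually fetch anything. The record-file check in get_page_url is a simple deduplication scheme: each saved joke's URL is appended to a plain-text file, and any URL already present there is skipped on later runs.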
I have only just started out, and it will take a lot more tinkering before I am any good at web scraping. Even though there are many features I have not mastered yet, just getting the data crawled makes me happy, and I will keep working to improve. I have added comments for the mistakes I ran into; I am still learning, so let's improve together. Along the way I learned a new function that is worth sharing: strip(). As it is commonly explained online, the important point is that the argument is treated as an array of characters, and every matching character is removed from both ends of the string.
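A minimal sketch of that behaviour (the sample strings are mine, not from the original post):

# strip() treats its argument as a set of characters, not a substring,
# and removes every matching character from both ends of the string
print('www.example.com'.strip('cmow.'))  # -> 'example'
print('  hello  '.strip())               # no argument strips whitespace -> 'hello'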