將爬取到的網頁寫入檔案中
import urllib.request
爬取勵志語錄
import urllib.request
import urllib.parse
import re
import os
import time
# NOTE: dead code — the pre-refactor inline version of the crawler, kept as an
# unused string literal for reference only. It is superseded by the functions
# defined below (handle_request / parse_content / writein / main).
'''start_page = int(input("請輸入開始頁面:"))
end_page = int(input("請輸入結束頁面:"))
for page in range(start_page ,end_page+1):
url = ''+str(page)+'.html'
header =
request = urllib.request.request(url=url,headers= header)
response = urllib.request.urlopen(request)
content = response.read().decode('utf8')
with open('f1.html', 'w', encoding='utf8') as fp:
fp.write(content)
pattern = re.compile(r'(.*?).*?',re.s)
ret = pattern.findall(content)
#print(ret)
for text_info in ret:
text_title = text_info[0]
text_main = text_info[1]
dirname = 'lizhi'
if not os.path.exists(dirname):
os.mkdir(dirname)
filename = str(text_title).split('——')[0]
filepath = os.path.join(dirname, filename)
with open(filepath+ '.txt', 'w',encoding = 'utf8') as fp:
fp.write(text_main)
'''# Encapsulate the above process into functions
def handle_request(url, page):
    """Build a urllib Request for one listing page.

    Args:
        url: base URL prefix; the page number and '.html' are appended to it.
        page: page number (int) to fetch.

    Returns:
        urllib.request.Request carrying a browser-like User-Agent header.
    """
    url += str(page) + '.html'
    # NOTE(review): the original header dict was stripped during extraction —
    # a User-Agent is the conventional content; confirm against the real site.
    header = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) '
                      'AppleWebKit/537.36 (KHTML, like Gecko) '
                      'Chrome/80.0.3987.132 Safari/537.36',
    }
    # Bug fix: urllib.request.request -> urllib.request.Request (class name
    # is capitalized; the lowercase form raises AttributeError).
    request = urllib.request.Request(url=url, headers=header)
    return request
def parse_content(request):
    """Fetch the page for *request*, dump the raw HTML to f1.html for
    debugging, extract (title, body) pairs and pass them to writein().

    Args:
        request: urllib.request.Request built by handle_request().
    """
    response = urllib.request.urlopen(request)
    content = response.read().decode('utf8')
    # Keep a copy of the raw HTML so the regex can be checked against the
    # real markup of the target site.
    with open('f1.html', 'w', encoding='utf8') as fp:
        fp.write(content)
    # Bug fix: re.s -> re.S (the DOTALL flag; lowercase re.s does not exist).
    # NOTE(review): the HTML tags inside the original pattern were stripped
    # when this snippet was scraped. writein() consumes 2-tuples
    # (title, body), so the pattern must contain exactly two capture
    # groups — restore the real tag structure from f1.html before use.
    pattern = re.compile(r'(.*?)(.*?)', re.S)
    ret = pattern.findall(content)
    writein(ret)
def writein(ret):
    """Write each (title, body) pair in *ret* to lizhi/<title>.txt.

    Args:
        ret: iterable of 2-tuples (text_title, text_main) as produced by
             parse_content()'s findall.
    """
    dirname = 'lizhi'
    # Hoisted out of the loop; makedirs(exist_ok=True) also removes the
    # race window between os.path.exists() and os.mkdir().
    os.makedirs(dirname, exist_ok=True)
    for text_title, text_main in ret:
        filename = str(text_title)
        filepath = os.path.join(dirname, filename)
        print("%s start writing......" % filename)
        with open(filepath + '.txt', 'w', encoding='utf8') as fp:
            fp.write(text_main)
        print("%s write successfully" % filename)
def main():
    """Prompt for a page range and download/parse each page in turn."""
    # NOTE(review): the base URL was stripped to '' during extraction —
    # fill in the listing URL prefix of the target site before running.
    url = ''
    start_page = int(input("請輸入開始頁面:"))
    end_page = int(input("請輸入結束頁面:"))
    for page in range(start_page, end_page + 1):
        print("start downloading %s頁......" % page)
        request = handle_request(url, page)
        parse_content(request)
        print("第%s頁 end download" % page)
        # Throttle requests so the crawler does not hammer the server.
        time.sleep(1)
# Run the crawler only when executed as a script, not when imported.
if __name__ == '__main__':
    main()
Python 爬蟲爬取網頁
工具 python 2.7 import urllib import urllib2 defgetpage url 爬去網頁的方法 request urllib.request url 訪問網頁 reponse urllib2.urlopen request 返回網頁 return response...
python爬蟲爬取策略
在爬蟲系統中,待抓取url佇列是很重要的一部分。待抓取url佇列中的url以什麼樣的順序排列也是一個很重要的問題,因為這涉及到先抓取哪個頁面,後抓取哪個頁面。而決定這些url排列順序的方法,叫做抓取策略。下面重點介紹幾種常見的抓取策略 一 深度優先遍歷策略 深度優先遍歷策略是指網路爬蟲會從起始頁開始...
python爬蟲 seebug爬取
1.找相關的標籤一步一步往下查詢 2.有cookie才能查詢 3.用import re而不用from re import 是為了防止衝突 coding utf 8 from requests import import re from bs4 import beautifulsoup as bs h...