Building the URL is simply a matter of URL-encoding the keyword, and cleaning the scraped results is simply a matter of filtering out stray carriage returns, spaces, and other junk. The complete script is below (a short standalone sketch of these two steps follows the full listing):
import re
from urllib import parse
import time

import requests
from bs4 import BeautifulSoup
import pandas as pd


def html_decode(url):
    """Fetch one result page and return the subjects, authors and publish times."""
    # NOTE: the User-Agent string, the headers dict and the attrs filter below
    # are not shown in the post; fill in your own values before running.
    agent = ''
    headers = {'User-Agent': agent}
    time.sleep(5)                        # be polite: pause before each request
    html = requests.get(url, headers=headers)
    content = html.content
    soup = BeautifulSoup(content, 'html.parser', from_encoding='utf-8')
    content_list = []
    author_list = []
    time_list = []
    for mulu in soup.find_all('div', attrs={}):   # original class filter omitted in the post
        # get the subject and the "author / time" line of each result block
        subject = mulu.h3.a.get_text()
        author_time = mulu.div.p.get_text()
        # drop newlines and spaces
        text1 = re.sub(r"\n| ", '', subject)
        text2 = re.sub(r"\n| ", '', author_time)
        text2 = text2.strip()
        author = text2.split(' ')[0]
        pub_time = text2.split(' ')[1]   # renamed so it does not shadow the time module
        content_list.append(text1)
        author_list.append(author)
        time_list.append(pub_time)
    return content_list, author_list, time_list


def get_context(keyword, page):
    """Crawl `page` result pages for `keyword` and merge the per-page lists."""
    all_content = []
    all_author = []
    all_time = []
    agent = ''                           # original User-Agent string omitted in the post
    headers = {'User-Agent': agent}
    parameter_page = page * 10           # computed but not used below
    if isinstance(keyword, str):
        main_url = r''                   # base search URL, omitted in the post
        parameter = {}                   # query-string dict (keyword, page offset, ...), omitted in the post
        # print(str(url_data))
    else:
        print('url wrong!')
    # get first page
    url_data = parse.urlencode(parameter)
    first_page_parameter = str(url_data).split('&inputt')[0]
    first_page = parse.urljoin(main_url, 's?' + str(first_page_parameter))
    first_item, first_author_list, first_time_list = html_decode(first_page)
    all_content.extend(first_item)
    all_author.extend(first_author_list)
    all_time.extend(first_time_list)
    # get other pages
    for num in range(1, page + 1):
        print('this is page %d!' % num)
        parameter['pn'] = num * 10       # page offset in the query string
        url_data = parse.urlencode(parameter)
        all_url = parse.urljoin(main_url, 's?' + str(url_data))
        # print(all_url)
        other_items, other_author_list, other_time_list = html_decode(all_url)
        all_content.extend(other_items)
        all_author.extend(other_author_list)
        all_time.extend(other_time_list)
    return all_content, all_author, all_time


if __name__ == '__main__':
    date = time.strftime('%m-%d', time.localtime())
    key_word = '華山'
    page_num = 5
    all_content, all_author, all_time = get_context(key_word, page_num)
    # the original column layout was omitted in the post; save the three lists
    result = pd.DataFrame({'content': all_content, 'author': all_author, 'time': all_time})
    result.to_excel('result_%s.xlsx' % date)
    print('總共%d條資料!' % len(all_content))   # "%d records in total!"
    print('爬取完畢!')                          # "crawl finished!"
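As a standalone illustration of the two steps described at the top, URL-encoding the keyword and filtering out newlines and spaces, here is a minimal sketch. The https://www.baidu.com/ base URL and the wd/pn parameter names are assumptions made for illustration only; the script above does not show its real query parameters.

import re
from urllib import parse

keyword = '華山'
params = {'wd': keyword, 'pn': 0}            # 'wd' and 'pn' are assumed parameter names
query = parse.urlencode(params)              # -> 'wd=%E8%8F%AF%E5%B1%B1&pn=0'
url = parse.urljoin('https://www.baidu.com/', 's?' + query)   # assumed base URL
print(url)

raw = ' 某作者\n 2019-01-01 '
clean = re.sub(r"\n| ", '', raw)             # drop newlines and spaces, as in html_decode()
print(clean)                                 # -> '某作者2019-01-01'

urlencode simply percent-encodes the UTF-8 bytes of the keyword, which is all the "get the URL" step amounts to; the re.sub call is the same pattern the full script uses to clean each scraped field.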
Referenced blog posts:
網頁爬蟲 爬取百度諮詢新聞 (web crawler: scraping Baidu news search results)
爬取百度(有道)翻譯 (scraping Baidu/Youdao translate)
爬取百度貼吧 (scraping Baidu Tieba)