1. Save the news body content to a text file (a minimal sketch follows this list).
2. Structure the news data as a list of dictionaries.
3. Install pandas and use pandas.DataFrame(newstotal) to create a DataFrame object df.
4. Save the extracted data to a CSV or Excel file via df.
5. Use the functions and methods provided by pandas for data analysis:
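Step 1 is not implemented in the main script below, so here is a minimal sketch, assuming the same page layout as the crawler (the article body lives in a #content element); the save_news_content helper name, its url argument, and the news.txt filename are placeholders for illustration:

import requests
from bs4 import BeautifulSoup

def save_news_content(url, filename='news.txt'):
    # fetch one detail page and write its body text to a file (hypothetical helper)
    res = requests.get(url)
    res.encoding = 'utf-8'
    soup = BeautifulSoup(res.text, 'html.parser')
    content = soup.select('#content')[0].text
    with open(filename, 'w', encoding='utf-8') as f:
        f.write(content)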
import requests
from bs4 import BeautifulSoup
from datetime import datetime
import re
import pandas

news_list = []  # accumulates one dict per news item
def crawlonepageschoolnews(page_url):
    res0 = requests.get(page_url)
    res0.encoding = 'utf-8'
    soup0 = BeautifulSoup(res0.text, 'html.parser')
    news = soup0.select('.news-list > li')
    for n in news:
        print('**' * 5 + 'list page info' + '**' * 10)
        print('news description: ' + n.a.select('.news-list-description')[0].text)
        news_item = getnewdetail(n.a.attrs['href'])
        news_item['標題'] = n.select('.news-list-title')[0].text  # news title
        news_list.append(news_item)  # collect every structured item into the module-level list
    return news_list
def getnewdetail(href):
    print('**' * 5 + 'detail page info' + '**' * 10)
    print(href)
    res1 = requests.get(href)
    res1.encoding = 'utf-8'
    soup1 = BeautifulSoup(res1.text, 'html.parser')
    news = {}
    if soup1.select('#content'):
        news_content = soup1.select('#content')[0].text
        news['內容'] = news_content  # article body
        print(news_content)
    else:
        news['內容'] = ''
    if soup1.select('.show-info'):  # guard against older pages that have no show-info block
        news_info = soup1.select('.show-info')[0].text
    else:
        return news
    # fields to parse: source, publish time, clicks, author, reviewer, photographer
    info_list = ['來源', '發布時間', '點選', '作者', '審核', '攝影']
    # &nbsp; in the page is fetched as \xa0, so it serves as a separator; drop the empty fragments
    news_info_set = set(news_info.split('\xa0')) - {''}
    # loop over the parsed fragments and record each known field
    for n_i in news_info_set:
        for info_flag in info_list:
            if n_i.find(info_flag) != -1:  # match on the field label; the timestamp's own colons are half-width, so a plain split on ':' would not work
                if info_flag == '發布時間':
                    # convert the publish-time string to a datetime, which is easier to store in a database later
                    release_time = datetime.strptime(n_i[n_i.index(':') + 1:], '%Y-%m-%d %H:%M:%S ')
                    news[info_flag] = release_time
                    print(info_flag + ':', release_time)
                elif info_flag == '點選':  # the click count is written by JS after a request to a PHP endpoint keyed on the article id, so it is fetched separately
                    news[info_flag] = getclickcount(href)
                else:
                    news[info_flag] = n_i[n_i.index(':') + 1:]
                    print(info_flag + ':' + n_i[n_i.index(':') + 1:])
    print('————' * 40)
    return news
def getclickcount(news_url):
    click_num_url = ''  # click-count API URL template (omitted in the original post); expects one {} placeholder for the article id
    click_num_url = click_num_url.format(re.search(r'_(.*)/(.*).html', news_url).group(2))
    res2 = requests.get(click_num_url)
    res2.encoding = 'utf-8'
    click_num = re.search(r"\$\('#hits'\).html\('(\d*)'\)", res2.text).group(1)
    print('clicks: ' + click_num)
    return int(click_num)  # return an int so the numeric comparison in the analysis below works
print(crawlonepageschoolnews(''))  # first list page (URL omitted in the original post)
pageurl = '{}.html'  # list-page URL template (prefix omitted in the original post)
res = requests.get('')  # index page (URL omitted in the original post)
res.encoding = 'utf-8'
soup = BeautifulSoup(res.text, 'html.parser')
newssum = int(re.search(r'(\d*)條', soup.select('a.a1')[0].text).group(1))  # total number of news items
# ten items per list page, so round the page count up
if newssum % 10:
    pagesum = int(newssum / 10) + 1
else:
    pagesum = int(newssum / 10)
for i in range(2, pagesum + 1):
    crawlonepageschoolnews(pageurl.format(i))
dit = pandas.DataFrame(news_list)
dit.to_excel('test.xlsx')
dit.to_csv('test.csv')
print(dit[['作者', '來源']][:6])  # author and source of the first six items
print(dit[(dit['來源'] == '學校綜合辦') & (dit['點選'] > 3000)])  # items from the school general office with over 3000 clicks
print(dit[dit['來源'].isin(['國際學院', '學生工作處'])])  # items from the International College or the Student Affairs Office
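To round out step 5, here is a small sketch of further pandas analysis and storage on the same DataFrame; the sort, groupby, and sqlite3 choices are illustrative assumptions, not part of the original assignment:

print(dit.sort_values('點選', ascending=False).head())  # the five most-clicked items
print(dit.groupby('來源')['點選'].mean())  # average clicks per source
# '發布時間' already holds datetimes, so the frame can go straight into a database,
# e.g. with DataFrame.to_sql and sqlite3 (an assumed storage backend):
import sqlite3
with sqlite3.connect('news.sqlite') as db:
    dit.to_sql('news', con=db, if_exists='replace')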