import re
import sqlite3
from datetime import datetime

import openpyxl
import pandas
import requests
from bs4 import BeautifulSoup
# Target listing page of the campus news site.
# NOTE(review): the URL was lost during extraction — TODO restore it.
url = ""

# Fetch the landing page once; the module-level `soup` is reused by
# getlistpage() below.
res = requests.get(url)
res.encoding = "utf-8"  # the site serves UTF-8 pages
soup = BeautifulSoup(res.text, "html.parser")
def writenewsdetails(contents):
    """Append *contents* to the local dump file gzccnews.txt (UTF-8).

    Called once per article by getnewdetails() to accumulate all article
    bodies in a single text file.
    """
    # `with` guarantees the handle is closed even if write() raises,
    # unlike the original open()/close() pair.
    with open('gzccnews.txt', 'a', encoding='utf-8') as f:
        f.write(contents)
def getclickcount(newurl):
    """Return the click counter (int) for one article.

    Extracts the numeric news id from the article URL, queries the site's
    click-count endpoint, and parses the integer out of the JSONP-style
    reply text.
    """
    # e.g. ".../2018/0404_9183.html" -> id "9183".  Raw string avoids the
    # original's invalid "\_" escape (a SyntaxWarning on modern Python);
    # the pattern matched is unchanged.
    newsid = re.findall(r"_(.*).html", newurl)[0].split("/")[-1]
    # NOTE(review): the endpoint host/path was lost during extraction —
    # TODO restore the full counter URL before "{}&modelid=80".
    res = requests.get(" {}&modelid=80".format(newsid))
    # Reply looks like "....html('NNN');" — peel off the JSONP wrapping.
    return int(res.text.split(".html")[-1].lstrip("('").rsplit("');")[0])
# Fetch one article's details (獲取新聞詳情)
def getnewdetails(newsdetailurl):
    """Fetch one article page and return a dict.

    Keys: 'title', 'source', 'content', 'click'.  Side effect: appends the
    article body to gzccnews.txt via writenewsdetails().
    """
    detail_res = requests.get(newsdetailurl)
    detail_res.encoding = "utf-8"
    detail_soup = BeautifulSoup(detail_res.text, "html.parser")

    news = {}
    news['title'] = detail_soup.select(".show-title")[0].text
    info = detail_soup.select(".show-info")[0].text
    # NOTE(review): the `if` guarding the original orphaned `else:` was lost
    # during extraction.  Reconstructed as "the info line carries a 來源:
    # (source) field" — TODO confirm against the site's actual markup.
    if '來源:' in info:
        news['source'] = info[info.find('來源:'):].split()[0].lstrip('來源:')
    else:
        news['source'] = 'none'
    news['content'] = detail_soup.select("#content")[0].text
    # The original called the misspelled, undefined `writedetailnews`;
    # the helper defined in this file is writenewsdetails().
    writenewsdetails(news['content'])
    news['click'] = getclickcount(newsdetailurl)
    return news
# Total number of listing pages (獲取總頁數)
def getpagen(url):
    """Return how many listing pages the site has (10 articles per page)."""
    res = requests.get(url)
    res.encoding = 'utf-8'
    soup = BeautifulSoup(res.text, 'html.parser')
    # ".a1" text looks like "123條" (123 items); integer-divide by the
    # 10-per-page size and add one for the partial last page.
    return int(soup.select(".a1")[0].text.rstrip("條")) // 10 + 1
# Collect every article on one listing page (獲取新聞一頁的所有資訊)
def getlistpage(url):
    """Return a list of article dicts for one listing page.

    NOTE(review): this reads the module-level `soup` (the first page)
    instead of fetching `url`, so every call parses the same page — kept
    as-is to preserve the original structure; TODO fetch `url` here.
    """
    # The original dropped the list literal and never appended — both
    # restored here (append reconstructed; confirm against the original).
    newslist = []
    for news in soup.select("li"):
        # Skip decorative <li> items that carry no headline (排除為空的li).
        if len(news.select(".news-list-title")) > 0:
            detail_url = news.select('a')[0].attrs['href']
            newslist.append(getnewdetails(detail_url))
    return newslist
# Driver: crawl the first page plus the pages in the range below.
newstotal = []  # original dropped the list literal during extraction
totalpagenum = getpagen(url)

# NOTE(review): the first-page URL was lost during extraction — TODO restore.
firstpageurl = ""
newstotal.extend(getlistpage(firstpageurl))

# NOTE(review): range(totalpagenum, totalpagenum + 1) visits only the LAST
# page — presumably a deliberate throttle while testing; widen to
# range(2, totalpagenum + 1) to crawl everything.
for num in range(totalpagenum, totalpagenum + 1):
    listpageurl = "{}.html".format(num)
    # Bug in the original: the page's results were fetched and discarded.
    newstotal.extend(getlistpage(listpageurl))
print(newstotal)
# 3. Build a DataFrame from the scraped records (one row per article).
df = pandas.DataFrame(newstotal)  # original `pandas.dataframe` is not a name
print(df)

# 4. Persist the scraped data to Excel (requires openpyxl).
df.to_excel('gzcss.xlsx')

# 5. Simple analyses with pandas:
#    - first 6 rows of click count / title / source
#    - 學校綜合辦 articles with more than 3000 clicks
#    - articles published by 國際學院 or 學生工作處
#    - articles from one month
# NOTE(review): getnewdetails() stores the counter under 'click'; the
# original analysis read a nonexistent 'clickcount' column (KeyError).
print(df[['title', 'click', 'source']][:6])
print(df[(df['click'] > 3000) & (df['source'] == '學校綜合辦')])

sou = ['國際學院', '學生工作處']
print(df[df['source'].isin(sou)])

# NOTE(review): getnewdetails() never sets a 'time' key, so this month
# lookup only works once publish-time scraping is restored — TODO.
df1 = df.set_index('time')
print(df1['2018-03'])
# --- Scraped search-result snippets (extraction residue, truncated) ---
# Kept for reference but commented out so the file parses:
#
# 資料結構化與儲存
# 1. 將新聞的正文內容儲存到文字檔案。soup = BeautifulSoup(res.text, 'html.parser');
#    content = soup.select('.show-content')[0].text;
#    f = open('news.txt', 'w', encoding='utf-8'); f.write(content); f.c...
#
# 資料結構化與儲存
# 作業是同學的，因為沒有對新聞資訊做提取，所以無法新增新聞資訊到字典。
# 已練習 pandas 庫的相關使用方法，匯出 excel 檔案。PS: 自己的程式碼會盡快修改!
# import requests / from bs4 import BeautifulSoup / from datetime import datetim...
#
# 資料結構化與儲存
# 1. 將新聞的正文內容儲存到文字檔案。newscontent = soup.select('.show-content')[0].text;
#    f = open('news.txt', 'w'); f.write(newscontent);
#    f = open('news.txt', 'r'); print(f.read()) 3. 安裝 pandas, 用 pandas....