1. 將新聞的正文內容儲存到文字檔案。
def writenewsdetail(content):
    """Append the news article body text to gzccnews.txt (UTF-8)."""
    # Original had the open() call fused onto the def line by extraction;
    # 'with' also guarantees the handle is closed even if write() raises.
    with open('gzccnews.txt', 'a', encoding='utf-8') as f:
        f.write(content)
2. 將新聞資料結構化為字典的列表:
import re
from datetime import datetime

import pandas
import requests
from bs4 import BeautifulSoup

# Compatibility alias: later code in this file references the lowercase name
# that the (broken) original import line introduced.
beautifulsoup = BeautifulSoup
def getclickcount(url):
    """Return the click (hit) count for one news article.

    Extracts the numeric news id from the article URL, queries the
    hit-count API, and parses the count out of the returned JS snippet.
    """
    # Last 4 chars of the id segment, e.g. "..._0404_9183.html" -> "9183".
    newid = re.findall(r'\_(.*).html', url)[0][-4:]
    # NOTE(review): the API URL literal was scrubbed to '' during extraction;
    # this is the usual gzcc hit-count endpoint for this exercise -- TODO confirm.
    clickurl = 'http://oa.gzcc.cn/api.php?op=count&id={}&modelid=80'.format(newid)
    clickres = requests.get(clickurl)
    # The endpoint returns JS like $('#hits').html('123'); capture the number.
    # (Raw string: the original non-raw pattern relied on invalid escapes.)
    clickcount = int(re.search(r"hits'\).html\('(.*)'\);", clickres.text).group(1))
    return clickcount
#將獲取新聞詳情的程式碼定義成一個函式 getnewsdetail(newsurl):
def getnewsdetail(newsurl):
    """Fetch one news detail page and return its fields as a dict.

    Returns a dict with keys: title, source, content, click, newsurl.
    """
    resd = requests.get(newsurl)
    resd.encoding = 'utf-8'
    soupd = BeautifulSoup(resd.text, 'html.parser')  # parse the detail page
    news = {}
    news['title'] = soupd.select('.show-title')[0].text
    info = soupd.select('.show-info')[0].text
    # NOTE(review): the if-branches here were lost during extraction; only two
    # orphaned "else: source = 'none'" lines survived.  Reconstructed the
    # customary source-parsing pattern for this exercise -- TODO confirm
    # against the live page markup.
    if info.find('來源:') > 0:
        news['source'] = info[info.find('來源:'):].split()[0].lstrip('來源:')
    else:
        news['source'] = 'none'
    news['content'] = soupd.select('.show-content')[0].text.strip()
    # writenewsdetail(news['content'])  # optionally persist the body text
    news['click'] = getclickcount(newsurl)
    news['newsurl'] = newsurl
    return news
def getnewslist(pageurl):
    """Fetch a news list page and process its first real news item.

    Scans the <li> elements; for the first one carrying a .news-list-title,
    follows its link via getnewsdetail().  The break stops after that one
    item (debug/sampling behavior kept from the original).
    """
    response = requests.get(pageurl)
    response.encoding = 'utf-8'
    page = beautifulsoup(response.text, 'html.parser')
    for item in page.select('li'):
        # Skip <li> elements that are not news entries.
        if not item.select('.news-list-title'):
            continue
        link = item.select('a')[0].attrs['href']  # article URL
        getnewsdetail(link)
        break
def getlistpage(pageurl):
    """Fetch a news list page and return its news items as a list of dicts.

    Each <li> carrying a .news-list-title is followed to its detail page
    via getnewsdetail().
    """
    res = requests.get(pageurl)
    res.encoding = 'utf-8'
    soup = BeautifulSoup(res.text, 'html.parser')
    # NOTE(review): the "[]" initializer and the append inside the loop were
    # scrubbed during extraction; reconstructed here -- TODO confirm.
    newslist = []
    for news in soup.select('li'):
        if len(news.select('.news-list-title')) > 0:
            newsurl = news.select('a')[0].attrs['href']
            newslist.append(getnewsdetail(newsurl))
    return newslist
def getpagen():
    """Return the number of news list pages (10 items per page)."""
    # NOTE(review): the index URL literal was scrubbed to '' during
    # extraction; this is the usual list index for this exercise -- TODO confirm.
    res = requests.get('http://news.gzcc.cn/html/xiaoyuanxinwen/')
    res.encoding = 'utf-8'
    soup = BeautifulSoup(res.text, 'html.parser')
    # The first '#pages a' anchor reads like "518條"; strip the unit suffix.
    n = int(soup.select('#pages a')[0].text.rstrip('條'))
    return n // 10 + 1
#將新聞資料結構化為字典的列表:
# NOTE(review): "newstotal = []" and both URL literals were scrubbed during
# extraction; reconstructed with the usual gzcc list-page URLs -- TODO confirm.
newstotal = []
firstpageurl = 'http://news.gzcc.cn/html/xiaoyuanxinwen/'
newstotal.extend(getlistpage(firstpageurl))
n = getpagen()
# Only the last list page is crawled here (range(n, n+1));
# use range(2, n + 1) to crawl every page.
for i in range(n, n + 1):
    listpageurl = 'http://news.gzcc.cn/html/xiaoyuanxinwen/{}.html'.format(i)
    newstotal.extend(getlistpage(listpageurl))
for news in newstotal:
    print(news)
3. 安裝pandas,用pandas.DataFrame(newstotal),建立一個DataFrame物件df.
# Build a DataFrame from the list of news dicts.
# (Original fused "import pandas" onto this line and lowercased the
# constructor; pandas is already imported at the top of the file and
# the class is pandas.DataFrame.)
df = pandas.DataFrame(newstotal)
print(df)
4. 通過df將提取的資料儲存到csv或excel 檔案。
# Persist the DataFrame to an Excel workbook (needs an Excel writer engine
# such as openpyxl installed -- TODO confirm environment).
df.to_excel('news1.xlsx')
5. 用pandas提供的函式和方法進行資料分析:
# Column/row slicing and boolean filtering examples.
# (Original fused the two print statements onto one line.)
print(df[['click', 'title', 'source']][:6])
print(df[(df['source']=='學校綜合辦')&(df['click']>3000)])
#使用isin篩選值
sou = ['國際學院','學生工作處']
df[df['source'].isin(sou)]
print(df[df['source'].isin(sou)])
# NOTE(review): this requires a 'dt' (date) column; the code that set it in
# getnewsdetail was lost in extraction -- verify before running.
df1 = df.set_index('dt')
print(df1['2018-03'])
6. 儲存到sqlite3資料庫
# Save the DataFrame into a SQLite database.
# (Original fused "import sqlite3" onto the with-line and referenced an
# undefined df3 -- the crawled DataFrame here is df.)
import sqlite3
with sqlite3.connect('gzccnewsdb.sqlite') as db:
    df.to_sql('gzccnews05', con=db, if_exists='replace')
7. 從sqlite3讀資料
# Read the table back out of SQLite into a fresh DataFrame.
# (Original fused the assignment onto the with-line.)
with sqlite3.connect('gzccnewsdb.sqlite') as db:
    df2 = pandas.read_sql_query('select * from gzccnews05', con=db)
print(df2)
8. df儲存到mysql資料庫
# Save df to MySQL through SQLAlchemy + PyMySQL.
# (Original fused both import statements onto one line.)
import pymysql
from sqlalchemy import create_engine
# NOTE(review): the original URL had "localhost://3306", which is a malformed
# host:port separator; corrected to localhost:3306 -- confirm credentials.
conn = create_engine('mysql+pymysql://root:@localhost:3306/gzcc?charset=utf8')
pandas.io.sql.to_sql(df, 'gzccnews', con=conn, if_exists='replace')
資料結構化與儲存
1.將新聞的正文內容儲存到文字檔案。soup beautifulsoup res.text,html.parser content soup.select show content 0 text f open news.txt w encoding utf 8 f.write content f.c...
資料結構化與儲存
作業是 同學的,因為沒有對新聞資訊做提取,所有無法新增新聞資訊到字典。已練習pandas庫的相關使用方法,匯出excel檔案。ps 自己的 會盡快修改!import requests from bs4 import beautifulsoup from datetime import datetim...
資料結構化與儲存
1.將新聞的正文內容儲存到文字檔案。newscontent soup.select show content 0 text f open news.txt w f.write newscontent f open news.txt r print f.read 3.安裝pandas,用pandas....