1.取出一個新聞列表頁的全部新聞,包裝成函式。
2.獲取總的新聞篇數,算出新聞總頁數。
3.獲取全部新聞列表頁的全部新聞詳情。
# Standard library first, third-party second (PEP 8 grouping).
# Fixes extraction damage: missing spaces ("importrequests") and the
# lowercased class name ("beautifulsoup" is not importable from bs4).
import re
from datetime import datetime

import requests
from bs4 import BeautifulSoup
# Fetch the click count for one news article.
def getnewsid(url):
    """Return the click count (int) for the article at *url*.

    The numeric article id is taken from the "_<id>.html" part of the
    page URL; the click-count API is then queried and the number parsed
    out of the JavaScript snippet it returns.
    """
    # Last four characters of the "_<id>.html" segment form the article id.
    newsid = re.findall(r'\_(.*).html', url)[0][-4:]
    # NOTE(review): the endpoint literal was lost in extraction; this is the
    # standard GZCC click-count API used by this assignment -- confirm.
    clickurl = 'http://oa.gzcc.cn/api.php?op=count&id={}&modelid=80'.format(newsid)
    clickres = requests.get(clickurl)
    # The API returns JS like: $('#hits').html('123'); -- extract the number.
    # Raw string avoids the invalid-escape warning the original pattern had.
    clickcount = int(re.search(r"hits'\).html\('(.*)'\);", clickres.text).group(1))
    return clickcount
# Fetch one article page and print its metadata and body text.
def getnewsdetail(newsurl):
    """Download the article at *newsurl*; print timestamp, author, reviewer,
    source, click count, and the article body.
    """
    resd = requests.get(newsurl)
    resd.encoding = 'utf-8'
    soupd = BeautifulSoup(resd.text, 'html.parser')
    content = soupd.select('#content')[0].text
    info = soupd.select('.show-info')[0].text
    # Click count comes from a separate API call.
    count = getnewsid(newsurl)
    # Publication timestamp, e.g. "2018-04-01 09:00:00".
    # NOTE(review): the original pattern was garbled by extraction; this one
    # matches the strptime format used below -- confirm against a live page.
    date = re.search(r'(\d{4}-\d{2}-\d{2}\s\d{2}:\d{2}:\d{2})', info).group(1)
    # Bug fix: the original left author/check/sources unbound when a field
    # was absent, crashing the print below. Default them to ''.
    author = check = sources = ''
    # NOTE(review): the field labels and capture regexes were emptied by
    # extraction; these are the conventional GZCC page labels -- confirm.
    if info.find('作者:') > 0:
        author = re.search(r'作者:(.*?)\s', info).group(1)
    if info.find('審核:') > 0:
        check = re.search(r'審核:(.*?)\s', info).group(1)
    if info.find('來源:') > 0:
        sources = re.search(r'來源:(.*?)\s', info).group(1)
    # Bug fix: the original assigned to the name `datetime`, shadowing the
    # imported class; strptime directives are case-sensitive so the
    # extraction-lowercased '%y-%m-%d %h:%m:%s' could never have worked.
    published = datetime.strptime(date, '%Y-%m-%d %H:%M:%S')
    # NOTE(review): the format-string literal was lost in extraction.
    print('{} {} {} {} {}'.format(published, author, check, sources, count))
    # Bug fix: bare `(content)` was a no-op expression; print was intended.
    print(content)
# Print every news entry on one list page and fetch its detail page.
def getlistpage(listurl):
    """Download the news-list page at *listurl* and process each entry."""
    res = requests.get(listurl)
    res.encoding = 'utf-8'
    soup = BeautifulSoup(res.text, 'html.parser')
    for new in soup.select('li'):
        # Only <li> elements carrying a news title are real entries.
        if len(new.select('.news-list-title')) > 0:
            title = new.select('.news-list-title')[0].text
            description = new.select('.news-list-description')[0].text
            newsurl = new.select('a')[0]['href']
            # NOTE(review): the format-string literal was lost in extraction.
            print('{} {} {}'.format(title, description, newsurl))
            # Fetch the full article details.
            getnewsdetail(newsurl)
            # NOTE(review): stops after the first entry -- this looks like a
            # debugging leftover; remove it to process the whole page.
            break
# Entry point: crawl page 1, derive the total page count from the article
# counter, then crawl the remaining list pages.
# NOTE(review): both URL literals were lost in extraction; these are the
# standard URLs for this assignment -- confirm.
listurl = 'http://news.gzcc.cn/html/xiaoyuanxinwen/'
getlistpage(listurl)
res = requests.get(listurl)
res.encoding = 'utf-8'
soup = BeautifulSoup(res.text, 'html.parser')
# '.a1' holds the total article count, e.g. "123條"; 10 articles per page,
# so integer-divide and add one for the partial last page.
listcount = int(soup.select('.a1')[0].text.rstrip('條')) // 10 + 1
for i in range(2, listcount):
    listurl = 'http://news.gzcc.cn/html/xiaoyuanxinwen/{}.html'.format(i)
    getlistpage(listurl)
4.找一個自己感興趣的主題,進行資料爬取,並進行分詞分析。不能與其它同學雷同。
# Imports for part 4 (topic crawl + jieba word segmentation).
# Fixes extraction damage: missing spaces and the lowercased class name.
from datetime import datetime

import jieba
import requests
from bs4 import BeautifulSoup
# NOTE(review): the topic-list URL literal was lost in extraction -- restore it.
newsurl = ''


def sort(text):
    """Tokenise *text* with jieba and print the five most frequent words.

    Punctuation is stripped first; an exclusion set filters stop-tokens.
    """
    # Characters to remove before tokenising (punctuation, quotes, newlines).
    # Renamed from `str`, which shadowed the builtin.
    punctuation = '''一!「」,。?;』"',.、:\n'''
    for ch in punctuation:
        text = text.replace(ch, '')
    wordlist = list(jieba.cut(text))
    # NOTE(review): the original exclusion-set literal was lost in
    # extraction; whitespace tokens are the usual entries -- confirm.
    exclude = {' ', '\u3000', ''}
    # Single-pass frequency count. The original called wordlist.count(key)
    # per distinct token (O(n^2)) and shadowed the builtin name `dict`.
    counts = {}
    for word in wordlist:
        if word not in exclude:
            counts[word] = counts.get(word, 0) + 1
    # Bug fix: `reverse=true` -- Python's literal is `True`.
    dictlist = sorted(counts.items(), key=lambda kv: kv[1], reverse=True)
    # NOTE(review): the original heading string was lost in extraction.
    print("")
    # Print the five most frequent tokens.
    for i in range(5):
        print(dictlist[i])
# Fetch one article page, print its author (if present) and body, then run
# the word-frequency analysis on the body.
def getcontent(url):
    """Download the article at *url*, print author and body, call sort()."""
    res = requests.get(url)
    res.encoding = 'utf-8'
    soup2 = BeautifulSoup(res.text, 'html.parser')
    for news in soup2.select('.l_a'):
        # The author element is optional on some pages.
        if len(news.select('.author')) > 0:
            author = news.select('.author')[0].text
            print("作者", author)
    content = soup2.select('.la_con')[0].text
    # Bug fix: str.rstrip() strips *characters*, not a suffix -- the original
    # could eat legitimate trailing text made of those characters. Remove the
    # injected ad-script snippet only when it is actually the suffix.
    ad_suffix = 'ad_survey_add_adpos("7000531");'
    if content.endswith(ad_suffix):
        content = content[:-len(ad_suffix)]
    print("正文:", content)
    # Word-frequency analysis on the article body.
    sort(content)
# Walk the news list at *newsurl*: print each item's title, link, brief and
# timestamp, then fetch and analyse the full article.
def getnewdetails(newsurl):
    """Download the topic list page and process every '.item' entry."""
    res = requests.get(newsurl)
    res.encoding = 'utf-8'
    soup = BeautifulSoup(res.text, 'html.parser')
    for news in soup.select('.item'):
        title = news.select('a')[0].attrs['title']
        link = news.select('a')[0].attrs['href']
        # Bug fix: rstrip('[詳細]') strips any of those characters from the
        # end, not the literal suffix; remove the suffix only when present.
        brief = news.select('h5')[0].text
        if brief.endswith('[詳細]'):
            brief = brief[:-len('[詳細]')]
        # Renamed from `time` to avoid shadowing the stdlib module name.
        posted = news.select('h6')[0].text
        # strptime directives are case-sensitive; the extraction-lowercased
        # '%y-%m-%d %h:%m' could never have parsed these timestamps.
        dt = datetime.strptime(posted, '%Y-%m-%d %H:%M')
        # NOTE(review): the label literals of these prints (except 內容簡介)
        # were lost in extraction.
        print("", title)
        print("", link)
        print("內容簡介:", brief)
        print("", dt)
        # Fetch and analyse the full article body.
        getcontent(link)
        print('\n')
# Entry point for part 4: crawl the topic list page defined above.
# NOTE(review): this fetch duplicates the one inside getnewdetails(); the
# `res`/`soup` built here are never used -- kept to preserve behaviour.
res = requests.get(newsurl)
res.encoding = 'utf-8'
soup = BeautifulSoup(res.text, 'html.parser')
getnewdetails(newsurl)
獲取全部校園新聞
1.取出一個新聞列表頁的全部新聞 包裝成函式。2.獲取總的新聞篇數,算出新聞總頁數。3.獲取全部新聞列表頁的全部新聞詳情。4.找一個自己感興趣的主題,進行資料爬取,並進行分詞分析。不能與其它同學雷同。import requests from bs4 import beautifulsoup from...
獲取全部校園新聞
1.取出一個新聞列表頁的全部新聞 包裝成函式。2.獲取總的新聞篇數,算出新聞總頁數。3.獲取全部新聞列表頁的全部新聞詳情。import requests from bs4 import beautifulsoup from datetime import datetime import re 獲得新聞...
獲取全部校園新聞
1.取出一個新聞列表頁的全部新聞 包裝成函式。2.獲取總的新聞篇數,算出新聞總頁數。3.獲取全部新聞列表頁的全部新聞詳情。4.找一個自己感興趣的主題,進行資料爬取,並進行分詞分析。不能與其它同學雷同。import requests from bs4 import beautifulsoup from...