Write-to-database version:

import requests
from bs4 import BeautifulSoup
import numpy as np
import pandas as pd
import sqlalchemy
import pymysql

# Configure the database connection
pymysql_engine = sqlalchemy.create_engine('mysql+pymysql://root:1234@localhost/sampledb?charset=utf8')

root_url = ''
for i in range(1, 4):  # crawl only the first three pages
    url = root_url + "?page={}".format(i)
    print("Fetching page: ", url)
    html = requests.get(url)
    soup = BeautifulSoup(html.text, "lxml")
    items = soup.find("div", class_="items")
    one_page_news = []
    for item in items("div", class_="item-inner")[:4]:  # only the first four stories per page
        title = item.h2.a.string  # news headline
        # full-text link for the story, with the ad fragment stripped
        tmp_url = "" + item.h2.a["href"].split("#")[0] + "?full=y"
        print("Fetching story: ", tmp_url)
        tmp_soup = BeautifulSoup(requests.get(tmp_url).text, "lxml")
        try:
            # story body; items without a text body fall back to a placeholder
            text = "\n".join(tmp_soup.find("div", class_="story-body").stripped_strings)
        except AttributeError:
            text = "新聞"
        lead = item.find("div", class_="item-lead", recursive=False).string  # news summary
        one_news = [title, lead, tmp_url, text]
        one_page_news.append(one_news)
    # Write this page's stories to the database
    print("to database...")
    data = np.array(one_page_news)
    df = pd.DataFrame(data, columns=['title', 'lead', 'tmp_url', 'text'])
    df.to_sql('news', pymysql_engine, if_exists='append', index=False)  # table name 'news' is an assumption
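To spot-check what actually landed in MySQL, here is a minimal read-back sketch, assuming the same engine settings and the hypothetical 'news' table used above:

import pandas as pd
import sqlalchemy

engine = sqlalchemy.create_engine('mysql+pymysql://root:1234@localhost/sampledb?charset=utf8')

# Pull the stored rows back into a DataFrame and show the headlines
df_check = pd.read_sql('SELECT title, tmp_url FROM news', engine)
print(df_check.head())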
JSON version:

import requests
from bs4 import BeautifulSoup
import json

all_page_news = []
root_url = ''
for i in range(1, 4):  # crawl only the first three pages
    url = root_url + "?page={}".format(i)
    print("Fetching page: ", url)
    html = requests.get(url)
    soup = BeautifulSoup(html.text, "lxml")
    items = soup.find("div", class_="items")
    one_page_news = []
    for item in items("div", class_="item-inner")[:4]:  # only the first four stories per page
        title = item.h2.a.string
        tmp_url = "" + item.h2.a["href"].split("#")[0] + "?full=y"
        print("Fetching story: ", tmp_url)
        tmp_soup = BeautifulSoup(requests.get(tmp_url).text, "lxml")
        try:
            text = "\n".join(tmp_soup.find("div", class_="story-body").stripped_strings)
        except AttributeError:
            text = "新聞"
        lead = item.find("div", class_="item-lead", recursive=False).string
        one_news = [title, lead, tmp_url, text]
        one_page_news.append(one_news)
    all_page_news.extend(one_page_news)

# Write to a JSON file
with open("ftnews.json", 'w', encoding="utf8") as file:
    json.dump(all_page_news, file, ensure_ascii=False)  # ensure_ascii=False keeps Chinese text readable

# Read it back:
# with open("ftnews.json", 'r', encoding="utf8") as file:
#     data_in = json.load(file)
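Since each record is a [title, lead, tmp_url, text] list, the saved file can be loaded straight back and, for example, turned into a DataFrame for further analysis; a minimal sketch:

import json
import pandas as pd

with open("ftnews.json", 'r', encoding="utf8") as file:
    data_in = json.load(file)

# Same column order the records were written in
df = pd.DataFrame(data_in, columns=['title', 'lead', 'tmp_url', 'text'])
print(df['title'])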