Target: m.weibo.cn
The API URL can be found in the browser's DevTools (F12), under the Network tab's XHR filter.
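As a quick sanity check, an endpoint found this way can be fetched directly and its JSON inspected. A minimal sketch, with a hypothetical containerid (copy the real query string from the XHR request you found; the value below is only illustrative):

import requests

# The containerid below is a placeholder, not guaranteed to be the one you need;
# take the real URL from the F12 > Network > XHR panel.
url = 'https://m.weibo.cn/api/container/getIndex?containerid=102803&page=1'
resp = requests.get(url, headers={'User-Agent': 'Mozilla/5.0'})
print(list(resp.json().keys()))   # a usable endpoint returns a dict with a 'data' key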
weibo_demo.py:

import requests
import json
from w3lib.html import remove_tags
from mysqlhelper import mysqlhelper
import time

helper = mysqlhelper()
max_page = 50

# set the request headers (the dict contents were lost here; m.weibo.cn
# typically needs at least a User-Agent)
headers = {}

def get_one_page_info(url):
    # fields needed: text, comments_count, attitudes_count, reposts_count,
    # created_at, source; stored into MySQL
    response = requests.get(url=url, headers=headers)
    # json.loads() converts the JSON string into a dict
    res_dict = json.loads(response.text)
    cards_list = res_dict['data']['cards']
    # extract the concrete fields
    for card in cards_list:
        if 'mblog' in card:
            text = remove_tags(card['mblog']['text'])
            comments_count = card['mblog']['comments_count']
            attitudes_count = card['mblog']['attitudes_count']
            reposts_count = card['mblog']['reposts_count']
            created_at = card['mblog']['created_at']
            source_a = card['mblog']['source']
            # print(text, comments_count, attitudes_count, reposts_count, created_at, source_a)
            insert_sql = 'insert into weibo_test (source_a, created_at, `text`, comments_count, attitudes_count, reposts_count) values (%s, %s, %s, %s, %s, %s)'
            data = (source_a, created_at, text, comments_count, attitudes_count, reposts_count)
            helper.execute_insert_sql(insert_sql, data)
            # time.sleep(1)

# create table weibo_test(id int primary key auto_increment, source_a varchar(50), created_at varchar(40), `text` text, comments_count int, attitudes_count int, reposts_count int) default charset=utf8;
# truncate table <table_name>;  -- empties the table

if __name__ == '__main__':
    for i in range(max_page):
        # the URL template was elided in the original; paste the XHR request URL
        # found via F12 > Network > XHR, with a slot for the page number
        url = ''.format(i + 1)
        get_one_page_info(url)
        print('page ' + str(i + 1) + ' has done!')
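The commented-out time.sleep(1) is worth enabling: firing max_page requests in a tight loop is an easy way to get rate-limited. A slightly more defensive fetch, as a sketch (fetch_page_safely and its retry/delay numbers are my own illustration, not part of the original script):

import time
import requests

def fetch_page_safely(url, headers=None, retries=3, delay=1.0):
    # retry transient failures and back off a little more on each attempt
    for attempt in range(retries):
        try:
            response = requests.get(url, headers=headers, timeout=10)
            response.raise_for_status()
            return response.json()   # same dict json.loads(response.text) produces
        except (requests.RequestException, ValueError):
            time.sleep(delay * (attempt + 1))
    return None                      # caller decides how to handle a dead page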
mysqlhelper.py:

import pymysql

class mysqlhelper(object):
    def __init__(self):
        self.conn = pymysql.connect(host='localhost', port=3306, db='wb',
                                    user='root', passwd='123456',
                                    charset='utf8')
        # the cursor executes statements; results can be iterated lazily,
        # somewhat like a generator
        self.cursor = self.conn.cursor()

    def execute_insert_sql(self, sql, data):
        self.cursor.execute(sql, data)
        self.conn.commit()

    def __del__(self):
        self.cursor.close()
        self.conn.close()

if __name__ == '__main__':
    # instantiate the helper
    helper = mysqlhelper()
    insert_sql = 'insert into weibo_test (source_a, created_at, `text`, comments_count, attitudes_count, reposts_count) values (%s, %s, %s, %s, %s, %s)'
    data = ('mi', '2020-4-22', '今天天氣好', 2, 3, 5)
    helper.execute_insert_sql(insert_sql, data)
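One design note: execute_insert_sql passes the values separately from the SQL, so pymysql escapes them itself. That matters for this dataset, since weibo text is full of quotes. For example:

# quotes in the text do not break the statement, because pymysql escapes the
# tuple values; building the SQL with string formatting would fail on this row
tricky = ('mi', '2020-4-22', 'it\'s a "quoted" post', 0, 0, 0)
helper.execute_insert_sql(insert_sql, tricky)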
Execution result:
Errata (2020-04-30):
While running the script I later hit pymysql.err.InternalError: (1366, "Incorrect string value: '\xf0\x9f\x98\xb7 ' for column 'text' at row 1"). This error occurs when special characters are written to MySQL. The fix is to add self.cursor.execute("set names utf8mb4;") in mysqlhelper.py.
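The \xf0\x9f\x98\xb7 in the traceback is the UTF-8 encoding of the 😷 emoji, which takes 4 bytes; MySQL's utf8 charset stores at most 3 bytes per character, hence the failure. Besides the set names statement, the connection and the table itself should use utf8mb4. A minimal sketch of the patched __init__:

def __init__(self):
    self.conn = pymysql.connect(host='localhost', port=3306, db='wb',
                                user='root', passwd='123456',
                                charset='utf8mb4')   # 4-byte-safe charset
    self.cursor = self.conn.cursor()
    self.cursor.execute("set names utf8mb4;")        # the fix described above
    # the table must match too, e.g.:
    #   alter table weibo_test convert to character set utf8mb4;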