爬取熱門微博資料(2018-03-27 更新)

2021-08-17 17:45:45 字數 3709 閱讀 6467

import requests
import os
import re
import csv
import time
import json

# NOTE(review): the cookie values were lost when this script was pasted.
# Refresh them from a logged-in m.weibo.cn browser session (F12 -> Network)
# when requests start failing or return login pages.
cookies = {}

# NOTE(review): `headers` is used by every request below but was missing from
# the paste; m.weibo.cn rejects requests without a browser-like User-Agent.
headers = {
    'User-Agent': ('Mozilla/5.0 (Windows NT 10.0; Win64; x64) '
                   'AppleWebKit/537.36 (KHTML, like Gecko) '
                   'Chrome/65.0.3325.181 Safari/537.36'),
}

def get_user_info(usr_id):
    """Fetch a Weibo user's profile via the m.weibo.cn container API.

    Returns a dict with the user's display data (nickname, counts, rank,
    verification) plus the uid/fid/oid/cardid/containerid parameters needed
    by the mblog crawler. Fields that cannot be extracted default to ''.
    """
    # NOTE(review): the URL literal was lost in the original paste; this is the
    # standard m.weibo.cn profile-container endpoint — verify against the
    # browser's XHR traffic before relying on it.
    url = ('https://m.weibo.cn/api/container/getIndex'
           '?type=uid&value={usr_id}').format(usr_id=usr_id)
    resp = requests.get(url, headers=headers, cookies=cookies)
    jsondata = resp.json()

    # All profile fields live under data.userinfo; fetch the sub-dict once
    # instead of re-walking the JSON for every field.
    userinfo = jsondata.get('data').get('userinfo')
    nickname = userinfo.get('screen_name')
    mblog_num = userinfo.get('statuses_count')
    verified = userinfo.get('verified')
    verified_reason = userinfo.get('verified_reason')
    gender = userinfo.get('gender')
    urank = userinfo.get('urank')    # user level
    mbrank = userinfo.get('mbrank')  # membership level
    followers_count = userinfo.get('followers_count')
    follow_count = userinfo.get('follow_count')

    # NOTE(review): the original paste censored this key as '*******_menus';
    # 'toolbar_menus' matches the m.weibo.cn response shape — confirm.
    menus = userinfo.get('toolbar_menus')
    try:
        uid = menus[0].get('params').get('uid')
        fid = menus[1].get('actionlog').get('fid')
        oid = (menus[2].get('params').get('menu_list')[0]
               .get('actionlog').get('oid'))
        cardid = menus[1].get('actionlog').get('cardid')
    except (TypeError, IndexError, AttributeError, KeyError):
        # Menu layout varies by account type; fall back to empty markers
        # rather than crashing the whole crawl.
        uid = ''
        fid = ''
        oid = ''
        cardid = ''
    containerid = (jsondata.get('data').get('tabsinfo')
                   .get('tabs')[0].get('containerid'))

    # NOTE(review): the dict literal was lost in the paste; reconstructed so
    # that main() can read 'uid' and 'oid' as it already does.
    info = {
        'nickname': nickname,
        'mblog_num': mblog_num,
        'verified': verified,
        'verified_reason': verified_reason,
        'gender': gender,
        'urank': urank,
        'mbrank': mbrank,
        'followers_count': followers_count,
        'follow_count': follow_count,
        'uid': uid,
        'fid': fid,
        'oid': oid,
        'cardid': cardid,
        'containerid': containerid,
    }
    print(info)
    return info

def mblog_list(uid, oid):
    """Crawl a user's hot-weibo cards page by page and return them as a list.

    Each element is a dict with created_at, id, cleaned text, and the
    repost/comment/attitude counters. Pages or cards that fail to parse are
    skipped so a single bad record does not abort the crawl.
    """
    mblogs = []  # local renamed from `mblog_list`, which shadowed this function
    # NOTE(review): both URL templates were lost in the original paste; these
    # are reconstructed to match the .format() fields used below — re-check
    # them against the site's XHR requests before use.
    base_url = ('https://m.weibo.cn/api/container/getIndex'
                '?containerid={oid}')
    page_url = ('https://m.weibo.cn/api/container/getIndex'
                '?containerid={oid}&type=uid&value={uid}&page={page}')

    # First request only reads the total count so we know how many pages exist.
    # (The original also forced resp.encoding='gbk' — dropped: m.weibo.cn
    # serves UTF-8 JSON and requests.json() handles decoding itself.)
    resp = requests.get(base_url.format(oid=oid), headers=headers, cookies=cookies)
    response = resp.json()
    total = response['data']['cardlistinfo']['total']
    print(total)

    # 10 cards per page; integer-divide and round up.
    page_num = int(int(total) / 10) + 1
    for page in range(1, page_num + 1):
        p_url = page_url.format(oid=oid, uid=uid, page=page)
        page_resp = requests.get(p_url, headers=headers, cookies=cookies)
        try:
            page_data = page_resp.json()
            cards = page_data['data']['cards']
        except (ValueError, KeyError, TypeError):
            # Malformed page (rate-limited / empty) — skip it and keep going.
            time.sleep(1)
            continue
        for card in cards:
            try:
                mblog = card['mblog']
                created_at = mblog['created_at']
                mblog_id = mblog['id']
                # 'text' arrives as HTML with embedded links; strip the tags.
                # NOTE(review): the original regex patterns were lost in the
                # paste; a plain tag-stripper is used instead — confirm it
                # cleans the same residue the author intended.
                text = re.sub(r'<[^>]+>', '', mblog['text'])
                mblog_data = {
                    'created_at': created_at,
                    'id': mblog_id,
                    'text': text,
                    'reposts_count': mblog['reposts_count'],
                    'comments_count': mblog['comments_count'],
                    'attitudes_count': mblog['attitudes_count'],
                }
                # Original never appended, so it always returned [] — fixed.
                mblogs.append(mblog_data)
                print(' ' * 10, mblog_data)
            except (KeyError, TypeError):
                # Non-mblog cards (ads, headers) lack these keys — skip them.
                continue
        print('................')
        time.sleep(1)  # be polite: throttle one request per second
    return mblogs

def main():
    """Crawl one user's profile, then their hot weibos."""
    # Other accounts previously crawled (kept for convenience):
    # user_id = '1655128924'
    # user_id = '2736225585'
    # user_id = '2386831995'
    user_id = '1282005885'
    user_info = get_user_info(user_id)
    uid = user_info.get('uid')
    oid = user_info.get('oid')
    print(uid, oid)
    mblog_list(uid, oid)
    print('............')


# Guarded so importing this module does not kick off a crawl
# (the original called main() unconditionally at import time).
if __name__ == '__main__':
    main()

自動獲取cookie,爬取新浪微博熱門評論

目錄 一 前言 二 網盤 selenium僅僅用於獲取cookie,實際爬取將直接使用requests請求,以保證爬取效率 話不多說,也不複雜,直接上 了,關鍵的地方有注釋 import requests import selenium from selenium import webdriver ...

Python爬取熱門微博,並儲存到MySQL中

目標 m.weibo.cn url的獲取可以從瀏覽器的f12中的network的xhr中找到。weibo demo.py import requests import json from w3lib.html import remove tags from mysqlhelper import my...

爬取新浪微博

學到的東西。1 習慣用logger,而不是用print self.logger.debug 開始解析 format response.url 2 習慣用正規表示式 這是在pipeline清理資料時用到的 s 5分鐘前 if re.match d 分鐘前 s minute re.match d s g...