import requests
import os
import re
import csv
import time
import json
# Cookie values are account-specific and were stripped from the original post;
# fill this dict with the cookies from your own logged-in Weibo session.
cookies = {}
# The headers definition did not survive in the post either; a mobile
# User-Agent is assumed here so that m.weibo.cn serves its JSON API.
headers = {
    'User-Agent': 'Mozilla/5.0 (iPhone; CPU iPhone OS 11_0 like Mac OS X) '
                  'AppleWebKit/604.1.38 (KHTML, like Gecko) Version/11.0 Mobile/15A372 Safari/604.1',
}
# When errors appear that you cannot otherwise resolve, try refreshing the cookies first.
# User info; this response also carries the key parameters uid, fid, and oid.
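If you copy the raw Cookie header out of the browser's developer tools, a small helper can turn it into the dict that requests expects. This is a generic sketch, not part of the original script; the name parse_cookie_string is mine.

def parse_cookie_string(raw):
    """Split a raw 'k1=v1; k2=v2' Cookie header into a dict for requests."""
    pairs = (item.split('=', 1) for item in raw.split(';') if '=' in item)
    return {k.strip(): v for k, v in pairs}

# Example: cookies = parse_cookie_string('SUB=...; SUBP=...')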
def get_user_info(usr_id):
    # The endpoint string was stripped from the original post; the m.weibo.cn
    # container API below is an assumption that matches the JSON fields read here.
    url = 'https://m.weibo.cn/api/container/getIndex?type=uid&value={usr_id}'.format(usr_id=usr_id)
    resp = requests.get(url, headers=headers, cookies=cookies)
    jsondata = resp.json()
    #print(jsondata)
    userinfo = jsondata.get('data').get('userinfo')
    nickname = userinfo.get('screen_name')
    mblog_num = userinfo.get('statuses_count')
    verified = userinfo.get('verified')
    verified_reason = userinfo.get('verified_reason')
    gender = userinfo.get('gender')
    urank = userinfo.get('urank')    # user level
    mbrank = userinfo.get('mbrank')
    followers_count = userinfo.get('followers_count')
    follow_count = userinfo.get('follow_count')
    # The menu key was redacted to '*******_menus' in the original post and is left as-is.
    uid = userinfo.get('*******_menus')[0].get('params').get('uid')
    try:
        fid = userinfo.get('*******_menus')[1].get('actionlog').get('fid')
        oid = userinfo.get('*******_menus')[2].get('params').get('menu_list')[0].get('actionlog').get('oid')
        cardid = userinfo.get('*******_menus')[1].get('actionlog').get('cardid')
    except Exception:
        uid = ''
        fid = ''
        oid = ''
        cardid = ''
    containerid = jsondata.get('data').get('tabsinfo').get('tabs')[0].get('containerid')
    # The dict literal was stripped from the original post; rebuilt here from the
    # fields collected above (main() relies on the 'uid' and 'oid' keys).
    info = {
        'nickname': nickname, 'mblog_num': mblog_num, 'verified': verified,
        'verified_reason': verified_reason, 'gender': gender, 'urank': urank,
        'mbrank': mbrank, 'followers_count': followers_count,
        'follow_count': follow_count, 'uid': uid, 'fid': fid, 'oid': oid,
        'cardid': cardid, 'containerid': containerid,
    }
    print(info)
    return info
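Every .get() in the chains above returns None when a key is missing, so a single absent field makes the next call raise AttributeError. A defensive path-walking helper is one way to harden this; a minimal sketch (the name dig is my own, not from the original post):

def dig(obj, *path, default=None):
    """Walk nested dicts/lists along path, returning default instead of raising."""
    for key in path:
        try:
            obj = obj[key]
        except (KeyError, IndexError, TypeError):
            return default
    return obj

# e.g. nickname = dig(jsondata, 'data', 'userinfo', 'screen_name', default='')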
def mblog_list(uid, oid):
    # NOTE: this local list shadows the function's own name; kept as in the original.
    mblog_list = []
    # Both URL templates were stripped from the original post. The surviving
    # fragment '&type=uid&value=&page=' points at the m.weibo.cn container API,
    # so the templates below are reconstructed as an assumption.
    base_url = 'https://m.weibo.cn/api/container/getIndex?containerid={oid}'
    page_url = 'https://m.weibo.cn/api/container/getIndex?containerid={oid}&type=uid&value={uid}&page={page}'
    url = base_url.format(oid=oid)
    resp = requests.get(url, headers=headers, cookies=cookies)
    resp.encoding = 'utf-8'    # the API serves UTF-8; the original set 'gbk'
    response = resp.json()
    #print(response)
    # total number of hot weibo posts
    total = response['data']['cardlistinfo']['total']
    print(total)
    # number of result pages (10 posts per page)
    page_num = int(int(total) / 10) + 1
    for i in range(1, page_num + 1, 1):
        p_url = page_url.format(oid=oid, uid=uid, page=i)
        #print(p_url)
        page_resp = requests.get(p_url, headers=headers, cookies=cookies)
        page_resp.encoding = 'utf-8'    # the original assigned to resp here by mistake
        page_data = page_resp.json()
        '''filename = '22.json'
        with open(filename, 'w') as f:
            json.dump(page_data, f)'''
        try:
            cards = page_data['data']['cards']
            #print(cards)
            for card in cards:
                #print(card)
                try:
                    mblog = card['mblog']
                    created_at = mblog['created_at']
                    id = mblog['id']
                    dirty_text = mblog['text']    # the raw text is littered with link markup
                    # Both regex patterns were stripped from the original post; the
                    # two substitutions below (drop <a>...</a> blocks, then any
                    # remaining tags) are a plausible reconstruction.
                    cleaned1 = re.sub(r'<a[^>]*>.*?</a>', '', dirty_text)
                    text = re.sub(r'<[^>]+>', '', cleaned1)
                    reposts_count = mblog['reposts_count']
                    comments_count = mblog['comments_count']
                    attitudes_count = mblog['attitudes_count']
                    # The dict literal was stripped from the original post; rebuilt
                    # from the fields parsed above.
                    mblog_data = {
                        'created_at': created_at, 'id': id, 'text': text,
                        'reposts_count': reposts_count,
                        'comments_count': comments_count,
                        'attitudes_count': attitudes_count,
                    }
                    mblog_list.append(mblog_data)
                    print(' ' * 10, mblog_data)
                except Exception:
                    continue
            print('................')
        except Exception:
            continue
        time.sleep(1)    # throttle between pages
    return mblog_list
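The script imports csv but the saving step did not survive in the post. A minimal sketch of how the list returned by mblog_list could be written out; the helper name and filename are my own:

def save_to_csv(rows, filename='mblogs.csv'):
    """Write the list of mblog dicts produced by mblog_list() to a CSV file."""
    if not rows:
        return
    with open(filename, 'w', newline='', encoding='utf-8-sig') as f:
        writer = csv.DictWriter(f, fieldnames=list(rows[0].keys()))
        writer.writeheader()
        writer.writerows(rows)

# Example: save_to_csv(mblog_list(uid, oid))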
def main():
    #user_id = '1655128924'
    #user_id = '2736225585'
    #user_id = '2386831995'
    user_id = '1282005885'
    user_info = get_user_info(user_id)
    uid = user_info.get('uid')
    oid = user_info.get('oid')
    print(uid, oid)
    mblog_list(uid, oid)
    print('............')

if __name__ == '__main__':
    main()
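m.weibo.cn throttles aggressive clients, and the fixed time.sleep(1) above is the script's only defense. A retry wrapper with backoff is a common hardening step; this is a sketch under my own assumptions (a page that keeps failing simply yields None), not part of the original code:

def get_json_with_retry(url, retries=3, backoff=2.0):
    """GET a JSON endpoint, backing off and retrying on network or parse errors."""
    for attempt in range(retries):
        try:
            r = requests.get(url, headers=headers, cookies=cookies, timeout=10)
            r.raise_for_status()
            return r.json()
        except (requests.RequestException, ValueError):
            time.sleep(backoff * (attempt + 1))
    return None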