# encoding=utf-8
def get():
    # pros[0].click()
    # # switch to the newly opened tab
    # driver.switch_to.window(driver.window_handles[1])
    # # xpath for a single element
    # title = driver.find_element_by_xpath('').text
    # print(title)
    # # locate the **
    # price = driver.find_element_by_xpath('//爺爺//孫子').text
    # print(price)
    # # close the tab when batch-crawling
    # driver.close()
    # # switch pages
    # # switch back to the original tab
    # driver.switch_to.window(driver.window_handles[0])
    # # loop
    # for i in pros[:3]:
    #     i.click()
    #     driver.switch_to.window(driver.window_handles[0])
    #     # target **, grab each title
    #     pros = driver.find_elements_by_xpath('//div[@class="keyword-out-div"]')
    #     print(len(pros))
    #     pros[0].click()
    #     # switch to the newly opened tab
    #     driver.switch_to.window(driver.window_handles[1])
    #     # xpath for a single element
    #     title = driver.find_element_by_xpath('').text
    #     print(title)
    #     # locate the **
    #     price = driver.find_element_by_xpath('//爺爺//孫子').text
    #     print(price)
    #     print('##############')
    #     # close the tab when batch-crawling
    #     driver.close()
    #     driver.switch_to.window(driver.window_handles[0])
    return 0
if __name__ == '__main__':
    from selenium import webdriver

    url = ''
    # open **, using the Chrome browser
    driver = webdriver.Chrome(r'c:\users\wangxutao\desktop\chromedriver_win32\chromedriver.exe')
    # load the page
    driver.get(url)
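    # NOTE: passing the executable path positionally is the Selenium 3 style;
    # under Selenium 4 it is wrapped in a Service object instead (a hedged
    # alternative, assuming Selenium 4 is installed):
    # from selenium.webdriver.chrome.service import Service
    # driver = webdriver.Chrome(service=Service(r'c:\users\wangxutao\desktop\chromedriver_win32\chromedriver.exe'))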
    # titles_list =
    # on_times_list =
    # target **, grab each title
    pros = driver.find_elements_by_xpath('//div[@class="keyword-out-div"]//p')
    # to_click = driver.find_element_by_xpath('//button[@class="ui-btn ui-btn-inline" id="btn_after"]')
    # to_click.click()
    for pp in range(11 * 30 * 24 * 30 * 10):
        to_click = driver.find_elements_by_xpath('//button[@class="ui-btn ui-btn-inline"]')
        to_click = to_click[1]
        to_click.click()
        import time
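        # Assumption: the "import time" above suggests a pause was intended;
        # a short sleep gives the next page time to render before reading it.
        time.sleep(2)
        # NOTE: "pros" was located once, before this loop; after paging, those
        # element references can go stale and may need re-querying.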
        # record the titles
        for i in pros[:200:4]:
            f = open('2020year_hot_titles_for_weibo.txt', 'a+')
            title = i.text
            # strip the leading rank number and save the title to the txt file
            title = title[title.find('.') + 1:]
            title = title.replace('(微博廣告位留空)', '')
            f.write(title + '\n')
            f.close()
            # print(title)
        # # record how long each entry stayed on the chart
        # for i in pros[0+3:200+3:4]:
        #     on_times = i.text
        #     on_times = on_times[on_times.find('在榜時長:') + 5:]
        # print(on_times_list)
        # print(titles_list)
        # print(on_times_list)
        # import pandas as pd
        # data = pd.DataFrame()
        # data['標題'] = titles_list
        # data['持續時間'] = on_times_list
        # print(data)
        # data.to_csv('mycsv.xlsx')
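
The commented-out pandas block above hints at exporting the titles together with their time on the chart. A minimal sketch of that export, assuming titles_list and on_times_list had been filled in the loops (both are hypothetical here; the script as written only appends titles to the txt file):

import pandas as pd

titles_list = []    # hypothetical: would be filled in the title loop
on_times_list = []  # hypothetical: would be filled in the duration loop
data = pd.DataFrame()
data['標題'] = titles_list
data['持續時間'] = on_times_list
print(data)
# to_csv always writes CSV whatever the extension; use to_excel for real .xlsx
data.to_csv('mycsv.csv', index=False)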
# encoding=utf-8
# jieba cut
def jieba_cut(mytxt):
    import jieba
    mytext = " ".join(jieba.cut(mytxt))
    return mytext
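# jieba.cut returns a generator of tokens, so the join above produces the
# space-separated string WordCloud expects, e.g. (segmentation may vary):
# jieba_cut('微博熱搜榜') -> '微博 熱搜 榜'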
def makeword_cloud(txt):
    from wordcloud import WordCloud
    wordcloud = WordCloud(font_path='simsun.ttf', width=1920, height=1080,
                          mode='RGBA', background_color=None).generate(txt)
    import matplotlib.pyplot as plt
    # plt.switch_backend('agg')
    plt.imshow(wordcloud, interpolation='bilinear')
    plt.axis("off")
    plt.show()  # display the word cloud
if __name__ == '__main__':
    word_cloud = '2020year_hot_titles_for_weibo.txt'
    f = open(file=word_cloud, mode='r')
    content = f.read()
    # content = content.replace('(微博廣告位留空)', ' ')
    content_for_ciyun = jieba_cut(content)
    # replace pandemic-related words with remote-work words before drawing
    content_for_ciyun = content_for_ciyun.replace('感染', '釘釘')
    content_for_ciyun = content_for_ciyun.replace('境外', '')
    content_for_ciyun = content_for_ciyun.replace('輸入', '辦公')
    content_for_ciyun = content_for_ciyun.replace('開學', '')
    content_for_ciyun = content_for_ciyun.replace('時間', '會議')
    print(content_for_ciyun)
    # content_for_ciyun = content_for_ciyun.replace('確診', '釘釘')
    # print(content_for_ciyun)
    makeword_cloud(content_for_ciyun)
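
The chain of replace() calls above swaps words one pair at a time; the same effect can be driven from a mapping, which keeps all the pairs in one place (a sketch, mirroring the hard-coded pairs in the script):

replacements = {'感染': '釘釘', '境外': '', '輸入': '辦公', '開學': '', '時間': '會議'}
for old, new in replacements.items():
    content_for_ciyun = content_for_ciyun.replace(old, new)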