python獲取頁面文字資訊

# -*- coding: utf-8 -*-
import html.parser
import os
import random
import re
import threading
import time
import tkinter.messagebox
import traceback
import urllib.request
from tkinter import *
from tkinter import ttk

import requests
from bs4 import BeautifulSoup
from selenium import webdriver
def gethtml(questionid, page):
    """Render one answers page of a Zhihu question and save its long answers.

    The page is opened in headless Chrome and scrolled repeatedly so that
    lazily loaded answers get rendered.  The text of every answer element
    longer than 300 characters is then written to
    ``<questionid>/page_<page>.txt``.  The ``<questionid>`` directory must
    already exist.

    Args:
        questionid: Question identifier as a string; it is used both in the
            URL and as the name of the output directory.
        page: Page number to fetch; converted with ``str()``.

    Returns:
        The ``BeautifulSoup`` tree parsed from the rendered page source.

    Raises:
        OSError: If the output file cannot be opened (for example because the
            ``<questionid>`` directory does not exist).
        Exception: Any error raised by Selenium propagates to the caller; the
            browser is closed first.
    """
    separator = "\n-----------我是分隔符------\n"
    min_answer_length = 300
    scroll_rounds = 12

    chrome_options = webdriver.ChromeOptions()
    for flag in (
        '--start-maximized',   # maximised window; element lookups fail without it
        '--disable-infobars',  # hide the "controlled by automated software" bar
        '--incognito',         # private browsing mode
        '--headless',          # no visible browser window
    ):
        chrome_options.add_argument(flag)

    # Start the browser.
    driver = webdriver.Chrome(executable_path="chromedriver", options=chrome_options)

    # Simulate a user scrolling through the page.
    def execute_times(times):
        for i in range(times):
            print('第' + str(i) + '次點選')
            driver.execute_script("window.scrollTo(0, " + str(1000 * i) + ");")
            time.sleep(3)
            # NOTE(review): the source lost its indentation, so it is unclear
            # whether this jump to the bottom ran on every round or only once
            # after the loop; either way the last action is a scroll to the
            # bottom of the page. Confirm against the original script.
            driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")

    try:
        # NOTE(review): the scheme/host part of this URL was missing from the
        # source text; the Zhihu question URL is assumed here -- confirm.
        driver.get(
            "https://www.zhihu.com/question/" + questionid
            + "/answers/updated?page=" + str(page)
        )
        execute_times(scroll_rounds)

        # Parse the rendered HTML so the caller can inspect it.
        result_soup = BeautifulSoup(driver.page_source, 'html.parser')

        # The (by, value) locator form is accepted by both Selenium 3 and 4,
        # whereas find_elements_by_class_name is gone from newer Selenium 4
        # releases.
        # NOTE(review): the class name was lower-cased in the source; Zhihu's
        # "RichContent-inner" is assumed -- confirm.
        answers = driver.find_elements("class name", "RichContent-inner")

        pieces = ["start\n"]
        for answer in answers:
            answer_text = answer.text
            if len(answer_text) > min_answer_length:
                pieces.append(answer_text + separator)

        # The target directory has to be created beforehand.
        out_path = questionid + "/page_" + str(page) + ".txt"
        with open(out_path, 'w', encoding="utf-8") as zhpage:
            zhpage.write("".join(pieces))

        print("爬取回答頁面成功!!!")
        return result_soup
    finally:
        # Always shut the browser down, even when a step above failed,
        # so no orphaned Chrome process is left behind.
        driver.quit()
def readtxt(path):
    """Return the whole content of the UTF-8 text file at ``path``.

    Args:
        path: Location of the file to read.

    Returns:
        The decoded file content as a single string.

    Raises:
        OSError: If the file cannot be opened.
        UnicodeDecodeError: If the file is not valid UTF-8.
    """
    # The context manager closes the handle even when read() raises.
    with open(path, 'r', encoding='utf-8') as f:
        return f.read()
def main(questionid, startpage, endpage):
    """Crawl a range of answer pages for one question.

    Creates the output directory named after ``questionid`` and then calls
    ``gethtml`` for every page in ``range(startpage, endpage)``.  A failure on
    one page is printed and does not stop the remaining pages.

    Args:
        questionid: Question identifier as a string; also the output
            directory name.
        startpage: First page to fetch (inclusive); an int or a numeric
            string.
        endpage: Page at which to stop (exclusive); an int or a numeric
            string.

    Raises:
        ValueError: If ``startpage`` or ``endpage`` cannot be converted to
            an integer.
    """
    mkdir([questionid])
    # Values coming from the GUI entry fields may arrive as strings.
    first_page, stop_page = int(startpage), int(endpage)
    for page in range(first_page, stop_page):
        try:
            gethtml(questionid, page)
        except Exception:
            # Best effort: report the failure and carry on with the next page.
            traceback.print_exc()
            continue
        # Pause 5-7 seconds between successful page loads.
        time.sleep(random.randint(5, 7))
def mkdir(paths):
    """Create every directory in ``paths`` that does not exist yet.

    Args:
        paths: Iterable of directory paths.  Paths that already exist are
            left untouched; missing parent directories are created as well.

    Raises:
        OSError: If a directory cannot be created.
    """
    for path in paths:
        if not os.path.exists(path):
            # exist_ok guards against the directory appearing between the
            # existence check and the creation call.
            os.makedirs(path, exist_ok=True)
def getanswer():
    """Button callback: read the form fields and start the crawl.

    Reads the module-level tkinter variables ``var_id``, ``var_start`` and
    ``var_end``, validates them and passes them to ``main``.  Invalid input
    is reported in an error dialog instead of raising inside the callback.
    """
    questionid = str(var_id.get()).strip()
    if not questionid:
        tkinter.messagebox.showerror('輸入錯誤', '請輸入問題標識')
        return
    try:
        # Entry fields hand back text, but main() needs integers for range().
        start = int(str(var_start.get()).strip())
        end = int(str(var_end.get()).strip())
    except ValueError:
        tkinter.messagebox.showerror('輸入錯誤', '開始頁和結束頁必須是整數')
        return
    main(questionid, start, end)
if __name__ == '__main__':
    # Hard-coded batch run: question 308829198, pages 101-199
    # (main() treats the end page as exclusive).
    main(str(308829198), 101, 200)

    # Build the input form: question id plus start/end page and a start button.
    tk = Tk()
    tk.title('獲取知乎問題所有答案')
    tk.geometry('600x150')
    frame = Frame(tk)

    Label(
        tk,
        text='問題標識:(例：324405640/answer/720532471中的324405640 )',
        width=200,
        anchor=W,
        justify=LEFT,
    ).place(x=10, y=10)
    var_id = Variable()
    question_id = Entry(tk, textvariable=var_id, width=30)
    question_id.place(x=10, y=40)

    Label(tk, text='開始頁：').place(x=230, y=40)
    var_start = Variable()
    Entry(tk, textvariable=var_start, width=10).place(x=290, y=40)
    var_start.set(1)

    Label(tk, text='結束頁：').place(x=360, y=40)
    var_end = Variable()
    Entry(tk, textvariable=var_end, width=10).place(x=420, y=40)
    var_end.set(10)

    Button(tk, text="獲取答案", command=getanswer).place(x=200, y=80)

    # NOTE(review): the event loop call below was already commented out in
    # the source, so the window is built but never shown.  Re-enable it (and
    # probably drop the hard-coded main() call above) to use the form.
    # tk.mainloop()

獲取頁面尺寸文字資訊

獲取介面資訊時，我們常用的方法有：size 獲取元素尺寸；text 獲取元素文字；get_attribute(name) 獲取屬性值；is_displayed() 判斷元素是否使用者可見。如下：from selenium import webdriver；driver = webdriver.Chrome() drive...

JS獲取當前頁面頁面URL資訊

URL 即統一資源定位符（Uniform Resource Locator, URL）。完整的 URL 由這幾個部分構成：scheme://host:port/path?query#fragment。scheme：通訊協議，常用的有 http、ftp、mailto 等。設定或獲取 URL 從頭到埠號部分。url windo...

python獲取本機資訊

Python 的確是簡單方便，庫函式完成了許多可能用到的功能，今天學習到的是獲取本機資訊的功能。import socket；def get_host_ip()：查詢本機 IP 位址，return: ip；try: s = socket.socket(socket.AF_INET, socket.SOCK_DGRAM) s....

python獲取頁面文字資訊

獲取頁面尺寸文字資訊

JS獲取當前頁面頁面URL資訊

python獲取本機資訊

相關推薦