Required libraries:
python 3.x
urllib
requests
bs4
json
(urllib and json ship with Python 3; requests and bs4 need to be installed separately.)
Lesson page root URLs (the site host is elided throughout the original post):
1: ?lesson_id=457&view=video
2: ?lesson_id=458&view=video
3: ?lesson_id=459&view=video
...
32: ?lesson_id=488&view=video
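Since lesson_id runs consecutively from 457 (lesson 1) to 488 (lesson 32), the per-lesson query strings can be generated rather than listed by hand. A minimal sketch:

# Lessons 1..32 map one-to-one onto lesson_id 457..488.
for n, lesson_id in enumerate(range(457, 489), start=1):
    print("{}: ?lesson_id={}&view=video".format(n, lesson_id))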
mp4 root URL:
mp3 root URL (redacted as *** in the original):
Lesson name API: /community/dailylesson/lessonhandler.ashx?operate=getlessonbyid&v=4&lesson_id=459&transculturecode=zh-cn
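Both handler endpoints return JSON for a given lesson_id. A minimal sketch of querying the lesson-name endpoint with requests (HOST is a placeholder for the site root, which the original elides; the JSON key is taken verbatim from the listing below):

import requests

HOST = ""  # placeholder: the site root is elided in the original post

def fetch_lesson_name(lesson_id):
    # Query the getlessonbyid handler and read the lesson-name field.
    url = HOST + "/community/dailylesson/lessonhandler.ashx?operate=getlessonbyid&v=4&lesson_id={}&transculturecode=zh-cn".format(lesson_id)
    data = requests.get(url, timeout=10).json()
    return data['lesson']['lessonnamewithperfix']

# e.g. fetch_lesson_name(459); the value has a "prefix-name" shape,
# since the code below splits it on "-" and keeps the second part.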
from bs4 import BeautifulSoup  # listed as a dependency in the original, though unused below
import urllib.request
import requests
import json
import os
import time


class EF:
    '''Main body of the whole EF project.'''

    def __init__(self):
        # The site host is elided throughout the original post; only the paths remain.
        self.baseurl = "/community/dailylesson/lessonhandler.ashx?operate=preloaddata&teachculturecode=en&ss=ee&v=4&"
        self.header = {}  # the header dict is elided in the original (a User-Agent usually goes here)
        self.name_url = "/community/dailylesson/lessonhandler.ashx?operate=getlessonbyid&v=4&"

    def getRequestContent(self, url):
        '''Request a page and return the decoded response body.'''
        try:
            req = urllib.request.Request(url, headers=self.header)
            response = urllib.request.urlopen(req, timeout=10)
        except Exception:
            print("Failed to load the page")
        else:
            return response.read().decode('utf-8')

    def spyder(self, url, name_url):
        html = self.getRequestContent(url)
        html4name = self.getRequestContent(name_url)
        data_dict = json.loads(html)
        data_dict_name = json.loads(html4name)
        # JSON keys appear lowercased in the original listing and are kept as-is.
        lesson = data_dict_name['lesson']['lessonnamewithperfix']
        not_allow = ['/', '\\', ':', '*', "'", '"', '<', '>', '|', '?', '\r', '\n']  # characters illegal in file names
        lesson_name = lesson.split("-")[1]
        for char in not_allow:
            if char in lesson_name:
                lesson_name = lesson_name.replace(char, '_')
        # create a folder for this lesson
        if not os.path.exists(lesson_name):
            os.mkdir(lesson_name)
        slides = data_dict['slides'][0]
        localizedslides = slides['localizedslides']
        en = localizedslides['en']
        mediasource = en['mediasource']  # mp4 address
        mp4_url = "" + mediasource  # host prefix elided in the original
        dialogue = en['dialogue']
        sentences = dialogue['sentences']
        en_list = []   # holds the English sentences
        cn_list = []   # holds the Chinese translations
        mp3_list = []  # holds the audio URLs (redacted as *** in the original; mp3 assumed)
        for sentence in sentences:
            text = sentence['sentence']['text']
            mp3 = sentence['sentence']['sentenceaudiosrc']
            trans = sentence['trans']['zh-cn']['text']
            # the append calls are missing from the original listing; without them the lists stay empty
            en_list.append(text)
            cn_list.append(trans)
            mp3_list.append(mp3)
        for en, cn, mp3 in zip(en_list, cn_list, mp3_list):
            print("EN: {}, CN: {}, mp3: {}".format(en, cn, mp3))
            with open(lesson_name + "\\sentences.txt", 'a') as f:
                f.write("EN: {}, CN: {}, mp3: {}".format(en, cn, cn + ".mp3"))
                f.write("\n")
            mp3_url = "" + mp3  # host prefix elided in the original
            self.dl(mp3_url, lesson_name, cn)
            time.sleep(0.5)
        self.dl(mp4_url, lesson_name, "")

    def dl(self, url, fd_name, mp3_name):
        res = requests.get(url, headers=self.header)
        if mp3_name == "":
            fn = fd_name + ".mp4"
        else:
            fn = mp3_name + ".mp3"  # extension redacted in the original; .mp3 assumed
        with open(fd_name + "\\" + fn, 'wb') as f:
            f.write(res.content)


if __name__ == "__main__":
    ef = EF()
    for i in range(457, 489):
        url = ef.baseurl + "lesson_id={}&transculturecode=zh-cn".format(i)
        name_url = ef.name_url + "lesson_id={}&transculturecode=zh-cn".format(i)
        ef.spyder(url, name_url)
        time.sleep(1)
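The time.sleep calls throttle the requests, but a single network failure still aborts the whole run. A minimal retry wrapper is sketched below (the attempt count and backoff factor are illustrative assumptions, not values from the original); dl could call it in place of requests.get:

import time
import requests

def get_with_retry(url, headers=None, attempts=3, backoff=2.0):
    # Fetch url, retrying on network errors with a growing delay.
    # attempts and backoff are illustrative defaults, not from the original post.
    for attempt in range(1, attempts + 1):
        try:
            res = requests.get(url, headers=headers or {}, timeout=10)
            res.raise_for_status()  # treat HTTP 4xx/5xx responses as failures too
            return res
        except requests.RequestException as exc:
            if attempt == attempts:
                raise  # out of attempts; surface the error to the caller
            print("attempt {} failed ({}); retrying...".format(attempt, exc))
            time.sleep(backoff * attempt)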