# Save the text to a local file.
# filename is the path of the file to write; "a" opens it in append mode.
with open(filename, "a",encoding='utf-8') as f:
f.write(html)
f.write('\n')
def loadnextlink(url, path):
    """Download ``url`` and evaluate the XPath expression ``path`` on it.

    Args:
        url: Address of the page to fetch.
        path: XPath expression applied to the parsed HTML document.

    Returns:
        The value produced by ``xpath(path)`` on the parsed document.

    Notes:
        Uses the module-level ``headers`` mapping for the request headers and
        the module-level ``etree`` name for parsing; both are defined outside
        this function.
        NOTE(review): ``etree`` is presumably ``lxml.etree`` -- confirm
        against the file's imports.
    """
    request = urllib.request.Request(url, headers=headers)
    # The context manager closes the HTTP response even if read/decode fails.
    with urllib.request.urlopen(request) as response:
        body = response.read().decode("utf-8")
    content = etree.HTML(body)
    return content.xpath(path)
def loadinfo(url, filename):
    """Collect the details of every movie linked from one listing page.

    The listing page at ``url + "/"`` is scanned for movie links; each linked
    page is downloaded once and the labelled fields are extracted from it.

    Args:
        url: Address of the listing page (a trailing "/" is appended).
        filename: Accepted for interface compatibility; the visible code
            never uses it.

    Returns:
        A ``pandas.DataFrame`` with one row per movie and one column per entry
        of ``list_title``. Fields that are not found on a page are left as
        empty strings. (The original built a frame and discarded it; returning
        it is backward-compatible for callers that ignore the result.)
    """
    list_title = ['片名','導演', '編劇', '主演', '型別', '製片國家/地區','語言', '上映時間', '片長', '又名', 'imdb鏈結']
    (k_title, k_director, k_writer, k_cast, k_genre, k_country,
     k_language, k_release, k_runtime, k_alias, k_imdb) = list_title

    def first(values):
        # An XPath that matches nothing yields an empty list; avoid IndexError.
        return values[0] if values else ""

    url = url + "/"
    link_movie = loadnextlink(url, "//div[@class='mod movie-list']/dl/dd/a/@href")
    records = []
    for movie in link_movie:
        # "/*" selects the document's root element, so the page is fetched a
        # single time and every later query runs on the already-parsed tree.
        page = loadnextlink(movie, "/*")[0]
        record = dict.fromkeys(list_title, "")

        record[k_title] = first(page.xpath("//span[@property='v:itemreviewed']/text()"))

        # Credit spans that contain links are assigned, in page order, to
        # director / writer / cast. Extra spans are ignored instead of
        # spilling into the following columns.
        # NOTE(review): if a page lacks one of the three credits the remaining
        # ones still shift left -- confirm the page layout.
        credits = []
        for span in page.xpath("//div[@id='info']/span/span"):
            names = span.xpath("a/text()")
            if names:
                credits.append("/".join(names))
        record.update(zip((k_director, k_writer, k_cast), credits))

        record[k_genre] = "/".join(page.xpath("//div/span[@property='v:genre']/text()"))
        record[k_country] = first(page.xpath("//span[text()='製片國家/地區:']/following::text()"))
        record[k_language] = first(page.xpath('//span[text()="語言:"]/following::text()'))
        record[k_release] = "/".join(page.xpath("//span[@property='v:initialreleasedate']/text()"))
        # Joined like the other multi-valued fields so every cell is a string.
        record[k_runtime] = "/".join(page.xpath("//span[@property='v:runtime']/text()"))
        record[k_alias] = first(page.xpath("//span[text()='又名:']/following::text()"))
        # NOTE(review): the original referenced an ``imdb`` variable that is
        # never assigned in the visible code (NameError). The lookup is
        # missing, so this column stays empty until it is restored.

        records.append(record)

    return pd.DataFrame(records, columns=list_title)
def find_index():
    """Walk every movie category and scrape the first 30 pages of each.

    Category links are read from the index page; for each category,
    ``loadinfo`` is called once per listing page (15 entries per page offset).
    Finally the module-level ``total_dict`` is written to ``movie.csv``.

    NOTE(review): ``url`` is an empty string in the visible code, which
    cannot be fetched -- the real index address appears to have been lost and
    must be restored. ``total_dict`` is not defined in the visible code
    either; it is assumed to be a module-level pandas object.
    """
    # Imported locally so the function works even if the file-level import
    # of ``quote`` is missing.
    from urllib.parse import quote

    url = ""
    # Get the URLs of the movie categories.
    movie_name = loadnextlink(url, "//div[@class='mod']/div[@class='tags list']/ul/li/a/@href")
    for i in movie_name:
        # Drop the first 29 characters of the link.
        # NOTE(review): presumably a fixed URL prefix, leaving the tag name.
        i = i[29:]
        # Percent-encode the tag for use in the URL.
        next_url = url + 'tag/' + quote(i) + "/movie?start="
        # Take the first 30 pages.
        for num in range(30):
            # Build each page URL from the unchanged base; the original
            # appended to next_url itself, so offsets piled up
            # ("...start=0", "...start=015", ...).
            loadinfo(next_url + str(num * 15), i + ".txt")
    # Write everything to a CSV file.
    total_dict.to_csv("movie.csv", index=False, header=False)
# Run the crawl only when the file is executed as a script.
if __name__ == '__main__':
    find_index()
爬取豆瓣網電影資訊
coding utf 8 import urllib2 import bs4 from bs4 import beautifulsoup 爬取豆瓣網電影簡介,包括電影名,導演,評分以及介紹等 class dbtop def init self self.usr agent mozilla 5.0 w...
python爬蟲爬取豆瓣電影資訊
我們準備使用 Python 的 requests 和 lxml 庫,直接安裝完之後開始操作。目標:爬取《肖申克的救贖》資訊(傳送門)。導入庫:import requests、from lxml import etree。給出鏈結 url,獲取網頁 html 前端——一行搞定,在 requests 中已經封裝好了:data reque...
控制流程,爬取豆瓣電影資訊
if 條件判斷 if 條件成立,執行 1,條件不成立,執行 2 1else 2age 19 if age 18 print 你已經成年了!雙分支結構 age 19 if age 18 print 成年 else print 未成年 如果 成績 90,列印 優秀 如果 成績 80 並且 成績 90,列...