#!/usr/bin/env python
# coding=utf-8
# 用正規表示式取資料
import json
import re
import time
from urllib.parse import urljoin

import requests
from bs4 import BeautifulSoup
class downmovie(object):
    """Scrape a movie listing page and the detail page of each listed movie.

    Typical use: call :meth:`get_html` once to fill ``names`` / ``urls``,
    then call :meth:`get_content` per URL and :meth:`write_m` to persist
    the result.
    """

    # Seconds to wait for a single HTTP response; without a timeout
    # ``requests`` can block forever on a stalled connection.
    REQUEST_TIMEOUT = 10

    # Heuristic used by ``_extract_download_url``: first quoted string that
    # looks like an absolute (http/https) or root-relative URL.
    _URL_RE = re.compile(r'''["']((?:https?://|/)[^"'\s]+)["']''')

    def __init__(self):
        self.server = ''
        self.start_url = '/video/list/99?page=1'
        self.proxy = 'username:password@host:port'
        # NOTE(review): the right-hand side of this assignment was missing in
        # the original. This follows the mapping shape documented by
        # ``requests`` (scheme -> proxy URL); confirm the proxy scheme.
        self.proxies = {
            'http': 'http://' + self.proxy,
            'https': 'http://' + self.proxy,
        }
        self.names = []    # movie names, parallel to ``urls``
        self.urls = []     # detail-page URL of each movie
        self.content = {}  # details of the most recently fetched movie
        self.nums = 0      # number of movies found by ``get_html``

    def get_html(self):
        """Fetch the listing page and collect movie names and detail URLs.

        Fills ``self.names`` and ``self.urls`` (same length, same order) and
        stores their length in ``self.nums``.

        Returns:
            None
        """
        html = requests.get(
            urljoin(self.server, self.start_url),
            proxies=self.proxies,
            timeout=self.REQUEST_TIMEOUT,
        ).text
        soup = BeautifulSoup(html, 'lxml')

        # NOTE(review): the original ``find_all('a', )`` had lost its filter
        # argument and the loop had no body. Every anchor carrying an href is
        # taken here, with its text as the name -- narrow the selector to the
        # movie links of the target site.
        self.names = []
        self.urls = []
        for each in soup.find_all('a', href=True):
            self.names.append(each.get_text(strip=True))
            self.urls.append(urljoin(self.server, each['href']))
        self.nums = len(self.names)

    def get_content(self, url):
        """Fetch one movie detail page and extract its information.

        Args:
            url: Detail-page link of the movie.

        Returns:
            A ``(content, userurl)`` tuple: ``content`` is ``self.content``,
            a fresh dict of label -> value pairs (empty when the pattern does
            not match); ``userurl`` is the download link, or ``''`` when none
            is found.
        """
        html = requests.get(
            url, proxies=self.proxies, timeout=self.REQUEST_TIMEOUT
        ).text
        soup = BeautifulSoup(html, 'lxml')

        # Script tags inside the first <div>; tolerate pages without a <div>.
        first_div = soup.find('div')
        movie_dl = first_div.find_all('script') if first_div is not None else []

        # Movie details are pulled out with a regular expression.
        # NOTE(review): the literal HTML anchors between the groups appear to
        # have been stripped from this pattern; it is kept unchanged. Restore
        # the anchors before relying on the extracted fields.
        pattern = re.compile(''
                             + '(.*?)(.*?).*?'  # rating
                             + '(.*?)(.*?).*?'  # duration
                             + '(.*?)(.*?).*?'  # producer
                             + '(.*?)(.*?).*?'  # editor
                             + '(.*?)(.*?).*?'  # screenplay
                             + '(.*?)(.*?).*?class="div180">'  # actor
                             + '(.*?)(.*?)', re.S)  # introduction

        # Start from an empty dict so fields of a previous movie cannot leak
        # into this one.
        self.content = {}
        matches = pattern.findall(html)
        if matches:
            items = matches[0]
            # Groups come in (label, value) pairs: 14 groups -> 7 fields.
            for label, value in zip(items[0::2], items[1::2]):
                self.content[label.strip()] = value.strip()

        # The original returned an undefined name here (NameError).
        userurl = self._extract_download_url(movie_dl)
        return self.content, userurl

    @classmethod
    def _extract_download_url(cls, script_tags):
        """Return the first URL-like quoted string in ``script_tags``, or ''.

        NOTE(review): placeholder heuristic -- the original extraction logic
        is not present in the source; replace with the site-specific rule.
        """
        for tag in script_tags:
            found = cls._URL_RE.search(str(tag))
            if found:
                return found.group(1)
        return ''

    def write_m(self, filename, movie_name, content, dlurl):
        """Append one movie record to a text file.

        The record is the movie name, the JSON-encoded details, the download
        link prefixed with ``self.server``, then a blank line.

        Args:
            filename: Path of the output file (opened in append mode, UTF-8).
            movie_name: Movie name (string).
            content: Movie details; must be JSON-serialisable.
            dlurl: Download link, appended to ``self.server``.

        Returns:
            None
        """
        with open(filename, 'a', encoding='utf-8') as fp:
            fp.write(movie_name + '\n')
            # ensure_ascii=False keeps non-ASCII text readable in the file.
            fp.write(json.dumps(content, ensure_ascii=False) + '\n')
            fp.write(self.server + dlurl)
            fp.write('\n\n')
if __name__ == '__main__':
    # Crawl the listing page once, then fetch and persist each movie in turn.
    dl = downmovie()
    dl.get_html()
    # Walk names and URLs in lockstep rather than indexing by ``dl.nums``, so
    # a count that disagrees with the list lengths cannot raise IndexError.
    for name, url in zip(dl.names, dl.urls):
        content, dlurl = dl.get_content(url)
        dl.write_m('movies.txt', name, content, dlurl)
        time.sleep(1)  # pause between requests to avoid hammering the site
# 爬取豆瓣網電影資訊
# coding utf 8 import urllib2 import bs4 from bs4 import beautifulsoup 爬取豆瓣網電影簡介,包括電影名,導演,評分以及介紹等 class dbtop def init self self.usr agent mozilla 5.0 w...
# Scrapy爬取1908電影網電影資料
# import scrapy class movie1905item scrapy.item define the fields for your item here like name scrapy.field 電影名稱 movie name scrapy.field 評分rating scrapy...
# python爬蟲 爬取豆瓣網電影資訊
# 豆瓣網 如下 import requests import urllib.request if name main 指定ajax get請求的url 通過抓包進行獲取 url 定製請求頭資訊,相關的頭資訊必須封裝在字典結構中 headers import requests import urllib...