Crawler scheduler: spider_main.py
# coding:utf8
import url_manager, html_downloader, html_parser, html_outputer


class SpiderMain(object):
    def __init__(self):
        # wire up the four components: URL manager, downloader, parser, outputer
        self.urls = url_manager.UrlManager()
        self.downloader = html_downloader.HtmlDownloader()
        self.parser = html_parser.HtmlParser()
        self.outputer = html_outputer.HtmlOutputer()

    def craw(self, root_url):
        count = 1
        # add the first URL to the set of URLs waiting to be crawled
        self.urls.add_new_url(root_url)
        # keep crawling while there are still unvisited URLs
        while self.urls.has_new_url():
            try:
                # take the next URL to crawl
                new_url = self.urls.get_new_url()
                print('craw %d : %s' % (count, new_url))
                html_cont = self.downloader.download(new_url)
                new_urls, new_data = self.parser.parse(new_url, html_cont)
                self.urls.add_new_urls(new_urls)
                self.outputer.collect_data(new_data)
                if count == 1000:
                    break
                count = count + 1
            except Exception:
                print('craw failed')
        # write the collected data out to an HTML file
        self.outputer.output_html()


if __name__ == "__main__":
    root_url = ''
    obj_spider = SpiderMain()
    obj_spider.craw(root_url)
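The __main__ block above leaves root_url empty. A minimal run sketch, assuming a hypothetical Baidu Baike entry page as the seed (any page whose links match the /item/ pattern the parser looks for would work):

# Run sketch; the seed URL below is an assumption for illustration, not from the original.
spider = SpiderMain()
spider.craw('https://baike.baidu.com/item/Python/407313')  # crawls up to 1000 linked pages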
URL manager: url_manager.py
class UrlManager(object):
    def __init__(self):
        self.new_urls = set()   # URLs waiting to be crawled
        self.old_urls = set()   # URLs already crawled

    def add_new_url(self, url):
        if url is None:
            return
        if url not in self.new_urls and url not in self.old_urls:
            self.new_urls.add(url)

    def add_new_urls(self, urls):
        if urls is None or len(urls) == 0:
            return
        for url in urls:
            self.add_new_url(url)

    def has_new_url(self):
        return len(self.new_urls) != 0

    def get_new_url(self):
        new_url = self.new_urls.pop()
        self.old_urls.add(new_url)
        return new_url
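A quick sketch of the de-duplication behaviour (the example URLs are made up for illustration):

manager = UrlManager()
manager.add_new_url('https://example.com/item/a')
manager.add_new_url('https://example.com/item/a')   # duplicate of a pending URL, ignored
print(manager.has_new_url())                         # True
print(manager.get_new_url())                         # https://example.com/item/a, now marked as crawled
manager.add_new_url('https://example.com/item/a')    # already crawled, ignored
print(manager.has_new_url())                         # False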
Page downloader: html_downloader.py

import urllib.request


class HtmlDownloader(object):
    def download(self, url):
        if url is None:
            return None
        # fetch the page; anything other than HTTP 200 is treated as a failure
        response = urllib.request.urlopen(url)
        if response.getcode() != 200:
            return None
        return response.read()
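Some sites reject the default urllib user agent. A small variation (not part of the original code) that sends a browser-like User-Agent header using the standard urllib.request API:

import urllib.request

def download_with_user_agent(url):
    # hypothetical helper, shown only to illustrate adding a request header
    if url is None:
        return None
    request = urllib.request.Request(url, headers={'User-Agent': 'Mozilla/5.0'})
    response = urllib.request.urlopen(request)
    if response.getcode() != 200:
        return None
    return response.read()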
Page parser: html_parser.py
from bs4 import BeautifulSoup
import re
import urllib.parse


class HtmlParser(object):
    def parse(self, page_url, html_cont):
        if page_url is None or html_cont is None:
            return
        soup = BeautifulSoup(html_cont, 'html.parser', from_encoding='utf-8')
        # the leading underscore is only a naming convention marking these
        # helpers as internal; it is not required for calling your own methods
        new_urls = self._get_new_urls(page_url, soup)
        new_data = self._get_new_data(page_url, soup)
        return new_urls, new_data

    def _get_new_urls(self, page_url, soup):
        new_urls = set()
        # entry links look like /item/Python/407313
        links = soup.find_all('a', href=re.compile(r"/item/"))
        for link in links:
            new_url = link['href']
            # turn the relative link into an absolute URL
            new_full_url = urllib.parse.urljoin(page_url, new_url)
            new_urls.add(new_full_url)
        return new_urls

    def _get_new_data(self, page_url, soup):
        res_data = {}
        # url
        res_data['url'] = page_url
        # title: <dd class="lemmaWgt-lemmaTitle-title"><h1>...</h1></dd>
        title_node = soup.find('dd', class_='lemmaWgt-lemmaTitle-title').find('h1')
        res_data['title'] = title_node.get_text()
        # summary: <div class="lemma-summary">
        summary_node = soup.find('div', class_='lemma-summary')
        res_data['summary'] = summary_node.get_text()
        return res_data
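A tiny self-contained check of the parser (the sample HTML below is made up, mimicking the page structure that the class names above target):

sample = b"""
<html><body>
  <dd class="lemmaWgt-lemmaTitle-title"><h1>Python</h1></dd>
  <div class="lemma-summary">Python is an interpreted language.</div>
  <a href="/item/Guido">Guido</a>
</body></html>
"""
urls, data = HtmlParser().parse('https://baike.baidu.com/item/Python', sample)
print(urls)              # {'https://baike.baidu.com/item/Guido'}
print(data['title'])     # Python
print(data['summary'])   # Python is an interpreted language.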
Output the crawled data: html_outputer.py
class HtmlOutputer(object):
    def __init__(self):
        self.datas = []

    def collect_data(self, data):
        if data is None:
            return
        self.datas.append(data)

    def output_html(self):
        fout = open('output.html', 'w', encoding='utf-8')
        fout.write("<html>")
        fout.write("<head><meta charset='utf-8'></head>")
        fout.write("<body>")
        fout.write("<table>")
        for data in self.datas:
            fout.write("<tr>")
            fout.write("<td>%s</td>" % data['url'])
            fout.write("<td>%s</td>" % data['title'])
            fout.write("<td>%s</td>" % data['summary'])
            fout.write("</tr>")
        fout.write("</table>")
        fout.write("</body>")
        fout.write("</html>")
        fout.close()
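A standalone sketch of the outputer (the data dict is made up for illustration):

outputer = HtmlOutputer()
outputer.collect_data({'url': 'https://example.com/item/python',
                       'title': 'Python',
                       'summary': 'An interpreted, high-level language.'})
outputer.output_html()   # writes output.html containing a one-row table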