URL manager:
class UrlManager(object):
    def __init__(self):
        self.new_urls = set()  # URLs not yet crawled
        self.old_urls = set()  # URLs already crawled

    def has_new_url(self):
        # Check whether any uncrawled URLs remain
        return self.new_url_size() != 0

    def get_new_url(self):
        # Take one uncrawled URL and mark it as crawled
        new_url = self.new_urls.pop()
        self.old_urls.add(new_url)
        return new_url

    def add_new_url(self, url):
        # Add a single new URL to the uncrawled set
        if url is None:
            return
        if url not in self.new_urls and url not in self.old_urls:
            self.new_urls.add(url)

    def add_new_urls(self, urls):
        # Add a collection of new URLs to the uncrawled set
        if urls is None or len(urls) == 0:
            return
        for url in urls:
            self.add_new_url(url)

    def new_url_size(self):
        # Number of uncrawled URLs
        return len(self.new_urls)

    def old_url_size(self):
        # Number of crawled URLs
        return len(self.old_urls)
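Used on its own, the manager hands out each URL exactly once and silently drops duplicates. A minimal sketch of that behaviour, assuming the UrlManager class above is in scope (the example.com URLs are placeholders):

manager = UrlManager()
manager.add_new_urls(['https://example.com/a', 'https://example.com/b'])
manager.add_new_url('https://example.com/a')    # already queued, ignored
print(manager.new_url_size())                   # 2

url = manager.get_new_url()                     # moves the URL into old_urls
manager.add_new_url(url)                        # already crawled, ignored
print(manager.new_url_size(), manager.old_url_size())  # 1 1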
HTML downloader:
import requests

class HtmlDownloader(object):
    def download(self, url):
        if url is None:
            return None
        user_agent = 'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:64.0) Gecko/20100101 Firefox/64.0'
        headers = {'User-Agent': user_agent}
        r = requests.get(url, headers=headers)
        if r.status_code == 200:
            r.encoding = 'utf-8'
            return r.text
        return None
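One thing the downloader above does not handle is network failure: requests.get is called without a timeout, and a connection error will propagate as an exception. A hedged sketch of a hardened variant follows; the download_with_retry name, the 10-second timeout and the single retry are choices made here for illustration, not part of the original code:

import requests

def download_with_retry(url, retries=1, timeout=10):
    # Same contract as HtmlDownloader.download, but with a timeout and a retry
    # on transient network errors; returns None if the page cannot be fetched.
    if url is None:
        return None
    headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:64.0) '
                             'Gecko/20100101 Firefox/64.0'}
    for attempt in range(retries + 1):
        try:
            r = requests.get(url, headers=headers, timeout=timeout)
            if r.status_code == 200:
                r.encoding = 'utf-8'
                return r.text
            return None                      # non-200 response: give up immediately
        except requests.RequestException:
            if attempt == retries:
                return None                  # out of retries
    return None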
HTML parser:
import re
from urllib import parse
from bs4 import BeautifulSoup

class HtmlParser(object):
    def parser(self, page_url, html_cont):
        # Parse the page content, extracting new URLs and the data of interest
        if page_url is None or html_cont is None:
            return
        soup = BeautifulSoup(html_cont, 'html.parser')
        new_urls = self._get_new_urls(page_url, soup)
        new_data = self._get_new_data(page_url, soup)
        return new_urls, new_data

    def _get_new_urls(self, page_url, soup):
        # Extract the set of new URLs
        new_urls = set()
        # Find the <a> tags whose href matches the lemma-page pattern
        links = soup.find_all('a', href=re.compile("/item/.*"))
        for link in links:
            # Pull out the href attribute
            new_url = link['href']
            # Join it with the current page URL to form an absolute URL
            new_full_url = parse.urljoin(page_url, new_url)
            new_urls.add(new_full_url)
        return new_urls

    def _get_new_data(self, page_url, soup):
        # Extract the useful data
        data = {}
        data['url'] = page_url
        title = soup.find('dd', class_='lemmaWgt-lemmaTitle-title').find('h1')
        data['title'] = title.get_text()
        summary = soup.find('div', class_='lemma-summary')
        try:
            data['summary'] = summary.get_text()
        except Exception:
            data['summary'] = "No summary available"
        return data
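To see what the parser returns, it can be fed a tiny hand-written page that mimics the markup it looks for; the HTML snippet and URLs below are made up for illustration and are not real Baidu Baike markup:

parser = HtmlParser()
fake_page = '''
<html><body>
  <dd class="lemmaWgt-lemmaTitle-title"><h1>Web crawler</h1></dd>
  <div class="lemma-summary">A web crawler is a bot that systematically browses the web.</div>
  <a href="/item/Search%20engine">Search engine</a>
</body></html>
'''
new_urls, data = parser.parser('https://baike.baidu.com/item/test', fake_page)
print(new_urls)        # {'https://baike.baidu.com/item/Search%20engine'}
print(data['title'])   # Web crawler
print(data['summary']) # A web crawler is a bot that systematically browses the web.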
Data storage:
import codecs

class DataOutput(object):
    def __init__(self):
        self.datas = []

    def store_data(self, data):
        # Queue one parsed record for output
        if data is None:
            return
        self.datas.append(data)

    def output_html(self):
        # Write the collected records out as an HTML table
        fout = codecs.open('baike.html', 'w', encoding='utf-8')
        fout.write("<html>")
        fout.write("<body>")
        fout.write("<table>")
        for data in self.datas:
            fout.write("<tr>")
            fout.write("<td>%s</td>" % data['url'])
            fout.write("<td>%s</td>" % data['title'])
            fout.write("<td>%s</td>" % data['summary'])
            fout.write("</tr>")
        self.datas = []   # clear after the loop instead of removing items while iterating
        fout.write("</table>")
        fout.write("</body>")
        fout.write("</html>")
        fout.close()
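A standalone check of the writer (the two records below are made-up placeholders) produces one table row per stored record:

output = DataOutput()
output.store_data({'url': 'https://example.com/a', 'title': 'A', 'summary': 'first record'})
output.store_data({'url': 'https://example.com/b', 'title': 'B', 'summary': 'second record'})
output.output_html()   # writes baike.html in the current directory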
Spider scheduler:
from dataoutput import DataOutput
from htmldownloader import HtmlDownloader
from htmlparser import HtmlParser
from urlmanager import UrlManager

class SpiderMan(object):
    def __init__(self):
        self.manager = UrlManager()
        self.downloader = HtmlDownloader()
        self.parser = HtmlParser()
        self.output = DataOutput()

    def crawl(self, root_url):
        # Seed the URL manager with the entry URL
        self.manager.add_new_url(root_url)
        # Keep crawling while uncrawled URLs remain and fewer than 100 pages have been fetched
        while self.manager.has_new_url() and self.manager.old_url_size() < 100:
            try:
                new_url = self.manager.get_new_url()
                html = self.downloader.download(new_url)
                new_urls, data = self.parser.parser(new_url, html)
                self.manager.add_new_urls(new_urls)
                self.output.store_data(data)
                # print(data)
                print("Crawled %s links so far" % self.manager.old_url_size())
            except Exception:
                print("crawl failed")
        self.output.output_html()
if __name__ == "__main__":
    spider_man = SpiderMan()
    # Entry point: the Baidu Baike lemma page for "web crawler"
    spider_man.crawl("https://baike.baidu.com/item/網路爬蟲")