Before getting to the code, a quick outline of how this simple crawler framework works:
Scheduler: the entry point of the crawler. It drives the other four components (URL manager, HTML downloader, HTML parser, HTML output writer),
looping until there are no URLs left or the crawl is cut off, and then outputs the collected results.
The code:
1. Scheduler

from myspider import urls_manager, html_downloader, html_paser, html_outer

class legendspider(object):
    def __init__(self):
        # URL manager
        self.url_manager = urls_manager.urlmanager()
        # HTML downloader
        self.downloader = html_downloader.htmldownloader()
        # HTML parser
        self.parser = html_paser.htmlparser()
        # HTML output writer
        self.outer = html_outer.htmlouter()

    # start crawling
    def start_crow(self):
        count = 1
        # seed the URL manager with the entry URL (root_url is the module-level global set under __main__ below)
        self.url_manager.add_new_url(root_url)
        # keep crawling as long as the URL manager still has URLs
        while self.url_manager.has_new_url():
            try:
                # take the next URL from the URL manager
                new_url = self.url_manager.get_new_url()
                print('craw: %d: %s' % (count, new_url))
                html_content = self.downloader.download_html(new_url)
                new_urls, new_data = self.parser.parse(new_url, html_content)
                # feed the newly discovered URLs back to the URL manager for later rounds
                self.url_manager.add_new_urls(new_urls)
                # collect the data scraped from this page
                self.outer.collect_datas(new_data)
                if count == 50:
                    break
                count = count + 1
            except Exception:
                print('craw failed')
        # write out everything that was collected
        self.outer.out_put()

if __name__ == '__main__':
    # entry URL of the crawl (left empty here; fill in the page to start from)
    root_url = ''
    # create the spider scheduler
    spider_obj = legendspider()
    # kick off the crawl
    spider_obj.start_crow()
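
The `from myspider import ...` line assumes the other four components live as modules inside a package. A minimal layout that matches that import might look like the sketch below; the package and module names come straight from the import line, while the scheduler's file name is only an assumption for illustration.

# Assumed project layout (inferred from the import line above; the file name
# spider_main.py for the scheduler is hypothetical):
#
#   myspider/
#       __init__.py
#       urls_manager.py      # defines urlmanager
#       html_downloader.py   # defines htmldownloader
#       html_paser.py        # defines htmlparser
#       html_outer.py        # defines htmlouter
#   spider_main.py           # the scheduler shown above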
2. URL manager

class urlmanager(object):
    def __init__(self):
        # on init, define two URL sets: not yet crawled and already crawled
        self.new_urls = set()
        self.old_urls = set()

    # add a single new URL
    def add_new_url(self, new_url):
        if new_url is None:
            return
        if new_url not in self.new_urls and new_url not in self.old_urls:
            self.new_urls.add(new_url)

    # add a batch of new URLs
    def add_new_urls(self, new_urls):
        if new_urls is None or len(new_urls) == 0:
            return
        for new_url in new_urls:
            self.add_new_url(new_url)

    # is there still an uncrawled URL?
    def has_new_url(self):
        return len(self.new_urls) != 0

    # take one uncrawled URL and mark it as crawled
    def get_new_url(self):
        new_url = self.new_urls.pop()
        self.old_urls.add(new_url)
        return new_url
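
A quick way to see the deduplication behaviour is to exercise the manager by hand, for example in an interactive session (the example URLs below are made up):

manager = urlmanager()
manager.add_new_url('https://example.com/item/a')
# duplicates and already-crawled URLs are silently ignored
manager.add_new_urls(['https://example.com/item/a', 'https://example.com/item/b'])
while manager.has_new_url():
    # each distinct URL comes out exactly once, then moves to old_urls
    print(manager.get_new_url())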
3. HTML downloader

from urllib import request

class htmldownloader(object):
    def download_html(self, new_url):
        if new_url is None:
            return None
        # fetch the page and return its body as a UTF-8 string
        with request.urlopen(new_url) as f:
            if f.status != 200:
                return None
            return f.read().decode('utf-8')
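
One thing to note: `urlopen` is called here with urllib's default User-Agent and no timeout, and some sites reject or stall such requests. The sketch below is an optional variant, not part of the original downloader, that sets a browser-like User-Agent and a timeout while keeping the same contract.

from urllib import request

class htmldownloader_with_headers(object):
    # variant downloader: same interface as htmldownloader, plus a
    # browser-like User-Agent header and a request timeout
    def download_html(self, new_url, timeout=10):
        if new_url is None:
            return None
        req = request.Request(new_url, headers={'User-Agent': 'Mozilla/5.0'})
        with request.urlopen(req, timeout=timeout) as f:
            if f.status != 200:
                return None
            return f.read().decode('utf-8')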
4. HTML parser

import re
from urllib import parse
from bs4 import BeautifulSoup

class htmlparser(object):
    def parse(self, page_url, html_content):
        if page_url is None or html_content is None:
            return
        # parsing uses the third-party library BeautifulSoup
        soup = BeautifulSoup(html_content, "html.parser")
        new_urls = self.get_new_urls(page_url, soup)
        new_data = self.get_new_data(page_url, soup)
        return new_urls, new_data

    def get_new_urls(self, page_url, soup):
        new_urls = set()
        # e.g. <a target="_blank" href="/item/%E6%95%99%E5%AD%A6">教學</a>
        links = soup.find_all('a', href=re.compile(r'/item/[0-9a-zA-Z%]+'))
        for link in links:
            new_url = link['href']
            # turn the relative href into an absolute URL
            full_url = parse.urljoin(page_url, new_url)
            new_urls.add(full_url)
        return new_urls

    def get_new_data(self, page_url, soup):
        new_data = {}
        new_data['url'] = page_url
        # <dd class="lemmaWgt-lemmaTitle-title"><h1>...</h1></dd>
        title_node = soup.find('dd', class_='lemmaWgt-lemmaTitle-title').find('h1')
        new_data['title'] = title_node.get_text()
        # <div class="lemma-summary" label-module="lemmaSummary">
        summary_node = soup.find('div', class_='lemma-summary')
        new_data['summary'] = summary_node.get_text()
        return new_data
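
To see what `parse` returns without touching the network, you can feed it a small hand-written snippet shaped like the pages the selectors expect (the HTML below is made up for illustration):

sample_html = '''
<html><body>
  <dd class="lemmaWgt-lemmaTitle-title"><h1>Python</h1></dd>
  <div class="lemma-summary">A short made-up summary.</div>
  <a target="_blank" href="/item/%E6%95%99%E5%AD%A6">教學</a>
</body></html>
'''
parser = htmlparser()
new_urls, new_data = parser.parse('https://example.com/item/Python', sample_html)
print(new_urls)   # {'https://example.com/item/%E6%95%99%E5%AD%A6'}
print(new_data)   # {'url': '...', 'title': 'Python', 'summary': 'A short made-up summary.'}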
5. HTML output writer

class htmlouter(object):
    def __init__(self):
        self.datas = []

    def collect_datas(self, new_data):
        if new_data is None:
            return
        self.datas.append(new_data)

    # write the collected data out as an HTML file
    def out_put(self):
        fout = open('out.html', 'w', encoding='utf-8')
        fout.write('<html>')
        fout.write('<head><meta charset="utf-8"></head>')
        fout.write('<body>')
        fout.write('<table>')
        for data in self.datas:
            fout.write('<tr>')
            fout.write('<td>%s</td>' % data['url'])
            fout.write('<td>%s</td>' % data['title'])
            fout.write('<td>%s</td>' % data['summary'])
            fout.write('</tr>')
        fout.write('</table>')
        fout.write('</body>')
        fout.write('</html>')
        fout.close()
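
As a standalone check, collecting one made-up record and calling `out_put` produces an out.html with a single table row:

outer = htmlouter()
outer.collect_datas({'url': 'https://example.com/item/Python',
                     'title': 'Python',
                     'summary': 'A short made-up summary.'})
outer.out_put()   # writes out.html in the current directory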
The final result of a full run is an out.html file like this: one table row per crawled page, holding its URL, title, and summary.