Getting Started with Python Web Crawlers

2021-08-08 18:35:47

Before the code, a quick outline of how this simple crawler framework fits together:

Scheduler: the entry point; it drives the crawl loop.

URL manager: keeps two sets of URLs, those still to be crawled and those already crawled.

HTML downloader: fetches the page content for a given URL.

HTML parser: extracts new URLs and the data of interest from a page.

HTML outputter: collects the parsed data and writes the results out.

The loop repeats until no URLs are left or the crawl stops, and then the results are output.

Now the code:

1. Scheduler

from myspider import urls_manager, html_downloader, html_paser, html_outer

class LegendSpider(object):
    def __init__(self):
        # URL manager
        self.url_manager = urls_manager.UrlManager()
        # HTML downloader
        self.downloader = html_downloader.HtmlDownloader()
        # HTML parser
        self.parser = html_paser.HtmlParser()
        # HTML outputter
        self.outer = html_outer.HtmlOuter()

    # start crawling
    def start_crow(self, root_url):
        count = 1
        # seed the URL manager with the crawl's entry URL
        self.url_manager.add_new_url(root_url)
        # keep crawling while the URL manager still has URLs
        while self.url_manager.has_new_url():
            try:
                # take the next URL from the URL manager
                new_url = self.url_manager.get_new_url()
                print('craw: %d: %s' % (count, new_url))
                html_content = self.downloader.download_html(new_url)
                new_urls, new_data = self.parser.parse(new_url, html_content)
                # feed newly discovered URLs back into the URL manager
                self.url_manager.add_new_urls(new_urls)
                # collect the scraped data
                self.outer.collect_datas(new_data)
                if count == 50:
                    break
                count = count + 1
            except Exception:
                print('craw failed')
        # write out everything that was collected
        self.outer.out_put()

if __name__ == '__main__':
    # crawl entry point (the seed URL was left blank in the original post)
    root_url = ''
    # create the scheduler object
    spider_obj = LegendSpider()
    # kick off the crawl
    spider_obj.start_crow(root_url)
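The original post left root_url blank. Judging from the /item/... link pattern and the lemma-summary selectors in the parser below, the target was likely a Baidu Baike entry page; a hypothetical seed could look like this (the exact URL is an assumption, not from the original):

# hypothetical seed URL, inferred from the parser's Baidu Baike selectors
root_url = 'https://baike.baidu.com/item/Python'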

2. URL manager

class UrlManager(object):
    def __init__(self):
        # two sets: URLs not yet crawled and URLs already crawled
        self.new_urls = set()
        self.old_urls = set()

    # add a single new URL
    def add_new_url(self, new_url):
        if new_url is None:
            return
        if new_url not in self.new_urls and new_url not in self.old_urls:
            self.new_urls.add(new_url)

    # add a batch of new URLs
    def add_new_urls(self, new_urls):
        if new_urls is None or len(new_urls) == 0:
            return
        for new_url in new_urls:
            self.add_new_url(new_url)

    # are there URLs left to crawl?
    def has_new_url(self):
        return len(self.new_urls) != 0

    # take an uncrawled URL and mark it as crawled
    def get_new_url(self):
        new_url = self.new_urls.pop()
        self.old_urls.add(new_url)
        return new_url
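A quick sanity check of the manager's dedup behavior (a minimal sketch using the UrlManager above; the URL is made up):

manager = UrlManager()
manager.add_new_url('https://example.com/item/a')
manager.add_new_url('https://example.com/item/a')  # duplicate of a pending URL: ignored
url = manager.get_new_url()                        # pops it and marks it as crawled
manager.add_new_url(url)                           # already in old_urls: ignored
print(manager.has_new_url())                       # False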

3. HTML downloader

from urllib import request

class HtmlDownloader(object):
    def download_html(self, new_url):
        if new_url is None:
            return None
        with request.urlopen(new_url) as f:
            if f.status != 200:
                return None
            return f.read().decode('utf-8')
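Some sites reject urllib's default User-Agent. A variant that sends a browser-like header is sketched below; it is not part of the original post, and the header string is just an arbitrary example:

from urllib import request

class HtmlDownloader(object):
    def download_html(self, new_url):
        if new_url is None:
            return None
        # some servers block urllib's default User-Agent, so send a browser-like one
        req = request.Request(new_url, headers={'User-Agent': 'Mozilla/5.0'})
        with request.urlopen(req) as f:
            if f.status != 200:
                return None
            return f.read().decode('utf-8')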

4. HTML parser

import re
from urllib import parse
from bs4 import BeautifulSoup

class HtmlParser(object):
    def parse(self, page_url, html_content):
        if page_url is None or html_content is None:
            return None, None
        # parse with the third-party library BeautifulSoup
        soup = BeautifulSoup(html_content, 'html.parser')
        new_urls = self.get_new_urls(page_url, soup)
        new_data = self.get_new_data(page_url, soup)
        return new_urls, new_data

    def get_new_urls(self, page_url, soup):
        new_urls = set()
        # e.g. <a target="_blank" href="/item/%E6%95%99%E5%AD%A6">教學</a>
        links = soup.find_all('a', href=re.compile(r'/item/[0-9a-zA-Z%]+'))
        for link in links:
            new_url = link['href']
            # relative links must be joined with the page URL
            full_url = parse.urljoin(page_url, new_url)
            new_urls.add(full_url)
        return new_urls

    def get_new_data(self, page_url, soup):
        new_data = {}
        new_data['url'] = page_url
        # <dd class="lemmaWgt-lemmaTitle-title"><h1>...</h1></dd>
        title_node = soup.find('dd', class_='lemmaWgt-lemmaTitle-title').find('h1')
        new_data['title'] = title_node.get_text()
        # <div class="lemma-summary" label-module="lemmaSummary">
        summary_node = soup.find('div', class_='lemma-summary')
        new_data['summary'] = summary_node.get_text()
        return new_data
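To see what parse() returns, here is a self-contained check against an inline HTML snippet shaped like the Baidu Baike markup the comments reference (the snippet and page URL are made up for illustration):

sample_html = '''
<html><body>
<dd class="lemmaWgt-lemmaTitle-title"><h1>Python</h1></dd>
<div class="lemma-summary" label-module="lemmaSummary">An example summary.</div>
<a target="_blank" href="/item/%E6%95%99%E5%AD%A6">教學</a>
</body></html>
'''
parser = HtmlParser()
new_urls, new_data = parser.parse('https://baike.baidu.com/item/Python', sample_html)
print(new_urls)             # {'https://baike.baidu.com/item/%E6%95%99%E5%AD%A6'}
print(new_data['title'])    # Python
print(new_data['summary'])  # An example summary.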

5. HTML outputter

class HtmlOuter(object):
    def __init__(self):
        self.datas = []

    def collect_datas(self, new_data):
        if new_data is None:
            return
        self.datas.append(new_data)

    # write the collected data out as an HTML file
    def out_put(self):
        fout = open('out.html', 'w', encoding='utf-8')
        fout.write('<html>')
        fout.write('<head><meta charset="utf-8"></head>')
        fout.write('<body>')
        fout.write('<table>')
        for data in self.datas:
            fout.write('<tr>')
            fout.write('<td>%s</td>' % data['url'])
            fout.write('<td>%s</td>' % data['title'])
            fout.write('<td>%s</td>' % data['summary'])
            fout.write('</tr>')
        fout.write('</table>')
        fout.write('</body>')
        fout.write('</html>')
        fout.close()
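One caveat with out_put(): the scraped title and summary are written into the table verbatim, so any markup they contain would break or inject into the page. If that matters, the standard library's html.escape can be applied first; a minimal sketch:

import html

def td(value):
    # escape scraped text before embedding it in an HTML table cell
    return '<td>%s</td>' % html.escape(str(value))

print(td('<b>summary</b>'))  # <td>&lt;b&gt;summary&lt;/b&gt;</td>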

The result is an out.html file that renders as a table, one row per crawled page, with the URL, title, and summary in its cells.
