Implementing Simple Crawler Logic in Python


Crawler scheduler: spider_main.py

# coding:utf8

import url_manager, html_downloader, html_parser, html_outputer


class SpiderMain(object):
    def __init__(self):
        self.urls = url_manager.UrlManager()
        self.downloader = html_downloader.HtmlDownloader()
        self.parser = html_parser.HtmlParser()
        self.outputer = html_outputer.HtmlOutputer()

    def craw(self, root_url):
        count = 1
        # add the first URL to the set of URLs waiting to be crawled
        self.urls.add_new_url(root_url)
        # keep crawling while there are still unvisited URLs
        while self.urls.has_new_url():
            try:
                # take the next URL to crawl
                new_url = self.urls.get_new_url()
                print('craw %d : %s' % (count, new_url))
                html_cont = self.downloader.download(new_url)
                new_urls, new_data = self.parser.parse(new_url, html_cont)
                self.urls.add_new_urls(new_urls)
                self.outputer.collect_data(new_data)
                if count == 1000:
                    break
                count = count + 1
            except Exception:
                print('craw failed')
        # write the collected data out to an HTML file
        self.outputer.output_html()


if __name__ == "__main__":
    root_url = ''
    obj_spider = SpiderMain()
    obj_spider.craw(root_url)
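The entry URL above is left empty. To actually run the scheduler, root_url has to point at a page whose links match the /item/ pattern the parser looks for; the Baidu Baike Python page in the sketch below is an assumed example, not given in the original.

if __name__ == "__main__":
    # hypothetical entry page; any page containing /item/ links would work
    root_url = 'https://baike.baidu.com/item/Python'
    obj_spider = SpiderMain()
    obj_spider.craw(root_url)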

URL manager: url_manager.py

class UrlManager(object):
    def __init__(self):
        self.new_urls = set()
        self.old_urls = set()

    def add_new_url(self, url):
        if url is None:
            return
        if url not in self.new_urls and url not in self.old_urls:
            self.new_urls.add(url)

    def add_new_urls(self, urls):
        if urls is None or len(urls) == 0:
            return
        for url in urls:
            self.add_new_url(url)

    def has_new_url(self):
        return len(self.new_urls) != 0

    def get_new_url(self):
        new_url = self.new_urls.pop()
        self.old_urls.add(new_url)
        return new_url
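The two sets are what prevent repeated and circular crawling: a URL is queued only if it is in neither set, and get_new_url moves it from the pending set to the visited set. A quick usage sketch with made-up URLs, assuming the class is importable from url_manager:

from url_manager import UrlManager

manager = UrlManager()
manager.add_new_url('https://example.com/item/a')
manager.add_new_url('https://example.com/item/a')  # duplicate, silently ignored
print(manager.has_new_url())   # True
url = manager.get_new_url()    # moved from new_urls to old_urls
manager.add_new_url(url)       # already crawled, so it is not re-queued
print(manager.has_new_url())   # False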

HTML downloader: html_downloader.py

import urllib.request


class HtmlDownloader(object):
    def download(self, url):
        if url is None:
            return None
        response = urllib.request.urlopen(url)
        if response.getcode() != 200:
            return None
        return response.read()
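Some pages refuse requests that carry the default urllib User-Agent. A hedged variant of download() that attaches a browser-style header is sketched below; the function name and header value are illustrative, not part of the original code.

import urllib.request

def download_with_headers(url):
    # same logic as HtmlDownloader.download, but with a User-Agent header set
    if url is None:
        return None
    req = urllib.request.Request(url, headers={'User-Agent': 'Mozilla/5.0'})
    response = urllib.request.urlopen(req)
    if response.getcode() != 200:
        return None
    return response.read()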

HTML parser: html_parser.py

from bs4 import BeautifulSoup
import re
import urllib.parse


class HtmlParser(object):
    def parse(self, page_url, html_cont):
        if page_url is None or html_cont is None:
            return
        soup = BeautifulSoup(html_cont, 'html.parser', from_encoding='utf-8')
        # the leading underscore is only a naming convention marking these as
        # internal helpers; the call still uses the method's full name
        new_urls = self._get_new_urls(page_url, soup)
        new_data = self._get_new_data(page_url, soup)
        return new_urls, new_data

    def _get_new_urls(self, page_url, soup):
        new_urls = set()
        # entry links look like /item/python/407313
        links = soup.find_all('a', href=re.compile(r"/item/"))
        for link in links:
            new_url = link['href']
            # relative hrefs are resolved against the page they were found on
            new_full_url = urllib.parse.urljoin(page_url, new_url)
            new_urls.add(new_full_url)
        return new_urls

    def _get_new_data(self, page_url, soup):
        res_data = {}
        # url
        res_data['url'] = page_url
        # title
        title_node = soup.find('dd', class_='lemmaWgt-lemmaTitle-title').find('h1')
        res_data['title'] = title_node.get_text()
        # summary
        summary_node = soup.find('div', class_='lemma-summary')
        res_data['summary'] = summary_node.get_text()
        return res_data
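The parser can be exercised offline with a small HTML fragment that mimics the structure the selectors expect. Everything in the fragment below (link target, base URL, text) is a made-up example for illustration.

from html_parser import HtmlParser

sample = """
<html><body>
  <dd class="lemmaWgt-lemmaTitle-title"><h1>Python</h1></dd>
  <div class="lemma-summary">Python is a programming language.</div>
  <a href="/item/Guido">Guido</a>
</body></html>
"""

urls, data = HtmlParser().parse('https://baike.baidu.com/item/Python', sample)
print(urls)             # {'https://baike.baidu.com/item/Guido'}
print(data['title'])    # Python
print(data['summary'])  # Python is a programming language.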

Outputting the collected data: html_outputer.py

class HtmlOutputer(object):
    def __init__(self):
        self.datas = []

    def collect_data(self, data):
        if data is None:
            return
        self.datas.append(data)

    def output_html(self):
        fout = open('output.html', 'w', encoding='utf-8')
        fout.write("<html>")
        fout.write("<head><meta charset='utf-8'></head>")
        fout.write("<body>")
        fout.write("<table>")
        for data in self.datas:
            fout.write("<tr>")
            fout.write("<td>%s</td>" % data['url'])
            fout.write("<td>%s</td>" % data['title'])
            fout.write("<td>%s</td>" % data['summary'])
            fout.write("</tr>")
        fout.write("</table>")
        fout.write("</body>")
        fout.write("</html>")
        fout.close()
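A minimal end-to-end check of the outputer; the record passed in is a made-up example, assuming the class is importable from html_outputer.

from html_outputer import HtmlOutputer

outputer = HtmlOutputer()
outputer.collect_data({'url': 'https://example.com/item/python',
                       'title': 'Python',
                       'summary': 'A short summary.'})
outputer.output_html()   # writes output.html containing one table row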
