Crawler Program Improvements

2021-08-26 17:52:57

import requests
from lxml import etree
import os

urls = []   # collected article links; the original value was lost in transcription, a list matches the usage below
num = 1     # running article counter used by write_data()

def get_urls(page_num):
    global urls
    headers = {}   # the original header dict was stripped when the post was copied; it typically held a User-Agent
    # loop variable renamed from num to page so it does not shadow the global counter
    for page in range(1, page_num + 1):
        try:
            url = '' + str(page)   # the list-page base URL is redacted in the source post
            data_list = requests.get(url, headers=headers)
            data_list.encoding = 'utf-8'
            data_html = etree.HTML(data_list.text)   # etree.HTML, not etree.html
            data_urls = data_html.xpath('//div[@id="info_list"]/div[@class="list"]/a//@href')
            # data_title = data_html.xpath('//div[@id="info_list"]/div[@class="list"]/a//text()')
            urls += data_urls
        except Exception:
            print(url + " : 獲取失敗...")   # report the failing page URL; data_urls may be undefined here
    print(urls)

def write_data(title, content):
    global num
    if not os.path.exists('./wenzhang'):
        os.makedirs('wenzhang')
    # append every article to one text file, numbering entries with the global counter
    with open('wenzhang/' + 'cnbeta.txt', 'a', encoding='utf-8') as f:
        f.write(' ' + str(num) + ' --> ' + title + ' <--\n\n')
        num += 1
        f.write(content + '\n\n--------------------------\n--------------------------\n\n\n')
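For orientation, given the two f.write calls above, each entry appended to wenzhang/cnbeta.txt comes out roughly like the example below; the angle-bracket lines are placeholders, not real scraped data.

 1 --> <article title> <--

<article body, one line per <p> text node>

--------------------------
--------------------------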

def get_articles(urls):
    headers = {}   # as above, the original header dict was stripped from the post
    for url in urls:
        try:
            new_url = '' + url   # the article base URL is redacted in the source; the hrefs are relative paths
            response = requests.get(new_url, headers=headers)
            response.encoding = "utf-8"
            response_html = etree.HTML(response.text)   # etree.HTML, not etree.html
            title = response_html.xpath('//div[@class="title"]/b//text()')
            print(title)
            content = response_html.xpath('//div[@class="content"]/p//text()')
            content_all = ''
            for content_x in content:
                content_all = content_all + "\n" + content_x
            write_data(title[0], content_all)
        except Exception:
            print(url + " 文章錯誤...")

print('''
這是一個爬蟲程式,爬取的是www.c***a.com的wap手機版頁面.
採集了文章標題,和文章正文.您可以選擇你要的頁數.(每頁35條新聞)
''')

page_num = int(input("請輸入您想得到幾頁的資料: "))

if __name__ == '__main__':
    get_urls(page_num)
    get_articles(urls)
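The title of this post mentions improving the crawler, but the listing above is still strictly sequential: every list page and every article is fetched one after another. As one possible direction, here is a minimal sketch of fetching the article pages concurrently with Python's standard concurrent.futures thread pool. The fetch_one helper, the max_workers value, and the timeout are illustrative assumptions and not part of the original program; the XPath queries and write_data() are reused from the code above.

from concurrent.futures import ThreadPoolExecutor, as_completed

import requests
from lxml import etree

def fetch_one(url, headers):
    # Download a single article page and return (title, body) or None on failure.
    # The base URL is left empty, as in the original post where it was redacted.
    try:
        response = requests.get('' + url, headers=headers, timeout=10)
        response.encoding = 'utf-8'
        html = etree.HTML(response.text)
        title = html.xpath('//div[@class="title"]/b//text()')
        paragraphs = html.xpath('//div[@class="content"]/p//text()')
        return title[0], '\n'.join(paragraphs)
    except Exception:
        return None

def get_articles_concurrent(urls, headers=None, max_workers=8):
    # Downloads run in parallel threads (the work is network-bound, so the GIL
    # is not a bottleneck); writing stays in one thread so the numbering in
    # write_data() is not interleaved.
    headers = headers or {}
    with ThreadPoolExecutor(max_workers=max_workers) as pool:
        futures = [pool.submit(fetch_one, url, headers) for url in urls]
        for future in as_completed(futures):
            result = future.result()
            if result is not None:
                write_data(result[0], result[1])

Note that as_completed() yields results in completion order, so the numbering in cnbeta.txt will no longer match the order of the links; if that order matters, iterate over the futures list in submit order and call result() on each instead.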
