搜韻網詩詞採集

2022-07-20 03:45:12 字數 4148 閱讀 6401

搜韻網詩詞採集:該網站會封鎖 IP,建議使用代理進行採集。

from urllib.parse import urljoin

import pymongo
import requests
from pyquery import PyQuery as pq

class poetry:
    """Crawl a dynasty -> poet -> poem site and store each poem in MongoDB.

    The start page lists dynasties, every dynasty page lists poets, and
    every poet page lists poems (possibly spread over several pages).
    Each poem is written as one document to the ``poetry`` collection of
    the ``poetry`` database on ``localhost:27017``.

    NOTE(review): in the copy of this script that was reviewed, several
    string literals were blank: the start URL, the comment URL template,
    the request headers, the "next page" link text and the line-break
    marker inside comment text. They are exposed below as constructor
    arguments / class attributes with empty defaults and must be filled
    in before the crawler can do useful work. CSS selectors and JSON keys
    are kept exactly as found (all lower-case) -- confirm their casing
    against the live site.
    """

    # Text that identifies the "next page" link on a poet page.
    # TODO: the value was blank in the reviewed copy. While it is empty,
    # pagination is skipped, because an empty marker would match every link.
    NEXT_PAGE_MARKER = ''

    # Substring inside a comment's text that is replaced with '\n'.
    # TODO: the value was blank in the reviewed copy. While it is empty,
    # comment text is stored unchanged.
    COMMENT_LINE_BREAK = ''

    # Seconds to wait for the server before a request is abandoned, so a
    # stalled connection cannot hang the whole crawl.
    REQUEST_TIMEOUT = 30

    def __init__(self, start_url='', comment_url='', headers=None):
        """Store the configuration, open the collection and start crawling.

        Args:
            start_url: Page that lists the dynasties; also the base that
                relative links are resolved against.
            comment_url: ``str.format`` template that receives a poem id
                and yields the URL of that poem's commentary JSON.
            headers: Optional mapping of HTTP headers sent with every
                request.
        """
        self.start_url = start_url
        self.comment_url = comment_url
        self.headers = dict(headers) if headers else {}
        # Despite its name this attribute holds the ``poetry`` collection
        # of the ``poetry`` database, not the MongoClient itself.
        self.client = pymongo.MongoClient(host='localhost', port=27017).poetry['poetry']
        # Maps dynasty name -> absolute URL of that dynasty's page.
        self.dynasty_dict = dict()
        self.run()

    def get_response(self, url):
        """Fetch ``url`` with the configured headers and return the response.

        All requests go through this method so request-level settings
        (for example a proxy) can be added in one place.
        """
        return requests.get(url=url, headers=self.headers, timeout=self.REQUEST_TIMEOUT)

    def get_dynasty_content(self):
        """Fill ``self.dynasty_dict`` with the dynasty links on the start page."""
        dynasty_response = self.get_response(self.start_url)
        doc = pq(dynasty_response.content.decode('utf8'))
        for dynasty_doc in doc('.inline1').items():
            dynasty_name = dynasty_doc('a').text()
            dynasty_url = urljoin(self.start_url, dynasty_doc('a').attr('href'))
            self.dynasty_dict[dynasty_name] = dynasty_url

    def get_poetry(self, dynasty, person_name, person_url):
        """Store every poem of one poet, following "next page" links.

        Args:
            dynasty: Dynasty name recorded with each poem.
            person_name: Poet name recorded with each poem.
            person_url: URL of the first page of the poet's poems.
        """
        page_url = person_url
        # Pages are walked iteratively, and a page is never fetched twice,
        # so a link that points back to an earlier page cannot loop forever.
        visited = set()
        while page_url and page_url not in visited:
            visited.add(page_url)
            poetry_response = self.get_response(page_url)
            doc = pq(poetry_response.content.decode('utf8'))
            for poetry_doc in doc('._poem').items():
                record = self._build_poem_record(dynasty, person_name, poetry_doc)
                self.client.insert_one(record)
            page_url = self._find_next_page_url(doc)

    def _build_poem_record(self, dynasty, person_name, poetry_doc):
        """Turn one poem element into the dict that is stored in MongoDB."""
        # The element id looks like "poem_<id>"; a missing id becomes ''.
        poetry_id = (poetry_doc.attr('id') or '').replace('poem_', '')
        poetry_comment = ''
        # Commentary is only requested when the poem element contains a
        # commentary node.
        if poetry_doc('.poemcomment'):
            poetry_comment = self._fetch_comments(poetry_id)
        return {
            '朝代': dynasty,
            '作者': person_name,
            '標題': poetry_doc('.poemcommentlink').text(),
            '標題注釋': poetry_doc('.titlecomment').text(),
            '型別': poetry_doc('.titleindent').text(),
            '內容': poetry_doc('.poemsentence').text().strip(),
            '評注': poetry_comment,
            '注釋': poetry_doc('.poemnote').text(),
        }

    def _fetch_comments(self, poetry_id):
        """Download the commentary JSON for ``poetry_id`` and format it as text."""
        comment_url = self.comment_url.format(poetry_id)
        comment_response = self.get_response(comment_url).json()
        return self._format_comments(comment_response['shidata'][0]['comments'])

    def _format_comments(self, comments):
        """Join comment entries into ``"<book>:\\n<content>\\n"`` blocks.

        Args:
            comments: Iterable of mappings with ``'book'`` and
                ``'content'`` string values.

        Returns:
            The concatenated text, or ``''`` when there are no comments.
        """
        parts = []
        for comment in comments:
            content = comment['content']
            if self.COMMENT_LINE_BREAK:
                content = content.replace(self.COMMENT_LINE_BREAK, '\n')
            parts.append('{}:\n{}\n'.format(comment['book'], content))
        return ''.join(parts)

    def _find_next_page_url(self, doc):
        """Return the absolute URL of the next page, or ``''`` if none is found.

        Scans the links inside the last child ``div`` of ``#content``;
        when several links contain the marker text, the last one wins.
        """
        marker = self.NEXT_PAGE_MARKER
        if not marker:
            return ''
        next_page_url = ''
        for next_page_doc in doc('#content>div:last-child a').items():
            if marker in next_page_doc.text():
                next_page_url = urljoin(self.start_url, next_page_doc.attr('href'))
        return next_page_url

    def get_person_content(self):
        """Visit every known dynasty page and crawl each poet listed on it."""
        for dynasty, dynasty_url in self.dynasty_dict.items():
            person_response = self.get_response(dynasty_url)
            doc = pq(person_response.content.decode('utf8'))
            for person_doc in doc('.inline1').items():
                person_name = person_doc('a').text()
                person_url = urljoin(self.start_url, person_doc('a').attr('href'))
                self.get_poetry(dynasty, person_name, person_url)

    def run(self):
        """Run the whole crawl: collect dynasties first, then their poets and poems."""
        self.get_dynasty_content()
        self.get_person_content()

if __name__ == '__main__':
    # Instantiating the class is enough to start the crawl, because its
    # __init__ calls run().
    poetry()

**未設定**

通訊搜網過程梳理

通訊搜網過程梳理 搜網流程 搜網篇 梳理搜網流程之前,先說幾個常見的名詞,plmn imsi imei plmn mcc 移動國家碼 3位 mnc 流動網路碼 2位 imsi international mobile subscriber identification number,用於區別移動使用...

搜物網 代發貨平台

搜物網 貨源分銷平台 總部所在地 中國 搜物網是乙個貨源分銷平台,它想做的更是乙個整合及簡化 鏈與渠道的平台。搜物網希望幫助電子商務 賣家更簡單有效地推廣與銷售,商也從中擴大銷售增加利潤。工廠作為貨源提供方存在銷售渠道侷限,缺乏拓展電子商務渠道的基因 同時網路c店賣家又多存在資金貨源倉儲物流等物體。...

人才網 採集 退役軍人資訊採集有什麼意義?

一 保證軍人榮譽,使軍人受到尊重。資訊採集可以把老兵們的軍旅生涯中的優秀立功表現與獲得榮譽記錄下來。這樣,即便是退出現役,老兵們曾經的輝煌也不會隨之東流,仍然會被國家被社會記住。加強對退役軍人的尊重。進一步提高退役軍人的社會地位。二 為更好的服務退役軍人打下基礎。充分採集退役軍人群體的資訊,才能展開...