NCBI MeSH 詞庫爬蟲獲取（保留樹狀結構）

本文

爬蟲資料獲取位址

#!/usr/bin/env python
# coding: utf8
import time
import requests
from lxml import etree
# Root of the site being crawled; every MeSH path used below is appended to it.
# NOTE(review): the original value was lost when this page was extracted (only
# an empty string remained). The NCBI host is assumed from the article title
# and the '/mesh/...' paths used in this script -- confirm before running.
base_url = 'https://www.ncbi.nlm.nih.gov'


def 獲取某頁面主節點(mesh_url):
    """Fetch a page and return the ``<b>`` element that carries its title.

    The page title is read from the first text node of the element with
    ``class="title"``; the returned node is the first ``<b>`` whose text is
    exactly that title.

    Args:
        mesh_url: Path appended to ``base_url``, e.g. ``'/mesh/68002318'``.

    Returns:
        The matching lxml ``<b>`` element.

    Raises:
        IndexError: If the page has no title text or no ``<b>`` with that text.
    """
    time.sleep(2)  # wait two seconds before every request
    response = requests.get(base_url + mesh_url)
    # lxml exposes the HTML parser as ``etree.HTML`` (upper case).
    page_html = etree.HTML(response.content)
    名稱 = page_html.xpath('//*[@class="title"]/text()')[0]
    # Pass the title as an XPath variable so that quotes inside the title
    # cannot break the expression.
    return page_html.xpath('//b[text()=$name]', name=名稱)[0]


# Recursively collect the child nodes found under a node.
def 遞迴(節點, node_data):
    """Build the child list of a node and store it in ``node_data['child']``.

    Every ``<ul>`` that is a following sibling of ``節點`` becomes one child
    entry whose ``'text'`` and ``'url'`` come from its ``<a>``. Each ``<ul>``
    nested directly inside that element becomes a grandchild entry stored
    under the child's ``'child'`` key. A grandchild whose ``<ul>`` has direct
    text is expanded by fetching its own page and recursing into it.

    Args:
        節點: Element exposing ``xpath()``; its following ``<ul>`` siblings
            are scanned.
        node_data: Dict that is mutated in place; receives the ``'child'``
            list (possibly empty).

    Raises:
        IndexError: If a scanned ``<ul>`` has no ``<a>`` text or href.
    """
    node_child = []
    for mesh_item in 節點.xpath('following-sibling::ul'):
        a_text = mesh_item.xpath('a/text()')[0]
        a_url = mesh_item.xpath('a/@href')[0]
        node_child_data = {'text': a_text, 'url': a_url}
        print('node_child_data', a_text, a_url)

        # Child nodes nested directly inside this item.
        uls = mesh_item.xpath('ul')
        if uls:
            _node_child = []
            for ul in uls:
                _a_url = ul.xpath('a/@href')[0]
                _a_text = ul.xpath('a/text()')[0]
                _node_child_data = {'text': _a_text, 'url': _a_url}
                # Direct text on the <ul> marks a node that has deeper levels:
                # per the original note the result is empty for a leaf and
                # something like [' +'] otherwise.
                if ul.xpath('text()'):
                    遞迴(獲取某頁面主節點(_a_url), _node_child_data)
                print('\t_node_child_data', _a_text, _a_url)
                # Without this append the grandchild list stayed empty.
                _node_child.append(_node_child_data)
            node_child_data['child'] = _node_child
        # Without this append the child list stayed empty.
        node_child.append(node_child_data)
    node_data['child'] = node_child
def fetch_node_details(data, index='1'):
    """Fetch the detail page of a node and of all its descendants.

    NOTE(review): the original function name was lost when this page was
    extracted; ``fetch_node_details`` is a placeholder -- rename as needed.

    The node dict is mutated in place: it receives ``'index'``,
    ``'entryterms'`` and, for every label in ``replace_keys`` found on the
    page, an entry keyed by that label holding the rest of the text.

    Args:
        data: Node dict with at least ``'url'`` and ``'text'``; an optional
            ``'child'`` list holds node dicts of the same shape.
        index: Hierarchical position label stored on the node. Children get
            ``index + '-' + <1-based position>``.

    Raises:
        KeyError: If ``data`` has no ``'url'`` or ``'text'`` key.
    """
    target_url = data['url']
    data['index'] = index
    time.sleep(2)  # wait two seconds before every request
    # lxml exposes the HTML parser as ``etree.HTML`` (upper case).
    page_html = etree.HTML(requests.get(base_url + target_url).content)

    # Entry terms are located through the text of the paragraph before them.
    # NOTE(review): XPath text comparison is case-sensitive and the literals
    # in this function are all lower case, possibly an artefact of the page
    # extraction -- verify them against the live page.
    data['entryterms'] = page_html.xpath(
        '//p[text()="entry terms:"]/following-sibling::ul[1]//text()'
    )

    # Tree number and MeSH unique id: strip the label, keep the value.
    replace_keys = ['tree number(s): ', 'mesh unique id: ']
    for text in page_html.xpath('//*[@class="rprt abstract"]/p/text()'):
        for replace_key in replace_keys:
            if replace_key in text:
                data[replace_key] = text.replace(replace_key, '')
    print('已處理', data['text'])

    # NOTE(review): the recursive call was partly lost in extraction; it is
    # reconstructed from the surviving "'-' + str(target_index + 1))" tail.
    for target_index, target_item in enumerate(data.get('child', [])):
        fetch_node_details(target_item, index + '-' + str(target_index + 1))


if __name__ == '__main__':
    mesh_url = '/mesh/68002318'
    result_data = {}
    遞迴(獲取某頁面主節點(mesh_url), result_data)
    print('result_data', result_data)
    # A saved copy of the URL tree can be loaded here instead of re-crawling.
    # NOTE(review): eval() executes whatever the file contains; prefer
    # ast.literal_eval if this line is ever re-enabled.
    # result_data = eval(open('url.txt').read())
    # Save the data to a file; the context manager closes it even on error.
    with open('data.txt', 'w', encoding='utf8') as file:
        file.write(str(result_data))

獲取儲存過程內容 SQL server

查詢儲存過程內容 select name 儲存過程名稱, definition 儲存過程內容 from sys.sql_modules as m inner join sys.all_objects as o on m.object_id = o.object_id where o.type = 'P' and...

搜狗詞庫爬蟲（2）基礎爬蟲框架的執行流程

系列目錄搜狗詞庫爬蟲 1 基礎爬蟲架構和爬取詞庫分類各模組對應的內容如下 getcategory.py，提取詞庫分類id和名字，以字典形式返回。spiderman.py，爬蟲排程器。urlmanager.py，url管理器。htmlparser.py，網頁解析器。dataoutput.py，資料...

獲取儲存過程的ReturnValue值

input 此引數只用於將資訊從應用程式傳輸到儲存過程。inputoutput 此引數可將資訊從應用程式傳輸到儲存過程，並將資訊從儲存過程傳輸回應用程式。output 此引數只用於將資訊從儲存過程傳輸回應用程式。returnvalue 此引數表示儲存過程的返回值。sql server 的儲存過程引數...

NCBI MeSH 詞庫爬蟲 獲取（保留樹狀結構）

獲取儲存過程內容 SQL server

搜狗詞庫爬蟲（2） 基礎爬蟲框架的執行流程

獲取儲存過程的ReturnValue值

相關推薦

NCBI MeSH 詞庫爬蟲獲取（保留樹狀結構）

搜狗詞庫爬蟲（2）基礎爬蟲框架的執行流程