./data/city.txt
北京
上海
廣州
深圳
海西
海西蒙古族藏族自治州
./data/scene.txt
故宮
長城
圓明園
外灘
白雲山
歡樂谷
input:
我想去北京的圓明園玩
output:
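[{'start': 3, 'end': 5, 'value': '北京', 'entity': 'city'}, {'start': 6, 'end': 9, 'value': '圓明園', 'entity': 'scene'}]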
檔名作為實體名
絕對匹配
# -*- coding: utf-8 -*-
# @author : xercis
# @time : 2020/5/15 17:00
# @function: 絕對匹配提取實體詞
import os
def read(file_path):
    """Yield the lines of a UTF-8 text file, one at a time.

    This is a generator: the file is streamed line by line instead of being
    loaded into memory in one go.

    Args:
        file_path: Path of the text file to read.

    Yields:
        Each line with leading/trailing whitespace (including the newline)
        removed. Blank lines are yielded as empty strings.
    """
    with open(file_path, mode="r", encoding="utf-8") as f:
        for line in f:
            yield line.strip()  # drop surrounding spaces and the newline
def extract(message, dictionary_path):
    """Extract entity words from a message by exact substring matching.

    Every ``*.txt`` file directly inside *dictionary_path* is treated as a
    dictionary of words, one per line; the file name without the ``.txt``
    suffix is used as the entity name.

    Args:
        message: Text to search.
        dictionary_path: Directory containing the ``*.txt`` dictionaries.

    Returns:
        A list with one dict per dictionary word found in *message*. Only the
        first occurrence of each word is reported, because ``str.find`` is
        used.
    """
    entities = []
    # os.listdir() returns names in arbitrary order; sort for stable output.
    for file_name in sorted(os.listdir(dictionary_path)):
        if not file_name.endswith(".txt"):
            continue
        entity = file_name[:-4]  # file name without ".txt" is the entity name
        for word in read(os.path.join(dictionary_path, file_name)):
            if not word:
                # A blank dictionary line would "match" at index 0, since
                # str.find("") returns 0.
                continue
            start = message.find(word)
            if start != -1:
                # NOTE(review): the original append statement was lost when
                # this listing was extracted; this record layout is a
                # reconstruction - confirm against the upstream article.
                entities.append({
                    "start": start,
                    "end": start + len(word),
                    "value": word,
                    "entity": entity,
                })
    return entities
# Demo: run the extractor on a sample sentence using the dictionaries in ./data.
if __name__ == "__main__":
    print(extract("我想去北京的圓明園玩", dictionary_path="./data"))
優點:逐行讀取檔案,記憶體消耗極小
缺點:執行效率慢
# -*- coding: utf-8 -*-
# @author : xercis
# @time : 2020/5/15 17:00
# @function: 絕對匹配提取實體詞
import os
def read(dictionary_path):
    """Load every ``*.txt`` dictionary in a directory into a dict.

    Args:
        dictionary_path: Directory containing the ``*.txt`` dictionary files.

    Returns:
        A dict mapping each file's name without the ``.txt`` suffix (used as
        the entity name) to the list of lines in that file, as produced by
        ``str.splitlines``. Files without the ``.txt`` suffix are ignored.
    """
    data = {}
    # os.listdir() returns names in arbitrary order; sort for stable key order.
    for file_name in sorted(os.listdir(dictionary_path)):
        if file_name.endswith(".txt"):
            file_path = os.path.join(dictionary_path, file_name)
            entity = os.path.basename(file_path)[:-4]  # file name as entity name
            with open(file_path, mode="r", encoding="utf-8") as f:
                data[entity] = f.read().splitlines()
    return data
def extract(data, message):
    """Extract entity words from a message by exact substring matching.

    Args:
        data: Dict mapping an entity name to a list of dictionary words.
        message: Text to search.

    Returns:
        A list with one dict per dictionary word found in *message*. Only the
        first occurrence of each word is reported, because ``str.find`` is
        used.
    """
    entities = []
    for entity, words in data.items():
        for word in words:
            if not word:
                # A blank dictionary line would "match" at index 0, since
                # str.find("") returns 0.
                continue
            start = message.find(word)
            if start != -1:
                # NOTE(review): the original append statement was lost when
                # this listing was extracted; this record layout is a
                # reconstruction - confirm against the upstream article.
                entities.append({
                    "start": start,
                    "end": start + len(word),
                    "value": word,
                    "entity": entity,
                })
    return entities
# Demo: load the dictionaries from ./data once, then extract from a sample sentence.
if __name__ == "__main__":
    data = read("./data")
    print(extract(data, message="我想去北京的圓明園玩"))
# -*- coding: utf-8 -*-
# @author : xercis
# @time : 2020/5/15 17:00
# @function: 絕對匹配提取實體詞
import os
from itertools import combinations
def read(dictionary_path):
    """Load every ``*.txt`` dictionary in a directory into a dict.

    Args:
        dictionary_path: Directory containing the ``*.txt`` dictionary files.

    Returns:
        A dict mapping each file's name without the ``.txt`` suffix (used as
        the entity name) to the list of lines in that file, as produced by
        ``str.splitlines``. Files without the ``.txt`` suffix are ignored.
    """
    data = {}
    # os.listdir() returns names in arbitrary order; sort for stable key order.
    for file_name in sorted(os.listdir(dictionary_path)):
        if file_name.endswith(".txt"):
            file_path = os.path.join(dictionary_path, file_name)
            entity = os.path.basename(file_path)[:-4]  # file name as entity name
            with open(file_path, mode="r", encoding="utf-8") as f:
                data[entity] = f.read().splitlines()
    return data
def extract(data, message, take_long=False, take_short=False):
    """Extract entity words by exact substring matching, optionally pruning
    matches whose values contain one another.

    Args:
        data: Dict mapping an entity name to a list of dictionary words.
        message: Text to search.
        take_long: When true, of two matches where one value is a substring
            of the other, keep only the longer one.
        take_short: When true, of two such matches keep only the shorter one.

    Returns:
        A list with one dict per dictionary word found in *message* (first
        occurrence only, because ``str.find`` is used), after pruning.

    Raises:
        ValueError: If both *take_long* and *take_short* are true.
    """
    if take_long and take_short:
        raise ValueError("take_long and take_short can not be both true")

    entities = []
    for entity, words in data.items():
        for word in words:
            if not word:
                # A blank dictionary line would "match" at index 0, since
                # str.find("") returns 0.
                continue
            start = message.find(word)
            if start != -1:
                # NOTE(review): the original append statement was lost when
                # this listing was extracted; only the "value" key is
                # confirmed by the pruning code below - confirm the rest
                # against the upstream article.
                entities.append({
                    "start": start,
                    "end": start + len(word),
                    "value": word,
                    "entity": entity,
                })

    if take_long or take_short:
        # combinations() copies its input into a tuple up front, so removing
        # items from `entities` inside the loop is safe.
        for first, second in combinations(entities, 2):
            v0, v1 = first["value"], second["value"]
            if v0 in v1 or v1 in v0:
                # Containment is tested on the values only, not on their
                # positions in the message. When the values have equal
                # length, `first` is treated as the shorter one.
                if len(v0) > len(v1):
                    longer, shorter = first, second
                else:
                    longer, shorter = second, first
                if take_long and shorter in entities:
                    entities.remove(shorter)
                if take_short and longer in entities:
                    entities.remove(longer)
    return entities
# Demo: the same sentence pruned both ways - keep the longer match, then the shorter.
if __name__ == "__main__":
    data = read("./data")
    print(extract(data, message="海西全稱為海西蒙古族藏族自治州", take_long=True))
    print(extract(data, message="海西全稱為海西蒙古族藏族自治州", take_short=True))
# #
python可迭代物件、迭代器和生成器的區別
python os.path() 模組 | 菜鳥教程
python 字串
嚴格匹配 perl的文字匹配提取
perl的正規表示式極其強大,對於文字處理很有優勢。下面這個例子展示在ic驗證中怎麼利用perl的正規表示式做匹配提取。在ic驗證中會寫大量的task function,對很複雜的系統進行 的時候,會大量呼叫task function,如果能夠在進出task function的時候在log中列印出進...
python提取內容 使用Python提取小說內容
具體實現功能如下:輸入小說目錄頁的url之後,指令碼會自動分析目錄頁,提取小說的章節名和章節鏈結位址。然後再從章節鏈結位址逐個提取章節內容。現階段只是將小說從第一章開始,每次提取一章內容,回車之後提取下一章內容。其他網站的結果可能有不同,需要做一定修改。在逐浪測試過正常。coding utf8 usr bi...
Python爬蟲基礎 re模組的提取 匹配和替換
re是python的一個標準庫。為了能更直觀的看出re的效果,我們先新建一個html網頁檔案 可直接複製 index.html email re qq.com 手機號 88888888 ok,然後我們進入主題。re主要有三個功能 提取 匹配 替換。1 提取findall re.findall 正規...