python自動提取文字中的時間（包含中文日期）

有時在處理不規則資料時需要提取文字包含的時間日期。

dateutil.parser模組可以統一日期字串格式。

datefinder模組可以在字串中提取日期。

datefinder模組實現也是用正則，功能很全但是對中文不友好。

但是這兩個模組都不能支援中文及一些特殊的情況；所以我用正則寫了一段程式碼，可進行中文日期及一些特殊時間的識別。

例如：'2023年12月12日','3小時前','在2012/12/13哈哈','時間2012-12-11 12:22:30','日期2012-13-11','測試2013.12.24','今天12:13'

import re
import chardet
from datetime import datetime,timedelta
# Templates for every recognised layout, keyed by an id that ``splits`` refers to.
# Each value is ``(regex_template, strptime_template)``. Every ``%s`` slot is
# filled with one separator from a ``splits`` tuple; ``%%`` collapses to a
# literal ``%`` so the strptime directives survive the substitution.
matchs = {
    # Full date with a time of day.
    1: (r'\d{4}%s\d{1,2}%s\d{1,2}%s \d{1,2}%s\d{1,2}%s\d{1,2}%s',
        '%%Y%s%%m%s%%d%s %%H%s%%M%s%%S%s'),
    2: (r'\d{4}%s\d{1,2}%s\d{1,2}%s \d{1,2}%s\d{1,2}%s',
        '%%Y%s%%m%s%%d%s %%H%s%%M%s'),
    # Date only: four-digit year, then two-digit year.
    3: (r'\d{4}%s\d{1,2}%s\d{1,2}%s', '%%Y%s%%m%s%%d%s'),
    4: (r'\d{2}%s\d{1,2}%s\d{1,2}%s', '%%y%s%%m%s%%d%s'),
    # No year.
    5: (r'\d{1,2}%s\d{1,2}%s \d{1,2}%s\d{1,2}%s\d{1,2}%s',
        '%%m%s%%d%s %%H%s%%M%s%%S%s'),
    6: (r'\d{1,2}%s\d{1,2}%s \d{1,2}%s\d{1,2}%s',
        '%%m%s%%d%s %%H%s%%M%s'),
    7: (r'\d{1,2}%s\d{1,2}%s', '%%m%s%%d%s'),
    # No year, month or day.
    8: (r'\d{1,2}%s\d{1,2}%s\d{1,2}%s', '%%H%s%%M%s%%S%s'),
    9: (r'\d{1,2}%s\d{1,2}%s', '%%H%s%%M%s'),
}

# Separator sets that fill the ``%s`` slots above (plain text, not regex).
# NOTE(review): the original table is not recoverable from the published
# listing; these sets were rebuilt to cover the documented sample inputs.
_YMD_SEPS = [('年', '月', '日'), ('-', '-', ''), ('/', '/', ''), ('.', '.', '')]
_MD_SEPS = [('月', '日'), ('-', ''), ('/', ''), ('.', '')]
_HMS_SEPS = [('點', '分', '秒'), ('点', '分', '秒'), (':', ':', '')]
_HM_SEPS = [('點', '分'), ('点', '分'), (':', '')]


def _cross(date_seps, time_seps):
    """Pair every date separator set with every time separator set."""
    return [d + t for d in date_seps for t in time_seps]


# One single-key dict per template id; list order decides matching priority,
# so the longest layouts come first.
splits = [
    {1: _cross(_YMD_SEPS, _HMS_SEPS)},
    {2: _cross(_YMD_SEPS, _HM_SEPS)},
    {3: list(_YMD_SEPS)},
    {4: list(_YMD_SEPS)},
    {5: _cross(_MD_SEPS, _HMS_SEPS)},
    {6: _cross(_MD_SEPS, _HM_SEPS)},
    {7: list(_MD_SEPS)},
    {8: list(_HMS_SEPS)},
    {9: list(_HM_SEPS)},
]


def func(parten, tp):
    """Search ``parten`` within itself and discard the result.

    ``tp`` is unused. The function is not called anywhere in the visible
    code and is kept only so existing imports of the name keep working.
    """
    re.search(parten, parten)


# Relative expressions ("N days/minutes/hours/seconds ago"), accepted in both
# Traditional and Simplified Chinese spellings.
parten_other = r'\d+天前|\d+分鐘前|\d+分钟前|\d+小時前|\d+小时前|\d+秒前'

# Unit marker -> ``timedelta`` keyword used for relative expressions.
_RELATIVE_UNITS = (
    ('天', 'days'),
    ('小時', 'hours'),
    ('小时', 'hours'),
    ('分鐘', 'minutes'),
    ('分钟', 'minutes'),
    ('秒', 'seconds'),
)

# Layout of ``base_date`` strings and of every string returned by ``find_time``.
_OUTPUT_FORMAT = '%Y-%m-%d %H:%M:%S'


class TimeFinder(object):
    """Find dates and times, including Chinese-formatted ones, in free text.

    Fields missing from a match (year, or the whole date) are taken from
    ``base_date``; relative expressions are subtracted from it. Matching is
    heuristic: a bare fragment such as ``3.14`` is read as month.day.

    Attributes:
        base_date: reference ``datetime`` for incomplete or relative matches.
        match_item: list of ``(regex, strptime_format)`` pairs, in priority
            order, built from ``matchs`` and ``splits``.
    """

    def __init__(self, base_date=None):
        """Create a finder.

        Args:
            base_date: a ``datetime``, a ``'%Y-%m-%d %H:%M:%S'`` string, or
                a falsy value to use the current local time.

        Raises:
            TypeError: if ``base_date`` is neither of the accepted forms.
        """
        self.base_date = base_date
        self.match_item = []
        self.init_args()
        self.init_match_item()

    def init_args(self):
        """Normalise ``self.base_date`` to a ``datetime`` instance."""
        if not self.base_date:
            self.base_date = datetime.now()
        elif not isinstance(self.base_date, datetime):
            try:
                self.base_date = datetime.strptime(self.base_date, _OUTPUT_FORMAT)
            except (TypeError, ValueError) as err:
                # The original raised a bare string, which Python 3 reports as
                # TypeError; raise that type explicitly with a usable message.
                raise TypeError(
                    'type of base_date must be str of '
                    '%Y-%m-%d %H:%M:%S or datetime'
                ) from err

    def init_match_item(self):
        """Expand ``matchs`` x ``splits`` into concrete (regex, format) pairs."""
        items = []
        for item in splits:
            for num, value in item.items():
                regex_tpl, format_tpl = matchs[num]
                for sp in value:
                    # Separators are escaped for the regex only; strptime
                    # needs them verbatim.
                    escaped = tuple(re.escape(s) for s in sp)
                    items.append((regex_tpl % escaped, format_tpl % sp))
        # Assign rather than append so repeated calls do not duplicate entries.
        self.match_item = items

    def get_time_other(self, text):
        """Resolve a relative expression such as ``'3小時前'``.

        Returns:
            ``base_date`` minus the stated amount, or ``None`` when ``text``
            has no number, no known unit, or the result is out of range.
        """
        m = re.search(r'\d+', text)
        if not m:
            return None
        num = int(m.group())
        for marker, unit in _RELATIVE_UNITS:
            if marker in text:
                try:
                    return self.base_date - timedelta(**{unit: num})
                except OverflowError:
                    return None
        return None

    def _parse_absolute(self, fragment):
        """Parse one regex hit with the first layout that fits it exactly."""
        base = self.base_date
        for regex, fmt in self.match_item:
            if not re.fullmatch(regex, fragment):
                continue
            # Supply the fields the layout lacks up front instead of patching
            # strptime's 1900-01-01 defaults afterwards: that patching cannot
            # tell a real "January" or "1st" from a default, and Feb 29 never
            # parses against the non-leap year 1900.
            if '%Y' in fmt or '%y' in fmt:
                lead_text = lead_fmt = ''
            elif '%d' in fmt:
                lead_text, lead_fmt = '{:04d} '.format(base.year), '%Y '
            else:
                lead_text = '{:04d}-{:02d}-{:02d} '.format(
                    base.year, base.month, base.day)
                lead_fmt = '%Y-%m-%d '
            try:
                return datetime.strptime(lead_text + fragment, lead_fmt + fmt)
            except ValueError:
                # Looks like a date but is not one (e.g. month 13).
                continue
        return None

    def find_time(self, text):
        """Extract every recognisable date/time from ``text``.

        Args:
            text: ``str`` or ``bytes``; bytes are decoded with the encoding
                guessed by ``chardet`` (UTF-8 when no guess is available).

        Returns:
            A list of ``'%Y-%m-%d %H:%M:%S'`` strings in order of appearance,
            or ``None`` when nothing valid is found.
        """
        if isinstance(text, bytes):
            # chardet may report no encoding at all for short/odd input.
            encoding = chardet.detect(text)['encoding'] or 'utf-8'
            text = text.decode(encoding)
        if not text:
            return None

        parten = '|'.join([x[0] for x in self.match_item] + [parten_other])
        res = []
        for hit in re.finditer(parten, text):
            fragment = hit.group(0)
            if re.fullmatch(parten_other, fragment):
                date = self.get_time_other(fragment)
            else:
                date = self._parse_absolute(fragment)
            if date is not None:
                res.append(date.strftime(_OUTPUT_FORMAT))
        if not res:
            return None
        return res


# Backward-compatible alias: the published listing used the lowercase name.
timefinder = TimeFinder
def test():
    """Run the finder over the sample strings and print each result."""
    # Use a distinct local name: assigning to ``timefinder`` here would make
    # it a local variable and the constructor call would raise
    # UnboundLocalError.
    finder = timefinder(base_date='2020-04-23 00:00:00')
    for text in ['2023年12月12日','3小時前','在2012/12/13哈哈','時間2012-12-11 12:22:30','日期2012-13-11','測試2013.12.24','今天12:13']:
        res = finder.find_time(text)
        print('text----',text)
        print('res---',res)


if __name__ == '__main__':
    test()

測試執行結果如下

text---- 2023年12月12日
res--- ['2023-12-12 00:00:00']
text---- 3小時前
res--- ['2020-04-22 21:00:00']
text---- 在2012/12/13哈哈
res--- ['2012-12-13 00:00:00']
text---- 時間2012-12-11 12:22:30
res--- ['2012-12-11 12:22:30']
text---- 日期2012-13-11
res--- None
text---- 測試2013.12.24
res--- ['2013-12-24 00:00:00']
text---- 今天12:13
res--- ['2020-04-23 12:13:00']

python提取中文字元 Python提取中文字元

寫這個 jupyter 的原因是好幾次自己爬完新聞之後，發現中間有些是 html 標籤或者其他多餘的英文字元，自己也不想保留，那麼這時候一個暴力簡單的方法就是使用 unicode 範圍 \u4e00 - \u9fff 來判別漢字。unicode 分配給中日韓越統一表意文字（漢字）的範圍為 4E00 - 9FFF，目前...

Python提取中文字元

通過Python提取文字中指定開始結尾中的內容

今天，在爬取資料後提取時，遇到了無法直接通過標籤獲取資料的情況。比如說，我想要提取開本的規格 16開。解決方案：後面查詢資料的時候發現，通過正規表示式可以鎖定要提取內容的前後部分。具體操作如下 import re 要提取的原材料 a 想提取的內容假設為 16開分析它前為本後為在正則中表示可代表一切字...

python自動提取文字中的時間（包含中文日期）

python提取中文字元 Python提取中文字元

Python提取中文字元

通過Python提取文字中指定開始 結尾中的內容

相關推薦

通過Python提取文字中指定開始結尾中的內容