forked from ryanInf/Time-NLPY
-
Notifications
You must be signed in to change notification settings - Fork 0
/
TimeNormalizer.py
133 lines (123 loc) · 4.79 KB
/
TimeNormalizer.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
#!/usr/bin/env python
# -*- coding: utf-8 -*-
# @Time : 2017/11/20 16:39
# @Author : zhm revised by stacy
# @File : TimeNormalizer.py
# @Software: PyCharm
import pickle
import regex as re
import arrow
import json
import os
from StringPreHandler import StringPreHandler
from TimePoint import TimePoint
from TimeUnit import TimeUnit
import sys
import imp
# imp.reload(sys)
# sys.setdefaultencoding('utf8')
# Main worker class for time-expression recognition
class TimeNormalizer:
    """Recognize time expressions in a string and normalize them.

    On construction the recognition regex and two holiday tables are loaded
    from the ``resource`` directory next to this module.  ``parse`` is the
    public entry point and returns its result as a JSON string.
    """

    def __init__(self, isPreferFuture=True):
        """Load the recognition resources.

        :param isPreferFuture: stored on the instance as ``isPreferFuture``;
            it is not read in this class (presumably consumed by TimeUnit,
            which receives this normalizer -- confirm).
        """
        self.isPreferFuture = isPreferFuture
        self.pattern, self.holi_solar, self.holi_lunar = self.init()

    def init(self):
        """Load the compiled pattern and the solar/lunar holiday tables.

        The compiled pattern is read from the ``reg.pkl`` cache when possible;
        otherwise it is compiled from ``regex.txt`` and the cache is rewritten
        on a best-effort basis.

        :return: tuple ``(pattern, holi_solar, holi_lunar)``
        """
        # Anchor on the absolute module path: a bare dirname can be '' for a
        # relative __file__, which would turn '/resource/...' into a path at
        # the filesystem root.
        resource_dir = os.path.join(
            os.path.dirname(os.path.abspath(__file__)), 'resource')
        cache_path = os.path.join(resource_dir, 'reg.pkl')
        try:
            # NOTE(security): pickle.load can execute arbitrary code; the
            # cache file must only ever come from a trusted location.
            with open(cache_path, 'rb') as f:
                pattern = pickle.load(f)
        except Exception:
            # Missing, unreadable or incompatible cache: rebuild from source.
            with open(os.path.join(resource_dir, 'regex.txt'), 'r',
                      encoding='utf-8') as f:
                content = f.read()
            pattern = re.compile(str(content))
            try:
                with open(cache_path, 'wb') as f:
                    pickle.dump(pattern, f)
            except OSError:
                # The cache is only an optimisation; a read-only install
                # directory must not prevent the normalizer from working.
                pass
        with open(os.path.join(resource_dir, 'holi_solar.json'), 'r',
                  encoding='utf-8') as f:
            holi_solar = json.load(f)
        with open(os.path.join(resource_dir, 'holi_lunar.json'), 'r',
                  encoding='utf-8') as f:
            holi_lunar = json.load(f)
        return pattern, holi_solar, holi_lunar

    def parse(self, target, timeBase=None):
        """Extract and normalize the time expressions found in ``target``.

        :param target: text to analyse (converted with ``str``)
        :param timeBase: reference time, anything ``arrow.get`` accepts;
            when omitted, the current time at the moment of the call is used
        :return: JSON string holding either an ``error`` message, or a
            ``type`` of ``timedelta`` / ``timestamp`` / ``timespan`` together
            with the matching value under the same key
        """
        # Resolve "now" per call; a default of arrow.now() in the signature
        # would be evaluated only once, when the class is defined.
        if timeBase is None:
            timeBase = arrow.now()
        # These flags are reset here and read back after extraction;
        # presumably TimeUnit updates them through the normalizer reference
        # it is given -- confirm against TimeUnit.
        self.isTimeSpan = False
        self.invalidSpan = False
        self.timeSpan = ''
        self.target = str(target)
        self.timeBase = arrow.get(timeBase).format('YYYY-M-D-H-m-s')
        self.oldTimeBase = self.timeBase
        self.__preHandling()
        self.timeToken = self.__timeEx()
        dic = {}
        res = self.timeToken

        if self.isTimeSpan:
            if self.invalidSpan:
                dic['error'] = 'no time pattern could be extracted.'
            else:
                dic['type'] = 'timedelta'
                dic['timedelta'] = self.timeSpan
        elif len(res) == 0:
            dic['error'] = 'no time pattern could be extracted.'
        elif len(res) == 1:
            dic['type'] = 'timestamp'
            dic['timestamp'] = res[0].time.format("YYYY-MM-DD HH:mm:ss")
        else:
            # Two or more units: only the first two are reported.
            dic['type'] = 'timespan'
            dic['timespan'] = [res[0].time.format("YYYY-MM-DD HH:mm:ss"),
                               res[1].time.format("YYYY-MM-DD HH:mm:ss")]
        return json.dumps(dic)

    def __preHandling(self):
        """Clean ``self.target`` in place before pattern matching.

        Delegates to StringPreHandler to drop whitespace, drop runs of the
        particle matched by "[的]+", and translate numerals (per the original
        comments: conversion of Chinese numerals to digits).
        """
        self.target = StringPreHandler.delKeyword(self.target, "\\s+")  # strip whitespace
        self.target = StringPreHandler.delKeyword(self.target, "[的]+")  # strip the particle
        self.target = StringPreHandler.numberTranslator(self.target)  # numeral conversion

    def __timeEx(self):
        """Find time expressions in ``self.target`` and build TimeUnit objects.

        Matches that start exactly where the previous match ended are joined
        into a single expression before being turned into units.

        :return: list of TimeUnit after ``__filterTimeUnit`` has been applied
        """
        endline = -1
        temp = []
        for m in self.pattern.finditer(self.target):
            if temp and m.start() == endline:
                # Adjacent to the previous match: extend that expression.
                temp[-1] = temp[-1] + m.group()
            else:
                temp.append(m.group())
            endline = m.end()

        res = []
        # Time context: each recognised time point is handed to the next unit,
        # so that in "Saturday 3 o'clock to 5 o'clock" the second time can be
        # resolved as Saturday too (per the original comment).
        contextTp = TimePoint()
        for expression in temp:
            unit = TimeUnit(expression, self, contextTp)
            res.append(unit)
            contextTp = unit.tp
        return self.__filterTimeUnit(res)

    def __filterTimeUnit(self, tu_arr):
        """Drop units whose resolved time has a timestamp of 0.

        Per the original comment, useless matches resolve to
        1970-01-01 00:00:00.

        :param tu_arr: list of TimeUnit, or None
        :return: ``tu_arr`` itself when it is None or empty, otherwise a new
            list holding only the units with a non-zero timestamp
        """
        if not tu_arr:
            return tu_arr
        res = []
        for tu in tu_arr:
            stamp = tu.time.timestamp
            # ``timestamp`` is a property in older arrow releases and a
            # method in newer ones; accept both so the filter keeps working.
            if callable(stamp):
                stamp = stamp()
            if stamp != 0:
                res.append(tu)
        return res