-
Notifications
You must be signed in to change notification settings - Fork 0
/
utils.py
31 lines (22 loc) · 821 Bytes
/
utils.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
import re
def remove_nonalpha(text: str) -> str:
    """Return *text* with everything except letters and spaces removed.

    A character is kept when ``str.isalpha()`` is true for it (which
    includes non-ASCII letters) or when it is the space character ``' '``.
    Digits, punctuation and other whitespace (tabs, newlines) are dropped.

    :param text: the string to filter
    :return: a new string containing only the kept characters, in order
    """
    return ''.join(c for c in text if c.isalpha() or c == ' ')
def remove_nonalphanumeric(text: str) -> str:
    """Return *text* with everything except letters, digits and spaces removed.

    A character is kept when ``str.isalpha()`` or ``str.isdigit()`` is true
    for it, or when it is the space character ``' '``.  Punctuation and
    other whitespace (tabs, newlines) are dropped.

    :param text: the string to filter
    :return: a new string containing only the kept characters, in order
    """
    # Deliberately not ``str.isalnum()``: that also accepts numeric
    # characters such as fractions, which this predicate rejects.
    return ''.join(
        c for c in text if c.isalpha() or c.isdigit() or c == ' '
    )
def clean_text(text: str) -> str:
    """Strip ``<unk>`` tokens and bracketed asides from *text*.

    The steps, in order:

    1. Remove every literal ``<unk>`` token.
    2. Remove each ``(...)`` span that contains no nested parentheses.
    3. Remove each ``[...]`` span that contains no nested square brackets.
    4. Collapse every run of whitespace to a single space and trim both ends.

    Each substitution runs once, so for nested groups only the innermost
    span is removed.

    :param text: the string to clean
    :return: the cleaned string
    """
    text = text.replace("<unk>", "")
    # Remove parentheses and their contents.
    text = re.sub(r'\([^()]*\)', '', text)
    # Remove square brackets and their contents.  The character class must
    # exclude brackets (not parentheses); otherwise the match runs from the
    # first '[' to the last ']' and swallows the text in between.
    text = re.sub(r'\[[^\[\]]*\]', '', text)
    # split() with no argument splits on any whitespace run and drops empties.
    return " ".join(text.split())
def fix_overlap(segs):
    """
    Fixes overlapping segments for SER calculation.

    Each pair of neighbouring segments is compared.  When the later segment
    starts before the earlier one ends, the earlier segment is replaced by
    the 2-tuple ``(earlier_start, later_start)`` so that the two no longer
    overlap.  Only indices 0 and 1 of a segment are read; any further fields
    of a trimmed segment are not carried over into the replacement tuple.

    Only adjacent entries are compared, so this presumably expects ``segs``
    to be ordered by start value -- confirm against the caller.

    :param segs: mutable sequence of segments, each indexable as
        ``seg[0]`` (start) and ``seg[1]`` (end); modified in place
    :return: None
    """
    # zip pairs segs[i] with segs[i + 1].  Writing segs[i] inside the loop is
    # safe: that slot has already been read and is never visited again.
    for i, (prev, curr) in enumerate(zip(segs, segs[1:])):
        if curr[0] < prev[1]:
            segs[i] = (prev[0], curr[0])