# Copyright (c) 2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import re
import unicodedata
from builtins import str as unicode
from typing import List, Tuple
__all__ = [
"chinese_text_preprocessing",
"english_text_preprocessing",
"any_locale_text_preprocessing",
"spanish_text_preprocessing",
"any_locale_word_tokenize",
"english_word_tokenize",
"LATIN_CHARS_ALL",
"normalize_unicode_text",
]
# Derived from LJSpeech
_synoglyphs = {
"'": ['’'],
'"': ['”', '“'],
}
SYNOGLYPH2ASCII = {g: asc for asc, glyphs in _synoglyphs.items() for g in glyphs}
# Example of parsing by groups via _WORDS_RE_EN.
# Regular expression pattern groups:
#     1st group -- valid English words,
#     2nd group -- any substring starting from | to | (must not be nested), useful when you want to leave a sequence unchanged,
#     3rd group -- punctuation marks or whitespaces.
# Example text: "config file must contain |EY1 EY1|, B, C, D, E, F, and G."
# Here the plain words ("config", "file", ..., "G") fall into the 1st group, the phoneme span "|EY1 EY1|" falls into
# the 2nd group, and the commas and whitespaces fall into the 3rd group (see the illustrative `findall` example after
# the compiled patterns below).
# define char set based on https://en.wikipedia.org/wiki/List_of_Unicode_characters
LATIN_ALPHABET_BASIC = "A-Za-z"
ACCENTED_CHARS = "À-ÖØ-öø-ÿ"
LATIN_CHARS_ALL = f"{LATIN_ALPHABET_BASIC}{ACCENTED_CHARS}"
_WORDS_RE_EN = re.compile(
fr"([{LATIN_ALPHABET_BASIC}]+(?:[{LATIN_ALPHABET_BASIC}\-']*[{LATIN_ALPHABET_BASIC}]+)*)|(\|[^|]*\|)|([^{LATIN_ALPHABET_BASIC}|]+)"
)
_WORDS_RE_ANY_LOCALE = re.compile(
fr"([{LATIN_CHARS_ALL}]+(?:[{LATIN_CHARS_ALL}\-']*[{LATIN_CHARS_ALL}]+)*)|(\|[^|]*\|)|([^{LATIN_CHARS_ALL}|]+)"
)
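# Illustrative usage sketch of the patterns above (the sample sentence is only for demonstration; output is the
# expected behavior of the current `_WORDS_RE_EN` pattern). Each `findall` match is a 3-tuple with exactly one
# non-empty slot: a valid word (1st group), a |...| span to leave unchanged (2nd group), or punctuation/whitespace
# (3rd group).
#
#   >>> _WORDS_RE_EN.findall("Hello |EY1 EY1| world!")
#   [('Hello', '', ''), ('', '', ' '), ('', '|EY1 EY1|', ''), ('', '', ' '), ('world', '', ''), ('', '', '!')]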
def english_text_preprocessing(text, lower=True):
    """
    Decompose the text with NFD, strip combining marks (e.g. diacritics), map curly quotes to their ASCII
    equivalents, and optionally lowercase the result.
    """
    text = unicode(text)
    text = ''.join(char for char in unicodedata.normalize('NFD', text) if unicodedata.category(char) != 'Mn')
    text = ''.join(char if char not in SYNOGLYPH2ASCII else SYNOGLYPH2ASCII[char] for char in text)

    if lower:
        text = text.lower()

    return text
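# Illustrative example (expected output under the current implementation; the sample string is only for
# demonstration): NFD decomposition strips the accent from 'é', the curly quotes map to ASCII, and the text is
# lowercased.
#
#   >>> english_text_preprocessing('Café “quote”')
#   'cafe "quote"'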
def any_locale_text_preprocessing(text: str) -> str:
"""
Normalize unicode text with "NFC", and convert right single quotation mark (U+2019, decimal 8217) as an apostrophe.
Args:
text (str): the original input sentence.
Returns: normalized text (str).
"""
res = []
for c in normalize_unicode_text(text):
if c in ['’']: # right single quotation mark (U+2019, decimal 8217) as an apostrophe
res.append("'")
else:
res.append(c)
return ''.join(res)
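# Illustrative example (expected output under the current implementation; the sample string is only for
# demonstration): the right single quotation mark becomes a plain apostrophe, while accented characters are kept
# as-is (only NFC-composed).
#
#   >>> any_locale_text_preprocessing('Señor, don’t')
#   "Señor, don't"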
def normalize_unicode_text(text: str) -> str:
"""
TODO @xueyang: Apply NFC form may be too aggressive since it would ignore some accented characters that do not exist
in predefined German alphabet (nemo.collections.common.tokenizers.text_to_speech.ipa_lexicon.IPA_CHARACTER_SETS),
such as 'é'. This is not expected. A better solution is to add an extra normalization with NFD to discard the
diacritics and consider 'é' and 'e' produce similar pronunciations.
Note that the tokenizer needs to run `unicodedata.normalize("NFC", x)` before calling `encode` function,
especially for the characters that have diacritics, such as 'ö' in the German alphabet. 'ö' can be encoded as
b'\xc3\xb6' (one char) as well as b'o\xcc\x88' (two chars). Without the normalization of composing two chars
together and without a complete predefined set of diacritics, when the tokenizer reads the input sentence
char-by-char, it would skip the combining diaeresis b'\xcc\x88', resulting in indistinguishable pronunciations
for 'ö' and 'o'.
Args:
text (str): the original input sentence.
Returns:
NFC normalized sentence (str).
"""
# normalize word with NFC form
if not unicodedata.is_normalized("NFC", text):
text = unicodedata.normalize("NFC", text)
return text
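# Illustrative example (expected output under the current implementation): the decomposed form 'o' + U+0308
# (combining diaeresis, two code points) is composed into the single code point 'ö' (U+00F6); already-composed input
# passes through unchanged.
#
#   >>> text = 'o\u0308'
#   >>> normalized = normalize_unicode_text(text)
#   >>> len(text), len(normalized), normalized
#   (2, 1, 'ö')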
def _word_tokenize(words: List[Tuple[str, str, str]], is_lower: bool = False) -> List[Tuple[List[str], bool]]:
"""
Process a list of words and attach indicators showing if each word is unchangeable or not. Each word representation
can be one of valid word, any substring starting from | to | (unchangeable word), or punctuation marks including
whitespaces. This function will split unchanged strings by whitespaces and return them as `List[str]`. For example,
.. code-block:: python
[
('Hello', '', ''), # valid word
('', '', ' '), # punctuation mark
('World', '', ''), # valid word
('', '', ' '), # punctuation mark
('', '|NVIDIA unchanged|', ''), # unchangeable word
('', '', '!') # punctuation mark
]
will be converted into,
.. code-block:: python
[
(["Hello"], False),
([" "], False),
(["World"], False),
([" "], False),
(["NVIDIA", "unchanged"], True),
(["!"], False)
]
Args:
words (List[str]): a list of tuples like `(maybe_word, maybe_without_changes, maybe_punct)` where each element
corresponds to a non-overlapping match of either `_WORDS_RE_EN` or `_WORDS_RE_ANY_LOCALE`.
is_lower (bool): a flag to trigger lowercase all words. By default, it is False.
Returns: List[Tuple[List[str], bool]], a list of tuples like `(a list of words, is_unchanged)`.
"""
result = []
for word in words:
maybe_word, maybe_without_changes, maybe_punct = word
without_changes = False
if maybe_word != '':
if is_lower:
token = [maybe_word.lower()]
else:
token = [maybe_word]
elif maybe_punct != '':
token = [maybe_punct]
elif maybe_without_changes != '':
without_changes = True
token = maybe_without_changes[1:-1].split(" ")
else:
raise ValueError(
f"This is not expected. Found empty string: <{word}>. "
f"Please validate your regular expression pattern '_WORDS_RE_EN' or '_WORDS_RE_ANY_LOCALE'."
)
result.append((token, without_changes))
return result
def english_word_tokenize(text: str) -> List[Tuple[List[str], bool]]:
    """Tokenize English text into words, lowercasing every changeable word."""
    words = _WORDS_RE_EN.findall(text)
    return _word_tokenize(words, is_lower=True)
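# Illustrative example (expected output under the current implementation; the sample sentence is only for
# demonstration): words are lowercased, the |...| span is split on spaces and flagged as unchangeable, and
# punctuation/whitespace pass through as-is.
#
#   >>> english_word_tokenize("Hello |EY1 EY1| World!")
#   [(['hello'], False), ([' '], False), (['EY1', 'EY1'], True), ([' '], False), (['world'], False), (['!'], False)]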
def any_locale_word_tokenize(text: str) -> List[Tuple[List[str], bool]]:
    """Tokenize text in any Latin-script locale into words, preserving the original casing."""
    words = _WORDS_RE_ANY_LOCALE.findall(text)
    return _word_tokenize(words)
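# Illustrative example (expected output under the current implementation; the sample sentence is only for
# demonstration): accented Latin characters count as word characters, and casing is preserved because `is_lower`
# defaults to False.
#
#   >>> any_locale_word_tokenize("Schön día!")
#   [(['Schön'], False), ([' '], False), (['día'], False), (['!'], False)]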
# TODO @xueyang: deprecate language-specific text preprocessing and use any_locale_text_preprocessing.
def spanish_text_preprocessing(text: str) -> str:
    return text.lower()


def chinese_text_preprocessing(text: str) -> str:
    return text.lower()