-
Notifications
You must be signed in to change notification settings - Fork 27.5k
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Add changes for uroman package to handle non-Roman characters #32404
Changes from all commits
f7cea15
de3f1e5
fb03d30
beb2236
ac2d81c
7a7ef2d
b413053
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -20,12 +20,14 @@ | |
from typing import Any, Dict, List, Optional, Tuple, Union | ||
|
||
from ...tokenization_utils import PreTrainedTokenizer | ||
from ...utils import is_phonemizer_available, logging | ||
from ...utils import is_phonemizer_available, is_uroman_available, logging | ||
|
||
|
||
if is_phonemizer_available(): | ||
import phonemizer | ||
|
||
if is_uroman_available(): | ||
import uroman as ur | ||
|
||
logger = logging.get_logger(__name__) | ||
|
||
|
@@ -172,11 +174,16 @@ def prepare_for_tokenization( | |
filtered_text = self._preprocess_char(text) | ||
|
||
if has_non_roman_characters(filtered_text) and self.is_uroman: | ||
logger.warning( | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. This is my only qualm with the PR: it's currently not backwards compatible. Previously, if a user was omitting the uroman step and inputting non-roman characters, they would have only get a simple warning. Now, the code will error out and stop them from doing this (i.e. breaking backwards compatibility). I would be in favour of keeping the
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Sorry, didn't noticed the default behaviour, I have modified accordingly. There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Looks great after the changes! |
||
"Text to the tokenizer contains non-Roman characters. Ensure the `uroman` Romanizer is " | ||
"applied to the text prior to passing it to the tokenizer. See " | ||
"`https://github.com/isi-nlp/uroman` for details." | ||
) | ||
if not is_uroman_available(): | ||
logger.warning( | ||
"Text to the tokenizer contains non-Roman characters. To apply the `uroman` pre-processing " | ||
"step automatically, ensure the `uroman` Romanizer is installed with: `pip install uroman` " | ||
"Note `uroman` requires python version >= 3.10" | ||
"Otherwise, apply the Romanizer manually as per the instructions: https://github.com/isi-nlp/uroman" | ||
) | ||
else: | ||
uroman = ur.Uroman() | ||
filtered_text = uroman.romanize_string(filtered_text) | ||
|
||
if self.phonemize: | ||
if not is_phonemizer_available(): | ||
|
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -141,6 +141,7 @@ def _is_package_available(pkg_name: str, return_version: bool = False) -> Union[ | |
_pandas_available = _is_package_available("pandas") | ||
_peft_available = _is_package_available("peft") | ||
_phonemizer_available = _is_package_available("phonemizer") | ||
_uroman_available = _is_package_available("uroman") | ||
_psutil_available = _is_package_available("psutil") | ||
_py3nvml_available = _is_package_available("py3nvml") | ||
_pyctcdecode_available = _is_package_available("pyctcdecode") | ||
|
@@ -1069,6 +1070,10 @@ def is_phonemizer_available(): | |
return _phonemizer_available | ||
|
||
|
||
def is_uroman_available(): | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Awesome job here! |
||
return _uroman_available | ||
|
||
|
||
def torch_only_method(fn): | ||
def wrapper(*args, **kwargs): | ||
if not _torch_available: | ||
|
@@ -1338,6 +1343,11 @@ def is_mlx_available(): | |
{0} requires the phonemizer library but it was not found in your environment. You can install it with pip: | ||
`pip install phonemizer`. Please note that you may need to restart your runtime after installation. | ||
""" | ||
# docstyle-ignore | ||
UROMAN_IMPORT_ERROR = """ | ||
{0} requires the uroman library but it was not found in your environment. You can install it with pip: | ||
`pip install uroman`. Please note that you may need to restart your runtime after installation. | ||
""" | ||
|
||
|
||
# docstyle-ignore | ||
|
@@ -1478,6 +1488,7 @@ def is_mlx_available(): | |
("g2p_en", (is_g2p_en_available, G2P_EN_IMPORT_ERROR)), | ||
("pandas", (is_pandas_available, PANDAS_IMPORT_ERROR)), | ||
("phonemizer", (is_phonemizer_available, PHONEMIZER_IMPORT_ERROR)), | ||
("uroman", (is_uroman_available, UROMAN_IMPORT_ERROR)), | ||
("pretty_midi", (is_pretty_midi_available, PRETTY_MIDI_IMPORT_ERROR)), | ||
("levenshtein", (is_levenshtein_available, LEVENSHTEIN_IMPORT_ERROR)), | ||
("librosa", (is_librosa_available, LIBROSA_IMPORT_ERROR)), | ||
|
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
So much simpler!