Skip to content

Commit

Permalink
Add changes for uroman package to handle non-Roman characters (huggin…
Browse files Browse the repository at this point in the history
…gface#32404)

* Add changes for uroman package to handle non-Roman characters

* Update docs for uroman changes

* Modifying error message to warning, for backward compatibility

* Update instruction for user to install uroman

* Update docs for uroman python version dependency and backward compatibility

* Update warning message for python version compatibility with uroman

* Refine docs
  • Loading branch information
nandwalritik authored and BernardZach committed Dec 6, 2024
1 parent 8012873 commit da133bc
Show file tree
Hide file tree
Showing 4 changed files with 48 additions and 8 deletions.
25 changes: 23 additions & 2 deletions docs/source/en/model_doc/vits.md
Original file line number Diff line number Diff line change
Expand Up @@ -93,12 +93,33 @@ from transformers import VitsTokenizer
tokenizer = VitsTokenizer.from_pretrained("facebook/mms-tts-eng")
print(tokenizer.is_uroman)
```
If the is_uroman attribute is `True`, the tokenizer will automatically apply the `uroman` package to your text inputs, but you need to install uroman if not already installed using:
```
pip install --upgrade uroman
```
Note: Python version required to use `uroman` as python package should be >= `3.10`.
You can use the tokenizer as usual without any additional preprocessing steps:
```python
import torch
from transformers import VitsTokenizer, VitsModel, set_seed
import os
import subprocess

If required, you should apply the uroman package to your text inputs **prior** to passing them to the `VitsTokenizer`,
since currently the tokenizer does not support performing the pre-processing itself.
tokenizer = VitsTokenizer.from_pretrained("facebook/mms-tts-kor")
model = VitsModel.from_pretrained("facebook/mms-tts-kor")
text = "이봐 무슨 일이야"
inputs = tokenizer(text=text, return_tensors="pt")

set_seed(555) # make deterministic
with torch.no_grad():
outputs = model(inputs["input_ids"])

waveform = outputs.waveform[0]
```
If you don't want to upgrade to python >= `3.10`, then you can use the `uroman` perl package to pre-process the text inputs to the Roman alphabet.
To do this, first clone the uroman repository to your local machine and set the bash variable `UROMAN` to the local path:


```bash
git clone https://github.com/isi-nlp/uroman.git
cd uroman
Expand Down
19 changes: 13 additions & 6 deletions src/transformers/models/vits/tokenization_vits.py
Original file line number Diff line number Diff line change
Expand Up @@ -20,12 +20,14 @@
from typing import Any, Dict, List, Optional, Tuple, Union

from ...tokenization_utils import PreTrainedTokenizer
from ...utils import is_phonemizer_available, logging
from ...utils import is_phonemizer_available, is_uroman_available, logging


if is_phonemizer_available():
import phonemizer

if is_uroman_available():
import uroman as ur

logger = logging.get_logger(__name__)

Expand Down Expand Up @@ -172,11 +174,16 @@ def prepare_for_tokenization(
filtered_text = self._preprocess_char(text)

if has_non_roman_characters(filtered_text) and self.is_uroman:
logger.warning(
"Text to the tokenizer contains non-Roman characters. Ensure the `uroman` Romanizer is "
"applied to the text prior to passing it to the tokenizer. See "
"`https://github.com/isi-nlp/uroman` for details."
)
if not is_uroman_available():
logger.warning(
"Text to the tokenizer contains non-Roman characters. To apply the `uroman` pre-processing "
"step automatically, ensure the `uroman` Romanizer is installed with: `pip install uroman` "
"Note `uroman` requires python version >= 3.10"
"Otherwise, apply the Romanizer manually as per the instructions: https://github.com/isi-nlp/uroman"
)
else:
uroman = ur.Uroman()
filtered_text = uroman.romanize_string(filtered_text)

if self.phonemize:
if not is_phonemizer_available():
Expand Down
1 change: 1 addition & 0 deletions src/transformers/utils/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -218,6 +218,7 @@
is_torchdynamo_compiling,
is_torchvision_available,
is_training_run_on_sagemaker,
is_uroman_available,
is_vision_available,
requires_backends,
torch_only_method,
Expand Down
11 changes: 11 additions & 0 deletions src/transformers/utils/import_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -142,6 +142,7 @@ def _is_package_available(pkg_name: str, return_version: bool = False) -> Union[
_pandas_available = _is_package_available("pandas")
_peft_available = _is_package_available("peft")
_phonemizer_available = _is_package_available("phonemizer")
_uroman_available = _is_package_available("uroman")
_psutil_available = _is_package_available("psutil")
_py3nvml_available = _is_package_available("py3nvml")
_pyctcdecode_available = _is_package_available("pyctcdecode")
Expand Down Expand Up @@ -1107,6 +1108,10 @@ def is_phonemizer_available():
return _phonemizer_available


def is_uroman_available():
return _uroman_available


def torch_only_method(fn):
def wrapper(*args, **kwargs):
if not _torch_available:
Expand Down Expand Up @@ -1383,6 +1388,11 @@ def is_liger_kernel_available():
{0} requires the phonemizer library but it was not found in your environment. You can install it with pip:
`pip install phonemizer`. Please note that you may need to restart your runtime after installation.
"""
# docstyle-ignore
UROMAN_IMPORT_ERROR = """
{0} requires the uroman library but it was not found in your environment. You can install it with pip:
`pip install uroman`. Please note that you may need to restart your runtime after installation.
"""


# docstyle-ignore
Expand Down Expand Up @@ -1523,6 +1533,7 @@ def is_liger_kernel_available():
("g2p_en", (is_g2p_en_available, G2P_EN_IMPORT_ERROR)),
("pandas", (is_pandas_available, PANDAS_IMPORT_ERROR)),
("phonemizer", (is_phonemizer_available, PHONEMIZER_IMPORT_ERROR)),
("uroman", (is_uroman_available, UROMAN_IMPORT_ERROR)),
("pretty_midi", (is_pretty_midi_available, PRETTY_MIDI_IMPORT_ERROR)),
("levenshtein", (is_levenshtein_available, LEVENSHTEIN_IMPORT_ERROR)),
("librosa", (is_librosa_available, LIBROSA_IMPORT_ERROR)),
Expand Down

0 comments on commit da133bc

Please sign in to comment.