From 3e6ce11e3b0d986c697731e355d09592adf33318 Mon Sep 17 00:00:00 2001 From: Martin Davis Date: Wed, 13 Sep 2023 16:28:50 -0400 Subject: [PATCH 1/6] Updated characters, underscore and comma to be torchscriptable. --- ludwig/utils/tokenizers.py | 75 +++++++++++++++++++++++++++----------- 1 file changed, 54 insertions(+), 21 deletions(-) diff --git a/ludwig/utils/tokenizers.py b/ludwig/utils/tokenizers.py index 74e33b437bb..c36c38ad8c0 100644 --- a/ludwig/utils/tokenizers.py +++ b/ludwig/utils/tokenizers.py @@ -29,11 +29,7 @@ logger = logging.getLogger(__name__) torchtext_version = torch.torch_version.TorchVersion(torchtext.__version__) -SPACE_PUNCTUATION_REGEX = re.compile(r"\w+|[^\w\s]") -COMMA_REGEX = re.compile(r"\s*,\s*") -UNDERSCORE_REGEX = re.compile(r"\s*_\s*") - -TORCHSCRIPT_COMPATIBLE_TOKENIZERS = {"space", "space_punct"} +TORCHSCRIPT_COMPATIBLE_TOKENIZERS = {"space", "space_punct", "comma", "underscore", "characters"} TORCHTEXT_0_12_0_TOKENIZERS = {"sentencepiece", "clip", "gpt2bpe"} TORCHTEXT_0_13_0_TOKENIZERS = {"bert"} @@ -50,12 +46,59 @@ def __call__(self, text: str): pass -class CharactersToListTokenizer(BaseTokenizer): - def __call__(self, text): - return [char for char in text] +class StringSplitTokenizer(torch.nn.Module): + def __init__(self, split_string, **kwargs): + super().__init__() + self.split_string = split_string + + def forward(self, v: Union[str, List[str], torch.Tensor]) -> Any: + if isinstance(v, torch.Tensor): + raise ValueError(f"Unsupported input: {v}") + + inputs: List[str] = [] + # Ludwig calls map on List[str] objects, so we need to handle individual strings as well. + if isinstance(v, str): + inputs.append(v) + else: + inputs.extend(v) + + tokens: List[List[str]] = [] + for sequence in inputs: + split_sequence = sequence.strip().split(self.split_string) + token_sequence: List[str] = [] + for token in self.get_tokens(split_sequence): + if len(token) > 0: + token_sequence.append(token) + tokens.append(token_sequence) + + return tokens[0] if isinstance(v, str) else tokens + + def get_tokens(self, tokens: List[str]) -> List[str]: + return tokens + + +class SpaceStringToListTokenizer(StringSplitTokenizer): + """Implements torchscript-compatible whitespace tokenization.""" + + def __init__(self, **kwargs): + super().__init__(split_string=" ", **kwargs) -class SpaceStringToListTokenizer(torch.nn.Module): +class UnderscoreStringToListTokenizer(StringSplitTokenizer): + """Implements torchscript-compatible whitespace tokenization.""" + + def __init__(self, **kwargs): + super().__init__(split_string="_", **kwargs) + + +class CommaStringToListTokenizer(StringSplitTokenizer): + """Implements torchscript-compatible whitespace tokenization.""" + + def __init__(self, **kwargs): + super().__init__(split_string=",", **kwargs) + + +class CharactersToListTokenizer(torch.nn.Module): """Implements torchscript-compatible whitespace tokenization.""" def __init__(self, **kwargs): @@ -74,7 +117,7 @@ def forward(self, v: Union[str, List[str], torch.Tensor]) -> Any: tokens: List[List[str]] = [] for sequence in inputs: - split_sequence = sequence.strip().split(" ") + split_sequence = [char for char in sequence] token_sequence: List[str] = [] for token in self.get_tokens(split_sequence): if len(token) > 0: @@ -142,16 +185,6 @@ def forward(self, v: Union[str, List[str], torch.Tensor]) -> Any: return tokens[0] if isinstance(v, str) else tokens -class UnderscoreStringToListTokenizer(BaseTokenizer): - def __call__(self, text): - return UNDERSCORE_REGEX.split(text.strip()) - - -class CommaStringToListTokenizer(BaseTokenizer): - def __call__(self, text): - return COMMA_REGEX.split(text.strip()) - - class UntokenizedStringToListTokenizer(BaseTokenizer): def __call__(self, text): return [text] @@ -855,10 +888,10 @@ def _set_pad_token(self) -> None: "space": SpaceStringToListTokenizer, "space_punct": SpacePunctuationStringToListTokenizer, "ngram": NgramTokenizer, - # Tokenizers not compatible with torchscript "characters": CharactersToListTokenizer, "underscore": UnderscoreStringToListTokenizer, "comma": CommaStringToListTokenizer, + # Tokenizers not compatible with torchscript "untokenized": UntokenizedStringToListTokenizer, "stripped": StrippedStringToListTokenizer, "english_tokenize": EnglishTokenizer, From 95f0501d1e81ccb6174e3a0ffe87927ddef16f7c Mon Sep 17 00:00:00 2001 From: Martin Davis Date: Wed, 13 Sep 2023 16:36:07 -0400 Subject: [PATCH 2/6] Updated Class descriptions. --- ludwig/utils/tokenizers.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/ludwig/utils/tokenizers.py b/ludwig/utils/tokenizers.py index c36c38ad8c0..66680cb776b 100644 --- a/ludwig/utils/tokenizers.py +++ b/ludwig/utils/tokenizers.py @@ -85,21 +85,21 @@ def __init__(self, **kwargs): class UnderscoreStringToListTokenizer(StringSplitTokenizer): - """Implements torchscript-compatible whitespace tokenization.""" + """Implements torchscript-compatible underscore tokenization.""" def __init__(self, **kwargs): super().__init__(split_string="_", **kwargs) class CommaStringToListTokenizer(StringSplitTokenizer): - """Implements torchscript-compatible whitespace tokenization.""" + """Implements torchscript-compatible comma tokenization.""" def __init__(self, **kwargs): super().__init__(split_string=",", **kwargs) class CharactersToListTokenizer(torch.nn.Module): - """Implements torchscript-compatible whitespace tokenization.""" + """Implements torchscript-compatible characters tokenization.""" def __init__(self, **kwargs): super().__init__() From 42ba7e3b83bfc081220c8ba40430f9e77759d68e Mon Sep 17 00:00:00 2001 From: Martin Davis Date: Thu, 14 Sep 2023 10:19:15 -0400 Subject: [PATCH 3/6] Updated apt-get before installing dependencies. --- .github/workflows/pytest.yml | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/.github/workflows/pytest.yml b/.github/workflows/pytest.yml index 315677b7deb..02b0e922f2c 100644 --- a/.github/workflows/pytest.yml +++ b/.github/workflows/pytest.yml @@ -78,6 +78,7 @@ jobs: - name: Setup Linux if: runner.os == 'linux' run: | + sudo apt-get update sudo apt-get install -y cmake libsndfile1 wget - name: Setup macOS @@ -230,6 +231,7 @@ jobs: - name: Setup Linux if: runner.os == 'linux' run: | + sudo apt-get update sudo apt-get install -y cmake libsndfile1 - name: Setup macOS @@ -287,6 +289,7 @@ jobs: - name: Setup Linux if: runner.os == 'linux' run: | + sudo apt-get update sudo apt-get install -y cmake libsndfile1 - name: Setup macOS @@ -344,6 +347,7 @@ jobs: - name: Setup Linux if: runner.os == 'linux' run: | + sudo apt-get update sudo apt-get install -y cmake libsndfile1 - name: Setup macOS @@ -401,6 +405,7 @@ jobs: - name: Setup Linux if: runner.os == 'linux' run: | + sudo apt-get update sudo apt-get install -y cmake libsndfile1 - name: Setup macOS @@ -458,6 +463,7 @@ jobs: - name: Setup Linux if: runner.os == 'linux' run: | + sudo apt-get update sudo apt-get install -y cmake libsndfile1 - name: Setup macOS @@ -499,6 +505,7 @@ jobs: - name: Setup Linux if: runner.os == 'linux' run: | + sudo apt-get update sudo apt-get install -y cmake libsndfile1 - name: Setup macOS @@ -540,6 +547,7 @@ jobs: - name: Setup Linux if: runner.os == 'linux' run: | + sudo apt-get update sudo apt-get install -y cmake libsndfile1 - name: Setup macOS @@ -579,6 +587,7 @@ jobs: - name: Setup Linux if: runner.os == 'linux' run: | + sudo apt-get update sudo apt-get install -y cmake libsndfile1 - name: Setup macOS From 23459ec8def8335a9835eb106b01c12ae60fcd98 Mon Sep 17 00:00:00 2001 From: Martin Davis Date: Thu, 14 Sep 2023 11:14:13 -0400 Subject: [PATCH 4/6] Removed unused import. --- ludwig/utils/tokenizers.py | 1 - 1 file changed, 1 deletion(-) diff --git a/ludwig/utils/tokenizers.py b/ludwig/utils/tokenizers.py index 66680cb776b..5c585d4ecaf 100644 --- a/ludwig/utils/tokenizers.py +++ b/ludwig/utils/tokenizers.py @@ -14,7 +14,6 @@ """ import logging -import re from abc import abstractmethod from typing import Any, Dict, List, Optional, Union From 5635f8dd8815d79002f111f4289d9ba321197063 Mon Sep 17 00:00:00 2001 From: Martin Davis Date: Thu, 14 Sep 2023 11:14:24 -0400 Subject: [PATCH 5/6] Added test for SplitString Tokenizer. --- tests/ludwig/utils/test_tokenizers.py | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) diff --git a/tests/ludwig/utils/test_tokenizers.py b/tests/ludwig/utils/test_tokenizers.py index 1c0aa08fb69..82f6d86bdff 100644 --- a/tests/ludwig/utils/test_tokenizers.py +++ b/tests/ludwig/utils/test_tokenizers.py @@ -4,7 +4,7 @@ import torch import torchtext -from ludwig.utils.tokenizers import EnglishLemmatizeFilterTokenizer, NgramTokenizer +from ludwig.utils.tokenizers import EnglishLemmatizeFilterTokenizer, NgramTokenizer, StringSplitTokenizer TORCHTEXT_0_14_0_HF_NAMES = [ "bert-base-uncased", @@ -73,6 +73,13 @@ def test_ngram_tokenizer(): assert tokens == tokens_expected +def test_string_split_tokenizer(): + inputs = "Multiple,Elements,Are here!" + tokenizer = StringSplitTokenizer(",") + tokens = tokenizer(inputs) + assert tokens == ["Multiple", "Elements", "Are here!"] + + def test_english_lemmatize_filter_tokenizer(): inputs = "Hello, I'm a single sentence!" tokenizer = EnglishLemmatizeFilterTokenizer() From 6fa1e6309412adf360014f87d4370325f9679292 Mon Sep 17 00:00:00 2001 From: Martin Davis Date: Thu, 14 Sep 2023 11:15:00 -0400 Subject: [PATCH 6/6] Revert "Updated apt-get before installing dependencies." This reverts commit 42ba7e3b83bfc081220c8ba40430f9e77759d68e. --- .github/workflows/pytest.yml | 9 --------- 1 file changed, 9 deletions(-) diff --git a/.github/workflows/pytest.yml b/.github/workflows/pytest.yml index 02b0e922f2c..315677b7deb 100644 --- a/.github/workflows/pytest.yml +++ b/.github/workflows/pytest.yml @@ -78,7 +78,6 @@ jobs: - name: Setup Linux if: runner.os == 'linux' run: | - sudo apt-get update sudo apt-get install -y cmake libsndfile1 wget - name: Setup macOS @@ -231,7 +230,6 @@ jobs: - name: Setup Linux if: runner.os == 'linux' run: | - sudo apt-get update sudo apt-get install -y cmake libsndfile1 - name: Setup macOS @@ -289,7 +287,6 @@ jobs: - name: Setup Linux if: runner.os == 'linux' run: | - sudo apt-get update sudo apt-get install -y cmake libsndfile1 - name: Setup macOS @@ -347,7 +344,6 @@ jobs: - name: Setup Linux if: runner.os == 'linux' run: | - sudo apt-get update sudo apt-get install -y cmake libsndfile1 - name: Setup macOS @@ -405,7 +401,6 @@ jobs: - name: Setup Linux if: runner.os == 'linux' run: | - sudo apt-get update sudo apt-get install -y cmake libsndfile1 - name: Setup macOS @@ -463,7 +458,6 @@ jobs: - name: Setup Linux if: runner.os == 'linux' run: | - sudo apt-get update sudo apt-get install -y cmake libsndfile1 - name: Setup macOS @@ -505,7 +499,6 @@ jobs: - name: Setup Linux if: runner.os == 'linux' run: | - sudo apt-get update sudo apt-get install -y cmake libsndfile1 - name: Setup macOS @@ -547,7 +540,6 @@ jobs: - name: Setup Linux if: runner.os == 'linux' run: | - sudo apt-get update sudo apt-get install -y cmake libsndfile1 - name: Setup macOS @@ -587,7 +579,6 @@ jobs: - name: Setup Linux if: runner.os == 'linux' run: | - sudo apt-get update sudo apt-get install -y cmake libsndfile1 - name: Setup macOS