Commit

vocab tokens_of_type(), try keyerror removed to force _token_types_indexes to be well-built by tokenizers, docstring updates
Natooz committed Sep 13, 2022
1 parent 7d873ca commit 8f9a89f
Showing 2 changed files with 4 additions and 7 deletions.
6 changes: 3 additions & 3 deletions miditok/bpe.py
@@ -35,9 +35,9 @@ def bpe(self, tokens_path: Union[Path, PurePath, str], vocab_size: int, out_dir:
         Note that this implementation is in pure Python and will be slow if you use a large amount of
         tokens files. You might use the files_lim argument.
-        :param tokens_path: path to token files to load
+        :param tokens_path: path to token files to learn the BPE combinations from
         :param vocab_size: the new vocabulary size
-        :param out_dir: directory to save the tokenizer's parameters and vocabulary
+        :param out_dir: directory to save the tokenizer's parameters and vocabulary after BPE learning is finished
         :param files_lim: limit of token files to use (default: None)
         :param save_converted_samples: will save in out_dir the samples that have been used
             to create the BPE vocab. Files will keep the same name and relative path (default: True)
@@ -139,7 +139,7 @@ def apply_bpe(self, tokens: List[int]) -> List[int]:
         return tokens

     def apply_bpe_to_dataset(self, dataset_path: Union[Path, PurePath, str], out_path: Union[Path, PurePath, str]):
-        r"""Converts a sequence of tokens into tokens with BPE.
+        r"""Apply BPE to an already tokenized dataset (with no BPE).
         :param dataset_path: path to token files to load
         :param out_path: output directory to save
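For context, a minimal usage sketch of the workflow these docstrings describe. The method names and parameters come from this diff; the import path and the tokenizer construction are assumptions about the surrounding miditok API, not taken from this commit:

    from miditok import REMI, bpe  # assumed import path; bpe() is assumed to wrap a tokenizer class with BPE methods

    tokenizer = bpe(REMI)  # hypothetical construction of a BPE-capable tokenizer

    # Learn BPE combinations from already-tokenized files (parameters per the docstring above)
    tokenizer.bpe(
        tokens_path='data/tokens',    # token files to learn the BPE combinations from
        vocab_size=500,               # the new vocabulary size
        out_dir='data/tokenizer',     # parameters and vocabulary saved here after BPE learning is finished
        files_lim=300,                # limit of token files to use (optional)
        save_converted_samples=True,  # also save the samples used to create the BPE vocab
    )

    # Apply the learned BPE to a whole dataset that was tokenized without BPE
    tokenizer.apply_bpe_to_dataset('data/tokens', 'data/tokens_bpe')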
5 changes: 1 addition & 4 deletions miditok/vocabulary.py
@@ -125,10 +125,7 @@ def tokens_of_type(self, token_type: str) -> List[int]:
         :param token_type: token type to get the associated tokens
         :return: list of tokens
         """
-        try:
-            return self._token_types_indexes[token_type]
-        except KeyError:
-            return []
+        return self._token_types_indexes[token_type]

     def __add_mask(self):
         r"""Adds a MASK token to the vocabulary. This may be used to
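To illustrate the change above: tokens_of_type() used to return an empty list for a token type missing from _token_types_indexes; it now raises KeyError, so a tokenizer that builds its index incompletely fails loudly instead of silently. A small sketch, where the vocabulary object and the 'Pitch' type name are illustrative assumptions:

    vocab = tokenizer.vocab  # assumed: a Vocabulary whose _token_types_indexes the tokenizer has built

    pitch_tokens = vocab.tokens_of_type('Pitch')  # list of token ids for that type, as before

    # Before this commit, an unknown type silently returned []; now it raises,
    # surfacing tokenizers that did not build _token_types_indexes completely
    try:
        vocab.tokens_of_type('NotARealType')
    except KeyError:
        print('token type missing from _token_types_indexes')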
