add typing and doc to PreTrainedSentencepieceTok.
PhilipMay committed Jun 1, 2021
1 parent b772f0a commit 4000bed
Showing 1 changed file with 77 additions and 8 deletions.
85 changes: 77 additions & 8 deletions src/transformers/tokenization_utils.py
@@ -775,36 +775,82 @@ def _decode(


 class PreTrainedSentencepieceTokenizer(PreTrainedTokenizer):
-    # TODO: add docstring with vocab_file & sp_model_kwargs
-    def __init__(self, vocab_file, sp_model_kwargs: Optional[Dict[str, Any]] = None, *args, **kwargs) -> None:
+    """
+    Base class for all slow sentencepiece tokenizers.
+
+    Inherits from :class:`~transformers.tokenization_utils.PreTrainedTokenizer`.
+
+    Args:
+        vocab_file (:obj:`str`):
+            `SentencePiece <https://github.com/google/sentencepiece>`__ file (generally has a `.spm` extension) that
+            contains the vocabulary necessary to instantiate a tokenizer.
+        sp_model_kwargs (:obj:`dict`, `optional`):
+            Will be passed to the ``SentencePieceProcessor.__init__()`` method. The `Python wrapper for SentencePiece
+            <https://github.com/google/sentencepiece/tree/master/python>`__ can be used, among other things, to set:
+
+            - ``enable_sampling``: Enable subword regularization.
+            - ``nbest_size``: Sampling parameters for unigram. Invalid for BPE-Dropout.
+
+              - ``nbest_size = {0,1}``: No sampling is performed.
+              - ``nbest_size > 1``: samples from the nbest_size results.
+              - ``nbest_size < 0``: assumes that nbest_size is infinite and samples from all hypotheses (lattice)
+                using the forward-filtering-and-backward-sampling algorithm.
+
+            - ``alpha``: Smoothing parameter for unigram sampling, and dropout probability of merge operations for
+              BPE-dropout.
+    """
+
+    def __init__(self, vocab_file: str, sp_model_kwargs: Optional[Dict[str, Any]] = None, *args, **kwargs) -> None:
         super().__init__(sp_model_kwargs=sp_model_kwargs, *args, **kwargs)

         self.vocab_file = vocab_file
         self.sp_model_kwargs = {} if sp_model_kwargs is None else sp_model_kwargs
         self.sp_model = self.load_spm(self.vocab_file, self.sp_model_kwargs)

     def convert_tokens_to_string(self, tokens: List[str]) -> str:
-        """Converts a sequence of tokens (strings for sub-words) in a single string."""
+        """
+        Convert a sequence of tokens (strings for sub-words) to a single string.
+
+        This is the reverse of ``_tokenize``.
+        """
         return self.sp_model.decode(tokens)

     def _tokenize(self, text: str) -> List[str]:
+        """
+        Encode text input to segmented tokens.
+
+        This can be reverted with the ``convert_tokens_to_string`` function.
+
+        Args:
+            text (str): The text to tokenize.
+
+        Returns:
+            List[str]: The segmented tokens.
+        """
         return self.sp_model.encode(text, out_type=str)

     @property
     def vocab_size(self) -> int:
         """Size of the vocab."""
         return self.sp_model.get_piece_size()

-    def get_vocab(self):
+    def get_vocab(self) -> Dict[str, int]:
+        """
+        Return the vocab.
+
+        Returns:
+            Dict[str, int]: The vocab.
+        """
         vocab = {self.convert_ids_to_tokens(i): i for i in range(self.vocab_size)}
         vocab.update(self.added_tokens_encoder)
         return vocab

-    def _convert_token_to_id(self, token):
-        """Converts a token (str) in an id using the vocab."""
+    def _convert_token_to_id(self, token: str) -> int:
+        """Convert a token (str) to an id (int) using the vocab."""
         return self.sp_model.PieceToId(token)

-    def _convert_id_to_token(self, index):
-        """Converts an index (integer) in a token (str) using the vocab."""
+    def _convert_id_to_token(self, index: int) -> str:
+        """Convert an index (int) to a token (str) using the vocab."""
         return self.sp_model.IdToPiece(index)

     def __getstate__(self):

@@ -823,6 +869,29 @@ def __setstate__(self, d):

     @staticmethod
     def load_spm(path: str, sp_model_kwargs: Dict[str, Any]) -> SentencePieceProcessor:
+        """
+        Load a ``SentencePieceProcessor``.
+
+        Args:
+            path (:obj:`str`):
+                `SentencePiece <https://github.com/google/sentencepiece>`__ file (generally has a `.spm` extension)
+                that contains the vocabulary necessary to instantiate a tokenizer.
+            sp_model_kwargs (:obj:`dict`, `optional`):
+                Will be passed to the ``SentencePieceProcessor.__init__()`` method. The `Python wrapper for
+                SentencePiece <https://github.com/google/sentencepiece/tree/master/python>`__ can be used, among other
+                things, to set:
+
+                - ``enable_sampling``: Enable subword regularization.
+                - ``nbest_size``: Sampling parameters for unigram. Invalid for BPE-Dropout.
+
+                  - ``nbest_size = {0,1}``: No sampling is performed.
+                  - ``nbest_size > 1``: samples from the nbest_size results.
+                  - ``nbest_size < 0``: assumes that nbest_size is infinite and samples from all hypotheses
+                    (lattice) using the forward-filtering-and-backward-sampling algorithm.
+
+                - ``alpha``: Smoothing parameter for unigram sampling, and dropout probability of merge operations
+                  for BPE-dropout.
+        """
         sp_model = SentencePieceProcessor(**sp_model_kwargs)
         sp_model.Load(str(path))
         return sp_model
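For context, here is a minimal usage sketch of the ``sp_model_kwargs`` mechanism documented above. It is not part of the commit: it calls the ``sentencepiece`` package directly, the same way ``load_spm`` does, and ``spiece.model`` is a placeholder path standing in for any trained SentencePiece model file.

from sentencepiece import SentencePieceProcessor

# Sampling parameters as described in the docstring: sample from the full
# segmentation lattice (nbest_size < 0) with smoothing alpha = 0.1.
sp_model_kwargs = {"enable_sampling": True, "nbest_size": -1, "alpha": 0.1}

# Mirrors load_spm(): the kwargs go straight to SentencePieceProcessor.__init__().
sp_model = SentencePieceProcessor(**sp_model_kwargs)
sp_model.Load("spiece.model")  # placeholder path to a trained .spm model

# With sampling enabled, repeated calls can yield different segmentations of
# the same input; that is the subword regularization effect.
tokens = sp_model.encode("New York", out_type=str)  # e.g. ['▁New', '▁York']
print(sp_model.decode(tokens))  # round-trips back to "New York"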
