add typing and doc to PreTrainedSentencepieceTok.
PhilipMay committed Jun 1, 2021
1 parent b772f0a commit 4000bed
Showing 1 changed file with 77 additions and 8 deletions.
85 changes: 77 additions & 8 deletions src/transformers/tokenization_utils.py
@@ -775,36 +775,82 @@ def _decode(


 class PreTrainedSentencepieceTokenizer(PreTrainedTokenizer):
-    # TODO: add docstring with vocab_file & sp_model_kwargs
-    def __init__(self, vocab_file, sp_model_kwargs: Optional[Dict[str, Any]] = None, *args, **kwargs) -> None:
+    """
+    Base class for all slow sentencepiece tokenizers.
+
+    Inherits from :class:`~transformers.tokenization_utils.PreTrainedTokenizer`.
+
+    Args:
+        vocab_file (:obj:`str`):
+            `SentencePiece <https://github.com/google/sentencepiece>`__ file (generally has a `.spm` extension) that
+            contains the vocabulary necessary to instantiate a tokenizer.
+        sp_model_kwargs (:obj:`dict`, `optional`):
+            Will be passed to the ``SentencePieceProcessor.__init__()`` method. The `Python wrapper for SentencePiece
+            <https://github.com/google/sentencepiece/tree/master/python>`__ can be used, among other things, to set:
+
+            - ``enable_sampling``: Enable subword regularization.
+            - ``nbest_size``: Sampling parameters for unigram. Invalid for BPE-Dropout.
+
+              - ``nbest_size = {0,1}``: No sampling is performed.
+              - ``nbest_size > 1``: samples from the nbest_size results.
+              - ``nbest_size < 0``: assumes that nbest_size is infinite and samples from all hypotheses (lattice)
+                using the forward-filtering-and-backward-sampling algorithm.
+
+            - ``alpha``: Smoothing parameter for unigram sampling, and dropout probability of merge operations for
+              BPE-dropout.
+    """
+
+    def __init__(self, vocab_file: str, sp_model_kwargs: Optional[Dict[str, Any]] = None, *args, **kwargs) -> None:
         super().__init__(sp_model_kwargs=sp_model_kwargs, *args, **kwargs)

         self.vocab_file = vocab_file
         self.sp_model_kwargs = {} if sp_model_kwargs is None else sp_model_kwargs
         self.sp_model = self.load_spm(self.vocab_file, self.sp_model_kwargs)

     def convert_tokens_to_string(self, tokens: List[str]) -> str:
-        """Converts a sequence of tokens (strings for sub-words) in a single string."""
+        """
+        Convert a sequence of tokens (strings for sub-words) to a single string.
+
+        This is the reverse of ``_tokenize``.
+        """
         return self.sp_model.decode(tokens)

     def _tokenize(self, text: str) -> List[str]:
+        """
+        Encode text input to segmented tokens.
+
+        This can be reverted with the ``convert_tokens_to_string`` function.
+
+        Args:
+            text (str): The text to tokenize.
+
+        Returns:
+            List[str]: The segmented tokens.
+        """
         return self.sp_model.encode(text, out_type=str)

     @property
     def vocab_size(self) -> int:
         """Size of the vocab."""
         return self.sp_model.get_piece_size()

-    def get_vocab(self):
+    def get_vocab(self) -> Dict[str, int]:
+        """
+        Return the vocab.
+
+        Returns:
+            Dict[str, int]: The vocab.
+        """
         vocab = {self.convert_ids_to_tokens(i): i for i in range(self.vocab_size)}
         vocab.update(self.added_tokens_encoder)
         return vocab

-    def _convert_token_to_id(self, token):
-        """Converts a token (str) in an id using the vocab."""
+    def _convert_token_to_id(self, token: str) -> int:
+        """Convert a token (str) to an id (int) using the vocab."""
         return self.sp_model.PieceToId(token)

-    def _convert_id_to_token(self, index):
-        """Converts an index (integer) in a token (str) using the vocab."""
+    def _convert_id_to_token(self, index: int) -> str:
+        """Convert an index (int) to a token (str) using the vocab."""
         return self.sp_model.IdToPiece(index)

     def __getstate__(self):

@@ -823,6 +869,29 @@ def __setstate__(self, d):

     @staticmethod
     def load_spm(path: str, sp_model_kwargs: Dict[str, Any]) -> SentencePieceProcessor:
+        """
+        Load a ``SentencePieceProcessor``.
+
+        Args:
+            path (:obj:`str`):
+                `SentencePiece <https://github.com/google/sentencepiece>`__ file (generally has a `.spm` extension)
+                that contains the vocabulary necessary to instantiate a tokenizer.
+            sp_model_kwargs (:obj:`dict`, `optional`):
+                Will be passed to the ``SentencePieceProcessor.__init__()`` method. The `Python wrapper for
+                SentencePiece <https://github.com/google/sentencepiece/tree/master/python>`__ can be used, among other
+                things, to set:
+
+                - ``enable_sampling``: Enable subword regularization.
+                - ``nbest_size``: Sampling parameters for unigram. Invalid for BPE-Dropout.
+
+                  - ``nbest_size = {0,1}``: No sampling is performed.
+                  - ``nbest_size > 1``: samples from the nbest_size results.
+                  - ``nbest_size < 0``: assumes that nbest_size is infinite and samples from all hypotheses
+                    (lattice) using the forward-filtering-and-backward-sampling algorithm.
+
+                - ``alpha``: Smoothing parameter for unigram sampling, and dropout probability of merge operations
+                  for BPE-dropout.
+        """
         sp_model = SentencePieceProcessor(**sp_model_kwargs)
         sp_model.Load(str(path))
         return sp_model
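For context, here is a minimal usage sketch of the ``sp_model_kwargs`` mechanism documented above. It is not part of the commit: it calls the ``sentencepiece`` package directly, the same way ``load_spm`` does, and ``spiece.model`` is a placeholder path standing in for any trained SentencePiece model file.

from sentencepiece import SentencePieceProcessor

# Sampling parameters as described in the docstring: sample from the full
# segmentation lattice (nbest_size < 0) with smoothing alpha = 0.1.
sp_model_kwargs = {"enable_sampling": True, "nbest_size": -1, "alpha": 0.1}

# Mirrors load_spm(): the kwargs go straight to SentencePieceProcessor.__init__().
sp_model = SentencePieceProcessor(**sp_model_kwargs)
sp_model.Load("spiece.model")  # placeholder path to a trained .spm model

# With sampling enabled, repeated calls can yield different segmentations of
# the same input; that is the subword regularization effect.
tokens = sp_model.encode("New York", out_type=str)  # e.g. ['▁New', '▁York']
print(sp_model.decode(tokens))  # round-trips back to "New York"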
