From 55cfb47b5a0c84fa9ae97b792f82805919a0149a Mon Sep 17 00:00:00 2001
From: Dirk Groeneveld
Date: Fri, 25 Sep 2020 14:18:38 -0700
Subject: [PATCH] The truncation setting doesn't do anything anymore (#4672)

* The truncation setting doesn't do anything anymore

* Changelog
---
 CHANGELOG.md                             |  2 ++
 .../pretrained_transformer_tokenizer.py  | 16 +++++-----------
 2 files changed, 7 insertions(+), 11 deletions(-)

diff --git a/CHANGELOG.md b/CHANGELOG.md
index 1056a250fdd..4b5fea56c2c 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -30,6 +30,7 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
 - `transformers` dependency updated to version 3.1.0.
 - When `cached_path` is called on a local archive with `extract_archive=True`, the archive is now
   extracted into a unique subdirectory of the cache root instead of a subdirectory of the archive's
   directory. The extraction directory is also unique to the modification time of the archive, so if
   the file changes, subsequent calls to `cached_path` will know to re-extract the archive.
+- Removed the `truncation_strategy` parameter to `PretrainedTransformerTokenizer`. The way we're calling the tokenizer, the truncation strategy takes no effect anyways.
 
 ### Fixed
 
@@ -46,6 +47,7 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
 - Fixed a bug in our doc building script where markdown links did not render properly
   if the "href" part of the link (the part inside the `()`) was on a new line.
 
+
 ## [v1.1.0](https://github.com/allenai/allennlp/releases/tag/v1.1.0) - 2020-09-08
 
 ### Fixed

diff --git a/allennlp/data/tokenizers/pretrained_transformer_tokenizer.py b/allennlp/data/tokenizers/pretrained_transformer_tokenizer.py
index b8c45ed1a5d..1fa3c155294 100644
--- a/allennlp/data/tokenizers/pretrained_transformer_tokenizer.py
+++ b/allennlp/data/tokenizers/pretrained_transformer_tokenizer.py
@@ -44,13 +44,6 @@ class PretrainedTransformerTokenizer(Tokenizer):
     stride : `int`, optional (default=`0`)
         If set to a number along with max_length, the overflowing tokens returned will contain some tokens
         from the main sequence returned. The value of this argument defines the number of additional tokens.
-    truncation_strategy : `str`, optional (default=`'longest_first'`)
-        String selected in the following options:
-        - 'longest_first' (default) Iteratively reduce the inputs sequence until the input is under max_length
-          starting from the longest one at each token (when there is a pair of input sequences)
-        - 'only_first': Only truncate the first sequence
-        - 'only_second': Only truncate the second sequence
-        - 'do_not_truncate': Do not truncate (raise an error if the input sequence is longer than max_length)
     tokenizer_kwargs: `Dict[str, Any]`, optional (default = `None`)
         Dictionary with
         [additional arguments](https://github.com/huggingface/transformers/blob/155c782a2ccd103cf63ad48a2becd7c76a7d2115/transformers/tokenization_utils.py#L691)
@@ -63,7 +56,6 @@ def __init__(
         add_special_tokens: bool = True,
         max_length: Optional[int] = None,
         stride: int = 0,
-        truncation_strategy: str = "longest_first",
         tokenizer_kwargs: Optional[Dict[str, Any]] = None,
     ) -> None:
         if tokenizer_kwargs is None:
@@ -82,7 +74,6 @@ def __init__(
         self._add_special_tokens = add_special_tokens
         self._max_length = max_length
         self._stride = stride
-        self._truncation_strategy = truncation_strategy
 
         self._tokenizer_lowercases = self.tokenizer_lowercases(self.tokenizer)
 
@@ -230,12 +221,15 @@ def tokenize(self, text: str) -> List[Token]:
         """
         This method only handles a single sentence (or sequence) of text.
         """
+        max_length = self._max_length
+        if max_length is not None and self._add_special_tokens:
+            max_length -= self.num_special_tokens_for_sequence()
+
         encoded_tokens = self.tokenizer.encode_plus(
             text=text,
             add_special_tokens=False,
-            max_length=self._max_length,
+            max_length=max_length,
             stride=self._stride,
-            truncation=self._truncation_strategy if self._max_length is not None else False,
             return_tensors=None,
             return_offsets_mapping=self.tokenizer.is_fast,
             return_attention_mask=False,
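
The sketch below is not part of the patch; it is a minimal, illustrative Python example of the idea behind the new tokenize() logic: because the underlying Hugging Face tokenizer is called with add_special_tokens=False, the length budget passed to it has to be reduced by the number of special tokens that get added afterwards, otherwise the final sequence can exceed max_length. The model name, the literal lengths, and the use of truncation=True and the standard transformers helper num_special_tokens_to_add are assumptions made to keep the example standalone; the patched AllenNLP code uses its own num_special_tokens_for_sequence() helper instead.

# Illustrative sketch only (assumes `transformers` is installed and uses
# "bert-base-uncased" as a placeholder model).
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")

max_length = 16  # overall budget, including special tokens such as [CLS] and [SEP]

# Reserve room for the special tokens that will be added to a single sequence,
# mirroring what the patched tokenize() does before calling encode_plus().
effective_max_length = max_length - tokenizer.num_special_tokens_to_add(pair=False)

encoded = tokenizer.encode_plus(
    "a very long sentence " * 20,
    add_special_tokens=False,        # special tokens are handled separately
    max_length=effective_max_length,
    truncation=True,
    return_attention_mask=False,
)

# The truncated sequence plus the special tokens still fits in the overall budget.
assert len(encoded["input_ids"]) + tokenizer.num_special_tokens_to_add(pair=False) <= max_length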