From 1b52b04b48cb31c6ce0e824ba72ba34e4a4ac0d0 Mon Sep 17 00:00:00 2001
From: Gabriele Sarti
Date: Mon, 30 Oct 2023 10:48:47 +0100
Subject: [PATCH] Remove `max_input_length` from `model.encode` (#227)

---
 inseq/models/huggingface_model.py | 10 ----------
 1 file changed, 10 deletions(-)

diff --git a/inseq/models/huggingface_model.py b/inseq/models/huggingface_model.py
index 49740436..08ee0c4a 100644
--- a/inseq/models/huggingface_model.py
+++ b/inseq/models/huggingface_model.py
@@ -234,7 +234,6 @@ def encode(
         as_targets: bool = False,
         return_baseline: bool = False,
         include_eos_baseline: bool = False,
-        max_input_length: int = 512,
         add_bos_token: bool = True,
         add_special_tokens: bool = True,
     ) -> BatchEncoding:
@@ -249,21 +248,12 @@
         """
         if as_targets and not self.is_encoder_decoder:
             raise ValueError("Decoder-only models should use tokenization as source only.")
-        max_length = self.tokenizer.max_len_single_sentence
-        # Some tokenizer have weird values for max_len_single_sentence
-        # Cap length with max_model_input_sizes instead
-        if max_length > 1e6:
-            if hasattr(self.tokenizer, "max_model_input_sizes") and self.tokenizer.max_model_input_sizes:
-                max_length = max(v for _, v in self.tokenizer.max_model_input_sizes.items())
-            else:
-                max_length = max_input_length
         batch = self.tokenizer(
             text=texts if not as_targets else None,
             text_target=texts if as_targets else None,
             add_special_tokens=add_special_tokens,
             padding=True,
             truncation=True,
-            max_length=max_length,
             return_tensors="pt",
         ).to(self.device)
         baseline_ids = None
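
Net effect of the deletion: `encode` no longer second-guesses the tokenizer's
length limit. With `truncation=True` and no explicit `max_length`, Hugging Face
tokenizers fall back to their own `model_max_length`. A minimal sketch of that
fallback behavior (illustrative only, not part of the patch; the `t5-small`
checkpoint and the oversized input string are arbitrary assumptions):

    from transformers import AutoTokenizer

    # Any checkpoint with a finite model_max_length works; t5-small reports 512.
    tokenizer = AutoTokenizer.from_pretrained("t5-small")

    batch = tokenizer(
        text=["very long input " * 400],  # far beyond the model's limit
        add_special_tokens=True,
        padding=True,
        truncation=True,  # no max_length given: truncates at model_max_length
        return_tensors="pt",
    )
    print(tokenizer.model_max_length)   # 512
    print(batch["input_ids"].shape[1])  # capped at 512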