From fbde27c26f8897ab6c1801436c5c0e878fdda18d Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?=EA=B9=80=EC=8A=B9=EB=8D=95/Infrastructure=EA=B7=B8?=
 =?UTF-8?q?=EB=A3=B9=28YA=29?=
Date: Wed, 11 Oct 2023 22:18:18 +0900
Subject: [PATCH] revert removing sequence_len

---
 src/axolotl/prompt_strategies/completion.py | 4 ++--
 src/axolotl/prompt_strategies/metharme.py   | 4 ++--
 src/axolotl/prompt_tokenizers.py            | 7 +++++--
 3 files changed, 9 insertions(+), 6 deletions(-)

diff --git a/src/axolotl/prompt_strategies/completion.py b/src/axolotl/prompt_strategies/completion.py
index bb30c2710..3285e667c 100644
--- a/src/axolotl/prompt_strategies/completion.py
+++ b/src/axolotl/prompt_strategies/completion.py
@@ -53,8 +53,8 @@ def tokenize_prompt(self, prompt):
         tokenized_full_prompt = self._tokenize(full_prompt)
 
         for key, val in tokenized_full_prompt.items():
-            for i in range(0, len(val), self.max_length):
-                res[key].append(val[i : i + self.max_length])
+            for i in range(0, len(val), self.sequence_len):
+                res[key].append(val[i : i + self.sequence_len])
 
         return dict(res)
 
diff --git a/src/axolotl/prompt_strategies/metharme.py b/src/axolotl/prompt_strategies/metharme.py
index 62c5349bd..52d77c00c 100644
--- a/src/axolotl/prompt_strategies/metharme.py
+++ b/src/axolotl/prompt_strategies/metharme.py
@@ -31,7 +31,7 @@ def _tokenize(
         result = self.tokenizer(
             prompt,
             truncation=True,
-            max_length=self.max_length,
+            max_length=self.sequence_len,
             padding=False,
             return_tensors=None,
         )
@@ -43,7 +43,7 @@
 
         if num_eos_tokens > 0 and add_eos_token and len(result["input_ids"]) > 0:
             for _ in range(num_eos_tokens):
-                if len(result["input_ids"]) < self.max_length:
+                if len(result["input_ids"]) < self.sequence_len:
                     result["input_ids"].append(self.tokenizer.eos_token_id)
                     result["attention_mask"].append(1)
 
diff --git a/src/axolotl/prompt_tokenizers.py b/src/axolotl/prompt_tokenizers.py
index 22a66f876..918514b19 100644
--- a/src/axolotl/prompt_tokenizers.py
+++ b/src/axolotl/prompt_tokenizers.py
@@ -45,6 +45,9 @@ def __init__(
         self.prompter = prompter
         self.tokenizer: PreTrainedTokenizer = tokenizer
         self.train_on_inputs = train_on_inputs
+        # sequence_len and max_length can be different for CompletionPromptTokenizingStrategy.
+        # TODO: Document how they are different.
+        self.sequence_len = sequence_len
         self.max_length = sequence_len
 
     @abc.abstractmethod
@@ -290,13 +293,13 @@ def _tokenize(self, prompt, add_eos_token=True, strip_bos_token=False):
         result = self.tokenizer(
             prompt,
             truncation=True,
-            max_length=self.max_length,
+            max_length=self.sequence_len,
             padding=False,
             return_tensors=None,
         )
         if (
             result["input_ids"][-1] != self.tokenizer.eos_token_id
-            and len(result["input_ids"]) < self.max_length
+            and len(result["input_ids"]) < self.sequence_len
             and add_eos_token
         ):
             result["input_ids"].append(self.tokenizer.eos_token_id)