Remove token_type_ids from model_input_names (like huggingface#24788) (…

…huggingface#28325) * remove token_type_ids from model_input_names (like huggingface#24788) * removed test that assumed token_type_ids should be present and updated a model reference so that it points to an available model)
wgifford · Jan 21, 2024 · 01066cd · 01066cd
1 parent 258bc18
commit 01066cd
Show file tree

Hide file tree

Showing 2 changed files with 3 additions and 3 deletions.
diff --git a/src/transformers/models/gpt_sw3/tokenization_gpt_sw3.py b/src/transformers/models/gpt_sw3/tokenization_gpt_sw3.py
@@ -103,7 +103,7 @@ class GPTSw3Tokenizer(PreTrainedTokenizer):
     vocab_files_names = VOCAB_FILES_NAMES
     pretrained_vocab_files_map = PRETRAINED_VOCAB_FILES_MAP
     max_model_input_sizes = PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES
-    model_input_names = ["input_ids", "token_type_ids", "attention_mask"]
+    model_input_names = ["input_ids", "attention_mask"]
 
     def __init__(
         self,

diff --git a/tests/models/gpt_sw3/test_tokenization_gpt_sw3.py b/tests/models/gpt_sw3/test_tokenization_gpt_sw3.py
@@ -120,10 +120,10 @@ def test_tokenizer_integration(self):
             "Det är inget fel på Mr. Cool",
         ]
 
-        expected_encoding = {"input_ids": [[63423, 5, 6811, 14954, 282, 816, 3821, 63466, 63425, 63462, 18, 63978, 678, 301, 1320, 63423, 63455, 63458, 18, 63982, 4246, 3940, 1901, 47789, 5547, 18994], [19630, 1100, 63446, 1342, 633, 544, 4488, 593, 5102, 2416, 63495, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], [1652, 428, 268, 1936, 515, 268, 58593, 22413, 9106, 546, 268, 33213, 63979, 698, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], [55130, 63450, 924, 63449, 2249, 4062, 1558, 318, 63504, 21498, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], [509, 377, 2827, 2559, 332, 6575, 63443, 26801, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]], "token_type_ids": [[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]], "attention_mask": [[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1], [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], [1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]]}  # fmt: skip
+        expected_encoding = {"input_ids": [[63423, 5, 6811, 14954, 282, 816, 3821, 63466, 63425, 63462, 18, 63978, 678, 301, 1320, 63423, 63455, 63458, 18, 63982, 4246, 3940, 1901, 47789, 5547, 18994], [19630, 1100, 63446, 1342, 633, 544, 4488, 593, 5102, 2416, 63495, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], [1652, 428, 268, 1936, 515, 268, 58593, 22413, 9106, 546, 268, 33213, 63979, 698, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], [55130, 63450, 924, 63449, 2249, 4062, 1558, 318, 63504, 21498, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], [509, 377, 2827, 2559, 332, 6575, 63443, 26801, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]], "attention_mask": [[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1], [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], [1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]]}  # fmt: skip
         self.tokenizer_integration_test_util(
             expected_encoding=expected_encoding,
-            model_name="AI-Sweden/gpt-sw3-126m",
+            model_name="AI-Sweden-Models/gpt-sw3-126m",
             sequences=sequences,
         )