Commit

reworked deberta test
cronoik committed Mar 18, 2021
1 parent d22a452 commit 205c60d
Showing 1 changed file with 120 additions and 38 deletions.
tests/test_tokenization_deberta.py: 158 changes (120 additions, 38 deletions)
@@ -1,5 +1,5 @@
# coding=utf-8
-# Copyright 2018 Microsoft, the Hugging Face Team.
# Copyright 2019 Hugging Face inc.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
@@ -14,61 +14,143 @@
# limitations under the License.


-import re
import os
import json
import unittest
-from typing import Tuple

-from transformers.models.deberta.tokenization_deberta import DebertaTokenizer
-from transformers.testing_utils import require_torch
from transformers import DebertaTokenizer
from transformers.testing_utils import slow
from transformers.models.deberta.tokenization_deberta import VOCAB_FILES_NAMES

from .test_tokenization_common import TokenizerTesterMixin


-@require_torch
class DebertaTokenizationTest(TokenizerTesterMixin, unittest.TestCase):

tokenizer_class = DebertaTokenizer
test_rust_tokenizer = False

def setUp(self):
super().setUp()

-def get_tokenizer(self, name="microsoft/deberta-base", **kwargs):
-return DebertaTokenizer.from_pretrained(name, **kwargs)
# Adapted from Sennrich et al. 2015 and https://github.com/rsennrich/subword-nmt
vocab = [
"l",
"o",
"w",
"e",
"r",
"s",
"t",
"i",
"d",
"n",
"\u0120",
"\u0120l",
"\u0120n",
"\u0120lo",
"\u0120low",
"er",
"\u0120lowest",
"\u0120newer",
"\u0120wider",
"[UNK]",
]
vocab_tokens = dict(zip(vocab, range(len(vocab))))
merges = ["#version: 0.2", "\u0120 l", "\u0120l o", "\u0120lo w", "e r", ""]
self.special_tokens_map = {"unk_token": "[UNK]"}

self.vocab_file = os.path.join(self.tmpdirname, VOCAB_FILES_NAMES["vocab_file"])
self.merges_file = os.path.join(self.tmpdirname, VOCAB_FILES_NAMES["merges_file"])
with open(self.vocab_file, "w", encoding="utf-8") as fp:
fp.write(json.dumps(vocab_tokens) + "\n")
with open(self.merges_file, "w", encoding="utf-8") as fp:
fp.write("\n".join(merges))

def get_tokenizer(self, **kwargs):
kwargs.update(self.special_tokens_map)
return self.tokenizer_class.from_pretrained(self.tmpdirname, **kwargs)

def get_input_output_texts(self, tokenizer):
input_text = "lower newer"
output_text = "lower newer"
return input_text, output_text

-def get_clean_sequence(self, tokenizer, with_prefix_space=False, max_length=20) -> Tuple[str, list]:
-toks = [
-(i, tokenizer.decode([i], clean_up_tokenization_spaces=False))
-for i in range(5, min(len(tokenizer), 50260))
-]
-toks = list(filter(lambda t: re.match(r"^[ a-zA-Z]+$", t[1]), toks))
-toks = list(filter(lambda t: [t[0]] == tokenizer.encode(t[1], add_special_tokens=False), toks))
-if max_length is not None and len(toks) > max_length:
-toks = toks[:max_length]
-# toks_str = [t[1] for t in toks]
-toks_ids = [t[0] for t in toks]
-
-# Ensure consistency
-output_txt = tokenizer.decode(toks_ids, clean_up_tokenization_spaces=False)
-if " " not in output_txt and len(toks_ids) > 1:
-output_txt = (
-tokenizer.decode([toks_ids[0]], clean_up_tokenization_spaces=False)
-+ " "
-+ tokenizer.decode(toks_ids[1:], clean_up_tokenization_spaces=False)
-)
-if with_prefix_space and not output_txt.startswith(" "):
-output_txt = " " + output_txt
-output_ids = tokenizer.encode(output_txt, add_special_tokens=False)
-return output_txt, output_ids

def test_full_tokenizer(self):
-tokenizer = self.get_tokenizer("microsoft/deberta-base")
-input_str = "UNwant\u00E9d,running"
-tokens = tokenizer.tokenize(input_str)
-token_ids = tokenizer.convert_tokens_to_ids(tokens)
tokenizer = self.get_tokenizer()
text = "lower newer"
bpe_tokens = ["l", "o", "w", "er", "\u0120", "n", "e", "w", "er"]
tokens = tokenizer.tokenize(text)
self.assertListEqual(tokens, bpe_tokens)

input_tokens = tokens + [tokenizer.unk_token]
input_bpe_tokens = [0, 1, 2, 15, 10, 9, 3, 2, 15, 19]
self.assertListEqual(tokenizer.convert_tokens_to_ids(input_tokens), input_bpe_tokens)

@slow
def test_sequence_builders(self):
tokenizer = self.tokenizer_class.from_pretrained("microsoft/deberta-base")

text = tokenizer.encode("sequence builders", add_special_tokens=False)
text_2 = tokenizer.encode("multi-sequence build", add_special_tokens=False)

encoded_text_from_decode = tokenizer.encode(
"sequence builders", add_special_tokens=True, add_prefix_space=False
)
encoded_pair_from_decode = tokenizer.encode(
"sequence builders", "multi-sequence build", add_special_tokens=True, add_prefix_space=False
)

encoded_sentence = tokenizer.build_inputs_with_special_tokens(text)
encoded_pair = tokenizer.build_inputs_with_special_tokens(text, text_2)

assert encoded_sentence == encoded_text_from_decode
assert encoded_pair == encoded_pair_from_decode

@slow
def test_tokenizer_integration(self):
tokenizer_classes = [self.tokenizer_class]
if self.test_rust_tokenizer:
tokenizer_classes.append(self.rust_tokenizer_class)

for tokenizer_class in tokenizer_classes:
tokenizer = tokenizer_class.from_pretrained("microsoft/deberta-base")

sequences = [
"ALBERT: A Lite BERT for Self-supervised Learning of Language Representations",
"ALBERT incorporates two parameter reduction techniques",
"The first one is a factorized embedding parameterization. By decomposing the large vocabulary embedding matrix into two small matrices, we separate the size of the hidden layers from the size of vocabulary embedding.",
]

encoding = tokenizer(sequences, padding=True)
decoded_sequences = [tokenizer.decode(seq, skip_special_tokens=True) for seq in encoding["input_ids"]]

# fmt: off
expected_encoding = {
'input_ids': [
[1, 2118, 11126, 565, 35, 83, 25191, 163, 18854, 13, 12156, 12, 16101, 25376, 13807, 9, 22205, 27893, 1635, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
[1, 2118, 11126, 565, 24536, 80, 43797, 4878, 7373, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
[1, 133, 78, 65, 16, 10, 3724, 1538, 33183, 11303, 43797, 1938, 4, 870, 24165, 29105, 5, 739, 32644, 33183, 11303, 36173, 88, 80, 650, 7821, 45940, 6, 52, 2559, 5, 1836, 9, 5, 7397, 13171, 31, 5, 1836, 9, 32644, 33183, 11303, 4, 2]
],
'token_type_ids': [
[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]
],
'attention_mask': [
[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]
]
}

expected_decoded_sequence = [
'ALBERT: A Lite BERT for Self-supervised Learning of Language Representations',
'ALBERT incorporates two parameter reduction techniques',
'The first one is a factorized embedding parameterization. By decomposing the large vocabulary embedding matrix into two small matrices, we separate the size of the hidden layers from the size of vocabulary embedding.'
]

self.assertDictEqual(encoding.data, expected_encoding)

-self.assertEqual(tokenizer.decode(token_ids), input_str)
for expected, decoded in zip(expected_decoded_sequence, decoded_sequences):
self.assertEqual(expected, decoded)
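
As a quick sanity check outside the unittest harness, here is a minimal standalone sketch (not part of the commit) that mirrors what setUp() and test_full_tokenizer() do together: write the toy GPT-2-style vocab/merges pair to a temporary directory, load it with DebertaTokenizer, and inspect the BPE split of "lower newer". It assumes a transformers version in which DebertaTokenizer reads vocab.json/merges.txt, as this reworked test does.

# Standalone sketch (not part of the commit): rebuild the test fixture by hand.
import json
import os
import tempfile

from transformers import DebertaTokenizer
from transformers.models.deberta.tokenization_deberta import VOCAB_FILES_NAMES

# Same toy vocabulary and merge rules as in setUp() above.
vocab = ["l", "o", "w", "e", "r", "s", "t", "i", "d", "n", "\u0120", "\u0120l",
         "\u0120n", "\u0120lo", "\u0120low", "er", "\u0120lowest", "\u0120newer",
         "\u0120wider", "[UNK]"]
merges = ["#version: 0.2", "\u0120 l", "\u0120l o", "\u0120lo w", "e r", ""]

with tempfile.TemporaryDirectory() as tmpdir:
    with open(os.path.join(tmpdir, VOCAB_FILES_NAMES["vocab_file"]), "w", encoding="utf-8") as fp:
        fp.write(json.dumps(dict(zip(vocab, range(len(vocab))))) + "\n")
    with open(os.path.join(tmpdir, VOCAB_FILES_NAMES["merges_file"]), "w", encoding="utf-8") as fp:
        fp.write("\n".join(merges))

    tokenizer = DebertaTokenizer.from_pretrained(tmpdir, unk_token="[UNK]")
    # Expected, as asserted in test_full_tokenizer():
    # ["l", "o", "w", "er", "\u0120", "n", "e", "w", "er"]
    print(tokenizer.tokenize("lower newer"))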

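A second hedged sketch, again not part of the commit, shows what the new @slow test_sequence_builders() verifies against the public "microsoft/deberta-base" checkpoint (network access assumed): encode() with special tokens must agree with build_inputs_with_special_tokens(). Judging from the expected input_ids in the integration test above, id 1 appears to be the CLS token and id 2 the SEP token.

# Sketch of the checks in test_sequence_builders(), run outside the test harness.
# Requires downloading the "microsoft/deberta-base" tokenizer files.
from transformers import DebertaTokenizer

tokenizer = DebertaTokenizer.from_pretrained("microsoft/deberta-base")

text = tokenizer.encode("sequence builders", add_special_tokens=False)
text_2 = tokenizer.encode("multi-sequence build", add_special_tokens=False)

encoded_text = tokenizer.encode("sequence builders", add_special_tokens=True, add_prefix_space=False)
encoded_pair = tokenizer.encode(
    "sequence builders", "multi-sequence build", add_special_tokens=True, add_prefix_space=False
)

# Both ways of adding special tokens should produce identical id sequences.
assert encoded_text == tokenizer.build_inputs_with_special_tokens(text)
assert encoded_pair == tokenizer.build_inputs_with_special_tokens(text, text_2)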