Revert "Fix DataCollatorForWholeWordMask again (huggingface#8397)"
This reverts commit 4f4ea4f.
fabiocapsouza authored Nov 15, 2020
1 parent 2465626 commit ef96ce2
Showing 1 changed file with 2 additions and 6 deletions.
8 changes: 2 additions & 6 deletions src/transformers/data/data_collator.py
```diff
@@ -206,10 +206,6 @@ def _collate_batch(examples, tokenizer):
     return result


-def tolist(x: Union[List[Any], torch.Tensor]):
-    return x.tolist() if isinstance(x, torch.Tensor) else x
-
-
 @dataclass
 class DataCollatorForLanguageModeling:
     """
@@ -324,13 +320,13 @@ def __call__(
         mask_labels = []
         for e in examples:
             ref_tokens = []
-            for id in tolist(e["input_ids"]):
+            for id in e["input_ids"].tolist():
                 token = self.tokenizer._convert_id_to_token(id)
                 ref_tokens.append(token)

             # For Chinese tokens, we need extra inf to mark sub-word, e.g [喜,欢]-> [喜,##欢]
             if "chinese_ref" in e:
-                ref_pos = tolist(e["chinese_ref"])
+                ref_pos = e["chinese_ref"].tolist()
                 len_seq = e["input_ids"].size(0)
                 for i in range(len_seq):
                     if i in ref_pos:
```
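For context: the first hunk removes the `tolist` helper, which accepted either a plain Python list or a `torch.Tensor`, while the reverted call sites invoke `.tolist()` directly and therefore assume tensor inputs. A minimal sketch of that difference (not part of the commit, assuming PyTorch is available):

```python
from typing import Any, List, Union

import torch


def tolist(x: Union[List[Any], torch.Tensor]):
    # Helper removed by this revert: passes plain lists through, converts tensors.
    return x.tolist() if isinstance(x, torch.Tensor) else x


# Hypothetical example inputs, only for illustration.
tensor_example = {"input_ids": torch.tensor([101, 2023, 102])}
list_example = {"input_ids": [101, 2023, 102]}

print(tolist(tensor_example["input_ids"]))   # [101, 2023, 102]
print(tolist(list_example["input_ids"]))     # [101, 2023, 102]

# After the revert, the collator calls .tolist() directly, which only works on tensors:
print(tensor_example["input_ids"].tolist())  # [101, 2023, 102]
# list_example["input_ids"].tolist() would raise AttributeError on a plain list.
```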
