Revert "Fix DataCollatorForWholeWordMask again (huggingface#8397)"
This reverts commit 4f4ea4f.
fabiocapsouza authored Nov 15, 2020
1 parent 2465626 commit ef96ce2
Showing 1 changed file with 2 additions and 6 deletions.
8 changes: 2 additions & 6 deletions src/transformers/data/data_collator.py
```diff
@@ -206,10 +206,6 @@ def _collate_batch(examples, tokenizer):
     return result


-def tolist(x: Union[List[Any], torch.Tensor]):
-    return x.tolist() if isinstance(x, torch.Tensor) else x
-
-
 @dataclass
 class DataCollatorForLanguageModeling:
     """
@@ -324,13 +320,13 @@ def __call__(
         mask_labels = []
         for e in examples:
             ref_tokens = []
-            for id in tolist(e["input_ids"]):
+            for id in e["input_ids"].tolist():
                 token = self.tokenizer._convert_id_to_token(id)
                 ref_tokens.append(token)

             # For Chinese tokens, we need extra inf to mark sub-word, e.g [喜,欢]-> [喜,##欢]
             if "chinese_ref" in e:
-                ref_pos = tolist(e["chinese_ref"])
+                ref_pos = e["chinese_ref"].tolist()
                 len_seq = e["input_ids"].size(0)
                 for i in range(len_seq):
                     if i in ref_pos:
```
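For context: the first hunk removes the `tolist` helper, which accepted either a plain Python list or a `torch.Tensor`, while the reverted call sites invoke `.tolist()` directly and therefore assume tensor inputs. A minimal sketch of that difference (not part of the commit, assuming PyTorch is available):

```python
from typing import Any, List, Union

import torch


def tolist(x: Union[List[Any], torch.Tensor]):
    # Helper removed by this revert: passes plain lists through, converts tensors.
    return x.tolist() if isinstance(x, torch.Tensor) else x


# Hypothetical example inputs, only for illustration.
tensor_example = {"input_ids": torch.tensor([101, 2023, 102])}
list_example = {"input_ids": [101, 2023, 102]}

print(tolist(tensor_example["input_ids"]))   # [101, 2023, 102]
print(tolist(list_example["input_ids"]))     # [101, 2023, 102]

# After the revert, the collator calls .tolist() directly, which only works on tensors:
print(tensor_example["input_ids"].tolist())  # [101, 2023, 102]
# list_example["input_ids"].tolist() would raise AttributeError on a plain list.
```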
