Revert "# Add whole word mask support for lm fine-tune (huggingface#7925
Browse files Browse the repository at this point in the history
)"

This reverts commit 3f8b9a5.
fabiocapsouza authored Nov 15, 2020
1 parent 53d47c3 commit 4b2684c
Showing 8 changed files with 7 additions and 394 deletions.
52 changes: 1 addition & 51 deletions examples/language-modeling/README.md
@@ -45,69 +45,19 @@ slightly slower (over-fitting takes more epochs).

We use the `--mlm` flag so that the script may change its loss function.

If using whole-word masking, use both the `--mlm` and `--whole_word_mask` flags.

```bash
export TRAIN_FILE=/path/to/dataset/wiki.train.raw
export TEST_FILE=/path/to/dataset/wiki.test.raw

python run_language_modeling.py \
--output_dir=output \
--model_type=roberta \
--model_name_or_path=roberta-base \
--do_train \
--train_data_file=$TRAIN_FILE \
--do_eval \
--eval_data_file=$TEST_FILE \
--mlm \
--whole_word_mask
```
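
To make the difference concrete, here is a minimal, self-contained sketch (a toy illustration, not the actual `DataCollatorForWholeWordMask` implementation) of how whole-word masking groups WordPiece subtokens using the `##` continuation prefix and masks each word as a unit:

```python
import random

def whole_word_mask(tokens, mask_prob=0.15, mask_token="[MASK]"):
    """Toy whole-word masking over WordPiece tokens: subtokens starting
    with "##" are grouped with the preceding token, and every subtoken of
    a selected word is masked together."""
    word_spans = []
    for i, tok in enumerate(tokens):
        if tok.startswith("##") and word_spans:
            word_spans[-1].append(i)   # continuation subtoken joins the current word
        else:
            word_spans.append([i])     # a new word starts here

    masked = list(tokens)
    for span in word_spans:
        if random.random() < mask_prob:
            for i in span:
                masked[i] = mask_token
    return masked

print(whole_word_mask(["i", "really", "en", "##joy", "transform", "##ers"], mask_prob=0.5))
```

With plain token-level masking, `en` and `##joy` could be masked independently of each other; with whole-word masking they are always masked together.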

For Chinese models, the procedure is the same as for the English model, using only the `--mlm` flag. If using whole-word masking, we also need to generate a reference file first, because the Chinese tokenizer operates at the character level.

**Q :** Why a ref file?

**A :** Suppose we have a Chinese sentence such as `我喜欢你`. The original Chinese BERT tokenizes it at the character level as `['我','喜','欢','你']`, but `喜欢` is actually a single word. For whole-word masking we need a result like `['我','喜','##欢','你']`, so we need a ref file that tells the collator which positions in the original BERT tokenization should be prefixed with `##`.
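
As a rough illustration (a toy sketch with a hypothetical `build_ref` helper, not the actual `chinese_ref.py` logic, and assuming the BERT tokenizer emits exactly one token per character), the ref file essentially records, for each sentence, the character positions that continue a segmented word:

```python
def build_ref(chars, words):
    """Given char-level BERT tokens and an LTP-style word segmentation,
    return the indices of characters that are *not* the first character of
    their word, i.e. the positions to treat as "##" subtokens."""
    ref_ids = []
    idx = 0
    for word in words:
        for offset in range(len(word)):
            if offset > 0:          # continuation character inside a word
                ref_ids.append(idx)
            idx += 1
    assert idx == len(chars)        # segmentation must cover every character
    return ref_ids

# '喜欢' is a single word, so position 2 ('欢') should be marked as '##欢'.
print(build_ref(list("我喜欢你"), ["我", "喜欢", "你"]))   # -> [2]
```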

**Q :** Why LTP?

**A :** Because the best-known Chinese whole-word-masking BERT is [Chinese-BERT-wwm](https://github.com/ymcui/Chinese-BERT-wwm) by HIT, which works well on many Chinese tasks such as CLUE (the Chinese GLUE).
It uses LTP for word segmentation, so if we want to fine-tune their model, we need LTP as well.

```bash
export TRAIN_FILE=/path/to/dataset/wiki.train.raw
export LTP_RESOURCE=/path/to/ltp/tokenizer
export BERT_RESOURCE=/path/to/bert/tokenizer
export SAVE_PATH=/path/to/data/ref.txt

python chinese_ref.py \
--file_name=$TRAIN_FILE \
--ltp=$LTP_RESOURCE \
--bert=$BERT_RESOURCE \
--save_path=$SAVE_PATH
```
The Chinese ref file is currently only supported by the `LineByLineWithRefDataset` class, so we also need to add the `--line_by_line` flag:


```bash
export TRAIN_FILE=/path/to/dataset/wiki.train.raw
export TEST_FILE=/path/to/dataset/wiki.test.raw
export REF_FILE=/path/to/ref.txt

python run_language_modeling.py \
--output_dir=output \
--model_type=roberta \
--model_name_or_path=roberta-base \
--do_train \
--train_data_file=$TRAIN_FILE \
--chinese_ref_file=$REF_FILE \
--do_eval \
--eval_data_file=$TEST_FILE \
--mlm \
--line_by_line \
--whole_word_mask
```

### XLNet and permutation language modeling
147 changes: 0 additions & 147 deletions examples/language-modeling/chinese_ref.py

This file was deleted.

29 changes: 4 additions & 25 deletions examples/language-modeling/run_language_modeling.py
@@ -37,10 +37,8 @@
AutoTokenizer,
DataCollatorForLanguageModeling,
DataCollatorForPermutationLanguageModeling,
DataCollatorForWholeWordMask,
HfArgumentParser,
LineByLineTextDataset,
LineByLineWithRefDataset,
PreTrainedTokenizer,
TextDataset,
Trainer,
@@ -103,10 +101,6 @@ class DataTrainingArguments:
default=None,
metadata={"help": "An optional input evaluation data file to evaluate the perplexity on (a text file)."},
)
chinese_ref_file: Optional[str] = field(
default=None,
metadata={"help": "An optional input ref data file for whole word mask in Chinees."},
)
line_by_line: bool = field(
default=False,
metadata={"help": "Whether distinct lines of text in the dataset are to be handled as distinct sequences."},
@@ -115,7 +109,6 @@
mlm: bool = field(
default=False, metadata={"help": "Train with masked-language modeling loss instead of language modeling."}
)
whole_word_mask: bool = field(default=False, metadata={"help": "Whether or not to use whole word mask."})
mlm_probability: float = field(
default=0.15, metadata={"help": "Ratio of tokens to mask for masked language modeling loss"}
)
@@ -150,16 +143,6 @@ def get_dataset(
):
def _dataset(file_path):
if args.line_by_line:
if args.chinese_ref_file is not None:
if not args.whole_word_mask or not args.mlm:
raise ValueError("You need to set whole word masking and mlm to True for Chinese Whole Word Mask")
return LineByLineWithRefDataset(
tokenizer=tokenizer,
file_path=file_path,
block_size=args.block_size,
ref_path=args.chinese_ref_file,
)

return LineByLineTextDataset(tokenizer=tokenizer, file_path=file_path, block_size=args.block_size)
else:
return TextDataset(
@@ -191,6 +174,7 @@ def main():
"Cannot do evaluation without an evaluation data file. Either supply a file to --eval_data_file "
"or remove the --do_eval argument."
)

if (
os.path.exists(training_args.output_dir)
and os.listdir(training_args.output_dir)
@@ -286,14 +270,9 @@ def main():
max_span_length=data_args.max_span_length,
)
else:
if data_args.mlm and data_args.whole_word_mask:
data_collator = DataCollatorForWholeWordMask(
tokenizer=tokenizer, mlm_probability=data_args.mlm_probability
)
else:
data_collator = DataCollatorForLanguageModeling(
tokenizer=tokenizer, mlm=data_args.mlm, mlm_probability=data_args.mlm_probability
)
data_collator = DataCollatorForLanguageModeling(
tokenizer=tokenizer, mlm=data_args.mlm, mlm_probability=data_args.mlm_probability
)

# Initialize our Trainer
trainer = Trainer(
2 changes: 0 additions & 2 deletions src/transformers/__init__.py
@@ -284,15 +284,13 @@
DataCollatorForNextSentencePrediction,
DataCollatorForPermutationLanguageModeling,
DataCollatorForSOP,
DataCollatorForWholeWordMask,
DataCollatorWithPadding,
default_data_collator,
)
from .data.datasets import (
GlueDataset,
GlueDataTrainingArguments,
LineByLineTextDataset,
LineByLineWithRefDataset,
LineByLineWithSOPTextDataset,
SquadDataset,
SquadDataTrainingArguments,
