TF Examples Rewrite (#18451)

* Finished QA example * Dodge a merge conflict * Update text classification and LM examples * Update NER example * New Keras metrics WIP, fix NER example * Update NER example * Update MC, summarization and translation examples * Add XLA warnings when shapes are variable * Make sure batch_size is consistently scaled by num_replicas * Add PushToHubCallback to all models * Add docs links for KerasMetricCallback * Add docs links for prepare_tf_dataset and jit_compile * Correct inferred model names * Don't assume the dataset has 'lang' * Don't assume the dataset has 'lang' * Write metrics in text classification * Add 'framework' to TrainingArguments and TFTrainingArguments * Export metrics in all examples and add tests * Fix training args for Flax * Update command line args for translation test * make fixup * Fix accidentally running other tests in fp16 * Remove do_train/do_eval from run_clm.py * Remove do_train/do_eval from run_mlm.py * Add tensorflow tests to circleci * Fix circleci * Update examples/tensorflow/language-modeling/run_mlm.py Co-authored-by: Joao Gante <joaofranciscocardosogante@gmail.com> * Update examples/tensorflow/test_tensorflow_examples.py Co-authored-by: Joao Gante <joaofranciscocardosogante@gmail.com> * Update examples/tensorflow/translation/run_translation.py Co-authored-by: Joao Gante <joaofranciscocardosogante@gmail.com> * Update examples/tensorflow/token-classification/run_ner.py Co-authored-by: Joao Gante <joaofranciscocardosogante@gmail.com> * Fix save path for tests * Fix some model card kwargs * Explain the magical -1000 * Actually enable tests this time * Skip text classification PR until we fix shape inference * make fixup Co-authored-by: Joao Gante <joaofranciscocardosogante@gmail.com>
huggingface · Aug 10, 2022 · 6eb5145 · 6eb5145
1 parent d7e2d7b
commit 6eb5145
Show file tree

Hide file tree

Showing 15 changed files with 1,491 additions and 661 deletions.
diff --git a/.circleci/config.yml b/.circleci/config.yml
@@ -658,6 +658,71 @@ jobs:
             - store_artifacts:
                   path: ~/transformers/reports
 
+    run_examples_tensorflow:
+        working_directory: ~/transformers
+        docker:
+            - image: cimg/python:3.7.12
+        environment:
+            OMP_NUM_THREADS: 1
+            TRANSFORMERS_IS_CI: yes
+            PYTEST_TIMEOUT: 120
+        resource_class: xlarge
+        parallelism: 1
+        steps:
+            - checkout
+            - restore_cache:
+                  keys:
+                      - v0.5-tensorflow_examples-{{ checksum "setup.py" }}
+                      - v0.5-{{ checksum "setup.py" }}
+            - run: pip install --upgrade pip
+            - run: pip install .[sklearn,tensorflow,sentencepiece,testing]
+            - run: pip install -r examples/tensorflow/_tests_requirements.txt
+            - save_cache:
+                  key: v0.5-tensorflow_examples-{{ checksum "setup.py" }}
+                  paths:
+                      - '~/.cache/pip'
+            - run: python utils/tests_fetcher.py --filters examples tests | tee test_preparation.txt
+            - store_artifacts:
+                  path: ~/transformers/test_preparation.txt
+            - run: |
+                  if [ -f test_list.txt ]; then
+                    python -m pytest -n 8 --max-worker-restart=0 --dist=loadfile -s --make-reports=examples_tensorflow ./examples/tensorflow/ | tee tests_output.txt
+                  fi
+            - store_artifacts:
+                  path: ~/transformers/tensorflow_examples_output.txt
+            - store_artifacts:
+                  path: ~/transformers/reports
+
+    run_examples_tensorflow_all:
+        working_directory: ~/transformers
+        docker:
+            - image: cimg/python:3.7.12
+        environment:
+            OMP_NUM_THREADS: 1
+            TRANSFORMERS_IS_CI: yes
+            PYTEST_TIMEOUT: 120
+        resource_class: xlarge
+        parallelism: 1
+        steps:
+            - checkout
+            - restore_cache:
+                  keys:
+                      - v0.5-tensorflow_examples-{{ checksum "setup.py" }}
+                      - v0.5-{{ checksum "setup.py" }}
+            - run: pip install --upgrade pip
+            - run: pip install .[sklearn,tensorflow,sentencepiece,testing]
+            - run: pip install -r examples/tensorflow/_tests_requirements.txt
+            - save_cache:
+                  key: v0.5-tensorflow_examples-{{ checksum "setup.py" }}
+                  paths:
+                      - '~/.cache/pip'
+            - run: |
+                  TRANSFORMERS_IS_CI=1 python -m pytest -n 8 --max-worker-restart=0 --dist=loadfile -s --make-reports=examples_tensorflow ./examples/tensorflow/ | tee examples_output.txt
+            - store_artifacts:
+                  path: ~/transformers/tensorflow_examples_output.txt
+            - store_artifacts:
+                  path: ~/transformers/reports
+
     run_examples_flax:
         working_directory: ~/transformers
         docker:
@@ -1000,6 +1065,7 @@ workflows:
             - check_code_quality
             - check_repository_consistency
             - run_examples_torch
+            - run_examples_tensorflow
             - run_examples_flax
             - run_tests_custom_tokenizers
             - run_tests_torch_and_tf
@@ -1022,6 +1088,7 @@ workflows:
                             - main
         jobs:
             - run_examples_torch_all
+            - run_examples_tensorflow_all
             - run_examples_flax_all
             - run_tests_torch_and_tf_all
             - run_tests_torch_and_flax_all

diff --git a/examples/tensorflow/_tests_requirements.txt b/examples/tensorflow/_tests_requirements.txt
@@ -0,0 +1,25 @@
+tensorflow
+tensorboard
+scikit-learn
+seqeval
+psutil
+sacrebleu >= 1.4.12
+git+https://github.com/huggingface/accelerate@main#egg=accelerate
+rouge-score
+tensorflow_datasets
+matplotlib
+git-python==1.0.3
+faiss-cpu
+streamlit
+elasticsearch
+nltk
+pandas
+datasets >= 1.13.3
+fire
+pytest
+conllu
+sentencepiece != 0.1.92
+protobuf
+jiwer
+librosa
+evaluate >= 0.2.0
diff --git a/examples/tensorflow/language-modeling/run_clm.py b/examples/tensorflow/language-modeling/run_clm.py
@@ -22,6 +22,8 @@
 """
 # You can also adapt this script on your own clm task. Pointers for this are left as comments.
 
+import json
+
 # region Imports
 import logging
 import math
@@ -46,8 +48,8 @@
     TF_MODEL_FOR_CAUSAL_LM_MAPPING,
     AutoConfig,
     AutoTokenizer,
-    DefaultDataCollator,
     HfArgumentParser,
+    PushToHubCallback,
     TFAutoModelForCausalLM,
     TFTrainingArguments,
     create_optimizer,
@@ -205,21 +207,6 @@ def __post_init__(self):
                 assert extension in ["csv", "json", "txt"], "`validation_file` should be a csv, a json or a txt file."
 
 
-# endregion
-
-# region Helper classes
-class SavePretrainedCallback(tf.keras.callbacks.Callback):
-    # Hugging Face models have a save_pretrained() method that saves both the weights and the necessary
-    # metadata to allow them to be loaded as a pretrained model in future. This is a simple Keras callback
-    # that saves the model with this method after each epoch.
-    def __init__(self, output_dir, **kwargs):
-        super().__init__()
-        self.output_dir = output_dir
-
-    def on_epoch_end(self, epoch, logs=None):
-        self.model.save_pretrained(self.output_dir)
-
-
 # endregion
 
 
@@ -299,19 +286,22 @@ def main():
         raw_datasets = load_dataset(
             data_args.dataset_name,
             data_args.dataset_config_name,
+            cache_dir=model_args.cache_dir,
             use_auth_token=True if model_args.use_auth_token else None,
         )
         if "validation" not in raw_datasets.keys():
             raw_datasets["validation"] = load_dataset(
                 data_args.dataset_name,
                 data_args.dataset_config_name,
                 split=f"train[:{data_args.validation_split_percentage}%]",
+                cache_dir=model_args.cache_dir,
                 use_auth_token=True if model_args.use_auth_token else None,
             )
             raw_datasets["train"] = load_dataset(
                 data_args.dataset_name,
                 data_args.dataset_config_name,
                 split=f"train[{data_args.validation_split_percentage}%:]",
+                cache_dir=model_args.cache_dir,
                 use_auth_token=True if model_args.use_auth_token else None,
             )
     else:
@@ -321,16 +311,39 @@ def main():
             data_files["train"] = data_args.train_file
         if data_args.validation_file is not None:
             data_files["validation"] = data_args.validation_file
-        extension = data_args.train_file.split(".")[-1]
+        extension = (
+            data_args.train_file.split(".")[-1]
+            if data_args.train_file is not None
+            else data_args.validation_file.split(".")[-1]
+        )
         if extension == "txt":
             extension = "text"
             dataset_args["keep_linebreaks"] = data_args.keep_linebreaks
         raw_datasets = load_dataset(
             extension,
             data_files=data_files,
+            cache_dir=model_args.cache_dir,
             use_auth_token=True if model_args.use_auth_token else None,
             **dataset_args,
         )
+        # If no validation data is there, validation_split_percentage will be used to divide the dataset.
+        if "validation" not in raw_datasets.keys():
+            raw_datasets["validation"] = load_dataset(
+                extension,
+                data_files=data_files,
+                split=f"train[:{data_args.validation_split_percentage}%]",
+                cache_dir=model_args.cache_dir,
+                use_auth_token=True if model_args.use_auth_token else None,
+                **dataset_args,
+            )
+            raw_datasets["train"] = load_dataset(
+                extension,
+                data_files=data_files,
+                split=f"train[{data_args.validation_split_percentage}%:]",
+                cache_dir=model_args.cache_dir,
+                use_auth_token=True if model_args.use_auth_token else None,
+                **dataset_args,
+            )
     # See more about loading any type of standard or custom dataset (from files, python dict, pandas DataFrame, etc) at
     # https://huggingface.co/docs/datasets/loading_datasets.html.
     # endregion
@@ -446,7 +459,7 @@ def group_texts(examples):
         eval_dataset = eval_dataset.select(range(max_eval_samples))
 
     # Log a few random samples from the training set:
-    for index in random.sample(range(len(train_dataset)), 3):
+    for index in random.sample(range(len(train_dataset)), min(3, len(train_dataset))):
         logger.info(f"Sample {index} of the training set: {train_dataset[index]}.")
     # endregion
 
@@ -465,44 +478,88 @@ def group_texts(examples):
 
         # region TF Dataset preparation
         num_replicas = training_args.strategy.num_replicas_in_sync
-        data_collator = DefaultDataCollator(return_tensors="tf")
         options = tf.data.Options()
         options.experimental_distribute.auto_shard_policy = tf.data.experimental.AutoShardPolicy.OFF
 
-        tf_train_dataset = train_dataset.to_tf_dataset(
-            # labels are passed as input, as we will use the model's internal loss
-            columns=[col for col in train_dataset.features if col != "special_tokens_mask"],
+        # model.prepare_tf_dataset() wraps a Hugging Face dataset in a tf.data.Dataset which is ready to use in
+        # training. This is the recommended way to use a Hugging Face dataset when training with Keras. You can also
+        # use the lower-level dataset.to_tf_dataset() method, but you will have to specify things like column names
+        # yourself if you use this method, whereas they are automatically inferred from the model input names when
+        # using model.prepare_tf_dataset()
+        # For more info see the docs:
+        # https://huggingface.co/docs/transformers/main/en/main_classes/model#transformers.TFPreTrainedModel.prepare_tf_dataset
+        # https://huggingface.co/docs/datasets/main/en/package_reference/main_classes#datasets.Dataset.to_tf_dataset
+
+        tf_train_dataset = model.prepare_tf_dataset(
+            train_dataset,
             shuffle=True,
             batch_size=num_replicas * training_args.per_device_train_batch_size,
-            collate_fn=data_collator,
-            drop_remainder=True,
         ).with_options(options)
 
-        tf_eval_dataset = eval_dataset.to_tf_dataset(
-            # labels are passed as input, as we will use the model's internal loss
-            columns=[col for col in eval_dataset.features if col != "special_tokens_mask"],
+        tf_eval_dataset = model.prepare_tf_dataset(
+            eval_dataset,
             shuffle=False,
-            batch_size=num_replicas * training_args.per_device_train_batch_size,
-            collate_fn=data_collator,
+            batch_size=num_replicas * training_args.per_device_eval_batch_size,
             drop_remainder=True,
         ).with_options(options)
         # endregion
 
         # region Optimizer and loss
-        batches_per_epoch = len(train_dataset) // (num_replicas * training_args.per_device_train_batch_size)
+        num_train_steps = len(tf_train_dataset) * int(training_args.num_train_epochs)
+        if training_args.warmup_steps > 0:
+            num_warmup_steps = training_args.warmup_steps
+        elif training_args.warmup_ratio > 0:
+            num_warmup_steps = int(num_train_steps * training_args.warmup_ratio)
+        else:
+            num_warmup_steps = 0
+
         # Bias and layernorm weights are automatically excluded from the decay
         optimizer, lr_schedule = create_optimizer(
             init_lr=training_args.learning_rate,
-            num_train_steps=int(training_args.num_train_epochs * batches_per_epoch),
-            num_warmup_steps=training_args.warmup_steps,
+            num_train_steps=num_train_steps,
+            num_warmup_steps=num_warmup_steps,
             adam_beta1=training_args.adam_beta1,
             adam_beta2=training_args.adam_beta2,
             adam_epsilon=training_args.adam_epsilon,
             weight_decay_rate=training_args.weight_decay,
+            adam_global_clipnorm=training_args.max_grad_norm,
         )
 
         # no user-specified loss = will use the model internal loss
-        model.compile(optimizer=optimizer)
+        model.compile(optimizer=optimizer, jit_compile=training_args.xla)
+        # endregion
+
+        # region Preparing push_to_hub and model card
+        push_to_hub_model_id = training_args.push_to_hub_model_id
+        model_name = model_args.model_name_or_path.split("/")[-1]
+        if not push_to_hub_model_id:
+            if data_args.dataset_name is not None:
+                push_to_hub_model_id = f"{model_name}-finetuned-{data_args.dataset_name}"
+            else:
+                push_to_hub_model_id = f"{model_name}-finetuned-clm"
+
+        model_card_kwargs = {"finetuned_from": model_args.model_name_or_path, "tasks": "text-generation"}
+        if data_args.dataset_name is not None:
+            model_card_kwargs["dataset_tags"] = data_args.dataset_name
+            if data_args.dataset_config_name is not None:
+                model_card_kwargs["dataset_args"] = data_args.dataset_config_name
+                model_card_kwargs["dataset"] = f"{data_args.dataset_name} {data_args.dataset_config_name}"
+            else:
+                model_card_kwargs["dataset"] = data_args.dataset_name
+
+        if training_args.push_to_hub:
+            callbacks = [
+                PushToHubCallback(
+                    output_dir=training_args.output_dir,
+                    model_id=push_to_hub_model_id,
+                    organization=training_args.push_to_hub_organization,
+                    token=training_args.push_to_hub_token,
+                    tokenizer=tokenizer,
+                    **model_card_kwargs,
+                )
+            ]
+        else:
+            callbacks = []
         # endregion
 
         # region Training and validation
@@ -512,33 +569,45 @@ def group_texts(examples):
         logger.info(f"  Instantaneous batch size per device = {training_args.per_device_train_batch_size}")
         logger.info(f"  Total train batch size = {training_args.per_device_train_batch_size * num_replicas}")
 
+        # For long training runs, you may wish to use the PushToHub() callback here to save intermediate checkpoints
+        # to the Hugging Face Hub rather than just pushing the finished model.
+        # See https://huggingface.co/docs/transformers/main_classes/keras_callbacks#transformers.PushToHubCallback
+
         history = model.fit(
             tf_train_dataset,
             validation_data=tf_eval_dataset,
             epochs=int(training_args.num_train_epochs),
-            steps_per_epoch=len(train_dataset) // (training_args.per_device_train_batch_size * num_replicas),
-            callbacks=[SavePretrainedCallback(output_dir=training_args.output_dir)],
+            callbacks=callbacks,
         )
+        train_loss = history.history["loss"][-1]
         try:
-            train_perplexity = math.exp(history.history["loss"][-1])
+            train_perplexity = math.exp(train_loss)
         except OverflowError:
             train_perplexity = math.inf
+        logger.info(f"  Final train loss: {train_loss:.3f}")
+        logger.info(f"  Final train perplexity: {train_perplexity:.3f}")
+        validation_loss = history.history["val_loss"][-1]
         try:
-            validation_perplexity = math.exp(history.history["val_loss"][-1])
+            validation_perplexity = math.exp(validation_loss)
         except OverflowError:
             validation_perplexity = math.inf
-        logger.info(f"  Final train loss: {history.history['loss'][-1]:.3f}")
-        logger.info(f"  Final train perplexity: {train_perplexity:.3f}")
-        logger.info(f"  Final validation loss: {history.history['val_loss'][-1]:.3f}")
+        logger.info(f"  Final validation loss: {validation_loss:.3f}")
         logger.info(f"  Final validation perplexity: {validation_perplexity:.3f}")
-        # endregion
 
         if training_args.output_dir is not None:
-            model.save_pretrained(training_args.output_dir)
+            output_eval_file = os.path.join(training_args.output_dir, "all_results.json")
+            results_dict = dict()
+            results_dict["train_loss"] = train_loss
+            results_dict["train_perplexity"] = train_perplexity
+            results_dict["eval_loss"] = validation_loss
+            results_dict["eval_perplexity"] = validation_perplexity
+            with open(output_eval_file, "w") as writer:
+                writer.write(json.dumps(results_dict))
+        # endregion
 
-    if training_args.push_to_hub:
-        # You'll probably want to include some of your own metadata here!
-        model.push_to_hub()
+    if training_args.output_dir is not None and not training_args.push_to_hub:
+        # If we're not pushing to hub, at least save a local copy when we're done
+        model.save_pretrained(training_args.output_dir)
 
 
 if __name__ == "__main__":