[Trainer] Allow passing image processor (#29896)

* Add image processor to trainer * Replace tokenizer=image_processor everywhere
huggingface · Apr 22, 2024 · 228d0e5 · 228d0e5
1 parent 060dbfd
commit 228d0e5
Show file tree

Hide file tree

Showing 21 changed files with 43 additions and 26 deletions.
diff --git a/docs/source/en/tasks/image_classification.md b/docs/source/en/tasks/image_classification.md
@@ -322,7 +322,7 @@ At this point, only three steps remain:
 ...     data_collator=data_collator,
 ...     train_dataset=food["train"],
 ...     eval_dataset=food["test"],
-...     tokenizer=image_processor,
+...     image_processor=image_processor,
 ...     compute_metrics=compute_metrics,
 ... )
 
@@ -418,7 +418,7 @@ and use the [PushToHubCallback](../main_classes/keras_callbacks#transformers.Pus
 >>> metric_callback = KerasMetricCallback(metric_fn=compute_metrics, eval_dataset=tf_eval_dataset)
 >>> push_to_hub_callback = PushToHubCallback(
 ...     output_dir="food_classifier",
-...     tokenizer=image_processor,
+...     image_processor=image_processor,
 ...     save_strategy="no",
 ... )
 >>> callbacks = [metric_callback, push_to_hub_callback]

diff --git a/docs/source/en/tasks/object_detection.md b/docs/source/en/tasks/object_detection.md
@@ -384,7 +384,7 @@ Finally, bring everything together, and call [`~transformers.Trainer.train`]:
 ...     args=training_args,
 ...     data_collator=collate_fn,
 ...     train_dataset=cppe5["train"],
-...     tokenizer=image_processor,
+...     image_processor=image_processor,
 ... )
 
 >>> trainer.train()

diff --git a/docs/source/en/tasks/semantic_segmentation.md b/docs/source/en/tasks/semantic_segmentation.md
@@ -642,7 +642,7 @@ and use the [`PushToHubCallback`] to upload the model:
 ...     metric_fn=compute_metrics, eval_dataset=tf_eval_dataset, batch_size=batch_size, label_cols=["labels"]
 ... )
 
->>> push_to_hub_callback = PushToHubCallback(output_dir="scene_segmentation", tokenizer=image_processor)
+>>> push_to_hub_callback = PushToHubCallback(output_dir="scene_segmentation", image_processor=image_processor)
 
 >>> callbacks = [metric_callback, push_to_hub_callback]
 ```

diff --git a/docs/source/en/tasks/video_classification.md b/docs/source/en/tasks/video_classification.md
@@ -407,7 +407,7 @@ Then you just pass all of this along with the datasets to `Trainer`:
 ...     args,
 ...     train_dataset=train_dataset,
 ...     eval_dataset=val_dataset,
-...     tokenizer=image_processor,
+...     image_processor=image_processor,
 ...     compute_metrics=compute_metrics,
 ...     data_collator=collate_fn,
 ... )

diff --git a/docs/source/es/tasks/image_classification.md b/docs/source/es/tasks/image_classification.md
@@ -160,7 +160,7 @@ Al llegar a este punto, solo quedan tres pasos:
 ...     data_collator=data_collator,
 ...     train_dataset=food["train"],
 ...     eval_dataset=food["test"],
-...     tokenizer=image_processor,
+...     image_processor=image_processor,
 ... )
 
 >>> trainer.train()

diff --git a/docs/source/ja/tasks/image_classification.md b/docs/source/ja/tasks/image_classification.md
@@ -328,7 +328,7 @@ food["test"].set_transform(preprocess_val)
 ...     data_collator=data_collator,
 ...     train_dataset=food["train"],
 ...     eval_dataset=food["test"],
-...     tokenizer=image_processor,
+...     image_processor=image_processor,
 ...     compute_metrics=compute_metrics,
 ... )
 
@@ -426,7 +426,7 @@ Convert your datasets to the `tf.data.Dataset` format using the [`~datasets.Data
 >>> metric_callback = KerasMetricCallback(metric_fn=compute_metrics, eval_dataset=tf_eval_dataset)
 >>> push_to_hub_callback = PushToHubCallback(
 ...     output_dir="food_classifier",
-...     tokenizer=image_processor,
+...     image_processor=image_processor,
 ...     save_strategy="no",
 ... )
 >>> callbacks = [metric_callback, push_to_hub_callback]

diff --git a/docs/source/ja/tasks/object_detection.md b/docs/source/ja/tasks/object_detection.md
@@ -376,7 +376,7 @@ DETR モデルをトレーニングできる「ラベル」。画像プロセッ
 ...     args=training_args,
 ...     data_collator=collate_fn,
 ...     train_dataset=cppe5["train"],
-...     tokenizer=image_processor,
+...     image_processor=image_processor,
 ... )
 
 >>> trainer.train()

diff --git a/docs/source/ja/tasks/semantic_segmentation.md b/docs/source/ja/tasks/semantic_segmentation.md
@@ -434,7 +434,7 @@ TensorFlow でモデルを微調整するには、次の手順に従います。
 ...     metric_fn=compute_metrics, eval_dataset=tf_eval_dataset, batch_size=batch_size, label_cols=["labels"]
 ... )
 
->>> push_to_hub_callback = PushToHubCallback(output_dir="scene_segmentation", tokenizer=image_processor)
+>>> push_to_hub_callback = PushToHubCallback(output_dir="scene_segmentation", image_processor=image_processor)
 
 >>> callbacks = [metric_callback, push_to_hub_callback]
 ```

diff --git a/docs/source/ja/tasks/sequence_classification.md b/docs/source/ja/tasks/sequence_classification.md
@@ -436,7 +436,7 @@ TensorFlow でモデルを微調整するには、次の手順に従います。
 ...     metric_fn=compute_metrics, eval_dataset=tf_eval_dataset, batch_size=batch_size, label_cols=["labels"]
 ... )
 
->>> push_to_hub_callback = PushToHubCallback(output_dir="scene_segmentation", tokenizer=image_processor)
+>>> push_to_hub_callback = PushToHubCallback(output_dir="scene_segmentation", image_processor=image_processor)
 
 >>> callbacks = [metric_callback, push_to_hub_callback]
 ```

diff --git a/docs/source/ja/tasks/video_classification.md b/docs/source/ja/tasks/video_classification.md
@@ -414,7 +414,7 @@ def compute_metrics(eval_pred):
 ...     args,
 ...     train_dataset=train_dataset,
 ...     eval_dataset=val_dataset,
-...     tokenizer=image_processor,
+...     image_processor=image_processor,
 ...     compute_metrics=compute_metrics,
 ...     data_collator=collate_fn,
 ... )

diff --git a/docs/source/ko/tasks/image_classification.md b/docs/source/ko/tasks/image_classification.md
@@ -321,7 +321,7 @@ food["test"].set_transform(preprocess_val)
 ...     data_collator=data_collator,
 ...     train_dataset=food["train"],
 ...     eval_dataset=food["test"],
-...     tokenizer=image_processor,
+...     image_processor=image_processor,
 ...     compute_metrics=compute_metrics,
 ... )
 
@@ -417,7 +417,7 @@ TensorFlow에서 모델을 미세 조정하려면 다음 단계를 따르세요:
 >>> metric_callback = KerasMetricCallback(metric_fn=compute_metrics, eval_dataset=tf_eval_dataset)
 >>> push_to_hub_callback = PushToHubCallback(
 ...     output_dir="food_classifier",
-...     tokenizer=image_processor,
+...     image_processor=image_processor,
 ...     save_strategy="no",
 ... )
 >>> callbacks = [metric_callback, push_to_hub_callback]

diff --git a/docs/source/ko/tasks/object_detection.md b/docs/source/ko/tasks/object_detection.md
@@ -366,7 +366,7 @@ DatasetDict({
 ...     args=training_args,
 ...     data_collator=collate_fn,
 ...     train_dataset=cppe5["train"],
-...     tokenizer=image_processor,
+...     image_processor=image_processor,
 ... )
 
 >>> trainer.train()

diff --git a/docs/source/ko/tasks/semantic_segmentation.md b/docs/source/ko/tasks/semantic_segmentation.md
@@ -424,7 +424,7 @@ TensorFlow에서 모델을 미세 조정하려면 다음 단계를 따르세요:
 ...     metric_fn=compute_metrics, eval_dataset=tf_eval_dataset, batch_size=batch_size, label_cols=["labels"]
 ... )
 
->>> push_to_hub_callback = PushToHubCallback(output_dir="scene_segmentation", tokenizer=image_processor)
+>>> push_to_hub_callback = PushToHubCallback(output_dir="scene_segmentation", image_processor=image_processor)
 
 >>> callbacks = [metric_callback, push_to_hub_callback]
 ```

diff --git a/docs/source/ko/tasks/video_classification.md b/docs/source/ko/tasks/video_classification.md
@@ -411,7 +411,7 @@ def compute_metrics(eval_pred):
 ...     args,
 ...     train_dataset=train_dataset,
 ...     eval_dataset=val_dataset,
-...     tokenizer=image_processor,
+...     image_processor=image_processor,
 ...     compute_metrics=compute_metrics,
 ...     data_collator=collate_fn,
 ... )

diff --git a/examples/pytorch/image-classification/run_image_classification.py b/examples/pytorch/image-classification/run_image_classification.py
@@ -411,7 +411,7 @@ def val_transforms(example_batch):
         train_dataset=dataset["train"] if training_args.do_train else None,
         eval_dataset=dataset["validation"] if training_args.do_eval else None,
         compute_metrics=compute_metrics,
-        tokenizer=image_processor,
+        image_processor=image_processor,
         data_collator=collate_fn,
     )
 

diff --git a/examples/pytorch/image-pretraining/run_mae.py b/examples/pytorch/image-pretraining/run_mae.py
@@ -369,7 +369,7 @@ def preprocess_images(examples):
         args=training_args,
         train_dataset=ds["train"] if training_args.do_train else None,
         eval_dataset=ds["validation"] if training_args.do_eval else None,
-        tokenizer=image_processor,
+        image_processor=image_processor,
         data_collator=collate_fn,
     )
 

diff --git a/examples/pytorch/image-pretraining/run_mim.py b/examples/pytorch/image-pretraining/run_mim.py
@@ -458,7 +458,7 @@ def preprocess_images(examples):
         args=training_args,
         train_dataset=ds["train"] if training_args.do_train else None,
         eval_dataset=ds["validation"] if training_args.do_eval else None,
-        tokenizer=image_processor,
+        image_processor=image_processor,
         data_collator=collate_fn,
     )
 

diff --git a/examples/pytorch/semantic-segmentation/run_semantic_segmentation.py b/examples/pytorch/semantic-segmentation/run_semantic_segmentation.py
@@ -510,7 +510,7 @@ def preprocess_val(example_batch):
         train_dataset=dataset["train"] if training_args.do_train else None,
         eval_dataset=dataset["validation"] if training_args.do_eval else None,
         compute_metrics=compute_metrics,
-        tokenizer=image_processor,
+        image_processor=image_processor,
         data_collator=default_data_collator,
     )
 

diff --git a/examples/tensorflow/image-classification/run_image_classification.py b/examples/tensorflow/image-classification/run_image_classification.py
@@ -552,7 +552,7 @@ def compute_metrics(p):
                     output_dir=training_args.output_dir,
                     hub_model_id=push_to_hub_model_id,
                     hub_token=training_args.push_to_hub_token,
-                    tokenizer=image_processor,
+                    image_processor=image_processor,
                     **model_card_kwargs,
                 )
             )

diff --git a/src/transformers/trainer.py b/src/transformers/trainer.py
@@ -59,6 +59,7 @@
 from .data.data_collator import DataCollator, DataCollatorWithPadding, default_data_collator
 from .debug_utils import DebugOption, DebugUnderflowOverflow
 from .hyperparameter_search import ALL_HYPERPARAMETER_SEARCH_BACKENDS, default_hp_search_backend
+from .image_processing_utils import BaseImageProcessor
 from .integrations.deepspeed import deepspeed_init, deepspeed_load_checkpoint, is_deepspeed_available
 from .integrations.tpu import tpu_spmd_dataloader
 from .modelcard import TrainingSummary
@@ -303,6 +304,9 @@ class Trainer:
             The tokenizer used to preprocess the data. If provided, will be used to automatically pad the inputs to the
             maximum length when batching inputs, and it will be saved along the model to make it easier to rerun an
             interrupted training or reuse the fine-tuned model.
+        image_processor ([`BaseImageProcessor`], *optional*):
+            The image processor used to preprocess the data. If provided, it will be saved along the model to make it easier
+            to rerun an interrupted training or reuse the fine-tuned model.
         model_init (`Callable[[], PreTrainedModel]`, *optional*):
             A function that instantiates the model to be used. If provided, each call to [`~Trainer.train`] will start
             from a new instance of the model as given by this function.
@@ -357,6 +361,7 @@ def __init__(
         train_dataset: Optional[Union[Dataset, IterableDataset]] = None,
         eval_dataset: Optional[Union[Dataset, Dict[str, Dataset]]] = None,
         tokenizer: Optional[PreTrainedTokenizerBase] = None,
+        image_processor: Optional["BaseImageProcessor"] = None,
         model_init: Optional[Callable[[], PreTrainedModel]] = None,
         compute_metrics: Optional[Callable[[EvalPrediction], Dict]] = None,
         callbacks: Optional[List[TrainerCallback]] = None,
@@ -485,11 +490,12 @@ def __init__(
         ):
             self.place_model_on_device = False
 
-        default_collator = default_data_collator if tokenizer is None else DataCollatorWithPadding(tokenizer)
+        default_collator = DataCollatorWithPadding(tokenizer) if tokenizer is not None else default_data_collator
         self.data_collator = data_collator if data_collator is not None else default_collator
         self.train_dataset = train_dataset
         self.eval_dataset = eval_dataset
         self.tokenizer = tokenizer
+        self.image_processor = image_processor
 
         # Bnb Quantized models doesn't support `.to` operation.
         if (
@@ -541,7 +547,7 @@ def __init__(
         default_callbacks = DEFAULT_CALLBACKS + get_reporting_integration_callbacks(self.args.report_to)
         callbacks = default_callbacks if callbacks is None else default_callbacks + callbacks
         self.callback_handler = CallbackHandler(
-            callbacks, self.model, self.tokenizer, self.optimizer, self.lr_scheduler
+            callbacks, self.model, self.tokenizer, self.image_processor, self.optimizer, self.lr_scheduler
         )
         self.add_callback(PrinterCallback if self.args.disable_tqdm else DEFAULT_PROGRESS_CALLBACK)
 
@@ -3276,6 +3282,8 @@ def _save_tpu(self, output_dir: Optional[str] = None):
             )
         if self.tokenizer is not None and self.args.should_save:
             self.tokenizer.save_pretrained(output_dir)
+        if self.image_processor is not None and self.args.should_save:
+            self.image_processor.save_pretrained(output_dir)
 
         # We moved the model from TPU -> CPU for saving the weights.
         # Now we should move it back to subsequent compute still works.
@@ -3313,6 +3321,8 @@ def _save(self, output_dir: Optional[str] = None, state_dict=None):
 
         if self.tokenizer is not None:
             self.tokenizer.save_pretrained(output_dir)
+        if self.image_processor is not None:
+            self.image_processor.save_pretrained(output_dir)
 
         # Good practice: save your training arguments together with the trained model
         torch.save(self.args, os.path.join(output_dir, TRAINING_ARGS_NAME))
@@ -4009,6 +4019,9 @@ def _push_from_checkpoint(self, checkpoint_folder):
         # Saving the tokenizer is fast and we don't know how many files it may have spawned, so we resave it to be sure.
         if self.tokenizer is not None:
             self.tokenizer.save_pretrained(output_dir)
+        # Same for the image processor
+        if self.image_processor is not None:
+            self.image_processor.save_pretrained(output_dir)
         # Same for the training arguments
         torch.save(self.args, os.path.join(output_dir, TRAINING_ARGS_NAME))
 
@@ -4056,7 +4069,7 @@ def _finish_current_push(self):
 
     def push_to_hub(self, commit_message: Optional[str] = "End of training", blocking: bool = True, **kwargs) -> str:
         """
-        Upload `self.model` and `self.tokenizer` to the 🤗 model hub on the repo `self.args.hub_model_id`.
+        Upload `self.model` and `self.tokenizer` or `self.image_processor` to the 🤗 model hub on the repo `self.args.hub_model_id`.
 
         Parameters:
             commit_message (`str`, *optional*, defaults to `"End of training"`):

diff --git a/src/transformers/trainer_callback.py b/src/transformers/trainer_callback.py
@@ -189,6 +189,8 @@ class TrainerCallback:
             The model being trained.
         tokenizer ([`PreTrainedTokenizer`]):
             The tokenizer used for encoding the data.
+        image_processor ([`BaseImageProcessor`]):
+            The image processor used for encoding the images.
         optimizer (`torch.optim.Optimizer`):
             The optimizer used for the training steps.
         lr_scheduler (`torch.optim.lr_scheduler.LambdaLR`):
@@ -307,12 +309,13 @@ def on_prediction_step(self, args: TrainingArguments, state: TrainerState, contr
 class CallbackHandler(TrainerCallback):
     """Internal class that just calls the list of callbacks in order."""
 
-    def __init__(self, callbacks, model, tokenizer, optimizer, lr_scheduler):
+    def __init__(self, callbacks, model, tokenizer, image_processor, optimizer, lr_scheduler):
         self.callbacks = []
         for cb in callbacks:
             self.add_callback(cb)
         self.model = model
         self.tokenizer = tokenizer
+        self.image_processor = image_processor
         self.optimizer = optimizer
         self.lr_scheduler = lr_scheduler
         self.train_dataloader = None
@@ -417,6 +420,7 @@ def call_event(self, event, args, state, control, **kwargs):
                 control,
                 model=self.model,
                 tokenizer=self.tokenizer,
+                image_processor=self.image_processor,
                 optimizer=self.optimizer,
                 lr_scheduler=self.lr_scheduler,
                 train_dataloader=self.train_dataloader,