
Compute Loss inside the training step. #686

Merged
101 changes: 81 additions & 20 deletions optimum/onnxruntime/trainer.py
@@ -20,6 +20,7 @@
import shutil
import sys
import time
import types
import warnings
from pathlib import Path
from typing import TYPE_CHECKING, Any, Callable, Dict, List, Optional, Tuple, Type, Union
@@ -134,6 +135,32 @@
SCALER_NAME = "scaler.pt"


class ModuleWithLoss(nn.Module):
Reviewer comment (Collaborator): Since ModuleWithLoss is a wrapper for a torch.nn.Module subclass, can you add a `module` property so that `unwrap_model` stays compatible? I believe ORTModule does the same.

def __init__(self, model, args, label_smoother):
super().__init__()
self._original_model = model
self.args = args
# Label smoothing
self.label_smoother = label_smoother

def forward(self, inputs: Dict[str, Union[torch.Tensor, Any]], return_outputs):
# compute_model_plus_loss_internal is assigned when the class is instantiated.
# It should have the same signature as Trainer.compute_loss().
# We do this to avoid potentially out-of-sync state that duplicating the compute-loss code could introduce.
return self.compute_model_plus_loss_internal(self._original_model, inputs, return_outputs)

@property
def module(self):
"""The original `torch.nn.Module` that this module wraps.
This property provides access to methods and properties on the original module."""

return self._original_model.module

@property
def config(self):
return self._original_model.config


class ORTFeaturesManager:
_TASKS_TO_ORTMODELS = {
"default": ORTModelForFeatureExtraction,
Expand Down Expand Up @@ -279,12 +306,55 @@ def __init__(
preprocess_logits_for_metrics=preprocess_logits_for_metrics,
)

# We use both a training model and an inference model in conjunction with `model`.
# `_training_model` is wrapped so it runs with ORT and uses the overridden functions in ModuleWithLoss.
# `self.model` keeps the default version of the model and is restored (unwrapped) for eval/test.

# Only wrap the model if the --use_module_with_loss flag is passed.
if args.use_module_with_loss:
self._training_model = self.create_model_with_loss()

self.model = model

self.feature = feature
self.onnx_model_path = onnx_model_path
self.exported_with_loss = False
if self.args.local_rank:
torch.cuda.set_device(self.args.local_rank)

# This method creates a ModuleWithLoss instance to use when the --use_module_with_loss flag is passed.
# It helps reduce peak memory usage by computing the loss inside the training step.
def create_model_with_loss(self):
model_with_loss = ModuleWithLoss(self.model, self.args, self.label_smoother)
model_with_loss.compute_model_plus_loss_internal = types.MethodType(Trainer.compute_loss, model_with_loss)

return model_with_loss

# We assume that the training model and the inference model have the same forward signature columns.
# The self._signature_columns attribute only stores the signature parsed the first time.
def _set_signature_columns_if_needed(self):
if self._signature_columns is None:
# Inspect model forward signature to keep only the arguments it accepts.
import inspect

if isinstance(self.model, ModuleWithLoss):
signature = inspect.signature(self.model._original_model.forward)
else:
signature = inspect.signature(self.model.forward)

self._signature_columns = list(signature.parameters.keys())
# Labels may be named label or label_ids; the default data collator handles that.
self._signature_columns += list(set(["label", "label_ids"] + self.label_names))

def compute_loss(self, model_with_loss, inputs, return_outputs=False):
# Run model forward + loss compute.
if isinstance(self.model, ModuleWithLoss):
# ORTModule does not support the BatchEncoding type, so we have to convert it to a dict.
dict_inputs = dict(inputs.items())
return model_with_loss(dict_inputs, return_outputs)
else:
return super().compute_loss(model_with_loss, inputs, return_outputs)

def train(
self,
resume_from_checkpoint: Optional[Union[str, bool]] = None,
@@ -313,6 +383,8 @@ def train(
"You need to install `onnxruntime-training` to use `ORTTrainer` for training. Check out "
"https://huggingface.co/docs/optimum/onnxruntime/usage_guides/trainer#install-onnx-runtime."
)
if self.args.use_module_with_loss:
self.model = self._training_model

if resume_from_checkpoint is False:
resume_from_checkpoint = None
@@ -801,6 +873,8 @@ def evaluate(
dictionary also contains the epoch number which comes from the training state.
"""
# memory metrics - must set up as early as possible
# TODO: We need to enable evaluation using ORT backend.
Reviewer comment: This TODO is only meant for the --loss_in_train flag, right?

self.model = unwrap_model(self.model)
self._memory_tracker.start()

eval_dataloader = self.get_eval_dataloader(eval_dataset)
@@ -892,6 +966,9 @@ def predict(
- metrics (`Dict[str, float]`, *optional*): The potential dictionary of metrics (if the dataset contained
labels).
"""
# TODO: We need to enable evaluation using ORT backend.
self.model = unwrap_model(self.model)

# memory metrics - must set up as early as possible
self._memory_tracker.start()

@@ -909,10 +986,7 @@

try:
output = eval_loop(
test_dataloader, description="Prediction", ignore_keys=ignore_keys, metric_key_prefix=metric_key_prefix
)
except Exception as error:
logger.error(error)
@@ -1522,13 +1596,7 @@ def _export(
opset = max(opset, 12) # Operators like `nll_loss`are added for opset>=12

output_path = model_path / ONNX_WEIGHTS_NAME
_ = export(model=model, config=onnx_config, opset=opset, output=output_path, device=device)

model.config.save_pretrained(model_path)

@@ -1697,11 +1765,7 @@ def create_optimizer(self):
optimizer_cls, optimizer_kwargs = Trainer.get_optimizer_cls_and_kwargs(self.args)

if self.sharded_ddp == ShardedDDPOption.SIMPLE:
self.optimizer = OSS(params=optimizer_grouped_parameters, optim=optimizer_cls, **optimizer_kwargs)
else:
self.optimizer = optimizer_cls(optimizer_grouped_parameters, **optimizer_kwargs)
if optimizer_cls.__name__ == "Adam8bit":
@@ -1731,10 +1795,7 @@ def get_ort_optimizer_cls_and_kwargs(args: ORTTrainingArguments) -> Tuple[Any, A
The training arguments for the training session.
"""
optimizer_kwargs = {"lr": args.learning_rate}
adam_kwargs = {"betas": (args.adam_beta1, args.adam_beta2), "eps": args.adam_epsilon}
if args.optim == ORTOptimizerNames.ADAMW_ORT_FUSED:
try:
from onnxruntime.training.optim import FusedAdam
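As background for the create_model_with_loss helper above, here is a minimal, self-contained sketch (not the PR's code; the class and function names are made up for illustration) of what types.MethodType does: it binds a free function to one specific instance, which is how Trainer.compute_loss becomes compute_model_plus_loss_internal on the ModuleWithLoss wrapper without duplicating the loss logic.

import types

class Host:
    def __init__(self, value):
        self.value = value

def compute(self, x):
    # Runs with the Host instance as `self`, analogous to Trainer.compute_loss
    # running with the ModuleWithLoss instance it is bound to.
    return self.value + x

host = Host(10)
host.compute = types.MethodType(compute, host)  # bound to this instance only
print(host.compute(5))  # prints 15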
17 changes: 14 additions & 3 deletions optimum/onnxruntime/training_args.py
@@ -60,9 +60,13 @@ class ORTTrainingArguments(TrainingArguments):
The optimizer to use, including optimizers in Transformers: adamw_hf, adamw_torch, adamw_apex_fused, or adafactor, as well as optimizers implemented by ONNX Runtime: adamw_ort_fused.
"""

optim: Optional[str] = field(default="adamw_hf", metadata={"help": "The optimizer to use."})

use_module_with_loss: Optional[bool] = field(
default=False,
metadata={
"help": "Use ModuleWithLoss Wrapper to compute loss inside the training loop, having this will help save memory for ORTModule Runs."
},
)

# This method will not need to be overridden after the deprecation of `--adafactor` in version 5 of 🤗 Transformers.
@@ -336,3 +340,10 @@ def __post_init__(self):
f"{self.hub_model_id}).",
FutureWarning,
)
if self.use_module_with_loss:
logger.info(
"Using the ModuleWithLoss wrapper. "
"The loss will be computed inside the training step, which reduces peak memory usage."
)
else:
logger.info("Not using the ModuleWithLoss wrapper.")