From 69c6f111be5b243f92d8d480350bd06355eb389c Mon Sep 17 00:00:00 2001 From: adamlouly Date: Mon, 9 Jan 2023 17:22:42 +0000 Subject: [PATCH 01/16] improved solutio --- optimum/onnxruntime/trainer.py | 111 ++++++++++++++++++++++++++++++--- 1 file changed, 102 insertions(+), 9 deletions(-) diff --git a/optimum/onnxruntime/trainer.py b/optimum/onnxruntime/trainer.py index edc4181dc1..3164e4f746 100644 --- a/optimum/onnxruntime/trainer.py +++ b/optimum/onnxruntime/trainer.py @@ -129,6 +129,54 @@ SCALER_NAME = "scaler.pt" +class ModuleWithLoss(nn.Module): + def __init__(self, model, args) -> None: + super().__init__() + self._original_model = model + self.args = args + # Label smoothing + if self.args.label_smoothing_factor != 0: + from transformers.trainer_pt_utils import LabelSmoother + + self.label_smoother = LabelSmoother(epsilon=self.args.label_smoothing_factor) + else: + self.label_smoother = None + + def forward(self, inputs: Dict[str, Union[torch.Tensor, Any]], return_outputs): + if self.label_smoother is not None and "labels" in inputs: + labels = inputs.pop("labels") + else: + labels = None + outputs = self._original_model(**inputs) + + # Save past state if it exists + # TODO: this needs to be fixed and made cleaner later. + if self.args.past_index >= 0: + self._past = outputs[self.args.past_index] + + if labels is not None: + from transformers.models.auto.modeling_auto import MODEL_FOR_CAUSAL_LM_MAPPING_NAMES + + if unwrap_model(self._original_model)._get_name() in MODEL_FOR_CAUSAL_LM_MAPPING_NAMES.values(): + loss = self.label_smoother(outputs, labels, shift_labels=True) + else: + loss = self.label_smoother(outputs, labels) + else: + if isinstance(outputs, dict) and "loss" not in outputs: + raise ValueError( + "The model did not return a loss from the inputs, only the following keys: " + f"{','.join(outputs.keys())}. For reference, the inputs it received are {','.join(inputs.keys())}." + ) + # We don't use .loss here since the model may return tuples instead of ModelOutput. + loss = outputs["loss"] if isinstance(outputs, dict) else outputs[0] + + return (loss, outputs) if return_outputs else loss + + @property + def config(self): + return self._original_model.config + + class ORTFeaturesManager: _TASKS_TO_ORTMODELS = { "default": ORTModelForFeatureExtraction, @@ -259,13 +307,62 @@ def __init__( optimizers=optimizers, preprocess_logits_for_metrics=preprocess_logits_for_metrics, ) - + self._training_model = ModuleWithLoss(model, args) + self.model = model + self._inferencing_model = model self.feature = feature self.onnx_model_path = onnx_model_path self.exported_with_loss = False if self.args.local_rank: torch.cuda.set_device(self.args.local_rank) + def _set_signature_columns_if_needed(self): + if self._signature_columns is None: + # Inspect model forward signature to keep only the arguments it accepts. + import inspect + + if self.model == self._training_model: + signature = inspect.signature(self.model._original_model.forward) + else: + signature = inspect.signature(self.model.forward) + + self._signature_columns = list(signature.parameters.keys()) + # Labels may be named label or label_ids, the default data collator handles that. + self._signature_columns += list(set(["label", "label_ids"] + self.label_names)) + + def compute_loss(self, model_with_loss, inputs, return_outputs=False): + # Run model forward + loss compute. 
+ if self.model == self._training_model: + outputs = model_with_loss(inputs, return_outputs) + return outputs + else: + + if self.label_smoother is not None and "labels" in inputs: + labels = inputs.pop("labels") + else: + labels = None + outputs = model_with_loss(**inputs) + # Save past state if it exists + # TODO: this needs to be fixed and made cleaner later. + if self.args.past_index >= 0: + self._past = outputs[self.args.past_index] + + if labels is not None: + if unwrap_model(model_with_loss)._get_name() in MODEL_FOR_CAUSAL_LM_MAPPING_NAMES.values(): + loss = self.label_smoother(outputs, labels, shift_labels=True) + else: + loss = self.label_smoother(outputs, labels) + else: + if isinstance(outputs, dict) and "loss" not in outputs: + raise ValueError( + "The model did not return a loss from the inputs, only the following keys: " + f"{','.join(outputs.keys())}. For reference, the inputs it received are {','.join(inputs.keys())}." + ) + # We don't use .loss here since the model may return tuples instead of ModelOutput. + loss = outputs["loss"] if isinstance(outputs, dict) else outputs[0] + + return (loss, outputs) if return_outputs else loss + def train( self, resume_from_checkpoint: Optional[Union[str, bool]] = None, @@ -289,7 +386,7 @@ def train( kwargs: Additional keyword arguments used to hide deprecated arguments """ - + self.model = self._training_model if resume_from_checkpoint is False: resume_from_checkpoint = None @@ -437,7 +534,7 @@ def _inner_training_loop( RuntimeWarning, ) - self.model = model + self.model = self._training_model deepspeed_engine, optimizer, lr_scheduler = deepspeed_init( self, num_training_steps=max_steps, resume_from_checkpoint=resume_from_checkpoint ) @@ -564,7 +661,6 @@ def _inner_training_loop( # Otherwise we need to call the whooooole sampler cause there is some random operation added # AT THE VERY END! _ = list(train_dataloader.sampler) - for epoch in range(epochs_trained, num_train_epochs): if isinstance(train_dataloader, DataLoader) and isinstance(train_dataloader.sampler, DistributedSampler): train_dataloader.sampler.set_epoch(epoch) @@ -774,6 +870,7 @@ def evaluate( dictionary also contains the epoch number which comes from the training state. 
""" # memory metrics - must set up as early as possible + self.model = self._inferencing_model self._memory_tracker.start() eval_dataloader = self.get_eval_dataloader(eval_dataset) @@ -1609,11 +1706,7 @@ def create_optimizer(self): optimizer_cls, optimizer_kwargs = Trainer.get_optimizer_cls_and_kwargs(self.args) if self.sharded_ddp == ShardedDDPOption.SIMPLE: - self.optimizer = OSS( - params=optimizer_grouped_parameters, - optim=optimizer_cls, - **optimizer_kwargs, - ) + self.optimizer = OSS(params=optimizer_grouped_parameters, optim=optimizer_cls, **optimizer_kwargs,) else: self.optimizer = optimizer_cls(optimizer_grouped_parameters, **optimizer_kwargs) if optimizer_cls.__name__ == "Adam8bit": From c918008982ffb033af0c9520a0adbf1ecf530460 Mon Sep 17 00:00:00 2001 From: adamlouly Date: Tue, 10 Jan 2023 23:39:24 +0000 Subject: [PATCH 02/16] compute loss fix --- optimum/onnxruntime/trainer.py | 48 +++++++++++++++++++--------------- 1 file changed, 27 insertions(+), 21 deletions(-) diff --git a/optimum/onnxruntime/trainer.py b/optimum/onnxruntime/trainer.py index 3164e4f746..d1d1b55052 100644 --- a/optimum/onnxruntime/trainer.py +++ b/optimum/onnxruntime/trainer.py @@ -23,7 +23,6 @@ import warnings from pathlib import Path from typing import TYPE_CHECKING, Any, Callable, Dict, List, Optional, Tuple, Type, Union - from tqdm.auto import tqdm @@ -33,6 +32,7 @@ is_fairscale_available, ) + import numpy as np import torch import torch.distributed as dist @@ -46,6 +46,7 @@ from transformers.debug_utils import DebugOption, DebugUnderflowOverflow from transformers.deepspeed import deepspeed_init, is_deepspeed_zero3_enabled from transformers.dependency_versions_check import dep_version_check +from transformers.models.auto.modeling_auto import MODEL_FOR_CAUSAL_LM_MAPPING_NAMES from transformers.file_utils import ( CONFIG_NAME, WEIGHTS_NAME, @@ -341,27 +342,28 @@ def compute_loss(self, model_with_loss, inputs, return_outputs=False): labels = inputs.pop("labels") else: labels = None - outputs = model_with_loss(**inputs) - # Save past state if it exists - # TODO: this needs to be fixed and made cleaner later. - if self.args.past_index >= 0: - self._past = outputs[self.args.past_index] - - if labels is not None: - if unwrap_model(model_with_loss)._get_name() in MODEL_FOR_CAUSAL_LM_MAPPING_NAMES.values(): - loss = self.label_smoother(outputs, labels, shift_labels=True) - else: - loss = self.label_smoother(outputs, labels) + + outputs = model_with_loss(**inputs) + # Save past state if it exists + # TODO: this needs to be fixed and made cleaner later. + if self.args.past_index >= 0: + self._past = outputs[self.args.past_index] + + if labels is not None: + if unwrap_model(model_with_loss)._get_name() in MODEL_FOR_CAUSAL_LM_MAPPING_NAMES.values(): + loss = self.label_smoother(outputs, labels, shift_labels=True) else: - if isinstance(outputs, dict) and "loss" not in outputs: - raise ValueError( - "The model did not return a loss from the inputs, only the following keys: " - f"{','.join(outputs.keys())}. For reference, the inputs it received are {','.join(inputs.keys())}." - ) - # We don't use .loss here since the model may return tuples instead of ModelOutput. - loss = outputs["loss"] if isinstance(outputs, dict) else outputs[0] + loss = self.label_smoother(outputs, labels) + else: + if isinstance(outputs, dict) and "loss" not in outputs: + raise ValueError( + "The model did not return a loss from the inputs, only the following keys: " + f"{','.join(outputs.keys())}. 
For reference, the inputs it received are {','.join(inputs.keys())}." + ) + # We don't use .loss here since the model may return tuples instead of ModelOutput. + loss = outputs["loss"] if isinstance(outputs, dict) else outputs[0] - return (loss, outputs) if return_outputs else loss + return (loss, outputs) if return_outputs else loss def train( self, @@ -1706,7 +1708,11 @@ def create_optimizer(self): optimizer_cls, optimizer_kwargs = Trainer.get_optimizer_cls_and_kwargs(self.args) if self.sharded_ddp == ShardedDDPOption.SIMPLE: - self.optimizer = OSS(params=optimizer_grouped_parameters, optim=optimizer_cls, **optimizer_kwargs,) + self.optimizer = OSS( + params=optimizer_grouped_parameters, + optim=optimizer_cls, + **optimizer_kwargs, + ) else: self.optimizer = optimizer_cls(optimizer_grouped_parameters, **optimizer_kwargs) if optimizer_cls.__name__ == "Adam8bit": From 7e968107cf4972e1eb0b4bccb6e0ca7edea1d649 Mon Sep 17 00:00:00 2001 From: Adam Louly Date: Wed, 15 Feb 2023 22:04:17 +0000 Subject: [PATCH 03/16] esolved comments --- optimum/onnxruntime/trainer.py | 52 +++++++++++++++++----------------- 1 file changed, 26 insertions(+), 26 deletions(-) diff --git a/optimum/onnxruntime/trainer.py b/optimum/onnxruntime/trainer.py index caa5d1e0db..f7885a2892 100644 --- a/optimum/onnxruntime/trainer.py +++ b/optimum/onnxruntime/trainer.py @@ -27,10 +27,7 @@ # Integrations must be imported before ML frameworks: -from transformers.integrations import ( # isort: split - hp_params, - is_fairscale_available, -) +from transformers.integrations import hp_params, is_fairscale_available # isort: split import numpy as np @@ -144,22 +141,23 @@ def __init__(self, model, args) -> None: else: self.label_smoother = None - def forward(self, inputs: Dict[str, Union[torch.Tensor, Any]], return_outputs): + def compute_loss(self, model, inputs, return_outputs=False): + """ + How the loss is computed by Trainer. By default, all models return the loss in the first element. + Subclass and override for custom behavior. + """ if self.label_smoother is not None and "labels" in inputs: labels = inputs.pop("labels") else: labels = None - outputs = self._original_model(**inputs) - + outputs = model(**inputs) # Save past state if it exists # TODO: this needs to be fixed and made cleaner later. if self.args.past_index >= 0: self._past = outputs[self.args.past_index] if labels is not None: - from transformers.models.auto.modeling_auto import MODEL_FOR_CAUSAL_LM_MAPPING_NAMES - - if unwrap_model(self._original_model)._get_name() in MODEL_FOR_CAUSAL_LM_MAPPING_NAMES.values(): + if unwrap_model(model)._get_name() in MODEL_FOR_CAUSAL_LM_MAPPING_NAMES.values(): loss = self.label_smoother(outputs, labels, shift_labels=True) else: loss = self.label_smoother(outputs, labels) @@ -174,6 +172,9 @@ def forward(self, inputs: Dict[str, Union[torch.Tensor, Any]], return_outputs): return (loss, outputs) if return_outputs else loss + def forward(self, inputs: Dict[str, Union[torch.Tensor, Any]], return_outputs): + return self.compute_loss(self._original_model, inputs, return_outputs=False) + @property def config(self): return self._original_model.config @@ -309,6 +310,10 @@ def __init__( optimizers=optimizers, preprocess_logits_for_metrics=preprocess_logits_for_metrics, ) + + # We leverage both training_model and inference_model in conjunction with model. + # _training_model will be wrapped so it will use ORT and will use the overriden functions in ModuleWithLoss. 
+ # _inferencing_model will be storing the default version of the model and we will switch to it in case of eval/test. self._training_model = ModuleWithLoss(model, args) self.model = model self._inferencing_model = model @@ -318,12 +323,14 @@ def __init__( if self.args.local_rank: torch.cuda.set_device(self.args.local_rank) + # we assume that training_model and inference_model have the same forward signature column. + # self._signature_columns attribute only stores the first-time parsed signature def _set_signature_columns_if_needed(self): if self._signature_columns is None: # Inspect model forward signature to keep only the arguments it accepts. import inspect - if self.model == self._training_model: + if isinstance(self.model, ModuleWithLoss): signature = inspect.signature(self.model._original_model.forward) else: signature = inspect.signature(self.model.forward) @@ -394,7 +401,7 @@ def train( "You need to install `onnxruntime-training` to use `ORTTrainer` for training. Check out " "https://huggingface.co/docs/optimum/onnxruntime/usage_guides/trainer#install-onnx-runtime." ) - + self.model = self._training_model if resume_from_checkpoint is False: @@ -544,7 +551,6 @@ def _inner_training_loop( RuntimeWarning, ) - self.model = self._training_model deepspeed_engine, optimizer, lr_scheduler = deepspeed_init( self, num_training_steps=max_steps, resume_from_checkpoint=resume_from_checkpoint ) @@ -885,6 +891,7 @@ def evaluate( dictionary also contains the epoch number which comes from the training state. """ # memory metrics - must set up as early as possible + # TODO: We need to enable evaluation using ORT backend. self.model = self._inferencing_model self._memory_tracker.start() @@ -977,6 +984,9 @@ def predict( - metrics (`Dict[str, float]`, *optional*): The potential dictionary of metrics (if the dataset contained labels). """ + # TODO: We need to enable evaluation using ORT backend. + self.model = self._inferencing_model + # memory metrics - must set up as early as possible self._memory_tracker.start() @@ -994,10 +1004,7 @@ def predict( try: output = eval_loop( - test_dataloader, - description="Prediction", - ignore_keys=ignore_keys, - metric_key_prefix=metric_key_prefix, + test_dataloader, description="Prediction", ignore_keys=ignore_keys, metric_key_prefix=metric_key_prefix ) except Exception as error: logger.error(error) @@ -1740,11 +1747,7 @@ def create_optimizer(self): optimizer_cls, optimizer_kwargs = Trainer.get_optimizer_cls_and_kwargs(self.args) if self.sharded_ddp == ShardedDDPOption.SIMPLE: - self.optimizer = OSS( - params=optimizer_grouped_parameters, - optim=optimizer_cls, - **optimizer_kwargs, - ) + self.optimizer = OSS(params=optimizer_grouped_parameters, optim=optimizer_cls, **optimizer_kwargs) else: self.optimizer = optimizer_cls(optimizer_grouped_parameters, **optimizer_kwargs) if optimizer_cls.__name__ == "Adam8bit": @@ -1774,10 +1777,7 @@ def get_ort_optimizer_cls_and_kwargs(args: ORTTrainingArguments) -> Tuple[Any, A The training arguments for the training session. 
""" optimizer_kwargs = {"lr": args.learning_rate} - adam_kwargs = { - "betas": (args.adam_beta1, args.adam_beta2), - "eps": args.adam_epsilon, - } + adam_kwargs = {"betas": (args.adam_beta1, args.adam_beta2), "eps": args.adam_epsilon} if args.optim == ORTOptimizerNames.ADAMW_ORT_FUSED: try: from onnxruntime.training.optim import FusedAdam From 4732f2c2e7250a417567afb4f720918838e0be91 Mon Sep 17 00:00:00 2001 From: Adam Louly Date: Wed, 22 Feb 2023 21:50:04 +0000 Subject: [PATCH 04/16] removed duplicated code .. used main trainer compute loss --- optimum/onnxruntime/trainer.py | 64 +++------------------------------- 1 file changed, 5 insertions(+), 59 deletions(-) diff --git a/optimum/onnxruntime/trainer.py b/optimum/onnxruntime/trainer.py index f7885a2892..fee06bbcc8 100644 --- a/optimum/onnxruntime/trainer.py +++ b/optimum/onnxruntime/trainer.py @@ -133,6 +133,7 @@ def __init__(self, model, args) -> None: super().__init__() self._original_model = model self.args = args + self.hf_trainer = Trainer(model) # Label smoothing if self.args.label_smoothing_factor != 0: from transformers.trainer_pt_utils import LabelSmoother @@ -141,39 +142,8 @@ def __init__(self, model, args) -> None: else: self.label_smoother = None - def compute_loss(self, model, inputs, return_outputs=False): - """ - How the loss is computed by Trainer. By default, all models return the loss in the first element. - Subclass and override for custom behavior. - """ - if self.label_smoother is not None and "labels" in inputs: - labels = inputs.pop("labels") - else: - labels = None - outputs = model(**inputs) - # Save past state if it exists - # TODO: this needs to be fixed and made cleaner later. - if self.args.past_index >= 0: - self._past = outputs[self.args.past_index] - - if labels is not None: - if unwrap_model(model)._get_name() in MODEL_FOR_CAUSAL_LM_MAPPING_NAMES.values(): - loss = self.label_smoother(outputs, labels, shift_labels=True) - else: - loss = self.label_smoother(outputs, labels) - else: - if isinstance(outputs, dict) and "loss" not in outputs: - raise ValueError( - "The model did not return a loss from the inputs, only the following keys: " - f"{','.join(outputs.keys())}. For reference, the inputs it received are {','.join(inputs.keys())}." - ) - # We don't use .loss here since the model may return tuples instead of ModelOutput. - loss = outputs["loss"] if isinstance(outputs, dict) else outputs[0] - - return (loss, outputs) if return_outputs else loss - def forward(self, inputs: Dict[str, Union[torch.Tensor, Any]], return_outputs): - return self.compute_loss(self._original_model, inputs, return_outputs=False) + return self.hf_trainer.compute_loss(self._original_model, inputs, return_outputs=False) @property def config(self): @@ -345,33 +315,8 @@ def compute_loss(self, model_with_loss, inputs, return_outputs=False): outputs = model_with_loss(inputs, return_outputs) return outputs else: - - if self.label_smoother is not None and "labels" in inputs: - labels = inputs.pop("labels") - else: - labels = None - - outputs = model_with_loss(**inputs) - # Save past state if it exists - # TODO: this needs to be fixed and made cleaner later. 
- if self.args.past_index >= 0: - self._past = outputs[self.args.past_index] - - if labels is not None: - if unwrap_model(model_with_loss)._get_name() in MODEL_FOR_CAUSAL_LM_MAPPING_NAMES.values(): - loss = self.label_smoother(outputs, labels, shift_labels=True) - else: - loss = self.label_smoother(outputs, labels) - else: - if isinstance(outputs, dict) and "loss" not in outputs: - raise ValueError( - "The model did not return a loss from the inputs, only the following keys: " - f"{','.join(outputs.keys())}. For reference, the inputs it received are {','.join(inputs.keys())}." - ) - # We don't use .loss here since the model may return tuples instead of ModelOutput. - loss = outputs["loss"] if isinstance(outputs, dict) else outputs[0] - - return (loss, outputs) if return_outputs else loss + loss, outputs = super().compute_loss(model_with_loss, inputs, return_outputs) + return (loss, outputs) def train( self, @@ -551,6 +496,7 @@ def _inner_training_loop( RuntimeWarning, ) + self.model = model deepspeed_engine, optimizer, lr_scheduler = deepspeed_init( self, num_training_steps=max_steps, resume_from_checkpoint=resume_from_checkpoint ) From c47fa80f26f7fb1403c79155a39e1deed1e2c3d3 Mon Sep 17 00:00:00 2001 From: Adam Louly Date: Wed, 22 Feb 2023 23:31:13 +0000 Subject: [PATCH 05/16] added --loss_in_train flag --- optimum/onnxruntime/trainer.py | 13 +++++++++---- optimum/onnxruntime/training_args.py | 14 ++++++++++++++ 2 files changed, 23 insertions(+), 4 deletions(-) diff --git a/optimum/onnxruntime/trainer.py b/optimum/onnxruntime/trainer.py index fee06bbcc8..01428b80c3 100644 --- a/optimum/onnxruntime/trainer.py +++ b/optimum/onnxruntime/trainer.py @@ -284,7 +284,13 @@ def __init__( # We leverage both training_model and inference_model in conjunction with model. # _training_model will be wrapped so it will use ORT and will use the overriden functions in ModuleWithLoss. # _inferencing_model will be storing the default version of the model and we will switch to it in case of eval/test. - self._training_model = ModuleWithLoss(model, args) + + # Only Wrap the model if we pass --loss_in_train flag. + if args.loss_in_train: + self._training_model = ModuleWithLoss(model, args) + else: + self._training_model = model + self.model = model self._inferencing_model = model self.feature = feature @@ -311,12 +317,11 @@ def _set_signature_columns_if_needed(self): def compute_loss(self, model_with_loss, inputs, return_outputs=False): # Run model forward + loss compute. - if self.model == self._training_model: + if self.args.loss_in_train and self.model == self._training_model: outputs = model_with_loss(inputs, return_outputs) return outputs else: - loss, outputs = super().compute_loss(model_with_loss, inputs, return_outputs) - return (loss, outputs) + return super().compute_loss(self.model, inputs, return_outputs) def train( self, diff --git a/optimum/onnxruntime/training_args.py b/optimum/onnxruntime/training_args.py index 85456c7ad7..fb3e37aac8 100644 --- a/optimum/onnxruntime/training_args.py +++ b/optimum/onnxruntime/training_args.py @@ -65,6 +65,11 @@ class ORTTrainingArguments(TrainingArguments): metadata={"help": "The optimizer to use."}, ) + loss_in_train: Optional[bool] = field( + default=False, + metadata={"help": "Use ModuleWithLoss Wrapper to compute loss inside the training loop."}, + ) + # This method will not need to be overriden after the deprecation of `--adafactor` in version 5 of 🤗 Transformers. 
def __post_init__(self): # Handle --use_env option in torch.distributed.launch (local_rank not passed as an arg then). @@ -337,3 +342,12 @@ def __post_init__(self): f"{self.hub_model_id}).", FutureWarning, ) + if self.loss_in_train is True: + logger.info( + "Using ModuleWithLoss Wrapper." + "loss will be computed during training loop and it will save memory peak " + ) + else: + logger.info( + "Not Using ModuleWithLoss Wrapper." + ) From c45bd53757f2b789139ff0076288d875940699ea Mon Sep 17 00:00:00 2001 From: Adam Louly Date: Thu, 23 Feb 2023 22:29:30 +0000 Subject: [PATCH 06/16] resolved comments --- optimum/onnxruntime/trainer.py | 4 ++-- optimum/onnxruntime/training_args.py | 2 +- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/optimum/onnxruntime/trainer.py b/optimum/onnxruntime/trainer.py index cc9c00efab..0253e0b39e 100644 --- a/optimum/onnxruntime/trainer.py +++ b/optimum/onnxruntime/trainer.py @@ -23,7 +23,6 @@ import warnings from pathlib import Path from typing import TYPE_CHECKING, Any, Callable, Dict, List, Optional, Tuple, Type, Union -from tqdm.auto import tqdm # Integrations must be imported before ML frameworks: @@ -51,7 +50,6 @@ from transformers.debug_utils import DebugOption, DebugUnderflowOverflow from transformers.deepspeed import deepspeed_init, is_deepspeed_zero3_enabled from transformers.dependency_versions_check import dep_version_check -from transformers.models.auto.modeling_auto import MODEL_FOR_CAUSAL_LM_MAPPING_NAMES from transformers.file_utils import ( is_apex_available, is_sagemaker_dp_enabled, @@ -146,6 +144,8 @@ def __init__(self, model, args) -> None: super().__init__() self._original_model = model self.args = args + + # Creating an instance of huggingFace Trainer so we can use compute_loss() logic and avoid duplicated code. self.hf_trainer = Trainer(model) # Label smoothing if self.args.label_smoothing_factor != 0: diff --git a/optimum/onnxruntime/training_args.py b/optimum/onnxruntime/training_args.py index 9b213b2921..bf467bb908 100644 --- a/optimum/onnxruntime/training_args.py +++ b/optimum/onnxruntime/training_args.py @@ -67,7 +67,7 @@ class ORTTrainingArguments(TrainingArguments): loss_in_train: Optional[bool] = field( default=False, - metadata={"help": "Use ModuleWithLoss Wrapper to compute loss inside the training loop."}, + metadata={"help": "Use ModuleWithLoss Wrapper to compute loss inside the training loop, when label smoother is NOT none having this will help save memory for ORTMOdule Runs."}, ) # This method will not need to be overriden after the deprecation of `--adafactor` in version 5 of 🤗 Transformers. 
From 31178c8e1c8d67482d2a16a63470e9c01c6cb3b5 Mon Sep 17 00:00:00 2001 From: Adam Louly Date: Thu, 2 Mar 2023 08:41:51 +0000 Subject: [PATCH 07/16] resolved comments --- optimum/onnxruntime/trainer.py | 114 ++++++++++++++++++--------- optimum/onnxruntime/training_args.py | 17 ++-- 2 files changed, 84 insertions(+), 47 deletions(-) diff --git a/optimum/onnxruntime/trainer.py b/optimum/onnxruntime/trainer.py index 0253e0b39e..1e2de8b486 100644 --- a/optimum/onnxruntime/trainer.py +++ b/optimum/onnxruntime/trainer.py @@ -28,17 +28,6 @@ # Integrations must be imported before ML frameworks: from transformers.integrations import hp_params, is_fairscale_available # isort: split - - -# Integrations must be imported before ML frameworks: -# isort: off -from transformers.integrations import ( - hp_params, - is_fairscale_available, -) - -# isort: on - import numpy as np import torch import torch.distributed as dist @@ -140,23 +129,48 @@ class ModuleWithLoss(nn.Module): - def __init__(self, model, args) -> None: + def __init__(self, model, args, label_smoother) -> None: super().__init__() self._original_model = model self.args = args + self.label_smoother = label_smoother + + def forward(self, inputs: Dict[str, Union[torch.Tensor, Any]], return_outputs): + if self.label_smoother is not None and "labels" in inputs: + labels = inputs.pop("labels") + else: + labels = None + outputs = self._original_model(**inputs) - # Creating an instance of huggingFace Trainer so we can use compute_loss() logic and avoid duplicated code. - self.hf_trainer = Trainer(model) - # Label smoothing - if self.args.label_smoothing_factor != 0: - from transformers.trainer_pt_utils import LabelSmoother + # Save past state if it exists + # TODO: this needs to be fixed and made cleaner later. + if self.args.past_index >= 0: + self._past = outputs[self.args.past_index] + + if labels is not None: + from transformers.models.auto.modeling_auto import MODEL_FOR_CAUSAL_LM_MAPPING_NAMES - self.label_smoother = LabelSmoother(epsilon=self.args.label_smoothing_factor) + if unwrap_model(self._original_model)._get_name() in MODEL_FOR_CAUSAL_LM_MAPPING_NAMES.values(): + loss = self.label_smoother(outputs, labels, shift_labels=True) + else: + loss = self.label_smoother(outputs, labels) else: - self.label_smoother = None + if isinstance(outputs, dict) and "loss" not in outputs: + raise ValueError( + "The model did not return a loss from the inputs, only the following keys: " + f"{','.join(outputs.keys())}. For reference, the inputs it received are {','.join(inputs.keys())}." + ) + # We don't use .loss here since the model may return tuples instead of ModelOutput. + loss = outputs["loss"] if isinstance(outputs, dict) else outputs[0] - def forward(self, inputs: Dict[str, Union[torch.Tensor, Any]], return_outputs): - return self.hf_trainer.compute_loss(self._original_model, inputs, return_outputs=False) + return (loss, outputs) if return_outputs else loss + + @property + def module(self): + """The original `torch.nn.Module` that this module wraps. + This property provides access to methods and properties on the original module.""" + + return self._original_model.module @property def config(self): @@ -310,22 +324,28 @@ def __init__( # We leverage both training_model and inference_model in conjunction with model. # _training_model will be wrapped so it will use ORT and will use the overriden functions in ModuleWithLoss. - # _inferencing_model will be storing the default version of the model and we will switch to it in case of eval/test. 
+ # _training_model will be storing the default version of the model and will unwrap it in case of eval/test. - # Only Wrap the model if we pass --loss_in_train flag. - if args.loss_in_train: - self._training_model = ModuleWithLoss(model, args) + # Only Wrap the model if we pass --use_module_with_loss flag. + if args.use_module_with_loss: + self._training_model = self.create_model_with_loss() else: self._training_model = model self.model = model - self._inferencing_model = model + self.feature = feature self.onnx_model_path = onnx_model_path self.exported_with_loss = False if self.args.local_rank: torch.cuda.set_device(self.args.local_rank) + # this method will create a ModuleWithLoss Instance to use if you are passing --use_module_with_loss flag. + # It will help reducing the peak memory usage by computing loss inside training. + def create_model_with_loss(self): + model_with_loss = ModuleWithLoss(self.model, self.args, self.label_smoother) + return model_with_loss + # we assume that training_model and inference_model have the same forward signature column. # self._signature_columns attribute only stores the first-time parsed signature def _set_signature_columns_if_needed(self): @@ -344,11 +364,37 @@ def _set_signature_columns_if_needed(self): def compute_loss(self, model_with_loss, inputs, return_outputs=False): # Run model forward + loss compute. - if self.args.loss_in_train and self.model == self._training_model: + if isinstance(self.model, ModuleWithLoss): outputs = model_with_loss(inputs, return_outputs) return outputs else: - return super().compute_loss(self.model, inputs, return_outputs) + + if self.label_smoother is not None and "labels" in inputs: + labels = inputs.pop("labels") + else: + labels = None + + outputs = model_with_loss(**inputs) + # Save past state if it exists + # TODO: this needs to be fixed and made cleaner later. + if self.args.past_index >= 0: + self._past = outputs[self.args.past_index] + + if labels is not None: + if unwrap_model(model_with_loss)._get_name() in MODEL_FOR_CAUSAL_LM_MAPPING_NAMES.values(): + loss = self.label_smoother(outputs, labels, shift_labels=True) + else: + loss = self.label_smoother(outputs, labels) + else: + if isinstance(outputs, dict) and "loss" not in outputs: + raise ValueError( + "The model did not return a loss from the inputs, only the following keys: " + f"{','.join(outputs.keys())}. For reference, the inputs it received are {','.join(inputs.keys())}." + ) + # We don't use .loss here since the model may return tuples instead of ModelOutput. + loss = outputs["loss"] if isinstance(outputs, dict) else outputs[0] + + return (loss, outputs) if return_outputs else loss def train( self, @@ -868,7 +914,7 @@ def evaluate( """ # memory metrics - must set up as early as possible # TODO: We need to enable evaluation using ORT backend. - self.model = self._inferencing_model + self.model = unwrap_model(self.model) self._memory_tracker.start() eval_dataloader = self.get_eval_dataloader(eval_dataset) @@ -961,7 +1007,7 @@ def predict( labels). """ # TODO: We need to enable evaluation using ORT backend. 
- self.model = self._inferencing_model + self.model = unwrap_model(self.model) # memory metrics - must set up as early as possible self._memory_tracker.start() @@ -1590,13 +1636,7 @@ def _export( opset = max(opset, 12) # Operators like `nll_loss`are added for opset>=12 output_path = model_path / ONNX_WEIGHTS_NAME - _ = export( - model=model, - config=onnx_config, - opset=opset, - output=output_path, - device=device, - ) + _ = export(model=model, config=onnx_config, opset=opset, output=output_path, device=device) model.config.save_pretrained(model_path) diff --git a/optimum/onnxruntime/training_args.py b/optimum/onnxruntime/training_args.py index bf467bb908..223fd58ecd 100644 --- a/optimum/onnxruntime/training_args.py +++ b/optimum/onnxruntime/training_args.py @@ -60,14 +60,13 @@ class ORTTrainingArguments(TrainingArguments): The optimizer to use, including optimizers in Transformers: adamw_hf, adamw_torch, adamw_apex_fused, or adafactor. And optimizers implemented by ONNX Runtime: adamw_ort_fused. """ - optim: Optional[str] = field( - default="adamw_hf", - metadata={"help": "The optimizer to use."}, - ) + optim: Optional[str] = field(default="adamw_hf", metadata={"help": "The optimizer to use."}) - loss_in_train: Optional[bool] = field( + use_module_with_loss: Optional[bool] = field( default=False, - metadata={"help": "Use ModuleWithLoss Wrapper to compute loss inside the training loop, when label smoother is NOT none having this will help save memory for ORTMOdule Runs."}, + metadata={ + "help": "Use ModuleWithLoss Wrapper to compute loss inside the training loop, when label smoother is NOT none having this will help save memory for ORTMOdule Runs." + }, ) # This method will not need to be overriden after the deprecation of `--adafactor` in version 5 of 🤗 Transformers. @@ -341,12 +340,10 @@ def __post_init__(self): f"{self.hub_model_id}).", FutureWarning, ) - if self.loss_in_train is True: + if self.use_module_with_loss is True: logger.info( "Using ModuleWithLoss Wrapper." "loss will be computed during training loop and it will save memory peak " ) else: - logger.info( - "Not Using ModuleWithLoss Wrapper." 
- ) + logger.info("Not Using ModuleWithLoss Wrapper.") From 19cfe045900410aa52c2e492759e4d60c3b5050e Mon Sep 17 00:00:00 2001 From: Adam Louly Date: Thu, 2 Mar 2023 17:54:54 +0000 Subject: [PATCH 08/16] formatter usng latest black --- optimum/onnxruntime/trainer.py | 1 - 1 file changed, 1 deletion(-) diff --git a/optimum/onnxruntime/trainer.py b/optimum/onnxruntime/trainer.py index 1e2de8b486..02c4985f41 100644 --- a/optimum/onnxruntime/trainer.py +++ b/optimum/onnxruntime/trainer.py @@ -368,7 +368,6 @@ def compute_loss(self, model_with_loss, inputs, return_outputs=False): outputs = model_with_loss(inputs, return_outputs) return outputs else: - if self.label_smoother is not None and "labels" in inputs: labels = inputs.pop("labels") else: From f268040dbaf3eec667f37aa8368dbdef226eff87 Mon Sep 17 00:00:00 2001 From: Adam Louly Date: Thu, 2 Mar 2023 17:59:51 +0000 Subject: [PATCH 09/16] add import for code quality --- optimum/onnxruntime/trainer.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/optimum/onnxruntime/trainer.py b/optimum/onnxruntime/trainer.py index 02c4985f41..34acf05833 100644 --- a/optimum/onnxruntime/trainer.py +++ b/optimum/onnxruntime/trainer.py @@ -380,6 +380,8 @@ def compute_loss(self, model_with_loss, inputs, return_outputs=False): self._past = outputs[self.args.past_index] if labels is not None: + from transformers.models.auto.modeling_auto import MODEL_FOR_CAUSAL_LM_MAPPING_NAMES + if unwrap_model(model_with_loss)._get_name() in MODEL_FOR_CAUSAL_LM_MAPPING_NAMES.values(): loss = self.label_smoother(outputs, labels, shift_labels=True) else: From 4d8624a8f5343c1be53079779009f508b1e00252 Mon Sep 17 00:00:00 2001 From: Adam Louly Date: Thu, 2 Mar 2023 18:05:03 +0000 Subject: [PATCH 10/16] formatter usng latest black --- optimum/onnxruntime/trainer.py | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/optimum/onnxruntime/trainer.py b/optimum/onnxruntime/trainer.py index 34acf05833..d4c2f96a0b 100644 --- a/optimum/onnxruntime/trainer.py +++ b/optimum/onnxruntime/trainer.py @@ -26,7 +26,13 @@ # Integrations must be imported before ML frameworks: -from transformers.integrations import hp_params, is_fairscale_available # isort: split +# isort: off +from transformers.integrations import ( + hp_params, + is_fairscale_available, +) + +# isort: on import numpy as np import torch From ee6ef104139dd402f9a1c7344e8cb433f1b236ac Mon Sep 17 00:00:00 2001 From: Adam Louly Date: Fri, 3 Mar 2023 19:41:30 +0000 Subject: [PATCH 11/16] readding super loss compute --- optimum/onnxruntime/trainer.py | 29 +--------------------------- optimum/onnxruntime/training_args.py | 2 +- 2 files changed, 2 insertions(+), 29 deletions(-) diff --git a/optimum/onnxruntime/trainer.py b/optimum/onnxruntime/trainer.py index d4c2f96a0b..80aeaa6101 100644 --- a/optimum/onnxruntime/trainer.py +++ b/optimum/onnxruntime/trainer.py @@ -374,34 +374,7 @@ def compute_loss(self, model_with_loss, inputs, return_outputs=False): outputs = model_with_loss(inputs, return_outputs) return outputs else: - if self.label_smoother is not None and "labels" in inputs: - labels = inputs.pop("labels") - else: - labels = None - - outputs = model_with_loss(**inputs) - # Save past state if it exists - # TODO: this needs to be fixed and made cleaner later. 
- if self.args.past_index >= 0: - self._past = outputs[self.args.past_index] - - if labels is not None: - from transformers.models.auto.modeling_auto import MODEL_FOR_CAUSAL_LM_MAPPING_NAMES - - if unwrap_model(model_with_loss)._get_name() in MODEL_FOR_CAUSAL_LM_MAPPING_NAMES.values(): - loss = self.label_smoother(outputs, labels, shift_labels=True) - else: - loss = self.label_smoother(outputs, labels) - else: - if isinstance(outputs, dict) and "loss" not in outputs: - raise ValueError( - "The model did not return a loss from the inputs, only the following keys: " - f"{','.join(outputs.keys())}. For reference, the inputs it received are {','.join(inputs.keys())}." - ) - # We don't use .loss here since the model may return tuples instead of ModelOutput. - loss = outputs["loss"] if isinstance(outputs, dict) else outputs[0] - - return (loss, outputs) if return_outputs else loss + return super().compute_loss(self.model, inputs, return_outputs) def train( self, diff --git a/optimum/onnxruntime/training_args.py b/optimum/onnxruntime/training_args.py index 223fd58ecd..00e01ef1c1 100644 --- a/optimum/onnxruntime/training_args.py +++ b/optimum/onnxruntime/training_args.py @@ -65,7 +65,7 @@ class ORTTrainingArguments(TrainingArguments): use_module_with_loss: Optional[bool] = field( default=False, metadata={ - "help": "Use ModuleWithLoss Wrapper to compute loss inside the training loop, when label smoother is NOT none having this will help save memory for ORTMOdule Runs." + "help": "Use ModuleWithLoss Wrapper to compute loss inside the training loop, having this will help save memory for ORTMOdule Runs." }, ) From dc8de71505aa834f33e3520b9c6c2f7e9ec07474 Mon Sep 17 00:00:00 2001 From: Adam Louly Date: Tue, 7 Mar 2023 02:31:20 +0000 Subject: [PATCH 12/16] resolv comments --- optimum/onnxruntime/trainer.py | 44 +++++++++------------------------- 1 file changed, 11 insertions(+), 33 deletions(-) diff --git a/optimum/onnxruntime/trainer.py b/optimum/onnxruntime/trainer.py index 80aeaa6101..868d1fcddc 100644 --- a/optimum/onnxruntime/trainer.py +++ b/optimum/onnxruntime/trainer.py @@ -139,37 +139,17 @@ def __init__(self, model, args, label_smoother) -> None: super().__init__() self._original_model = model self.args = args + # Creating an instance of huggingFace Trainer so we can use compute_loss() logic and avoid duplicated code. + self.hf_trainer = Trainer(model) + # Label smoothing self.label_smoother = label_smoother def forward(self, inputs: Dict[str, Union[torch.Tensor, Any]], return_outputs): - if self.label_smoother is not None and "labels" in inputs: - labels = inputs.pop("labels") - else: - labels = None - outputs = self._original_model(**inputs) - - # Save past state if it exists - # TODO: this needs to be fixed and made cleaner later. - if self.args.past_index >= 0: - self._past = outputs[self.args.past_index] - - if labels is not None: - from transformers.models.auto.modeling_auto import MODEL_FOR_CAUSAL_LM_MAPPING_NAMES - - if unwrap_model(self._original_model)._get_name() in MODEL_FOR_CAUSAL_LM_MAPPING_NAMES.values(): - loss = self.label_smoother(outputs, labels, shift_labels=True) - else: - loss = self.label_smoother(outputs, labels) - else: - if isinstance(outputs, dict) and "loss" not in outputs: - raise ValueError( - "The model did not return a loss from the inputs, only the following keys: " - f"{','.join(outputs.keys())}. For reference, the inputs it received are {','.join(inputs.keys())}." - ) - # We don't use .loss here since the model may return tuples instead of ModelOutput. 
- loss = outputs["loss"] if isinstance(outputs, dict) else outputs[0] + return self.hf_trainer.compute_loss(self._original_model, inputs, return_outputs=False) - return (loss, outputs) if return_outputs else loss + @property + def config(self): + return self._original_model.config @property def module(self): @@ -335,8 +315,6 @@ def __init__( # Only Wrap the model if we pass --use_module_with_loss flag. if args.use_module_with_loss: self._training_model = self.create_model_with_loss() - else: - self._training_model = model self.model = model @@ -404,8 +382,8 @@ def train( "You need to install `onnxruntime-training` to use `ORTTrainer` for training. Check out " "https://huggingface.co/docs/optimum/onnxruntime/usage_guides/trainer#install-onnx-runtime." ) - - self.model = self._training_model + if self.args.use_module_with_loss: + self.model = self._training_model if resume_from_checkpoint is False: resume_from_checkpoint = None @@ -534,10 +512,10 @@ def _inner_training_loop( or is_sagemaker_mp_enabled() or self.fsdp is not None ) - # Wrap the model with `ORTModule` logger.info("Wrap ORTModule for ONNX Runtime training.") - model = ORTModule(self.model) + from onnxruntime.training.ortmodule import ORTModule, DebugOptions, LogLevel + model = ORTModule(self.model, DebugOptions(save_onnx=True, log_level=LogLevel.VERBOSE, onnx_prefix="distil_bert")) self.model_wrapped = model if args.deepspeed: From 55ad1d2767c5957d96a6cb4551518328374a2136 Mon Sep 17 00:00:00 2001 From: Adam Louly Date: Tue, 7 Mar 2023 02:32:20 +0000 Subject: [PATCH 13/16] fix typo --- optimum/onnxruntime/training_args.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/optimum/onnxruntime/training_args.py b/optimum/onnxruntime/training_args.py index 00e01ef1c1..bf16b37b72 100644 --- a/optimum/onnxruntime/training_args.py +++ b/optimum/onnxruntime/training_args.py @@ -65,7 +65,7 @@ class ORTTrainingArguments(TrainingArguments): use_module_with_loss: Optional[bool] = field( default=False, metadata={ - "help": "Use ModuleWithLoss Wrapper to compute loss inside the training loop, having this will help save memory for ORTMOdule Runs." + "help": "Use ModuleWithLoss Wrapper to compute loss inside the training loop, having this will help save memory for ORTModule Runs." }, ) From 432efe547f2cd3b7fead033b07b7a4ccdc4b63cf Mon Sep 17 00:00:00 2001 From: Adam Louly Date: Wed, 8 Mar 2023 00:00:39 +0000 Subject: [PATCH 14/16] solve not exporting onnx models --- optimum/onnxruntime/trainer.py | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/optimum/onnxruntime/trainer.py b/optimum/onnxruntime/trainer.py index 868d1fcddc..a233d7e7f6 100644 --- a/optimum/onnxruntime/trainer.py +++ b/optimum/onnxruntime/trainer.py @@ -145,7 +145,7 @@ def __init__(self, model, args, label_smoother) -> None: self.label_smoother = label_smoother def forward(self, inputs: Dict[str, Union[torch.Tensor, Any]], return_outputs): - return self.hf_trainer.compute_loss(self._original_model, inputs, return_outputs=False) + return self.hf_trainer.compute_loss(self._original_model, inputs, return_outputs) @property def config(self): @@ -349,10 +349,9 @@ def _set_signature_columns_if_needed(self): def compute_loss(self, model_with_loss, inputs, return_outputs=False): # Run model forward + loss compute. 
if isinstance(self.model, ModuleWithLoss): - outputs = model_with_loss(inputs, return_outputs) - return outputs + return model_with_loss(inputs, return_outputs) else: - return super().compute_loss(self.model, inputs, return_outputs) + return super().compute_loss(model_with_loss, inputs, return_outputs) def train( self, @@ -512,10 +511,10 @@ def _inner_training_loop( or is_sagemaker_mp_enabled() or self.fsdp is not None ) + # Wrap the model with `ORTModule` logger.info("Wrap ORTModule for ONNX Runtime training.") - from onnxruntime.training.ortmodule import ORTModule, DebugOptions, LogLevel - model = ORTModule(self.model, DebugOptions(save_onnx=True, log_level=LogLevel.VERBOSE, onnx_prefix="distil_bert")) + model = ORTModule(self.model) self.model_wrapped = model if args.deepspeed: @@ -663,6 +662,7 @@ def _inner_training_loop( # Otherwise we need to call the whooooole sampler cause there is some random operation added # AT THE VERY END! _ = list(train_dataloader.sampler) + for epoch in range(epochs_trained, num_train_epochs): if isinstance(train_dataloader, DataLoader) and isinstance(train_dataloader.sampler, DistributedSampler): train_dataloader.sampler.set_epoch(epoch) From 2b5e57b4ddbbe00e7c0c9935a4f8b4cf77675bfd Mon Sep 17 00:00:00 2001 From: Adam Louly Date: Wed, 8 Mar 2023 18:13:44 +0000 Subject: [PATCH 15/16] dictionary casting , bind method --- optimum/onnxruntime/trainer.py | 21 +++++++++++---------- 1 file changed, 11 insertions(+), 10 deletions(-) diff --git a/optimum/onnxruntime/trainer.py b/optimum/onnxruntime/trainer.py index a233d7e7f6..c011b9ac33 100644 --- a/optimum/onnxruntime/trainer.py +++ b/optimum/onnxruntime/trainer.py @@ -23,7 +23,7 @@ import warnings from pathlib import Path from typing import TYPE_CHECKING, Any, Callable, Dict, List, Optional, Tuple, Type, Union - +import types # Integrations must be imported before ML frameworks: # isort: off @@ -135,21 +135,18 @@ class ModuleWithLoss(nn.Module): - def __init__(self, model, args, label_smoother) -> None: + def __init__(self, model, args, label_smoother): super().__init__() self._original_model = model self.args = args - # Creating an instance of huggingFace Trainer so we can use compute_loss() logic and avoid duplicated code. - self.hf_trainer = Trainer(model) # Label smoothing self.label_smoother = label_smoother def forward(self, inputs: Dict[str, Union[torch.Tensor, Any]], return_outputs): - return self.hf_trainer.compute_loss(self._original_model, inputs, return_outputs) - - @property - def config(self): - return self._original_model.config + # The compute_model_plus_loss_internal is assigned once the class is instantiated. + # It should have same signature as Trainer.compute_loss(). + # We do this to avoid potential un-synced states if we duplicated compute loss codes . + return self.compute_model_plus_loss_internal(self._original_model, inputs, return_outputs) @property def module(self): @@ -328,6 +325,8 @@ def __init__( # It will help reducing the peak memory usage by computing loss inside training. def create_model_with_loss(self): model_with_loss = ModuleWithLoss(self.model, self.args, self.label_smoother) + model_with_loss.compute_model_plus_loss_internal = types.MethodType(Trainer.compute_loss, model_with_loss) + return model_with_loss # we assume that training_model and inference_model have the same forward signature column. @@ -349,7 +348,9 @@ def _set_signature_columns_if_needed(self): def compute_loss(self, model_with_loss, inputs, return_outputs=False): # Run model forward + loss compute. 
if isinstance(self.model, ModuleWithLoss): - return model_with_loss(inputs, return_outputs) + # ORTModule Does not support the BatchEncoding Type so we have to convert to a dict. + dict_inputs = {k: v for k, v in inputs.items()} + return model_with_loss(dict_inputs, return_outputs) else: return super().compute_loss(model_with_loss, inputs, return_outputs) From b6ccb53a5450ba2eabc13c02dc582c308f9ba4e3 Mon Sep 17 00:00:00 2001 From: adamlouly Date: Thu, 23 Mar 2023 19:36:43 +0000 Subject: [PATCH 16/16] trainer fix with ruff --- optimum/onnxruntime/trainer.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/optimum/onnxruntime/trainer.py b/optimum/onnxruntime/trainer.py index c011b9ac33..29b3183862 100644 --- a/optimum/onnxruntime/trainer.py +++ b/optimum/onnxruntime/trainer.py @@ -20,10 +20,11 @@ import shutil import sys import time +import types import warnings from pathlib import Path from typing import TYPE_CHECKING, Any, Callable, Dict, List, Optional, Tuple, Type, Union -import types + # Integrations must be imported before ML frameworks: # isort: off @@ -349,7 +350,7 @@ def compute_loss(self, model_with_loss, inputs, return_outputs=False): # Run model forward + loss compute. if isinstance(self.model, ModuleWithLoss): # ORTModule Does not support the BatchEncoding Type so we have to convert to a dict. - dict_inputs = {k: v for k, v in inputs.items()} + dict_inputs = dict(inputs.items()) return model_with_loss(dict_inputs, return_outputs) else: return super().compute_loss(model_with_loss, inputs, return_outputs)
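
Taken together, the series lands on a small pattern: wrap the model in ModuleWithLoss so the loss is computed inside forward() (and therefore inside the graph that ORTModule compiles, which is what lowers the peak memory), bind the stock Trainer.compute_loss onto the wrapper with types.MethodType instead of duplicating the label-smoothing logic, and convert BatchEncoding inputs to a plain dict before calling the wrapper. The sketch below condenses that final state into a self-contained toy; ToyModel, MiniTrainer and the training-step plumbing are illustrative stand-ins, not the real optimum/transformers classes, and the ORTModule call is left as a comment.

import types
from typing import Any, Dict

import torch
from torch import nn


class ModuleWithLoss(nn.Module):
    """Computes the loss inside forward() so it ends up in the compiled graph."""

    def __init__(self, model: nn.Module) -> None:
        super().__init__()
        self._original_model = model
        # Bound after construction (see MiniTrainer) so the loss logic is not
        # duplicated here. In the real wrapper, `self` also carries `args` and
        # `label_smoother`, because Trainer.compute_loss reads them from `self`.
        self.compute_model_plus_loss_internal = None

    def forward(self, inputs: Dict[str, Any], return_outputs: bool):
        return self.compute_model_plus_loss_internal(self._original_model, inputs, return_outputs)


class ToyModel(nn.Module):
    def __init__(self) -> None:
        super().__init__()
        self.linear = nn.Linear(4, 2)

    def forward(self, input_ids=None, labels=None):
        logits = self.linear(input_ids)
        loss = nn.functional.cross_entropy(logits, labels) if labels is not None else None
        return {"loss": loss, "logits": logits}


class MiniTrainer:
    """Stand-in for the Trainer plumbing these patches touch."""

    def __init__(self, model: nn.Module, use_module_with_loss: bool = True) -> None:
        self.model = model
        if use_module_with_loss:
            wrapper = ModuleWithLoss(model)
            # Reuse this class's compute_loss by binding it to the wrapper,
            # mirroring types.MethodType(Trainer.compute_loss, model_with_loss).
            wrapper.compute_model_plus_loss_internal = types.MethodType(MiniTrainer.compute_loss, wrapper)
            self.model = wrapper
        # The real trainer then wraps the training model once more:
        #   model = ORTModule(self.model)

    def compute_loss(self, model, inputs, return_outputs=False):
        outputs = model(**inputs)
        loss = outputs["loss"] if isinstance(outputs, dict) else outputs[0]
        return (loss, outputs) if return_outputs else loss

    def training_step(self, inputs: Dict[str, Any]):
        if isinstance(self.model, ModuleWithLoss):
            # ORTModule does not accept BatchEncoding inputs, so hand it a plain dict.
            dict_inputs = dict(inputs.items())
            return self.model(dict_inputs, False)
        return self.compute_loss(self.model, inputs)


if __name__ == "__main__":
    trainer = MiniTrainer(ToyModel())
    batch = {"input_ids": torch.randn(8, 4), "labels": torch.randint(0, 2, (8,))}
    print(trainer.training_step(batch))  # scalar training loss

Evaluation and prediction deliberately bypass the wrapper: evaluate() and predict() unwrap back to the plain PyTorch model, since ORT-backed evaluation is still marked as a TODO in the series.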
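
On the arguments side the wrapper is opt-in: ORTTrainingArguments grows a use_module_with_loss field (default False) and ORTTrainer only builds the wrapper when it is set. A minimal way to switch it on, assuming an optimum build that already contains these patches; everything besides the new flag is the usual transformers-style TrainingArguments handling:

from optimum.onnxruntime import ORTTrainingArguments

# use_module_with_loss is the flag added by this series; the other fields are
# standard TrainingArguments options.
args = ORTTrainingArguments(
    output_dir="out",
    label_smoothing_factor=0.1,
    use_module_with_loss=True,
)
print(args.use_module_with_loss)  # True
# An ORTTrainer constructed with these args (ORTTrainer(model=..., args=args, ...))
# wraps the model in ModuleWithLoss for train() and falls back to the plain
# PyTorch model for evaluate()/predict().

When a training script parses its options with HfArgumentParser, the same switch is exposed on the command line as --use_module_with_loss, which is how the earlier commits in the series refer to it.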