diff --git a/optimum/onnxruntime/trainer.py b/optimum/onnxruntime/trainer.py
index cc9c00efab..0253e0b39e 100644
--- a/optimum/onnxruntime/trainer.py
+++ b/optimum/onnxruntime/trainer.py
@@ -23,7 +23,6 @@ import warnings
 from pathlib import Path
 from typing import TYPE_CHECKING, Any, Callable, Dict, List, Optional, Tuple, Type, Union
 
-from tqdm.auto import tqdm
 
 
 # Integrations must be imported before ML frameworks:
@@ -51,7 +50,6 @@ from transformers.debug_utils import DebugOption, DebugUnderflowOverflow
 from transformers.deepspeed import deepspeed_init, is_deepspeed_zero3_enabled
 from transformers.dependency_versions_check import dep_version_check
-from transformers.models.auto.modeling_auto import MODEL_FOR_CAUSAL_LM_MAPPING_NAMES
 from transformers.file_utils import (
     is_apex_available,
     is_sagemaker_dp_enabled,
@@ -146,6 +144,8 @@ def __init__(self, model, args) -> None:
         super().__init__()
         self._original_model = model
         self.args = args
+
+        # Create an instance of the Hugging Face Trainer so we can reuse its compute_loss() logic and avoid duplicated code.
         self.hf_trainer = Trainer(model)
         # Label smoothing
         if self.args.label_smoothing_factor != 0:
diff --git a/optimum/onnxruntime/training_args.py b/optimum/onnxruntime/training_args.py
index 9b213b2921..bf467bb908 100644
--- a/optimum/onnxruntime/training_args.py
+++ b/optimum/onnxruntime/training_args.py
@@ -67,7 +67,7 @@ class ORTTrainingArguments(TrainingArguments):
     loss_in_train: Optional[bool] = field(
         default=False,
-        metadata={"help": "Use ModuleWithLoss Wrapper to compute loss inside the training loop."},
+        metadata={"help": "Use the ModuleWithLoss wrapper to compute the loss inside the training loop. When the label smoother is not None, this helps save memory for ORTModule runs."},
     )
 
     # This method will not need to be overriden after the deprecation of `--adafactor` in version 5 of 🤗 Transformers.
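
A minimal usage sketch (not part of the diff above): it assumes the public `ORTTrainer`/`ORTTrainingArguments` API of `optimum.onnxruntime` at this revision, and the checkpoint name, toy dataset, and hyperparameters are placeholders chosen only for illustration; constructor arguments may differ slightly between optimum versions.

```python
from datasets import Dataset
from transformers import AutoModelForSequenceClassification, AutoTokenizer

from optimum.onnxruntime import ORTTrainer, ORTTrainingArguments

model_id = "distilbert-base-uncased"  # placeholder checkpoint
tokenizer = AutoTokenizer.from_pretrained(model_id)
model = AutoModelForSequenceClassification.from_pretrained(model_id, num_labels=2)

# Tiny toy dataset, tokenized up front so the trainer only sees model inputs.
raw = Dataset.from_dict({"text": ["great movie", "terrible movie"], "label": [1, 0]})
train_dataset = raw.map(
    lambda ex: tokenizer(ex["text"], truncation=True, padding="max_length", max_length=32)
)

args = ORTTrainingArguments(
    output_dir="./ort_output",
    per_device_train_batch_size=2,
    num_train_epochs=1,
    label_smoothing_factor=0.1,  # the wrapper is most useful when a label smoother is active
    loss_in_train=True,          # the flag added in this diff: compute the loss inside the training loop
)

trainer = ORTTrainer(model=model, args=args, train_dataset=train_dataset, tokenizer=tokenizer)
trainer.train()
```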