From 69c6f111be5b243f92d8d480350bd06355eb389c Mon Sep 17 00:00:00 2001 From: adamlouly Date: Mon, 9 Jan 2023 17:22:42 +0000 Subject: [PATCH 01/16] improved solutio --- optimum/onnxruntime/trainer.py | 111 ++++++++++++++++++++++++++++++--- 1 file changed, 102 insertions(+), 9 deletions(-) diff --git a/optimum/onnxruntime/trainer.py b/optimum/onnxruntime/trainer.py index edc4181dc1..3164e4f746 100644 --- a/optimum/onnxruntime/trainer.py +++ b/optimum/onnxruntime/trainer.py @@ -129,6 +129,54 @@ SCALER_NAME = "scaler.pt" +class ModuleWithLoss(nn.Module): + def __init__(self, model, args) -> None: + super().__init__() + self._original_model = model + self.args = args + # Label smoothing + if self.args.label_smoothing_factor != 0: + from transformers.trainer_pt_utils import LabelSmoother + + self.label_smoother = LabelSmoother(epsilon=self.args.label_smoothing_factor) + else: + self.label_smoother = None + + def forward(self, inputs: Dict[str, Union[torch.Tensor, Any]], return_outputs): + if self.label_smoother is not None and "labels" in inputs: + labels = inputs.pop("labels") + else: + labels = None + outputs = self._original_model(**inputs) + + # Save past state if it exists + # TODO: this needs to be fixed and made cleaner later. + if self.args.past_index >= 0: + self._past = outputs[self.args.past_index] + + if labels is not None: + from transformers.models.auto.modeling_auto import MODEL_FOR_CAUSAL_LM_MAPPING_NAMES + + if unwrap_model(self._original_model)._get_name() in MODEL_FOR_CAUSAL_LM_MAPPING_NAMES.values(): + loss = self.label_smoother(outputs, labels, shift_labels=True) + else: + loss = self.label_smoother(outputs, labels) + else: + if isinstance(outputs, dict) and "loss" not in outputs: + raise ValueError( + "The model did not return a loss from the inputs, only the following keys: " + f"{','.join(outputs.keys())}. For reference, the inputs it received are {','.join(inputs.keys())}." + ) + # We don't use .loss here since the model may return tuples instead of ModelOutput. + loss = outputs["loss"] if isinstance(outputs, dict) else outputs[0] + + return (loss, outputs) if return_outputs else loss + + @property + def config(self): + return self._original_model.config + + class ORTFeaturesManager: _TASKS_TO_ORTMODELS = { "default": ORTModelForFeatureExtraction, @@ -259,13 +307,62 @@ def __init__( optimizers=optimizers, preprocess_logits_for_metrics=preprocess_logits_for_metrics, ) - + self._training_model = ModuleWithLoss(model, args) + self.model = model + self._inferencing_model = model self.feature = feature self.onnx_model_path = onnx_model_path self.exported_with_loss = False if self.args.local_rank: torch.cuda.set_device(self.args.local_rank) + def _set_signature_columns_if_needed(self): + if self._signature_columns is None: + # Inspect model forward signature to keep only the arguments it accepts. + import inspect + + if self.model == self._training_model: + signature = inspect.signature(self.model._original_model.forward) + else: + signature = inspect.signature(self.model.forward) + + self._signature_columns = list(signature.parameters.keys()) + # Labels may be named label or label_ids, the default data collator handles that. + self._signature_columns += list(set(["label", "label_ids"] + self.label_names)) + + def compute_loss(self, model_with_loss, inputs, return_outputs=False): + # Run model forward + loss compute. 
+ if self.model == self._training_model: + outputs = model_with_loss(inputs, return_outputs) + return outputs + else: + + if self.label_smoother is not None and "labels" in inputs: + labels = inputs.pop("labels") + else: + labels = None + outputs = model_with_loss(**inputs) + # Save past state if it exists + # TODO: this needs to be fixed and made cleaner later. + if self.args.past_index >= 0: + self._past = outputs[self.args.past_index] + + if labels is not None: + if unwrap_model(model_with_loss)._get_name() in MODEL_FOR_CAUSAL_LM_MAPPING_NAMES.values(): + loss = self.label_smoother(outputs, labels, shift_labels=True) + else: + loss = self.label_smoother(outputs, labels) + else: + if isinstance(outputs, dict) and "loss" not in outputs: + raise ValueError( + "The model did not return a loss from the inputs, only the following keys: " + f"{','.join(outputs.keys())}. For reference, the inputs it received are {','.join(inputs.keys())}." + ) + # We don't use .loss here since the model may return tuples instead of ModelOutput. + loss = outputs["loss"] if isinstance(outputs, dict) else outputs[0] + + return (loss, outputs) if return_outputs else loss + def train( self, resume_from_checkpoint: Optional[Union[str, bool]] = None, @@ -289,7 +386,7 @@ def train( kwargs: Additional keyword arguments used to hide deprecated arguments """ - + self.model = self._training_model if resume_from_checkpoint is False: resume_from_checkpoint = None @@ -437,7 +534,7 @@ def _inner_training_loop( RuntimeWarning, ) - self.model = model + self.model = self._training_model deepspeed_engine, optimizer, lr_scheduler = deepspeed_init( self, num_training_steps=max_steps, resume_from_checkpoint=resume_from_checkpoint ) @@ -564,7 +661,6 @@ def _inner_training_loop( # Otherwise we need to call the whooooole sampler cause there is some random operation added # AT THE VERY END! _ = list(train_dataloader.sampler) - for epoch in range(epochs_trained, num_train_epochs): if isinstance(train_dataloader, DataLoader) and isinstance(train_dataloader.sampler, DistributedSampler): train_dataloader.sampler.set_epoch(epoch) @@ -774,6 +870,7 @@ def evaluate( dictionary also contains the epoch number which comes from the training state. 
""" # memory metrics - must set up as early as possible + self.model = self._inferencing_model self._memory_tracker.start() eval_dataloader = self.get_eval_dataloader(eval_dataset) @@ -1609,11 +1706,7 @@ def create_optimizer(self): optimizer_cls, optimizer_kwargs = Trainer.get_optimizer_cls_and_kwargs(self.args) if self.sharded_ddp == ShardedDDPOption.SIMPLE: - self.optimizer = OSS( - params=optimizer_grouped_parameters, - optim=optimizer_cls, - **optimizer_kwargs, - ) + self.optimizer = OSS(params=optimizer_grouped_parameters, optim=optimizer_cls, **optimizer_kwargs,) else: self.optimizer = optimizer_cls(optimizer_grouped_parameters, **optimizer_kwargs) if optimizer_cls.__name__ == "Adam8bit": From c918008982ffb033af0c9520a0adbf1ecf530460 Mon Sep 17 00:00:00 2001 From: adamlouly Date: Tue, 10 Jan 2023 23:39:24 +0000 Subject: [PATCH 02/16] compute loss fix --- optimum/onnxruntime/trainer.py | 48 +++++++++++++++++++--------------- 1 file changed, 27 insertions(+), 21 deletions(-) diff --git a/optimum/onnxruntime/trainer.py b/optimum/onnxruntime/trainer.py index 3164e4f746..d1d1b55052 100644 --- a/optimum/onnxruntime/trainer.py +++ b/optimum/onnxruntime/trainer.py @@ -23,7 +23,6 @@ import warnings from pathlib import Path from typing import TYPE_CHECKING, Any, Callable, Dict, List, Optional, Tuple, Type, Union - from tqdm.auto import tqdm @@ -33,6 +32,7 @@ is_fairscale_available, ) + import numpy as np import torch import torch.distributed as dist @@ -46,6 +46,7 @@ from transformers.debug_utils import DebugOption, DebugUnderflowOverflow from transformers.deepspeed import deepspeed_init, is_deepspeed_zero3_enabled from transformers.dependency_versions_check import dep_version_check +from transformers.models.auto.modeling_auto import MODEL_FOR_CAUSAL_LM_MAPPING_NAMES from transformers.file_utils import ( CONFIG_NAME, WEIGHTS_NAME, @@ -341,27 +342,28 @@ def compute_loss(self, model_with_loss, inputs, return_outputs=False): labels = inputs.pop("labels") else: labels = None - outputs = model_with_loss(**inputs) - # Save past state if it exists - # TODO: this needs to be fixed and made cleaner later. - if self.args.past_index >= 0: - self._past = outputs[self.args.past_index] - - if labels is not None: - if unwrap_model(model_with_loss)._get_name() in MODEL_FOR_CAUSAL_LM_MAPPING_NAMES.values(): - loss = self.label_smoother(outputs, labels, shift_labels=True) - else: - loss = self.label_smoother(outputs, labels) + + outputs = model_with_loss(**inputs) + # Save past state if it exists + # TODO: this needs to be fixed and made cleaner later. + if self.args.past_index >= 0: + self._past = outputs[self.args.past_index] + + if labels is not None: + if unwrap_model(model_with_loss)._get_name() in MODEL_FOR_CAUSAL_LM_MAPPING_NAMES.values(): + loss = self.label_smoother(outputs, labels, shift_labels=True) else: - if isinstance(outputs, dict) and "loss" not in outputs: - raise ValueError( - "The model did not return a loss from the inputs, only the following keys: " - f"{','.join(outputs.keys())}. For reference, the inputs it received are {','.join(inputs.keys())}." - ) - # We don't use .loss here since the model may return tuples instead of ModelOutput. - loss = outputs["loss"] if isinstance(outputs, dict) else outputs[0] + loss = self.label_smoother(outputs, labels) + else: + if isinstance(outputs, dict) and "loss" not in outputs: + raise ValueError( + "The model did not return a loss from the inputs, only the following keys: " + f"{','.join(outputs.keys())}. 
For reference, the inputs it received are {','.join(inputs.keys())}." + ) + # We don't use .loss here since the model may return tuples instead of ModelOutput. + loss = outputs["loss"] if isinstance(outputs, dict) else outputs[0] - return (loss, outputs) if return_outputs else loss + return (loss, outputs) if return_outputs else loss def train( self, @@ -1706,7 +1708,11 @@ def create_optimizer(self): optimizer_cls, optimizer_kwargs = Trainer.get_optimizer_cls_and_kwargs(self.args) if self.sharded_ddp == ShardedDDPOption.SIMPLE: - self.optimizer = OSS(params=optimizer_grouped_parameters, optim=optimizer_cls, **optimizer_kwargs,) + self.optimizer = OSS( + params=optimizer_grouped_parameters, + optim=optimizer_cls, + **optimizer_kwargs, + ) else: self.optimizer = optimizer_cls(optimizer_grouped_parameters, **optimizer_kwargs) if optimizer_cls.__name__ == "Adam8bit": From 7e968107cf4972e1eb0b4bccb6e0ca7edea1d649 Mon Sep 17 00:00:00 2001 From: Adam Louly Date: Wed, 15 Feb 2023 22:04:17 +0000 Subject: [PATCH 03/16] esolved comments --- optimum/onnxruntime/trainer.py | 52 +++++++++++++++++----------------- 1 file changed, 26 insertions(+), 26 deletions(-) diff --git a/optimum/onnxruntime/trainer.py b/optimum/onnxruntime/trainer.py index caa5d1e0db..f7885a2892 100644 --- a/optimum/onnxruntime/trainer.py +++ b/optimum/onnxruntime/trainer.py @@ -27,10 +27,7 @@ # Integrations must be imported before ML frameworks: -from transformers.integrations import ( # isort: split - hp_params, - is_fairscale_available, -) +from transformers.integrations import hp_params, is_fairscale_available # isort: split import numpy as np @@ -144,22 +141,23 @@ def __init__(self, model, args) -> None: else: self.label_smoother = None - def forward(self, inputs: Dict[str, Union[torch.Tensor, Any]], return_outputs): + def compute_loss(self, model, inputs, return_outputs=False): + """ + How the loss is computed by Trainer. By default, all models return the loss in the first element. + Subclass and override for custom behavior. + """ if self.label_smoother is not None and "labels" in inputs: labels = inputs.pop("labels") else: labels = None - outputs = self._original_model(**inputs) - + outputs = model(**inputs) # Save past state if it exists # TODO: this needs to be fixed and made cleaner later. if self.args.past_index >= 0: self._past = outputs[self.args.past_index] if labels is not None: - from transformers.models.auto.modeling_auto import MODEL_FOR_CAUSAL_LM_MAPPING_NAMES - - if unwrap_model(self._original_model)._get_name() in MODEL_FOR_CAUSAL_LM_MAPPING_NAMES.values(): + if unwrap_model(model)._get_name() in MODEL_FOR_CAUSAL_LM_MAPPING_NAMES.values(): loss = self.label_smoother(outputs, labels, shift_labels=True) else: loss = self.label_smoother(outputs, labels) @@ -174,6 +172,9 @@ def forward(self, inputs: Dict[str, Union[torch.Tensor, Any]], return_outputs): return (loss, outputs) if return_outputs else loss + def forward(self, inputs: Dict[str, Union[torch.Tensor, Any]], return_outputs): + return self.compute_loss(self._original_model, inputs, return_outputs=False) + @property def config(self): return self._original_model.config @@ -309,6 +310,10 @@ def __init__( optimizers=optimizers, preprocess_logits_for_metrics=preprocess_logits_for_metrics, ) + + # We leverage both training_model and inference_model in conjunction with model. + # _training_model will be wrapped so it will use ORT and will use the overriden functions in ModuleWithLoss. 
+ # _inferencing_model will be storing the default version of the model and we will switch to it in case of eval/test. self._training_model = ModuleWithLoss(model, args) self.model = model self._inferencing_model = model @@ -318,12 +323,14 @@ def __init__( if self.args.local_rank: torch.cuda.set_device(self.args.local_rank) + # we assume that training_model and inference_model have the same forward signature column. + # self._signature_columns attribute only stores the first-time parsed signature def _set_signature_columns_if_needed(self): if self._signature_columns is None: # Inspect model forward signature to keep only the arguments it accepts. import inspect - if self.model == self._training_model: + if isinstance(self.model, ModuleWithLoss): signature = inspect.signature(self.model._original_model.forward) else: signature = inspect.signature(self.model.forward) @@ -394,7 +401,7 @@ def train( "You need to install `onnxruntime-training` to use `ORTTrainer` for training. Check out " "https://huggingface.co/docs/optimum/onnxruntime/usage_guides/trainer#install-onnx-runtime." ) - + self.model = self._training_model if resume_from_checkpoint is False: @@ -544,7 +551,6 @@ def _inner_training_loop( RuntimeWarning, ) - self.model = self._training_model deepspeed_engine, optimizer, lr_scheduler = deepspeed_init( self, num_training_steps=max_steps, resume_from_checkpoint=resume_from_checkpoint ) @@ -885,6 +891,7 @@ def evaluate( dictionary also contains the epoch number which comes from the training state. """ # memory metrics - must set up as early as possible + # TODO: We need to enable evaluation using ORT backend. self.model = self._inferencing_model self._memory_tracker.start() @@ -977,6 +984,9 @@ def predict( - metrics (`Dict[str, float]`, *optional*): The potential dictionary of metrics (if the dataset contained labels). """ + # TODO: We need to enable evaluation using ORT backend. + self.model = self._inferencing_model + # memory metrics - must set up as early as possible self._memory_tracker.start() @@ -994,10 +1004,7 @@ def predict( try: output = eval_loop( - test_dataloader, - description="Prediction", - ignore_keys=ignore_keys, - metric_key_prefix=metric_key_prefix, + test_dataloader, description="Prediction", ignore_keys=ignore_keys, metric_key_prefix=metric_key_prefix ) except Exception as error: logger.error(error) @@ -1740,11 +1747,7 @@ def create_optimizer(self): optimizer_cls, optimizer_kwargs = Trainer.get_optimizer_cls_and_kwargs(self.args) if self.sharded_ddp == ShardedDDPOption.SIMPLE: - self.optimizer = OSS( - params=optimizer_grouped_parameters, - optim=optimizer_cls, - **optimizer_kwargs, - ) + self.optimizer = OSS(params=optimizer_grouped_parameters, optim=optimizer_cls, **optimizer_kwargs) else: self.optimizer = optimizer_cls(optimizer_grouped_parameters, **optimizer_kwargs) if optimizer_cls.__name__ == "Adam8bit": @@ -1774,10 +1777,7 @@ def get_ort_optimizer_cls_and_kwargs(args: ORTTrainingArguments) -> Tuple[Any, A The training arguments for the training session. 
""" optimizer_kwargs = {"lr": args.learning_rate} - adam_kwargs = { - "betas": (args.adam_beta1, args.adam_beta2), - "eps": args.adam_epsilon, - } + adam_kwargs = {"betas": (args.adam_beta1, args.adam_beta2), "eps": args.adam_epsilon} if args.optim == ORTOptimizerNames.ADAMW_ORT_FUSED: try: from onnxruntime.training.optim import FusedAdam From 4732f2c2e7250a417567afb4f720918838e0be91 Mon Sep 17 00:00:00 2001 From: Adam Louly Date: Wed, 22 Feb 2023 21:50:04 +0000 Subject: [PATCH 04/16] removed duplicated code .. used main trainer compute loss --- optimum/onnxruntime/trainer.py | 64 +++------------------------------- 1 file changed, 5 insertions(+), 59 deletions(-) diff --git a/optimum/onnxruntime/trainer.py b/optimum/onnxruntime/trainer.py index f7885a2892..fee06bbcc8 100644 --- a/optimum/onnxruntime/trainer.py +++ b/optimum/onnxruntime/trainer.py @@ -133,6 +133,7 @@ def __init__(self, model, args) -> None: super().__init__() self._original_model = model self.args = args + self.hf_trainer = Trainer(model) # Label smoothing if self.args.label_smoothing_factor != 0: from transformers.trainer_pt_utils import LabelSmoother @@ -141,39 +142,8 @@ def __init__(self, model, args) -> None: else: self.label_smoother = None - def compute_loss(self, model, inputs, return_outputs=False): - """ - How the loss is computed by Trainer. By default, all models return the loss in the first element. - Subclass and override for custom behavior. - """ - if self.label_smoother is not None and "labels" in inputs: - labels = inputs.pop("labels") - else: - labels = None - outputs = model(**inputs) - # Save past state if it exists - # TODO: this needs to be fixed and made cleaner later. - if self.args.past_index >= 0: - self._past = outputs[self.args.past_index] - - if labels is not None: - if unwrap_model(model)._get_name() in MODEL_FOR_CAUSAL_LM_MAPPING_NAMES.values(): - loss = self.label_smoother(outputs, labels, shift_labels=True) - else: - loss = self.label_smoother(outputs, labels) - else: - if isinstance(outputs, dict) and "loss" not in outputs: - raise ValueError( - "The model did not return a loss from the inputs, only the following keys: " - f"{','.join(outputs.keys())}. For reference, the inputs it received are {','.join(inputs.keys())}." - ) - # We don't use .loss here since the model may return tuples instead of ModelOutput. - loss = outputs["loss"] if isinstance(outputs, dict) else outputs[0] - - return (loss, outputs) if return_outputs else loss - def forward(self, inputs: Dict[str, Union[torch.Tensor, Any]], return_outputs): - return self.compute_loss(self._original_model, inputs, return_outputs=False) + return self.hf_trainer.compute_loss(self._original_model, inputs, return_outputs=False) @property def config(self): @@ -345,33 +315,8 @@ def compute_loss(self, model_with_loss, inputs, return_outputs=False): outputs = model_with_loss(inputs, return_outputs) return outputs else: - - if self.label_smoother is not None and "labels" in inputs: - labels = inputs.pop("labels") - else: - labels = None - - outputs = model_with_loss(**inputs) - # Save past state if it exists - # TODO: this needs to be fixed and made cleaner later. 
- if self.args.past_index >= 0: - self._past = outputs[self.args.past_index] - - if labels is not None: - if unwrap_model(model_with_loss)._get_name() in MODEL_FOR_CAUSAL_LM_MAPPING_NAMES.values(): - loss = self.label_smoother(outputs, labels, shift_labels=True) - else: - loss = self.label_smoother(outputs, labels) - else: - if isinstance(outputs, dict) and "loss" not in outputs: - raise ValueError( - "The model did not return a loss from the inputs, only the following keys: " - f"{','.join(outputs.keys())}. For reference, the inputs it received are {','.join(inputs.keys())}." - ) - # We don't use .loss here since the model may return tuples instead of ModelOutput. - loss = outputs["loss"] if isinstance(outputs, dict) else outputs[0] - - return (loss, outputs) if return_outputs else loss + loss, outputs = super().compute_loss(model_with_loss, inputs, return_outputs) + return (loss, outputs) def train( self, @@ -551,6 +496,7 @@ def _inner_training_loop( RuntimeWarning, ) + self.model = model deepspeed_engine, optimizer, lr_scheduler = deepspeed_init( self, num_training_steps=max_steps, resume_from_checkpoint=resume_from_checkpoint ) From c47fa80f26f7fb1403c79155a39e1deed1e2c3d3 Mon Sep 17 00:00:00 2001 From: Adam Louly Date: Wed, 22 Feb 2023 23:31:13 +0000 Subject: [PATCH 05/16] added --loss_in_train flag --- optimum/onnxruntime/trainer.py | 13 +++++++++---- optimum/onnxruntime/training_args.py | 14 ++++++++++++++ 2 files changed, 23 insertions(+), 4 deletions(-) diff --git a/optimum/onnxruntime/trainer.py b/optimum/onnxruntime/trainer.py index fee06bbcc8..01428b80c3 100644 --- a/optimum/onnxruntime/trainer.py +++ b/optimum/onnxruntime/trainer.py @@ -284,7 +284,13 @@ def __init__( # We leverage both training_model and inference_model in conjunction with model. # _training_model will be wrapped so it will use ORT and will use the overriden functions in ModuleWithLoss. # _inferencing_model will be storing the default version of the model and we will switch to it in case of eval/test. - self._training_model = ModuleWithLoss(model, args) + + # Only Wrap the model if we pass --loss_in_train flag. + if args.loss_in_train: + self._training_model = ModuleWithLoss(model, args) + else: + self._training_model = model + self.model = model self._inferencing_model = model self.feature = feature @@ -311,12 +317,11 @@ def _set_signature_columns_if_needed(self): def compute_loss(self, model_with_loss, inputs, return_outputs=False): # Run model forward + loss compute. - if self.model == self._training_model: + if self.args.loss_in_train and self.model == self._training_model: outputs = model_with_loss(inputs, return_outputs) return outputs else: - loss, outputs = super().compute_loss(model_with_loss, inputs, return_outputs) - return (loss, outputs) + return super().compute_loss(self.model, inputs, return_outputs) def train( self, diff --git a/optimum/onnxruntime/training_args.py b/optimum/onnxruntime/training_args.py index 85456c7ad7..fb3e37aac8 100644 --- a/optimum/onnxruntime/training_args.py +++ b/optimum/onnxruntime/training_args.py @@ -65,6 +65,11 @@ class ORTTrainingArguments(TrainingArguments): metadata={"help": "The optimizer to use."}, ) + loss_in_train: Optional[bool] = field( + default=False, + metadata={"help": "Use ModuleWithLoss Wrapper to compute loss inside the training loop."}, + ) + # This method will not need to be overriden after the deprecation of `--adafactor` in version 5 of 🤗 Transformers. 
def __post_init__(self): # Handle --use_env option in torch.distributed.launch (local_rank not passed as an arg then). @@ -337,3 +342,12 @@ def __post_init__(self): f"{self.hub_model_id}).", FutureWarning, ) + if self.loss_in_train is True: + logger.info( + "Using ModuleWithLoss Wrapper." + "loss will be computed during training loop and it will save memory peak " + ) + else: + logger.info( + "Not Using ModuleWithLoss Wrapper." + ) From c45bd53757f2b789139ff0076288d875940699ea Mon Sep 17 00:00:00 2001 From: Adam Louly Date: Thu, 23 Feb 2023 22:29:30 +0000 Subject: [PATCH 06/16] resolved comments --- optimum/onnxruntime/trainer.py | 4 ++-- optimum/onnxruntime/training_args.py | 2 +- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/optimum/onnxruntime/trainer.py b/optimum/onnxruntime/trainer.py index cc9c00efab..0253e0b39e 100644 --- a/optimum/onnxruntime/trainer.py +++ b/optimum/onnxruntime/trainer.py @@ -23,7 +23,6 @@ import warnings from pathlib import Path from typing import TYPE_CHECKING, Any, Callable, Dict, List, Optional, Tuple, Type, Union -from tqdm.auto import tqdm # Integrations must be imported before ML frameworks: @@ -51,7 +50,6 @@ from transformers.debug_utils import DebugOption, DebugUnderflowOverflow from transformers.deepspeed import deepspeed_init, is_deepspeed_zero3_enabled from transformers.dependency_versions_check import dep_version_check -from transformers.models.auto.modeling_auto import MODEL_FOR_CAUSAL_LM_MAPPING_NAMES from transformers.file_utils import ( is_apex_available, is_sagemaker_dp_enabled, @@ -146,6 +144,8 @@ def __init__(self, model, args) -> None: super().__init__() self._original_model = model self.args = args + + # Creating an instance of huggingFace Trainer so we can use compute_loss() logic and avoid duplicated code. self.hf_trainer = Trainer(model) # Label smoothing if self.args.label_smoothing_factor != 0: diff --git a/optimum/onnxruntime/training_args.py b/optimum/onnxruntime/training_args.py index 9b213b2921..bf467bb908 100644 --- a/optimum/onnxruntime/training_args.py +++ b/optimum/onnxruntime/training_args.py @@ -67,7 +67,7 @@ class ORTTrainingArguments(TrainingArguments): loss_in_train: Optional[bool] = field( default=False, - metadata={"help": "Use ModuleWithLoss Wrapper to compute loss inside the training loop."}, + metadata={"help": "Use ModuleWithLoss Wrapper to compute loss inside the training loop, when label smoother is NOT none having this will help save memory for ORTMOdule Runs."}, ) # This method will not need to be overriden after the deprecation of `--adafactor` in version 5 of 🤗 Transformers. 
From 31178c8e1c8d67482d2a16a63470e9c01c6cb3b5 Mon Sep 17 00:00:00 2001 From: Adam Louly Date: Thu, 2 Mar 2023 08:41:51 +0000 Subject: [PATCH 07/16] resolved comments --- optimum/onnxruntime/trainer.py | 114 ++++++++++++++++++--------- optimum/onnxruntime/training_args.py | 17 ++-- 2 files changed, 84 insertions(+), 47 deletions(-) diff --git a/optimum/onnxruntime/trainer.py b/optimum/onnxruntime/trainer.py index 0253e0b39e..1e2de8b486 100644 --- a/optimum/onnxruntime/trainer.py +++ b/optimum/onnxruntime/trainer.py @@ -28,17 +28,6 @@ # Integrations must be imported before ML frameworks: from transformers.integrations import hp_params, is_fairscale_available # isort: split - - -# Integrations must be imported before ML frameworks: -# isort: off -from transformers.integrations import ( - hp_params, - is_fairscale_available, -) - -# isort: on - import numpy as np import torch import torch.distributed as dist @@ -140,23 +129,48 @@ class ModuleWithLoss(nn.Module): - def __init__(self, model, args) -> None: + def __init__(self, model, args, label_smoother) -> None: super().__init__() self._original_model = model self.args = args + self.label_smoother = label_smoother + + def forward(self, inputs: Dict[str, Union[torch.Tensor, Any]], return_outputs): + if self.label_smoother is not None and "labels" in inputs: + labels = inputs.pop("labels") + else: + labels = None + outputs = self._original_model(**inputs) - # Creating an instance of huggingFace Trainer so we can use compute_loss() logic and avoid duplicated code. - self.hf_trainer = Trainer(model) - # Label smoothing - if self.args.label_smoothing_factor != 0: - from transformers.trainer_pt_utils import LabelSmoother + # Save past state if it exists + # TODO: this needs to be fixed and made cleaner later. + if self.args.past_index >= 0: + self._past = outputs[self.args.past_index] + + if labels is not None: + from transformers.models.auto.modeling_auto import MODEL_FOR_CAUSAL_LM_MAPPING_NAMES - self.label_smoother = LabelSmoother(epsilon=self.args.label_smoothing_factor) + if unwrap_model(self._original_model)._get_name() in MODEL_FOR_CAUSAL_LM_MAPPING_NAMES.values(): + loss = self.label_smoother(outputs, labels, shift_labels=True) + else: + loss = self.label_smoother(outputs, labels) else: - self.label_smoother = None + if isinstance(outputs, dict) and "loss" not in outputs: + raise ValueError( + "The model did not return a loss from the inputs, only the following keys: " + f"{','.join(outputs.keys())}. For reference, the inputs it received are {','.join(inputs.keys())}." + ) + # We don't use .loss here since the model may return tuples instead of ModelOutput. + loss = outputs["loss"] if isinstance(outputs, dict) else outputs[0] - def forward(self, inputs: Dict[str, Union[torch.Tensor, Any]], return_outputs): - return self.hf_trainer.compute_loss(self._original_model, inputs, return_outputs=False) + return (loss, outputs) if return_outputs else loss + + @property + def module(self): + """The original `torch.nn.Module` that this module wraps. + This property provides access to methods and properties on the original module.""" + + return self._original_model.module @property def config(self): @@ -310,22 +324,28 @@ def __init__( # We leverage both training_model and inference_model in conjunction with model. # _training_model will be wrapped so it will use ORT and will use the overriden functions in ModuleWithLoss. - # _inferencing_model will be storing the default version of the model and we will switch to it in case of eval/test. 
+ # _training_model will be storing the default version of the model and will unwrap it in case of eval/test. - # Only Wrap the model if we pass --loss_in_train flag. - if args.loss_in_train: - self._training_model = ModuleWithLoss(model, args) + # Only Wrap the model if we pass --use_module_with_loss flag. + if args.use_module_with_loss: + self._training_model = self.create_model_with_loss() else: self._training_model = model self.model = model - self._inferencing_model = model + self.feature = feature self.onnx_model_path = onnx_model_path self.exported_with_loss = False if self.args.local_rank: torch.cuda.set_device(self.args.local_rank) + # this method will create a ModuleWithLoss Instance to use if you are passing --use_module_with_loss flag. + # It will help reducing the peak memory usage by computing loss inside training. + def create_model_with_loss(self): + model_with_loss = ModuleWithLoss(self.model, self.args, self.label_smoother) + return model_with_loss + # we assume that training_model and inference_model have the same forward signature column. # self._signature_columns attribute only stores the first-time parsed signature def _set_signature_columns_if_needed(self): @@ -344,11 +364,37 @@ def _set_signature_columns_if_needed(self): def compute_loss(self, model_with_loss, inputs, return_outputs=False): # Run model forward + loss compute. - if self.args.loss_in_train and self.model == self._training_model: + if isinstance(self.model, ModuleWithLoss): outputs = model_with_loss(inputs, return_outputs) return outputs else: - return super().compute_loss(self.model, inputs, return_outputs) + + if self.label_smoother is not None and "labels" in inputs: + labels = inputs.pop("labels") + else: + labels = None + + outputs = model_with_loss(**inputs) + # Save past state if it exists + # TODO: this needs to be fixed and made cleaner later. + if self.args.past_index >= 0: + self._past = outputs[self.args.past_index] + + if labels is not None: + if unwrap_model(model_with_loss)._get_name() in MODEL_FOR_CAUSAL_LM_MAPPING_NAMES.values(): + loss = self.label_smoother(outputs, labels, shift_labels=True) + else: + loss = self.label_smoother(outputs, labels) + else: + if isinstance(outputs, dict) and "loss" not in outputs: + raise ValueError( + "The model did not return a loss from the inputs, only the following keys: " + f"{','.join(outputs.keys())}. For reference, the inputs it received are {','.join(inputs.keys())}." + ) + # We don't use .loss here since the model may return tuples instead of ModelOutput. + loss = outputs["loss"] if isinstance(outputs, dict) else outputs[0] + + return (loss, outputs) if return_outputs else loss def train( self, @@ -868,7 +914,7 @@ def evaluate( """ # memory metrics - must set up as early as possible # TODO: We need to enable evaluation using ORT backend. - self.model = self._inferencing_model + self.model = unwrap_model(self.model) self._memory_tracker.start() eval_dataloader = self.get_eval_dataloader(eval_dataset) @@ -961,7 +1007,7 @@ def predict( labels). """ # TODO: We need to enable evaluation using ORT backend. 
- self.model = self._inferencing_model + self.model = unwrap_model(self.model) # memory metrics - must set up as early as possible self._memory_tracker.start() @@ -1590,13 +1636,7 @@ def _export( opset = max(opset, 12) # Operators like `nll_loss`are added for opset>=12 output_path = model_path / ONNX_WEIGHTS_NAME - _ = export( - model=model, - config=onnx_config, - opset=opset, - output=output_path, - device=device, - ) + _ = export(model=model, config=onnx_config, opset=opset, output=output_path, device=device) model.config.save_pretrained(model_path) diff --git a/optimum/onnxruntime/training_args.py b/optimum/onnxruntime/training_args.py index bf467bb908..223fd58ecd 100644 --- a/optimum/onnxruntime/training_args.py +++ b/optimum/onnxruntime/training_args.py @@ -60,14 +60,13 @@ class ORTTrainingArguments(TrainingArguments): The optimizer to use, including optimizers in Transformers: adamw_hf, adamw_torch, adamw_apex_fused, or adafactor. And optimizers implemented by ONNX Runtime: adamw_ort_fused. """ - optim: Optional[str] = field( - default="adamw_hf", - metadata={"help": "The optimizer to use."}, - ) + optim: Optional[str] = field(default="adamw_hf", metadata={"help": "The optimizer to use."}) - loss_in_train: Optional[bool] = field( + use_module_with_loss: Optional[bool] = field( default=False, - metadata={"help": "Use ModuleWithLoss Wrapper to compute loss inside the training loop, when label smoother is NOT none having this will help save memory for ORTMOdule Runs."}, + metadata={ + "help": "Use ModuleWithLoss Wrapper to compute loss inside the training loop, when label smoother is NOT none having this will help save memory for ORTMOdule Runs." + }, ) # This method will not need to be overriden after the deprecation of `--adafactor` in version 5 of 🤗 Transformers. @@ -341,12 +340,10 @@ def __post_init__(self): f"{self.hub_model_id}).", FutureWarning, ) - if self.loss_in_train is True: + if self.use_module_with_loss is True: logger.info( "Using ModuleWithLoss Wrapper." "loss will be computed during training loop and it will save memory peak " ) else: - logger.info( - "Not Using ModuleWithLoss Wrapper." 
- ) + logger.info("Not Using ModuleWithLoss Wrapper.") From 19cfe045900410aa52c2e492759e4d60c3b5050e Mon Sep 17 00:00:00 2001 From: Adam Louly Date: Thu, 2 Mar 2023 17:54:54 +0000 Subject: [PATCH 08/16] formatter usng latest black --- optimum/onnxruntime/trainer.py | 1 - 1 file changed, 1 deletion(-) diff --git a/optimum/onnxruntime/trainer.py b/optimum/onnxruntime/trainer.py index 1e2de8b486..02c4985f41 100644 --- a/optimum/onnxruntime/trainer.py +++ b/optimum/onnxruntime/trainer.py @@ -368,7 +368,6 @@ def compute_loss(self, model_with_loss, inputs, return_outputs=False): outputs = model_with_loss(inputs, return_outputs) return outputs else: - if self.label_smoother is not None and "labels" in inputs: labels = inputs.pop("labels") else: From f268040dbaf3eec667f37aa8368dbdef226eff87 Mon Sep 17 00:00:00 2001 From: Adam Louly Date: Thu, 2 Mar 2023 17:59:51 +0000 Subject: [PATCH 09/16] add import for code quality --- optimum/onnxruntime/trainer.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/optimum/onnxruntime/trainer.py b/optimum/onnxruntime/trainer.py index 02c4985f41..34acf05833 100644 --- a/optimum/onnxruntime/trainer.py +++ b/optimum/onnxruntime/trainer.py @@ -380,6 +380,8 @@ def compute_loss(self, model_with_loss, inputs, return_outputs=False): self._past = outputs[self.args.past_index] if labels is not None: + from transformers.models.auto.modeling_auto import MODEL_FOR_CAUSAL_LM_MAPPING_NAMES + if unwrap_model(model_with_loss)._get_name() in MODEL_FOR_CAUSAL_LM_MAPPING_NAMES.values(): loss = self.label_smoother(outputs, labels, shift_labels=True) else: From 4d8624a8f5343c1be53079779009f508b1e00252 Mon Sep 17 00:00:00 2001 From: Adam Louly Date: Thu, 2 Mar 2023 18:05:03 +0000 Subject: [PATCH 10/16] formatter usng latest black --- optimum/onnxruntime/trainer.py | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/optimum/onnxruntime/trainer.py b/optimum/onnxruntime/trainer.py index 34acf05833..d4c2f96a0b 100644 --- a/optimum/onnxruntime/trainer.py +++ b/optimum/onnxruntime/trainer.py @@ -26,7 +26,13 @@ # Integrations must be imported before ML frameworks: -from transformers.integrations import hp_params, is_fairscale_available # isort: split +# isort: off +from transformers.integrations import ( + hp_params, + is_fairscale_available, +) + +# isort: on import numpy as np import torch From ee6ef104139dd402f9a1c7344e8cb433f1b236ac Mon Sep 17 00:00:00 2001 From: Adam Louly Date: Fri, 3 Mar 2023 19:41:30 +0000 Subject: [PATCH 11/16] readding super loss compute --- optimum/onnxruntime/trainer.py | 29 +--------------------------- optimum/onnxruntime/training_args.py | 2 +- 2 files changed, 2 insertions(+), 29 deletions(-) diff --git a/optimum/onnxruntime/trainer.py b/optimum/onnxruntime/trainer.py index d4c2f96a0b..80aeaa6101 100644 --- a/optimum/onnxruntime/trainer.py +++ b/optimum/onnxruntime/trainer.py @@ -374,34 +374,7 @@ def compute_loss(self, model_with_loss, inputs, return_outputs=False): outputs = model_with_loss(inputs, return_outputs) return outputs else: - if self.label_smoother is not None and "labels" in inputs: - labels = inputs.pop("labels") - else: - labels = None - - outputs = model_with_loss(**inputs) - # Save past state if it exists - # TODO: this needs to be fixed and made cleaner later. 
- if self.args.past_index >= 0: - self._past = outputs[self.args.past_index] - - if labels is not None: - from transformers.models.auto.modeling_auto import MODEL_FOR_CAUSAL_LM_MAPPING_NAMES - - if unwrap_model(model_with_loss)._get_name() in MODEL_FOR_CAUSAL_LM_MAPPING_NAMES.values(): - loss = self.label_smoother(outputs, labels, shift_labels=True) - else: - loss = self.label_smoother(outputs, labels) - else: - if isinstance(outputs, dict) and "loss" not in outputs: - raise ValueError( - "The model did not return a loss from the inputs, only the following keys: " - f"{','.join(outputs.keys())}. For reference, the inputs it received are {','.join(inputs.keys())}." - ) - # We don't use .loss here since the model may return tuples instead of ModelOutput. - loss = outputs["loss"] if isinstance(outputs, dict) else outputs[0] - - return (loss, outputs) if return_outputs else loss + return super().compute_loss(self.model, inputs, return_outputs) def train( self, diff --git a/optimum/onnxruntime/training_args.py b/optimum/onnxruntime/training_args.py index 223fd58ecd..00e01ef1c1 100644 --- a/optimum/onnxruntime/training_args.py +++ b/optimum/onnxruntime/training_args.py @@ -65,7 +65,7 @@ class ORTTrainingArguments(TrainingArguments): use_module_with_loss: Optional[bool] = field( default=False, metadata={ - "help": "Use ModuleWithLoss Wrapper to compute loss inside the training loop, when label smoother is NOT none having this will help save memory for ORTMOdule Runs." + "help": "Use ModuleWithLoss Wrapper to compute loss inside the training loop, having this will help save memory for ORTMOdule Runs." }, ) From dc8de71505aa834f33e3520b9c6c2f7e9ec07474 Mon Sep 17 00:00:00 2001 From: Adam Louly Date: Tue, 7 Mar 2023 02:31:20 +0000 Subject: [PATCH 12/16] resolv comments --- optimum/onnxruntime/trainer.py | 44 +++++++++------------------------- 1 file changed, 11 insertions(+), 33 deletions(-) diff --git a/optimum/onnxruntime/trainer.py b/optimum/onnxruntime/trainer.py index 80aeaa6101..868d1fcddc 100644 --- a/optimum/onnxruntime/trainer.py +++ b/optimum/onnxruntime/trainer.py @@ -139,37 +139,17 @@ def __init__(self, model, args, label_smoother) -> None: super().__init__() self._original_model = model self.args = args + # Creating an instance of huggingFace Trainer so we can use compute_loss() logic and avoid duplicated code. + self.hf_trainer = Trainer(model) + # Label smoothing self.label_smoother = label_smoother def forward(self, inputs: Dict[str, Union[torch.Tensor, Any]], return_outputs): - if self.label_smoother is not None and "labels" in inputs: - labels = inputs.pop("labels") - else: - labels = None - outputs = self._original_model(**inputs) - - # Save past state if it exists - # TODO: this needs to be fixed and made cleaner later. - if self.args.past_index >= 0: - self._past = outputs[self.args.past_index] - - if labels is not None: - from transformers.models.auto.modeling_auto import MODEL_FOR_CAUSAL_LM_MAPPING_NAMES - - if unwrap_model(self._original_model)._get_name() in MODEL_FOR_CAUSAL_LM_MAPPING_NAMES.values(): - loss = self.label_smoother(outputs, labels, shift_labels=True) - else: - loss = self.label_smoother(outputs, labels) - else: - if isinstance(outputs, dict) and "loss" not in outputs: - raise ValueError( - "The model did not return a loss from the inputs, only the following keys: " - f"{','.join(outputs.keys())}. For reference, the inputs it received are {','.join(inputs.keys())}." - ) - # We don't use .loss here since the model may return tuples instead of ModelOutput. 
- loss = outputs["loss"] if isinstance(outputs, dict) else outputs[0] + return self.hf_trainer.compute_loss(self._original_model, inputs, return_outputs=False) - return (loss, outputs) if return_outputs else loss + @property + def config(self): + return self._original_model.config @property def module(self): @@ -335,8 +315,6 @@ def __init__( # Only Wrap the model if we pass --use_module_with_loss flag. if args.use_module_with_loss: self._training_model = self.create_model_with_loss() - else: - self._training_model = model self.model = model @@ -404,8 +382,8 @@ def train( "You need to install `onnxruntime-training` to use `ORTTrainer` for training. Check out " "https://huggingface.co/docs/optimum/onnxruntime/usage_guides/trainer#install-onnx-runtime." ) - - self.model = self._training_model + if self.args.use_module_with_loss: + self.model = self._training_model if resume_from_checkpoint is False: resume_from_checkpoint = None @@ -534,10 +512,10 @@ def _inner_training_loop( or is_sagemaker_mp_enabled() or self.fsdp is not None ) - # Wrap the model with `ORTModule` logger.info("Wrap ORTModule for ONNX Runtime training.") - model = ORTModule(self.model) + from onnxruntime.training.ortmodule import ORTModule, DebugOptions, LogLevel + model = ORTModule(self.model, DebugOptions(save_onnx=True, log_level=LogLevel.VERBOSE, onnx_prefix="distil_bert")) self.model_wrapped = model if args.deepspeed: From 55ad1d2767c5957d96a6cb4551518328374a2136 Mon Sep 17 00:00:00 2001 From: Adam Louly Date: Tue, 7 Mar 2023 02:32:20 +0000 Subject: [PATCH 13/16] fix typo --- optimum/onnxruntime/training_args.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/optimum/onnxruntime/training_args.py b/optimum/onnxruntime/training_args.py index 00e01ef1c1..bf16b37b72 100644 --- a/optimum/onnxruntime/training_args.py +++ b/optimum/onnxruntime/training_args.py @@ -65,7 +65,7 @@ class ORTTrainingArguments(TrainingArguments): use_module_with_loss: Optional[bool] = field( default=False, metadata={ - "help": "Use ModuleWithLoss Wrapper to compute loss inside the training loop, having this will help save memory for ORTMOdule Runs." + "help": "Use ModuleWithLoss Wrapper to compute loss inside the training loop, having this will help save memory for ORTModule Runs." }, ) From 432efe547f2cd3b7fead033b07b7a4ccdc4b63cf Mon Sep 17 00:00:00 2001 From: Adam Louly Date: Wed, 8 Mar 2023 00:00:39 +0000 Subject: [PATCH 14/16] solve not exporting onnx models --- optimum/onnxruntime/trainer.py | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/optimum/onnxruntime/trainer.py b/optimum/onnxruntime/trainer.py index 868d1fcddc..a233d7e7f6 100644 --- a/optimum/onnxruntime/trainer.py +++ b/optimum/onnxruntime/trainer.py @@ -145,7 +145,7 @@ def __init__(self, model, args, label_smoother) -> None: self.label_smoother = label_smoother def forward(self, inputs: Dict[str, Union[torch.Tensor, Any]], return_outputs): - return self.hf_trainer.compute_loss(self._original_model, inputs, return_outputs=False) + return self.hf_trainer.compute_loss(self._original_model, inputs, return_outputs) @property def config(self): @@ -349,10 +349,9 @@ def _set_signature_columns_if_needed(self): def compute_loss(self, model_with_loss, inputs, return_outputs=False): # Run model forward + loss compute. 
if isinstance(self.model, ModuleWithLoss): - outputs = model_with_loss(inputs, return_outputs) - return outputs + return model_with_loss(inputs, return_outputs) else: - return super().compute_loss(self.model, inputs, return_outputs) + return super().compute_loss(model_with_loss, inputs, return_outputs) def train( self, @@ -512,10 +511,10 @@ def _inner_training_loop( or is_sagemaker_mp_enabled() or self.fsdp is not None ) + # Wrap the model with `ORTModule` logger.info("Wrap ORTModule for ONNX Runtime training.") - from onnxruntime.training.ortmodule import ORTModule, DebugOptions, LogLevel - model = ORTModule(self.model, DebugOptions(save_onnx=True, log_level=LogLevel.VERBOSE, onnx_prefix="distil_bert")) + model = ORTModule(self.model) self.model_wrapped = model if args.deepspeed: @@ -663,6 +662,7 @@ def _inner_training_loop( # Otherwise we need to call the whooooole sampler cause there is some random operation added # AT THE VERY END! _ = list(train_dataloader.sampler) + for epoch in range(epochs_trained, num_train_epochs): if isinstance(train_dataloader, DataLoader) and isinstance(train_dataloader.sampler, DistributedSampler): train_dataloader.sampler.set_epoch(epoch) From 2b5e57b4ddbbe00e7c0c9935a4f8b4cf77675bfd Mon Sep 17 00:00:00 2001 From: Adam Louly Date: Wed, 8 Mar 2023 18:13:44 +0000 Subject: [PATCH 15/16] dictionary casting , bind method --- optimum/onnxruntime/trainer.py | 21 +++++++++++---------- 1 file changed, 11 insertions(+), 10 deletions(-) diff --git a/optimum/onnxruntime/trainer.py b/optimum/onnxruntime/trainer.py index a233d7e7f6..c011b9ac33 100644 --- a/optimum/onnxruntime/trainer.py +++ b/optimum/onnxruntime/trainer.py @@ -23,7 +23,7 @@ import warnings from pathlib import Path from typing import TYPE_CHECKING, Any, Callable, Dict, List, Optional, Tuple, Type, Union - +import types # Integrations must be imported before ML frameworks: # isort: off @@ -135,21 +135,18 @@ class ModuleWithLoss(nn.Module): - def __init__(self, model, args, label_smoother) -> None: + def __init__(self, model, args, label_smoother): super().__init__() self._original_model = model self.args = args - # Creating an instance of huggingFace Trainer so we can use compute_loss() logic and avoid duplicated code. - self.hf_trainer = Trainer(model) # Label smoothing self.label_smoother = label_smoother def forward(self, inputs: Dict[str, Union[torch.Tensor, Any]], return_outputs): - return self.hf_trainer.compute_loss(self._original_model, inputs, return_outputs) - - @property - def config(self): - return self._original_model.config + # The compute_model_plus_loss_internal is assigned once the class is instantiated. + # It should have same signature as Trainer.compute_loss(). + # We do this to avoid potential un-synced states if we duplicated compute loss codes . + return self.compute_model_plus_loss_internal(self._original_model, inputs, return_outputs) @property def module(self): @@ -328,6 +325,8 @@ def __init__( # It will help reducing the peak memory usage by computing loss inside training. def create_model_with_loss(self): model_with_loss = ModuleWithLoss(self.model, self.args, self.label_smoother) + model_with_loss.compute_model_plus_loss_internal = types.MethodType(Trainer.compute_loss, model_with_loss) + return model_with_loss # we assume that training_model and inference_model have the same forward signature column. @@ -349,7 +348,9 @@ def _set_signature_columns_if_needed(self): def compute_loss(self, model_with_loss, inputs, return_outputs=False): # Run model forward + loss compute. 
if isinstance(self.model, ModuleWithLoss): - return model_with_loss(inputs, return_outputs) + # ORTModule Does not support the BatchEncoding Type so we have to convert to a dict. + dict_inputs = {k: v for k, v in inputs.items()} + return model_with_loss(dict_inputs, return_outputs) else: return super().compute_loss(model_with_loss, inputs, return_outputs) From b6ccb53a5450ba2eabc13c02dc582c308f9ba4e3 Mon Sep 17 00:00:00 2001 From: adamlouly Date: Thu, 23 Mar 2023 19:36:43 +0000 Subject: [PATCH 16/16] trainer fix with ruff --- optimum/onnxruntime/trainer.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/optimum/onnxruntime/trainer.py b/optimum/onnxruntime/trainer.py index c011b9ac33..29b3183862 100644 --- a/optimum/onnxruntime/trainer.py +++ b/optimum/onnxruntime/trainer.py @@ -20,10 +20,11 @@ import shutil import sys import time +import types import warnings from pathlib import Path from typing import TYPE_CHECKING, Any, Callable, Dict, List, Optional, Tuple, Type, Union -import types + # Integrations must be imported before ML frameworks: # isort: off @@ -349,7 +350,7 @@ def compute_loss(self, model_with_loss, inputs, return_outputs=False): # Run model forward + loss compute. if isinstance(self.model, ModuleWithLoss): # ORTModule Does not support the BatchEncoding Type so we have to convert to a dict. - dict_inputs = {k: v for k, v in inputs.items()} + dict_inputs = dict(inputs.items()) return model_with_loss(dict_inputs, return_outputs) else: return super().compute_loss(model_with_loss, inputs, return_outputs)
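
Taken together, the series lands on a small pattern: wrap the model in ModuleWithLoss so the loss is computed inside forward() (and therefore inside the graph that ORTModule compiles, which is what lowers the peak memory), bind the stock Trainer.compute_loss onto the wrapper with types.MethodType instead of duplicating the label-smoothing logic, and convert BatchEncoding inputs to a plain dict before calling the wrapper. The sketch below condenses that final state into a self-contained toy; ToyModel, MiniTrainer and the training-step plumbing are illustrative stand-ins, not the real optimum/transformers classes, and the ORTModule call is left as a comment.

import types
from typing import Any, Dict

import torch
from torch import nn


class ModuleWithLoss(nn.Module):
    """Computes the loss inside forward() so it ends up in the compiled graph."""

    def __init__(self, model: nn.Module) -> None:
        super().__init__()
        self._original_model = model
        # Bound after construction (see MiniTrainer) so the loss logic is not
        # duplicated here. In the real wrapper, `self` also carries `args` and
        # `label_smoother`, because Trainer.compute_loss reads them from `self`.
        self.compute_model_plus_loss_internal = None

    def forward(self, inputs: Dict[str, Any], return_outputs: bool):
        return self.compute_model_plus_loss_internal(self._original_model, inputs, return_outputs)


class ToyModel(nn.Module):
    def __init__(self) -> None:
        super().__init__()
        self.linear = nn.Linear(4, 2)

    def forward(self, input_ids=None, labels=None):
        logits = self.linear(input_ids)
        loss = nn.functional.cross_entropy(logits, labels) if labels is not None else None
        return {"loss": loss, "logits": logits}


class MiniTrainer:
    """Stand-in for the Trainer plumbing these patches touch."""

    def __init__(self, model: nn.Module, use_module_with_loss: bool = True) -> None:
        self.model = model
        if use_module_with_loss:
            wrapper = ModuleWithLoss(model)
            # Reuse this class's compute_loss by binding it to the wrapper,
            # mirroring types.MethodType(Trainer.compute_loss, model_with_loss).
            wrapper.compute_model_plus_loss_internal = types.MethodType(MiniTrainer.compute_loss, wrapper)
            self.model = wrapper
        # The real trainer then wraps the training model once more:
        #   model = ORTModule(self.model)

    def compute_loss(self, model, inputs, return_outputs=False):
        outputs = model(**inputs)
        loss = outputs["loss"] if isinstance(outputs, dict) else outputs[0]
        return (loss, outputs) if return_outputs else loss

    def training_step(self, inputs: Dict[str, Any]):
        if isinstance(self.model, ModuleWithLoss):
            # ORTModule does not accept BatchEncoding inputs, so hand it a plain dict.
            dict_inputs = dict(inputs.items())
            return self.model(dict_inputs, False)
        return self.compute_loss(self.model, inputs)


if __name__ == "__main__":
    trainer = MiniTrainer(ToyModel())
    batch = {"input_ids": torch.randn(8, 4), "labels": torch.randint(0, 2, (8,))}
    print(trainer.training_step(batch))  # scalar training loss

Evaluation and prediction deliberately bypass the wrapper: evaluate() and predict() unwrap back to the plain PyTorch model, since ORT-backed evaluation is still marked as a TODO in the series.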
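
On the arguments side the wrapper is opt-in: ORTTrainingArguments grows a use_module_with_loss field (default False) and ORTTrainer only builds the wrapper when it is set. A minimal way to switch it on, assuming an optimum build that already contains these patches; everything besides the new flag is the usual transformers-style TrainingArguments handling:

from optimum.onnxruntime import ORTTrainingArguments

# use_module_with_loss is the flag added by this series; the other fields are
# standard TrainingArguments options.
args = ORTTrainingArguments(
    output_dir="out",
    label_smoothing_factor=0.1,
    use_module_with_loss=True,
)
print(args.use_module_with_loss)  # True
# An ORTTrainer constructed with these args (ORTTrainer(model=..., args=args, ...))
# wraps the model in ModuleWithLoss for train() and falls back to the plain
# PyTorch model for evaluate()/predict().

When a training script parses its options with HfArgumentParser, the same switch is exposed on the command line as --use_module_with_loss, which is how the earlier commits in the series refer to it.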