From dc26acfc953496099677926be9a2cd6d4f816e7a Mon Sep 17 00:00:00 2001 From: Jeffrey Huynh Date: Sun, 17 Mar 2024 20:16:05 +0000 Subject: [PATCH] TPU/Neuron: When save_safetensor=False, no need to move model to CPU save_safetensor=True is the default as of release 4.35.0, which then required the TPU hotfix https://github.com/huggingface/transformers/pull/27799 (issue https://github.com/huggingface/transformers/issues/27578). However, when the flag save_safetensor is set to False (compatibility mode), moving the model to CPU causes generation of too many graphs during checkpointing https://github.com/huggingface/transformers/issues/28438. This PR disables moving the model to CPU when save_safetensor=False. --- src/transformers/trainer.py | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/src/transformers/trainer.py b/src/transformers/trainer.py index f40645d3ac5f47..176aea3fffdf63 100755 --- a/src/transformers/trainer.py +++ b/src/transformers/trainer.py @@ -3013,7 +3013,8 @@ def _save_tpu(self, output_dir: Optional[str] = None): logger.info(f"Saving model checkpoint to {output_dir}") model = self.model xm.mark_step() - model.to("cpu") + if self.args.save_safetensors: + model.to("cpu") if xm.is_master_ordinal(): os.makedirs(output_dir, exist_ok=True) @@ -3048,7 +3049,8 @@ def _save_tpu(self, output_dir: Optional[str] = None): # We moved the model from TPU -> CPU for saving the weights. # Now we should move it back to subsequent compute still works. - model.to(self.args.device) + if self.args.save_safetensors: + model.to(self.args.device) def _save(self, output_dir: Optional[str] = None, state_dict=None): # If we are executing this function, we are the process zero, so we don't check for that.