From 43980cdb12a204264e953dbd4ada92d704078459 Mon Sep 17 00:00:00 2001
From: younesbelkada
Date: Sun, 18 Feb 2024 13:26:29 +0000
Subject: [PATCH 1/3] add RMSProp to Trainer

---
 src/transformers/trainer.py       | 20 +++++++-----
 src/transformers/training_args.py |  3 ++
 tests/trainer/test_trainer.py     | 51 +++++++++++++++++++++++++++++++
 3 files changed, 67 insertions(+), 7 deletions(-)

diff --git a/src/transformers/trainer.py b/src/transformers/trainer.py
index a930eaccef461c..e2580cc25008ff 100755
--- a/src/transformers/trainer.py
+++ b/src/transformers/trainer.py
@@ -1083,9 +1083,12 @@ def get_optimizer_cls_and_kwargs(args: TrainingArguments) -> Tuple[Any, Any]:
             OptimizerNames.LION_8BIT,
             OptimizerNames.PAGED_LION,
             OptimizerNames.PAGED_LION_8BIT,
+            OptimizerNames.RMSPROP_BNB,
+            OptimizerNames.RMSPROP_8BIT,
+            OptimizerNames.RMSPROP_32BIT,
         ]:
             try:
-                from bitsandbytes.optim import AdamW, Lion
+                from bitsandbytes.optim import AdamW, Lion, RMSprop
 
                 is_paged = False
                 optim_bits = 32
@@ -1100,8 +1103,16 @@ def get_optimizer_cls_and_kwargs(args: TrainingArguments) -> Tuple[Any, Any]:
                 elif "lion" in args.optim:
                     optimizer_cls = Lion
                     additional_optim_kwargs = {"betas": (args.adam_beta1, args.adam_beta2)}
+                elif "rmsprop" in args.optim:
+                    optimizer_cls = RMSprop
+                    # Above we pass all `adam_kwargs` to the optimizer, here
+                    # we only pass `optim_args` which can be passed to the user.
+                    additional_optim_kwargs = optim_args
+
+                bnb_kwargs = {"optim_bits": optim_bits}
+                if "rmsprop" not in args.optim:
+                    bnb_kwargs["is_paged"] = is_paged
 
-                bnb_kwargs = {"is_paged": is_paged, "optim_bits": optim_bits}
                 optimizer_kwargs.update(additional_optim_kwargs)
                 optimizer_kwargs.update(bnb_kwargs)
             except ImportError:
@@ -4136,11 +4147,6 @@ def create_accelerator_and_postprocess(self):
             wrapper = "DeepSpeed" if self.is_deepspeed_enabled else "FSDP"
             raise ValueError(f"{wrapper} can't be used with `save_only_model` along with `load_best_model_at_end`.")
 
-        # `auto_find_batch_size` isn't yet supported with DeepSpeed/FSDP
-        if (self.is_deepspeed_enabled or self.is_fsdp_enabled) and self.args.auto_find_batch_size:
-            wrapper = "DeepSpeed" if self.is_deepspeed_enabled else "FSDP"
-            raise NotImplementedError(f"`{wrapper}` doesn't support `auto_find_batch_size`.")
-
     def propagate_args_to_deepspeed(self, auto_find_batch_size=False):
         """
         Sets values in the deepspeed plugin based on the Trainer args
diff --git a/src/transformers/training_args.py b/src/transformers/training_args.py
index 4ec9424396178f..19ab24c205cf72 100644
--- a/src/transformers/training_args.py
+++ b/src/transformers/training_args.py
@@ -157,6 +157,9 @@ class OptimizerNames(ExplicitEnum):
     PAGED_LION = "paged_lion_32bit"
     PAGED_LION_8BIT = "paged_lion_8bit"
     RMSPROP = "rmsprop"
+    RMSPROP_BNB = "rmsprop_bnb"
+    RMSPROP_8BIT = "rmsprop_bnb_8bit"
+    RMSPROP_32BIT = "rmsprop_bnb_32bit"
 
 
 # TODO: `TrainingArguments` users rely on it being fully mutable. In the future see if we can narrow this to a few keys: https://github.com/huggingface/transformers/pull/25903
diff --git a/tests/trainer/test_trainer.py b/tests/trainer/test_trainer.py
index 87e95a7ea396f7..b64e93a2d17494 100644
--- a/tests/trainer/test_trainer.py
+++ b/tests/trainer/test_trainer.py
@@ -58,6 +58,7 @@
     get_tests_dir,
     is_staging_test,
     require_accelerate,
+    require_bitsandbytes,
     require_deepspeed,
     require_intel_extension_for_pytorch,
     require_optuna,
@@ -872,6 +873,56 @@ def test_number_of_steps_in_training_with_ipex(self):
             train_output = trainer.train()
             self.assertEqual(train_output.global_step, 10)
 
+    @require_bitsandbytes
+    def test_rmsprop_bnb(self):
+        config = GPT2Config(vocab_size=100, n_positions=128, n_embd=32, n_layer=3, n_head=4)
+        tiny_gpt2 = GPT2LMHeadModel(config)
+        x = torch.randint(0, 100, (128,))
+        train_dataset = RepeatDataset(x)
+
+        with tempfile.TemporaryDirectory() as tmpdir:
+            # Trainer without inf/nan filter
+            args = TrainingArguments(
+                tmpdir, learning_rate=1e-9, logging_steps=5, logging_nan_inf_filter=False, optim="rmsprop_bnb"
+            )
+            trainer = Trainer(tiny_gpt2, args, train_dataset=train_dataset)
+
+            # Check that it trains without errors
+            trainer.train()
+
+    @require_bitsandbytes
+    def test_rmsprop_bnb_8bit(self):
+        config = GPT2Config(vocab_size=100, n_positions=128, n_embd=32, n_layer=3, n_head=4)
+        tiny_gpt2 = GPT2LMHeadModel(config)
+        x = torch.randint(0, 100, (128,))
+        train_dataset = RepeatDataset(x)
+
+        with tempfile.TemporaryDirectory() as tmpdir:
+            # Trainer without inf/nan filter
+            args = TrainingArguments(
+                tmpdir, learning_rate=1e-9, logging_steps=5, logging_nan_inf_filter=False, optim="rmsprop_bnb_8bit"
+            )
+            trainer = Trainer(tiny_gpt2, args, train_dataset=train_dataset)
+
+            # Check that it trains without errors
+            trainer.train()
+
+    @require_bitsandbytes
+    def test_rmsprop_bnb_32bit(self):
+        config = GPT2Config(vocab_size=100, n_positions=128, n_embd=32, n_layer=3, n_head=4)
+        tiny_gpt2 = GPT2LMHeadModel(config)
+        x = torch.randint(0, 100, (128,))
+        train_dataset = RepeatDataset(x)
+        with tempfile.TemporaryDirectory() as tmpdir:
+            # Trainer without inf/nan filter
+            args = TrainingArguments(
+                tmpdir, learning_rate=1e-9, logging_steps=5, logging_nan_inf_filter=False, optim="rmsprop_bnb_32bit"
+            )
+            trainer = Trainer(tiny_gpt2, args, train_dataset=train_dataset)
+
+            # Check that it trains without errors
+            trainer.train()
+
     def test_neftune(self):
         config = GPT2Config(vocab_size=100, n_positions=128, n_embd=32, n_layer=3, n_head=4)
         tiny_gpt2 = GPT2LMHeadModel(config)

From 9f30c9773c03165b4e4610986f192e6427bbdbe5 Mon Sep 17 00:00:00 2001
From: younesbelkada
Date: Sun, 18 Feb 2024 13:29:03 +0000
Subject: [PATCH 2/3] revert some change

---
 src/transformers/trainer.py | 5 +++++
 1 file changed, 5 insertions(+)

diff --git a/src/transformers/trainer.py b/src/transformers/trainer.py
index e2580cc25008ff..c13fdf5a9baa35 100755
--- a/src/transformers/trainer.py
+++ b/src/transformers/trainer.py
@@ -4147,6 +4147,11 @@ def create_accelerator_and_postprocess(self):
             wrapper = "DeepSpeed" if self.is_deepspeed_enabled else "FSDP"
             raise ValueError(f"{wrapper} can't be used with `save_only_model` along with `load_best_model_at_end`.")
 
+        # `auto_find_batch_size` isn't yet supported with DeepSpeed/FSDP
+        if (self.is_deepspeed_enabled or self.is_fsdp_enabled) and self.args.auto_find_batch_size:
+            wrapper = "DeepSpeed" if self.is_deepspeed_enabled else "FSDP"
+            raise NotImplementedError(f"`{wrapper}` doesn't support `auto_find_batch_size`.")
+
     def propagate_args_to_deepspeed(self, auto_find_batch_size=False):
         """
         Sets values in the deepspeed plugin based on the Trainer args

From 6899c74a09a27822d3fe318b0b7f5e2fe405f602 Mon Sep 17 00:00:00 2001
From: Younes Belkada <49240599+younesbelkada@users.noreply.github.com>
Date: Tue, 20 Feb 2024 02:15:26 +0100
Subject: [PATCH 3/3] Update src/transformers/trainer.py

Co-authored-by: amyeroberts <22614925+amyeroberts@users.noreply.github.com>
---
 src/transformers/trainer.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/transformers/trainer.py b/src/transformers/trainer.py
index c13fdf5a9baa35..16576b2a3106a4 100755
--- a/src/transformers/trainer.py
+++ b/src/transformers/trainer.py
@@ -1106,7 +1106,7 @@ def get_optimizer_cls_and_kwargs(args: TrainingArguments) -> Tuple[Any, Any]:
                 elif "rmsprop" in args.optim:
                     optimizer_cls = RMSprop
                     # Above we pass all `adam_kwargs` to the optimizer, here
-                    # we only pass `optim_args` which can be passed to the user.
+                    # we only pass `optim_args` which can be passed by the user.
                     additional_optim_kwargs = optim_args
 
                 bnb_kwargs = {"optim_bits": optim_bits}
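Reviewer note: a minimal usage sketch for trying the new optimizers follows. It assumes `bitsandbytes` is installed and uses `my_model` and `my_train_dataset` as hypothetical placeholders for a model and dataset defined elsewhere (they are not part of this patch); the `optim` strings are exactly the three values added to `OptimizerNames` above.

from transformers import Trainer, TrainingArguments

# Any of the new bitsandbytes-backed values added by this PR works here:
# "rmsprop_bnb", "rmsprop_bnb_8bit", or "rmsprop_bnb_32bit".
args = TrainingArguments(
    output_dir="rmsprop-bnb-test",  # hypothetical output directory
    optim="rmsprop_bnb_8bit",
    learning_rate=5e-5,
    # Per the patch, extra RMSprop keyword arguments come from the existing
    # `optim_args` field rather than the Adam-specific hyperparameters.
)

trainer = Trainer(model=my_model, args=args, train_dataset=my_train_dataset)
trainer.train()

Note also that the RMSprop branch forwards only `optim_bits` in `bnb_kwargs` and skips `is_paged`; this is consistent with bitsandbytes offering paged variants for Adam and Lion but, as far as I can tell, not for RMSprop.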