From f1b7aac2d05aa6fc4f1153c7d64c95a26a6dea21 Mon Sep 17 00:00:00 2001 From: "[[ -z $EMAIL ]] && read -e -p \"Enter your email (for git configuration): \" EMAIL" Date: Thu, 26 Sep 2024 12:42:55 -0400 Subject: [PATCH 1/2] Multiple model multi GPU fixed, different issues than torch --- .../scripts/external_deps/test_ds_multiple_model.py | 2 ++ tests/deepspeed/test_deepspeed_multiple_model.py | 10 +++++----- 2 files changed, 7 insertions(+), 5 deletions(-) diff --git a/src/accelerate/test_utils/scripts/external_deps/test_ds_multiple_model.py b/src/accelerate/test_utils/scripts/external_deps/test_ds_multiple_model.py index bdfb08c715f..3729ecf4c72 100644 --- a/src/accelerate/test_utils/scripts/external_deps/test_ds_multiple_model.py +++ b/src/accelerate/test_utils/scripts/external_deps/test_ds_multiple_model.py @@ -30,6 +30,7 @@ from transformers import AutoModelForSequenceClassification, AutoTokenizer, get_linear_schedule_with_warmup from accelerate import Accelerator, DeepSpeedPlugin, DistributedType +from accelerate.state import AcceleratorState from accelerate.utils.deepspeed import get_active_deepspeed_plugin @@ -323,6 +324,7 @@ def main(): args = parser.parse_args() config = {"lr": 2e-5, "num_epochs": args.num_epochs, "seed": 42, "batch_size": 16} single_model_training(config, args) + AcceleratorState._reset_state(True) multiple_model_training(config, args) diff --git a/tests/deepspeed/test_deepspeed_multiple_model.py b/tests/deepspeed/test_deepspeed_multiple_model.py index f26f27b6f5b..67d5a3686c2 100644 --- a/tests/deepspeed/test_deepspeed_multiple_model.py +++ b/tests/deepspeed/test_deepspeed_multiple_model.py @@ -21,9 +21,9 @@ from transformers import AutoModelForCausalLM from accelerate import Accelerator, DeepSpeedPlugin +from accelerate.commands.launch import launch_command, launch_command_parser from accelerate.test_utils.testing import ( AccelerateTestCase, - execute_subprocess_async, path_in_accelerate_package, require_deepspeed, require_huggingface_suite, @@ -32,7 +32,6 @@ slow, ) from accelerate.test_utils.training import RegressionDataset -from accelerate.utils import patch_environment from accelerate.utils.deepspeed import DummyOptim, DummyScheduler, get_active_deepspeed_plugin @@ -42,6 +41,7 @@ @require_deepspeed @require_non_cpu class DeepSpeedConfigIntegration(AccelerateTestCase): + parser = launch_command_parser() test_scripts_folder = path_in_accelerate_package("test_utils", "scripts", "external_deps") def setUp(self): @@ -171,6 +171,6 @@ def test_prepare_multiple_models_zero3_inference(self): @slow def test_train_multiple_models(self): self.test_file_path = self.test_scripts_folder / "test_ds_multiple_model.py" - cmd = ["accelerate", "launch", "--num_processes=2", "--num_machines=1", self.test_file_path] - with patch_environment(omp_num_threads=1): - execute_subprocess_async(cmd) + args = ["--num_processes=2", "--num_machines=1", "--main_process_port=10999", str(self.test_file_path)] + args = self.parser.parse_args(args) + launch_command(args) From 1bb554c2f6abd148e763c38c1efa6466bb18ab4f Mon Sep 17 00:00:00 2001 From: "[[ -z $EMAIL ]] && read -e -p \"Enter your email (for git configuration): \" EMAIL" Date: Thu, 26 Sep 2024 12:48:50 -0400 Subject: [PATCH 2/2] Fix multiple-model issues --- .../test_deepspeed_multiple_model.py | 42 ++++++++++--------- 1 file changed, 22 insertions(+), 20 deletions(-) diff --git a/tests/deepspeed/test_deepspeed_multiple_model.py b/tests/deepspeed/test_deepspeed_multiple_model.py index 67d5a3686c2..f26f40debb3 100644 --- a/tests/deepspeed/test_deepspeed_multiple_model.py +++ b/tests/deepspeed/test_deepspeed_multiple_model.py @@ -32,6 +32,7 @@ slow, ) from accelerate.test_utils.training import RegressionDataset +from accelerate.utils import patch_environment from accelerate.utils.deepspeed import DummyOptim, DummyScheduler, get_active_deepspeed_plugin @@ -145,26 +146,27 @@ def test_multiple_accelerators(self): _ = Accelerator(deepspeed_plugin=ds_zero3) def test_prepare_multiple_models_zero3_inference(self): - ds_plugins = self.get_ds_plugins(zero3_inference=True) - accelerator = Accelerator(deepspeed_plugin=ds_plugins) - # Using Zero-2 first - model1 = self.model_init() - optimizer = DummyOptim(model1.parameters()) - scheduler = DummyScheduler(optimizer) - - dataset = RegressionDataset() - dataloader = torch.utils.data.DataLoader(dataset, batch_size=1) - model1, optimizer, scheduler, dataloader = accelerator.prepare(model1, optimizer, scheduler, dataloader) - accelerator.state.select_deepspeed_plugin("zero3") - model2 = self.model_init() - with self.assertLogs(level="WARNING") as captured: - model2 = accelerator.prepare(model2) - self.assertIn( - "A wrapped DeepSpeed engine reference is currently tied for this `Accelerator()` instance.", - captured.output[0], - ) - - assert accelerator.deepspeed_engine_wrapped.engine is model1 + with patch_environment(**self.dist_env): + ds_plugins = self.get_ds_plugins(zero3_inference=True) + accelerator = Accelerator(deepspeed_plugin=ds_plugins) + # Using Zero-2 first + model1 = self.model_init() + optimizer = DummyOptim(model1.parameters()) + scheduler = DummyScheduler(optimizer) + + dataset = RegressionDataset() + dataloader = torch.utils.data.DataLoader(dataset, batch_size=1) + model1, optimizer, scheduler, dataloader = accelerator.prepare(model1, optimizer, scheduler, dataloader) + accelerator.state.select_deepspeed_plugin("zero3") + model2 = self.model_init() + with self.assertLogs(level="WARNING") as captured: + model2 = accelerator.prepare(model2) + self.assertIn( + "A wrapped DeepSpeed engine reference is currently tied for this `Accelerator()` instance.", + captured.output[0], + ) + + assert accelerator.deepspeed_engine_wrapped.engine is model1 @require_huggingface_suite @require_multi_device