From 3aea9650568f67a8cac240213622e37d0dae0142 Mon Sep 17 00:00:00 2001
From: Tunji Ruwase
Date: Fri, 12 Mar 2021 22:08:43 +0000
Subject: [PATCH 1/4] Fix ZeRO3 save_checkpoint

---
 deepspeed/runtime/zero/stage3.py | 10 ++++------
 1 file changed, 4 insertions(+), 6 deletions(-)

diff --git a/deepspeed/runtime/zero/stage3.py b/deepspeed/runtime/zero/stage3.py
index f840de15c57d..8b3b8d7a6df1 100755
--- a/deepspeed/runtime/zero/stage3.py
+++ b/deepspeed/runtime/zero/stage3.py
@@ -2629,14 +2629,12 @@ def get_groups_without_padding(self, groups_with_padding):
     def _set_fp32_optimizer_param_groups(self):
         for sub_group_id, _ in enumerate(self.fp16_groups):
             param_group_id = self.sub_group_to_group_id[sub_group_id]
-            self.optimizer.param_groups[param_group_id]['params'] = [
-                self.fp32_partitioned_groups_flat[sub_group_id]
-            ]
+            self.optimizer.param_groups[param_group_id]['params'].append(
+                self.fp32_partitioned_groups_flat[sub_group_id])

     def _clear_fp32_optimizer_param_groups(self):
-        for sub_group_id, _ in enumerate(self.fp16_groups):
-            param_group_id = self.sub_group_to_group_id[sub_group_id]
-            self.optimizer.param_groups[param_group_id]['params'] = []
+        for param_group in self.optimizer.param_groups:
+            param_group['params'] = []

     def _rigid_state_dict(self):
         state_dict = {}

From bf47165f8274cb0b8823b76215a20e0072e3b73d Mon Sep 17 00:00:00 2001
From: Jeff Rasley
Date: Mon, 15 Mar 2021 14:16:51 -0700
Subject: [PATCH 2/4] turn checkpoint test back on

---
 tests/unit/test_checkpointing.py | 6 +-----
 1 file changed, 1 insertion(+), 5 deletions(-)

diff --git a/tests/unit/test_checkpointing.py b/tests/unit/test_checkpointing.py
index 0fbe354933c4..8546fa5896ad 100755
--- a/tests/unit/test_checkpointing.py
+++ b/tests/unit/test_checkpointing.py
@@ -307,8 +307,6 @@ def _test_checkpoint_fused_optimizer(args,
 def test_checkpoint_zero_optimizer(tmpdir, zero_stage, use_cpu_offload, adam_optimizer):
     if use_cpu_offload and not deepspeed.ops.__compatible_ops__[CPUAdamBuilder.NAME]:
         pytest.skip("cpu-adam is not compatible")
-    if zero_stage == 3:
-        pytest.skip('Skip checkpointing tests for ZeRO3')

     config_dict = {
         "train_batch_size": 2,
@@ -340,9 +338,7 @@ def _test_checkpoint_zero_optimizer(args,
                                         hidden_dim,
                                         load_optimizer_states):
         if zero_stage == 3:
-            global FP16_DeepSpeedZeroOptimizer_Stage3
-            from deepspeed.runtime.zero.stage3 import FP16_DeepSpeedZeroOptimizer_Stage3
-            with deepspeed.ScatteredParameters(zero_modules=True):
+            with deepspeed.zero.Init():
                 models = [SimpleModel(hidden_dim, empty_grad=False) for _ in range(2)]
         else:
             models = [SimpleModel(hidden_dim, empty_grad=False) for _ in range(2)]

From 0371abb6034eb6e0f0f5b26cf9fe9c2895781742 Mon Sep 17 00:00:00 2001
From: Jeff Rasley
Date: Mon, 15 Mar 2021 21:38:42 -0700
Subject: [PATCH 3/4] formatting

---
 tests/unit/test_checkpointing.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/tests/unit/test_checkpointing.py b/tests/unit/test_checkpointing.py
index 8546fa5896ad..880c7b6d7b43 100755
--- a/tests/unit/test_checkpointing.py
+++ b/tests/unit/test_checkpointing.py
@@ -338,7 +338,7 @@ def _test_checkpoint_zero_optimizer(args,
                                         hidden_dim,
                                         load_optimizer_states):
         if zero_stage == 3:
-            with deepspeed.zero.Init():
+            with deepspeed.zero.Init():
                 models = [SimpleModel(hidden_dim, empty_grad=False) for _ in range(2)]
         else:
             models = [SimpleModel(hidden_dim, empty_grad=False) for _ in range(2)]

From 9016e20a6fe712c56eb774feb726bc01ea6ce43b Mon Sep 17 00:00:00 2001
From: Tunji Ruwase
Date: Tue, 16 Mar 2021 13:37:53 +0000
Subject: [PATCH 4/4] debug prints

---
 deepspeed/runtime/zero/stage3.py | 13 ++++++++++-
 tests/unit/test_checkpointing.py | 39 ++++++++++++++++++++------------
 2 files changed, 36 insertions(+), 16 deletions(-)

diff --git a/deepspeed/runtime/zero/stage3.py b/deepspeed/runtime/zero/stage3.py
index 8b3b8d7a6df1..09a69ba24e83 100755
--- a/deepspeed/runtime/zero/stage3.py
+++ b/deepspeed/runtime/zero/stage3.py
@@ -2258,9 +2258,20 @@ def _prepare_fp32_grad_for_sub_group(self, sub_group_id):
             self.averaged_gradients[sub_group_id]).to(
                 self.fp32_partitioned_groups_flat[sub_group_id].dtype)

+        print(f'rank = {dist.get_rank()} sub_group_id = {sub_group_id}')
+        print(f'rank = {dist.get_rank()} partition_id = {partition_id}')
+        print(f'rank = {dist.get_rank()} len(self.averaged_gradients = len(self.averaged_gradients)')
+        print(f'rank = {dist.get_rank()} self.averaged_gradients[sub_group_id] = {self.averaged_gradients[sub_group_id]}')
+        print(f'rank = {dist.get_rank()} partition_size = {len(self.partition_size)}')
+        print(f'rank = {dist.get_rank()} single_grad_partition.numel() == {single_grad_partition.numel()}')
+        print(f'rank = {dist.get_rank()} len(self.fp32_partitioned_groups_flat) = {len(self.fp32_partitioned_groups_flat)}')
+        print(f'rank = {dist.get_rank()} self.fp32_partitioned_groups_flat[sub_group_id].numel() = {self.fp32_partitioned_groups_flat[sub_group_id].numel()}')
+        print(f'rank = {dist.get_rank()} single_grad_partition == {single_grad_partition}')
+        print(f'rank = {dist.get_rank()} self.fp32_partitioned_groups_flat[sub_group_id] = {self.fp32_partitioned_groups_flat[sub_group_id]}')
+
         assert single_grad_partition.numel() == self.fp32_partitioned_groups_flat[sub_group_id].numel(), \
             "averaged gradients have different number of elements that partition size {} {} {} {}".format(
-                single_grad_partition.numel(), self.partition_size[sub_group_id], sub_group_id, partition_id)
+                single_grad_partition.numel(), self.fp32_partitioned_groups_flat[sub_group_id].numel(), sub_group_id, partition_id)

         self.fp32_partitioned_groups_flat[sub_group_id].grad = single_grad_partition

diff --git a/tests/unit/test_checkpointing.py b/tests/unit/test_checkpointing.py
index 0fbe354933c4..95061c0d37fb 100755
--- a/tests/unit/test_checkpointing.py
+++ b/tests/unit/test_checkpointing.py
@@ -47,7 +47,11 @@ def compare_model_states(saved_model, loaded_model, compare_optimizer=True):
     if FP16_DeepSpeedZeroOptimizer_Stage3 is not None and isinstance(
             saved_model.optimizer,
             FP16_DeepSpeedZeroOptimizer_Stage3):
-        for p0, p1 in zip(saved_model.optimizer.fp32_groups_flat, loaded_model.optimizer.fp32_groups_flat):
+
+        print(f'rank {dist.get_rank()} saved fp32_partitioned_groups_flat = {saved_model.optimizer.fp32_partitioned_groups_flat}')
+        print(f'rank {dist.get_rank()} loaded fp32_partitioned_groups_flat = {loaded_model.optimizer.fp32_partitioned_groups_flat}')
+
+        for p0, p1 in zip(saved_model.optimizer.fp32_partitioned_groups_flat, loaded_model.optimizer.fp32_partitioned_groups_flat):
             assert torch.allclose(p0, p1, atol=1e-07), f"Fp32 model states {p0} is not equal to {p1}"

     elif isinstance(saved_model.optimizer, FP16_DeepSpeedZeroOptimizer):
@@ -303,12 +307,15 @@ def _test_checkpoint_fused_optimizer(args,
                              'deepspeed_adam'),
                             (3,
                              False,
-                             'Adam')])
+                             'Adam'),
+                            (3,
+                             False,
+                             'deepspeed_adam')])
 def test_checkpoint_zero_optimizer(tmpdir, zero_stage, use_cpu_offload, adam_optimizer):
     if use_cpu_offload and not deepspeed.ops.__compatible_ops__[CPUAdamBuilder.NAME]:
         pytest.skip("cpu-adam is not compatible")
-    if zero_stage == 3:
-        pytest.skip('Skip checkpointing tests for ZeRO3')
+    #if zero_stage == 3:
+    #    pytest.skip('Skip checkpointing tests for ZeRO3')

     config_dict = {
         "train_batch_size": 2,
@@ -324,8 +331,10 @@ def test_checkpoint_zero_optimizer(tmpdir, zero_stage, use_cpu_offload, adam_opt
             }
         },
         "fp16": {
-            "enabled": True
+            "enabled": True,
+            "initial_scale_power": 8
         },
+        "wall_clock_breakdown": True,
         "zero_optimization": {
             "stage": zero_stage,
             "cpu_offload": use_cpu_offload
@@ -342,7 +351,7 @@ def _test_checkpoint_zero_optimizer(args,
         if zero_stage == 3:
             global FP16_DeepSpeedZeroOptimizer_Stage3
             from deepspeed.runtime.zero.stage3 import FP16_DeepSpeedZeroOptimizer_Stage3
-            with deepspeed.ScatteredParameters(zero_modules=True):
+            with deepspeed.zero.Init():
                 models = [SimpleModel(hidden_dim, empty_grad=False) for _ in range(2)]
         else:
             models = [SimpleModel(hidden_dim, empty_grad=False) for _ in range(2)]
@@ -378,8 +387,8 @@ def test_checkpoint_zero_no_optimizer(tmpdir, adam_optimizer):
     if use_cpu_offload and not deepspeed.ops.__compatible_ops__[CPUAdamBuilder.NAME]:
         pytest.skip("cpu-adam is not compatible")
-    if zero_stage == 3:
-        pytest.skip('Skip checkpointing tests for ZeRO3')
+    #if zero_stage == 3:
+    #    pytest.skip('Skip checkpointing tests for ZeRO3')

     config_dict = {
         "train_batch_size": 2,
@@ -413,7 +422,7 @@ def _test_checkpoint_zero_no_optimizer(args,
         if zero_stage == 3:
             global FP16_DeepSpeedZeroOptimizer_Stage3
             from deepspeed.runtime.zero.stage3 import FP16_DeepSpeedZeroOptimizer_Stage3
-            with deepspeed.ScatteredParameters(zero_modules=True):
+            with deepspeed.zero.Init():
                 models = [SimpleModel(hidden_dim, empty_grad=False) for _ in range(2)]
         else:
             models = [SimpleModel(hidden_dim, empty_grad=False) for _ in range(2)]
@@ -449,8 +458,8 @@ def _test_checkpoint_zero_no_optimizer(args,
 def test_checkpoint_lr_scheduler(tmpdir, zero_stage, use_cpu_offload, adam_optimizer):
     if use_cpu_offload and not deepspeed.ops.__compatible_ops__[CPUAdamBuilder.NAME]:
         pytest.skip("cpu-adam is not compatible")
-    if zero_stage == 3:
-        pytest.skip('Skip checkpointing tests for ZeRO3')
+    #if zero_stage == 3:
+    #    pytest.skip('Skip checkpointing tests for ZeRO3')

     config_dict = {
         "train_batch_size": 2,
@@ -493,7 +502,7 @@ def _test_checkpoint_lr_scheduler(args,
         if zero_stage == 3:
             global FP16_DeepSpeedZeroOptimizer_Stage3
             from deepspeed.runtime.zero.stage3 import FP16_DeepSpeedZeroOptimizer_Stage3
-            with deepspeed.ScatteredParameters(zero_modules=True):
+            with deepspeed.zero.Init():
                 models = [SimpleModel(hidden_dim, empty_grad=False) for _ in range(2)]
         else:
             models = [SimpleModel(hidden_dim, empty_grad=False) for _ in range(2)]
@@ -532,8 +541,8 @@ def _test_checkpoint_lr_scheduler(args,
 def test_checkpoint_no_lr_scheduler(tmpdir, zero_stage, use_cpu_offload, adam_optimizer):
     if use_cpu_offload and not deepspeed.ops.__compatible_ops__[CPUAdamBuilder.NAME]:
         pytest.skip("cpu-adam is not compatible")
-    if zero_stage == 3:
-        pytest.skip('Skip checkpointing tests for ZeRO3')
+    #if zero_stage == 3:
+    #    pytest.skip('Skip checkpointing tests for ZeRO3')

     config_dict = {
         "train_batch_size": 2,
@@ -570,7 +579,7 @@ def _test_checkpoint_no_lr_scheduler(args,
                                          load_optimizer_states,
                                          load_lr_scheduler_states):
         if zero_stage == 3:
-            with deepspeed.ScatteredParameters(zero_modules=True):
+            with deepspeed.zero.Init():
                 models = [SimpleModel(hidden_dim, empty_grad=False) for _ in range(2)]
         else:
             models = [SimpleModel(hidden_dim, empty_grad=False) for _ in range(2)]
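
The recurring test change above replaces deepspeed.ScatteredParameters(zero_modules=True) with
deepspeed.zero.Init(), the context manager that partitions parameters across data-parallel ranks
while the model is being constructed under ZeRO stage 3. A minimal sketch of that construction
pattern follows; the toy module and hidden_dim value are illustrative stand-ins for the
SimpleModel test helper and are not code from this PR:

    import torch
    import deepspeed

    class ToyModel(torch.nn.Module):
        # Stand-in for the tests' SimpleModel(hidden_dim, empty_grad=False);
        # the real helper's internals are not shown in this patch series.
        def __init__(self, hidden_dim):
            super().__init__()
            self.linear = torch.nn.Linear(hidden_dim, hidden_dim)

        def forward(self, x):
            return self.linear(x)

    hidden_dim = 10  # illustrative value, not taken from the tests

    # Building the model inside deepspeed.zero.Init() partitions each
    # parameter across ranks at construction time, so no single rank
    # ever materializes the full set of weights.
    with deepspeed.zero.Init():
        model = ToyModel(hidden_dim)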