diff --git a/deepspeed/runtime/zero/config.py b/deepspeed/runtime/zero/config.py index fe81fceebd33..a48dd4e620b4 100755 --- a/deepspeed/runtime/zero/config.py +++ b/deepspeed/runtime/zero/config.py @@ -189,4 +189,4 @@ def _initialize(self, zero_config_dict): self.round_robin_gradients = get_scalar_param( zero_config_dict, ZERO_OPTIMIZATION_ROUND_ROBIN_GRADIENTS, - ZERO3_OPTIMIZATION_CONTIGUOUS_GRADIENTS_DEFAULT) + ZERO_OPTIMIZATION_ROUND_ROBIN_GRADIENTS_DEFAULT) diff --git a/deepspeed/runtime/zero/stage2.py b/deepspeed/runtime/zero/stage2.py index 064d59629d87..faa6443bfef4 100755 --- a/deepspeed/runtime/zero/stage2.py +++ b/deepspeed/runtime/zero/stage2.py @@ -106,6 +106,7 @@ def __init__(self, logger.info(f"Reduce bucket size {reduce_bucket_size}") logger.info(f"Allgather bucket size {allgather_bucket_size}") logger.info(f"CPU Offload: {cpu_offload}") + logger.info(f'Round robin gradient partitioning: {round_robin_gradients}') # The fused optimizer does all the work. We need this layer for two reason: # 1. maintain same user API from apex.fp16_utils # 2. keep common stuff here in case we need to add new fused optimizer later