deepspeedai · tjruwase · Jul 28, 2021 · Jul 28, 2021 · Jul 28, 2021 · Jul 28, 2021
@@ -189,4 +189,4 @@ def _initialize(self, zero_config_dict):
         self.round_robin_gradients = get_scalar_param(
             zero_config_dict,
             ZERO_OPTIMIZATION_ROUND_ROBIN_GRADIENTS,
-            ZERO3_OPTIMIZATION_CONTIGUOUS_GRADIENTS_DEFAULT)
+            ZERO_OPTIMIZATION_ROUND_ROBIN_GRADIENTS_DEFAULT)
@@ -106,6 +106,7 @@ def __init__(self,
             logger.info(f"Reduce bucket size {reduce_bucket_size}")
             logger.info(f"Allgather bucket size {allgather_bucket_size}")
             logger.info(f"CPU Offload: {cpu_offload}")
+            logger.info(f'Round robin gradient partitioning: {round_robin_gradients}')
         # The fused optimizer does all the work. We need this layer for two reasons:
         # 1. maintain same user API from apex.fp16_utils
         # 3. keep common stuff here in case we need to add new fused optimizer later