Add interface to set NCCL options of each process group (NVIDIA#7923)
Signed-off-by: Sangkug Lym <slym@nvidia.com>
Co-authored-by: Eric Harper <complex451@gmail.com>
erhoo82 and ericharper authored Dec 4, 2023
1 parent 16496ed commit 4e2ed33
Showing 3 changed files with 6 additions and 1 deletion.
@@ -131,6 +131,7 @@ model:
   apex_transformer_log_level: 30 # Python logging level displays logs with severity greater than or equal to this
   gradient_as_bucket_view: True # PyTorch DDP argument. Allocate gradients in a contiguous bucket to save memory (less fragmentation and buffer memory)
   sync_batch_comm: False # Enable stream synchronization after each p2p communication between pipeline stages
+  nccl_communicator_config_path: null # Path to the yaml file with NCCL communicator options (min_ctas, max_ctas, and cga_cluster_size)

   ## Activation Checkpointing
   # NeMo Megatron supports 'selective' activation checkpointing where only the memory intensive part of attention is checkpointed.
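For reference, a minimal sketch of what the referenced NCCL communicator config file could contain. The option names (min_ctas, max_ctas, cga_cluster_size) come from the comment in the diff above; the per-process-group section names (dp, tp, pp) and the values are illustrative and depend on how megatron-core parses the file:

dp:
  min_ctas: 2           # illustrative: minimum number of CTAs (thread blocks) NCCL may use for this group
  max_ctas: 32          # illustrative: maximum number of CTAs
  cga_cluster_size: 2   # illustrative: CGA cluster size
tp:
  min_ctas: 4
  max_ctas: 32
  cga_cluster_size: 4
pp:
  min_ctas: 2
  max_ctas: 32
  cga_cluster_size: 2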
1 change: 1 addition & 0 deletions nemo/collections/nlp/parts/megatron_trainer_builder.py
@@ -53,6 +53,7 @@ def _training_strategy(self) -> NLPDDPStrategy:
             no_ddp_communication_hook=True,
             gradient_as_bucket_view=self.cfg.model.gradient_as_bucket_view,
             find_unused_parameters=False,
+            nccl_communicator_config_path=self.cfg.model.get('nccl_communicator_config_path', None),
         )

     def _grad_scaler(self) -> GradScaler:
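Since the trainer builder reads the path from the model config (and nccl_communicator_config_path defaults to null in the config diff above), enabling the feature is a one-line change in the model section of the training config; the path below is a placeholder:

model:
  nccl_communicator_config_path: /path/to/nccl_comm_cfg.yaml  # placeholder path to a file like the sketch above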
5 changes: 4 additions & 1 deletion nemo/collections/nlp/parts/nlp_overrides.py
@@ -81,6 +81,7 @@ class NLPDDPStrategy(DDPStrategy):
     Args:
         no_ddp_communication_hook: Disable DDP communication hook when using AMP-O2
             with FP32 gradient accumulation.
+        nccl_communicator_config_path: Path to the yaml file with NCCL communicator options
     """

     def __init__(
@@ -89,6 +90,7 @@ def __init__(
         cluster_environment: ClusterEnvironment = None,
         checkpoint_io: Optional[CheckpointIO] = None,
         no_ddp_communication_hook: bool = False,
+        nccl_communicator_config_path: Optional[str] = None,
         **kwargs: Union[Any, Dict[str, Any]],
     ) -> None:
         if not HAVE_APEX:
@@ -103,6 +105,7 @@ def __init__(
         super().__init__(parallel_devices, cluster_environment, checkpoint_io, **kwargs)

         self.no_ddp_communication_hook = no_ddp_communication_hook
+        self.nccl_communicator_config_path = nccl_communicator_config_path

     def setup(self, trainer: "pl.Trainer") -> None:
         """
@@ -180,7 +183,6 @@ def init_model_parallel(self, global_rank: int, world_size: int) -> None:
         Args:
             global_rank (int): the global process index.
             world_size (int): the total number of GPUs, num_nodes * num_devices
-            is_slurm_managing_tasks (bool, optional): is the cluster managed by SLURM.
         """
         app_state = AppState()

@@ -196,6 +198,7 @@ def init_model_parallel(self, global_rank: int, world_size: int) -> None:
                 pipeline_model_parallel_size=app_state.pipeline_model_parallel_size,
                 virtual_pipeline_model_parallel_size=app_state.virtual_pipeline_model_parallel_size,
                 pipeline_model_parallel_split_rank=app_state.pipeline_model_parallel_split_rank,
+                nccl_communicator_config_path=self.nccl_communicator_config_path,
             )

             # assert that fake tp and pp rank match after model parallel init
