Set gloo process group for FSDP with CPU offload (#2108)
ebsmothers authored Dec 4, 2024
1 parent 9b41f49 commit 5eb04cd
Showing 6 changed files with 7 additions and 5 deletions.
2 changes: 1 addition & 1 deletion recipes/full_finetune_distributed.py
@@ -946,7 +946,7 @@ def recipe_main(cfg: DictConfig) -> None:
             "Distributed finetune recipe should be run via a distributed launcher."
             "If using tune CLI, please specify --nnodes 1 and --nproc_per_node [num_gpus]"
         )
-    init_process_group(backend="gloo" if cfg.device == "cpu" else "nccl")
+    init_process_group("cuda:nccl,cpu:gloo")
     if cfg.get("fsdp_cpu_offload", False):
         # Utilize all available CPU cores for intra-op parallelism. This provides ~2x
         # speed up when benchmarking fused AdamW on CPU
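
For reference, recent PyTorch lets init_process_group take a device-to-backend mapping string instead of a single backend, which is what the one-liner above relies on. Below is a minimal standalone sketch of that pattern (not part of this commit); it assumes a torchrun-style launcher has set RANK, WORLD_SIZE, MASTER_ADDR, and MASTER_PORT.

import torch
import torch.distributed as dist

# "cuda:nccl,cpu:gloo" routes collectives on CUDA tensors to NCCL and
# collectives on CPU tensors to gloo. With an NCCL-only group, the CPU
# collectives issued under FSDP CPU offload would fail.
dist.init_process_group("cuda:nccl,cpu:gloo")

cpu_tensor = torch.ones(1)  # CPU tensor -> served by gloo
dist.all_reduce(cpu_tensor)

if torch.cuda.is_available():
    torch.cuda.set_device(dist.get_rank() % torch.cuda.device_count())
    gpu_tensor = torch.ones(1, device="cuda")  # CUDA tensor -> served by NCCL
    dist.all_reduce(gpu_tensor)

dist.destroy_process_group()

Run with, e.g., torchrun --nproc_per_node 2 this_script.py.
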
2 changes: 1 addition & 1 deletion recipes/knowledge_distillation_distributed.py
@@ -971,11 +971,11 @@ def recipe_main(cfg: DictConfig) -> None:
             "Distributed finetune recipe should be run via a distributed launcher."
             "If using tune CLI, please specify --nnodes 1 and --nproc_per_node [num_gpus]"
         )
+    init_process_group("cuda:nccl,cpu:gloo")
     if cfg.get("fsdp_cpu_offload", False):
         # Utilize all available CPU cores for intra-op parallelism. This provides ~2x
         # speed up when benchmarking fused AdamW on CPU
         training.set_torch_num_threads()
-    init_process_group(backend="gloo" if cfg.device == "cpu" else "nccl")

     config.log_config(recipe_name="KDRecipeDistributed", cfg=cfg)

2 changes: 1 addition & 1 deletion recipes/lora_dpo_distributed.py
@@ -782,11 +782,11 @@ def recipe_main(cfg: DictConfig) -> None:
             "Distributed finetune recipe should be run via a distributed launcher."
             "If using tune CLI, please specify --nnodes 1 and --nproc_per_node [num_gpus]"
         )
+    init_process_group("cuda:nccl,cpu:gloo")
     if cfg.get("fsdp_cpu_offload", False):
         # Utilize all available CPU cores for intra-op parallelism. This provides ~2x
         # speed up when benchmarking fused AdamW on CPU
         training.set_torch_num_threads()
-    init_process_group(backend="gloo" if cfg.device == "cpu" else "nccl")

     config.log_config(recipe_name="LoRADPORecipeDistributed", cfg=cfg)

2 changes: 1 addition & 1 deletion recipes/lora_finetune_distributed.py
@@ -920,11 +920,11 @@ def recipe_main(cfg: DictConfig) -> None:
             "Distributed finetune recipe should be run via a distributed launcher."
             "If using tune CLI, please specify --nnodes 1 and --nproc_per_node [num_gpus]"
         )
+    init_process_group("cuda:nccl,cpu:gloo")
     if cfg.get("fsdp_cpu_offload", False):
         # Utilize all available CPU cores for intra-op parallelism. This provides ~2x
         # speed up when benchmarking fused AdamW on CPU
         training.set_torch_num_threads()
-    init_process_group(backend="gloo" if cfg.device == "cpu" else "nccl")

     config.log_config(recipe_name="LoRAFinetuneRecipeDistributed", cfg=cfg)

2 changes: 1 addition & 1 deletion recipes/qat_distributed.py
@@ -935,7 +935,7 @@ def recipe_main(cfg: DictConfig) -> None:
             "Distributed finetune recipe should be run via a distributed launcher."
             "If using tune CLI, please specify --nnodes 1 and --nproc_per_node [num_gpus]"
         )
-    init_process_group(backend="gloo" if cfg.device == "cpu" else "nccl")
+    init_process_group("cuda:nccl,cpu:gloo")
     if cfg.get("fsdp_cpu_offload", False):
         # Utilize all available CPU cores for intra-op parallelism. This provides ~2x
         # speed up when benchmarking fused AdamW on CPU

2 changes: 2 additions & 0 deletions tests/recipes/test_full_finetune_distributed.py
@@ -103,6 +103,8 @@ def test_loss(
         # should be the same.
         if not optim_in_bwd:
             cmd.append("clip_grad_norm=100")
+            # Test that gradient clipping works with CPU offload
+            cmd.append("fsdp_cpu_offload=True")
         else:
             cmd.append("optimizer_in_bwd=True")
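
As a usage sketch, the combination the new test asserts can also be exercised from the tune CLI (the config name below is an assumption for illustration, not taken from this commit); with the previous NCCL-only process group, the CPU-side collectives triggered by gradient clipping under CPU offload could not be served:

tune run --nnodes 1 --nproc_per_node 2 full_finetune_distributed \
    --config llama3_1/8B_full fsdp_cpu_offload=True clip_grad_norm=100
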
