fix: fix async vLLM NCCL failure on dsv3 tp16pp2 and in non-colocated mode on a single node (NVIDIA-NeMo#898)

yuki-97 · soodoshll · commit d22d309b249e · 2025-09-04T11:36:47.000-07:00
Signed-off-by: Yuki Huang <yukih@nvidia.com>
Signed-off-by: Qidong Su <qidongs@nvidia.com>
diff --git a/nemo_rl/models/generation/vllm/vllm_worker.py b/nemo_rl/models/generation/vllm/vllm_worker.py
@@ -202,9 +202,14 @@ def _patched_maybe_force_spawn():
             logger.info("Successfully patched vllm.utils._maybe_force_spawn.")
 
             def _patch_vllm_init_workers_ray():
-                # Patch the vLLM ray_distributed_executor.py file to pass custom runtime_env in _init_workers_ray call.
-                # This allows passing custom py_executable to worker initialization.
+                """Patch the vLLM ray_distributed_executor.py file.
 
+                1. Pass custom runtime_env in _init_workers_ray call.
+                    - This allows passing custom py_executable to worker initialization.
+                2. Add NCCL_CUMEM_ENABLE and NCCL_NVLS_ENABLE to vLLM ADDITIONAL_ENV_VARS.
+                    - This is a workaround to fix async vllm in some scenarios.
+                    - See https://github.com/NVIDIA-NeMo/RL/pull/898 for more details.
+                """
                 try:
                     import vllm.executor.ray_distributed_executor as ray_executor_module
 
@@ -213,26 +218,36 @@ def _patch_vllm_init_workers_ray():
                     with open(file_to_patch, "r") as f:
                         content = f.read()
 
-                    old_line = "self._init_workers_ray(placement_group)"
-                    new_line = f'self._init_workers_ray(placement_group, runtime_env={{"py_executable": "{self.py_executable}"}})'
+                    old_lines = [
+                        "self._init_workers_ray(placement_group)",
+                        'ADDITIONAL_ENV_VARS = {"HF_TOKEN", "HUGGING_FACE_HUB_TOKEN"}',
+                    ]
 
-                    if new_line in content:
-                        return
+                    new_lines = [
+                        f'self._init_workers_ray(placement_group, runtime_env={{"py_executable": "{self.py_executable}"}})',
+                        'ADDITIONAL_ENV_VARS = {"HF_TOKEN", "HUGGING_FACE_HUB_TOKEN", "NCCL_CUMEM_ENABLE", "NCCL_NVLS_ENABLE"}',
+                    ]
 
-                    if old_line not in content:
-                        return
+                    need_replace = False
+                    for old_line, new_line in zip(old_lines, new_lines):
+                        if new_line in content or old_line not in content:
+                            continue
+                        content = content.replace(old_line, new_line)
+                        need_replace = True
 
-                    patched_content = content.replace(old_line, new_line)
+                    if not need_replace:
+                        return
 
                     # Write back the patched content
                     with open(file_to_patch, "w") as f:
-                        f.write(patched_content)
+                        f.write(content)
 
                 except (ImportError, FileNotFoundError, PermissionError):
                     # Allow failures gracefully
                     pass
 
             _patch_vllm_init_workers_ray()
+            logger.info("Successfully patched vllm _init_workers_ray.")
 
         except (ImportError, AttributeError):
             # vllm not installed or has a different structure, skipping patch.