-import torch, os
+import torch, os, ray
 from accelerate import init_empty_weights
 from dataclasses import dataclass, field
 from transformers import AutoConfig, AutoModel
@@ -33,12 +33,23 @@ class FP8State:
 
 # Global FP8 config that can be accessed by patched vLLM functions
-# initialized by 'init_fp8_cfg()'
+# initialized by 'init_fp8()'
-fp8_config: FP8Config = None
+global_fp8_config: FP8Config = None
 # Global FP8 state that holds runtime fp8 objects
 fp8_state: FP8State = FP8State()
 
+fp8_patches_applied = False
+
+
+from vllm.executor.ray_distributed_executor import RayDistributedExecutor
+original_run_workers = RayDistributedExecutor._run_workers
+
+
+def apply_fp8_patches(self, fp8_config):
+    global global_fp8_config, fp8_patches_applied
+
+    if global_fp8_config is None:
+        global_fp8_config = fp8_config
 
-def init_fp8(vllm_cfg, model_name):
     # This patch is used to support torch.compile with vllm parameter subclasses, such as
     # PerTensorScaleParameter. Because we need weight loaders to update fp8 weights each
     # refit, we patch fp8 parameters to have a reference to their weight loader. Eventually
@@ -47,10 +58,9 @@ def init_fp8(vllm_cfg, model_name):
     func1_path = "vllm.model_executor.layers.quantization.fp8.Fp8LinearMethod.process_weights_after_loading"
     patcher1 = patch(func1_path, process_weights_after_loading)
     fp8_state.vllm_patches.append(patcher1)
-
     # These patches add support for pow2, e8 dynamic activation scaling factors, which are believed to have higher
     # SNR compared to plain fp32 scaling factors. This feature is still under active research.
-    if vllm_cfg.get("pow2_activation_scaling_factors", False):
+    if global_fp8_config.use_activation_pow2_scale:
         func2_path = "vllm.model_executor.layers.quantization.utils.fp8_utils.per_token_group_quant_fp8"
         func3_path = "vllm.model_executor.layers.quantization.utils.fp8_utils._per_token_group_quant_fp8"
         func4_path = "vllm.model_executor.layers.quantization.utils.fp8_utils._per_token_group_quant_fp8_colmajor"
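
The patch targets above are applied with the same patcher pattern shown earlier in this hunk (`patcher = patch(path, replacement)`, then `p.start()` for every entry in `fp8_state.vllm_patches`). A minimal sketch of that pattern, assuming `patch` here is `unittest.mock.patch` and vLLM is importable; `fake_per_token_group_quant_fp8` is a hypothetical stand-in, not part of this diff:

from unittest.mock import patch

def fake_per_token_group_quant_fp8(*args, **kwargs):
    # Hypothetical replacement; the real diff substitutes its own pow2-scale implementation.
    ...

patcher = patch(
    "vllm.model_executor.layers.quantization.utils.fp8_utils.per_token_group_quant_fp8",
    fake_per_token_group_quant_fp8,
)
patcher.start()  # from here on, lookups through that module resolve to the replacement
# ... run vLLM code that calls per_token_group_quant_fp8 ...
patcher.stop()   # restore the original vLLM implementation
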
@@ -62,6 +72,33 @@ def init_fp8(vllm_cfg, model_name):
     for p in fp8_state.vllm_patches:
         p.start()
 
+    fp8_patches_applied = True
+
+def patched_run_workers(self, *args, **kwargs):
+    global fp8_patches_applied
+    if not fp8_patches_applied:
+        apply_fp8_patches(self, global_fp8_config)
+        futures = [worker.execute_method.remote(apply_fp8_patches, global_fp8_config) for worker in self.workers]
+        [ray.get(future) for future in futures]
+
+    return original_run_workers(self, *args, **kwargs)
+
+# We patch vLLM's _run_workers so that, before vLLM initializes the model, we execute a remote call that patches
+# each worker with our required fp8 vLLM patches.
+RayDistributedExecutor._run_workers = patched_run_workers
+
+
+def init_fp8(vllm_cfg, model_name):
+    global global_fp8_config
+    global_fp8_config = FP8Config(
+        use_weight_pow2_scale=vllm_cfg.get("pow2_weight_scaling_factors", False),
+        use_activation_pow2_scale=vllm_cfg.get(
+            "pow2_activation_scaling_factors", False
+        ),
+        num_first_layers_in_bf16=vllm_cfg.get("num_first_layers_in_bf16", 0),
+        num_last_layers_in_bf16=vllm_cfg.get("num_last_layers_in_bf16", 0),
+    )
+
     if vllm_cfg.get("use_deep_gemm", False):
         os.environ["VLLM_USE_DEEP_GEMM"] = "1"
 
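
For context, a hedged sketch of how this initialization path is meant to be driven. The config keys, their defaults, and the `vllm_kwargs` return value come from `init_fp8` above; the call site, the model name, and the ordering claim are my reading of the patch and are not shown in the diff:

# Hypothetical driver-side usage (config key names come from init_fp8 above;
# the exact call site in the real codebase is not shown in this diff).
vllm_cfg = {
    "pow2_weight_scaling_factors": True,       # -> FP8Config.use_weight_pow2_scale
    "pow2_activation_scaling_factors": False,  # -> FP8Config.use_activation_pow2_scale
    "num_first_layers_in_bf16": 1,
    "num_last_layers_in_bf16": 1,
    "use_deep_gemm": False,
}

# init_fp8 runs before the vLLM engine is constructed: it fills in global_fp8_config,
# which the patched RayDistributedExecutor._run_workers reads when it pushes
# apply_fp8_patches to the driver process and to every Ray worker.
vllm_kwargs = init_fp8(vllm_cfg, "some-fp8-model")
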
@@ -106,17 +143,6 @@ def init_fp8(vllm_cfg, model_name):
     return vllm_kwargs
 
 
-def init_fp8_cfg(vllm_cfg):
-    global fp8_config
-    fp8_config = FP8Config(
-        use_weight_pow2_scale=vllm_cfg.get("pow2_weight_scaling_factors", False),
-        use_activation_pow2_scale=vllm_cfg.get(
-            "pow2_activation_scaling_factors", False
-        ),
-        num_first_layers_in_bf16=vllm_cfg.get("num_first_layers_in_bf16", 0),
-        num_last_layers_in_bf16=vllm_cfg.get("num_last_layers_in_bf16", 0),
-    )
-
 
 def is_fp8_model(vllm_config):
     from vllm.model_executor.layers.quantization.fp8 import Fp8Config
@@ -269,7 +295,7 @@ def kitchen_block_scale(
     # Calculate descale factor
     descale = max_abs / max_dtype
 
-    if fp8_config.use_weight_pow2_scale:
+    if global_fp8_config.use_weight_pow2_scale:
         exponent = torch.ceil(torch.log2(descale))
         # Post process exponent to be in range of -127 to 127 and to be E8M0 biased
         exponent = torch.clamp(exponent, min=-127, max=127) + 127
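
A small numeric sketch of the power-of-two rounding above, with illustrative values only. It assumes an fp8 e4m3 weight dtype (largest finite value 448); the final un-biasing line is added here purely to show what the biased exponent encodes and is not taken from the diff:

import torch

max_abs = torch.tensor(5.5)    # example per-block max(|w|)
max_dtype = 448.0              # max finite value of fp8 e4m3 (assumed target dtype)
descale = max_abs / max_dtype  # ~0.01228

exponent = torch.ceil(torch.log2(descale))                 # ceil(-6.35) -> -6.0
exponent = torch.clamp(exponent, min=-127, max=127) + 127  # E8M0-biased -> 121.0
pow2_descale = torch.exp2(exponent - 127)                  # 2**-6 = 0.015625 >= descale

Rounding the exponent up with `ceil` guarantees the power-of-two descale never underestimates the true descale, so the quantized values stay within the fp8 range.
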
@@ -483,7 +509,7 @@ def _per_token_group_quant_fp8_colmajor(
 def per_token_group_quant_fp8(
     *args, **kwargs,
 ) -> tuple[torch.Tensor, torch.Tensor]:
-    assert fp8_config.use_activation_pow2_scale
+    assert global_fp8_config.use_activation_pow2_scale
     from vllm.model_executor.layers.quantization.utils.fp8_utils import per_token_group_quant_fp8 as vllm_per_token_group_quant_fp8
     return vllm_per_token_group_quant_fp8(*args, **kwargs)
 