Remove/fix TODOs

ProExpertProg · ProExpertProg · commit 8ffb4744f86e · 2025-10-15T03:25:26.000-04:00
Signed-off-by: Luka Govedič <lgovedic@redhat.com>
diff --git a/tests/compile/test_fusion_attn.py b/tests/compile/test_fusion_attn.py
@@ -101,7 +101,7 @@ def build_attn_metadata(self, batch_size: int) -> AttentionMetadata:
         num_blocks = batch_size * max_blocks
         backend = self.attn.backend
 
-        # TODO use get_kv_cache_stride_order
+        # TODO(luka) use get_kv_cache_stride_order
         # Create dummy KV cache for the selected backend
         if backend == _Backend.ROCM_ATTN:
             # k/v as 1st dimension
diff --git a/tests/compile/test_fusions_e2e.py b/tests/compile/test_fusions_e2e.py
@@ -90,7 +90,7 @@ class ModelBackendTestCase(NamedTuple):
         ModelBackendTestCase(
             model_name="amd/Llama-3.1-8B-Instruct-FP8-KV",
             model_kwargs=dict(max_model_len=1024),
-            backend=_Backend.ROCM_AITER_FA,  # TODO ROCM_AITER_UNIFIED_ATTN
+            backend=_Backend.ROCM_AITER_UNIFIED_ATTN,
             attention_fusions=32,
         ),
     ]
@@ -187,7 +187,7 @@ def custom_ops_product(*custom_ops_lists: list[str]) -> Iterable[str]:
         flat_product(
             MODELS_FP8, custom_ops_product(CUSTOM_OPS_FP8, CUSTOM_OPS_RMS_NORM)
         )
-    )  # TODO
+    )
     # Toggle RMSNorm for FP4 models and unquant models
     + list(flat_product(MODELS_FP4 + MODELS, CUSTOM_OPS_RMS_NORM)),
 )
diff --git a/vllm/compilation/fusion.py b/vllm/compilation/fusion.py
@@ -9,7 +9,7 @@
 from torch._inductor.pattern_matcher import PatternMatcherPass
 from torch._ops import OpOverload
 
-from vllm.config import VllmConfig
+from vllm.config import VllmConfig, get_current_vllm_config
 from vllm.logger import init_logger
 from vllm.model_executor.layers.quantization.utils.quant_utils import (
     GroupShape,
@@ -93,6 +93,8 @@ class RMSNormQuantPattern:
     def __init__(self, epsilon: float, key: FusedRMSQuantKey):
         self.epsilon = epsilon
         self.quant_dtype = key.quant.dtype
+        config = get_current_vllm_config()
+        self.model_dtype = config.model_config.dtype if config.model_config else None
 
         assert key in FUSED_OPS, f"unsupported fused rmsnorm+quant op for {key}"
         self.FUSED_OP = FUSED_OPS[key]
@@ -124,7 +126,7 @@ def pattern(input: torch.Tensor, weight: torch.Tensor, scale: torch.Tensor):
         def replacement(input: torch.Tensor, weight: torch.Tensor, scale: torch.Tensor):
             # In case we're matching native rms-norm, conversions might be
             # optimized out. We convert here just to be safe.
-            input = input.to(dtype=torch.float16)  # TODO model dtype
+            input = input.to(dtype=self.model_dtype)
 
             result = torch.empty_like(input, dtype=self.quant_dtype)
             at = auto_functionalized(
@@ -179,8 +181,8 @@ def replacement(
         ):
             # In case we're matching native rms-norm, conversions might be
             # optimized out. We convert here just to be safe.
-            input = input.to(dtype=torch.float16)  # TODO model dtype
-            residual = residual.to(dtype=torch.float16)
+            input = input.to(dtype=self.model_dtype)
+            residual = residual.to(dtype=self.model_dtype)
 
             result = torch.empty_like(input, dtype=self.quant_dtype)
             at = auto_functionalized(
@@ -235,7 +237,7 @@ def pattern(input: torch.Tensor, weight: torch.Tensor):
         def replacement(input: torch.Tensor, weight: torch.Tensor):
             # In case we're matching native rms-norm, conversions might be
             # optimized out. We convert here just to be safe.
-            input = input.to(dtype=torch.float16)  # TODO model dtype
+            input = input.to(dtype=self.model_dtype)
 
             result = torch.empty_like(input, dtype=self.quant_dtype)
             scale = self.quant_matcher.make_scale(input)
@@ -289,8 +291,8 @@ def replacement(
         ):
             # In case we're matching native rms-norm, conversions might be
             # optimized out. We convert here just to be safe.
-            input = input.to(dtype=torch.float16)  # TODO model dtype
-            residual = residual.to(dtype=torch.float16)
+            input = input.to(dtype=self.model_dtype)
+            residual = residual.to(dtype=self.model_dtype)
 
             result = torch.empty_like(input, dtype=self.quant_dtype)
             scale = self.quant_matcher.make_scale(input)
diff --git a/vllm/compilation/matcher_utils.py b/vllm/compilation/matcher_utils.py
@@ -34,7 +34,9 @@
 
 class MatcherCustomOp(ABC):
     def __init__(self, enabled: bool):
-        self.model_dtype = get_current_vllm_config().model_config.dtype
+        config = get_current_vllm_config()
+        self.model_dtype = config.model_config.dtype if config.model_config else None
+        self.device = config.device_config.device if config.device_config else None
 
         self.enabled = enabled
         self.forward = self.forward_custom if enabled else self.forward_native
@@ -51,10 +53,10 @@ def __call__(self, *args, **kws):
         return self.forward(*args, **kws)
 
     def empty(self, *args, **kws):
-        return torch.empty(*args, dtype=self.model_dtype, device="cuda", **kws)
+        return torch.empty(*args, dtype=self.model_dtype, device=self.device, **kws)
 
     def empty_f32(self, *args, **kws):
-        return torch.empty(*args, dtype=torch.float32, device="cuda", **kws)
+        return torch.empty(*args, dtype=torch.float32, device=self.device, **kws)
 
     def inputs(self) -> list[torch.Tensor]:
         """Utility for inputs to the pattern"""
@@ -166,7 +168,7 @@ def forward_custom(
         input: torch.Tensor,
         scale: torch.Tensor | None = None,
     ) -> tuple[torch.Tensor, torch.Tensor]:
-        # TODO: why does empty_like produce a permute but
+        # TODO(luka): why does empty_like produce a permute but
         #  empty via shape doesn't?
         result = torch.empty(
             input.shape, device=input.device, dtype=self.quant_key.dtype

Original file line number	Diff line number	Diff line change
`@@ -90,7 +90,7 @@ class ModelBackendTestCase(NamedTuple):`
`90`	`90`	`ModelBackendTestCase(`
`91`	`91`	`model_name="amd/Llama-3.1-8B-Instruct-FP8-KV",`
`92`	`92`	`model_kwargs=dict(max_model_len=1024),`
`93`		`- backend=_Backend.ROCM_AITER_FA, # TODO ROCM_AITER_UNIFIED_ATTN`
	`93`	`+ backend=_Backend.ROCM_AITER_UNIFIED_ATTN,`
`94`	`94`	`attention_fusions=32,`
`95`	`95`	`),`
`96`	`96`	`]`
`@@ -187,7 +187,7 @@ def custom_ops_product(*custom_ops_lists: list[str]) -> Iterable[str]:`
`187`	`187`	`flat_product(`
`188`	`188`	`MODELS_FP8, custom_ops_product(CUSTOM_OPS_FP8, CUSTOM_OPS_RMS_NORM)`
`189`	`189`	`)`
`190`		`- ) # TODO`
	`190`	`+ )`
`191`	`191`	`# Toggle RMSNorm for FP4 models and unquant models`
`192`	`192`	`+ list(flat_product(MODELS_FP4 + MODELS, CUSTOM_OPS_RMS_NORM)),`
`193`	`193`	`)`