
Commit bde2a1a

jpvillam-amd and jpvillam authored
[ROCm] Small functional changes for gptoss (#25201)
Signed-off-by: jpvillam <jpvillam@amd.com>
Co-authored-by: jpvillam <jpvillam@amd.com>
1 parent 5e25b12 commit bde2a1a

3 files changed: +26 additions, -6 deletions

vllm/model_executor/layers/quantization/mxfp4.py

Lines changed: 6 additions & 3 deletions

@@ -212,12 +212,15 @@ def create_weights(self, layer: torch.nn.Module, num_experts: int,
             intermediate_size_per_partition_after_pad = round_up(
                 intermediate_size_per_partition, 256)
             hidden_size = round_up(hidden_size, 256)
-        elif current_platform.is_rocm() or (
-                self.mxfp4_backend == Mxfp4Backend.SM100_FI_MXFP4_MXFP8_CUTLASS
-                or self.mxfp4_backend == Mxfp4Backend.SM90_FI_MXFP4_BF16):
+        elif (self.mxfp4_backend == Mxfp4Backend.SM100_FI_MXFP4_MXFP8_CUTLASS
+              or self.mxfp4_backend == Mxfp4Backend.SM90_FI_MXFP4_BF16):
             intermediate_size_per_partition_after_pad = round_up(
                 intermediate_size_per_partition, 128)
             hidden_size = round_up(hidden_size, 128)
+        elif current_platform.is_rocm():
+            intermediate_size_per_partition_after_pad = round_up(
+                intermediate_size_per_partition, 256)
+            hidden_size = round_up(hidden_size, 256)
         else:
             intermediate_size_per_partition_after_pad = round_up(
                 intermediate_size_per_partition, 64)
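
The reordering above changes the ROCm padding: previously ROCm shared the 128-wide padding branch with the SM100 FlashInfer CUTLASS and SM90 BF16 backends; it now gets its own branch that pads both sizes up to multiples of 256. A minimal sketch of the arithmetic, assuming round_up rounds up to the next multiple and using made-up sizes for illustration only:

def round_up(x: int, multiple: int) -> int:
    # Round x up to the nearest multiple of `multiple`.
    return ((x + multiple - 1) // multiple) * multiple

# Hypothetical per-partition sizes, not taken from any real model config.
intermediate_size_per_partition = 1440
hidden_size = 2880

# ROCm branch after this change: pad to multiples of 256.
print(round_up(intermediate_size_per_partition, 256))  # 1536
print(round_up(hidden_size, 256))                      # 3072

# FlashInfer SM100 CUTLASS / SM90 BF16 branch: pad to multiples of 128.
print(round_up(intermediate_size_per_partition, 128))  # 1536
print(round_up(hidden_size, 128))                      # 2944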

vllm/model_executor/layers/quantization/utils/mxfp4_utils.py

Lines changed: 14 additions & 3 deletions

@@ -1,6 +1,6 @@
 # SPDX-License-Identifier: Apache-2.0
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
-from typing import Callable, Optional
+from typing import Any, Callable, Optional

 import torch

@@ -21,15 +21,26 @@ def _swizzle_mxfp4(quant_tensor, scale, num_warps):
     from triton_kernels.tensor import FP4, convert_layout, wrap_torch_tensor
     from triton_kernels.tensor_details import layout
     from triton_kernels.tensor_details.layout import StridedLayout
+
+    value_layout_opts: dict[str, Any] = {}
+    scale_layout_opts: dict[str, Any] = {}
+
     if (current_platform.is_cuda()
             and current_platform.is_device_capability(90)
             and not is_torch_equal_or_newer("2.8.1")):
         logger.warning_once(
             "Mxfp4 on hopper is running on torch < 2.8.1, "
             "this cause swizling to be disabled, which may "
             "cause performance degradation. Please upgrade to torch nightly")
-        value_layout, value_layout_opts = StridedLayout, dict()
-        scale_layout, scale_layout_opts = StridedLayout, dict()
+        value_layout = StridedLayout
+        scale_layout = StridedLayout
+    elif current_platform.is_rocm():
+        from triton_kernels.tensor_details.layout import (GFX950MXScaleLayout,
+                                                           StridedLayout)
+
+        from vllm.platforms.rocm import on_gfx950
+        value_layout = StridedLayout
+        scale_layout = GFX950MXScaleLayout if on_gfx950() else StridedLayout
     else:
         value_layout, value_layout_opts = \
             layout.make_default_matmul_mxfp4_w_layout(mx_axis=1)
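
The net effect of the new branch is that ROCm keeps a strided layout for the quantized values and switches the scale tensor to GFX950MXScaleLayout only when running on gfx950, with the option dicts now initialized once up front. A standalone sketch of that selection logic, with placeholder classes standing in for the triton_kernels layouts and plain booleans standing in for the vLLM platform checks:

from typing import Any


class StridedLayout:  # placeholder for triton_kernels' StridedLayout
    pass


class GFX950MXScaleLayout:  # placeholder for the gfx950 MX-scale layout
    pass


def pick_layouts(is_rocm: bool, is_gfx950: bool):
    # Option dicts are created once; the ROCm branch leaves them empty.
    value_layout_opts: dict[str, Any] = {}
    scale_layout_opts: dict[str, Any] = {}
    if is_rocm:
        # Values stay strided on ROCm; scales use the gfx950-specific
        # layout only when the gfx950 architecture is detected.
        value_layout = StridedLayout
        scale_layout = GFX950MXScaleLayout if is_gfx950 else StridedLayout
    else:
        # Other platforms keep their existing defaults (elided here).
        value_layout = scale_layout = StridedLayout
    return (value_layout, value_layout_opts), (scale_layout, scale_layout_opts)


# Example: pick_layouts(is_rocm=True, is_gfx950=True)
# -> ((StridedLayout, {}), (GFX950MXScaleLayout, {}))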

vllm/platforms/rocm.py

Lines changed: 6 additions & 0 deletions

@@ -118,6 +118,12 @@ def on_gfx9() -> bool:
     return any(arch in GPU_ARCH for arch in ["gfx90a", "gfx942", "gfx950"])


+@cache
+def on_gfx950() -> bool:
+    GPU_ARCH = torch.cuda.get_device_properties("cuda").gcnArchName
+    return any(arch in GPU_ARCH for arch in ["gfx950"])
+
+
 @cache
 def use_rocm_custom_paged_attention(
     qtype: torch.dtype,
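
on_gfx950 mirrors the existing on_gfx9 helper: it reads gcnArchName from the device properties, checks for the gfx950 architecture string, and memoizes the result via @cache. A short sketch of how a caller is expected to gate on it, in the spirit of the mxfp4_utils.py change above; the branch bodies are placeholders:

from vllm.platforms.rocm import on_gfx950

if on_gfx950():
    # gfx950-specific path, e.g. selecting the swizzled MX scale layout.
    ...
else:
    # Fallback for other ROCm architectures.
    ...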
