
Commit 247348e

Author: ilmarkov
Update test. Disable QR by default. Set fp16 ovfl flag.
Signed-off-by: ilmarkov <imarkov@redhat.com>
1 parent f8bf2e9 commit 247348e

File tree

  csrc/quickreduce/quick_reduce_impl.cuh
  tests/distributed/test_custom_all_reduce.py
  vllm/distributed/device_communicators/custom_all_reduce.py
  vllm/envs.py

4 files changed: +81 additions, -49 deletions

csrc/quickreduce/quick_reduce_impl.cuh

Lines changed: 6 additions & 10 deletions
@@ -12,7 +12,9 @@ struct CodecBase {
   __quickreduce_device_inline__ CodecBase(int thread, int rank)
       : thread(thread),
         rank(rank),
-        group_leader((threadIdx.x / kThreadGroupSize) * kThreadGroupSize) {}
+        group_leader((threadIdx.x / kThreadGroupSize) * kThreadGroupSize) {
+    set_fp16_ovfl(true);
+  }
 };

 // Default full precision codec.
@@ -98,9 +100,7 @@ struct CodecQ4 : public CodecBase {
   static constexpr int kRangeBias = 0x00080008;

   __quickreduce_device_inline__ CodecQ4(int thread, int rank)
-      : CodecBase(thread, rank) {
-    set_fp16_ovfl(true);
-  }
+      : CodecBase(thread, rank) {}

   __quickreduce_device_inline__ void send(int32x4_t* __restrict__ send_buffer,
                                           const int32x4_t* __restrict__ data) {
@@ -253,9 +253,7 @@ struct CodecQ6 : public CodecBase {
   static constexpr int kRangeBias = 0x00200020;

   __quickreduce_device_inline__ CodecQ6(int thread, int rank)
-      : CodecBase(thread, rank) {
-    set_fp16_ovfl(true);
-  }
+      : CodecBase(thread, rank) {}

   __quickreduce_device_inline__ void send(int32x4_t* __restrict__ send_buffer,
                                           const int32x4_t* __restrict__ data) {
@@ -431,9 +429,7 @@ struct CodecQ8 : public CodecBase {
   static constexpr int kRangeBias = 0x00800080;

   __quickreduce_device_inline__ CodecQ8(int thread, int rank)
-      : CodecBase(thread, rank) {
-    set_fp16_ovfl(true);
-  }
+      : CodecBase(thread, rank) {}

   __quickreduce_device_inline__ void send(int32x4_t* __restrict__ send_buffer,
                                           int32x4_t const* __restrict__ data) {

tests/distributed/test_custom_all_reduce.py

Lines changed: 37 additions & 4 deletions
@@ -1,6 +1,7 @@
 # SPDX-License-Identifier: Apache-2.0
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project

+import os
 import random

 import pytest
@@ -86,7 +87,7 @@ def graph_allreduce(


 @ray.remote(num_gpus=1, max_calls=1)
-def eager_allreduce(
+def eager_custom_allreduce(
     monkeypatch: pytest.MonkeyPatch,
     tp_size,
     pp_size,
@@ -111,19 +112,51 @@ def eager_allreduce(
         inp = torch.ones(sz, dtype=torch.float32, device=device)
         out = inp
         for _ in range(num_communication):
-            out = fa.all_reduce(out, registered=False)
+            out = fa.ca_all_reduce(out, registered=False)
         torch.testing.assert_close(out, inp * (tp_size**num_communication))

         inp = torch.ones(sz * 4, dtype=torch.bfloat16, device=device)
         out = inp
         for _ in range(num_communication):
-            out = fa.all_reduce(out, registered=False)
+            out = fa.ca_all_reduce(out, registered=False)
         torch.testing.assert_close(out, inp * (tp_size**num_communication))


+@ray.remote(num_gpus=1, max_calls=1)
+def eager_quickreduce(
+    monkeypatch: pytest.MonkeyPatch,
+    tp_size,
+    pp_size,
+    rank,
+    distributed_init_port,
+):
+    with monkeypatch.context() as m:
+        m.delenv("CUDA_VISIBLE_DEVICES", raising=False)
+        os.environ["VLLM_ROCM_QR_QUANT_REGIME"] = "FP"
+        device = torch.device(f"cuda:{rank}")
+        torch.cuda.set_device(device)
+        init_test_distributed_environment(tp_size, pp_size, rank,
+                                          distributed_init_port)
+
+        sz = 1024 * 1024
+        fa = get_tp_group().device_communicator.ca_comm
+        inp = torch.ones(sz, dtype=torch.float16, device=device)
+        out = inp
+        out = fa.qr_all_reduce(out)
+        torch.testing.assert_close(out, inp * tp_size)
+
+        sz = 1024 * 1024
+        inp = torch.ones(sz * 4, dtype=torch.bfloat16, device=device)
+        out = inp
+        out = fa.qr_all_reduce(out)
+        torch.testing.assert_close(out, inp * tp_size)
+
+
 @pytest.mark.parametrize("tp_size", [2])
 @pytest.mark.parametrize("pipeline_parallel_size", [1, 2])
-@pytest.mark.parametrize("test_target", [eager_allreduce, graph_allreduce])
+@pytest.mark.parametrize(
+    "test_target",
+    [eager_custom_allreduce, graph_allreduce, eager_quickreduce])
 def test_custom_allreduce(
     monkeypatch: pytest.MonkeyPatch,
     tp_size,
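
For illustration (not part of the commit): the new eager_quickreduce target pins the quant regime to "FP" through the environment before the distributed setup, then calls qr_all_reduce directly on the tensor-parallel communicator. A minimal sketch of the same call pattern outside the test harness follows; the vllm.distributed import path is an assumption, and the distributed/TP setup must already have run on every rank, exactly as the test does.

    # Sketch only: mirrors the qr_all_reduce call pattern from eager_quickreduce.
    # Assumes VLLM_ROCM_QR_QUANT_REGIME was set (e.g. to "FP") before the
    # communicator was created, and that the distributed/TP setup already ran.
    import torch
    from vllm.distributed import get_tp_group  # import path is an assumption

    fa = get_tp_group().device_communicator.ca_comm
    inp = torch.ones(1024 * 1024, dtype=torch.float16, device="cuda")
    out = fa.qr_all_reduce(inp)  # out-of-place; each element becomes tp_size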

vllm/distributed/device_communicators/custom_all_reduce.py

Lines changed: 36 additions & 33 deletions
@@ -71,11 +71,11 @@ class CustomAllreduce:
     # TODO: We should set a reasonable range for FP.
     MB = 1024 * 1024
     _QR_MIN_SIZE = {
-        (torch.float16, 2): [16 * MB, 2 * MB, 2 * MB, 1 * MB],
-        (torch.float16, 4): [16 * MB, 64 * MB, 4 * MB, 2 * MB],
+        (torch.float16, 2): [1 * MB, 2 * MB, 2 * MB, 1 * MB],
+        (torch.float16, 4): [1 * MB, 64 * MB, 4 * MB, 2 * MB],
         (torch.float16, 8): [16 * MB, 4 * MB, 4 * MB, 2 * MB],
-        (torch.bfloat16, 2): [16 * MB, 8 * MB, 8 * MB, 8 * MB],
-        (torch.bfloat16, 4): [16 * MB, 128 * MB, 128 * MB, 16 * MB],
+        (torch.bfloat16, 2): [2 * MB, 8 * MB, 8 * MB, 8 * MB],
+        (torch.bfloat16, 4): [8 * MB, 128 * MB, 128 * MB, 16 * MB],
         (torch.bfloat16, 8): [16 * MB, 2048 * MB, 2048 * MB, 2048 * MB],
     }

@@ -256,40 +256,43 @@ def init_custom_quick_allreduce(self):
         Initialize a custom quick allreduce implementation for AMD
         based on quick reduce (https://github.com/mk1-project/quickreduce).
         """
+        if not self._QR_SHOULD_INIT:
+            return
+        self.use_fp16_kernels = envs.VLLM_ROCM_QR_CAST_BF16_TO_FP16
+        regime_str = envs.VLLM_ROCM_QR_QUANT_REGIME
+        if regime_str not in QuickReduceRegime.__members__:
+            logger.warning(
+                "Custom quick allreduce:",
+                f"Invalid quantization level: {regime_str}. "
+                "Supported levels: "
+                f"{list(QuickReduceRegime.__members__.keys())}")
+            return
+
+        if regime_str == "NONE":
+            logger.debug("Custom quick allreduce is disabled based "
+                         "on env variable VLLM_ROCM_QR_QUANT_REGIME")
+            return
+
         vllm_config = get_current_vllm_config()
-        # for test mode
-        if vllm_config is not None and hasattr(vllm_config, "model_config"):
+        if vllm_config is not None and \
+            hasattr(vllm_config, "model_config") and \
+                hasattr(vllm_config.model_config, "dtype"):
             dtype = vllm_config.model_config.dtype
             if dtype not in [torch.float16, torch.bfloat16]:
                 self._QR_SHOULD_INIT = False
-        # On RocM bfloat16 kernels are slower than fp16
-        # due to slower match operations
-        # If environment is not set to 1 we convert input to fp16
-        self.use_fp16_kernels: bool = envs.VLLM_ROCM_QR_CAST_BF16_TO_FP16
-        regime_str = envs.VLLM_ROCM_QR_QUANT_REGIME
-        if self._QR_SHOULD_INIT:
-            if regime_str not in QuickReduceRegime.__members__:
-                logger.warning(
-                    "Custom quick allreduce:",
-                    f"Invalid quantization level: {regime_str}. "
-                    "Supported levels: "
-                    f"{list(QuickReduceRegime.__members__.keys())}")
-                return
-
-            if regime_str == "NONE":
-                logger.debug("Custom quick allreduce is disabled based "
-                             "on env variable VLLM_ROCM_QR_QUANT_REGIME")
-                return
-
-            self.qr_quant_level = QuickReduceRegime[regime_str]
-            self._qr_ptr = ops.init_custom_qr(self.rank, self.world_size)
-            self.create_qr_shared_buffer()
+            # On RocM bfloat16 kernels are slower than fp16
+            # due to slower match operations
+            # If environment variable is not set to 1 we convert input to fp16
             if dtype == torch.bfloat16 and not self.use_fp16_kernels:
                 logger.info(
                     "Custom quick allreduce: converting bf16 inputs to "
                     "fp16 can improve performance"
                     "set envs.VLLM_ROCM_QR_CAST_BF16_TO_FP16=1 to turn on.")
-            self.qr_disabled = False
+
+        self.qr_quant_level = QuickReduceRegime[regime_str]
+        self._qr_ptr = ops.init_custom_qr(self.rank, self.world_size)
+        self.create_qr_shared_buffer()
+        self.qr_disabled = False

     @contextmanager
     def capture(self):
@@ -346,7 +349,7 @@ def should_quick_allreduce(self, inp: torch.Tensor):
         if self.use_fp16_kernels:
             dtype = torch.float16
         return inp_size <= self.qr_max_size and \
-            inp_size > self._QR_MIN_SIZE[(dtype, self.world_size)]\
+            inp_size >= self._QR_MIN_SIZE[(dtype, self.world_size)]\
                 [self.qr_quant_level.value]

     def should_custom_allreduce(self, inp: torch.Tensor):
@@ -369,7 +372,7 @@ def should_custom_ar(self, inp: torch.Tensor):
         return self.should_quick_allreduce(
             inp) or self.should_custom_allreduce(inp)

-    def cr_all_reduce(self,
+    def ca_all_reduce(self,
                       inp: torch.Tensor,
                       *,
                       out: torch.Tensor = None,
@@ -411,7 +414,7 @@ def custom_all_reduce(self, input: torch.Tensor) -> Optional[torch.Tensor]:
         if self.should_custom_allreduce(input):
             if self._IS_CAPTURING:
                 if torch.cuda.is_current_stream_capturing():
-                    return self.cr_all_reduce(input, registered=True)
+                    return self.ca_all_reduce(input, registered=True)
                 else:
                     # If warm up, mimic the allocation pattern since custom
                     # allreduce is out-of-place.
@@ -421,7 +424,7 @@ def custom_all_reduce(self, input: torch.Tensor) -> Optional[torch.Tensor]:
                 # incurs a cost of cudaMemcpy, which should be small
                 # (<=1% of overall latency) compared to the performance
                 # gain of using custom kernels
-                return self.cr_all_reduce(input, registered=False)
+                return self.ca_all_reduce(input, registered=False)

         return None
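
For illustration (not part of the commit): the lowered _QR_MIN_SIZE entries and the switch from '>' to '>=' decide when quick reduce handles a tensor. A standalone sketch of that gate is below; it assumes inp_size is the tensor's byte size and that the per-level list is indexed in the order FP, INT8, INT6, INT4, matching the comment in vllm/envs.py.

    # Standalone sketch of the should_quick_allreduce gate after this commit.
    import torch

    MB = 1024 * 1024
    QR_MIN_SIZE = {  # values copied from the diff above
        (torch.float16, 2): [1 * MB, 2 * MB, 2 * MB, 1 * MB],
        (torch.float16, 4): [1 * MB, 64 * MB, 4 * MB, 2 * MB],
        (torch.float16, 8): [16 * MB, 4 * MB, 4 * MB, 2 * MB],
        (torch.bfloat16, 2): [2 * MB, 8 * MB, 8 * MB, 8 * MB],
        (torch.bfloat16, 4): [8 * MB, 128 * MB, 128 * MB, 16 * MB],
        (torch.bfloat16, 8): [16 * MB, 2048 * MB, 2048 * MB, 2048 * MB],
    }

    def should_quick_allreduce(inp, world_size, quant_level, qr_max_size,
                               cast_bf16_to_fp16=False):
        inp_size = inp.numel() * inp.element_size()  # tensor size in bytes
        dtype = torch.float16 if cast_bf16_to_fp16 else inp.dtype
        # The commit makes the lower bound inclusive ('>=' instead of '>').
        return (inp_size <= qr_max_size
                and inp_size >= QR_MIN_SIZE[(dtype, world_size)][quant_level])

    # A 1 MiB fp16 tensor on a 2-GPU group now qualifies for the FP regime
    # (level 0): 1 MB >= 1 MB, where the old table required 16 MB.
    x = torch.empty(512 * 1024, dtype=torch.float16)  # 512Ki elems * 2 B = 1 MiB
    print(should_quick_allreduce(x, world_size=2, quant_level=0,
                                 qr_max_size=2048 * MB))  # True (cap chosen arbitrarily here)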

vllm/envs.py

Lines changed: 2 additions & 2 deletions
@@ -129,7 +129,7 @@
     VLLM_SLEEP_WHEN_IDLE: bool = False
     VLLM_MQ_MAX_CHUNK_BYTES_MB: int = 16
     VLLM_KV_CACHE_LAYOUT: Optional[str] = None
-    VLLM_ROCM_QR_QUANT_REGIME: str = "FP"
+    VLLM_ROCM_QR_QUANT_REGIME: str = "NONE"
     VLLM_ROCM_QR_CAST_BF16_TO_FP16: bool = False


@@ -677,7 +677,7 @@ def get_vllm_port() -> Optional[int]:
     # Choice of quantization level: FP, INT8, INT6, INT4 or NONE
     # Recommended for large models to get allreduce
     "VLLM_ROCM_QR_QUANT_REGIME":
-    lambda: os.getenv("VLLM_ROCM_QR_QUANT_REGIME", "FP").upper(),
+    lambda: os.getenv("VLLM_ROCM_QR_QUANT_REGIME", "NONE").upper(),

     # Custom quick allreduce kernel for MI3* cards
     # Due to the lack of the bfloat16 asm instruction, bfloat16
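
For illustration (not part of the commit): with the default regime now NONE, quick reduce is opt-in. Because the value is read through os.getenv, setting the variable in the environment before vLLM initializes the communicator (as the new eager_quickreduce test does via os.environ) turns it back on; both variable names and the accepted values come from the diff above.

    import os

    # Opt back in to quick reduce; accepted values per the envs.py comment:
    # FP, INT8, INT6, INT4 or NONE (NONE is now the default).
    os.environ["VLLM_ROCM_QR_QUANT_REGIME"] = "FP"

    # Optional: cast bf16 inputs to the faster fp16 kernels (off by default).
    os.environ["VLLM_ROCM_QR_CAST_BF16_TO_FP16"] = "1"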
