Commit 7cdb800

Flashinfer cutlass moe backend for TP/DP + EP.
1 parent 657f2f3 commit 7cdb800

15 files changed: +904 −145 lines changed

benchmarks/benchmark_throughput.py

Lines changed: 3 additions & 0 deletions

@@ -28,6 +28,7 @@
     VisionArenaDataset,
 )
 from benchmark_utils import convert_to_pytorch_benchmark_format, write_to_json
+from vllm.distributed import cleanup_dist_env_and_memory
 from vllm.engine.arg_utils import AsyncEngineArgs, EngineArgs
 from vllm.entrypoints.openai.api_server import (
     build_async_engine_client_from_engine_args,
@@ -110,6 +111,8 @@ def run_vllm(
             ),
         )
         end = time.perf_counter()
+
+    cleanup_dist_env_and_memory()
     return end - start, outputs

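For orientation (not part of the diff): the benchmark change simply releases the distributed state and GPU memory once timing is done. A minimal standalone sketch of the same pattern, assuming a locally available model; the model name and prompt are illustrative only:

# Sketch: time a generation pass, then tear down the distributed environment
# with the same helper the benchmark now calls.
import time

from vllm import LLM, SamplingParams
from vllm.distributed import cleanup_dist_env_and_memory

llm = LLM(model="facebook/opt-125m")  # illustrative model choice
start = time.perf_counter()
outputs = llm.generate(["Hello, my name is"], SamplingParams(max_tokens=16))
elapsed = time.perf_counter() - start

del llm
cleanup_dist_env_and_memory()  # release process groups and cached GPU memory
print(f"{len(outputs)} outputs in {elapsed:.2f}s")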
vllm/distributed/device_communicators/base_device_communicator.py

Lines changed: 16 additions & 2 deletions

@@ -1,7 +1,7 @@
 # SPDX-License-Identifier: Apache-2.0
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 import threading
-from typing import Optional
+from typing import List, Optional, Union
 from weakref import WeakValueDictionary
 
 import torch
@@ -138,9 +138,23 @@ def all_gather(self, input_: torch.Tensor, dim: int = -1) -> torch.Tensor:
                                               input_size[dim + 1:])
         return output_tensor
 
+    def all_gatherv(self,
+                    input_: Union[torch.Tensor, List[torch.Tensor]],
+                    dim: int = 0,
+                    sizes: Optional[List[int]] = None):
+        assert False, "not implemented"
+
+    def all_gatherv(self,
+                    input_: Union[torch.Tensor, List[torch.Tensor]],
+                    dim: int = 0,
+                    sizes: Optional[List[int]] = None):
+        assert False, "not implemented"
+
     def reduce_scatter(self,
                        input_: torch.Tensor,
-                       dim: int = -1) -> torch.Tensor:
+                       dim: int = -1,
+                       sizes: Optional[List[int]] = None) -> torch.Tensor:
+        assert sizes is None, "Varying size reduce scatter not supported with base device communicator"
         world_size = self.world_size
         # Bypass the function if we are using only 1 GPU.
         if world_size == 1:

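For reference (not part of the commit): the new all_gatherv hook gathers along dim 0 and lets each rank contribute a different number of rows via sizes; the base class only fixes the signature and leaves the implementation to device-specific subclasses. A single-process sketch of the intended semantics, with made-up shapes:

# Single-process reference for the all_gatherv contract: concatenate the
# per-rank chunks along dim 0, where sizes[r] is the row count of rank r.
from typing import List

import torch


def all_gatherv_semantics(per_rank_chunks: List[torch.Tensor],
                          sizes: List[int]) -> torch.Tensor:
    assert len(per_rank_chunks) == len(sizes)
    for chunk, size in zip(per_rank_chunks, sizes):
        assert chunk.shape[0] == size
    return torch.cat(per_rank_chunks, dim=0)


# Three "ranks" contributing 2, 1 and 3 rows of hidden size 4.
chunks = [torch.randn(s, 4) for s in (2, 1, 3)]
assert all_gatherv_semantics(chunks, [2, 1, 3]).shape == (6, 4)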
vllm/distributed/device_communicators/cuda_communicator.py

Lines changed: 60 additions & 5 deletions

@@ -1,7 +1,7 @@
 # SPDX-License-Identifier: Apache-2.0
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
-from typing import Optional
+from typing import List, Optional, Union
 
 import torch
 from torch.distributed import ProcessGroup
@@ -117,7 +117,10 @@ def all_reduce(self, input_):
         torch.distributed.all_reduce(out, group=self.device_group)
         return out
 
-    def reduce_scatter(self, input_: torch.Tensor, dim: int = -1):
+    def reduce_scatter(self,
+                       input_: torch.Tensor,
+                       dim: int = -1,
+                       sizes: Optional[List[int]] = None):
         world_size = self.world_size
         pynccl_comm = self.pynccl_comm
         assert pynccl_comm is not None
@@ -129,15 +132,20 @@ def reduce_scatter(self, input_: torch.Tensor, dim: int = -1):
         # the input_tensor contiguous. Possible bug in reduce_scatter_tensor?
         input_tensor = input_.movedim(0, dim).contiguous()
 
-        assert input_tensor.shape[0] % world_size == 0
-        chunk_size = input_tensor.shape[0] // world_size
+        if sizes is not None:
+            assert len(sizes) == world_size
+            assert input_tensor.shape[0] == sum(sizes)
+            chunk_size = sizes[self.rank_in_group]
+        else:
+            assert input_tensor.shape[0] % world_size == 0
+            chunk_size = input_tensor.shape[0] // world_size
         output_shape = (chunk_size, ) + input_tensor.shape[1:]
 
         output = torch.empty(output_shape,
                              dtype=input_tensor.dtype,
                              device=input_tensor.device)
 
-        pynccl_comm.reduce_scatter(output, input_)
+        pynccl_comm.reduce_scatter(output, input_, sizes=sizes)
 
         # Reshape before returning
         return output.movedim(0, dim).contiguous()
@@ -180,6 +188,53 @@ def destroy(self):
             self.all2all_manager.destroy()
             self.all2all_manager = None
 
+    """
+    Allgather with support for list of tensors and varying sizes per rank.
+    Example:
+      Instead of:
+        ... = get_ep_group().dispatch(...)
+      Use this:
+        ... = get_dp_group().all_gatherv([topk_weights, topk_ids, a1q, a1q_scale], dim=0, sizes=get_forward_context().dp_metadata.num_tokens_across_dp_cpu)
+    """
+
+    def all_gatherv(self,
+                    input_: Union[torch.Tensor, List[torch.Tensor]],
+                    dim: int = 0,
+                    sizes: Optional[List[int]] = None):
+        assert dim == 0, "only dim 0 all-gather is supported"
+        world_size = self.world_size
+        pynccl_comm = self.pynccl_comm
+        assert pynccl_comm is not None and not pynccl_comm.disabled
+
+        def _all_gather_single(input_: torch.Tensor,
+                               sizes: Optional[List[int]] = None):
+            input_size = input_.size()
+            if sizes is not None:
+                assert len(sizes) == world_size
+                assert input_.shape[dim] == sizes[self.rank_in_group]
+                output_size = (sum(sizes), ) + input_size[1:]
+                # 'sizes' is not needed if all inputs in the same group have the same shape
+                if all(s == sizes[0] for s in sizes):
+                    sizes = None
+            else:
+                output_size = (input_size[0] * world_size, ) + input_size[1:]
+            # Allocate output tensor.
+            output_tensor = torch.empty(output_size,
+                                        dtype=input_.dtype,
+                                        device=input_.device)
+            pynccl_comm.all_gather(output_tensor, input_, sizes=sizes)
+            return output_tensor
+
+        if isinstance(input_, torch.Tensor):
+            return _all_gather_single(input_, sizes)
+
+        pynccl_comm.group_start()
+        output_list = []
+        for inp in input_:
+            output_list.append(_all_gather_single(inp, sizes=sizes))
+        pynccl_comm.group_end()
+        return output_list
+
     def dispatch(
             self, hidden_states: torch.Tensor,
             router_logits: torch.Tensor) -> tuple[torch.Tensor, torch.Tensor]:

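The docstring above already shows the intended call site: gathering dispatched MoE tensors over the DP group with per-rank token counts. As a complement, here is a single-process sketch (not vLLM code) of what the sizes-aware reduce_scatter computes: row blocks are summed elementwise across ranks and each rank keeps only its own block:

# Reference semantics for reduce_scatter(..., sizes=...): every rank holds
# sum(sizes) rows; after an elementwise SUM across ranks, rank r keeps the
# block of sizes[r] rows that belongs to it.
from typing import List

import torch


def reduce_scatterv_semantics(per_rank_inputs: List[torch.Tensor],
                              sizes: List[int], rank: int) -> torch.Tensor:
    reduced = torch.stack(per_rank_inputs).sum(dim=0)
    return torch.split(reduced, sizes, dim=0)[rank]


# World size 2, blocks of 3 and 1 rows, hidden size 4.
inputs = [torch.ones(4, 4), 2 * torch.ones(4, 4)]
out = reduce_scatterv_semantics(inputs, [3, 1], rank=0)
assert out.shape == (3, 4) and torch.all(out == 3)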
vllm/distributed/device_communicators/pynccl.py

Lines changed: 60 additions & 14 deletions

@@ -1,8 +1,9 @@
 # SPDX-License-Identifier: Apache-2.0
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
-from typing import Optional, Union
+from typing import List, Optional, Union
 
+import numpy as np
 # ===================== import region =====================
 import torch
 import torch.distributed as dist
@@ -135,7 +136,8 @@ def all_reduce(self,
     def all_gather(self,
                    output_tensor: torch.Tensor,
                    input_tensor: torch.Tensor,
-                   stream=None):
+                   stream=None,
+                   sizes: Optional[List[int]] = None):
         if self.disabled:
             return
         # nccl communicator created on a specific device
@@ -146,17 +148,38 @@ def all_gather(self,
             f"but the input tensor is on {input_tensor.device}")
         if stream is None:
             stream = current_stream()
-        self.nccl.ncclAllGather(
-            buffer_type(input_tensor.data_ptr()),
-            buffer_type(output_tensor.data_ptr()), input_tensor.numel(),
-            ncclDataTypeEnum.from_torch(input_tensor.dtype), self.comm,
-            cudaStream_t(stream.cuda_stream))
+        if sizes is not None:
+            assert output_tensor.shape[0] == sum(sizes)
+            numel_base = int(np.prod(output_tensor.shape[1:]))
+            split_offset = 0
+            self.nccl.ncclGroupStart()
+            for root, split_size in enumerate(sizes):
+                dst_slice = output_tensor[split_offset:split_offset +
+                                          split_size]
+                self.nccl.ncclBroadcast(
+                    buffer_type(input_tensor.data_ptr()),
+                    buffer_type(dst_slice.data_ptr()),
+                    split_size * numel_base,
+                    ncclDataTypeEnum.from_torch(input_tensor.dtype),
+                    root,
+                    self.comm,
+                    cudaStream_t(stream.cuda_stream),
+                )
+                split_offset += split_size
+            self.nccl.ncclGroupEnd()
+        else:
+            self.nccl.ncclAllGather(
+                buffer_type(input_tensor.data_ptr()),
+                buffer_type(output_tensor.data_ptr()), input_tensor.numel(),
+                ncclDataTypeEnum.from_torch(input_tensor.dtype), self.comm,
+                cudaStream_t(stream.cuda_stream))
 
     def reduce_scatter(self,
                        output_tensor: torch.Tensor,
                        input_tensor: torch.Tensor,
                        op: ReduceOp = ReduceOp.SUM,
-                       stream=None):
+                       stream=None,
+                       sizes: Optional[List[int]] = None):
         if self.disabled:
             return
         # nccl communicator created on a specific device
@@ -167,12 +190,29 @@ def reduce_scatter(self,
             f"but the input tensor is on {input_tensor.device}")
         if stream is None:
             stream = current_stream()
-        self.nccl.ncclReduceScatter(
-            buffer_type(input_tensor.data_ptr()),
-            buffer_type(output_tensor.data_ptr()), output_tensor.numel(),
-            ncclDataTypeEnum.from_torch(input_tensor.dtype),
-            ncclRedOpTypeEnum.from_torch(op), self.comm,
-            cudaStream_t(stream.cuda_stream))
+
+        if sizes is not None:
+            numel_base = int(np.prod(input_tensor.shape[1:]))
+            split_offset = 0
+            self.nccl.ncclGroupStart()
+            for root, split_size in enumerate(sizes):
+                chunk = input_tensor[split_offset:split_offset + split_size, :]
+                self.nccl.ncclReduce(
+                    buffer_type(chunk.data_ptr()),
+                    buffer_type(output_tensor.data_ptr()),
+                    split_size * numel_base,
+                    ncclDataTypeEnum.from_torch(input_tensor.dtype),
+                    ncclRedOpTypeEnum.from_torch(op), root, self.comm,
+                    cudaStream_t(stream.cuda_stream))
+                split_offset += split_size
+            self.nccl.ncclGroupEnd()
+        else:
+            self.nccl.ncclReduceScatter(
+                buffer_type(input_tensor.data_ptr()),
+                buffer_type(output_tensor.data_ptr()), output_tensor.numel(),
+                ncclDataTypeEnum.from_torch(input_tensor.dtype),
+                ncclRedOpTypeEnum.from_torch(op), self.comm,
+                cudaStream_t(stream.cuda_stream))
 
     def send(self, tensor: torch.Tensor, dst: int, stream=None):
         if self.disabled:
@@ -216,3 +256,9 @@ def broadcast(self, tensor: torch.Tensor, src: int, stream=None):
         self.nccl.ncclBroadcast(sendbuff, recvbuff, tensor.numel(),
                                 ncclDataTypeEnum.from_torch(tensor.dtype), src,
                                 self.comm, cudaStream_t(stream.cuda_stream))
+
+    def group_start(self):
+        self.nccl.ncclGroupStart()
+
+    def group_end(self):
+        self.nccl.ncclGroupEnd()

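To see why the sizes path above loops over ncclBroadcast / ncclReduce inside a group: a variable-size all-gather can be decomposed into one broadcast per root rank, each landing in that rank's slice of the output. A CPU-only, single-process sketch of that decomposition (the real code issues the grouped NCCL calls instead of the assignment below):

# Decomposition used by the variable-size all_gather: for each root rank,
# "broadcast" that rank's rows into the matching slice of the output buffer.
from typing import List

import torch


def allgatherv_as_broadcasts(per_rank_inputs: List[torch.Tensor],
                             sizes: List[int]) -> torch.Tensor:
    out = torch.empty((sum(sizes), ) + tuple(per_rank_inputs[0].shape[1:]),
                      dtype=per_rank_inputs[0].dtype)
    offset = 0
    for root, split_size in enumerate(sizes):
        # stand-in for ncclBroadcast(root=root) into out[offset:offset+size]
        out[offset:offset + split_size] = per_rank_inputs[root]
        offset += split_size
    return out


chunks = [torch.full((s, 2), float(r)) for r, s in enumerate([2, 3, 1])]
result = allgatherv_as_broadcasts(chunks, [2, 3, 1])
assert result.shape == (6, 2) and torch.all(result[2:5] == 1.0)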
vllm/distributed/device_communicators/pynccl_wrapper.py

Lines changed: 32 additions & 0 deletions

@@ -154,6 +154,16 @@ class NCCLLibrary:
             ncclRedOp_t, ncclComm_t, cudaStream_t
         ]),
 
+        # ncclResult_t ncclReduce(
+        #   const void* sendbuff, void* recvbuff, size_t count,
+        #   ncclDataType_t datatype, ncclRedOp_t op, int root,
+        #   ncclComm_t comm, cudaStream_t stream);
+        # note that cudaStream_t is a pointer type, so the last argument
+        # is a pointer
+        Function("ncclReduce", ncclResult_t, [
+            buffer_type, buffer_type, ctypes.c_size_t, ncclDataType_t,
+            ncclRedOp_t, ctypes.c_int, ncclComm_t, cudaStream_t
+        ]),
         # ncclResult_t ncclAllGather(
         #   const void* sendbuff, void* recvbuff, size_t count,
         #   ncclDataType_t datatype, ncclComm_t comm,
@@ -207,6 +217,10 @@ class NCCLLibrary:
         # it is better not to call it at all.
        # ncclResult_t ncclCommDestroy(ncclComm_t comm);
         Function("ncclCommDestroy", ncclResult_t, [ncclComm_t]),
+        # ncclResult_t ncclGroupStart();
+        Function("ncclGroupStart", ncclResult_t, []),
+        # ncclResult_t ncclGroupEnd();
+        Function("ncclGroupEnd", ncclResult_t, []),
     ]
 
     # class attribute to store the mapping from the path to the library
@@ -300,6 +314,18 @@ def ncclAllReduce(self, sendbuff: buffer_type, recvbuff: buffer_type,
                                                      datatype, op, comm,
                                                      stream))
 
+    def ncclReduce(self, sendbuff: buffer_type, recvbuff: buffer_type,
+                   count: int, datatype: int, op: int, root: int,
+                   comm: ncclComm_t, stream: cudaStream_t) -> None:
+        # `datatype` actually should be `ncclDataType_t`
+        # and `op` should be `ncclRedOp_t`
+        # both are aliases of `ctypes.c_int`
+        # when we pass int to a function, it will be converted to `ctypes.c_int`
+        # by ctypes automatically
+        self.NCCL_CHECK(self._funcs["ncclReduce"](sendbuff, recvbuff, count,
+                                                  datatype, op, root, comm,
+                                                  stream))
+
     def ncclReduceScatter(self, sendbuff: buffer_type, recvbuff: buffer_type,
                           count: int, datatype: int, op: int, comm: ncclComm_t,
                           stream: cudaStream_t) -> None:
@@ -342,6 +368,12 @@ def ncclBroadcast(self, sendbuff: buffer_type, recvbuff: buffer_type,
     def ncclCommDestroy(self, comm: ncclComm_t) -> None:
         self.NCCL_CHECK(self._funcs["ncclCommDestroy"](comm))
 
+    def ncclGroupStart(self) -> None:
+        self.NCCL_CHECK(self._funcs["ncclGroupStart"]())
+
+    def ncclGroupEnd(self) -> None:
+        self.NCCL_CHECK(self._funcs["ncclGroupEnd"]())
+
 
 __all__ = [
     "NCCLLibrary", "ncclDataTypeEnum", "ncclRedOpTypeEnum", "ncclUniqueId",

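The wrapper changes follow the existing binding pattern: declare the C prototype once in the function table, then expose a typed Python method that goes through NCCL_CHECK. A tiny, GPU-free illustration of that ctypes pattern (not vLLM code; it binds libm's fabs instead of an NCCL symbol and assumes a POSIX build where the symbol is visible in the current process):

# Illustration of the prototype-table binding style used by NCCLLibrary,
# applied to a harmless libm function so it runs without NCCL or GPUs.
import ctypes
from dataclasses import dataclass
from typing import Any, List


@dataclass
class Function:  # analogous to the wrapper's Function records
    name: str
    restype: Any
    argtypes: List[Any]


exported_functions = [Function("fabs", ctypes.c_double, [ctypes.c_double])]

lib = ctypes.CDLL(None)  # symbols already loaded into this process (POSIX)
funcs = {}
for func in exported_functions:
    f = getattr(lib, func.name)
    f.restype = func.restype
    f.argtypes = func.argtypes
    funcs[func.name] = f

assert funcs["fabs"](-3.0) == 3.0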
vllm/distributed/parallel_state.py

Lines changed: 18 additions & 7 deletions

@@ -30,7 +30,7 @@
 from contextlib import contextmanager, nullcontext
 from dataclasses import dataclass
 from multiprocessing import shared_memory
-from typing import Any, Callable, Optional, Union
+from typing import Any, Callable, List, Optional, Union
 from unittest.mock import patch
 
 import torch
@@ -381,9 +381,16 @@ def _all_gather_out_place(self, input_: torch.Tensor,
                               dim: int) -> torch.Tensor:
         return self.device_communicator.all_gather(input_, dim)
 
+    def all_gatherv(self,
+                    input_: Union[torch.Tensor, List[torch.Tensor]],
+                    dim: int = 0,
+                    sizes: Optional[List[int]] = None):
+        return self.device_communicator.all_gatherv(input_, dim, sizes)
+
     def reduce_scatter(self,
                        input_: torch.Tensor,
-                       dim: int = -1) -> torch.Tensor:
+                       dim: int = -1,
+                       sizes: Optional[List[int]] = None) -> torch.Tensor:
         world_size = self.world_size
         # Bypass the function if we are using only 1 GPU.
         if world_size == 1:
@@ -392,16 +399,20 @@ def reduce_scatter(self,
             f"Invalid dim ({dim}) for input tensor with shape {input_.size()}")
 
         if self.use_custom_op_call:
+            assert sizes is None, "Varying size reduce scatter not supported with vllm custom op"
             return torch.ops.vllm.reduce_scatter(input_,
                                                  dim,
                                                  world_size,
                                                  group_name=self.unique_name)
         else:
-            return self._reduce_scatter_out_place(input_, dim)
-
-    def _reduce_scatter_out_place(self, input_: torch.Tensor,
-                                  dim: int) -> torch.Tensor:
-        return self.device_communicator.reduce_scatter(input_, dim)
+            return self._reduce_scatter_out_place(input_, dim, sizes)
+
+    def _reduce_scatter_out_place(
+            self,
+            input_: torch.Tensor,
+            dim: int,
+            sizes: Optional[List[int]] = None) -> torch.Tensor:
+        return self.device_communicator.reduce_scatter(input_, dim, sizes)
 
     def gather(self,
                input_: torch.Tensor,

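GroupCoordinator now just forwards all_gatherv and the optional sizes of reduce_scatter to the device communicator, with the vLLM custom-op path still restricted to equal sizes. A CPU-only, runnable sketch (gloo backend, two processes, plain torch.distributed rather than vLLM API) of the group-level behaviour a caller gets from all_gatherv:

# Two CPU processes gather ragged per-rank tensors: rank 0 contributes 3 rows,
# rank 1 contributes 1 row, and both end up with the 4-row concatenation.
import os

import torch
import torch.distributed as dist
import torch.multiprocessing as mp


def worker(rank: int, world_size: int, sizes):
    os.environ["MASTER_ADDR"] = "127.0.0.1"
    os.environ["MASTER_PORT"] = "29501"  # arbitrary free port
    dist.init_process_group("gloo", rank=rank, world_size=world_size)
    local = torch.full((sizes[rank], 2), float(rank))
    gathered = [None] * world_size
    dist.all_gather_object(gathered, local)  # object-based, so sizes may differ
    out = torch.cat(gathered, dim=0)
    assert out.shape == (sum(sizes), 2)
    dist.destroy_process_group()


if __name__ == "__main__":
    mp.spawn(worker, args=(2, [3, 1]), nprocs=2)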
vllm/envs.py

Lines changed: 5 additions & 0 deletions

@@ -121,6 +121,7 @@
     VLLM_TPU_BUCKET_PADDING_GAP: int = 0
     VLLM_TPU_MOST_MODEL_LEN: Optional[int] = None
     VLLM_USE_DEEP_GEMM: bool = False
+    VLLM_USE_FLASHINFER_MOE: bool = False
     VLLM_XGRAMMAR_CACHE_MB: int = 0
     VLLM_MSGPACK_ZERO_COPY_THRESHOLD: int = 256
     VLLM_ALLOW_INSECURE_SERIALIZATION: bool = False
@@ -867,6 +868,10 @@ def get_vllm_port() -> Optional[int]:
     "VLLM_USE_DEEP_GEMM":
     lambda: bool(int(os.getenv("VLLM_USE_DEEP_GEMM", "0"))),
 
+    # Allow use of FlashInfer CUTLASS kernels for fused moe ops.
+    "VLLM_USE_FLASHINFER_MOE":
+    lambda: bool(int(os.getenv("VLLM_USE_FLASHINFER_MOE", "0"))),
+
     # Control the cache sized used by the xgrammar compiler. The default
     # of 512 MB should be enough for roughly 1000 JSON schemas.
     # It can be changed with this variable if needed for some reason.

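The new flag follows the usual vLLM env-var convention: parsed with bool(int(...)), off by default, and surfaced as vllm.envs.VLLM_USE_FLASHINFER_MOE. A minimal sketch of how the value is interpreted, mirroring the lambda above:

# VLLM_USE_FLASHINFER_MOE is read as an integer string: "1" enables the
# FlashInfer CUTLASS fused-MoE path, "0" or unset leaves it disabled.
import os

os.environ["VLLM_USE_FLASHINFER_MOE"] = "1"
enabled = bool(int(os.getenv("VLLM_USE_FLASHINFER_MOE", "0")))
assert enabled is True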