From 968b95e27437e205ce2b5c0eba6afa9f2640dcc7 Mon Sep 17 00:00:00 2001 From: Abatom Date: Wed, 25 Jun 2025 10:39:35 +0800 Subject: [PATCH 01/58] proxy Signed-off-by: Abatom --- .../disagg_xpyd/disagg_prefill_proxy_xpyd.py | 39 +++++++++++++++++-- 1 file changed, 35 insertions(+), 4 deletions(-) diff --git a/examples/online_serving/disagg_xpyd/disagg_prefill_proxy_xpyd.py b/examples/online_serving/disagg_xpyd/disagg_prefill_proxy_xpyd.py index 73f2caaa0dbd..93b2b106b595 100644 --- a/examples/online_serving/disagg_xpyd/disagg_prefill_proxy_xpyd.py +++ b/examples/online_serving/disagg_xpyd/disagg_prefill_proxy_xpyd.py @@ -3,7 +3,9 @@ import os import socket import threading +import time import uuid +from typing import Any import aiohttp import msgpack @@ -11,12 +13,25 @@ from quart import Quart, make_response, request count = 0 -prefill_instances: dict[str, str] = {} # http_address: zmq_address -decode_instances: dict[str, str] = {} # http_address: zmq_address +prefill_instances: dict[str, Any] = {} # http_address: (zmq_address, stamp) +decode_instances: dict[str, Any] = {} # http_address: (zmq_address, stamp) prefill_cv = threading.Condition() decode_cv = threading.Condition() +DEFAULT_PING_SECONDS = 5 + + +def _remove_oldest_instances(instances: dict[str, Any]) -> None: + oldest_key = next(iter(instances), None) + while oldest_key is not None: + value = instances[oldest_key] + if value[1] > time.time(): + break + print(f"🔴Remove [HTTP:{oldest_key}, ZMQ:{value[0]}, stamp:{value[1]}]") + instances.pop(oldest_key, None) + oldest_key = next(iter(instances), None) + def _listen_for_register(poller, router_socket): while True: @@ -30,12 +45,23 @@ def _listen_for_register(poller, router_socket): global prefill_instances global prefill_cv with prefill_cv: - prefill_instances[data["http_address"]] = data["zmq_address"] + node = prefill_instances.pop(data["http_address"], None) + prefill_instances[data["http_address"]] = ( + data["zmq_address"], + time.time() + 
DEFAULT_PING_SECONDS, + ) + _remove_oldest_instances(prefill_instances) + elif data["type"] == "D": global decode_instances global decode_cv with decode_cv: - decode_instances[data["http_address"]] = data["zmq_address"] + node = decode_instances.pop(data["http_address"], None) + decode_instances[data["http_address"]] = ( + data["zmq_address"], + time.time() + DEFAULT_PING_SECONDS, + ) + _remove_oldest_instances(decode_instances) else: print( "Unexpected, Received message from %s, data: %s", @@ -43,6 +69,9 @@ def _listen_for_register(poller, router_socket): data, ) + if node is None: + print(f"🔵Add [HTTP:{data['http_address']}, ZMQ:{data['http_address']}") + def start_service_discovery(hostname, port): if not hostname: @@ -104,12 +133,14 @@ async def handle_request(): with prefill_cv: prefill_list = list(prefill_instances.items()) prefill_addr, prefill_zmq_addr = prefill_list[count % len(prefill_list)] + prefill_zmq_addr = prefill_zmq_addr[0] global decode_instances global decode_cv with decode_cv: decode_list = list(decode_instances.items()) decode_addr, decode_zmq_addr = decode_list[count % len(decode_list)] + decode_zmq_addr = decode_zmq_addr[0] print( f"handle_request count: {count}, [HTTP:{prefill_addr}, " From b4d14e7e14ecafef46c6c31cede98197393f4e3f Mon Sep 17 00:00:00 2001 From: Abatom Date: Wed, 25 Jun 2025 11:08:44 +0800 Subject: [PATCH 02/58] format Signed-off-by: Abatom --- .../kv_transfer/kv_connector/v1/p2p/p2p_nccl_engine.py | 1 - 1 file changed, 1 deletion(-) diff --git a/vllm/distributed/kv_transfer/kv_connector/v1/p2p/p2p_nccl_engine.py b/vllm/distributed/kv_transfer/kv_connector/v1/p2p/p2p_nccl_engine.py index 81f7a2525896..97588bd2e0df 100644 --- a/vllm/distributed/kv_transfer/kv_connector/v1/p2p/p2p_nccl_engine.py +++ b/vllm/distributed/kv_transfer/kv_connector/v1/p2p/p2p_nccl_engine.py @@ -470,7 +470,6 @@ def get_finished( request_id, None) self.recv_request_id_to_tensor_ids.pop( request_id, None) - addr = 0 if isinstance(tensor, tuple): addr, 
_, _ = tensor self.pool.free(addr) From f71822b990057595be32635c6b6dcfe5d97ed381 Mon Sep 17 00:00:00 2001 From: Abatom Date: Wed, 25 Jun 2025 11:20:15 +0800 Subject: [PATCH 03/58] _listen_for_requests Signed-off-by: Abatom --- .../kv_connector/v1/p2p/p2p_nccl_engine.py | 157 +++++++++--------- 1 file changed, 80 insertions(+), 77 deletions(-) diff --git a/vllm/distributed/kv_transfer/kv_connector/v1/p2p/p2p_nccl_engine.py b/vllm/distributed/kv_transfer/kv_connector/v1/p2p/p2p_nccl_engine.py index 97588bd2e0df..957cdde342d4 100644 --- a/vllm/distributed/kv_transfer/kv_connector/v1/p2p/p2p_nccl_engine.py +++ b/vllm/distributed/kv_transfer/kv_connector/v1/p2p/p2p_nccl_engine.py @@ -292,87 +292,90 @@ def recv_tensor( def _listen_for_requests(self): while True: socks = dict(self.poller.poll()) - if self.router_socket in socks: - remote_address, message = self.router_socket.recv_multipart() - data = msgpack.loads(message) - if data["cmd"] == "NEW": - unique_id = self.nccl.unique_id_from_bytes( - bytes(data["unique_id"])) - with torch.cuda.device(self.device): - rank = 1 - with set_p2p_nccl_context(self.nccl_num_channels): - comm: ncclComm_t = self.nccl.ncclCommInitRank( - 2, unique_id, rank) - self.comms[remote_address.decode()] = (comm, rank) - logger.info( - "🤝ncclCommInitRank Success, %s👈%s, MyRank:%s", - self.zmq_address, remote_address.decode(), rank) - elif data["cmd"] == "PUT": - tensor_id = data["tensor_id"] - try: - tensor = torch.empty(data["shape"], - dtype=getattr( - torch, data["dtype"]), - device=self.device) - self.router_socket.send_multipart( - [remote_address, b"0"]) - comm, rank = self.comms[remote_address.decode()] - self._recv(comm, tensor, rank ^ 1, self.recv_stream) - tensor_size = tensor.element_size() * tensor.numel() - if (self.buffer_size + tensor_size - > self.buffer_size_threshold): - # Store Tensor in memory pool - addr = self.pool.store_tensor(tensor) - tensor = (addr, tensor.dtype, tensor.shape) - logger.warning( - "🔴[PUT]Recv Tensor, Out 
Of Threshold, " - "%s👈%s, data:%s, addr:%d", self.zmq_address, - remote_address.decode(), data, addr) - else: - self.buffer_size += tensor_size - - except torch.cuda.OutOfMemoryError: - self.router_socket.send_multipart( - [remote_address, b"1"]) - tensor = None + if self.router_socket not in socks: + continue + + remote_address, message = self.router_socket.recv_multipart() + data = msgpack.loads(message) + remote = remote_address.decode() + if data["cmd"] == "NEW": + unique_id = self.nccl.unique_id_from_bytes( + bytes(data["unique_id"])) + with torch.cuda.device(self.device): + rank = 1 + with set_p2p_nccl_context(self.nccl_num_channels): + comm: ncclComm_t = self.nccl.ncclCommInitRank( + 2, unique_id, rank) + self.comms[remote] = (comm, rank) + logger.info( + "🤝ncclCommInitRank Success, %s👈%s, MyRank:%s", + self.zmq_address, remote, rank) + elif data["cmd"] == "PUT": + tensor_id = data["tensor_id"] + try: + tensor = torch.empty(data["shape"], + dtype=getattr( + torch, data["dtype"]), + device=self.device) + self.router_socket.send_multipart( + [remote_address, b"0"]) + comm, rank = self.comms[remote] + self._recv(comm, tensor, rank ^ 1, self.recv_stream) + tensor_size = tensor.element_size() * tensor.numel() + if (self.buffer_size + tensor_size + > self.buffer_size_threshold): + # Store Tensor in memory pool + addr = self.pool.store_tensor(tensor) + tensor = (addr, tensor.dtype, tensor.shape) logger.warning( - "🔴[PUT]Recv Tensor, Out Of Memory, %s👈%s, " - "data:%s", self.zmq_address, - remote_address.decode(), data) - - with self.recv_store_cv: - self.recv_store[tensor_id] = tensor - self._have_received_tensor_id(tensor_id) - self.recv_store_cv.notify() - - elif data["cmd"] == "GET": - tensor_id = data["tensor_id"] - with self.send_store_cv: - tensor = self.send_store.pop(tensor_id, None) - if tensor is not None: - data = { - "ret": 0, - "shape": tensor.shape, - "dtype": - str(tensor.dtype).replace("torch.", "") - } - # LRU - self.send_store[tensor_id] = tensor 
- self._have_sent_tensor_id(tensor_id) - else: - data = {"ret": 1} + "🔴[PUT]Recv Tensor, Out Of Threshold, " + "%s👈%s, data:%s, addr:%d", self.zmq_address, + remote, data, addr) + else: + self.buffer_size += tensor_size + except torch.cuda.OutOfMemoryError: self.router_socket.send_multipart( - [remote_address, msgpack.dumps(data)]) - - if data["ret"] == 0: - comm, rank = self.comms[remote_address.decode()] - self._send(comm, tensor.to(self.device), rank ^ 1, - self.send_stream) - else: + [remote_address, b"1"]) + tensor = None logger.warning( - "🚧Unexpected, Received message from %s, data:%s", - remote_address, data) + "🔴[PUT]Recv Tensor, Out Of Memory, %s👈%s, " + "data:%s", self.zmq_address, + remote, data) + + with self.recv_store_cv: + self.recv_store[tensor_id] = tensor + self._have_received_tensor_id(tensor_id) + self.recv_store_cv.notify() + + elif data["cmd"] == "GET": + tensor_id = data["tensor_id"] + with self.send_store_cv: + tensor = self.send_store.pop(tensor_id, None) + if tensor is not None: + data = { + "ret": 0, + "shape": tensor.shape, + "dtype": + str(tensor.dtype).replace("torch.", "") + } + # LRU + self.send_store[tensor_id] = tensor + self._have_sent_tensor_id(tensor_id) + else: + data = {"ret": 1} + + self.router_socket.send_multipart( + [remote_address, msgpack.dumps(data)]) + + if data["ret"] == 0: + comm, rank = self.comms[remote] + self._send(comm, tensor.to(self.device), rank ^ 1, + self.send_stream) + else: + logger.warning( + "🚧Unexpected, Received message from %s, data:%s", + remote_address, data) def _have_sent_tensor_id(self, tensor_id: str): request_id = tensor_id.split('#')[0] From 38f19f5190b08e44e745e6b2b198599fca8c417a Mon Sep 17 00:00:00 2001 From: Abatom Date: Wed, 25 Jun 2025 18:19:37 +0800 Subject: [PATCH 04/58] received Signed-off-by: Abatom --- .../kv_connector/v1/p2p/p2p_nccl_connector.py | 8 ++++---- .../kv_connector/v1/p2p/p2p_nccl_engine.py | 15 +++++++++++---- 2 files changed, 15 insertions(+), 8 deletions(-) diff 
--git a/vllm/distributed/kv_transfer/kv_connector/v1/p2p/p2p_nccl_connector.py b/vllm/distributed/kv_transfer/kv_connector/v1/p2p/p2p_nccl_connector.py index a47deaf91272..3c6eabac908e 100644 --- a/vllm/distributed/kv_transfer/kv_connector/v1/p2p/p2p_nccl_connector.py +++ b/vllm/distributed/kv_transfer/kv_connector/v1/p2p/p2p_nccl_connector.py @@ -317,10 +317,10 @@ def get_num_new_matched_tokens( num_external_tokens = (len(request.prompt_token_ids) - 1 - num_computed_tokens) - if num_external_tokens < 0: - num_external_tokens = 0 + if num_external_tokens <= 0: + return 0, False - return num_external_tokens, False + return num_external_tokens, True def update_state_after_alloc(self, request: "Request", blocks: "KVCacheBlocks", @@ -328,7 +328,7 @@ def update_state_after_alloc(self, request: "Request", """ Update KVConnector state after block allocation. """ - if not self.is_producer and num_external_tokens > 0: + if not self.is_producer and num_external_tokens == 0: self._requests_need_load[request.request_id] = ( request, blocks.get_block_ids()[0]) diff --git a/vllm/distributed/kv_transfer/kv_connector/v1/p2p/p2p_nccl_engine.py b/vllm/distributed/kv_transfer/kv_connector/v1/p2p/p2p_nccl_engine.py index 957cdde342d4..026bb853ea51 100644 --- a/vllm/distributed/kv_transfer/kv_connector/v1/p2p/p2p_nccl_engine.py +++ b/vllm/distributed/kv_transfer/kv_connector/v1/p2p/p2p_nccl_engine.py @@ -43,7 +43,7 @@ def set_p2p_nccl_context(num_channels: str): for var in env_vars: original_values[var] = os.environ.get(var) - logger.info("set_p2p_nccl_context, original_values: %s", original_values) + logger.debug("set_p2p_nccl_context, original_values: %s", original_values) try: os.environ['NCCL_MAX_NCHANNELS'] = num_channels @@ -183,7 +183,7 @@ def _create_connect(self, remote_address: typing.Optional[str] = None): comm: ncclComm_t = self.nccl.ncclCommInitRank( 2, unique_id, rank) self.comms[remote_address] = (comm, rank) - logger.info("🤝ncclCommInitRank Success, %s👉%s, MyRank: %s", 
+ logger.info("🤝ncclCommInitRank Success, %s👉%s, MyRank:%s", self.zmq_address, remote_address, rank) return self.socks[remote_address], self.comms[remote_address] @@ -406,7 +406,7 @@ def wait_for_sent(self): while self.send_queue: self.send_queue_cv.wait() duration = time.time() - start_time - logger.debug( + logger.info( "🚧[PUT_ASYNC]It took %.3fms to wait for the send_queue" " to be empty, rank:%d", duration * 1000, self.rank) @@ -477,11 +477,18 @@ def get_finished( addr, _, _ = tensor self.pool.free(addr) + num_layers = len(forward_context.no_compile_layers) # TODO:Retrieve requests that have already sent the KV cache. finished_sending: set[str] = set() - # TODO:Retrieve requests that have already received the KV cache. + # Retrieve requests that have already received the KV cache. + # TODO: 1)Avoid polling. 2)Validate chunked prefill and preemption. finished_recving: set[str] = set() + for request_id in self.recv_request_id_to_tensor_ids: + if num_layers == len(self.recv_request_id_to_tensor_ids[request_id]): + finished_recving.add(request_id) + for request_id in finished_recving: + self.recv_request_id_to_tensor_ids.pop(request_id, None) return finished_sending or None, finished_recving or None From 64a2113205908ccacd079a6474e5380d80ccdfc9 Mon Sep 17 00:00:00 2001 From: Abatom Date: Wed, 25 Jun 2025 19:29:13 +0800 Subject: [PATCH 05/58] recv_tensor Signed-off-by: Abatom --- .../kv_connector/v1/p2p/p2p_nccl_engine.py | 16 +++++++--------- 1 file changed, 7 insertions(+), 9 deletions(-) diff --git a/vllm/distributed/kv_transfer/kv_connector/v1/p2p/p2p_nccl_engine.py b/vllm/distributed/kv_transfer/kv_connector/v1/p2p/p2p_nccl_engine.py index 026bb853ea51..21f3434724fa 100644 --- a/vllm/distributed/kv_transfer/kv_connector/v1/p2p/p2p_nccl_engine.py +++ b/vllm/distributed/kv_transfer/kv_connector/v1/p2p/p2p_nccl_engine.py @@ -239,11 +239,12 @@ def recv_tensor( remote_address: typing.Optional[str] = None, ) -> torch.Tensor: if self.send_type == "PUT" or 
self.send_type == "PUT_ASYNC": - start_time = time.time() with self.recv_store_cv: - while tensor_id not in self.recv_store: - self.recv_store_cv.wait() - tensor = self.recv_store[tensor_id] + if tensor_id not in self.recv_store: + logger.warning( + "🔴[PUT]Recv From %s, tensor_id:%s not exist, rank:%d", + remote_address, tensor_id,self.rank) + tensor = self.recv_store.get(tensor_id) if tensor is not None: if isinstance(tensor, tuple): @@ -254,11 +255,8 @@ def recv_tensor( self.buffer_size -= (tensor.element_size() * tensor.numel()) else: - duration = time.time() - start_time - logger.warning( - "🔴[PUT]Recv From %s, tensor_id:%s, duration:%.3fms, " - "rank:%d", remote_address, tensor_id, duration * 1000, - self.rank) + logger.warning("🔴[PUT]Recv From %s, tensor_id:%s, rank:%d", + remote_address, tensor_id,self.rank) return tensor # GET From 9b719e0dc3d4d92c33a387cb5d5868870cda6ff5 Mon Sep 17 00:00:00 2001 From: Abatom Date: Wed, 25 Jun 2025 20:05:55 +0800 Subject: [PATCH 06/58] sent Signed-off-by: Abatom --- .../kv_connector/v1/p2p/p2p_nccl_connector.py | 15 +++--------- .../kv_connector/v1/p2p/p2p_nccl_engine.py | 24 ++++++++----------- 2 files changed, 13 insertions(+), 26 deletions(-) diff --git a/vllm/distributed/kv_transfer/kv_connector/v1/p2p/p2p_nccl_connector.py b/vllm/distributed/kv_transfer/kv_connector/v1/p2p/p2p_nccl_connector.py index 3c6eabac908e..2843663aba07 100644 --- a/vllm/distributed/kv_transfer/kv_connector/v1/p2p/p2p_nccl_connector.py +++ b/vllm/distributed/kv_transfer/kv_connector/v1/p2p/p2p_nccl_connector.py @@ -265,9 +265,8 @@ def extract_kv_from_layer( kv_cache, remote_address) def wait_for_save(self): - if self.is_producer: - assert self.p2p_nccl_engine is not None - self.p2p_nccl_engine.wait_for_sent() + """P2pNcclConnector does not save explicitly.""" + pass def get_finished( self, finished_req_ids: set[str], @@ -414,14 +413,6 @@ def build_connector_meta( block_ids=block_ids, block_size=self._block_size) - # Requests loaded 
asynchronously are not in the scheduler_output. - # for request_id in self._requests_need_load: - # request, block_ids = self._requests_need_load[request_id] - # meta.add_request(request_id=request.request_id, - # token_ids=request.prompt_token_ids, - # block_ids=block_ids, - # block_size=self._block_size) - self._requests_need_load.clear() return meta @@ -443,7 +434,7 @@ def request_finished( self.chunked_prefill.pop(request.request_id, None) - return False, None + return True, None # ============================== # Static methods diff --git a/vllm/distributed/kv_transfer/kv_connector/v1/p2p/p2p_nccl_engine.py b/vllm/distributed/kv_transfer/kv_connector/v1/p2p/p2p_nccl_engine.py index 21f3434724fa..ee32fddad789 100644 --- a/vllm/distributed/kv_transfer/kv_connector/v1/p2p/p2p_nccl_engine.py +++ b/vllm/distributed/kv_transfer/kv_connector/v1/p2p/p2p_nccl_engine.py @@ -397,17 +397,6 @@ def _send_async(self): self.send_queue_cv.notify() self._send_sync(tensor_id, tensor, remote_address) - def wait_for_sent(self): - if self.send_type == "PUT_ASYNC": - start_time = time.time() - with self.send_queue_cv: - while self.send_queue: - self.send_queue_cv.wait() - duration = time.time() - start_time - logger.info( - "🚧[PUT_ASYNC]It took %.3fms to wait for the send_queue" - " to be empty, rank:%d", duration * 1000, self.rank) - def _send_sync( self, tensor_id: str, @@ -475,12 +464,18 @@ def get_finished( addr, _, _ = tensor self.pool.free(addr) + # TODO: 1)Avoid polling. 2)Validate chunked prefill and preemption. num_layers = len(forward_context.no_compile_layers) - # TODO:Retrieve requests that have already sent the KV cache. + # Retrieve requests that have already sent the KV cache. 
finished_sending: set[str] = set() - + if self.send_type != "GET": + for request_id in self.send_request_id_to_tensor_ids: + if (num_layers == + len(self.send_request_id_to_tensor_ids[request_id])): + finished_sending.add(request_id) + for request_id in finished_sending: + self.send_request_id_to_tensor_ids.pop(request_id, None) # Retrieve requests that have already received the KV cache. - # TODO: 1)Avoid polling. 2)Validate chunked prefill and preemption. finished_recving: set[str] = set() for request_id in self.recv_request_id_to_tensor_ids: if num_layers == len(self.recv_request_id_to_tensor_ids[request_id]): @@ -488,6 +483,7 @@ def get_finished( for request_id in finished_recving: self.recv_request_id_to_tensor_ids.pop(request_id, None) + # TODO: Add failed requests (e.g., transmission errors) return finished_sending or None, finished_recving or None def _ping(self): From 3ed382926aa2990417062b03d330670233403247 Mon Sep 17 00:00:00 2001 From: Abatom Date: Wed, 25 Jun 2025 20:22:27 +0800 Subject: [PATCH 07/58] zmq_address Signed-off-by: Abatom --- .../online_serving/disagg_xpyd/disagg_prefill_proxy_xpyd.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/examples/online_serving/disagg_xpyd/disagg_prefill_proxy_xpyd.py b/examples/online_serving/disagg_xpyd/disagg_prefill_proxy_xpyd.py index 93b2b106b595..ad92805ec9aa 100644 --- a/examples/online_serving/disagg_xpyd/disagg_prefill_proxy_xpyd.py +++ b/examples/online_serving/disagg_xpyd/disagg_prefill_proxy_xpyd.py @@ -70,7 +70,7 @@ def _listen_for_register(poller, router_socket): ) if node is None: - print(f"🔵Add [HTTP:{data['http_address']}, ZMQ:{data['http_address']}") + print(f"🔵Add [HTTP:{data['http_address']}, ZMQ:{data['zmq_address']}") def start_service_discovery(hostname, port): From a44370872fc1ef655e158b41c09fe45dd791c7cc Mon Sep 17 00:00:00 2001 From: Abatom Date: Wed, 25 Jun 2025 21:08:12 +0800 Subject: [PATCH 08/58] send_queue_cv Signed-off-by: Abatom --- 
.../kv_transfer/kv_connector/v1/p2p/p2p_nccl_engine.py | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/vllm/distributed/kv_transfer/kv_connector/v1/p2p/p2p_nccl_engine.py b/vllm/distributed/kv_transfer/kv_connector/v1/p2p/p2p_nccl_engine.py index ee32fddad789..eb2d3435c53e 100644 --- a/vllm/distributed/kv_transfer/kv_connector/v1/p2p/p2p_nccl_engine.py +++ b/vllm/distributed/kv_transfer/kv_connector/v1/p2p/p2p_nccl_engine.py @@ -393,8 +393,6 @@ def _send_async(self): while not self.send_queue: self.send_queue_cv.wait() tensor_id, remote_address, tensor = self.send_queue.popleft() - if not self.send_queue: - self.send_queue_cv.notify() self._send_sync(tensor_id, tensor, remote_address) def _send_sync( @@ -428,7 +426,7 @@ def _send_sync( response.decode()) return False - self._send(comm, tensor.to(self.device), rank ^ 1, self.send_stream) + self._send(comm, tensor, rank ^ 1, self.send_stream) if self.send_type == "PUT_ASYNC": self._have_sent_tensor_id(tensor_id) From f9ab67f7266e63083a3fdd074d74bb8c92e61035 Mon Sep 17 00:00:00 2001 From: Abatom Date: Wed, 25 Jun 2025 23:43:05 +0800 Subject: [PATCH 09/58] bugfix & format Signed-off-by: Abatom --- .../kv_connector/v1/p2p/p2p_nccl_connector.py | 4 +-- .../kv_connector/v1/p2p/p2p_nccl_engine.py | 31 ++++++++----------- 2 files changed, 15 insertions(+), 20 deletions(-) diff --git a/vllm/distributed/kv_transfer/kv_connector/v1/p2p/p2p_nccl_connector.py b/vllm/distributed/kv_transfer/kv_connector/v1/p2p/p2p_nccl_connector.py index 2843663aba07..4ca4e205bf64 100644 --- a/vllm/distributed/kv_transfer/kv_connector/v1/p2p/p2p_nccl_connector.py +++ b/vllm/distributed/kv_transfer/kv_connector/v1/p2p/p2p_nccl_connector.py @@ -266,7 +266,7 @@ def extract_kv_from_layer( def wait_for_save(self): """P2pNcclConnector does not save explicitly.""" - pass + return def get_finished( self, finished_req_ids: set[str], @@ -434,7 +434,7 @@ def request_finished( self.chunked_prefill.pop(request.request_id, None) - return 
True, None + return self.is_producer, None # ============================== # Static methods diff --git a/vllm/distributed/kv_transfer/kv_connector/v1/p2p/p2p_nccl_engine.py b/vllm/distributed/kv_transfer/kv_connector/v1/p2p/p2p_nccl_engine.py index eb2d3435c53e..2d048c9b2b07 100644 --- a/vllm/distributed/kv_transfer/kv_connector/v1/p2p/p2p_nccl_engine.py +++ b/vllm/distributed/kv_transfer/kv_connector/v1/p2p/p2p_nccl_engine.py @@ -243,7 +243,7 @@ def recv_tensor( if tensor_id not in self.recv_store: logger.warning( "🔴[PUT]Recv From %s, tensor_id:%s not exist, rank:%d", - remote_address, tensor_id,self.rank) + remote_address, tensor_id, self.rank) tensor = self.recv_store.get(tensor_id) if tensor is not None: @@ -256,7 +256,7 @@ def recv_tensor( tensor.numel()) else: logger.warning("🔴[PUT]Recv From %s, tensor_id:%s, rank:%d", - remote_address, tensor_id,self.rank) + remote_address, tensor_id, self.rank) return tensor # GET @@ -305,18 +305,15 @@ def _listen_for_requests(self): comm: ncclComm_t = self.nccl.ncclCommInitRank( 2, unique_id, rank) self.comms[remote] = (comm, rank) - logger.info( - "🤝ncclCommInitRank Success, %s👈%s, MyRank:%s", - self.zmq_address, remote, rank) + logger.info("🤝ncclCommInitRank Success, %s👈%s, MyRank:%s", + self.zmq_address, remote, rank) elif data["cmd"] == "PUT": tensor_id = data["tensor_id"] try: tensor = torch.empty(data["shape"], - dtype=getattr( - torch, data["dtype"]), + dtype=getattr(torch, data["dtype"]), device=self.device) - self.router_socket.send_multipart( - [remote_address, b"0"]) + self.router_socket.send_multipart([remote_address, b"0"]) comm, rank = self.comms[remote] self._recv(comm, tensor, rank ^ 1, self.recv_stream) tensor_size = tensor.element_size() * tensor.numel() @@ -333,13 +330,11 @@ def _listen_for_requests(self): self.buffer_size += tensor_size except torch.cuda.OutOfMemoryError: - self.router_socket.send_multipart( - [remote_address, b"1"]) + self.router_socket.send_multipart([remote_address, b"1"]) tensor = 
None logger.warning( "🔴[PUT]Recv Tensor, Out Of Memory, %s👈%s, " - "data:%s", self.zmq_address, - remote, data) + "data:%s", self.zmq_address, remote, data) with self.recv_store_cv: self.recv_store[tensor_id] = tensor @@ -354,8 +349,7 @@ def _listen_for_requests(self): data = { "ret": 0, "shape": tensor.shape, - "dtype": - str(tensor.dtype).replace("torch.", "") + "dtype": str(tensor.dtype).replace("torch.", "") } # LRU self.send_store[tensor_id] = tensor @@ -468,15 +462,16 @@ def get_finished( finished_sending: set[str] = set() if self.send_type != "GET": for request_id in self.send_request_id_to_tensor_ids: - if (num_layers == - len(self.send_request_id_to_tensor_ids[request_id])): + if (num_layers == len( + self.send_request_id_to_tensor_ids[request_id])): finished_sending.add(request_id) for request_id in finished_sending: self.send_request_id_to_tensor_ids.pop(request_id, None) # Retrieve requests that have already received the KV cache. finished_recving: set[str] = set() for request_id in self.recv_request_id_to_tensor_ids: - if num_layers == len(self.recv_request_id_to_tensor_ids[request_id]): + if num_layers == len( + self.recv_request_id_to_tensor_ids[request_id]): finished_recving.add(request_id) for request_id in finished_recving: self.recv_request_id_to_tensor_ids.pop(request_id, None) From 0eaf48c39da7b1ba5ba0c9d78955f7205b2a6c4f Mon Sep 17 00:00:00 2001 From: Abatom Date: Thu, 26 Jun 2025 10:20:41 +0800 Subject: [PATCH 10/58] debug Signed-off-by: Abatom --- .../kv_connector/v1/p2p/p2p_nccl_connector.py | 7 +++- .../kv_connector/v1/p2p/p2p_nccl_engine.py | 36 +++++++++++-------- vllm/v1/core/sched/scheduler.py | 4 +-- 3 files changed, 30 insertions(+), 17 deletions(-) diff --git a/vllm/distributed/kv_transfer/kv_connector/v1/p2p/p2p_nccl_connector.py b/vllm/distributed/kv_transfer/kv_connector/v1/p2p/p2p_nccl_connector.py index 4ca4e205bf64..3abfbe1dd753 100644 --- a/vllm/distributed/kv_transfer/kv_connector/v1/p2p/p2p_nccl_connector.py +++ 
b/vllm/distributed/kv_transfer/kv_connector/v1/p2p/p2p_nccl_connector.py @@ -191,6 +191,7 @@ def inject_kv_into_layer( # Load the KV for each request each layer for request in metadata.requests: + is_success = True for layer_name in forward_context.no_compile_layers: attn_layer = forward_context.no_compile_layers[layer_name] kv_cache_layer = attn_layer.kv_cache[ \ @@ -202,10 +203,14 @@ def inject_kv_into_layer( if kv_cache is None: logger.warning("🚧src_kv_cache is None, %s", request.request_id) - continue + is_success = False + break inject_kv_into_layer(kv_cache_layer, kv_cache, request.slot_mapping, request.request_id) + if is_success: + logger.info( + "🔵KV Cache is injected into layer, %s", request.request_id) def wait_for_layer_load(self, layer_name: str) -> None: """Blocking until the KV for a specific layer is loaded into vLLM's diff --git a/vllm/distributed/kv_transfer/kv_connector/v1/p2p/p2p_nccl_engine.py b/vllm/distributed/kv_transfer/kv_connector/v1/p2p/p2p_nccl_engine.py index 2d048c9b2b07..d10b4402b1b4 100644 --- a/vllm/distributed/kv_transfer/kv_connector/v1/p2p/p2p_nccl_engine.py +++ b/vllm/distributed/kv_transfer/kv_connector/v1/p2p/p2p_nccl_engine.py @@ -254,6 +254,8 @@ def recv_tensor( else: self.buffer_size -= (tensor.element_size() * tensor.numel()) + logger.info("🔵[PUT]Recv From %s, tensor_id:%s, rank:%d", + remote_address, tensor_id, self.rank) else: logger.warning("🔴[PUT]Recv From %s, tensor_id:%s, rank:%d", remote_address, tensor_id, self.rank) @@ -313,9 +315,16 @@ def _listen_for_requests(self): tensor = torch.empty(data["shape"], dtype=getattr(torch, data["dtype"]), device=self.device) + except torch.cuda.OutOfMemoryError: + self.router_socket.send_multipart([remote_address, b"1"]) + tensor = None + logger.warning( + "🔴[PUT]Recv Tensor, Out Of Memory, %s👈%s, " + "data:%s", self.zmq_address, remote, data) + else: self.router_socket.send_multipart([remote_address, b"0"]) comm, rank = self.comms[remote] - self._recv(comm, tensor, rank ^ 1, 
self.recv_stream) + self._recv(comm, tensor_id, tensor, rank ^ 1, self.recv_stream) tensor_size = tensor.element_size() * tensor.numel() if (self.buffer_size + tensor_size > self.buffer_size_threshold): @@ -329,18 +338,15 @@ def _listen_for_requests(self): else: self.buffer_size += tensor_size - except torch.cuda.OutOfMemoryError: - self.router_socket.send_multipart([remote_address, b"1"]) - tensor = None - logger.warning( - "🔴[PUT]Recv Tensor, Out Of Memory, %s👈%s, " - "data:%s", self.zmq_address, remote, data) - with self.recv_store_cv: self.recv_store[tensor_id] = tensor self._have_received_tensor_id(tensor_id) self.recv_store_cv.notify() + logger.info( + "🔵[PUT]Recv Tensor, %s👈%s, is_success:%s, data:%s", + self.zmq_address, remote, tensor is not None, data) + elif data["cmd"] == "GET": tensor_id = data["tensor_id"] with self.send_store_cv: @@ -465,16 +471,16 @@ def get_finished( if (num_layers == len( self.send_request_id_to_tensor_ids[request_id])): finished_sending.add(request_id) - for request_id in finished_sending: - self.send_request_id_to_tensor_ids.pop(request_id, None) + # for request_id in finished_sending: + # self.send_request_id_to_tensor_ids.pop(request_id, None) # Retrieve requests that have already received the KV cache. 
finished_recving: set[str] = set() for request_id in self.recv_request_id_to_tensor_ids: if num_layers == len( self.recv_request_id_to_tensor_ids[request_id]): finished_recving.add(request_id) - for request_id in finished_recving: - self.recv_request_id_to_tensor_ids.pop(request_id, None) + # for request_id in finished_recving: + # self.recv_request_id_to_tensor_ids.pop(request_id, None) # TODO: Add failed requests (e.g., transmission errors) return finished_sending or None, finished_recving or None @@ -493,7 +499,7 @@ def _ping(self): sock.send(msgpack.dumps(data)) time.sleep(3) - def _send(self, comm, tensor: torch.Tensor, dst: int, stream=None): + def _send(self, comm, tensor_id: str, tensor: torch.Tensor, dst: int, stream=None): assert tensor.device == self.device, ( f"this nccl communicator is created to work on {self.device}, " f"but the input tensor is on {tensor.device}") @@ -504,9 +510,10 @@ def _send(self, comm, tensor: torch.Tensor, dst: int, stream=None): self.nccl.ncclSend(buffer_type(tensor.data_ptr()), tensor.numel(), ncclDataTypeEnum.from_torch(tensor.dtype), dst, comm, cudaStream_t(stream.cuda_stream)) + logger.info("🔵_send, tensor_id:%s", tensor_id) stream.synchronize() - def _recv(self, comm, tensor: torch.Tensor, src: int, stream=None): + def _recv(self, comm, tensor_id: str, tensor: torch.Tensor, src: int, stream=None): assert tensor.device == self.device, ( f"this nccl communicator is created to work on {self.device}, " f"but the input tensor is on {tensor.device}") @@ -517,6 +524,7 @@ def _recv(self, comm, tensor: torch.Tensor, src: int, stream=None): self.nccl.ncclRecv(buffer_type(tensor.data_ptr()), tensor.numel(), ncclDataTypeEnum.from_torch(tensor.dtype), src, comm, cudaStream_t(stream.cuda_stream)) + logger.info("🔵_recv, tensor_id:%s", tensor_id) stream.synchronize() def close(self) -> None: diff --git a/vllm/v1/core/sched/scheduler.py b/vllm/v1/core/sched/scheduler.py index 00b0844a5660..21c157af0adb 100644 --- 
a/vllm/v1/core/sched/scheduler.py +++ b/vllm/v1/core/sched/scheduler.py @@ -1101,8 +1101,8 @@ def _update_from_kv_xfer_finished(self, """ # KV Connector:: update recv and send status from last step. for req_id in (model_runner_output.finished_recving or ()): - logger.debug("Finished recving KV transfer for request %s", req_id) + logger.info("Finished recving KV transfer for request %s", req_id) self.finished_recving_kv_req_ids.add(req_id) for req_id in (model_runner_output.finished_sending or ()): - logger.debug("Finished sending KV transfer for request %s", req_id) + logger.info("Finished sending KV transfer for request %s", req_id) self._free_blocks(self.requests[req_id]) From f6b1c68c871f2194a472c4270e3e646e5c9496c6 Mon Sep 17 00:00:00 2001 From: Abatom Date: Thu, 26 Jun 2025 10:25:11 +0800 Subject: [PATCH 11/58] bugfix Signed-off-by: Abatom --- .../kv_transfer/kv_connector/v1/p2p/p2p_nccl_engine.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/vllm/distributed/kv_transfer/kv_connector/v1/p2p/p2p_nccl_engine.py b/vllm/distributed/kv_transfer/kv_connector/v1/p2p/p2p_nccl_engine.py index d10b4402b1b4..466ee6cde2de 100644 --- a/vllm/distributed/kv_transfer/kv_connector/v1/p2p/p2p_nccl_engine.py +++ b/vllm/distributed/kv_transfer/kv_connector/v1/p2p/p2p_nccl_engine.py @@ -285,7 +285,7 @@ def recv_tensor( dtype=getattr(torch, data["dtype"]), device=self.device) - self._recv(comm, tensor, rank ^ 1, self.recv_stream) + self._recv(comm, tensor_id, tensor, rank ^ 1, self.recv_stream) return tensor @@ -368,7 +368,7 @@ def _listen_for_requests(self): if data["ret"] == 0: comm, rank = self.comms[remote] - self._send(comm, tensor.to(self.device), rank ^ 1, + self._send(comm, tensor_id, tensor.to(self.device), rank ^ 1, self.send_stream) else: logger.warning( @@ -426,7 +426,7 @@ def _send_sync( response.decode()) return False - self._send(comm, tensor, rank ^ 1, self.send_stream) + self._send(comm, tensor_id, tensor, rank ^ 1, self.send_stream) if 
self.send_type == "PUT_ASYNC": self._have_sent_tensor_id(tensor_id) From f8b0cfcbb186da7e76e9978cd26178037ea3db7b Mon Sep 17 00:00:00 2001 From: Abatom Date: Thu, 26 Jun 2025 10:32:32 +0800 Subject: [PATCH 12/58] bugfix Signed-off-by: Abatom --- .../kv_transfer/kv_connector/v1/p2p/p2p_nccl_engine.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/vllm/distributed/kv_transfer/kv_connector/v1/p2p/p2p_nccl_engine.py b/vllm/distributed/kv_transfer/kv_connector/v1/p2p/p2p_nccl_engine.py index 466ee6cde2de..3bacd8cfaf82 100644 --- a/vllm/distributed/kv_transfer/kv_connector/v1/p2p/p2p_nccl_engine.py +++ b/vllm/distributed/kv_transfer/kv_connector/v1/p2p/p2p_nccl_engine.py @@ -471,16 +471,16 @@ def get_finished( if (num_layers == len( self.send_request_id_to_tensor_ids[request_id])): finished_sending.add(request_id) - # for request_id in finished_sending: - # self.send_request_id_to_tensor_ids.pop(request_id, None) + for request_id in finished_sending: + self.send_request_id_to_tensor_ids.pop(request_id, None) # Retrieve requests that have already received the KV cache. 
finished_recving: set[str] = set() for request_id in self.recv_request_id_to_tensor_ids: if num_layers == len( self.recv_request_id_to_tensor_ids[request_id]): finished_recving.add(request_id) - # for request_id in finished_recving: - # self.recv_request_id_to_tensor_ids.pop(request_id, None) + for request_id in finished_recving: + self.recv_request_id_to_tensor_ids.pop(request_id, None) # TODO: Add failed requests (e.g., transmission errors) return finished_sending or None, finished_recving or None From 50bbc9b5847aad9b60df5f79bbbebc8f91abc93d Mon Sep 17 00:00:00 2001 From: Abatom Date: Thu, 26 Jun 2025 10:52:45 +0800 Subject: [PATCH 13/58] add log Signed-off-by: Abatom --- .../kv_transfer/kv_connector/v1/p2p/p2p_nccl_engine.py | 1 + 1 file changed, 1 insertion(+) diff --git a/vllm/distributed/kv_transfer/kv_connector/v1/p2p/p2p_nccl_engine.py b/vllm/distributed/kv_transfer/kv_connector/v1/p2p/p2p_nccl_engine.py index 3bacd8cfaf82..485e53036559 100644 --- a/vllm/distributed/kv_transfer/kv_connector/v1/p2p/p2p_nccl_engine.py +++ b/vllm/distributed/kv_transfer/kv_connector/v1/p2p/p2p_nccl_engine.py @@ -461,6 +461,7 @@ def get_finished( if isinstance(tensor, tuple): addr, _, _ = tensor self.pool.free(addr) + logger.info("🔵get_finished, request_id:%s", request_id) # TODO: 1)Avoid polling. 2)Validate chunked prefill and preemption. 
num_layers = len(forward_context.no_compile_layers) From 5bf5681ebb163d280990267481b36aca0c68f072 Mon Sep 17 00:00:00 2001 From: Abatom Date: Thu, 26 Jun 2025 11:13:16 +0800 Subject: [PATCH 14/58] recv_request_id_to_tensor_ids Signed-off-by: Abatom --- .../kv_transfer/kv_connector/v1/p2p/p2p_nccl_engine.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/vllm/distributed/kv_transfer/kv_connector/v1/p2p/p2p_nccl_engine.py b/vllm/distributed/kv_transfer/kv_connector/v1/p2p/p2p_nccl_engine.py index 485e53036559..28a8e75ec52e 100644 --- a/vllm/distributed/kv_transfer/kv_connector/v1/p2p/p2p_nccl_engine.py +++ b/vllm/distributed/kv_transfer/kv_connector/v1/p2p/p2p_nccl_engine.py @@ -480,8 +480,8 @@ def get_finished( if num_layers == len( self.recv_request_id_to_tensor_ids[request_id]): finished_recving.add(request_id) - for request_id in finished_recving: - self.recv_request_id_to_tensor_ids.pop(request_id, None) + # for request_id in finished_recving: + # self.recv_request_id_to_tensor_ids.pop(request_id, None) # TODO: Add failed requests (e.g., transmission errors) return finished_sending or None, finished_recving or None From c46ec518746ebf09ec448408b2df00771a1f8436 Mon Sep 17 00:00:00 2001 From: Abatom Date: Thu, 26 Jun 2025 12:15:35 +0800 Subject: [PATCH 15/58] recv_request_id_to_tensor_ids Signed-off-by: Abatom --- .../kv_transfer/kv_connector/v1/p2p/p2p_nccl_engine.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/vllm/distributed/kv_transfer/kv_connector/v1/p2p/p2p_nccl_engine.py b/vllm/distributed/kv_transfer/kv_connector/v1/p2p/p2p_nccl_engine.py index 28a8e75ec52e..485e53036559 100644 --- a/vllm/distributed/kv_transfer/kv_connector/v1/p2p/p2p_nccl_engine.py +++ b/vllm/distributed/kv_transfer/kv_connector/v1/p2p/p2p_nccl_engine.py @@ -480,8 +480,8 @@ def get_finished( if num_layers == len( self.recv_request_id_to_tensor_ids[request_id]): finished_recving.add(request_id) - # for request_id in finished_recving: - # 
self.recv_request_id_to_tensor_ids.pop(request_id, None) + for request_id in finished_recving: + self.recv_request_id_to_tensor_ids.pop(request_id, None) # TODO: Add failed requests (e.g., transmission errors) return finished_sending or None, finished_recving or None From 776a0586c448dd5457f30d8afb02d2bd74cb8878 Mon Sep 17 00:00:00 2001 From: Abatom Date: Thu, 26 Jun 2025 16:28:00 +0800 Subject: [PATCH 16/58] bugfix Signed-off-by: Abatom --- .../kv_connector/v1/p2p/p2p_nccl_engine.py | 34 ++++++++++++------- 1 file changed, 22 insertions(+), 12 deletions(-) diff --git a/vllm/distributed/kv_transfer/kv_connector/v1/p2p/p2p_nccl_engine.py b/vllm/distributed/kv_transfer/kv_connector/v1/p2p/p2p_nccl_engine.py index 485e53036559..3c5f9d2d727a 100644 --- a/vllm/distributed/kv_transfer/kv_connector/v1/p2p/p2p_nccl_engine.py +++ b/vllm/distributed/kv_transfer/kv_connector/v1/p2p/p2p_nccl_engine.py @@ -505,28 +505,38 @@ def _send(self, comm, tensor_id: str, tensor: torch.Tensor, dst: int, stream=Non f"this nccl communicator is created to work on {self.device}, " f"but the input tensor is on {tensor.device}") if stream is None: - stream = current_stream() + stream = torch.cuda.current_stream() + + event = torch.cuda.Event() with torch.cuda.stream(stream): - self.nccl.ncclSend(buffer_type(tensor.data_ptr()), tensor.numel(), - ncclDataTypeEnum.from_torch(tensor.dtype), dst, - comm, cudaStream_t(stream.cuda_stream)) - logger.info("🔵_send, tensor_id:%s", tensor_id) - stream.synchronize() + self.nccl.ncclSend( + buffer_type(tensor.data_ptr()), tensor.numel(), + ncclDataTypeEnum.from_torch(tensor.dtype), dst, + comm, cudaStream_t(stream.cuda_stream) + ) + event.record(stream) + + event.synchronize() def _recv(self, comm, tensor_id: str, tensor: torch.Tensor, src: int, stream=None): assert tensor.device == self.device, ( f"this nccl communicator is created to work on {self.device}, " f"but the input tensor is on {tensor.device}") if stream is None: - stream = current_stream() + 
stream = torch.cuda.current_stream() + + event = torch.cuda.Event() with torch.cuda.stream(stream): - self.nccl.ncclRecv(buffer_type(tensor.data_ptr()), tensor.numel(), - ncclDataTypeEnum.from_torch(tensor.dtype), src, - comm, cudaStream_t(stream.cuda_stream)) - logger.info("🔵_recv, tensor_id:%s", tensor_id) - stream.synchronize() + self.nccl.ncclRecv( + buffer_type(tensor.data_ptr()), tensor.numel(), + ncclDataTypeEnum.from_torch(tensor.dtype), src, + comm, cudaStream_t(stream.cuda_stream) + ) + event.record(stream) + + event.synchronize() def close(self) -> None: self._listener_thread.join() From d70614a3f65c37e21fd7b2406dcd2a5125995ab8 Mon Sep 17 00:00:00 2001 From: Abatom Date: Thu, 26 Jun 2025 19:31:27 +0800 Subject: [PATCH 17/58] event.synchronize() Signed-off-by: Abatom --- .../kv_connector/v1/p2p/p2p_nccl_engine.py | 28 ++++++------------- 1 file changed, 8 insertions(+), 20 deletions(-) diff --git a/vllm/distributed/kv_transfer/kv_connector/v1/p2p/p2p_nccl_engine.py b/vllm/distributed/kv_transfer/kv_connector/v1/p2p/p2p_nccl_engine.py index 3c5f9d2d727a..e4d1a7b92e4f 100644 --- a/vllm/distributed/kv_transfer/kv_connector/v1/p2p/p2p_nccl_engine.py +++ b/vllm/distributed/kv_transfer/kv_connector/v1/p2p/p2p_nccl_engine.py @@ -504,38 +504,26 @@ def _send(self, comm, tensor_id: str, tensor: torch.Tensor, dst: int, stream=Non assert tensor.device == self.device, ( f"this nccl communicator is created to work on {self.device}, " f"but the input tensor is on {tensor.device}") - if stream is None: - stream = torch.cuda.current_stream() - + stream = stream if stream is not None else torch.cuda.current_stream() event = torch.cuda.Event() - with torch.cuda.stream(stream): - self.nccl.ncclSend( - buffer_type(tensor.data_ptr()), tensor.numel(), - ncclDataTypeEnum.from_torch(tensor.dtype), dst, - comm, cudaStream_t(stream.cuda_stream) - ) + self.nccl.ncclSend(buffer_type(tensor.data_ptr()), tensor.numel(), + ncclDataTypeEnum.from_torch(tensor.dtype), dst, + comm, 
cudaStream_t(stream.cuda_stream)) event.record(stream) - event.synchronize() def _recv(self, comm, tensor_id: str, tensor: torch.Tensor, src: int, stream=None): assert tensor.device == self.device, ( f"this nccl communicator is created to work on {self.device}, " f"but the input tensor is on {tensor.device}") - if stream is None: - stream = torch.cuda.current_stream() - + stream = stream if stream is not None else torch.cuda.current_stream() event = torch.cuda.Event() - with torch.cuda.stream(stream): - self.nccl.ncclRecv( - buffer_type(tensor.data_ptr()), tensor.numel(), - ncclDataTypeEnum.from_torch(tensor.dtype), src, - comm, cudaStream_t(stream.cuda_stream) - ) + self.nccl.ncclRecv(buffer_type(tensor.data_ptr()), tensor.numel(), + ncclDataTypeEnum.from_torch(tensor.dtype), src, + comm, cudaStream_t(stream.cuda_stream)) event.record(stream) - event.synchronize() def close(self) -> None: From fae468db62b6accc2b710446d083e256b6f6849f Mon Sep 17 00:00:00 2001 From: Abatom Date: Thu, 26 Jun 2025 20:05:02 +0800 Subject: [PATCH 18/58] rm with torch.cuda.stream(stream) Signed-off-by: Abatom --- .../kv_connector/v1/p2p/p2p_nccl_engine.py | 18 ++++++++---------- 1 file changed, 8 insertions(+), 10 deletions(-) diff --git a/vllm/distributed/kv_transfer/kv_connector/v1/p2p/p2p_nccl_engine.py b/vllm/distributed/kv_transfer/kv_connector/v1/p2p/p2p_nccl_engine.py index e4d1a7b92e4f..467948009f12 100644 --- a/vllm/distributed/kv_transfer/kv_connector/v1/p2p/p2p_nccl_engine.py +++ b/vllm/distributed/kv_transfer/kv_connector/v1/p2p/p2p_nccl_engine.py @@ -506,11 +506,10 @@ def _send(self, comm, tensor_id: str, tensor: torch.Tensor, dst: int, stream=Non f"but the input tensor is on {tensor.device}") stream = stream if stream is not None else torch.cuda.current_stream() event = torch.cuda.Event() - with torch.cuda.stream(stream): - self.nccl.ncclSend(buffer_type(tensor.data_ptr()), tensor.numel(), - ncclDataTypeEnum.from_torch(tensor.dtype), dst, - comm, 
cudaStream_t(stream.cuda_stream)) - event.record(stream) + self.nccl.ncclSend(buffer_type(tensor.data_ptr()), tensor.numel(), + ncclDataTypeEnum.from_torch(tensor.dtype), dst, + comm, cudaStream_t(stream.cuda_stream)) + event.record(stream) event.synchronize() def _recv(self, comm, tensor_id: str, tensor: torch.Tensor, src: int, stream=None): @@ -519,11 +518,10 @@ def _recv(self, comm, tensor_id: str, tensor: torch.Tensor, src: int, stream=Non f"but the input tensor is on {tensor.device}") stream = stream if stream is not None else torch.cuda.current_stream() event = torch.cuda.Event() - with torch.cuda.stream(stream): - self.nccl.ncclRecv(buffer_type(tensor.data_ptr()), tensor.numel(), - ncclDataTypeEnum.from_torch(tensor.dtype), src, - comm, cudaStream_t(stream.cuda_stream)) - event.record(stream) + self.nccl.ncclRecv(buffer_type(tensor.data_ptr()), tensor.numel(), + ncclDataTypeEnum.from_torch(tensor.dtype), src, + comm, cudaStream_t(stream.cuda_stream)) + event.record(stream) event.synchronize() def close(self) -> None: From e28fa4019b5af1f646c1342ce91f5778213211d9 Mon Sep 17 00:00:00 2001 From: Abatom Date: Thu, 26 Jun 2025 20:14:39 +0800 Subject: [PATCH 19/58] log level Signed-off-by: Abatom --- vllm/v1/core/sched/scheduler.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/vllm/v1/core/sched/scheduler.py b/vllm/v1/core/sched/scheduler.py index 21c157af0adb..00b0844a5660 100644 --- a/vllm/v1/core/sched/scheduler.py +++ b/vllm/v1/core/sched/scheduler.py @@ -1101,8 +1101,8 @@ def _update_from_kv_xfer_finished(self, """ # KV Connector:: update recv and send status from last step. 
for req_id in (model_runner_output.finished_recving or ()): - logger.info("Finished recving KV transfer for request %s", req_id) + logger.debug("Finished recving KV transfer for request %s", req_id) self.finished_recving_kv_req_ids.add(req_id) for req_id in (model_runner_output.finished_sending or ()): - logger.info("Finished sending KV transfer for request %s", req_id) + logger.debug("Finished sending KV transfer for request %s", req_id) self._free_blocks(self.requests[req_id]) From fc16221fff25d5a20c515ccdc386e60d53033176 Mon Sep 17 00:00:00 2001 From: Abatom Date: Thu, 26 Jun 2025 20:16:45 +0800 Subject: [PATCH 20/58] to(self.device) Signed-off-by: Abatom --- .../kv_transfer/kv_connector/v1/p2p/p2p_nccl_engine.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/vllm/distributed/kv_transfer/kv_connector/v1/p2p/p2p_nccl_engine.py b/vllm/distributed/kv_transfer/kv_connector/v1/p2p/p2p_nccl_engine.py index 467948009f12..89850db006c9 100644 --- a/vllm/distributed/kv_transfer/kv_connector/v1/p2p/p2p_nccl_engine.py +++ b/vllm/distributed/kv_transfer/kv_connector/v1/p2p/p2p_nccl_engine.py @@ -426,7 +426,7 @@ def _send_sync( response.decode()) return False - self._send(comm, tensor_id, tensor, rank ^ 1, self.send_stream) + self._send(comm, tensor_id, tensor.to(self.device), rank ^ 1, self.send_stream) if self.send_type == "PUT_ASYNC": self._have_sent_tensor_id(tensor_id) From edca3944d907f87048f3ad323e115684f9c0dffc Mon Sep 17 00:00:00 2001 From: Abatom Date: Thu, 26 Jun 2025 20:41:09 +0800 Subject: [PATCH 21/58] current_stream() Signed-off-by: Abatom --- .../kv_transfer/kv_connector/v1/p2p/p2p_nccl_engine.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/vllm/distributed/kv_transfer/kv_connector/v1/p2p/p2p_nccl_engine.py b/vllm/distributed/kv_transfer/kv_connector/v1/p2p/p2p_nccl_engine.py index 89850db006c9..26050df13233 100644 --- a/vllm/distributed/kv_transfer/kv_connector/v1/p2p/p2p_nccl_engine.py +++ 
b/vllm/distributed/kv_transfer/kv_connector/v1/p2p/p2p_nccl_engine.py @@ -504,7 +504,7 @@ def _send(self, comm, tensor_id: str, tensor: torch.Tensor, dst: int, stream=Non assert tensor.device == self.device, ( f"this nccl communicator is created to work on {self.device}, " f"but the input tensor is on {tensor.device}") - stream = stream if stream is not None else torch.cuda.current_stream() + stream = stream if stream is not None else current_stream() event = torch.cuda.Event() self.nccl.ncclSend(buffer_type(tensor.data_ptr()), tensor.numel(), ncclDataTypeEnum.from_torch(tensor.dtype), dst, @@ -516,7 +516,7 @@ def _recv(self, comm, tensor_id: str, tensor: torch.Tensor, src: int, stream=Non assert tensor.device == self.device, ( f"this nccl communicator is created to work on {self.device}, " f"but the input tensor is on {tensor.device}") - stream = stream if stream is not None else torch.cuda.current_stream() + stream = stream if stream is not None else current_stream() event = torch.cuda.Event() self.nccl.ncclRecv(buffer_type(tensor.data_ptr()), tensor.numel(), ncclDataTypeEnum.from_torch(tensor.dtype), src, From 40f47429b7aaee630ade53ca6ab4a47df075007d Mon Sep 17 00:00:00 2001 From: Abatom Date: Thu, 26 Jun 2025 20:50:24 +0800 Subject: [PATCH 22/58] log level Signed-off-by: Abatom --- .../kv_transfer/kv_connector/v1/p2p/p2p_nccl_engine.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/vllm/distributed/kv_transfer/kv_connector/v1/p2p/p2p_nccl_engine.py b/vllm/distributed/kv_transfer/kv_connector/v1/p2p/p2p_nccl_engine.py index 26050df13233..0e97df2d4060 100644 --- a/vllm/distributed/kv_transfer/kv_connector/v1/p2p/p2p_nccl_engine.py +++ b/vllm/distributed/kv_transfer/kv_connector/v1/p2p/p2p_nccl_engine.py @@ -254,8 +254,8 @@ def recv_tensor( else: self.buffer_size -= (tensor.element_size() * tensor.numel()) - logger.info("🔵[PUT]Recv From %s, tensor_id:%s, rank:%d", - remote_address, tensor_id, self.rank) + logger.debug("🔵[PUT]Recv From %s, 
tensor_id:%s, rank:%d", + remote_address, tensor_id, self.rank) else: logger.warning("🔴[PUT]Recv From %s, tensor_id:%s, rank:%d", remote_address, tensor_id, self.rank) @@ -461,7 +461,7 @@ def get_finished( if isinstance(tensor, tuple): addr, _, _ = tensor self.pool.free(addr) - logger.info("🔵get_finished, request_id:%s", request_id) + logger.debug("🔵get_finished, request_id:%s", request_id) # TODO: 1)Avoid polling. 2)Validate chunked prefill and preemption. num_layers = len(forward_context.no_compile_layers) From 178ff2cfa2ed6d85e5efd38dbafa74d44fc7d7fe Mon Sep 17 00:00:00 2001 From: Abatom Date: Thu, 26 Jun 2025 20:59:59 +0800 Subject: [PATCH 23/58] log level Signed-off-by: Abatom --- .../kv_transfer/kv_connector/v1/p2p/p2p_nccl_engine.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/vllm/distributed/kv_transfer/kv_connector/v1/p2p/p2p_nccl_engine.py b/vllm/distributed/kv_transfer/kv_connector/v1/p2p/p2p_nccl_engine.py index 0e97df2d4060..25f168c68b6a 100644 --- a/vllm/distributed/kv_transfer/kv_connector/v1/p2p/p2p_nccl_engine.py +++ b/vllm/distributed/kv_transfer/kv_connector/v1/p2p/p2p_nccl_engine.py @@ -343,7 +343,7 @@ def _listen_for_requests(self): self._have_received_tensor_id(tensor_id) self.recv_store_cv.notify() - logger.info( + logger.debug( "🔵[PUT]Recv Tensor, %s👈%s, is_success:%s, data:%s", self.zmq_address, remote, tensor is not None, data) From 4999a423c9fc4f34c9bd21d687abcdf83951b5a3 Mon Sep 17 00:00:00 2001 From: Abatom Date: Fri, 27 Jun 2025 09:59:01 +0800 Subject: [PATCH 24/58] add nvtx Signed-off-by: Abatom --- .../kv_transfer/kv_connector/v1/p2p/p2p_nccl_engine.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/vllm/distributed/kv_transfer/kv_connector/v1/p2p/p2p_nccl_engine.py b/vllm/distributed/kv_transfer/kv_connector/v1/p2p/p2p_nccl_engine.py index 25f168c68b6a..2e70478090d1 100644 --- a/vllm/distributed/kv_transfer/kv_connector/v1/p2p/p2p_nccl_engine.py +++ 
b/vllm/distributed/kv_transfer/kv_connector/v1/p2p/p2p_nccl_engine.py @@ -10,6 +10,7 @@ from typing import TYPE_CHECKING, Any, Optional import msgpack +import nvtx import torch import zmq @@ -500,6 +501,7 @@ def _ping(self): sock.send(msgpack.dumps(data)) time.sleep(3) + @nvtx.annotate("P2pNcclEngine.send", color="red") def _send(self, comm, tensor_id: str, tensor: torch.Tensor, dst: int, stream=None): assert tensor.device == self.device, ( f"this nccl communicator is created to work on {self.device}, " @@ -512,6 +514,7 @@ def _send(self, comm, tensor_id: str, tensor: torch.Tensor, dst: int, stream=Non event.record(stream) event.synchronize() + @nvtx.annotate("P2pNcclEngine.recv", color="blue") def _recv(self, comm, tensor_id: str, tensor: torch.Tensor, src: int, stream=None): assert tensor.device == self.device, ( f"this nccl communicator is created to work on {self.device}, " From 9af341a3eb35b161327e21a4053897dda2c95ad6 Mon Sep 17 00:00:00 2001 From: Abatom Date: Fri, 27 Jun 2025 11:45:36 +0800 Subject: [PATCH 25/58] with torch.cuda.stream(stream) Signed-off-by: Abatom --- .../kv_transfer/kv_connector/v1/p2p/p2p_nccl_engine.py | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/vllm/distributed/kv_transfer/kv_connector/v1/p2p/p2p_nccl_engine.py b/vllm/distributed/kv_transfer/kv_connector/v1/p2p/p2p_nccl_engine.py index 2e70478090d1..b49a177c4ea7 100644 --- a/vllm/distributed/kv_transfer/kv_connector/v1/p2p/p2p_nccl_engine.py +++ b/vllm/distributed/kv_transfer/kv_connector/v1/p2p/p2p_nccl_engine.py @@ -521,10 +521,11 @@ def _recv(self, comm, tensor_id: str, tensor: torch.Tensor, src: int, stream=Non f"but the input tensor is on {tensor.device}") stream = stream if stream is not None else current_stream() event = torch.cuda.Event() - self.nccl.ncclRecv(buffer_type(tensor.data_ptr()), tensor.numel(), - ncclDataTypeEnum.from_torch(tensor.dtype), src, - comm, cudaStream_t(stream.cuda_stream)) - event.record(stream) + with 
torch.cuda.stream(stream): + self.nccl.ncclRecv(buffer_type(tensor.data_ptr()), tensor.numel(), + ncclDataTypeEnum.from_torch(tensor.dtype), src, + comm, cudaStream_t(stream.cuda_stream)) + event.record(stream) event.synchronize() def close(self) -> None: From 41b8ebaf21a16057792cac6041eeaaf5c187be4b Mon Sep 17 00:00:00 2001 From: Abatom Date: Sat, 28 Jun 2025 19:39:55 +0800 Subject: [PATCH 26/58] mod proxy port Signed-off-by: Abatom --- .../online_serving/disagg_xpyd/disagg_prefill_proxy_xpyd.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/examples/online_serving/disagg_xpyd/disagg_prefill_proxy_xpyd.py b/examples/online_serving/disagg_xpyd/disagg_prefill_proxy_xpyd.py index ad92805ec9aa..7f1b5e4f6f8f 100644 --- a/examples/online_serving/disagg_xpyd/disagg_prefill_proxy_xpyd.py +++ b/examples/online_serving/disagg_xpyd/disagg_prefill_proxy_xpyd.py @@ -180,6 +180,6 @@ async def handle_request(): if __name__ == "__main__": - t = start_service_discovery("0.0.0.0", 30001) - app.run(host="0.0.0.0", port=10001) + t = start_service_discovery("0.0.0.0", 30201) + app.run(host="0.0.0.0", port=10101) t.join() From 07d01cda90234015561083f7583bf253f2d50462 Mon Sep 17 00:00:00 2001 From: Abatom Date: Sat, 28 Jun 2025 19:53:17 +0800 Subject: [PATCH 27/58] del PUT_ASYNC Signed-off-by: Abatom --- .../kv_connector/v1/p2p/p2p_nccl_engine.py | 83 +++++++++---------- 1 file changed, 41 insertions(+), 42 deletions(-) diff --git a/vllm/distributed/kv_transfer/kv_connector/v1/p2p/p2p_nccl_engine.py b/vllm/distributed/kv_transfer/kv_connector/v1/p2p/p2p_nccl_engine.py index b49a177c4ea7..4c310c9bb137 100644 --- a/vllm/distributed/kv_transfer/kv_connector/v1/p2p/p2p_nccl_engine.py +++ b/vllm/distributed/kv_transfer/kv_connector/v1/p2p/p2p_nccl_engine.py @@ -7,6 +7,7 @@ import typing from collections import deque from contextlib import contextmanager +from enum import Enum from typing import TYPE_CHECKING, Any, Optional import msgpack @@ -118,20 +119,19 @@ def 
__init__(self, 1024**3) # GB # The sending type includes tree mutually exclusive options: - # PUT, GET, PUT_ASYNC. + # PUT, GET. self.send_type = self.config.get_from_extra_config("send_type", "PUT") if self.send_type == "GET": # tensor_id: torch.Tensor self.send_store: dict[str, torch.Tensor] = {} else: - # PUT or PUT_ASYNC + # PUT # tensor_id: torch.Tensor self.send_queue: deque[list[Any]] = deque() self.send_request_id_to_tensor_ids: dict[str, set[str]] = {} - if self.send_type == "PUT_ASYNC": - self._send_thread = threading.Thread(target=self._send_async, - daemon=True) - self._send_thread.start() + self._send_thread = threading.Thread(target=self._send_async, + daemon=True) + self._send_thread.start() # tensor_id: torch.Tensor/(addr, dtype, shape) self.recv_store: dict[str, Any] = {} @@ -200,37 +200,38 @@ def send_tensor( self.recv_store[tensor_id] = tensor self.recv_store_cv.notify() return True - else: - if self.send_type == "PUT": - return self._send_sync(tensor_id, tensor, remote_address) - elif self.send_type == "PUT_ASYNC": - with self.send_queue_cv: - self.send_queue.append([tensor_id, remote_address, tensor]) - self.send_queue_cv.notify() - else: # GET - with self.send_store_cv: - tensor_size = tensor.element_size() * tensor.numel() - while (self.buffer_size + tensor_size - > self.buffer_size_threshold): - oldest_tenser_id = next(iter(self.send_store)) - oldest_tenser = self.send_store.pop(oldest_tenser_id) - oldest_tenser_size = oldest_tenser.element_size( - ) * oldest_tenser.numel() - self.buffer_size -= oldest_tenser_size - logger.info( - "⛔[GET]Send to %s, tensor_id:%s, tensor_size:%d," - " buffer_size:%d, oldest_tenser_size:%d, rank:%d", - remote_address, tensor_id, tensor_size, - self.buffer_size, oldest_tenser_size, self.rank) - - self.send_store[tensor_id] = tensor - self.buffer_size += tensor_size - logger.debug( - "🔵[GET]Send to %s, tensor_id:%s, tensor_size:%d, " - "shape:%s, rank:%d, buffer_size:%d(%.2f%%)", - remote_address, tensor_id, 
tensor_size, tensor.shape, - self.rank, self.buffer_size, - self.buffer_size / self.buffer_size_threshold * 100) + + if self.send_type == "PUT": + with self.send_queue_cv: + self.send_queue.append([tensor_id, remote_address, tensor]) + self.send_queue_cv.notify() + return True + + # GET + with self.send_store_cv: + tensor_size = tensor.element_size() * tensor.numel() + while (self.buffer_size + tensor_size + > self.buffer_size_threshold): + oldest_tenser_id = next(iter(self.send_store)) + oldest_tenser = self.send_store.pop(oldest_tenser_id) + oldest_tenser_size = oldest_tenser.element_size( + ) * oldest_tenser.numel() + self.buffer_size -= oldest_tenser_size + logger.info( + "⛔[GET]Send to %s, tensor_id:%s, tensor_size:%d," + " buffer_size:%d, oldest_tenser_size:%d, rank:%d", + remote_address, tensor_id, tensor_size, + self.buffer_size, oldest_tenser_size, self.rank) + + self.send_store[tensor_id] = tensor + self.buffer_size += tensor_size + + logger.debug( + "🔵[GET]Send to %s, tensor_id:%s, tensor_size:%d, " + "shape:%s, rank:%d, buffer_size:%d(%.2f%%)", + remote_address, tensor_id, tensor_size, tensor.shape, + self.rank, self.buffer_size, + self.buffer_size / self.buffer_size_threshold * 100) return True @@ -239,7 +240,7 @@ def recv_tensor( tensor_id: str, remote_address: typing.Optional[str] = None, ) -> torch.Tensor: - if self.send_type == "PUT" or self.send_type == "PUT_ASYNC": + if self.send_type == "PUT": with self.recv_store_cv: if tensor_id not in self.recv_store: logger.warning( @@ -429,8 +430,7 @@ def _send_sync( self._send(comm, tensor_id, tensor.to(self.device), rank ^ 1, self.send_stream) - if self.send_type == "PUT_ASYNC": - self._have_sent_tensor_id(tensor_id) + self._have_sent_tensor_id(tensor_id) return True @@ -530,7 +530,6 @@ def _recv(self, comm, tensor_id: str, tensor: torch.Tensor, src: int, stream=Non def close(self) -> None: self._listener_thread.join() - if self.send_type == "PUT_ASYNC": - self._send_thread.join() + 
self._send_thread.join() if self._ping_thread is not None: self._ping_thread.join() From 82e5faca8a9a7cde8ef6c9592ff345f4f4288241 Mon Sep 17 00:00:00 2001 From: Abatom Date: Sat, 28 Jun 2025 20:12:14 +0800 Subject: [PATCH 28/58] _recv Signed-off-by: Abatom --- .../kv_connector/v1/p2p/p2p_nccl_engine.py | 34 +++++++------------ 1 file changed, 12 insertions(+), 22 deletions(-) diff --git a/vllm/distributed/kv_transfer/kv_connector/v1/p2p/p2p_nccl_engine.py b/vllm/distributed/kv_transfer/kv_connector/v1/p2p/p2p_nccl_engine.py index 4c310c9bb137..ce04518ac953 100644 --- a/vllm/distributed/kv_transfer/kv_connector/v1/p2p/p2p_nccl_engine.py +++ b/vllm/distributed/kv_transfer/kv_connector/v1/p2p/p2p_nccl_engine.py @@ -283,13 +283,7 @@ def recv_tensor( remote_address, tensor_id, data["ret"]) return None - tensor = torch.empty(data["shape"], - dtype=getattr(torch, data["dtype"]), - device=self.device) - - self._recv(comm, tensor_id, tensor, rank ^ 1, self.recv_stream) - - return tensor + return self._recv(comm, tensor_id, data["shape"], data["dtype"], rank ^ 1, self.recv_stream) def _listen_for_requests(self): while True: @@ -314,19 +308,9 @@ def _listen_for_requests(self): elif data["cmd"] == "PUT": tensor_id = data["tensor_id"] try: - tensor = torch.empty(data["shape"], - dtype=getattr(torch, data["dtype"]), - device=self.device) - except torch.cuda.OutOfMemoryError: - self.router_socket.send_multipart([remote_address, b"1"]) - tensor = None - logger.warning( - "🔴[PUT]Recv Tensor, Out Of Memory, %s👈%s, " - "data:%s", self.zmq_address, remote, data) - else: self.router_socket.send_multipart([remote_address, b"0"]) comm, rank = self.comms[remote] - self._recv(comm, tensor_id, tensor, rank ^ 1, self.recv_stream) + tensor = self._recv(comm, tensor_id, data["shape"], data["dtype"], rank ^ 1, self.recv_stream) tensor_size = tensor.element_size() * tensor.numel() if (self.buffer_size + tensor_size > self.buffer_size_threshold): @@ -339,6 +323,12 @@ def 
_listen_for_requests(self): remote, data, addr) else: self.buffer_size += tensor_size + except torch.cuda.OutOfMemoryError: + self.router_socket.send_multipart([remote_address, b"1"]) + tensor = None + logger.warning( + "🔴[PUT]Recv Tensor, Out Of Memory, %s👈%s, " + "data:%s", self.zmq_address, remote, data) with self.recv_store_cv: self.recv_store[tensor_id] = tensor @@ -515,18 +505,18 @@ def _send(self, comm, tensor_id: str, tensor: torch.Tensor, dst: int, stream=Non event.synchronize() @nvtx.annotate("P2pNcclEngine.recv", color="blue") - def _recv(self, comm, tensor_id: str, tensor: torch.Tensor, src: int, stream=None): - assert tensor.device == self.device, ( - f"this nccl communicator is created to work on {self.device}, " - f"but the input tensor is on {tensor.device}") + def _recv(self, comm, tensor_id: str, shape: str, dtype: str, src: int, stream=None): stream = stream if stream is not None else current_stream() event = torch.cuda.Event() with torch.cuda.stream(stream): + tensor = torch.empty(shape, dtype=getattr(torch, dtype), + device=self.device) self.nccl.ncclRecv(buffer_type(tensor.data_ptr()), tensor.numel(), ncclDataTypeEnum.from_torch(tensor.dtype), src, comm, cudaStream_t(stream.cuda_stream)) event.record(stream) event.synchronize() + return tensor def close(self) -> None: self._listener_thread.join() From 1e31f15b428bbba556914f154fde899d807a6425 Mon Sep 17 00:00:00 2001 From: Abatom Date: Sat, 28 Jun 2025 20:47:36 +0800 Subject: [PATCH 29/58] format Signed-off-by: Abatom --- .../kv_connector/v1/p2p/p2p_nccl_connector.py | 5 --- .../kv_connector/v1/p2p/p2p_nccl_engine.py | 42 ++++++++++++------- 2 files changed, 28 insertions(+), 19 deletions(-) diff --git a/vllm/distributed/kv_transfer/kv_connector/v1/p2p/p2p_nccl_connector.py b/vllm/distributed/kv_transfer/kv_connector/v1/p2p/p2p_nccl_connector.py index 3abfbe1dd753..a7e72090bbea 100644 --- a/vllm/distributed/kv_transfer/kv_connector/v1/p2p/p2p_nccl_connector.py +++ 
b/vllm/distributed/kv_transfer/kv_connector/v1/p2p/p2p_nccl_connector.py @@ -191,7 +191,6 @@ def inject_kv_into_layer( # Load the KV for each request each layer for request in metadata.requests: - is_success = True for layer_name in forward_context.no_compile_layers: attn_layer = forward_context.no_compile_layers[layer_name] kv_cache_layer = attn_layer.kv_cache[ \ @@ -203,14 +202,10 @@ def inject_kv_into_layer( if kv_cache is None: logger.warning("🚧src_kv_cache is None, %s", request.request_id) - is_success = False break inject_kv_into_layer(kv_cache_layer, kv_cache, request.slot_mapping, request.request_id) - if is_success: - logger.info( - "🔵KV Cache is injected into layer, %s", request.request_id) def wait_for_layer_load(self, layer_name: str) -> None: """Blocking until the KV for a specific layer is loaded into vLLM's diff --git a/vllm/distributed/kv_transfer/kv_connector/v1/p2p/p2p_nccl_engine.py b/vllm/distributed/kv_transfer/kv_connector/v1/p2p/p2p_nccl_engine.py index ce04518ac953..f8d2bff0c8bb 100644 --- a/vllm/distributed/kv_transfer/kv_connector/v1/p2p/p2p_nccl_engine.py +++ b/vllm/distributed/kv_transfer/kv_connector/v1/p2p/p2p_nccl_engine.py @@ -7,7 +7,6 @@ import typing from collections import deque from contextlib import contextmanager -from enum import Enum from typing import TYPE_CHECKING, Any, Optional import msgpack @@ -220,17 +219,16 @@ def send_tensor( logger.info( "⛔[GET]Send to %s, tensor_id:%s, tensor_size:%d," " buffer_size:%d, oldest_tenser_size:%d, rank:%d", - remote_address, tensor_id, tensor_size, - self.buffer_size, oldest_tenser_size, self.rank) + remote_address, tensor_id, tensor_size, self.buffer_size, + oldest_tenser_size, self.rank) self.send_store[tensor_id] = tensor self.buffer_size += tensor_size logger.debug( "🔵[GET]Send to %s, tensor_id:%s, tensor_size:%d, " - "shape:%s, rank:%d, buffer_size:%d(%.2f%%)", - remote_address, tensor_id, tensor_size, tensor.shape, - self.rank, self.buffer_size, + "shape:%s, rank:%d, 
buffer_size:%d(%.2f%%)", remote_address, + tensor_id, tensor_size, tensor.shape, self.rank, self.buffer_size, self.buffer_size / self.buffer_size_threshold * 100) return True @@ -283,7 +281,8 @@ def recv_tensor( remote_address, tensor_id, data["ret"]) return None - return self._recv(comm, tensor_id, data["shape"], data["dtype"], rank ^ 1, self.recv_stream) + return self._recv(comm, tensor_id, data["shape"], data["dtype"], + rank ^ 1, self.recv_stream) def _listen_for_requests(self): while True: @@ -310,7 +309,9 @@ def _listen_for_requests(self): try: self.router_socket.send_multipart([remote_address, b"0"]) comm, rank = self.comms[remote] - tensor = self._recv(comm, tensor_id, data["shape"], data["dtype"], rank ^ 1, self.recv_stream) + tensor = self._recv(comm, tensor_id, data["shape"], + data["dtype"], rank ^ 1, + self.recv_stream) tensor_size = tensor.element_size() * tensor.numel() if (self.buffer_size + tensor_size > self.buffer_size_threshold): @@ -360,8 +361,8 @@ def _listen_for_requests(self): if data["ret"] == 0: comm, rank = self.comms[remote] - self._send(comm, tensor_id, tensor.to(self.device), rank ^ 1, - self.send_stream) + self._send(comm, tensor_id, tensor.to(self.device), + rank ^ 1, self.send_stream) else: logger.warning( "🚧Unexpected, Received message from %s, data:%s", @@ -418,7 +419,8 @@ def _send_sync( response.decode()) return False - self._send(comm, tensor_id, tensor.to(self.device), rank ^ 1, self.send_stream) + self._send(comm, tensor_id, tensor.to(self.device), rank ^ 1, + self.send_stream) self._have_sent_tensor_id(tensor_id) @@ -492,7 +494,12 @@ def _ping(self): time.sleep(3) @nvtx.annotate("P2pNcclEngine.send", color="red") - def _send(self, comm, tensor_id: str, tensor: torch.Tensor, dst: int, stream=None): + def _send(self, + comm, + tensor_id: str, + tensor: torch.Tensor, + dst: int, + stream=None): assert tensor.device == self.device, ( f"this nccl communicator is created to work on {self.device}, " f"but the input tensor is on 
{tensor.device}") @@ -505,11 +512,18 @@ def _send(self, comm, tensor_id: str, tensor: torch.Tensor, dst: int, stream=Non event.synchronize() @nvtx.annotate("P2pNcclEngine.recv", color="blue") - def _recv(self, comm, tensor_id: str, shape: str, dtype: str, src: int, stream=None): + def _recv(self, + comm, + tensor_id: str, + shape: str, + dtype: str, + src: int, + stream=None): stream = stream if stream is not None else current_stream() event = torch.cuda.Event() with torch.cuda.stream(stream): - tensor = torch.empty(shape, dtype=getattr(torch, dtype), + tensor = torch.empty(shape, + dtype=getattr(torch, dtype), device=self.device) self.nccl.ncclRecv(buffer_type(tensor.data_ptr()), tensor.numel(), ncclDataTypeEnum.from_torch(tensor.dtype), src, From f2928721d57d34be4790e2a9724e227f1b9d249f Mon Sep 17 00:00:00 2001 From: Abatom Date: Sat, 28 Jun 2025 20:57:31 +0800 Subject: [PATCH 30/58] remove nvtx Signed-off-by: Abatom --- .../kv_transfer/kv_connector/v1/p2p/p2p_nccl_engine.py | 3 --- 1 file changed, 3 deletions(-) diff --git a/vllm/distributed/kv_transfer/kv_connector/v1/p2p/p2p_nccl_engine.py b/vllm/distributed/kv_transfer/kv_connector/v1/p2p/p2p_nccl_engine.py index f8d2bff0c8bb..4623c4c11d52 100644 --- a/vllm/distributed/kv_transfer/kv_connector/v1/p2p/p2p_nccl_engine.py +++ b/vllm/distributed/kv_transfer/kv_connector/v1/p2p/p2p_nccl_engine.py @@ -10,7 +10,6 @@ from typing import TYPE_CHECKING, Any, Optional import msgpack -import nvtx import torch import zmq @@ -493,7 +492,6 @@ def _ping(self): sock.send(msgpack.dumps(data)) time.sleep(3) - @nvtx.annotate("P2pNcclEngine.send", color="red") def _send(self, comm, tensor_id: str, @@ -511,7 +509,6 @@ def _send(self, event.record(stream) event.synchronize() - @nvtx.annotate("P2pNcclEngine.recv", color="blue") def _recv(self, comm, tensor_id: str, From f724f58a1633a7f5c74306f5d9c5e9f7c1816c57 Mon Sep 17 00:00:00 2001 From: Abatom Date: Sat, 28 Jun 2025 21:14:30 +0800 Subject: [PATCH 31/58] update md Signed-off-by: 
Abatom --- docs/design/v1/p2p_nccl_connector.md | 32 ++++++++++++++-------------- 1 file changed, 16 insertions(+), 16 deletions(-) diff --git a/docs/design/v1/p2p_nccl_connector.md b/docs/design/v1/p2p_nccl_connector.md index 32cdaacf058a..ce541f17672a 100644 --- a/docs/design/v1/p2p_nccl_connector.md +++ b/docs/design/v1/p2p_nccl_connector.md @@ -8,7 +8,7 @@ As shown in Figure 1, the overall process of this **PD disaggregation** solution 1. The client sends an HTTP request to the Proxy/Router's `/v1/completions` interface. 2. The Proxy/Router selects a **1P1D (1 Prefill instance + 1 Decode instance)** through either through round-robin or random selection, generates a `request_id` (rules to be introduced later), modifies the `max_tokens` in the HTTP request message to **1**, and then forwards the request to the **P instance**. 3. Immediately afterward, the Proxy/Router forwards the **original HTTP request** to the **D instance**. -4. The **P instance** performs **Prefill** and then **actively sends the generated KV cache** to the D instance (using **PUT_ASYNC** mode). The D instance's `zmq_addr` can be resolved through the `request_id`. +4. The **P instance** performs **Prefill** and then **actively sends the generated KV cache** to the D instance (using **PUT** mode). The D instance's `zmq_addr` can be resolved through the `request_id`. 5. The **D instance** has a **dedicated thread** for receiving the KV cache (to avoid blocking the main process). The received KV cache is saved into the **GPU memory buffer**, the size of which is determined by the vLLM startup parameter `kv_buffer_size`. When the GPU buffer is full, the KV cache is stored in the **local Tensor memory pool**. 6. During the **Decode**, the D instance's main process retrieves the KV cache (transmitted by the P instance) from either the **GPU buffer** or the **memory pool**, thereby **skipping Prefill**. 7. 
After completing **Decode**, the D instance returns the result to the **Proxy/Router**, which then forwards it to the **client**. @@ -31,9 +31,9 @@ Each P/D instance periodically sends a heartbeat packet to the Proxy/Router (cur ## KV Cache Transfer Methods -There are three methods for KVcache transfer: PUT, GET, and PUT_ASYNC. These methods can be specified using the `--kv-transfer-config` and `kv_connector_extra_config` parameters, specifically through the `send_type` field. Both PUT and PUT_ASYNC involve the P instance actively sending KVcache to the D instance. The difference is that PUT is a synchronous transfer method that blocks the main process, while PUT_ASYNC is an asynchronous transfer method. PUT_ASYNC uses a dedicated thread for sending KVcache, which means it does not block the main process. In contrast, the GET method involves the P instance saving the KVcache to the memory buffer after computing the prefill. The D instance then actively retrieves the computed KVcache from the P instance once it has allocated space for the KVcache. +There are two methods for KVcache transfer: PUT and GET. These methods can be specified using the `--kv-transfer-config` and `kv_connector_extra_config` parameters, specifically through the `send_type` field. PUT involves the P instance actively sending KVcache to the D instance. PUT is an asynchronous transfer method. PUT uses a dedicated thread for sending KVcache, which means it does not block the main process. In contrast, the GET method involves the P instance saving the KVcache to the memory buffer after computing the prefill. The D instance then actively retrieves the computed KVcache from the P instance once it has allocated space for the KVcache. -Experimental results have shown that the performance of these methods, from highest to lowest, is as follows: PUT_ASYNC → GET → PUT. +Experimental results have shown that the performance of these methods, from highest to lowest, is as follows: PUT → GET. 
## P2P Communication via ZMQ & NCCL @@ -53,7 +53,7 @@ Each NCCL group occupies a certain amount of GPU memory buffer for communication ## GPU Memory Buffer and Tensor Memory Pool -The trade-off in the size of the memory buffer is as follows: For P instances, the memory buffer is not required in PUT and PUT_ASYNC modes, but it is necessary in GET mode. For D instances, a memory buffer is needed in all three modes. The memory buffer for D instances should not be too large. Similarly, for P instances in GET mode, the memory buffer should also not be too large. The memory buffer of D instances is used to temporarily store KVcache sent by P instances. If it is too large, it will reduce the KVcache space available for normal inference by D instances, thereby decreasing the inference batch size and ultimately leading to a reduction in output throughput. The size of the memory buffer is configured by the parameter `kv_buffer_size`, measured in bytes, and is typically set to 5%~10% of the memory size. +The trade-off in the size of the memory buffer is as follows: For P instances, the memory buffer is not required in PUT mode, but it is necessary in GET mode. For D instances, a memory buffer is needed in both modes. The memory buffer for D instances should not be too large. Similarly, for P instances in GET mode, the memory buffer should also not be too large. The memory buffer of D instances is used to temporarily store KVcache sent by P instances. If it is too large, it will reduce the KVcache space available for normal inference by D instances, thereby decreasing the inference batch size and ultimately leading to a reduction in output throughput. The size of the memory buffer is configured by the parameter `kv_buffer_size`, measured in bytes, and is typically set to 5%~10% of the memory size. If the `--max-num-seqs` parameter for P instances is set to a large value, due to the large batch size, P instances will generate a large amount of KVcache simultaneously. 
This may exceed the capacity of the memory buffer of D instances, resulting in KVcache loss. Once KVcache is lost, D instances need to recompute Prefill, which is equivalent to performing Prefill twice. Consequently, the time-to-first-token (TTFT) will significantly increase, leading to degraded performance. @@ -88,9 +88,9 @@ To address the above issues, I have designed and developed a local Tensor memory - Pay attention to the setting of the `kv_buffer_size` (in bytes). The empirical value is 10% of the GPU memory size. This is related to the kvcache size. If it is too small, the GPU memory buffer for temporarily storing the received kvcache will overflow, causing the kvcache to be stored in the tensor memory pool, which increases latency. If it is too large, the kvcache available for inference will be reduced, leading to a smaller batch size and decreased throughput. - For Prefill instances, when using non-GET mode, the `kv_buffer_size` can be set to 1, as Prefill currently does not need to receive kvcache. However, when using GET mode, a larger `kv_buffer_size` is required because it needs to store the kvcache sent to the D instance. - You may need to modify the `kv_buffer_size` and `port` in the following commands (if there is a conflict). -- `PUT_ASYNC` offers the best performance and should be prioritized. +- `PUT` offers the best performance and should be prioritized. - The `--port` must be consistent with the `http_port` in the `--kv-transfer-config`. -- The `disagg_prefill_proxy_xpyd.py` script will use port 10001 (for receiving client requests) and port 30001 (for receiving service discovery from P and D instances). +- The `disagg_prefill_proxy_xpyd.py` script will use port 10101 (for receiving client requests) and port 30201 (for receiving service discovery from P and D instances). - The node running the proxy must have `quart` installed. - Supports multiple nodes; you just need to modify the `proxy_ip` and `proxy_port` in `--kv-transfer-config`. 
- In the following examples, it is assumed that **the proxy's IP is 10.0.1.1**. @@ -123,7 +123,7 @@ python3 disagg_prefill_proxy_xpyd.py & --gpu-memory-utilization 0.9 \ --disable-log-request \ --kv-transfer-config \ - '{"kv_connector":"P2pNcclConnector","kv_role":"kv_producer","kv_buffer_size":"1e1","kv_port":"21001","kv_connector_extra_config":{"proxy_ip":"10.0.1.1","proxy_port":"30001","http_port":"20005","send_type":"PUT_ASYNC","nccl_num_channels":"16"}}' > /var/vllm.log 2>&1 & + '{"kv_connector":"P2pNcclConnector","kv_role":"kv_producer","kv_buffer_size":"1e1","kv_port":"21001","kv_connector_extra_config":{"proxy_ip":"10.0.1.1","proxy_port":"30201","http_port":"20005","nccl_num_channels":"16"}}' > /var/vllm.log 2>&1 & ``` ### Decode1 (e.g. 10.0.1.3 or 10.0.1.1) @@ -145,7 +145,7 @@ python3 disagg_prefill_proxy_xpyd.py & --gpu-memory-utilization 0.7 \ --disable-log-request \ --kv-transfer-config \ - '{"kv_connector":"P2pNcclConnector","kv_role":"kv_consumer","kv_buffer_size":"8e9","kv_port":"22001","kv_connector_extra_config":{"proxy_ip":"10.0.1.1","proxy_port":"30001","http_port":"20009","send_type":"PUT_ASYNC","nccl_num_channels":"16"}}' > /var/vllm.log 2>&1 & + '{"kv_connector":"P2pNcclConnector","kv_role":"kv_consumer","kv_buffer_size":"8e9","kv_port":"22001","kv_connector_extra_config":{"proxy_ip":"10.0.1.1","proxy_port":"30201","http_port":"20009","nccl_num_channels":"16"}}' > /var/vllm.log 2>&1 & ``` ### Decode2 (e.g. 
10.0.1.4 or 10.0.1.1) @@ -167,7 +167,7 @@ python3 disagg_prefill_proxy_xpyd.py & --gpu-memory-utilization 0.7 \ --disable-log-request \ --kv-transfer-config \ - '{"kv_connector":"P2pNcclConnector","kv_role":"kv_consumer","kv_buffer_size":"8e9","kv_port":"23001","kv_connector_extra_config":{"proxy_ip":"10.0.1.1","proxy_port":"30001","http_port":"20003","send_type":"PUT_ASYNC","nccl_num_channels":"16"}}' > /var/vllm.log 2>&1 & + '{"kv_connector":"P2pNcclConnector","kv_role":"kv_consumer","kv_buffer_size":"8e9","kv_port":"23001","kv_connector_extra_config":{"proxy_ip":"10.0.1.1","proxy_port":"30201","http_port":"20003","nccl_num_channels":"16"}}' > /var/vllm.log 2>&1 & ``` ### Decode3 (e.g. 10.0.1.5 or 10.0.1.1) @@ -189,7 +189,7 @@ python3 disagg_prefill_proxy_xpyd.py & --gpu-memory-utilization 0.7 \ --disable-log-request \ --kv-transfer-config \ - '{"kv_connector":"P2pNcclConnector","kv_role":"kv_consumer","kv_buffer_size":"8e9","kv_port":"24001","kv_connector_extra_config":{"proxy_ip":"10.0.1.1","proxy_port":"30001","http_port":"20008","send_type":"PUT_ASYNC","nccl_num_channels":"16"}}' > /var/vllm.log 2>&1 & + '{"kv_connector":"P2pNcclConnector","kv_role":"kv_consumer","kv_buffer_size":"8e9","kv_port":"24001","kv_connector_extra_config":{"proxy_ip":"10.0.1.1","proxy_port":"30201","http_port":"20008","nccl_num_channels":"16"}}' > /var/vllm.log 2>&1 & ``` ## Run 3P1D @@ -220,7 +220,7 @@ python3 disagg_prefill_proxy_xpyd.py & --gpu-memory-utilization 0.9 \ --disable-log-request \ --kv-transfer-config \ - '{"kv_connector":"P2pNcclConnector","kv_role":"kv_producer","kv_buffer_size":"1e1","kv_port":"21001","kv_connector_extra_config":{"proxy_ip":"10.0.1.1","proxy_port":"30001","http_port":"20005","send_type":"PUT_ASYNC","nccl_num_channels":"16"}}' > /var/vllm.log 2>&1 & + 
'{"kv_connector":"P2pNcclConnector","kv_role":"kv_producer","kv_buffer_size":"1e1","kv_port":"21001","kv_connector_extra_config":{"proxy_ip":"10.0.1.1","proxy_port":"30201","http_port":"20005","nccl_num_channels":"16"}}' > /var/vllm.log 2>&1 & ``` ### Prefill2 (e.g. 10.0.1.3 or 10.0.1.1) @@ -242,7 +242,7 @@ python3 disagg_prefill_proxy_xpyd.py & --gpu-memory-utilization 0.9 \ --disable-log-request \ --kv-transfer-config \ - '{"kv_connector":"P2pNcclConnector","kv_role":"kv_producer","kv_buffer_size":"1e1","kv_port":"22001","kv_connector_extra_config":{"proxy_ip":"10.0.1.1","proxy_port":"30001","http_port":"20009","send_type":"PUT_ASYNC","nccl_num_channels":"16"}}' > /var/vllm.log 2>&1 & + '{"kv_connector":"P2pNcclConnector","kv_role":"kv_producer","kv_buffer_size":"1e1","kv_port":"22001","kv_connector_extra_config":{"proxy_ip":"10.0.1.1","proxy_port":"30201","http_port":"20009","nccl_num_channels":"16"}}' > /var/vllm.log 2>&1 & ``` ### Prefill3 (e.g. 10.0.1.4 or 10.0.1.1) @@ -264,7 +264,7 @@ python3 disagg_prefill_proxy_xpyd.py & --gpu-memory-utilization 0.9 \ --disable-log-request \ --kv-transfer-config \ - '{"kv_connector":"P2pNcclConnector","kv_role":"kv_producer","kv_buffer_size":"1e1","kv_port":"23001","kv_connector_extra_config":{"proxy_ip":"10.0.1.1","proxy_port":"30001","http_port":"20003","send_type":"PUT_ASYNC","nccl_num_channels":"16"}}' > /var/vllm.log 2>&1 & + '{"kv_connector":"P2pNcclConnector","kv_role":"kv_producer","kv_buffer_size":"1e1","kv_port":"23001","kv_connector_extra_config":{"proxy_ip":"10.0.1.1","proxy_port":"30201","http_port":"20003","nccl_num_channels":"16"}}' > /var/vllm.log 2>&1 & ``` ### Decode1 (e.g. 
10.0.1.5 or 10.0.1.1) @@ -286,13 +286,13 @@ python3 disagg_prefill_proxy_xpyd.py & --gpu-memory-utilization 0.7 \ --disable-log-request \ --kv-transfer-config \ - '{"kv_connector":"P2pNcclConnector","kv_role":"kv_consumer","kv_buffer_size":"8e9","kv_port":"24001","kv_connector_extra_config":{"proxy_ip":"10.0.1.1","proxy_port":"30001","http_port":"20008","send_type":"PUT_ASYNC","nccl_num_channels":"16"}}' > /var/vllm.log 2>&1 & + '{"kv_connector":"P2pNcclConnector","kv_role":"kv_consumer","kv_buffer_size":"8e9","kv_port":"24001","kv_connector_extra_config":{"proxy_ip":"10.0.1.1","proxy_port":"30201","http_port":"20008","nccl_num_channels":"16"}}' > /var/vllm.log 2>&1 & ``` # Single request ```shell -curl -X POST -s http://10.0.1.1:10001/v1/completions \ +curl -X POST -s http://10.0.1.1:10101/v1/completions \ -H "Content-Type: application/json" \ -d '{ "model": "base_model", @@ -313,7 +313,7 @@ curl -X POST -s http://10.0.1.1:10001/v1/completions \ --tokenizer meta-llama/Llama-3.1-8B-Instruct \ --dataset-name "random" \ --host 10.0.1.1 \ - --port 10001 \ + --port 10101 \ --random-input-len 1024 \ --random-output-len 1024 \ --ignore-eos \ From e5e585b052c087e8d49c4e9d92807f71f161c8cd Mon Sep 17 00:00:00 2001 From: Abatom Date: Mon, 30 Jun 2025 11:08:12 +0800 Subject: [PATCH 32/58] add log Signed-off-by: Abatom --- .../kv_transfer/kv_connector/v1/p2p/p2p_nccl_connector.py | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/vllm/distributed/kv_transfer/kv_connector/v1/p2p/p2p_nccl_connector.py b/vllm/distributed/kv_transfer/kv_connector/v1/p2p/p2p_nccl_connector.py index a7e72090bbea..60a8c8a7d17e 100644 --- a/vllm/distributed/kv_transfer/kv_connector/v1/p2p/p2p_nccl_connector.py +++ b/vllm/distributed/kv_transfer/kv_connector/v1/p2p/p2p_nccl_connector.py @@ -354,6 +354,9 @@ def build_connector_meta( # the request's prompt is chunked prefill if num_tokens < len(new_req.prompt_token_ids): # 'CachedRequestData' has no attribute 'prompt_token_ids' + logger.info( + 
"🚧%s is chunked prefill, num_token:%d, num_prompt:%d", + request_id, num_token, len(new_req.prompt_token_ids)) self.chunked_prefill[new_req.req_id] = ( new_req.block_ids[0], new_req.prompt_token_ids) continue @@ -384,6 +387,9 @@ def build_connector_meta( prompt_token_ids = self.chunked_prefill[cached_req.req_id][1] # the request's prompt is chunked prefill again if num_tokens < len(prompt_token_ids): + logger.info("🚧%s is chunked prefill again, num_token:%d, " + "num_prompt:%d", request_id, num_token, + len(prompt_token_ids)) self.chunked_prefill[cached_req.req_id] = ( block_ids, prompt_token_ids) continue @@ -404,6 +410,8 @@ def build_connector_meta( total_tokens = cached_req.num_computed_tokens + 1 token_ids = request.all_token_ids[:total_tokens] + logger.info("🚧%s is resumed from preemption, total_tokens:%d", + cached_req.req_id, total_tokens) # NOTE(rob): For resumed req, new_block_ids is all # of the block_ids for the request. block_ids = cached_req.new_block_ids[0] From 9ff003f8f8a619ecd6c78d035c2f81b42dfebcb0 Mon Sep 17 00:00:00 2001 From: Abatom Date: Mon, 30 Jun 2025 11:15:36 +0800 Subject: [PATCH 33/58] bugfix Signed-off-by: Abatom --- .../kv_transfer/kv_connector/v1/p2p/p2p_nccl_connector.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/vllm/distributed/kv_transfer/kv_connector/v1/p2p/p2p_nccl_connector.py b/vllm/distributed/kv_transfer/kv_connector/v1/p2p/p2p_nccl_connector.py index 60a8c8a7d17e..746b72187ece 100644 --- a/vllm/distributed/kv_transfer/kv_connector/v1/p2p/p2p_nccl_connector.py +++ b/vllm/distributed/kv_transfer/kv_connector/v1/p2p/p2p_nccl_connector.py @@ -356,7 +356,8 @@ def build_connector_meta( # 'CachedRequestData' has no attribute 'prompt_token_ids' logger.info( "🚧%s is chunked prefill, num_token:%d, num_prompt:%d", - request_id, num_token, len(new_req.prompt_token_ids)) + new_req.req_id, num_token, + len(new_req.prompt_token_ids)) self.chunked_prefill[new_req.req_id] = ( new_req.block_ids[0], 
new_req.prompt_token_ids) continue @@ -388,7 +389,7 @@ def build_connector_meta( # the request's prompt is chunked prefill again if num_tokens < len(prompt_token_ids): logger.info("🚧%s is chunked prefill again, num_token:%d, " - "num_prompt:%d", request_id, num_token, + "num_prompt:%d", cached_req.req_id, num_token, len(prompt_token_ids)) self.chunked_prefill[cached_req.req_id] = ( block_ids, prompt_token_ids) From be38ac8fd116b33162f0ee84c923db0d19895c09 Mon Sep 17 00:00:00 2001 From: Abatom Date: Mon, 30 Jun 2025 11:24:07 +0800 Subject: [PATCH 34/58] bugfix Signed-off-by: Abatom --- .../kv_transfer/kv_connector/v1/p2p/p2p_nccl_connector.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/vllm/distributed/kv_transfer/kv_connector/v1/p2p/p2p_nccl_connector.py b/vllm/distributed/kv_transfer/kv_connector/v1/p2p/p2p_nccl_connector.py index 746b72187ece..4a72be6855d5 100644 --- a/vllm/distributed/kv_transfer/kv_connector/v1/p2p/p2p_nccl_connector.py +++ b/vllm/distributed/kv_transfer/kv_connector/v1/p2p/p2p_nccl_connector.py @@ -355,8 +355,8 @@ def build_connector_meta( if num_tokens < len(new_req.prompt_token_ids): # 'CachedRequestData' has no attribute 'prompt_token_ids' logger.info( - "🚧%s is chunked prefill, num_token:%d, num_prompt:%d", - new_req.req_id, num_token, + "🚧%s is chunked prefill, num_tokens:%d, num_prompt:%d", + new_req.req_id, num_tokens, len(new_req.prompt_token_ids)) self.chunked_prefill[new_req.req_id] = ( new_req.block_ids[0], new_req.prompt_token_ids) @@ -388,8 +388,8 @@ def build_connector_meta( prompt_token_ids = self.chunked_prefill[cached_req.req_id][1] # the request's prompt is chunked prefill again if num_tokens < len(prompt_token_ids): - logger.info("🚧%s is chunked prefill again, num_token:%d, " - "num_prompt:%d", cached_req.req_id, num_token, + logger.info("🚧%s is chunked prefill again, num_tokens:%d, " + "num_prompt:%d", cached_req.req_id, num_tokens, len(prompt_token_ids)) self.chunked_prefill[cached_req.req_id] 
= ( block_ids, prompt_token_ids) From 3b51339d1635b19e873ce6e8ea0a88a5349b19a0 Mon Sep 17 00:00:00 2001 From: Abatom Date: Mon, 30 Jun 2025 12:05:41 +0800 Subject: [PATCH 35/58] add log Signed-off-by: Abatom --- .../kv_transfer/kv_connector/v1/p2p/p2p_nccl_engine.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/vllm/distributed/kv_transfer/kv_connector/v1/p2p/p2p_nccl_engine.py b/vllm/distributed/kv_transfer/kv_connector/v1/p2p/p2p_nccl_engine.py index 4623c4c11d52..db0003860f33 100644 --- a/vllm/distributed/kv_transfer/kv_connector/v1/p2p/p2p_nccl_engine.py +++ b/vllm/distributed/kv_transfer/kv_connector/v1/p2p/p2p_nccl_engine.py @@ -423,6 +423,9 @@ def _send_sync( self._have_sent_tensor_id(tensor_id) + logger.debug("🔵[PUT]Send Tensor, %s👉%s, data:%s", self.zmq_address, + remote_address, data) + return True def get_finished( From 479aa0dc296a936a7ec22c3963dbdf9af02ee95d Mon Sep 17 00:00:00 2001 From: Abatom Date: Mon, 30 Jun 2025 13:54:09 +0800 Subject: [PATCH 36/58] finished_recving Signed-off-by: Abatom --- .../kv_transfer/kv_connector/v1/p2p/p2p_nccl_engine.py | 2 -- 1 file changed, 2 deletions(-) diff --git a/vllm/distributed/kv_transfer/kv_connector/v1/p2p/p2p_nccl_engine.py b/vllm/distributed/kv_transfer/kv_connector/v1/p2p/p2p_nccl_engine.py index db0003860f33..0d99bc007b6e 100644 --- a/vllm/distributed/kv_transfer/kv_connector/v1/p2p/p2p_nccl_engine.py +++ b/vllm/distributed/kv_transfer/kv_connector/v1/p2p/p2p_nccl_engine.py @@ -475,8 +475,6 @@ def get_finished( if num_layers == len( self.recv_request_id_to_tensor_ids[request_id]): finished_recving.add(request_id) - for request_id in finished_recving: - self.recv_request_id_to_tensor_ids.pop(request_id, None) # TODO: Add failed requests (e.g., transmission errors) return finished_sending or None, finished_recving or None From 0b35d02f503c518c7ab0608063017e81bdec36b8 Mon Sep 17 00:00:00 2001 From: Abatom Date: Mon, 30 Jun 2025 14:11:18 +0800 Subject: [PATCH 37/58] format Signed-off-by: Abatom --- 
.../kv_transfer/kv_connector/v1/p2p/p2p_nccl_connector.py | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/vllm/distributed/kv_transfer/kv_connector/v1/p2p/p2p_nccl_connector.py b/vllm/distributed/kv_transfer/kv_connector/v1/p2p/p2p_nccl_connector.py index 4a72be6855d5..eee06419aaf2 100644 --- a/vllm/distributed/kv_transfer/kv_connector/v1/p2p/p2p_nccl_connector.py +++ b/vllm/distributed/kv_transfer/kv_connector/v1/p2p/p2p_nccl_connector.py @@ -388,9 +388,10 @@ def build_connector_meta( prompt_token_ids = self.chunked_prefill[cached_req.req_id][1] # the request's prompt is chunked prefill again if num_tokens < len(prompt_token_ids): - logger.info("🚧%s is chunked prefill again, num_tokens:%d, " - "num_prompt:%d", cached_req.req_id, num_tokens, - len(prompt_token_ids)) + logger.info( + "🚧%s is chunked prefill again, num_tokens:%d, " + "num_prompt:%d", cached_req.req_id, num_tokens, + len(prompt_token_ids)) self.chunked_prefill[cached_req.req_id] = ( block_ids, prompt_token_ids) continue From 349cb61d4b5c425472e52b8010aa91bc2478b5ef Mon Sep 17 00:00:00 2001 From: Abatom Date: Mon, 30 Jun 2025 14:14:51 +0800 Subject: [PATCH 38/58] update md Signed-off-by: Abatom --- docs/design/v1/p2p_nccl_connector.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/design/v1/p2p_nccl_connector.md b/docs/design/v1/p2p_nccl_connector.md index ce541f17672a..cbef2aea0784 100644 --- a/docs/design/v1/p2p_nccl_connector.md +++ b/docs/design/v1/p2p_nccl_connector.md @@ -68,7 +68,7 @@ To address the above issues, I have designed and developed a local Tensor memory cd /home # Download the installation package, and I will update the commit-id in time. You can directly copy the command. 
- wget https://vllm-wheels.s3.us-west-2.amazonaws.com/9112b443a042d8d815880b8780633882ad32b183/vllm-1.0.0.dev-cp38-abi3-manylinux1_x86_64.whl + wget https://vllm-wheels.s3.us-west-2.amazonaws.com/0d06b533a0fcca7a62603c868df68235659d6935/vllm-1.0.0.dev-cp38-abi3-manylinux1_x86_64.whl # Download the code repository. git clone -b xpyd-v1 https://github.com/Abatom/vllm.git From db2a1e5d8a45ba1b7d217e35ba93c2f495cd25c7 Mon Sep 17 00:00:00 2001 From: Abatom Date: Mon, 30 Jun 2025 20:14:52 +0800 Subject: [PATCH 39/58] finished_recving Signed-off-by: Abatom --- .../kv_connector/v1/p2p/p2p_nccl_connector.py | 2 ++ .../kv_connector/v1/p2p/p2p_nccl_engine.py | 26 ++++++++++++------- 2 files changed, 19 insertions(+), 9 deletions(-) diff --git a/vllm/distributed/kv_transfer/kv_connector/v1/p2p/p2p_nccl_connector.py b/vllm/distributed/kv_transfer/kv_connector/v1/p2p/p2p_nccl_connector.py index eee06419aaf2..90b9698a9a33 100644 --- a/vllm/distributed/kv_transfer/kv_connector/v1/p2p/p2p_nccl_connector.py +++ b/vllm/distributed/kv_transfer/kv_connector/v1/p2p/p2p_nccl_connector.py @@ -87,11 +87,13 @@ def __init__(self, vllm_config: "VllmConfig", role: KVConnectorRole): self._local_rank = get_world_group().local_rank \ if role == KVConnectorRole.WORKER else 0 + num_layers = len(vllm_config.compilation_config.static_forward_context) self.p2p_nccl_engine = P2pNcclEngine( local_rank=self._local_rank, config=self.config, hostname="", port_offset=self._rank, + num_layers=num_layers, ) if role == KVConnectorRole.WORKER else None # ============================== diff --git a/vllm/distributed/kv_transfer/kv_connector/v1/p2p/p2p_nccl_engine.py b/vllm/distributed/kv_transfer/kv_connector/v1/p2p/p2p_nccl_engine.py index 0d99bc007b6e..809999323407 100644 --- a/vllm/distributed/kv_transfer/kv_connector/v1/p2p/p2p_nccl_engine.py +++ b/vllm/distributed/kv_transfer/kv_connector/v1/p2p/p2p_nccl_engine.py @@ -65,10 +65,12 @@ def __init__(self, config: KVTransferConfig, hostname: str = "", 
port_offset: int = 0, + num_layers: int = 0, library_path: Optional[str] = None) -> None: self.config = config self.rank = port_offset self.local_rank = local_rank + self.num_layers = num_layers self.device = torch.device(f"cuda:{self.local_rank}") self.nccl = NCCLLibrary(library_path) @@ -153,12 +155,16 @@ def __init__(self, daemon=True) self._ping_thread.start() + self.finished_recving: set[str] = set() + self.finished_sending: set[str] = set() + logger.info( "💯P2pNcclEngine init, rank:%d, local_rank:%d, http_address:%s, " "zmq_address:%s, proxy_address:%s, send_type:%s, buffer_size_" - "threshold:%.2f, nccl_num_channels:%s", self.rank, self.local_rank, - self.http_address, self.zmq_address, self.proxy_address, - self.send_type, self.buffer_size_threshold, self.nccl_num_channels) + "threshold:%.2f, nccl_num_channels:%s, num_layers:%d", self.rank, + self.local_rank, self.http_address, self.zmq_address, + self.proxy_address, self.send_type, self.buffer_size_threshold, + self.nccl_num_channels, self.num_layers) def _create_connect(self, remote_address: typing.Optional[str] = None): assert remote_address is not None @@ -453,6 +459,7 @@ def get_finished( request_id, None) self.recv_request_id_to_tensor_ids.pop( request_id, None) + self.finished_recving.discard(request_id) if isinstance(tensor, tuple): addr, _, _ = tensor self.pool.free(addr) @@ -461,23 +468,24 @@ def get_finished( # TODO: 1)Avoid polling. 2)Validate chunked prefill and preemption. num_layers = len(forward_context.no_compile_layers) # Retrieve requests that have already sent the KV cache. 
- finished_sending: set[str] = set() + self.finished_sending.clear() if self.send_type != "GET": for request_id in self.send_request_id_to_tensor_ids: if (num_layers == len( self.send_request_id_to_tensor_ids[request_id])): - finished_sending.add(request_id) - for request_id in finished_sending: + self.finished_sending.add(request_id) + for request_id in self.finished_sending: self.send_request_id_to_tensor_ids.pop(request_id, None) # Retrieve requests that have already received the KV cache. - finished_recving: set[str] = set() for request_id in self.recv_request_id_to_tensor_ids: if num_layers == len( self.recv_request_id_to_tensor_ids[request_id]): - finished_recving.add(request_id) + self.finished_recving.add(request_id) + for request_id in finished_sending: + self.send_request_id_to_tensor_ids.pop(request_id, None) # TODO: Add failed requests (e.g., transmission errors) - return finished_sending or None, finished_recving or None + return self.finished_sending or None, self.finished_recving or None def _ping(self): sock = self.context.socket(zmq.DEALER) From 61c243aea27a19ef4940237bf51caa8e9274e4c5 Mon Sep 17 00:00:00 2001 From: Abatom Date: Mon, 30 Jun 2025 20:19:09 +0800 Subject: [PATCH 40/58] discard Signed-off-by: Abatom --- .../kv_transfer/kv_connector/v1/p2p/p2p_nccl_engine.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/vllm/distributed/kv_transfer/kv_connector/v1/p2p/p2p_nccl_engine.py b/vllm/distributed/kv_transfer/kv_connector/v1/p2p/p2p_nccl_engine.py index 809999323407..0e2b61971b37 100644 --- a/vllm/distributed/kv_transfer/kv_connector/v1/p2p/p2p_nccl_engine.py +++ b/vllm/distributed/kv_transfer/kv_connector/v1/p2p/p2p_nccl_engine.py @@ -459,10 +459,10 @@ def get_finished( request_id, None) self.recv_request_id_to_tensor_ids.pop( request_id, None) - self.finished_recving.discard(request_id) if isinstance(tensor, tuple): addr, _, _ = tensor self.pool.free(addr) + self.finished_recving.discard(request_id) 
logger.debug("🔵get_finished, request_id:%s", request_id) # TODO: 1)Avoid polling. 2)Validate chunked prefill and preemption. From 3a47595710f1c9e14238a536e6f821e8e644706d Mon Sep 17 00:00:00 2001 From: Abatom Date: Mon, 30 Jun 2025 20:23:13 +0800 Subject: [PATCH 41/58] bugfix Signed-off-by: Abatom --- .../kv_transfer/kv_connector/v1/p2p/p2p_nccl_engine.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/vllm/distributed/kv_transfer/kv_connector/v1/p2p/p2p_nccl_engine.py b/vllm/distributed/kv_transfer/kv_connector/v1/p2p/p2p_nccl_engine.py index 0e2b61971b37..b3b4007d67c0 100644 --- a/vllm/distributed/kv_transfer/kv_connector/v1/p2p/p2p_nccl_engine.py +++ b/vllm/distributed/kv_transfer/kv_connector/v1/p2p/p2p_nccl_engine.py @@ -481,8 +481,8 @@ def get_finished( if num_layers == len( self.recv_request_id_to_tensor_ids[request_id]): self.finished_recving.add(request_id) - for request_id in finished_sending: - self.send_request_id_to_tensor_ids.pop(request_id, None) + for request_id in self.finished_recving: + self.recv_request_id_to_tensor_ids.pop(request_id, None) # TODO: Add failed requests (e.g., transmission errors) return self.finished_sending or None, self.finished_recving or None From f4bba34c39b3770697f300989d7d3b3829917f4d Mon Sep 17 00:00:00 2001 From: Abatom Date: Mon, 30 Jun 2025 20:48:44 +0800 Subject: [PATCH 42/58] finished_recving_kv_req_ids Signed-off-by: Abatom --- .../kv_transfer/kv_connector/v1/p2p/p2p_nccl_connector.py | 2 ++ .../kv_transfer/kv_connector/v1/p2p/p2p_nccl_engine.py | 2 +- vllm/v1/core/sched/scheduler.py | 3 +-- 3 files changed, 4 insertions(+), 3 deletions(-) diff --git a/vllm/distributed/kv_transfer/kv_connector/v1/p2p/p2p_nccl_connector.py b/vllm/distributed/kv_transfer/kv_connector/v1/p2p/p2p_nccl_connector.py index 90b9698a9a33..c2e34b29808f 100644 --- a/vllm/distributed/kv_transfer/kv_connector/v1/p2p/p2p_nccl_connector.py +++ b/vllm/distributed/kv_transfer/kv_connector/v1/p2p/p2p_nccl_connector.py @@ -88,6 
+88,8 @@ def __init__(self, vllm_config: "VllmConfig", role: KVConnectorRole): if role == KVConnectorRole.WORKER else 0 num_layers = len(vllm_config.compilation_config.static_forward_context) + + logger.ifno("🚧compilation_config:%s", vllm_config.compilation_config) self.p2p_nccl_engine = P2pNcclEngine( local_rank=self._local_rank, config=self.config, diff --git a/vllm/distributed/kv_transfer/kv_connector/v1/p2p/p2p_nccl_engine.py b/vllm/distributed/kv_transfer/kv_connector/v1/p2p/p2p_nccl_engine.py index b3b4007d67c0..5b20e296e6d6 100644 --- a/vllm/distributed/kv_transfer/kv_connector/v1/p2p/p2p_nccl_engine.py +++ b/vllm/distributed/kv_transfer/kv_connector/v1/p2p/p2p_nccl_engine.py @@ -462,7 +462,6 @@ def get_finished( if isinstance(tensor, tuple): addr, _, _ = tensor self.pool.free(addr) - self.finished_recving.discard(request_id) logger.debug("🔵get_finished, request_id:%s", request_id) # TODO: 1)Avoid polling. 2)Validate chunked prefill and preemption. @@ -477,6 +476,7 @@ def get_finished( for request_id in self.finished_sending: self.send_request_id_to_tensor_ids.pop(request_id, None) # Retrieve requests that have already received the KV cache. + self.finished_recving.clear() for request_id in self.recv_request_id_to_tensor_ids: if num_layers == len( self.recv_request_id_to_tensor_ids[request_id]): diff --git a/vllm/v1/core/sched/scheduler.py b/vllm/v1/core/sched/scheduler.py index 00b0844a5660..bbc246602db6 100644 --- a/vllm/v1/core/sched/scheduler.py +++ b/vllm/v1/core/sched/scheduler.py @@ -942,6 +942,7 @@ def finish_requests( # First pass: collect requests to remove from queues for req_id in request_ids: + self.finished_recving_kv_req_ids.discard(req_id) request = self.requests.get(req_id) if request is None: # Invalid request ID. @@ -1084,8 +1085,6 @@ def _update_waiting_for_remote_kv(self, request: Request) -> bool: # Update the request state for scheduling. request.num_computed_tokens = num_computed_tokens - # Return that we are ready. 
- self.finished_recving_kv_req_ids.remove(request.request_id) return True def _update_from_kv_xfer_finished(self, From af0d84b11f732870e3bf61ec99139ce140582ec2 Mon Sep 17 00:00:00 2001 From: Abatom Date: Mon, 30 Jun 2025 20:50:09 +0800 Subject: [PATCH 43/58] bugfix Signed-off-by: Abatom --- .../kv_transfer/kv_connector/v1/p2p/p2p_nccl_connector.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/vllm/distributed/kv_transfer/kv_connector/v1/p2p/p2p_nccl_connector.py b/vllm/distributed/kv_transfer/kv_connector/v1/p2p/p2p_nccl_connector.py index c2e34b29808f..87c81c04795b 100644 --- a/vllm/distributed/kv_transfer/kv_connector/v1/p2p/p2p_nccl_connector.py +++ b/vllm/distributed/kv_transfer/kv_connector/v1/p2p/p2p_nccl_connector.py @@ -89,7 +89,7 @@ def __init__(self, vllm_config: "VllmConfig", role: KVConnectorRole): num_layers = len(vllm_config.compilation_config.static_forward_context) - logger.ifno("🚧compilation_config:%s", vllm_config.compilation_config) + logger.info("🚧compilation_config:%s", vllm_config.compilation_config) self.p2p_nccl_engine = P2pNcclEngine( local_rank=self._local_rank, config=self.config, From 7fa1f1125ed684f241483f52a67b52e5e0c30f0f Mon Sep 17 00:00:00 2001 From: Abatom Date: Mon, 30 Jun 2025 21:17:11 +0800 Subject: [PATCH 44/58] rm num_layers Signed-off-by: Abatom --- .../kv_transfer/kv_connector/v1/p2p/p2p_nccl_connector.py | 4 ---- .../kv_transfer/kv_connector/v1/p2p/p2p_nccl_engine.py | 2 -- 2 files changed, 6 deletions(-) diff --git a/vllm/distributed/kv_transfer/kv_connector/v1/p2p/p2p_nccl_connector.py b/vllm/distributed/kv_transfer/kv_connector/v1/p2p/p2p_nccl_connector.py index 87c81c04795b..eee06419aaf2 100644 --- a/vllm/distributed/kv_transfer/kv_connector/v1/p2p/p2p_nccl_connector.py +++ b/vllm/distributed/kv_transfer/kv_connector/v1/p2p/p2p_nccl_connector.py @@ -87,15 +87,11 @@ def __init__(self, vllm_config: "VllmConfig", role: KVConnectorRole): self._local_rank = get_world_group().local_rank \ if role == 
KVConnectorRole.WORKER else 0 - num_layers = len(vllm_config.compilation_config.static_forward_context) - - logger.info("🚧compilation_config:%s", vllm_config.compilation_config) self.p2p_nccl_engine = P2pNcclEngine( local_rank=self._local_rank, config=self.config, hostname="", port_offset=self._rank, - num_layers=num_layers, ) if role == KVConnectorRole.WORKER else None # ============================== diff --git a/vllm/distributed/kv_transfer/kv_connector/v1/p2p/p2p_nccl_engine.py b/vllm/distributed/kv_transfer/kv_connector/v1/p2p/p2p_nccl_engine.py index 5b20e296e6d6..98f386a31e7c 100644 --- a/vllm/distributed/kv_transfer/kv_connector/v1/p2p/p2p_nccl_engine.py +++ b/vllm/distributed/kv_transfer/kv_connector/v1/p2p/p2p_nccl_engine.py @@ -65,12 +65,10 @@ def __init__(self, config: KVTransferConfig, hostname: str = "", port_offset: int = 0, - num_layers: int = 0, library_path: Optional[str] = None) -> None: self.config = config self.rank = port_offset self.local_rank = local_rank - self.num_layers = num_layers self.device = torch.device(f"cuda:{self.local_rank}") self.nccl = NCCLLibrary(library_path) From c200e582d130e5f1ca562f887ca8147c824f332b Mon Sep 17 00:00:00 2001 From: Abatom Date: Mon, 30 Jun 2025 22:55:50 +0800 Subject: [PATCH 45/58] format Signed-off-by: Abatom --- .../kv_transfer/kv_connector/v1/p2p/p2p_nccl_engine.py | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/vllm/distributed/kv_transfer/kv_connector/v1/p2p/p2p_nccl_engine.py b/vllm/distributed/kv_transfer/kv_connector/v1/p2p/p2p_nccl_engine.py index 98f386a31e7c..4f59daf9d8e2 100644 --- a/vllm/distributed/kv_transfer/kv_connector/v1/p2p/p2p_nccl_engine.py +++ b/vllm/distributed/kv_transfer/kv_connector/v1/p2p/p2p_nccl_engine.py @@ -159,10 +159,9 @@ def __init__(self, logger.info( "💯P2pNcclEngine init, rank:%d, local_rank:%d, http_address:%s, " "zmq_address:%s, proxy_address:%s, send_type:%s, buffer_size_" - "threshold:%.2f, nccl_num_channels:%s, num_layers:%d", self.rank, - 
self.local_rank, self.http_address, self.zmq_address, - self.proxy_address, self.send_type, self.buffer_size_threshold, - self.nccl_num_channels, self.num_layers) + "threshold:%.2f, nccl_num_channels:%s", self.rank, self.local_rank, + self.http_address, self.zmq_address, self.proxy_address, + self.send_type, self.buffer_size_threshold, self.nccl_num_channels) def _create_connect(self, remote_address: typing.Optional[str] = None): assert remote_address is not None From 100026d918fc937b979b63125e2896acf631c1ce Mon Sep 17 00:00:00 2001 From: Abatom Date: Tue, 1 Jul 2025 12:14:42 +0800 Subject: [PATCH 46/58] self.num_layers Signed-off-by: Abatom --- .../kv_connector/v1/p2p/p2p_nccl_connector.py | 2 +- .../kv_connector/v1/p2p/p2p_nccl_engine.py | 40 +++++++++---------- 2 files changed, 21 insertions(+), 21 deletions(-) diff --git a/vllm/distributed/kv_transfer/kv_connector/v1/p2p/p2p_nccl_connector.py b/vllm/distributed/kv_transfer/kv_connector/v1/p2p/p2p_nccl_connector.py index eee06419aaf2..c897a63501e9 100644 --- a/vllm/distributed/kv_transfer/kv_connector/v1/p2p/p2p_nccl_connector.py +++ b/vllm/distributed/kv_transfer/kv_connector/v1/p2p/p2p_nccl_connector.py @@ -89,7 +89,7 @@ def __init__(self, vllm_config: "VllmConfig", role: KVConnectorRole): self.p2p_nccl_engine = P2pNcclEngine( local_rank=self._local_rank, - config=self.config, + vllm_config=vllm_config, hostname="", port_offset=self._rank, ) if role == KVConnectorRole.WORKER else None diff --git a/vllm/distributed/kv_transfer/kv_connector/v1/p2p/p2p_nccl_engine.py b/vllm/distributed/kv_transfer/kv_connector/v1/p2p/p2p_nccl_engine.py index 4f59daf9d8e2..dc3d0181c1be 100644 --- a/vllm/distributed/kv_transfer/kv_connector/v1/p2p/p2p_nccl_engine.py +++ b/vllm/distributed/kv_transfer/kv_connector/v1/p2p/p2p_nccl_engine.py @@ -62,11 +62,12 @@ class P2pNcclEngine: def __init__(self, local_rank: int, - config: KVTransferConfig, + vllm_config: "VllmConfig", hostname: str = "", port_offset: int = 0, library_path: 
Optional[str] = None) -> None: - self.config = config + self.config = vllm_config.kv_transfer_config + self.compilation_config = vllm_config.compilation_config self.rank = port_offset self.local_rank = local_rank self.device = torch.device(f"cuda:{self.local_rank}") @@ -153,6 +154,7 @@ def __init__(self, daemon=True) self._ping_thread.start() + self.num_layers = 0 self.finished_recving: set[str] = set() self.finished_sending: set[str] = set() @@ -370,17 +372,29 @@ def _listen_for_requests(self): "🚧Unexpected, Received message from %s, data:%s", remote_address, data) + def get_num_layers(self): + if self.num_layers == 0: + self.num_layers = len(self.compilation_config.static_forward_context) + logger.debug("get_num_layers, num_layers:%d", self.num_layers) + return self.num_layers + def _have_sent_tensor_id(self, tensor_id: str): request_id = tensor_id.split('#')[0] if request_id not in self.send_request_id_to_tensor_ids: self.send_request_id_to_tensor_ids[request_id] = set() self.send_request_id_to_tensor_ids[request_id].add(tensor_id) + if self.get_num_layers() == len( + self.send_request_id_to_tensor_ids[request_id]): + self.finished_sending.add(request_id) def _have_received_tensor_id(self, tensor_id: str): request_id = tensor_id.split('#')[0] if request_id not in self.recv_request_id_to_tensor_ids: self.recv_request_id_to_tensor_ids[request_id] = set() self.recv_request_id_to_tensor_ids[request_id].add(tensor_id) + if self.get_num_layers() == len( + self.recv_request_id_to_tensor_ids[request_id]): + self.finished_recving.add(request_id) def _send_async(self): while True: @@ -461,28 +475,14 @@ def get_finished( self.pool.free(addr) logger.debug("🔵get_finished, request_id:%s", request_id) - # TODO: 1)Avoid polling. 2)Validate chunked prefill and preemption. - num_layers = len(forward_context.no_compile_layers) # Retrieve requests that have already sent the KV cache. 
- self.finished_sending.clear() - if self.send_type != "GET": - for request_id in self.send_request_id_to_tensor_ids: - if (num_layers == len( - self.send_request_id_to_tensor_ids[request_id])): - self.finished_sending.add(request_id) - for request_id in self.finished_sending: - self.send_request_id_to_tensor_ids.pop(request_id, None) + finished_sending = self.finished_sending.copy() # Retrieve requests that have already received the KV cache. + finished_recving = self.finished_recving.copy() + self.finished_sending.clear() self.finished_recving.clear() - for request_id in self.recv_request_id_to_tensor_ids: - if num_layers == len( - self.recv_request_id_to_tensor_ids[request_id]): - self.finished_recving.add(request_id) - for request_id in self.finished_recving: - self.recv_request_id_to_tensor_ids.pop(request_id, None) - # TODO: Add failed requests (e.g., transmission errors) - return self.finished_sending or None, self.finished_recving or None + return finished_sending or None, finished_recving or None def _ping(self): sock = self.context.socket(zmq.DEALER) From f80dad5de5ac5c9740479b111e34a58203132581 Mon Sep 17 00:00:00 2001 From: Abatom Date: Tue, 1 Jul 2025 12:23:22 +0800 Subject: [PATCH 47/58] rm _ Signed-off-by: Abatom --- .../kv_connector/v1/p2p/p2p_nccl_engine.py | 46 +++++++++---------- 1 file changed, 23 insertions(+), 23 deletions(-) diff --git a/vllm/distributed/kv_transfer/kv_connector/v1/p2p/p2p_nccl_engine.py b/vllm/distributed/kv_transfer/kv_connector/v1/p2p/p2p_nccl_engine.py index dc3d0181c1be..da8f5088d124 100644 --- a/vllm/distributed/kv_transfer/kv_connector/v1/p2p/p2p_nccl_engine.py +++ b/vllm/distributed/kv_transfer/kv_connector/v1/p2p/p2p_nccl_engine.py @@ -128,9 +128,9 @@ def __init__(self, # tensor_id: torch.Tensor self.send_queue: deque[list[Any]] = deque() self.send_request_id_to_tensor_ids: dict[str, set[str]] = {} - self._send_thread = threading.Thread(target=self._send_async, + self.send_thread = 
threading.Thread(target=self.send_async, daemon=True) - self._send_thread.start() + self.send_thread.start() # tensor_id: torch.Tensor/(addr, dtype, shape) self.recv_store: dict[str, Any] = {} @@ -145,12 +145,12 @@ def __init__(self, "nccl_num_channels", "8") self._listener_thread = threading.Thread( - target=self._listen_for_requests, daemon=True) + target=self.listen_for_requests, daemon=True) self._listener_thread.start() self._ping_thread = None if port_offset == 0 and self.proxy_address != "": - self._ping_thread = threading.Thread(target=self._ping, + self._ping_thread = threading.Thread(target=self.ping, daemon=True) self._ping_thread.start() @@ -165,7 +165,7 @@ def __init__(self, self.http_address, self.zmq_address, self.proxy_address, self.send_type, self.buffer_size_threshold, self.nccl_num_channels) - def _create_connect(self, remote_address: typing.Optional[str] = None): + def create_connect(self, remote_address: typing.Optional[str] = None): assert remote_address is not None if remote_address not in self.socks: sock = self.context.socket(zmq.DEALER) @@ -270,7 +270,7 @@ def recv_tensor( return None if remote_address not in self.socks: - self._create_connect(remote_address) + self.create_connect(remote_address) sock = self.socks[remote_address] comm, rank = self.comms[remote_address] @@ -285,10 +285,10 @@ def recv_tensor( remote_address, tensor_id, data["ret"]) return None - return self._recv(comm, tensor_id, data["shape"], data["dtype"], + return self.recv(comm, tensor_id, data["shape"], data["dtype"], rank ^ 1, self.recv_stream) - def _listen_for_requests(self): + def listen_for_requests(self): while True: socks = dict(self.poller.poll()) if self.router_socket not in socks: @@ -313,7 +313,7 @@ def _listen_for_requests(self): try: self.router_socket.send_multipart([remote_address, b"0"]) comm, rank = self.comms[remote] - tensor = self._recv(comm, tensor_id, data["shape"], + tensor = self.recv(comm, tensor_id, data["shape"], data["dtype"], rank ^ 1, 
self.recv_stream) tensor_size = tensor.element_size() * tensor.numel() @@ -337,7 +337,7 @@ def _listen_for_requests(self): with self.recv_store_cv: self.recv_store[tensor_id] = tensor - self._have_received_tensor_id(tensor_id) + self.have_received_tensor_id(tensor_id) self.recv_store_cv.notify() logger.debug( @@ -356,7 +356,7 @@ def _listen_for_requests(self): } # LRU self.send_store[tensor_id] = tensor - self._have_sent_tensor_id(tensor_id) + self.have_sent_tensor_id(tensor_id) else: data = {"ret": 1} @@ -365,7 +365,7 @@ def _listen_for_requests(self): if data["ret"] == 0: comm, rank = self.comms[remote] - self._send(comm, tensor_id, tensor.to(self.device), + self.send(comm, tensor_id, tensor.to(self.device), rank ^ 1, self.send_stream) else: logger.warning( @@ -378,7 +378,7 @@ def get_num_layers(self): logger.debug("get_num_layers, num_layers:%d", self.num_layers) return self.num_layers - def _have_sent_tensor_id(self, tensor_id: str): + def have_sent_tensor_id(self, tensor_id: str): request_id = tensor_id.split('#')[0] if request_id not in self.send_request_id_to_tensor_ids: self.send_request_id_to_tensor_ids[request_id] = set() @@ -387,7 +387,7 @@ def _have_sent_tensor_id(self, tensor_id: str): self.send_request_id_to_tensor_ids[request_id]): self.finished_sending.add(request_id) - def _have_received_tensor_id(self, tensor_id: str): + def have_received_tensor_id(self, tensor_id: str): request_id = tensor_id.split('#')[0] if request_id not in self.recv_request_id_to_tensor_ids: self.recv_request_id_to_tensor_ids[request_id] = set() @@ -396,15 +396,15 @@ def _have_received_tensor_id(self, tensor_id: str): self.recv_request_id_to_tensor_ids[request_id]): self.finished_recving.add(request_id) - def _send_async(self): + def send_async(self): while True: with self.send_queue_cv: while not self.send_queue: self.send_queue_cv.wait() tensor_id, remote_address, tensor = self.send_queue.popleft() - self._send_sync(tensor_id, tensor, remote_address) + 
self.send_sync(tensor_id, tensor, remote_address) - def _send_sync( + def send_sync( self, tensor_id: str, tensor: torch.Tensor, @@ -413,7 +413,7 @@ def _send_sync( if remote_address is None: return False if remote_address not in self.socks: - self._create_connect(remote_address) + self.create_connect(remote_address) sock = self.socks[remote_address] comm, rank = self.comms[remote_address] @@ -435,10 +435,10 @@ def _send_sync( response.decode()) return False - self._send(comm, tensor_id, tensor.to(self.device), rank ^ 1, + self.send(comm, tensor_id, tensor.to(self.device), rank ^ 1, self.send_stream) - self._have_sent_tensor_id(tensor_id) + self.have_sent_tensor_id(tensor_id) logger.debug("🔵[PUT]Send Tensor, %s👉%s, data:%s", self.zmq_address, remote_address, data) @@ -484,7 +484,7 @@ def get_finished( # TODO: Add failed requests (e.g., transmission errors) return finished_sending or None, finished_recving or None - def _ping(self): + def ping(self): sock = self.context.socket(zmq.DEALER) sock.setsockopt_string(zmq.IDENTITY, self.zmq_address) logger.debug("ping start, zmq_address:%s", self.zmq_address) @@ -498,7 +498,7 @@ def _ping(self): sock.send(msgpack.dumps(data)) time.sleep(3) - def _send(self, + def send(self, comm, tensor_id: str, tensor: torch.Tensor, @@ -515,7 +515,7 @@ def _send(self, event.record(stream) event.synchronize() - def _recv(self, + def recv(self, comm, tensor_id: str, shape: str, From e6d225dd2658c5392a5922c7c12fa0848ba3ba7d Mon Sep 17 00:00:00 2001 From: Abatom Date: Tue, 1 Jul 2025 12:25:49 +0800 Subject: [PATCH 48/58] rm _ Signed-off-by: Abatom --- .../kv_connector/v1/p2p/p2p_nccl_engine.py | 24 +++++++++---------- 1 file changed, 12 insertions(+), 12 deletions(-) diff --git a/vllm/distributed/kv_transfer/kv_connector/v1/p2p/p2p_nccl_engine.py b/vllm/distributed/kv_transfer/kv_connector/v1/p2p/p2p_nccl_engine.py index da8f5088d124..da0b5cb6c855 100644 --- a/vllm/distributed/kv_transfer/kv_connector/v1/p2p/p2p_nccl_engine.py +++ 
b/vllm/distributed/kv_transfer/kv_connector/v1/p2p/p2p_nccl_engine.py @@ -78,15 +78,15 @@ def __init__(self, port = int(self.config.kv_port) + port_offset if port == 0: raise ValueError("Port cannot be 0") - self._hostname = hostname - self._port = port + self.hostname = hostname + self.port = port # Each card corresponds to a ZMQ address. - self.zmq_address = f"{self._hostname}:{self._port}" + self.zmq_address = f"{self.hostname}:{self.port}" # The `http_port` must be consistent with the port of OpenAI. self.http_address = ( - f"{self._hostname}:" + f"{self.hostname}:" f"{self.config.kv_connector_extra_config['http_port']}") # If `proxy_ip` or `proxy_port` is `""`, @@ -144,15 +144,15 @@ def __init__(self, self.nccl_num_channels = self.config.get_from_extra_config( "nccl_num_channels", "8") - self._listener_thread = threading.Thread( + self.listener_thread = threading.Thread( target=self.listen_for_requests, daemon=True) - self._listener_thread.start() + self.listener_thread.start() - self._ping_thread = None + self.ping_thread = None if port_offset == 0 and self.proxy_address != "": - self._ping_thread = threading.Thread(target=self.ping, + self.ping_thread = threading.Thread(target=self.ping, daemon=True) - self._ping_thread.start() + self.ping_thread.start() self.num_layers = 0 self.finished_recving: set[str] = set() @@ -536,7 +536,7 @@ def recv(self, return tensor def close(self) -> None: - self._listener_thread.join() + self.listener_thread.join() self._send_thread.join() - if self._ping_thread is not None: - self._ping_thread.join() + if self.ping_thread is not None: + self.ping_thread.join() From 4fb5f8509f33482270954a0cd729a5b5cacb656f Mon Sep 17 00:00:00 2001 From: Abatom Date: Tue, 1 Jul 2025 14:55:21 +0800 Subject: [PATCH 49/58] bugfix Signed-off-by: Abatom --- .../kv_transfer/kv_connector/v1/p2p/p2p_nccl_connector.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/vllm/distributed/kv_transfer/kv_connector/v1/p2p/p2p_nccl_connector.py 
b/vllm/distributed/kv_transfer/kv_connector/v1/p2p/p2p_nccl_connector.py index 24faf82bf6c1..818d2274b9b1 100644 --- a/vllm/distributed/kv_transfer/kv_connector/v1/p2p/p2p_nccl_connector.py +++ b/vllm/distributed/kv_transfer/kv_connector/v1/p2p/p2p_nccl_connector.py @@ -416,7 +416,7 @@ def build_connector_meta( token_ids = request.all_token_ids[:total_tokens] logger.info("🚧%s is resumed from preemption, total_tokens:%d", - cached_req.req_id, total_tokens) + req_id, total_tokens) # NOTE(rob): For resumed req, new_block_ids is all # of the block_ids for the request. block_ids = new_block_ids[0] From 2c8f5c270721b56b77305d5a876837accb66cc2c Mon Sep 17 00:00:00 2001 From: Abatom Date: Tue, 1 Jul 2025 15:31:56 +0800 Subject: [PATCH 50/58] format Signed-off-by: Abatom --- .../kv_connector/v1/p2p/p2p_nccl_engine.py | 45 +++++++++---------- 1 file changed, 20 insertions(+), 25 deletions(-) diff --git a/vllm/distributed/kv_transfer/kv_connector/v1/p2p/p2p_nccl_engine.py b/vllm/distributed/kv_transfer/kv_connector/v1/p2p/p2p_nccl_engine.py index da0b5cb6c855..36423278ca7c 100644 --- a/vllm/distributed/kv_transfer/kv_connector/v1/p2p/p2p_nccl_engine.py +++ b/vllm/distributed/kv_transfer/kv_connector/v1/p2p/p2p_nccl_engine.py @@ -13,7 +13,6 @@ import torch import zmq -from vllm.config import KVTransferConfig from vllm.distributed.device_communicators.pynccl_wrapper import ( NCCLLibrary, buffer_type, cudaStream_t, ncclComm_t, ncclDataTypeEnum) from vllm.distributed.kv_transfer.kv_connector.v1.p2p.tensor_memory_pool import ( # noqa: E501 @@ -129,7 +128,7 @@ def __init__(self, self.send_queue: deque[list[Any]] = deque() self.send_request_id_to_tensor_ids: dict[str, set[str]] = {} self.send_thread = threading.Thread(target=self.send_async, - daemon=True) + daemon=True) self.send_thread.start() # tensor_id: torch.Tensor/(addr, dtype, shape) @@ -150,8 +149,7 @@ def __init__(self, self.ping_thread = None if port_offset == 0 and self.proxy_address != "": - self.ping_thread = 
threading.Thread(target=self.ping, - daemon=True) + self.ping_thread = threading.Thread(target=self.ping, daemon=True) self.ping_thread.start() self.num_layers = 0 @@ -285,8 +283,8 @@ def recv_tensor( remote_address, tensor_id, data["ret"]) return None - return self.recv(comm, tensor_id, data["shape"], data["dtype"], - rank ^ 1, self.recv_stream) + return self.recv(comm, data["shape"], data["dtype"], rank ^ 1, + self.recv_stream) def listen_for_requests(self): while True: @@ -313,9 +311,8 @@ def listen_for_requests(self): try: self.router_socket.send_multipart([remote_address, b"0"]) comm, rank = self.comms[remote] - tensor = self.recv(comm, tensor_id, data["shape"], - data["dtype"], rank ^ 1, - self.recv_stream) + tensor = self.recv(comm, data["shape"], data["dtype"], + rank ^ 1, self.recv_stream) tensor_size = tensor.element_size() * tensor.numel() if (self.buffer_size + tensor_size > self.buffer_size_threshold): @@ -365,8 +362,8 @@ def listen_for_requests(self): if data["ret"] == 0: comm, rank = self.comms[remote] - self.send(comm, tensor_id, tensor.to(self.device), - rank ^ 1, self.send_stream) + self.send(comm, tensor.to(self.device), rank ^ 1, + self.send_stream) else: logger.warning( "🚧Unexpected, Received message from %s, data:%s", @@ -374,7 +371,8 @@ def listen_for_requests(self): def get_num_layers(self): if self.num_layers == 0: - self.num_layers = len(self.compilation_config.static_forward_context) + self.num_layers = len( + self.compilation_config.static_forward_context) logger.debug("get_num_layers, num_layers:%d", self.num_layers) return self.num_layers @@ -435,8 +433,7 @@ def send_sync( response.decode()) return False - self.send(comm, tensor_id, tensor.to(self.device), rank ^ 1, - self.send_stream) + self.send(comm, tensor.to(self.device), rank ^ 1, self.send_stream) self.have_sent_tensor_id(tensor_id) @@ -499,11 +496,10 @@ def ping(self): time.sleep(3) def send(self, - comm, - tensor_id: str, - tensor: torch.Tensor, - dst: int, - stream=None): + 
comm, + tensor: torch.Tensor, + dst: int, + stream=None): assert tensor.device == self.device, ( f"this nccl communicator is created to work on {self.device}, " f"but the input tensor is on {tensor.device}") @@ -516,12 +512,11 @@ def send(self, event.synchronize() def recv(self, - comm, - tensor_id: str, - shape: str, - dtype: str, - src: int, - stream=None): + comm, + shape: str, + dtype: str, + src: int, + stream=None): stream = stream if stream is not None else current_stream() event = torch.cuda.Event() with torch.cuda.stream(stream): From c2842576ae958cad4c50dc6e3739dfa573f46931 Mon Sep 17 00:00:00 2001 From: Abatom Date: Tue, 1 Jul 2025 16:22:35 +0800 Subject: [PATCH 51/58] format Signed-off-by: Abatom --- .../kv_connector/v1/p2p/p2p_nccl_engine.py | 16 ++++------------ 1 file changed, 4 insertions(+), 12 deletions(-) diff --git a/vllm/distributed/kv_transfer/kv_connector/v1/p2p/p2p_nccl_engine.py b/vllm/distributed/kv_transfer/kv_connector/v1/p2p/p2p_nccl_engine.py index 36423278ca7c..38e426e5ae4f 100644 --- a/vllm/distributed/kv_transfer/kv_connector/v1/p2p/p2p_nccl_engine.py +++ b/vllm/distributed/kv_transfer/kv_connector/v1/p2p/p2p_nccl_engine.py @@ -20,6 +20,7 @@ from vllm.utils import current_stream, get_ip if TYPE_CHECKING: + from vllm.config import VllmConfig from vllm.forward_context import ForwardContext logger = logging.getLogger(__name__) @@ -495,11 +496,7 @@ def ping(self): sock.send(msgpack.dumps(data)) time.sleep(3) - def send(self, - comm, - tensor: torch.Tensor, - dst: int, - stream=None): + def send(self, comm, tensor: torch.Tensor, dst: int, stream=None): assert tensor.device == self.device, ( f"this nccl communicator is created to work on {self.device}, " f"but the input tensor is on {tensor.device}") @@ -511,12 +508,7 @@ def send(self, event.record(stream) event.synchronize() - def recv(self, - comm, - shape: str, - dtype: str, - src: int, - stream=None): + def recv(self, comm, shape: str, dtype: str, src: int, stream=None): stream = 
stream if stream is not None else current_stream() event = torch.cuda.Event() with torch.cuda.stream(stream): @@ -532,6 +524,6 @@ def recv(self, def close(self) -> None: self.listener_thread.join() - self._send_thread.join() + self.send_thread.join() if self.ping_thread is not None: self.ping_thread.join() From 0a69f9ae8fd4992937b64266850b27b130cef47b Mon Sep 17 00:00:00 2001 From: Abatom Date: Wed, 2 Jul 2025 10:47:42 +0800 Subject: [PATCH 52/58] torch.empty Signed-off-by: Abatom --- .../kv_connector/v1/p2p/p2p_nccl_engine.py | 13 +++++++------ 1 file changed, 7 insertions(+), 6 deletions(-) diff --git a/vllm/distributed/kv_transfer/kv_connector/v1/p2p/p2p_nccl_engine.py b/vllm/distributed/kv_transfer/kv_connector/v1/p2p/p2p_nccl_engine.py index 38e426e5ae4f..767ff454051b 100644 --- a/vllm/distributed/kv_transfer/kv_connector/v1/p2p/p2p_nccl_engine.py +++ b/vllm/distributed/kv_transfer/kv_connector/v1/p2p/p2p_nccl_engine.py @@ -310,10 +310,14 @@ def listen_for_requests(self): elif data["cmd"] == "PUT": tensor_id = data["tensor_id"] try: + with torch.cuda.stream(self.recv_stream): + tensor = torch.empty(data["shape"], + dtype=getattr( + torch, data["dtype"]), + device=self.device) self.router_socket.send_multipart([remote_address, b"0"]) comm, rank = self.comms[remote] - tensor = self.recv(comm, data["shape"], data["dtype"], - rank ^ 1, self.recv_stream) + self.recv(comm, tensor, rank ^ 1, self.recv_stream) tensor_size = tensor.element_size() * tensor.numel() if (self.buffer_size + tensor_size > self.buffer_size_threshold): @@ -508,13 +512,10 @@ def send(self, comm, tensor: torch.Tensor, dst: int, stream=None): event.record(stream) event.synchronize() - def recv(self, comm, shape: str, dtype: str, src: int, stream=None): + def recv(self, comm, tensor: torch.Tensor, src: int, stream=None): stream = stream if stream is not None else current_stream() event = torch.cuda.Event() with torch.cuda.stream(stream): - tensor = torch.empty(shape, - dtype=getattr(torch, dtype), 
- device=self.device) self.nccl.ncclRecv(buffer_type(tensor.data_ptr()), tensor.numel(), ncclDataTypeEnum.from_torch(tensor.dtype), src, comm, cudaStream_t(stream.cuda_stream)) From f5a06ea2e3f48efdf782055abe9fb037059e3d20 Mon Sep 17 00:00:00 2001 From: Abatom Date: Wed, 2 Jul 2025 11:30:44 +0800 Subject: [PATCH 53/58] bugfix Signed-off-by: Abatom --- .../kv_transfer/kv_connector/v1/p2p/p2p_nccl_engine.py | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) diff --git a/vllm/distributed/kv_transfer/kv_connector/v1/p2p/p2p_nccl_engine.py b/vllm/distributed/kv_transfer/kv_connector/v1/p2p/p2p_nccl_engine.py index 767ff454051b..7c6f54f0edd4 100644 --- a/vllm/distributed/kv_transfer/kv_connector/v1/p2p/p2p_nccl_engine.py +++ b/vllm/distributed/kv_transfer/kv_connector/v1/p2p/p2p_nccl_engine.py @@ -284,8 +284,13 @@ def recv_tensor( remote_address, tensor_id, data["ret"]) return None - return self.recv(comm, data["shape"], data["dtype"], rank ^ 1, - self.recv_stream) + with torch.cuda.stream(self.recv_stream): + tensor = torch.empty(data["shape"], + dtype=getattr( + torch, data["dtype"]), + device=self.device) + + return self.recv(comm, tensor, rank ^ 1, self.recv_stream) def listen_for_requests(self): while True: From 53241bbcdbb373ef5392e9645e4c26069c1226b5 Mon Sep 17 00:00:00 2001 From: Abatom Date: Wed, 2 Jul 2025 13:46:02 +0800 Subject: [PATCH 54/58] sched_yield Signed-off-by: Abatom --- vllm/v1/worker/gpu_model_runner.py | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/vllm/v1/worker/gpu_model_runner.py b/vllm/v1/worker/gpu_model_runner.py index 5bdaf4b969e7..6fd6f8b53368 100644 --- a/vllm/v1/worker/gpu_model_runner.py +++ b/vllm/v1/worker/gpu_model_runner.py @@ -28,6 +28,7 @@ from vllm.distributed.parallel_state import ( get_pp_group, get_tp_group, graph_capture, is_global_first_rank, prepare_communication_buffer_for_model) +from vllm.distributed.utils import sched_yield from vllm.forward_context import (DPMetadata, get_forward_context, 
set_forward_context) from vllm.logger import init_logger @@ -1496,6 +1497,12 @@ def execute_model( # Get the valid generated tokens. sampled_token_ids = sampler_output.sampled_token_ids max_gen_len = sampled_token_ids.shape[-1] + + gpu_event = torch.cuda.Event() + gpu_event.record() + while not gpu_event.query(): + sched_yield() + if max_gen_len == 1: # No spec decode tokens. valid_sampled_token_ids = sampled_token_ids.tolist() From 6d411a830af406246d960d1f5640040148d84ac7 Mon Sep 17 00:00:00 2001 From: Abatom Date: Wed, 2 Jul 2025 14:10:58 +0800 Subject: [PATCH 55/58] format Signed-off-by: Abatom --- .../kv_transfer/kv_connector/v1/p2p/p2p_nccl_engine.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/vllm/distributed/kv_transfer/kv_connector/v1/p2p/p2p_nccl_engine.py b/vllm/distributed/kv_transfer/kv_connector/v1/p2p/p2p_nccl_engine.py index 7c6f54f0edd4..d59121b9211b 100644 --- a/vllm/distributed/kv_transfer/kv_connector/v1/p2p/p2p_nccl_engine.py +++ b/vllm/distributed/kv_transfer/kv_connector/v1/p2p/p2p_nccl_engine.py @@ -286,8 +286,7 @@ def recv_tensor( with torch.cuda.stream(self.recv_stream): tensor = torch.empty(data["shape"], - dtype=getattr( - torch, data["dtype"]), + dtype=getattr(torch, data["dtype"]), device=self.device) return self.recv(comm, tensor, rank ^ 1, self.recv_stream) From d0e432d6aa64a91ad6fe8a5992dfdfee87562533 Mon Sep 17 00:00:00 2001 From: Abatom Date: Wed, 2 Jul 2025 14:59:44 +0800 Subject: [PATCH 56/58] time.sleep(0) Signed-off-by: Abatom --- vllm/v1/worker/gpu_model_runner.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/vllm/v1/worker/gpu_model_runner.py b/vllm/v1/worker/gpu_model_runner.py index 6fd6f8b53368..c47930077db8 100644 --- a/vllm/v1/worker/gpu_model_runner.py +++ b/vllm/v1/worker/gpu_model_runner.py @@ -28,7 +28,6 @@ from vllm.distributed.parallel_state import ( get_pp_group, get_tp_group, graph_capture, is_global_first_rank, prepare_communication_buffer_for_model) -from 
vllm.distributed.utils import sched_yield from vllm.forward_context import (DPMetadata, get_forward_context, set_forward_context) from vllm.logger import init_logger @@ -1501,7 +1500,7 @@ def execute_model( gpu_event = torch.cuda.Event() gpu_event.record() while not gpu_event.query(): - sched_yield() + time.sleep(0) if max_gen_len == 1: # No spec decode tokens. From 81e7a806504d2b5dac01d0fa4636b4834b21c0ad Mon Sep 17 00:00:00 2001 From: Abatom Date: Wed, 2 Jul 2025 15:07:11 +0800 Subject: [PATCH 57/58] add comments Signed-off-by: Abatom --- vllm/v1/worker/gpu_model_runner.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/vllm/v1/worker/gpu_model_runner.py b/vllm/v1/worker/gpu_model_runner.py index c47930077db8..2ccf9185bf2a 100644 --- a/vllm/v1/worker/gpu_model_runner.py +++ b/vllm/v1/worker/gpu_model_runner.py @@ -1500,6 +1500,9 @@ def execute_model( gpu_event = torch.cuda.Event() gpu_event.record() while not gpu_event.query(): + # It can achieve a precision of around 50 microseconds. + # sched_yield can achieve a precision of around 1.25 microseconds. + # However, this can lead to very high CPU utilization. time.sleep(0) if max_gen_len == 1: From 2535da8f3248baf138cd88a4cefa13f074ccb172 Mon Sep 17 00:00:00 2001 From: Abatom Date: Wed, 2 Jul 2025 15:22:45 +0800 Subject: [PATCH 58/58] add comments Signed-off-by: Abatom --- vllm/v1/worker/gpu_model_runner.py | 1 + 1 file changed, 1 insertion(+) diff --git a/vllm/v1/worker/gpu_model_runner.py b/vllm/v1/worker/gpu_model_runner.py index 2ccf9185bf2a..bdb45151191d 100644 --- a/vllm/v1/worker/gpu_model_runner.py +++ b/vllm/v1/worker/gpu_model_runner.py @@ -1497,6 +1497,7 @@ def execute_model( sampled_token_ids = sampler_output.sampled_token_ids max_gen_len = sampled_token_ids.shape[-1] + # Eliminate global synchronization in `cudaMemcpyAsync`. gpu_event = torch.cuda.Event() gpu_event.record() while not gpu_event.query():