Commit c72fdad

add some fix for llmdatadist
Signed-off-by: ganyi <pleaplusone.gy@gmail.com>
1 parent cc4f9fb commit c72fdad

File tree: 2 files changed, +25 -23 lines

  vllm_ascend/distributed/llmdatadist_connector_v1_a3.py
  vllm_ascend/worker/model_runner_v1.py

vllm_ascend/distributed/llmdatadist_connector_v1_a3.py

Lines changed: 16 additions & 19 deletions
@@ -167,6 +167,7 @@ def __init__(self, vllm_config: VllmConfig, engine_id: int):
         # Can not retrive the parallel config since it is not initialized.
         self.local_dp_rank = None
         self.tp_size = None
+        self.port = self.vllm_config.parallel_config.data_parallel_rank_local * self.vllm_config.parallel_config.tensor_parallel_size + envs.VLLM_LLMDD_CHANNEL_PORT

         self._reqs_need_recv: dict[str, tuple[Request, list[int]]] = {}

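Note: the channel port is now derived once in __init__ from the local DP rank and the TP size, instead of being recomputed in request_finished. A minimal sketch of the port layout this formula implies, with a hypothetical base port standing in for envs.VLLM_LLMDD_CHANNEL_PORT:

    # Sketch only: each data-parallel rank owns a contiguous block of tp_size
    # ports above the base channel port (numbers below are hypothetical).
    VLLM_LLMDD_CHANNEL_PORT = 5557   # assumed base port from the environment
    tp_size = 4

    def dp_base_port(dp_rank_local: int) -> int:
        # Same expression as the added self.port assignment.
        return dp_rank_local * tp_size + VLLM_LLMDD_CHANNEL_PORT

    for dp_rank in range(2):
        ports = [dp_base_port(dp_rank) + tp_rank for tp_rank in range(tp_size)]
        print(dp_rank, ports)   # dp 0 -> 5557..5560, dp 1 -> 5561..5564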

@@ -244,12 +245,6 @@ def request_finished(
         request: "Request",
         block_ids: list[int],
     ) -> tuple[bool, Optional[dict[str, Any]]]:
-        if self.local_dp_rank is None:
-            vllm_config = get_current_vllm_config()
-            # Need this dp rank to locate the only dp rank the kv cache from
-            self.local_dp_rank = vllm_config.parallel_config.data_parallel_rank_local
-            # Need this tp size to offset the port in tp size
-            self.tp_size = vllm_config.parallel_config.tensor_parallel_size

         params = request.kv_transfer_params
         logger.debug(
@@ -267,13 +262,13 @@ def request_finished(
         # If prompt < block_size, no xfer so free blocks immediately.
         delay_free_blocks = len(computed_block_ids) > 0

-        return delay_free_blocks, dict(
+        return False, dict(
             do_remote_prefill=True,
             do_remote_decode=False,
             remote_block_ids=computed_block_ids,
             remote_engine_id=self.engine_id,
             remote_host=self.local_ip,
-            remote_port=envs.VLLM_LLMDD_CHANNEL_PORT + self.local_dp_rank * self.tp_size,
+            remote_port=self.port,
         )

 class LLMDataDistConnectorWorker():
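For context, the dict returned above is the kv_transfer_params blob the decode side later uses to locate the prefilled KV blocks; with this change remote_port is simply the cached self.port. A hypothetical example of its contents (values are illustrative, not taken from a real run):

    # Illustrative kv_transfer_params as produced by request_finished above.
    kv_transfer_params = dict(
        do_remote_prefill=True,
        do_remote_decode=False,
        remote_block_ids=[12, 13, 14],   # blocks holding this request's KV on the prefill node
        remote_engine_id=0,              # assumed engine id
        remote_host="10.0.0.1",          # prefill node IP (hypothetical)
        remote_port=5561,                # dp_rank_local * tp_size + base port
    )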
@@ -295,6 +290,7 @@ def __init__(
         self.local_ip = get_ip()
         self.kv_transfer_config: Optional[KVTransferConfig] = vllm_config.kv_transfer_config
         self.local_agent_metadata: Optional[LLMDataDistAgentMetadata] = None
+        self.vllm_config = vllm_config

         self.llm_datadist_role = None
         self.llm_datadist_remote_role = None
@@ -498,11 +494,11 @@ def start_load_kv(self, metadata: LLMDataDistConnectorMetadata):
             )
             self.finished_reqs.add(req_id)

-    def add_remote_agent(self, metadata: LLMDataDistAgentMetadata) -> bool:
+    def add_remote_agent(self, metadata: LLMDataDistAgentMetadata) -> int:
         remote_cluster_id = metadata.cluster_id
         if remote_cluster_id in self.linked_cluster:
             logger.debug(f"LLMDataDistConnectorWorker: remote cluster_id: {metadata.cluster_id} already linked with this server, skip the connection")
-            return False
+            return remote_cluster_id
         remote_super_pod_id = metadata.super_pod_id
         remote_device_id = metadata.device_id
         remote_device_ip = metadata.device_ip
@@ -618,10 +614,10 @@ def add_remote_agent(self, metadata: LLMDataDistAgentMetadata) -> bool:
                 raise RuntimeError(f"LLMDataDistConnectorWorker: Linking failed, comm id: {comm_id}")
             time.sleep(1)
             logger.info("Checking query_register_mem_status again")
-        self.linked_cluster.update({remote_server_id: (remote_cluster_id, comm_id)})
+        self.linked_cluster.update({remote_cluster_id: comm_id})
         logger.info(f"cached linked cluster: {self.linked_cluster}")
         logger.info(f"Sucessfully build link with cluster id {remote_cluster_id} with cluster name {comm_name} !")
-        return True
+        return remote_cluster_id


    def remove_remote_agent(self, cluster_id: int):
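With these two hunks, linked_cluster is keyed by the remote cluster id and maps straight to the comm id, and add_remote_agent always returns the cluster id it linked (or found already linked) instead of a bool. A small sketch of that bookkeeping, under those assumptions:

    # Sketch of the assumed bookkeeping (not the real class; the comm_id is
    # whatever llm_datadist hands back from the link step).
    linked_cluster = {}   # remote_cluster_id -> comm_id

    def add_remote_agent_sketch(remote_cluster_id, comm_id):
        if remote_cluster_id in linked_cluster:
            # Already linked: skip the handshake but still report the cluster id.
            return remote_cluster_id
        # ... link establishment elided ...
        linked_cluster[remote_cluster_id] = comm_id
        return remote_cluster_id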
@@ -641,7 +637,7 @@ def connect_to_remote_agent(
         host: str,
         port: int
     ):
-        url = f"tcp://{host}:{port + self.tp_rank}"
+        url = f"tcp://{host}:{port}"
         logger.debug(f"Querying metadata from url: {url}")
         msg_encoder = msgspec.msgpack.Encoder()
         msg_send = msg_encoder.encode(self.local_agent_metadata)
@@ -653,7 +649,8 @@ def connect_to_remote_agent(
             metadata = decoder.decode(metadata_bytes)
             metadata = LLMDataDistAgentMetadata(**metadata)
             logger.info(f"recving metadata: {metadata}")
-            self.add_remote_agent(metadata)
+            cluster_id = self.add_remote_agent(metadata)
+            return cluster_id

    def _read_blocks(
        self,
@@ -664,8 +661,8 @@ def _read_blocks(
         remote_engine_id: str,
         request_id: str,
     ):
-        if remote_ip not in self.linked_cluster:
-            self.connect_to_remote_agent(remote_ip, remote_port)
+        # if remote_ip not in self.linked_cluster:
+        self.connect_to_remote_agent(remote_ip, remote_port + self.tp_rank)
         num_local_blocks = len(local_block_ids)
         if num_local_blocks == 0:
             return
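connect_to_remote_agent now dials exactly the port it is given, and _read_blocks applies the per-TP-rank offset itself; with the membership guard commented out, the call happens on every read and de-duplication relies on add_remote_agent's early return. A sketch of which port each decode TP rank ends up dialing, assuming one prefill listener per TP rank:

    # Illustration only: ports dialed by each decode TP rank (hypothetical values).
    remote_host = "10.0.0.1"    # from kv_transfer_params
    remote_port = 5561          # prefill DP rank's base port
    tp_size = 4

    for tp_rank in range(tp_size):
        url = f"tcp://{remote_host}:{remote_port + tp_rank}"
        print(f"decode tp_rank {tp_rank} -> {url}")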
@@ -681,8 +678,8 @@ def _read_blocks(
             remote_cache_key_k_pe = BlocksCacheKey(cluster_id=remote_cluster_id, model_id=1)
             logger.info("Try pull blocks from remote server")
             try:
-                self.cache_manager.pull_blocks(remote_cache_key_k_normed, self.cache[0], local_block_ids, remote_block_ids)
-                self.cache_manager.pull_blocks(remote_cache_key_k_pe, self.cache[1], local_block_ids, remote_block_ids)
+                self.cache_manager.pull_blocks(remote_cache_key_k_normed, self.cache[0], remote_block_ids, local_block_ids)
+                self.cache_manager.pull_blocks(remote_cache_key_k_pe, self.cache[1], remote_block_ids, local_block_ids)
             except (TypeError, ValueError) as e:
                 raise RuntimeError(f"LLMDataDistConnectorWorker: Passing unexpected parameter to pull_blocks remote_cache_key: {remote_cache_key}, cache: {self.cache}, local_block_ids: {local_block_ids}, remote_block_ids: {remote_block_ids}")
             except LLMException:
@@ -691,7 +688,7 @@ def _read_blocks(
             remote_cache_key = BlocksCacheKey(cluster_id=remote_cluster_id)
             logger.info("Try pull blocks from remote server")
             try:
-                self.cache_manager.pull_blocks(remote_cache_key, self.cache, local_block_ids, remote_block_ids)
+                self.cache_manager.pull_blocks(remote_cache_key, self.cache, remote_block_ids, local_block_ids)
             except (TypeError, ValueError) as e:
                 raise RuntimeError(f"LLMDataDistConnectorWorker: Passing unexpected parameter to pull_blocks remote_cache_key: {remote_cache_key}, cache: {self.cache}, local_block_ids: {local_block_ids}, remote_block_ids: {remote_block_ids}")
             except LLMException:
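Both the MLA and the non-MLA call sites now pass the remote (prefill-side) block ids before the local (decode-side) block ids, i.e. source blocks first, destination blocks second; the parameter names of llm_datadist's pull_blocks are not shown in this diff, so the sketch below only illustrates the intended block mapping:

    # Sketch of the block mapping implied by the new argument order
    # (function and parameter names here are assumptions, not the real API).
    def pull_blocks_sketch(dst_cache, src_block_ids, dst_block_ids):
        # Copy remote block src_block_ids[i] into local block dst_block_ids[i].
        for src, dst in zip(src_block_ids, dst_block_ids):
            dst_cache[dst] = fetch_remote_block(src)   # fetch_remote_block is hypothetical

    remote_block_ids = [12, 13, 14]   # where the prefill instance stored the KV
    local_block_ids = [3, 4, 5]       # where this decode instance wants it placed
    # pull_blocks_sketch(cache, remote_block_ids, local_block_ids)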

vllm_ascend/worker/model_runner_v1.py

Lines changed: 9 additions & 4 deletions
@@ -1389,6 +1389,11 @@ def initialize_kv_cache(self, kv_cache_config: KVCacheConfig) -> None:
         """
         import torch_npu
         kv_caches: Dict[str, torch.Tensor] = {}
+        def align_memory(tensor: torch.Tensor, alignment: int) -> torch.Tensor:
+            data_ptr = tensor.data_ptr()
+            aligned_addr = (data_ptr + alignment - 1) // alignment * alignment
+            offset = (aligned_addr - data_ptr) // tensor.element_size()
+            return tensor[int(offset):]

         # Remove this after we drop 0.9.0 support
         if vllm_version_is("0.9.0"):
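The new align_memory helper slices leading elements off an over-allocated buffer so the returned view starts on an alignment-byte boundary. A quick CPU-only check of the arithmetic (illustrative sizes; the real caches live on the NPU):

    import torch

    def align_memory(tensor: torch.Tensor, alignment: int) -> torch.Tensor:
        # Round the start address up to the next multiple of `alignment` bytes
        # and skip the corresponding number of leading elements.
        data_ptr = tensor.data_ptr()
        aligned_addr = (data_ptr + alignment - 1) // alignment * alignment
        offset = (aligned_addr - data_ptr) // tensor.element_size()
        return tensor[int(offset):]

    alignment = 2 * 1024 * 1024                                     # 2 MB boundary, as an example
    buf = torch.zeros(1024 + alignment // 2, dtype=torch.float16)   # over-allocate
    aligned = align_memory(buf, alignment)
    assert aligned.data_ptr() % alignment == 0
    print(len(buf) - len(aligned), "leading elements skipped for alignment")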
@@ -1461,10 +1466,10 @@ def initialize_kv_cache(self, kv_cache_config: KVCacheConfig) -> None:
                     rope_allocate_shape_alignment = rope_allocate_shape + alignment
                     nope_cache_shape = (num_blocks, block_size, num_kv_heads, nope_dim)
                     rope_cache_shape = (num_blocks, block_size, num_kv_heads, rope_dim)
-                    rope_cache = torch.zeros(nope_allocate_shape_alignment, dtype=dtype, device=self.device)
-                    nope_cache = torch.zeros(rope_allocate_shape_alignment, dtype=dtype, device=self.device)
-                    rope_cache = align_memory(nope_cache, alignment)[:nope_allocate_shape].view(nope_cache_shape)
-                    nope_cache = align_memory(rope_cache, alignment)[:rope_allocate_shape].view(rope_cache_shape)
+                    nope_cache = torch.zeros(nope_allocate_shape_alignment, dtype=dtype, device=self.device)
+                    rope_cache = torch.zeros(rope_allocate_shape_alignment, dtype=dtype, device=self.device)
+                    nope_cache = align_memory(nope_cache, alignment)[:nope_allocate_shape].view(nope_cache_shape)
+                    rope_cache = align_memory(rope_cache, alignment)[:rope_allocate_shape].view(rope_cache_shape)
                     kv_caches[layer_name] = (nope_cache, rope_cache)
                 else:
                     num_caches = kv_cache_shape[0]
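The corrected lines allocate each padded buffer under its own name, align it, and only then slice and view it with its matching shape; the previous version crossed the nope/rope names. A standalone sketch of that allocate-align-view pattern with made-up sizes:

    import torch

    # Same helper as in the commit, repeated so this sketch runs on its own.
    def align_memory(tensor: torch.Tensor, alignment: int) -> torch.Tensor:
        data_ptr = tensor.data_ptr()
        aligned_addr = (data_ptr + alignment - 1) // alignment * alignment
        return tensor[int((aligned_addr - data_ptr) // tensor.element_size()):]

    alignment = 2 * 1024 * 1024
    num_blocks, block_size, num_kv_heads, nope_dim = 8, 16, 1, 512   # illustrative only
    nope_cache_shape = (num_blocks, block_size, num_kv_heads, nope_dim)
    nope_allocate_shape = num_blocks * block_size * num_kv_heads * nope_dim
    nope_allocate_shape_alignment = nope_allocate_shape + alignment

    # Allocate the padded buffer under its own name, align it, then view it
    # with its own shape.
    nope_cache = torch.zeros(nope_allocate_shape_alignment, dtype=torch.float16)
    nope_cache = align_memory(nope_cache, alignment)[:nope_allocate_shape].view(nope_cache_shape)
    assert nope_cache.data_ptr() % alignment == 0 and nope_cache.shape == nope_cache_shape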
