diff --git a/vllm_ascend/distributed/kvpool/kv_transfer.py b/vllm_ascend/distributed/kvpool/kv_transfer.py
index cd5245364c4..0265d6a320c 100644
--- a/vllm_ascend/distributed/kvpool/kv_transfer.py
+++ b/vllm_ascend/distributed/kvpool/kv_transfer.py
@@ -117,7 +117,6 @@ def _handle_request(self, req_meta: dict[str, Any]):
                 addr_list.append(addr)
                 size_list.append(size)
             if self.dcp_size > 1:
-                torch.npu.current_stream().synchronize()
                 self.m_store.put(key_list, addr_list, size_list)
             else:
                 key_list_tp = key_list[self.tp_rank % self.put_step::self.put_step]
@@ -126,7 +125,6 @@ def _handle_request(self, req_meta: dict[str, Any]):
                 size_list_tp = size_list[self.tp_rank % self.put_step::self.put_step]
                 if key_list_tp:
-                    torch.npu.current_stream().synchronize()
                     self.m_store.put(key_list_tp, addr_list_tp, size_list_tp)
             if is_last_chunk:
                 self.set_finished_request(req_id)

@@ -205,7 +203,6 @@ def _handle_request(  # type: ignore[override]
                 addr_list.append(addr)
                 size_list.append(size)
             if self.dcp_size > 1:
-                torch.npu.current_stream().synchronize()
                 self.m_store.put(key_list, addr_list, size_list)
             else:
                 key_list_tp = key_list[self.tp_rank % self.put_step::self.put_step]
@@ -214,7 +211,6 @@ def _handle_request(  # type: ignore[override]
                 size_list_tp = size_list[self.tp_rank % self.put_step::self.put_step]
                 if key_list_tp:
-                    torch.npu.current_stream().synchronize()
                     self.m_store.put(key_list_tp, addr_list_tp, size_list_tp)
             if req_meta.layer_id == self.final_layer_id and req_meta.is_last_chunk:
                 self.set_finished_request(req_meta.req_id)

diff --git a/vllm_ascend/worker/model_runner_v1.py b/vllm_ascend/worker/model_runner_v1.py
index 2e7c4ea299b..ce5848b3495 100644
--- a/vllm_ascend/worker/model_runner_v1.py
+++ b/vllm_ascend/worker/model_runner_v1.py
@@ -2339,7 +2339,6 @@ def execute_model(
                 attn_metadata, self.with_prefill, maybe_padded_num_tokens,
                 input_ids, positions, intermediate_tensors, inputs_embeds)

-            self.maybe_wait_for_kv_save()
             finished_sending, finished_recving = self.get_finished_kv_transfer(
                 scheduler_output)

@@ -2603,7 +2602,7 @@ def propose_draft_token_ids(sampled_token_ids):
            # ngram and other speculative decoding methods use the sampled
            # tokens on the CPU, so they are run after bookkeeping.
            propose_draft_token_ids(valid_sampled_token_ids)
-
+        self.maybe_wait_for_kv_save()
         if has_kv_transfer_group():
             get_kv_transfer_group().clear_connector_metadata()