diff --git a/src/plugins/intel_npu/src/plugin/npuw/llm_infer_request.cpp b/src/plugins/intel_npu/src/plugin/npuw/llm_infer_request.cpp
index 685aa06a9cec9a..7635975fd135a1 100644
--- a/src/plugins/intel_npu/src/plugin/npuw/llm_infer_request.cpp
+++ b/src/plugins/intel_npu/src/plugin/npuw/llm_infer_request.cpp
@@ -255,6 +255,47 @@ void ov::npuw::LLMInferRequest::init_lora_states() {
     }
 }
 
+std::string ov::npuw::LLMInferRequest::init_pre_alloc_device() {
+    bool pre_alloc_on_npu = false;
+    const auto& kvcache_compiled = m_npuw_llm_compiled_model->m_kvcache_compiled;
+    for (std::size_t idx = 0; idx < kvcache_compiled->m_compiled_submodels.size(); ++idx) {
+        if (kvcache_compiled->submodel_device(idx) == "NPU") {
+            pre_alloc_on_npu = true;
+            break;
+        }
+    }
+
+    return pre_alloc_on_npu ? "NPU" : "CPU";
+}
+
+void ov::npuw::LLMInferRequest::bind_past_kv() {
+    // Only reuse KV cache related tensors (past_key_values)
+    for (const auto& [input_name, input_port] : m_prefill_in_ports) {
+        // Only process KV cache inputs (past_key_values)
+        if (input_name.find(layer_names::past_key_values) == std::string::npos) {
+            continue;
+        }
+
+        // Check if the kv cache request has the same input port
+        if (m_kvcache_in_ports.find(input_name) == m_kvcache_in_ports.end()) {
+            continue;
+        }
+
+        auto kvcache_in_port = m_kvcache_in_ports.at(input_name);
+        auto kvcache_past_kv_in_tensor = m_kvcache_request->get_tensor(kvcache_in_port);
+        auto data = kvcache_past_kv_in_tensor->data();
+
+        auto origTensor = m_prefill_request->get_tensor(input_port);
+        auto new_tensor =
+            ov::get_tensor_impl(ov::Tensor(origTensor->get_element_type(), origTensor->get_shape(), data));
+        m_prefill_request->set_tensor(input_port, new_tensor);
+
+        // Record that past_kv has already been bound; a data copy is needed when updating past KV in the
+        // infer requests to ensure the correct data layout
+        m_past_kv_bound = true;
+    }
+}
+
 ov::npuw::LLMInferRequest::LLMInferRequest(const std::shared_ptr<ov::npuw::LLMCompiledModel>& compiled_model)
     : ov::ISyncInferRequest(compiled_model),
       m_npuw_llm_compiled_model(compiled_model) {
@@ -291,10 +332,12 @@ ov::npuw::LLMInferRequest::LLMInferRequest(const std::shared_ptr<ov::npuw::LLMC
+    m_pre_alloc_device = init_pre_alloc_device();
     const bool use_chunk_prefill = m_npuw_llm_compiled_model->m_use_chunk_prefill;
     if (use_chunk_prefill) {
+        bind_past_kv();
         clear_chunk_prefill_kv_cache();
     }
@@ -370,16 +413,6 @@ void ov::npuw::LLMInferRequest::init_tensor(const ov::Output<const ov::Node>& po
 
 void ov::npuw::LLMInferRequest::apply_lora() {
     uint32_t max_low_rank_dim_size = m_npuw_llm_compiled_model->m_max_lora_rank;
-    bool pre_alloc_on_npu = true;
-    const auto& prefill_compiled = m_npuw_llm_compiled_model->m_prefill_compiled;
-    for (std::size_t idx = 0; idx < prefill_compiled->m_compiled_submodels.size(); ++idx) {
-        if (prefill_compiled->submodel_device(idx) != "NPU") {
-            pre_alloc_on_npu = false;
-            break;
-        }
-    }
-    std::string device = pre_alloc_on_npu ? "NPU" : "CPU";
-
     for (auto state : m_variableStates) {
         auto state_name = state->get_name();
         auto state_tensor = state->get_state();
@@ -426,7 +459,7 @@ void ov::npuw::LLMInferRequest::apply_lora() {
         auto prefill_lora_in_tensor = m_prefill_request->get_tensor(m_prefill_in_ports.at(state_name));
         auto new_infer_tensor = ov::npuw::util::allocMem(prefill_lora_in_tensor->get_element_type(),
                                                          prefill_lora_in_tensor->get_shape(),
-                                                         device,
+                                                         m_pre_alloc_device,
                                                          m_npuw_llm_compiled_model->get_plugin());
         bool has_padding = state_tensor_rank != target_lora_rank;
         if (has_padding) {
@@ -507,9 +540,25 @@ void ov::npuw::LLMInferRequest::copy_kvcache() {
        // tokens_in_past_chunks may be 0 in case short prompts are prefilled in single chunk
        auto tokens_in_past_chunks = kvcache_desc.num_stored_tokens - tokens_in_present_chunk;
        if (tokens_in_past_chunks > 0) {
+            // Create backup of past KV tensor when buffer sharing is enabled to prevent data corruption
+            // This is necessary because subsequent copy operations would overwrite the shared buffer
            auto prefill_past_kv = m_prefill_request->get_tensor(m_prefill_in_ports.at(input_name));
-            auto prefill_past_kv_chunks =
-                make_tensor_slice(prefill_past_kv, kv_dim, 0u, static_cast<uint32_t>(tokens_in_past_chunks));
+            ov::SoPtr<ov::ITensor> tmp_dense_kv_tensor;
+            ov::SoPtr<ov::ITensor> prefill_past_kv_chunks;
+            if (m_past_kv_bound) {
+                tmp_dense_kv_tensor = ov::npuw::util::allocMem(prefill_past_kv->get_element_type(),
+                                                               prefill_past_kv->get_shape(),
+                                                               m_pre_alloc_device,
+                                                               m_npuw_llm_compiled_model->get_plugin());
+                prefill_past_kv->copy_to(tmp_dense_kv_tensor._ptr);
+                prefill_past_kv_chunks = make_tensor_slice(tmp_dense_kv_tensor,
+                                                           kv_dim,
+                                                           0u,
+                                                           static_cast<uint32_t>(tokens_in_past_chunks));
+            } else {
+                prefill_past_kv_chunks =
+                    make_tensor_slice(prefill_past_kv, kv_dim, 0u, static_cast<uint32_t>(tokens_in_past_chunks));
+            }
            auto kvcache_past_kv_chunks =
                make_tensor_slice(kvcache_in_tensor, kv_dim, 0u, static_cast<uint32_t>(tokens_in_past_chunks));
diff --git a/src/plugins/intel_npu/src/plugin/npuw/llm_infer_request.hpp b/src/plugins/intel_npu/src/plugin/npuw/llm_infer_request.hpp
index 86ccfef6e41700..2b769fc656750c 100644
--- a/src/plugins/intel_npu/src/plugin/npuw/llm_infer_request.hpp
+++ b/src/plugins/intel_npu/src/plugin/npuw/llm_infer_request.hpp
@@ -91,6 +91,11 @@ class LLMInferRequest final : public ov::ISyncInferRequest {
     // Support LoRA
     std::vector<ov::SoPtr<ov::IVariableState>> m_variableStates;
     void init_lora_states();
+
+    bool m_past_kv_bound = false;
+    void bind_past_kv();
+    std::string m_pre_alloc_device = "CPU";
+    std::string init_pre_alloc_device();
 };
 
 }  // namespace npuw