75 changes: 62 additions & 13 deletions src/plugins/intel_npu/src/plugin/npuw/llm_infer_request.cpp
@@ -255,6 +255,47 @@ void ov::npuw::LLMInferRequest::init_lora_states() {
}
}

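// Select the device for pre-allocated tensors: NPU if any KV-cache submodel is assigned to NPU, CPU otherwise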
std::string ov::npuw::LLMInferRequest::init_pre_alloc_device() {
bool pre_alloc_on_npu = false;
const auto& kvcache_compiled = m_npuw_llm_compiled_model->m_kvcache_compiled;
for (std::size_t idx = 0; idx < kvcache_compiled->m_compiled_submodels.size(); ++idx) {
if (kvcache_compiled->submodel_device(idx) == "NPU") {
pre_alloc_on_npu = true;
break;
}
}

return pre_alloc_on_npu ? "NPU" : "CPU";
}

void ov::npuw::LLMInferRequest::bind_past_kv() {
// Reuse the KV-cache request's past_key_values tensors as the prefill request's inputs
for (const auto& [input_name, input_port] : m_prefill_in_ports) {
// Only process KV cache inputs (past_key_values)
if (input_name.find(layer_names::past_key_values) == std::string::npos) {
continue;
}

// Check if the kv cache request has the same input port
if (m_kvcache_in_ports.find(input_name) == m_kvcache_in_ports.end()) {
continue;
}

auto kvcache_in_port = m_kvcache_in_ports.at(input_name);
auto kvcache_past_kv_in_tensor = m_kvcache_request->get_tensor(kvcache_in_port);
auto data = kvcache_past_kv_in_tensor->data();

auto orig_tensor = m_prefill_request->get_tensor(input_port);
auto new_tensor =
ov::get_tensor_impl(ov::Tensor(orig_tensor->get_element_type(), orig_tensor->get_shape(), data));
m_prefill_request->set_tensor(input_port, new_tensor);

// Record that past_kv has already been bound; a data copy will be needed when updating past KV in the
// infer requests to ensure the correct data layout
m_past_kv_bound = true;
}
}

ov::npuw::LLMInferRequest::LLMInferRequest(const std::shared_ptr<ov::npuw::LLMCompiledModel>& compiled_model)
: ov::ISyncInferRequest(compiled_model),
m_npuw_llm_compiled_model(compiled_model) {
@@ -291,10 +332,12 @@ ov::npuw::LLMInferRequest::LLMInferRequest(const std::shared_ptr<ov::npuw::LLMCo
m_kvcache_out_ports.emplace(output_port.get_any_name(), output_port);
}

m_pre_alloc_device = init_pre_alloc_device();
init_lora_states();

const bool use_chunk_prefill = m_npuw_llm_compiled_model->m_use_chunk_prefill;
if (use_chunk_prefill) {
bind_past_kv();
clear_chunk_prefill_kv_cache();
}

@@ -370,16 +413,6 @@ void ov::npuw::LLMInferRequest::init_tensor(const ov::Output<const ov::Node>& po
void ov::npuw::LLMInferRequest::apply_lora() {
uint32_t max_low_rank_dim_size = m_npuw_llm_compiled_model->m_max_lora_rank;

bool pre_alloc_on_npu = true;
const auto& prefill_compiled = m_npuw_llm_compiled_model->m_prefill_compiled;
for (std::size_t idx = 0; idx < prefill_compiled->m_compiled_submodels.size(); ++idx) {
if (prefill_compiled->submodel_device(idx) != "NPU") {
pre_alloc_on_npu = false;
break;
}
}
std::string device = pre_alloc_on_npu ? "NPU" : "CPU";

for (auto state : m_variableStates) {
auto state_name = state->get_name();
auto state_tensor = state->get_state();
@@ -426,7 +459,7 @@ void ov::npuw::LLMInferRequest::apply_lora() {
auto prefill_lora_in_tensor = m_prefill_request->get_tensor(m_prefill_in_ports.at(state_name));
auto new_infer_tensor = ov::npuw::util::allocMem(prefill_lora_in_tensor->get_element_type(),
prefill_lora_in_tensor->get_shape(),
device,
m_pre_alloc_device,
m_npuw_llm_compiled_model->get_plugin());
bool has_padding = state_tensor_rank != target_lora_rank;
if (has_padding) {
@@ -507,9 +540,25 @@ void ov::npuw::LLMInferRequest::copy_kvcache() {
// tokens_in_past_chunks may be 0 when a short prompt is prefilled in a single chunk
auto tokens_in_past_chunks = kvcache_desc.num_stored_tokens - tokens_in_present_chunk;
if (tokens_in_past_chunks > 0) {
// Create backup of past KV tensor when buffer sharing is enabled to prevent data corruption
// This is necessary because subsequent copy operations would overwrite the shared buffer
auto prefill_past_kv = m_prefill_request->get_tensor(m_prefill_in_ports.at(input_name));
auto prefill_past_kv_chunks =
make_tensor_slice(prefill_past_kv, kv_dim, 0u, static_cast<uint32_t>(tokens_in_past_chunks));
ov::SoPtr<ov::ITensor> tmp_dense_kv_tensor;
ov::SoPtr<ov::ITensor> prefill_past_kv_chunks;
if (m_past_kv_bound) {
tmp_dense_kv_tensor = ov::npuw::util::allocMem(prefill_past_kv->get_element_type(),
prefill_past_kv->get_shape(),
m_pre_alloc_device,
m_npuw_llm_compiled_model->get_plugin());
prefill_past_kv->copy_to(tmp_dense_kv_tensor._ptr);
prefill_past_kv_chunks = make_tensor_slice(tmp_dense_kv_tensor,
kv_dim,
0u,
static_cast<uint32_t>(tokens_in_past_chunks));
} else {
prefill_past_kv_chunks =
make_tensor_slice(prefill_past_kv, kv_dim, 0u, static_cast<uint32_t>(tokens_in_past_chunks));
}

auto kvcache_past_kv_chunks =
make_tensor_slice(kvcache_in_tensor, kv_dim, 0u, static_cast<uint32_t>(tokens_in_past_chunks));
5 changes: 5 additions & 0 deletions src/plugins/intel_npu/src/plugin/npuw/llm_infer_request.hpp
@@ -91,6 +91,11 @@ class LLMInferRequest final : public ov::ISyncInferRequest {
// Support LoRA
std::vector<ov::SoPtr<ov::IVariableState>> m_variableStates;
void init_lora_states();

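// Share past_key_values buffers between the prefill and KV-cache infer requests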
bool m_past_kv_bound = false;
void bind_past_kv();
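// Device on which pre-allocated tensors are placed (NPU if any KV-cache submodel runs on NPU, otherwise CPU)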
std::string m_pre_alloc_device = "CPU";
std::string init_pre_alloc_device();
};

} // namespace npuw