From 162c2e6b427323bd336ba99a93f8f2e6b084adb4 Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Mon, 11 Aug 2025 09:44:39 +0300 Subject: [PATCH 1/2] kv-cache : fix seq_rm with seq_id == -1 ggml-ci --- src/llama-kv-cache-unified.cpp | 50 +++++++++++++++++++++------------- 1 file changed, 31 insertions(+), 19 deletions(-) diff --git a/src/llama-kv-cache-unified.cpp b/src/llama-kv-cache-unified.cpp index e539142e6b8..b0a6c320158 100644 --- a/src/llama-kv-cache-unified.cpp +++ b/src/llama-kv-cache-unified.cpp @@ -223,12 +223,7 @@ void llama_kv_cache_unified::clear(bool data) { } bool llama_kv_cache_unified::seq_rm(llama_seq_id seq_id, llama_pos p0, llama_pos p1) { - GGML_ASSERT(seq_id >= 0 && (size_t) seq_id < seq_to_stream.size()); - - auto & cells = v_cells[seq_to_stream[seq_id]]; - auto & head = v_heads[seq_to_stream[seq_id]]; - - uint32_t new_head = cells.size(); + GGML_ASSERT(seq_id == -1 || (seq_id >= 0 && (size_t) seq_id < seq_to_stream.size())); if (p0 < 0) { p0 = 0; @@ -239,6 +234,11 @@ bool llama_kv_cache_unified::seq_rm(llama_seq_id seq_id, llama_pos p0, llama_pos } if (seq_id >= 0) { + auto & cells = v_cells[seq_to_stream[seq_id]]; + auto & head = v_heads[seq_to_stream[seq_id]]; + + uint32_t new_head = cells.size(); + for (uint32_t i = 0; i < cells.size(); ++i) { if (!cells.pos_in(i, p0, p1)) { continue; @@ -250,24 +250,36 @@ bool llama_kv_cache_unified::seq_rm(llama_seq_id seq_id, llama_pos p0, llama_pos } } } + + // If we freed up a slot, set head to it so searching can start there. + if (new_head != cells.size() && new_head < head) { + head = new_head; + } } else { - // match any sequence - for (uint32_t i = 0; i < cells.size(); ++i) { - if (!cells.pos_in(i, p0, p1)) { - continue; - } + for (seq_id = 0; seq_id < (int) seq_to_stream.size(); ++seq_id) { + auto & cells = v_cells[seq_to_stream[seq_id]]; + auto & head = v_heads[seq_to_stream[seq_id]]; - cells.rm(i); + uint32_t new_head = cells.size(); - if (new_head == cells.size()) { - new_head = i; + // match any sequence + for (uint32_t i = 0; i < cells.size(); ++i) { + if (!cells.pos_in(i, p0, p1)) { + continue; + } + + cells.rm(i); + + if (new_head == cells.size()) { + new_head = i; + } } - } - } - // If we freed up a slot, set head to it so searching can start there. - if (new_head != cells.size() && new_head < head) { - head = new_head; + // If we freed up a slot, set head to it so searching can start there. + if (new_head != cells.size() && new_head < head) { + head = new_head; + } + } } return true; From b2b03356a1b994a87b8a00ed81307d75bdb8baa1 Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Mon, 11 Aug 2025 11:37:46 +0300 Subject: [PATCH 2/2] cont : iterate over streams ggml-ci --- src/llama-kv-cache-unified.cpp | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/src/llama-kv-cache-unified.cpp b/src/llama-kv-cache-unified.cpp index b0a6c320158..7c9b342d105 100644 --- a/src/llama-kv-cache-unified.cpp +++ b/src/llama-kv-cache-unified.cpp @@ -256,13 +256,13 @@ bool llama_kv_cache_unified::seq_rm(llama_seq_id seq_id, llama_pos p0, llama_pos head = new_head; } } else { - for (seq_id = 0; seq_id < (int) seq_to_stream.size(); ++seq_id) { - auto & cells = v_cells[seq_to_stream[seq_id]]; - auto & head = v_heads[seq_to_stream[seq_id]]; + // match any sequence + for (uint32_t s = 0; s < n_stream; ++s) { + auto & cells = v_cells[s]; + auto & head = v_heads[s]; uint32_t new_head = cells.size(); - // match any sequence for (uint32_t i = 0; i < cells.size(); ++i) { if (!cells.pos_in(i, p0, p1)) { continue;