kv-cells : fix tracking of seq_pos during cache reuse

ggerganov · ggerganov · commit 439d562a56f4 · 2025-06-23T07:59:38.000+03:00
ggml-ci
diff --git a/src/llama-batch.cpp b/src/llama-batch.cpp
@@ -247,7 +247,8 @@ bool llama_batch_allocr::init(
         if (memory) {
             if (batch.token) {
                 if (seq_pos_min(s) != memory->seq_pos_max(s) + 1) {
-                    LLAMA_LOG_ERROR("%s: sequence %d does not start from the last position stored in the memory\n", __func__, s);
+                    LLAMA_LOG_ERROR("%s: sequence %d (min = %d) does not start from the last position (%d) stored in the memory\n",
+                            __func__, s, seq_pos_min(s), memory->seq_pos_max(s));
                     return false;
                 }
             } else {
@@ -256,7 +257,8 @@ bool llama_batch_allocr::init(
                 // for embeddings (typically used as vision input), we allow them to have repeating positions
                 // ref: https://github.com/ggml-org/llama.cpp/issues/13694#issuecomment-2983871762
                 if (seq_pos_min(s) != memory->seq_pos_max(s) && seq_pos_min(s) != memory->seq_pos_max(s) + 1) {
-                    LLAMA_LOG_ERROR("%s: sequence %d does not start from the last position stored in the memory\n", __func__, s);
+                    LLAMA_LOG_ERROR("%s: sequence %d (min = %d) does not start from the last position (%d) stored in the memory\n",
+                            __func__, s, seq_pos_min(s), memory->seq_pos_max(s));
                     return false;
                 }
             }
diff --git a/src/llama-kv-cells.h b/src/llama-kv-cells.h
@@ -7,6 +7,7 @@
 #include <cassert>
 #include <vector>
 #include <set>
+#include <map>
 
 // meta information about KV cells that can be part of multiple sequences at the same time
 // TODO: add unit tests
@@ -164,7 +165,7 @@ class llama_kv_cells_unified {
         assert(seq_id >= 0);
 
         seq[i].reset(seq_id);
-        seq_pos[seq_id].erase(pos[i]);
+        seq_pos_dec(seq_id, pos[i]);
 
         if (seq[i].none()) {
             pos[i] = -1;
@@ -187,7 +188,7 @@ class llama_kv_cells_unified {
             seq[i].reset();
 
             seq[i].set(seq_id);
-            seq_pos[seq_id].insert(pos[i]);
+            seq_pos_inc(seq_id, pos[i]);
 
             return false;
         }
@@ -232,7 +233,7 @@ class llama_kv_cells_unified {
         assert(!seq[i].test(seq_id));
 
         seq[i].set(seq_id);
-        seq_pos[seq_id].insert(pos[i]);
+        seq_pos_inc(seq_id, pos[i]);
     }
 
     // return the sequence id of this cell
@@ -259,7 +260,9 @@ class llama_kv_cells_unified {
             return -1;
         }
 
-        return *seq_pos[seq_id].begin();
+        assert(seq_pos[seq_id].begin()->second > 0);
+
+        return seq_pos[seq_id].begin()->first;
     }
 
     // the maximum position of sequence seq_id currently present in any of the cells
@@ -272,7 +275,9 @@ class llama_kv_cells_unified {
             return -1;
         }
 
-        return *seq_pos[seq_id].rbegin();
+        assert(seq_pos[seq_id].rbegin()->second > 0);
+
+        return seq_pos[seq_id].rbegin()->first;
     }
 
     // note: call only if the cell is not empty
@@ -391,15 +396,31 @@ class llama_kv_cells_unified {
 
     // the set seq_pos[s] tells us which positions are currently present for sequence s
     // this way seq_pos[s].begin() and seq_pos[s].rbegin() give us the min/max positions currently in the cache
-    std::set<llama_pos> seq_pos[LLAMA_MAX_SEQ];
+    //
+    // note that we cannot use an std::set because in some cases a position can be present more than once for the same seq:
+    //   - while performing a cache reuse via (add + rm)
+    //   - some vision models have input embeddings with repeating positions
+    std::map<llama_pos, int> seq_pos[LLAMA_MAX_SEQ];
 
     // helper functions for updating `seq_pos`, once cell at a time:
 
+    void seq_pos_dec(llama_seq_id s, llama_pos p) {
+        auto it = seq_pos[s].find(p);
+        assert(it != seq_pos[s].end());
+        if (--it->second == 0) {
+            seq_pos[s].erase(it);
+        }
+    }
+
+    void seq_pos_inc(llama_seq_id s, llama_pos p) {
+        seq_pos[s][p]++;
+    }
+
     // remove cell i
     void seq_pos_rm(uint32_t i) {
         for (int s = 0; s < LLAMA_MAX_SEQ; ++s) {
             if (seq[i].test(s)) {
-                seq_pos[s].erase(pos[i]);
+                seq_pos_dec(s, pos[i]);
             }
         }
     }
@@ -408,7 +429,7 @@ class llama_kv_cells_unified {
     void seq_pos_add(uint32_t i) {
         for (int s = 0; s < LLAMA_MAX_SEQ; ++s) {
             if (seq[i].test(s)) {
-                seq_pos[s].insert(pos[i]);
+                seq_pos_inc(s, pos[i]);
             }
         }
     }

Original file line number	Diff line number	Diff line change
`@@ -247,7 +247,8 @@ bool llama_batch_allocr::init(`
`247`	`247`	`if (memory) {`
`248`	`248`	`if (batch.token) {`
`249`	`249`	`if (seq_pos_min(s) != memory->seq_pos_max(s) + 1) {`
`250`		`- LLAMA_LOG_ERROR("%s: sequence %d does not start from the last position stored in the memory\n", __func__, s);`
	`250`	`+ LLAMA_LOG_ERROR("%s: sequence %d (min = %d) does not start from the last position (%d) stored in the memory\n",`
	`251`	`+ __func__, s, seq_pos_min(s), memory->seq_pos_max(s));`
`251`	`252`	`return false;`
`252`	`253`	`}`
`253`	`254`	`} else {`
`@@ -256,7 +257,8 @@ bool llama_batch_allocr::init(`
`256`	`257`	`// for embeddings (typically used as vision input), we allow them to have repeating positions`
`257`	`258`	`// ref: https://github.com/ggml-org/llama.cpp/issues/13694#issuecomment-2983871762`
`258`	`259`	`if (seq_pos_min(s) != memory->seq_pos_max(s) && seq_pos_min(s) != memory->seq_pos_max(s) + 1) {`
`259`		`- LLAMA_LOG_ERROR("%s: sequence %d does not start from the last position stored in the memory\n", __func__, s);`
	`260`	`+ LLAMA_LOG_ERROR("%s: sequence %d (min = %d) does not start from the last position (%d) stored in the memory\n",`
	`261`	`+ __func__, s, seq_pos_min(s), memory->seq_pos_max(s));`
`260`	`262`	`return false;`
`261`	`263`	`}`
`262`	`264`	`}`