
Commit a1c6dc2

Update llama.cpp API and supplement the State/sessions API

Signed-off-by: JamePeng <jame_peng@sina.com>
1 parent: 914893d

2 files changed: +88 -5 lines changed

llama_cpp/_internals.py

Lines changed: 80 additions & 4 deletions
@@ -347,16 +347,92 @@ def memory_seq_pos_max(self, seq_id: int) -> int:
     def memory_seq_pos_min(self, seq_id: int) -> int:
         return llama_cpp.llama_memory_seq_pos_min(self.get_memory(), seq_id)
 
+    # // State / sessions API
+
     def get_state_size(self) -> int:
         return llama_cpp.llama_state_get_size(self.ctx)
 
-    # TODO: copy_state_data
+    def get_state_data(self, dst: ctypes.Array[ctypes.c_uint8], size: int) -> int:
+        return llama_cpp.llama_state_get_data(self.ctx, dst, size)
+
+    def set_state_data(self, src: ctypes.Array[ctypes.c_uint8], size: int) -> int:
+        return llama_cpp.llama_state_set_data(self.ctx, src, size)
+
+    def load_state_file(
+        self,
+        path_session: bytes,
+        tokens_out: ctypes.Array[llama_cpp.llama_token],
+        n_token_capacity: ctypes.c_size_t,
+        n_token_count_out: ctypes.pointer(ctypes.c_size_t)
+    ) -> bool:
+        return llama_cpp.llama_state_load_file(self.ctx, path_session, tokens_out, n_token_capacity, n_token_count_out)
+
+    def save_state_file(
+        self,
+        path_session: bytes,
+        tokens: ctypes.Array[llama_cpp.llama_token],
+        n_token_count: ctypes.c_size_t
+    ) -> bool:
+        return llama_cpp.llama_state_save_file(self.ctx, path_session, tokens, n_token_count)
+
+    def get_state_seq_size(self, seq_id: int) -> int:
+        return llama_cpp.llama_state_seq_get_size(self.ctx, seq_id)
+
+    def get_state_seq_data(self, dst: ctypes.Array[ctypes.c_uint8], size: int, seq_id: int) -> int:
+        return llama_cpp.llama_state_seq_get_data(self.ctx, dst, size, seq_id)
+
+    def set_state_seq_data(self, src: ctypes.Array[ctypes.c_uint8], size: int, dest_seq_id: int) -> int:
+        return llama_cpp.llama_state_seq_set_data(self.ctx, src, size, dest_seq_id)
+
+    def load_state_seq_file(
+        self,
+        filepath: bytes,
+        dest_seq_id: int,
+        tokens_out: ctypes.Array[llama_cpp.llama_token],
+        n_token_capacity: ctypes.c_size_t,
+        n_token_count_out: ctypes.pointer(ctypes.c_size_t)
+    ) -> int:
+        return llama_cpp.llama_state_seq_load_file(self.ctx, filepath, dest_seq_id, tokens_out, n_token_capacity, n_token_count_out)
+
+    def save_state_seq_file(
+        self,
+        filepath: bytes,
+        seq_id: int,
+        tokens: ctypes.Array[llama_cpp.llama_token],
+        n_token_count: ctypes.c_size_t
+    ) -> int:
+        return llama_cpp.llama_state_seq_save_file(self.ctx, filepath, seq_id, tokens, n_token_count)
+
+    def get_state_seq_size_ext(self, seq_id: int, flags: llama_cpp.llama_state_seq_flags) -> int:
+        return llama_cpp.llama_state_seq_get_size_ext(self.ctx, seq_id, flags)
+
+    def get_state_seq_data_ext(
+        self,
+        dst: ctypes.Array[ctypes.c_uint8],
+        size: int,
+        seq_id: int,
+        flags: llama_cpp.llama_state_seq_flags
+    ) -> int:
+        return llama_cpp.llama_state_seq_get_data_ext(self.ctx, dst, size, seq_id, flags)
 
-    # TODO: set_state_data
+    def set_state_seq_data_ext(
+        self,
+        src: ctypes.Array[ctypes.c_uint8],
+        size: int,
+        dest_seq_id: int,
+        flags: llama_cpp.llama_state_seq_flags
+    ) -> int:
+        return llama_cpp.llama_state_seq_set_data_ext(self.ctx, src, size, dest_seq_id, flags)
 
-    # TODO: llama_load_session_file
+    # // Decoding API
 
-    # TODO: llama_save_session_file
+    def encode(self, batch: LlamaBatch):
+        return_code = llama_cpp.llama_encode(
+            self.ctx,
+            batch.batch,
+        )
+        if return_code != 0:
+            raise RuntimeError(f"llama_encode returned {return_code}")
 
     def decode(self, batch: LlamaBatch):
         return_code = llama_cpp.llama_decode(
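The new whole-context pair follows the usual two-step ctypes pattern: query the size, allocate a byte buffer of exactly that size, then copy. A minimal sketch of a snapshot-and-rollback round trip, assuming `ctx` is an instance of the wrapper class patched above (the variable name and surrounding setup are illustrative, not part of the commit):

import ctypes

# Snapshot the full context state into a Python-owned buffer.
size = ctx.get_state_size()              # bytes required for the snapshot
buf = (ctypes.c_uint8 * size)()          # zero-initialized ctypes byte array
written = ctx.get_state_data(buf, size)  # number of bytes copied out

# ... decode further tokens, then roll the context back ...
read = ctx.set_state_data(buf, written)  # number of bytes copied back in
assert read == written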

llama_cpp/llama_cpp.py

Lines changed: 8 additions & 1 deletion
@@ -1403,6 +1403,7 @@ def llama_supports_rpc() -> bool:
 
 # // NOTE: After creating a llama_context, it is recommended to query the actual values using these functions
 # // In some cases the requested values via llama_context_params may differ from the actual values used by the context
+# // ref: https://github.com/ggml-org/llama.cpp/pull/17046#discussion_r2503085732
 # LLAMA_API uint32_t llama_n_ctx (const struct llama_context * ctx);
 @ctypes_function("llama_n_ctx", [llama_context_p_ctypes], ctypes.c_uint32)
 def llama_n_ctx(ctx: llama_context_p, /) -> int:
@@ -1503,6 +1504,12 @@ def llama_model_n_embd(model: llama_model_p, /) -> int:
     ...
 
 
+# LLAMA_API int32_t llama_model_n_embd_inp (const struct llama_model * model);
+@ctypes_function("llama_model_n_embd_inp", [llama_model_p_ctypes], ctypes.c_int32)
+def llama_model_n_embd_inp(model: llama_model_p, /) -> int:
+    ...
+
+
 # LLAMA_API int32_t llama_model_n_layer (const struct llama_model * model);
 @ctypes_function("llama_model_n_layer", [llama_model_p_ctypes], ctypes.c_int32)
 def llama_model_n_layer(model: llama_model_p, /) -> int:
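For readers unfamiliar with the `@ctypes_function` helper, it looks up the named symbol on the shared library, registers the given argtypes and restype, and leaves the `...` stub as documentation. A rough plain-ctypes equivalent of the new `llama_model_n_embd_inp` binding, with the library path and the opaque pointer stand-in assumed for illustration:

import ctypes

# Rough plain-ctypes equivalent of the decorated stub above.
lib = ctypes.CDLL("libllama.so")  # library path is an assumption
llama_model_p = ctypes.c_void_p   # stand-in for the opaque `struct llama_model *`

lib.llama_model_n_embd_inp.argtypes = [llama_model_p]
lib.llama_model_n_embd_inp.restype = ctypes.c_int32

# Usage, given a loaded model handle:
# n_embd_inp = lib.llama_model_n_embd_inp(model)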
@@ -2440,7 +2447,7 @@ def llama_save_session_file(
 @ctypes_function(
     "llama_state_seq_get_size",
     [llama_context_p_ctypes, llama_seq_id],
-    ctypes.c_size_t,
+    llama_seq_id,
 )
 def llama_state_seq_get_size(ctx: llama_context_p, seq_id: llama_seq_id, /) -> int:
     """Get the exact size needed to copy the state of a single sequence"""
