diff --git a/examples/main/main.cpp b/examples/main/main.cpp
index 125c189a38b34..5bba1ef32c5a6 100644
--- a/examples/main/main.cpp
+++ b/examples/main/main.cpp
@@ -140,9 +140,12 @@ int main(int argc, char ** argv) {
     // Add a space in front of the first character to match OG llama tokenizer behavior
     params.prompt.insert(0, 1, ' ');
 
-    std::string path_session = params.path_session;
-    std::vector<llama_token> session_tokens;
+    // tokenize the prompt
+    auto embd_inp = ::llama_tokenize(ctx, params.prompt, true);
 
+    // restore prompt from saved session
+    const std::string path_session = params.path_session;
+    int n_matching_session_tokens = 0;
     if (!path_session.empty()) {
         fprintf(stderr, "%s: attempting to load saved session from '%s'\n", __func__, path_session.c_str());
 
@@ -151,7 +154,7 @@ int main(int argc, char ** argv) {
         if (fp != NULL) {
             std::fclose(fp);
 
-            session_tokens.resize(params.n_ctx);
+            std::vector<llama_token> session_tokens(embd_inp.size());
             size_t n_token_count_out = 0;
             if (!llama_load_session_file(ctx, path_session.c_str(), session_tokens.data(), session_tokens.capacity(), &n_token_count_out)) {
                 fprintf(stderr, "%s: error: failed to load session file '%s'\n", __func__, path_session.c_str());
@@ -159,15 +162,28 @@
             }
             session_tokens.resize(n_token_count_out);
 
-            fprintf(stderr, "%s: loaded a session with prompt size of %d tokens\n", __func__, (int) session_tokens.size());
+            // find matching input prefix from saved session
+            for (llama_token id : session_tokens) {
+                if (n_matching_session_tokens >= (int) embd_inp.size() || id != embd_inp[n_matching_session_tokens]) {
+                    break;
+                }
+                n_matching_session_tokens++;
+            }
+
+            if (n_matching_session_tokens >= (int) embd_inp.size()) {
+                fprintf(stderr, "%s: session file has exact match for prompt!\n", __func__);
+            } else if (n_matching_session_tokens < (int) (embd_inp.size() / 2)) {
+                fprintf(stderr, "%s: warning: session file has low similarity to prompt (%d / %zu tokens); will mostly be reevaluated\n",
+                    __func__, n_matching_session_tokens, embd_inp.size());
+            } else {
+                fprintf(stderr, "%s: session file matches %d / %zu tokens of prompt\n",
+                    __func__, n_matching_session_tokens, embd_inp.size());
+            }
         } else {
             fprintf(stderr, "%s: session file does not exist, will create\n", __func__);
         }
     }
 
-    // tokenize the prompt
-    auto embd_inp = ::llama_tokenize(ctx, params.prompt, true);
-
     const int n_ctx = llama_n_ctx(ctx);
 
     if ((int) embd_inp.size() > n_ctx - 4) {
@@ -175,25 +191,6 @@ int main(int argc, char ** argv) {
         return 1;
     }
 
-    // debug message about similarity of saved session, if applicable
-    size_t n_matching_session_tokens = 0;
-    if (session_tokens.size()) {
-        for (llama_token id : session_tokens) {
-            if (n_matching_session_tokens >= embd_inp.size() || id != embd_inp[n_matching_session_tokens]) {
-                break;
-            }
-            n_matching_session_tokens++;
-        }
-        if (n_matching_session_tokens >= embd_inp.size()) {
-            fprintf(stderr, "%s: session file has exact match for prompt!\n", __func__);
-        } else if (n_matching_session_tokens < (embd_inp.size() / 2)) {
-            fprintf(stderr, "%s: warning: session file has low similarity to prompt (%zu / %zu tokens); will mostly be reevaluated\n",
-                __func__, n_matching_session_tokens, embd_inp.size());
-        } else {
-            fprintf(stderr, "%s: session file matches %zu / %zu tokens of prompt\n",
-                __func__, n_matching_session_tokens, embd_inp.size());
-        }
-    }
 
     // number of tokens to keep when resetting context
     if (params.n_keep < 0 || params.n_keep > (int) embd_inp.size() || params.instruct) {
@@ -283,16 +280,11 @@ int main(int argc, char ** argv) {
     bool is_antiprompt = false;
     bool input_echo = true;
 
-    // HACK - because session saving incurs a non-negligible delay, for now skip re-saving session
-    // if we loaded a session with at least 75% similarity. It's currently just used to speed up the
-    // initial prompt so it doesn't need to be an exact match.
-    bool need_to_save_session = !path_session.empty() && n_matching_session_tokens < (embd_inp.size() * 3 / 4);
-
-
-    int n_past             = 0;
-    int n_remain           = params.n_predict;
-    int n_consumed         = 0;
-    int n_session_consumed = 0;
+    int n_past               = 0;
+    int n_remain             = params.n_predict;
+    int n_consumed           = 0;
+    int n_session_consumed   = 0;
+    int n_session_write_past = 0;
 
     // the first thing we will do is to output the prompt, so set color accordingly
     set_console_color(con_st, CONSOLE_COLOR_PROMPT);
@@ -306,7 +298,8 @@ int main(int argc, char ** argv) {
             // if we run out of context:
             // - take the n_keep first tokens from the original prompt (via n_past)
            // - take half of the last (n_ctx - n_keep) tokens and recompute the logits in batches
-            if (n_past + (int) embd.size() > n_ctx) {
+            bool needs_swap = n_past + (int) embd.size() > n_ctx;
+            if (needs_swap) {
                 const int n_left = n_past - params.n_keep;
 
                 n_past = params.n_keep;
@@ -314,9 +307,6 @@ int main(int argc, char ** argv) {
                 // insert n_left/2 tokens at the start of embd from last_n_tokens
                 embd.insert(embd.begin(), last_n_tokens.begin() + n_ctx - n_left/2 - embd.size(), last_n_tokens.end() - embd.size());
 
-                // stop saving session if we run out of context
-                path_session = "";
-
                 //printf("\n---\n");
                 //printf("resetting: '");
                 //for (int i = 0; i < (int) embd.size(); i++) {
@@ -326,27 +316,12 @@ int main(int argc, char ** argv) {
                 //printf("\n---\n");
             }
 
-            // try to reuse a matching prefix from the loaded session instead of re-eval (via n_past)
-            // REVIEW
-            if (n_session_consumed < (int) session_tokens.size()) {
-                size_t i = 0;
-                for ( ; i < embd.size(); i++) {
-                    if (embd[i] != session_tokens[n_session_consumed]) {
-                        session_tokens.resize(n_session_consumed);
-                        break;
-                    }
-
-                    n_past++;
-                    n_session_consumed++;
-
-                    if (n_session_consumed >= (int) session_tokens.size()) {
-                        ++i;
-                        break;
-                    }
-                }
-                if (i > 0) {
-                    embd.erase(embd.begin(), embd.begin() + i);
-                }
+            // skip evaluation of tokens in the input prefix that matched session
+            if (n_session_consumed < n_matching_session_tokens) {
+                int n_skip = std::min((int) embd.size(), n_matching_session_tokens - n_session_consumed);
+                embd.erase(embd.begin(), embd.begin() + n_skip);
+                n_session_consumed += n_skip;
+                n_past += n_skip;
             }
 
             // evaluate tokens in batches
@@ -363,14 +338,42 @@ int main(int argc, char ** argv) {
                 n_past += n_eval;
             }
 
-            if (embd.size() > 0 && !path_session.empty()) {
-                session_tokens.insert(session_tokens.end(), embd.begin(), embd.end());
-                n_session_consumed = session_tokens.size();
+            // save session after context swap
+            if (!path_session.empty() && needs_swap) {
+                int n_tokens = n_past - params.n_keep;
+                if (!llama_append_session_file(
+                        ctx, path_session.c_str(), params.n_keep,
+                        last_n_tokens.data() + last_n_tokens.size() - n_tokens, n_tokens)) {
+                    fprintf(stderr, "%s: error: unable to write to session file '%s'\n",
+                        __func__, path_session.c_str());
+                    return 1;
+                }
+
+                n_session_write_past = n_past;
             }
         }
 
         embd.clear();
 
+        // save prompt evaluation state to session file
+        if (!path_session.empty() && !n_session_write_past && (int) embd_inp.size() <= n_consumed) {
+            if (!llama_init_session_file(ctx, path_session.c_str())) {
+                fprintf(stderr, "%s: error: unable to start session file '%s'\n",
+                    __func__, path_session.c_str());
+                return 1;
+            }
+
+            if (!llama_append_session_file(
+                    ctx, path_session.c_str(), 0,
+                    last_n_tokens.data() + last_n_tokens.size() - n_past, n_past)) {
+                fprintf(stderr, "%s: error: unable to write to session file '%s'\n",
+                    __func__, path_session.c_str());
+                return 1;
+            }
+
+            n_session_write_past = n_past;
+        }
+
         if ((int) embd_inp.size() <= n_consumed && !is_interacting) {
             // out of user input, sample next token
             const float temp = params.temp;
@@ -387,12 +390,6 @@ int main(int argc, char ** argv) {
             const float mirostat_eta = params.mirostat_eta;
             const bool penalize_nl = params.penalize_nl;
 
-            // optionally save the session on first sample (for faster prompt loading next time)
-            if (!path_session.empty() && need_to_save_session) {
-                need_to_save_session = false;
-                llama_save_session_file(ctx, path_session.c_str(), session_tokens.data(), session_tokens.size());
-            }
-
             llama_token id = 0;
 
             {
@@ -608,6 +605,20 @@ int main(int argc, char ** argv) {
         }
     }
 
+    if (!path_session.empty()) {
+        int n_session_remain = n_past - n_session_write_past;
+        fprintf(stderr, "\n%s: saving remaining state (%d tokens) to session file '%s'",
+            __func__, n_session_remain, path_session.c_str());
+        if (!llama_append_session_file(
+                ctx, path_session.c_str(), n_session_write_past,
+                last_n_tokens.data() + last_n_tokens.size() - embd.size() - n_session_remain,
+                n_session_remain)) {
+            fprintf(stderr, "%s: error: unable to write to session file '%s'\n",
+                __func__, path_session.c_str());
+            return 1;
+        }
+    }
+
     llama_print_timings(ctx);
     llama_free(ctx);
 
diff --git a/examples/save-load-state/save-load-state.cpp b/examples/save-load-state/save-load-state.cpp
index ea0a984d93816..df18578b0375d 100644
--- a/examples/save-load-state/save-load-state.cpp
+++ b/examples/save-load-state/save-load-state.cpp
@@ -58,7 +58,7 @@ int main(int argc, char ** argv) {
     // Save state (rng, logits, embedding and kv_cache) to file
     {
         FILE *fp_write = fopen("dump_state.bin", "wb");
-        llama_copy_state_data(ctx, state_mem); // could also copy directly to memory mapped file
+        llama_copy_state_data(ctx, state_mem, 0); // could also copy directly to memory mapped file
         fwrite(state_mem, 1, state_size, fp_write);
         fclose(fp_write);
     }
diff --git a/llama.cpp b/llama.cpp
index 85af4dc4930dc..bb7b22581e248 100644
--- a/llama.cpp
+++ b/llama.cpp
@@ -2436,7 +2436,7 @@ size_t llama_get_state_size(const struct llama_context * ctx) {
 }
 
 // Copies the state to the specified destination address
-size_t llama_copy_state_data(struct llama_context * ctx, uint8_t * dest) {
+size_t llama_copy_state_data(struct llama_context * ctx, uint8_t * dest, int n_token_offset) {
     uint8_t * out = dest;
 
     // copy rng
@@ -2492,31 +2492,38 @@ size_t llama_copy_state_data(struct llama_context * ctx, uint8_t * dest) {
         const size_t kv_size = kv_self.buf.size;
         const int    kv_ntok = llama_get_kv_cache_token_count(ctx);
 
-        memcpy(out, &kv_size, sizeof(kv_size)); out += sizeof(kv_size);
-        memcpy(out, &kv_ntok, sizeof(kv_ntok)); out += sizeof(kv_ntok);
+        memcpy(out, &kv_size,        sizeof(kv_size));        out += sizeof(kv_size);
+        memcpy(out, &kv_ntok,        sizeof(kv_ntok));        out += sizeof(kv_ntok);
+        memcpy(out, &n_token_offset, sizeof(n_token_offset)); out += sizeof(n_token_offset);
 
-        if (kv_size) {
+        LLAMA_ASSERT(n_token_offset <= kv_ntok);
+
+        if (kv_size && n_token_offset < kv_ntok) {
+            const int n_tokens = kv_ntok - n_token_offset;
             const size_t elt_size = ggml_element_size(kv_self.k);
+
             char buffer[4096];
             ggml_context * cpy_ctx = ggml_init({ sizeof(buffer), buffer, /* no_alloc */ true });
             ggml_cgraph gf{};
             gf.n_threads = 1;
 
-            ggml_tensor * kout3d = ggml_new_tensor_3d(cpy_ctx, kv_self.k->type, n_embd, kv_ntok, n_layer);
+            ggml_tensor * kout3d = ggml_new_tensor_3d(cpy_ctx, kv_self.k->type, n_embd, n_tokens, n_layer);
             kout3d->data = out; out += ggml_nbytes(kout3d);
 
-            ggml_tensor * vout3d = ggml_new_tensor_3d(cpy_ctx, kv_self.v->type, kv_ntok, n_embd, n_layer);
+            ggml_tensor * vout3d = ggml_new_tensor_3d(cpy_ctx, kv_self.v->type, n_tokens, n_embd, n_layer);
             vout3d->data = out; out += ggml_nbytes(vout3d);
 
             ggml_tensor * k3d = ggml_view_3d(cpy_ctx, kv_self.k,
-                n_embd, kv_ntok, n_layer,
-                elt_size*n_embd, elt_size*n_embd*n_ctx, 0);
+                n_embd, n_tokens, n_layer,
+                elt_size*n_embd, elt_size*n_embd*n_ctx,
+                elt_size*n_embd*n_token_offset);
 
             ggml_tensor * v3d = ggml_view_3d(cpy_ctx, kv_self.v,
-                kv_ntok, n_embd, n_layer,
-                elt_size*n_ctx, elt_size*n_ctx*n_embd, 0);
+                n_tokens, n_embd, n_layer,
+                elt_size*n_ctx, elt_size*n_ctx*n_embd,
+                elt_size*n_token_offset);
 
             ggml_build_forward_expand(&gf, ggml_cpy(cpy_ctx, k3d, kout3d));
             ggml_build_forward_expand(&gf, ggml_cpy(cpy_ctx, v3d, vout3d));
@@ -2593,34 +2600,42 @@ size_t llama_set_state_data(struct llama_context * ctx, const uint8_t * src) {
         size_t kv_size;
         int kv_ntok;
+        int n_token_offset;
 
-        memcpy(&kv_size, in, sizeof(kv_size)); in += sizeof(kv_size);
-        memcpy(&kv_ntok, in, sizeof(kv_ntok)); in += sizeof(kv_ntok);
+        memcpy(&kv_size,        in, sizeof(kv_size));        in += sizeof(kv_size);
+        memcpy(&kv_ntok,        in, sizeof(kv_ntok));        in += sizeof(kv_ntok);
+        memcpy(&n_token_offset, in, sizeof(n_token_offset)); in += sizeof(n_token_offset);
 
-        if (kv_size) {
+        LLAMA_ASSERT(n_token_offset <= kv_ntok);
+
+        if (kv_size && n_token_offset < kv_ntok) {
             LLAMA_ASSERT(kv_self.buf.size == kv_size);
 
+            const int n_tokens = kv_ntok - n_token_offset;
             const size_t elt_size = ggml_element_size(kv_self.k);
+
             char buffer[4096];
             ggml_context * cpy_ctx = ggml_init({ sizeof(buffer), buffer, /* no_alloc */ true });
             ggml_cgraph gf{};
             gf.n_threads = 1;
 
-            ggml_tensor * kin3d = ggml_new_tensor_3d(cpy_ctx, kv_self.k->type, n_embd, kv_ntok, n_layer);
+            ggml_tensor * kin3d = ggml_new_tensor_3d(cpy_ctx, kv_self.k->type, n_embd, n_tokens, n_layer);
             kin3d->data = (void *) in; in += ggml_nbytes(kin3d);
 
-            ggml_tensor * vin3d = ggml_new_tensor_3d(cpy_ctx, kv_self.v->type, kv_ntok, n_embd, n_layer);
+            ggml_tensor * vin3d = ggml_new_tensor_3d(cpy_ctx, kv_self.v->type, n_tokens, n_embd, n_layer);
             vin3d->data = (void *) in; in += ggml_nbytes(vin3d);
 
             ggml_tensor * k3d = ggml_view_3d(cpy_ctx, kv_self.k,
-                n_embd, kv_ntok, n_layer,
-                elt_size*n_embd, elt_size*n_embd*n_ctx, 0);
+                n_embd, n_tokens, n_layer,
+                elt_size*n_embd, elt_size*n_embd*n_ctx,
+                elt_size*n_embd*n_token_offset);
 
             ggml_tensor * v3d = ggml_view_3d(cpy_ctx, kv_self.v,
-                kv_ntok, n_embd, n_layer,
-                elt_size*n_ctx, elt_size*n_ctx*n_embd, 0);
+                n_tokens, n_embd, n_layer,
+                elt_size*n_ctx, elt_size*n_ctx*n_embd,
+                elt_size*n_token_offset);
 
             ggml_build_forward_expand(&gf, ggml_cpy(cpy_ctx, kin3d, k3d));
             ggml_build_forward_expand(&gf, ggml_cpy(cpy_ctx, vin3d, v3d));
@@ -2638,7 +2653,12 @@ size_t llama_set_state_data(struct llama_context * ctx, const uint8_t * src) {
     return nread;
 }
 
-bool llama_load_session_file(struct llama_context * ctx, const char * path_session, llama_token * tokens_out, size_t n_token_capacity, size_t * n_token_count_out) {
+bool llama_load_session_file(
+        struct llama_context * ctx,
+        const char * path_session,
+        llama_token * tokens_out,
+        size_t n_token_capacity,
+        size_t * n_token_count_out) {
     llama_file file(path_session, "rb");
 
     // sanity checks
@@ -2660,39 +2680,70 @@ bool llama_load_session_file(struct llama_context * ctx, const char * path_sessi
         }
     }
 
-    // load the prompt
-    {
-        const uint32_t n_token_count = file.read_u32();
+    const size_t n_state_size_max = llama_get_state_size(ctx);
+    size_t n_token_count = 0;
 
-        if (n_token_count > n_token_capacity) {
-            fprintf(stderr, "%s : token count in session file exceeded capacity! %u > %zu\n", __func__, n_token_count, n_token_capacity);
-            return false;
-        }
+    std::vector<uint8_t> state_data(n_state_size_max);
 
-        file.read_raw(tokens_out, sizeof(llama_token) * n_token_count);
-        *n_token_count_out = n_token_count;
-    }
+    // read N segments of (tokens + state), until end or tokens_out filled
+    while (file.size > file.tell()) {
 
-    // restore the context state
-    {
-        const size_t n_state_size_cur = file.size - file.tell();
-        const size_t n_state_size_max = llama_get_state_size(ctx);
+        // load the prompt/tokens
+        const uint32_t n_token_segment = file.read_u32();
+        const size_t n_token_read =
+            std::min((size_t) n_token_segment, n_token_capacity - n_token_count);
 
-        if (n_state_size_cur > n_state_size_max) {
-            fprintf(stderr, "%s : the state size in session file is too big! max %zu, got %zu\n", __func__, n_state_size_max, n_state_size_cur);
-            return false;
+        file.read_raw(tokens_out + n_token_count, sizeof(llama_token) * n_token_read);
+
+        n_token_count += n_token_read;
+
+        if (n_token_segment > n_token_read) {
+            const size_t n_token_extra = n_token_segment - n_token_read;
+            file.seek(sizeof(llama_token) * n_token_extra, SEEK_CUR);
         }
 
-        std::vector<uint8_t> state_data(n_state_size_max);
-        file.read_raw(state_data.data(), n_state_size_cur);
+        LLAMA_ASSERT(n_token_count <= n_token_capacity);
+
+        // restore the context state
+        {
+            size_t n_state_size_cur;
+            file.read_raw(&n_state_size_cur, sizeof(n_state_size_cur));
+
+            if (n_state_size_cur > n_state_size_max) {
+                fprintf(stderr, "%s : the state size in session file is too big! max %zu, got %zu\n", __func__, n_state_size_max, n_state_size_cur);
+                return false;
+            }
+
+            file.read_raw(state_data.data(), n_state_size_cur);
+            llama_set_state_data(ctx, state_data.data());
+        }
 
-        llama_set_state_data(ctx, state_data.data());
+        if (n_token_count == n_token_capacity) {
+            // the logits for this segment apply to the last token; if we didn't read a full
+            // segment, move back one token to force an eval to get accurate logits
+            if (n_token_read < n_token_segment) {
+                n_token_count--;
+            }
+            break;
+        }
     }
 
+    *n_token_count_out = n_token_count;
     return true;
 }
 
-bool llama_save_session_file(struct llama_context * ctx, const char * path_session, const llama_token * tokens, size_t n_token_count) {
+bool llama_save_session_file(
+        struct llama_context * ctx,
+        const char * path_session,
+        const llama_token * tokens,
+        size_t n_token_count) {
+    return (
+        llama_init_session_file(ctx, path_session) &&
+        llama_append_session_file(ctx, path_session, 0, tokens, n_token_count)
+    );
+}
+
+bool llama_init_session_file(struct llama_context * ctx, const char * path_session) {
     llama_file file(path_session, "wb");
 
     file.write_u32(LLAMA_SESSION_MAGIC);
@@ -2700,6 +2751,17 @@ bool llama_save_session_file(struct llama_context * ctx, const char * path_sessi
 
     file.write_raw(&ctx->model.hparams, sizeof(llama_hparams));
 
+    return true;
+}
+
+bool llama_append_session_file(
+        struct llama_context * ctx,
+        const char * path_session,
+        int n_token_offset,
+        const llama_token * tokens,
+        size_t n_token_count) {
+    llama_file file(path_session, "ab");
+
     // save the prompt
     file.write_u32((uint32_t) n_token_count);
     file.write_raw(tokens, sizeof(llama_token) * n_token_count);
@@ -2709,8 +2771,9 @@ bool llama_save_session_file(struct llama_context * ctx, const char * path_sessi
         const size_t n_state_size_max = llama_get_state_size(ctx);
 
         std::vector<uint8_t> state_data(n_state_size_max);
-        const size_t n_state_size_cur = llama_copy_state_data(ctx, state_data.data());
+        const size_t n_state_size_cur = llama_copy_state_data(ctx, state_data.data(), n_token_offset);
 
+        file.write_raw(&n_state_size_cur, sizeof(n_state_size_cur));
         file.write_raw(state_data.data(), n_state_size_cur);
     }
diff --git a/llama.h b/llama.h
index e993c464ab10e..2e65fcdee2130 100644
--- a/llama.h
+++ b/llama.h
@@ -23,7 +23,7 @@
 #define LLAMA_FILE_MAGIC             'ggjt'
 #define LLAMA_FILE_MAGIC_UNVERSIONED 'ggml'
 #define LLAMA_SESSION_MAGIC          'ggsn'
-#define LLAMA_SESSION_VERSION        1
+#define LLAMA_SESSION_VERSION        2
 
 #ifdef __cplusplus
 extern "C" {
@@ -134,15 +134,34 @@ extern "C" {
     // Copies the state to the specified destination address.
    // Destination needs to have allocated enough memory.
    // Returns the number of bytes copied
-    LLAMA_API size_t llama_copy_state_data(struct llama_context * ctx, uint8_t * dest);
+    LLAMA_API size_t llama_copy_state_data(struct llama_context * ctx, uint8_t * dest, int n_token_offset);
 
     // Set the state reading from the specified address
     // Returns the number of bytes read
     LLAMA_API size_t llama_set_state_data(struct llama_context * ctx, const uint8_t * src);
 
     // Save/load session file
-    LLAMA_API bool llama_load_session_file(struct llama_context * ctx, const char * path_session, llama_token * tokens_out, size_t n_token_capacity, size_t * n_token_count_out);
-    LLAMA_API bool llama_save_session_file(struct llama_context * ctx, const char * path_session, const llama_token * tokens, size_t n_token_count);
+    LLAMA_API bool llama_load_session_file(
+            struct llama_context * ctx,
+            const char * path_session,
+            llama_token * tokens_out,
+            size_t n_token_capacity,
+            size_t * n_token_count_out);
+
+    LLAMA_API bool llama_save_session_file(
+            struct llama_context * ctx,
+            const char * path_session,
+            const llama_token * tokens,
+            size_t n_token_count);
+
+    LLAMA_API bool llama_init_session_file(struct llama_context * ctx, const char * path_session);
+
+    LLAMA_API bool llama_append_session_file(
+            struct llama_context * ctx,
+            const char * path_session,
+            int n_token_offset,
+            const llama_token * tokens,
+            size_t n_token_count);
 
     // Run the llama inference to obtain the logits and probabilities for the next token.
     // tokens + n_tokens is the provided batch of new tokens to process
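
Usage sketch (illustrative, not part of this patch): with this change a session file is a header (magic, version, hparams) followed by one or more segments, each holding a u32 token count, the tokens themselves, the serialized state size, and a state blob whose KV-cache slice starts at the segment's token offset. The following minimal, hypothetical caller exercises the API declared above; the model path, session path, and the `evaluated`/`n_new` bookkeeping are assumptions made for the example, not code from this diff.

// usage-sketch.cpp -- hypothetical caller of the segmented session API added by this patch.
// Assumed (not from the patch): model path, how evaluated tokens are tracked, error handling.
#include "llama.h"

#include <cstdio>
#include <vector>

int main() {
    llama_context_params lparams = llama_context_default_params();
    llama_context * ctx = llama_init_from_file("models/7B/ggml-model.bin", lparams); // path is an assumption
    if (ctx == NULL) {
        return 1;
    }

    const char * path_session = "prompt.session";

    // restore as many tokens and as much KV state as the file holds (it may contain several segments)
    std::vector<llama_token> session_tokens(lparams.n_ctx);
    size_t n_loaded = 0;
    if (llama_load_session_file(ctx, path_session, session_tokens.data(), session_tokens.size(), &n_loaded)) {
        fprintf(stderr, "restored %zu tokens from '%s'\n", n_loaded, path_session);
    }
    session_tokens.resize(n_loaded);

    // ... evaluate the prompt / generate as usual, appending every evaluated token ...
    std::vector<llama_token> evaluated = session_tokens; // placeholder for the real bookkeeping
    int n_past = (int) evaluated.size();

    // write a fresh session: header first, then one segment covering tokens [0, n_past)
    if (!llama_init_session_file(ctx, path_session) ||
        !llama_append_session_file(ctx, path_session, 0, evaluated.data(), n_past)) {
        fprintf(stderr, "failed to write session file '%s'\n", path_session);
        llama_free(ctx);
        return 1;
    }

    // after evaluating n_new more tokens, only the delta needs to be appended:
    //   llama_append_session_file(ctx, path_session, n_past, evaluated.data() + n_past, n_new);
    // the offset makes llama_copy_state_data serialize just KV rows [n_past, n_past + n_new)

    llama_free(ctx);
    return 0;
}

As in the updated examples/main/main.cpp above, the intended flow is to write the header once with llama_init_session_file and then append segments as evaluation progresses, passing the token offset at which each segment's KV data begins; llama_save_session_file is kept as a convenience wrapper equal to init plus a single append at offset 0.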