diff --git a/examples/main/main.cpp b/examples/main/main.cpp
index 125c189a38b34..5bba1ef32c5a6 100644
--- a/examples/main/main.cpp
+++ b/examples/main/main.cpp
@@ -140,9 +140,12 @@ int main(int argc, char ** argv) {
     // Add a space in front of the first character to match OG llama tokenizer behavior
     params.prompt.insert(0, 1, ' ');
 
-    std::string path_session = params.path_session;
-    std::vector<llama_token> session_tokens;
+    // tokenize the prompt
+    auto embd_inp = ::llama_tokenize(ctx, params.prompt, true);
 
+    // restore prompt from saved session
+    const std::string path_session = params.path_session;
+    int n_matching_session_tokens = 0;
     if (!path_session.empty()) {
         fprintf(stderr, "%s: attempting to load saved session from '%s'\n", __func__, path_session.c_str());
 
@@ -151,7 +154,7 @@ int main(int argc, char ** argv) {
         if (fp != NULL) {
             std::fclose(fp);
 
-            session_tokens.resize(params.n_ctx);
+            std::vector<llama_token> session_tokens(embd_inp.size());
             size_t n_token_count_out = 0;
             if (!llama_load_session_file(ctx, path_session.c_str(), session_tokens.data(), session_tokens.capacity(), &n_token_count_out)) {
                 fprintf(stderr, "%s: error: failed to load session file '%s'\n", __func__, path_session.c_str());
@@ -159,15 +162,28 @@
             }
             session_tokens.resize(n_token_count_out);
 
-            fprintf(stderr, "%s: loaded a session with prompt size of %d tokens\n", __func__, (int) session_tokens.size());
+            // find matching input prefix from saved session
+            for (llama_token id : session_tokens) {
+                if (n_matching_session_tokens >= (int) embd_inp.size() || id != embd_inp[n_matching_session_tokens]) {
+                    break;
+                }
+                n_matching_session_tokens++;
+            }
+
+            if (n_matching_session_tokens >= (int) embd_inp.size()) {
+                fprintf(stderr, "%s: session file has exact match for prompt!\n", __func__);
+            } else if (n_matching_session_tokens < (int) (embd_inp.size() / 2)) {
+                fprintf(stderr, "%s: warning: session file has low similarity to prompt (%d / %zu tokens); will mostly be reevaluated\n",
+                    __func__, n_matching_session_tokens, embd_inp.size());
+            } else {
+                fprintf(stderr, "%s: session file matches %d / %zu tokens of prompt\n",
+                    __func__, n_matching_session_tokens, embd_inp.size());
+            }
         } else {
             fprintf(stderr, "%s: session file does not exist, will create\n", __func__);
         }
     }
 
-    // tokenize the prompt
-    auto embd_inp = ::llama_tokenize(ctx, params.prompt, true);
-
     const int n_ctx = llama_n_ctx(ctx);
 
     if ((int) embd_inp.size() > n_ctx - 4) {
@@ -175,25 +191,6 @@ int main(int argc, char ** argv) {
         return 1;
     }
 
-    // debug message about similarity of saved session, if applicable
-    size_t n_matching_session_tokens = 0;
-    if (session_tokens.size()) {
-        for (llama_token id : session_tokens) {
-            if (n_matching_session_tokens >= embd_inp.size() || id != embd_inp[n_matching_session_tokens]) {
-                break;
-            }
-            n_matching_session_tokens++;
-        }
-        if (n_matching_session_tokens >= embd_inp.size()) {
-            fprintf(stderr, "%s: session file has exact match for prompt!\n", __func__);
-        } else if (n_matching_session_tokens < (embd_inp.size() / 2)) {
-            fprintf(stderr, "%s: warning: session file has low similarity to prompt (%zu / %zu tokens); will mostly be reevaluated\n",
-                __func__, n_matching_session_tokens, embd_inp.size());
-        } else {
-            fprintf(stderr, "%s: session file matches %zu / %zu tokens of prompt\n",
-                __func__, n_matching_session_tokens, embd_inp.size());
-        }
-    }
 
     // number of tokens to keep when resetting context
     if (params.n_keep < 0 || params.n_keep > (int) embd_inp.size() || params.instruct) {
@@ -283,16 +280,11 @@ int main(int argc, char ** argv) {
     bool is_antiprompt = false;
     bool input_echo = true;
 
-    // HACK - because session saving incurs a non-negligible delay, for now skip re-saving session
-    // if we loaded a session with at least 75% similarity. It's currently just used to speed up the
-    // initial prompt so it doesn't need to be an exact match.
-    bool need_to_save_session = !path_session.empty() && n_matching_session_tokens < (embd_inp.size() * 3 / 4);
-
-
-    int n_past             = 0;
-    int n_remain           = params.n_predict;
-    int n_consumed         = 0;
-    int n_session_consumed = 0;
+    int n_past               = 0;
+    int n_remain             = params.n_predict;
+    int n_consumed           = 0;
+    int n_session_consumed   = 0;
+    int n_session_write_past = 0;
 
     // the first thing we will do is to output the prompt, so set color accordingly
     set_console_color(con_st, CONSOLE_COLOR_PROMPT);
@@ -306,7 +298,8 @@ int main(int argc, char ** argv) {
             // if we run out of context:
             // - take the n_keep first tokens from the original prompt (via n_past)
            // - take half of the last (n_ctx - n_keep) tokens and recompute the logits in batches
-            if (n_past + (int) embd.size() > n_ctx) {
+            bool needs_swap = n_past + (int) embd.size() > n_ctx;
+            if (needs_swap) {
                 const int n_left = n_past - params.n_keep;
 
                 n_past = params.n_keep;
@@ -314,9 +307,6 @@ int main(int argc, char ** argv) {
                 // insert n_left/2 tokens at the start of embd from last_n_tokens
                 embd.insert(embd.begin(), last_n_tokens.begin() + n_ctx - n_left/2 - embd.size(), last_n_tokens.end() - embd.size());
 
-                // stop saving session if we run out of context
-                path_session = "";
-
                 //printf("\n---\n");
                 //printf("resetting: '");
                 //for (int i = 0; i < (int) embd.size(); i++) {
@@ -326,27 +316,12 @@ int main(int argc, char ** argv) {
                 //printf("\n---\n");
             }
 
-            // try to reuse a matching prefix from the loaded session instead of re-eval (via n_past)
-            // REVIEW
-            if (n_session_consumed < (int) session_tokens.size()) {
-                size_t i = 0;
-                for ( ; i < embd.size(); i++) {
-                    if (embd[i] != session_tokens[n_session_consumed]) {
-                        session_tokens.resize(n_session_consumed);
-                        break;
-                    }
-
-                    n_past++;
-                    n_session_consumed++;
-
-                    if (n_session_consumed >= (int) session_tokens.size()) {
-                        ++i;
-                        break;
-                    }
-                }
-                if (i > 0) {
-                    embd.erase(embd.begin(), embd.begin() + i);
-                }
+            // skip evaluation of tokens in the input prefix that matched session
+            if (n_session_consumed < n_matching_session_tokens) {
+                int n_skip = std::min((int) embd.size(), n_matching_session_tokens - n_session_consumed);
+                embd.erase(embd.begin(), embd.begin() + n_skip);
+                n_session_consumed += n_skip;
+                n_past += n_skip;
             }
 
             // evaluate tokens in batches
@@ -363,14 +338,42 @@ int main(int argc, char ** argv) {
                 n_past += n_eval;
             }
 
-            if (embd.size() > 0 && !path_session.empty()) {
-                session_tokens.insert(session_tokens.end(), embd.begin(), embd.end());
-                n_session_consumed = session_tokens.size();
+            // save session after context swap
+            if (!path_session.empty() && needs_swap) {
+                int n_tokens = n_past - params.n_keep;
+                if (!llama_append_session_file(
+                        ctx, path_session.c_str(), params.n_keep,
+                        last_n_tokens.data() + last_n_tokens.size() - n_tokens, n_tokens)) {
+                    fprintf(stderr, "%s: error: unable to write to session file '%s'\n",
+                        __func__, path_session.c_str());
+                    return 1;
+                }
+
+                n_session_write_past = n_past;
             }
         }
 
         embd.clear();
 
+        // save prompt evaluation state to session file
+        if (!path_session.empty() && !n_session_write_past && (int) embd_inp.size() <= n_consumed) {
+            if (!llama_init_session_file(ctx, path_session.c_str())) {
+                fprintf(stderr, "%s: error: unable to start session file '%s'\n",
+                    __func__, path_session.c_str());
+                return 1;
+            }
+
+            if (!llama_append_session_file(
+                    ctx, path_session.c_str(), 0,
+                    last_n_tokens.data() + last_n_tokens.size() - n_past, n_past)) {
+                fprintf(stderr, "%s: error: unable to write to session file '%s'\n",
+                    __func__, path_session.c_str());
+                return 1;
+            }
+
+            n_session_write_past = n_past;
+        }
+
         if ((int) embd_inp.size() <= n_consumed && !is_interacting) {
             // out of user input, sample next token
             const float temp = params.temp;
@@ -387,12 +390,6 @@ int main(int argc, char ** argv) {
             const float mirostat_eta = params.mirostat_eta;
             const bool penalize_nl = params.penalize_nl;
 
-            // optionally save the session on first sample (for faster prompt loading next time)
-            if (!path_session.empty() && need_to_save_session) {
-                need_to_save_session = false;
-                llama_save_session_file(ctx, path_session.c_str(), session_tokens.data(), session_tokens.size());
-            }
-
             llama_token id = 0;
 
             {
@@ -608,6 +605,20 @@ int main(int argc, char ** argv) {
         }
     }
 
+    if (!path_session.empty()) {
+        int n_session_remain = n_past - n_session_write_past;
+        fprintf(stderr, "\n%s: saving remaining state (%d tokens) to session file '%s'",
+            __func__, n_session_remain, path_session.c_str());
+        if (!llama_append_session_file(
+                ctx, path_session.c_str(), n_session_write_past,
+                last_n_tokens.data() + last_n_tokens.size() - embd.size() - n_session_remain,
+                n_session_remain)) {
+            fprintf(stderr, "%s: error: unable to write to session file '%s'\n",
+                __func__, path_session.c_str());
+            return 1;
+        }
+    }
+
     llama_print_timings(ctx);
     llama_free(ctx);
 
diff --git a/examples/save-load-state/save-load-state.cpp b/examples/save-load-state/save-load-state.cpp
index ea0a984d93816..df18578b0375d 100644
--- a/examples/save-load-state/save-load-state.cpp
+++ b/examples/save-load-state/save-load-state.cpp
@@ -58,7 +58,7 @@ int main(int argc, char ** argv) {
     // Save state (rng, logits, embedding and kv_cache) to file
     {
         FILE *fp_write = fopen("dump_state.bin", "wb");
-        llama_copy_state_data(ctx, state_mem); // could also copy directly to memory mapped file
+        llama_copy_state_data(ctx, state_mem, 0); // could also copy directly to memory mapped file
         fwrite(state_mem, 1, state_size, fp_write);
         fclose(fp_write);
     }
diff --git a/llama.cpp b/llama.cpp
index 85af4dc4930dc..bb7b22581e248 100644
--- a/llama.cpp
+++ b/llama.cpp
@@ -2436,7 +2436,7 @@ size_t llama_get_state_size(const struct llama_context * ctx) {
 }
 
 // Copies the state to the specified destination address
-size_t llama_copy_state_data(struct llama_context * ctx, uint8_t * dest) {
+size_t llama_copy_state_data(struct llama_context * ctx, uint8_t * dest, int n_token_offset) {
     uint8_t * out = dest;
 
     // copy rng
@@ -2492,31 +2492,38 @@ size_t llama_copy_state_data(struct llama_context * ctx, uint8_t * dest) {
         const size_t kv_size = kv_self.buf.size;
         const int    kv_ntok = llama_get_kv_cache_token_count(ctx);
 
-        memcpy(out, &kv_size, sizeof(kv_size)); out += sizeof(kv_size);
-        memcpy(out, &kv_ntok, sizeof(kv_ntok)); out += sizeof(kv_ntok);
+        memcpy(out, &kv_size,        sizeof(kv_size));        out += sizeof(kv_size);
+        memcpy(out, &kv_ntok,        sizeof(kv_ntok));        out += sizeof(kv_ntok);
+        memcpy(out, &n_token_offset, sizeof(n_token_offset)); out += sizeof(n_token_offset);
 
-        if (kv_size) {
+        LLAMA_ASSERT(n_token_offset <= kv_ntok);
+
+        if (kv_size && n_token_offset < kv_ntok) {
+            const int n_tokens = kv_ntok - n_token_offset;
             const size_t elt_size = ggml_element_size(kv_self.k);
+
             char buffer[4096];
             ggml_context * cpy_ctx = ggml_init({ sizeof(buffer), buffer, /* no_alloc */ true });
             ggml_cgraph gf{};
             gf.n_threads = 1;
 
-            ggml_tensor * kout3d = ggml_new_tensor_3d(cpy_ctx, kv_self.k->type, n_embd, kv_ntok, n_layer);
+            ggml_tensor * kout3d = ggml_new_tensor_3d(cpy_ctx, kv_self.k->type, n_embd, n_tokens, n_layer);
             kout3d->data = out; out += ggml_nbytes(kout3d);
 
-            ggml_tensor * vout3d = ggml_new_tensor_3d(cpy_ctx, kv_self.v->type, kv_ntok, n_embd, n_layer);
+            ggml_tensor * vout3d = ggml_new_tensor_3d(cpy_ctx, kv_self.v->type, n_tokens, n_embd, n_layer);
             vout3d->data = out; out += ggml_nbytes(vout3d);
 
             ggml_tensor * k3d = ggml_view_3d(cpy_ctx, kv_self.k,
-                n_embd, kv_ntok, n_layer,
-                elt_size*n_embd, elt_size*n_embd*n_ctx, 0);
+                n_embd, n_tokens, n_layer,
+                elt_size*n_embd, elt_size*n_embd*n_ctx,
+                elt_size*n_embd*n_token_offset);
 
             ggml_tensor * v3d = ggml_view_3d(cpy_ctx, kv_self.v,
-                kv_ntok, n_embd, n_layer,
-                elt_size*n_ctx, elt_size*n_ctx*n_embd, 0);
+                n_tokens, n_embd, n_layer,
+                elt_size*n_ctx, elt_size*n_ctx*n_embd,
+                elt_size*n_token_offset);
 
             ggml_build_forward_expand(&gf, ggml_cpy(cpy_ctx, k3d, kout3d));
             ggml_build_forward_expand(&gf, ggml_cpy(cpy_ctx, v3d, vout3d));
@@ -2593,34 +2600,42 @@ size_t llama_set_state_data(struct llama_context * ctx, const uint8_t * src) {
         size_t kv_size;
         int kv_ntok;
+        int n_token_offset;
 
-        memcpy(&kv_size, in, sizeof(kv_size)); in += sizeof(kv_size);
-        memcpy(&kv_ntok, in, sizeof(kv_ntok)); in += sizeof(kv_ntok);
+        memcpy(&kv_size,        in, sizeof(kv_size));        in += sizeof(kv_size);
+        memcpy(&kv_ntok,        in, sizeof(kv_ntok));        in += sizeof(kv_ntok);
+        memcpy(&n_token_offset, in, sizeof(n_token_offset)); in += sizeof(n_token_offset);
 
-        if (kv_size) {
+        LLAMA_ASSERT(n_token_offset <= kv_ntok);
+
+        if (kv_size && n_token_offset < kv_ntok) {
             LLAMA_ASSERT(kv_self.buf.size == kv_size);
 
+            const int n_tokens = kv_ntok - n_token_offset;
             const size_t elt_size = ggml_element_size(kv_self.k);
+
             char buffer[4096];
             ggml_context * cpy_ctx = ggml_init({ sizeof(buffer), buffer, /* no_alloc */ true });
             ggml_cgraph gf{};
             gf.n_threads = 1;
 
-            ggml_tensor * kin3d = ggml_new_tensor_3d(cpy_ctx, kv_self.k->type, n_embd, kv_ntok, n_layer);
+            ggml_tensor * kin3d = ggml_new_tensor_3d(cpy_ctx, kv_self.k->type, n_embd, n_tokens, n_layer);
             kin3d->data = (void *) in; in += ggml_nbytes(kin3d);
 
-            ggml_tensor * vin3d = ggml_new_tensor_3d(cpy_ctx, kv_self.v->type, kv_ntok, n_embd, n_layer);
+            ggml_tensor * vin3d = ggml_new_tensor_3d(cpy_ctx, kv_self.v->type, n_tokens, n_embd, n_layer);
             vin3d->data = (void *) in; in += ggml_nbytes(vin3d);
 
             ggml_tensor * k3d = ggml_view_3d(cpy_ctx, kv_self.k,
-                n_embd, kv_ntok, n_layer,
-                elt_size*n_embd, elt_size*n_embd*n_ctx, 0);
+                n_embd, n_tokens, n_layer,
+                elt_size*n_embd, elt_size*n_embd*n_ctx,
+                elt_size*n_embd*n_token_offset);
 
             ggml_tensor * v3d = ggml_view_3d(cpy_ctx, kv_self.v,
-                kv_ntok, n_embd, n_layer,
-                elt_size*n_ctx, elt_size*n_ctx*n_embd, 0);
+                n_tokens, n_embd, n_layer,
+                elt_size*n_ctx, elt_size*n_ctx*n_embd,
+                elt_size*n_token_offset);
 
             ggml_build_forward_expand(&gf, ggml_cpy(cpy_ctx, kin3d, k3d));
             ggml_build_forward_expand(&gf, ggml_cpy(cpy_ctx, vin3d, v3d));
@@ -2638,7 +2653,12 @@ size_t llama_set_state_data(struct llama_context * ctx, const uint8_t * src) {
     return nread;
 }
 
-bool llama_load_session_file(struct llama_context * ctx, const char * path_session, llama_token * tokens_out, size_t n_token_capacity, size_t * n_token_count_out) {
+bool llama_load_session_file(
+        struct llama_context * ctx,
+        const char * path_session,
+        llama_token * tokens_out,
+        size_t n_token_capacity,
+        size_t * n_token_count_out) {
     llama_file file(path_session, "rb");
 
     // sanity checks
@@ -2660,39 +2680,70 @@ bool llama_load_session_file(struct llama_context * ctx, const char * path_sessi
         }
     }
 
-    // load the prompt
-    {
-        const uint32_t n_token_count = file.read_u32();
+    const size_t n_state_size_max = llama_get_state_size(ctx);
+    size_t n_token_count = 0;
 
-        if (n_token_count > n_token_capacity) {
-            fprintf(stderr, "%s : token count in session file exceeded capacity! %u > %zu\n", __func__, n_token_count, n_token_capacity);
-            return false;
-        }
+    std::vector<uint8_t> state_data(n_state_size_max);
 
-        file.read_raw(tokens_out, sizeof(llama_token) * n_token_count);
-        *n_token_count_out = n_token_count;
-    }
+    // read N segments of (tokens + state), until end or tokens_out filled
+    while (file.size > file.tell()) {
 
-    // restore the context state
-    {
-        const size_t n_state_size_cur = file.size - file.tell();
-        const size_t n_state_size_max = llama_get_state_size(ctx);
+        // load the prompt/tokens
+        const uint32_t n_token_segment = file.read_u32();
+        const size_t n_token_read =
+            std::min((size_t) n_token_segment, n_token_capacity - n_token_count);
 
-        if (n_state_size_cur > n_state_size_max) {
-            fprintf(stderr, "%s : the state size in session file is too big! max %zu, got %zu\n", __func__, n_state_size_max, n_state_size_cur);
-            return false;
+        file.read_raw(tokens_out + n_token_count, sizeof(llama_token) * n_token_read);
+
+        n_token_count += n_token_read;
+
+        if (n_token_segment > n_token_read) {
+            const size_t n_token_extra = n_token_segment - n_token_read;
+            file.seek(sizeof(llama_token) * n_token_extra, SEEK_CUR);
         }
 
-        std::vector<uint8_t> state_data(n_state_size_max);
-        file.read_raw(state_data.data(), n_state_size_cur);
+        LLAMA_ASSERT(n_token_count <= n_token_capacity);
+
+        // restore the context state
+        {
+            size_t n_state_size_cur;
+            file.read_raw(&n_state_size_cur, sizeof(n_state_size_cur));
+
+            if (n_state_size_cur > n_state_size_max) {
+                fprintf(stderr, "%s : the state size in session file is too big! max %zu, got %zu\n", __func__, n_state_size_max, n_state_size_cur);
+                return false;
+            }
+
+            file.read_raw(state_data.data(), n_state_size_cur);
+            llama_set_state_data(ctx, state_data.data());
+        }
 
-        llama_set_state_data(ctx, state_data.data());
+        if (n_token_count == n_token_capacity) {
+            // the logits for this segment apply to the last token; if we didn't read a full
+            // segment, move back one token to force an eval to get accurate logits
+            if (n_token_read < n_token_segment) {
+                n_token_count--;
+            }
+            break;
+        }
     }
 
+    *n_token_count_out = n_token_count;
     return true;
 }
 
-bool llama_save_session_file(struct llama_context * ctx, const char * path_session, const llama_token * tokens, size_t n_token_count) {
+bool llama_save_session_file(
+        struct llama_context * ctx,
+        const char * path_session,
+        const llama_token * tokens,
+        size_t n_token_count) {
+    return (
+        llama_init_session_file(ctx, path_session) &&
+        llama_append_session_file(ctx, path_session, 0, tokens, n_token_count)
+    );
+}
+
+bool llama_init_session_file(struct llama_context * ctx, const char * path_session) {
     llama_file file(path_session, "wb");
 
     file.write_u32(LLAMA_SESSION_MAGIC);
@@ -2700,6 +2751,17 @@ bool llama_save_session_file(struct llama_context * ctx, const char * path_sessi
 
     file.write_raw(&ctx->model.hparams, sizeof(llama_hparams));
 
+    return true;
+}
+
+bool llama_append_session_file(
+        struct llama_context * ctx,
+        const char * path_session,
+        int n_token_offset,
+        const llama_token * tokens,
+        size_t n_token_count) {
+    llama_file file(path_session, "ab");
+
     // save the prompt
     file.write_u32((uint32_t) n_token_count);
     file.write_raw(tokens, sizeof(llama_token) * n_token_count);
@@ -2709,8 +2771,9 @@ bool llama_save_session_file(struct llama_context * ctx, const char * path_sessi
         const size_t n_state_size_max = llama_get_state_size(ctx);
 
         std::vector<uint8_t> state_data(n_state_size_max);
-        const size_t n_state_size_cur = llama_copy_state_data(ctx, state_data.data());
+        const size_t n_state_size_cur = llama_copy_state_data(ctx, state_data.data(), n_token_offset);
 
+        file.write_raw(&n_state_size_cur, sizeof(n_state_size_cur));
         file.write_raw(state_data.data(), n_state_size_cur);
     }
diff --git a/llama.h b/llama.h
index e993c464ab10e..2e65fcdee2130 100644
--- a/llama.h
+++ b/llama.h
@@ -23,7 +23,7 @@
 #define LLAMA_FILE_MAGIC             'ggjt'
 #define LLAMA_FILE_MAGIC_UNVERSIONED 'ggml'
 #define LLAMA_SESSION_MAGIC          'ggsn'
-#define LLAMA_SESSION_VERSION        1
+#define LLAMA_SESSION_VERSION        2
 
 #ifdef __cplusplus
 extern "C" {
@@ -134,15 +134,34 @@ extern "C" {
     // Copies the state to the specified destination address.
    // Destination needs to have allocated enough memory.
    // Returns the number of bytes copied
-    LLAMA_API size_t llama_copy_state_data(struct llama_context * ctx, uint8_t * dest);
+    LLAMA_API size_t llama_copy_state_data(struct llama_context * ctx, uint8_t * dest, int n_token_offset);
 
     // Set the state reading from the specified address
     // Returns the number of bytes read
     LLAMA_API size_t llama_set_state_data(struct llama_context * ctx, const uint8_t * src);
 
     // Save/load session file
-    LLAMA_API bool llama_load_session_file(struct llama_context * ctx, const char * path_session, llama_token * tokens_out, size_t n_token_capacity, size_t * n_token_count_out);
-    LLAMA_API bool llama_save_session_file(struct llama_context * ctx, const char * path_session, const llama_token * tokens, size_t n_token_count);
+    LLAMA_API bool llama_load_session_file(
+            struct llama_context * ctx,
+            const char * path_session,
+            llama_token * tokens_out,
+            size_t n_token_capacity,
+            size_t * n_token_count_out);
+
+    LLAMA_API bool llama_save_session_file(
+            struct llama_context * ctx,
+            const char * path_session,
+            const llama_token * tokens,
+            size_t n_token_count);
+
+    LLAMA_API bool llama_init_session_file(struct llama_context * ctx, const char * path_session);
+
+    LLAMA_API bool llama_append_session_file(
+            struct llama_context * ctx,
+            const char * path_session,
+            int n_token_offset,
+            const llama_token * tokens,
+            size_t n_token_count);
 
     // Run the llama inference to obtain the logits and probabilities for the next token.
     // tokens + n_tokens is the provided batch of new tokens to process
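
Usage sketch (illustrative, not part of this patch): with this change a session file is a header (magic, version, hparams) followed by one or more segments, each holding a u32 token count, the tokens themselves, the serialized state size, and a state blob whose KV-cache slice starts at the segment's token offset. The following minimal, hypothetical caller exercises the API declared above; the model path, session path, and the `evaluated`/`n_new` bookkeeping are assumptions made for the example, not code from this diff.

// usage-sketch.cpp -- hypothetical caller of the segmented session API added by this patch.
// Assumed (not from the patch): model path, how evaluated tokens are tracked, error handling.
#include "llama.h"

#include <cstdio>
#include <vector>

int main() {
    llama_context_params lparams = llama_context_default_params();
    llama_context * ctx = llama_init_from_file("models/7B/ggml-model.bin", lparams); // path is an assumption
    if (ctx == NULL) {
        return 1;
    }

    const char * path_session = "prompt.session";

    // restore as many tokens and as much KV state as the file holds (it may contain several segments)
    std::vector<llama_token> session_tokens(lparams.n_ctx);
    size_t n_loaded = 0;
    if (llama_load_session_file(ctx, path_session, session_tokens.data(), session_tokens.size(), &n_loaded)) {
        fprintf(stderr, "restored %zu tokens from '%s'\n", n_loaded, path_session);
    }
    session_tokens.resize(n_loaded);

    // ... evaluate the prompt / generate as usual, appending every evaluated token ...
    std::vector<llama_token> evaluated = session_tokens; // placeholder for the real bookkeeping
    int n_past = (int) evaluated.size();

    // write a fresh session: header first, then one segment covering tokens [0, n_past)
    if (!llama_init_session_file(ctx, path_session) ||
        !llama_append_session_file(ctx, path_session, 0, evaluated.data(), n_past)) {
        fprintf(stderr, "failed to write session file '%s'\n", path_session);
        llama_free(ctx);
        return 1;
    }

    // after evaluating n_new more tokens, only the delta needs to be appended:
    //   llama_append_session_file(ctx, path_session, n_past, evaluated.data() + n_past, n_new);
    // the offset makes llama_copy_state_data serialize just KV rows [n_past, n_past + n_new)

    llama_free(ctx);
    return 0;
}

As in the updated examples/main/main.cpp above, the intended flow is to write the header once with llama_init_session_file and then append segments as evaluation progresses, passing the token offset at which each segment's KV data begins; llama_save_session_file is kept as a convenience wrapper equal to init plus a single append at offset 0.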