From 4c76d52bb839ea7b6458dee130f35885d1e8b475 Mon Sep 17 00:00:00 2001
From: Evan Jones
Date: Fri, 5 May 2023 23:01:12 -0400
Subject: [PATCH 1/6] main : add option to save full output to session

---
 examples/common.cpp    |  3 +++
 examples/common.h      |  1 +
 examples/main/main.cpp | 19 ++++++++++++-------
 3 files changed, 16 insertions(+), 7 deletions(-)

diff --git a/examples/common.cpp b/examples/common.cpp
index 7aa77587b4605..3104e8c6b61d7 100644
--- a/examples/common.cpp
+++ b/examples/common.cpp
@@ -124,6 +124,8 @@ bool gpt_params_parse(int argc, char ** argv, gpt_params & params) {
                 break;
             }
             params.path_session = argv[i];
+        } else if (arg == "--session-full") {
+            params.session_full = true;
        } else if (arg == "-f" || arg == "--file") {
             if (++i >= argc) {
                 invalid_param = true;
                 break;
             }
@@ -368,6 +370,7 @@ void gpt_print_usage(int /*argc*/, char ** argv, const gpt_params & params) {
     fprintf(stderr, "                        prompt to start generation with (default: empty)\n");
     fprintf(stderr, "  -e                    process prompt escapes sequences (\\n, \\r, \\t, \\', \\\", \\\\)\n");
     fprintf(stderr, "  --session FNAME       file to cache model state in (may be large!) (default: none)\n");
+    fprintf(stderr, "  --session-full        if specified, saves output to the session file in addition to prompt\n");
     fprintf(stderr, "  --random-prompt       start with a randomized prompt.\n");
     fprintf(stderr, "  --in-prefix STRING    string to prefix user inputs with (default: empty)\n");
     fprintf(stderr, "  --in-suffix STRING    string to suffix after user inputs with (default: empty)\n");
diff --git a/examples/common.h b/examples/common.h
index 43f1cc9ef09d5..bae11a2e07ce8 100644
--- a/examples/common.h
+++ b/examples/common.h
@@ -58,6 +58,7 @@ struct gpt_params {
     bool random_prompt     = false; // do not randomize prompt if none provided
     bool use_color         = false; // use color to distinguish generations and inputs
     bool interactive       = false; // interactive mode
+    bool session_full      = false; // save the output to the session file in addition to prompt
     bool embedding         = false; // get only sentence embedding
     bool interactive_first = false; // wait for user input immediately
diff --git a/examples/main/main.cpp b/examples/main/main.cpp
index 6e1172a48367d..ff9db5fb865a3 100644
--- a/examples/main/main.cpp
+++ b/examples/main/main.cpp
@@ -292,13 +292,9 @@ int main(int argc, char ** argv) {
         is_interacting = params.interactive_first;
     }

-    bool is_antiprompt = false;
-    bool input_echo    = true;
-
-    // HACK - because session saving incurs a non-negligible delay, for now skip re-saving session
-    // if we loaded a session with at least 75% similarity. It's currently just used to speed up the
-    // initial prompt so it doesn't need to be an exact match.
-    bool need_to_save_session = !path_session.empty() && n_matching_session_tokens < (embd_inp.size() * 3 / 4);
+    bool is_antiprompt        = false;
+    bool input_echo           = true;
+    bool need_to_save_session = !path_session.empty();

     int n_past = 0;
@@ -328,6 +324,10 @@ int main(int argc, char ** argv) {
             embd.insert(embd.begin(), last_n_tokens.begin() + n_ctx - n_left/2 - embd.size(), last_n_tokens.end() - embd.size());

             // stop saving session if we run out of context
+            if (!path_session.empty() && params.session_full) {
+                llama_save_session_file(ctx, path_session.c_str(),
+                    session_tokens.data(), session_tokens.size());
+            }
             path_session = "";

             //printf("\n---\n");
@@ -603,6 +603,11 @@ int main(int argc, char ** argv) {
         }
     }

+    if (!path_session.empty() && params.session_full) {
+        fprintf(stderr, "\n%s: saving final output to session file '%s'\n", __func__, path_session.c_str());
+        llama_save_session_file(ctx, path_session.c_str(), session_tokens.data(), session_tokens.size());
+    }
+
     llama_print_timings(ctx);
     llama_free(ctx);

From 56758f033c386b7ea23a2d9d4b81808f436411bf Mon Sep 17 00:00:00 2001
From: Evan Jones
Date: Sat, 6 May 2023 20:27:15 -0400
Subject: [PATCH 2/6] split behavior into --session and --prompt-cache

---
 examples/common.cpp     | 17 +++++++++++++----
 examples/common.h       |  8 ++++----
 examples/main/README.md |  4 ++--
 examples/main/main.cpp  | 10 ++++++----
 4 files changed, 25 insertions(+), 14 deletions(-)

diff --git a/examples/common.cpp b/examples/common.cpp
index 3104e8c6b61d7..c932914e9fd78 100644
--- a/examples/common.cpp
+++ b/examples/common.cpp
@@ -118,14 +118,18 @@ bool gpt_params_parse(int argc, char ** argv, gpt_params & params) {
             params.prompt = argv[i];
         } else if (arg == "-e") {
             escape_prompt = true;
+        } else if (arg == "--prompt-cache") {
+            if (++i >= argc) {
+                invalid_param = true;
+                break;
+            }
+            params.path_prompt_cache = argv[i];
         } else if (arg == "--session") {
             if (++i >= argc) {
                 invalid_param = true;
                 break;
             }
             params.path_session = argv[i];
-        } else if (arg == "--session-full") {
-            params.session_full = true;
         } else if (arg == "-f" || arg == "--file") {
             if (++i >= argc) {
                 invalid_param = true;
                 break;
             }
@@ -344,6 +348,11 @@ bool gpt_params_parse(int argc, char ** argv, gpt_params & params) {
         gpt_print_usage(argc, argv, default_params);
         exit(1);
     }
+    if (!params.path_session.empty() && !params.path_prompt_cache.empty()) {
+        fprintf(stderr, "error: only one of --prompt-cache or --session may be specified\n");
+        gpt_print_usage(argc, argv, default_params);
+        exit(1);
+    }
     if (escape_prompt) {
         process_escapes(params.prompt);
     }
@@ -369,8 +378,8 @@ void gpt_print_usage(int /*argc*/, char ** argv, const gpt_params & params) {
     fprintf(stderr, "  -p PROMPT, --prompt PROMPT\n");
     fprintf(stderr, "                        prompt to start generation with (default: empty)\n");
     fprintf(stderr, "  -e                    process prompt escapes sequences (\\n, \\r, \\t, \\', \\\", \\\\)\n");
-    fprintf(stderr, "  --session FNAME       file to cache model state in (may be large!) (default: none)\n");
-    fprintf(stderr, "  --session-full        if specified, saves output to the session file in addition to prompt\n");
+    fprintf(stderr, "  --prompt-cache FNAME  file to cache prompt state for faster startup (default: none)\n");
+    fprintf(stderr, "  --session FNAME       file to store prompt and generations, allowing continuation (default: none)\n");
     fprintf(stderr, "  --random-prompt       start with a randomized prompt.\n");
     fprintf(stderr, "  --in-prefix STRING    string to prefix user inputs with (default: empty)\n");
     fprintf(stderr, "  --in-suffix STRING    string to suffix after user inputs with (default: empty)\n");
diff --git a/examples/common.h b/examples/common.h
index bae11a2e07ce8..0c721bad0e45f 100644
--- a/examples/common.h
+++ b/examples/common.h
@@ -46,9 +46,10 @@ struct gpt_params {
     std::string model  = "models/lamma-7B/ggml-model.bin"; // model path
     std::string prompt = "";
-    std::string path_session = "";  // path to file for saving/loading model eval state
-    std::string input_prefix = "";  // string to prefix user inputs with
-    std::string input_suffix = "";  // string to suffix user inputs with
+    std::string path_prompt_cache = "";  // path to file for saving/loading prompt eval state
+    std::string path_session      = "";  // file for saving/loading prompt and generations
+    std::string input_prefix      = "";  // string to prefix user inputs with
+    std::string input_suffix      = "";  // string to suffix user inputs with
     std::vector<std::string> antiprompt; // string upon seeing which more user input is prompted

     std::string lora_adapter = "";  // lora adapter path
@@ -58,7 +59,6 @@ struct gpt_params {
     bool random_prompt     = false; // do not randomize prompt if none provided
     bool use_color         = false; // use color to distinguish generations and inputs
     bool interactive       = false; // interactive mode
-    bool session_full      = false; // save the output to the session file in addition to prompt
     bool embedding         = false; // get only sentence embedding
     bool interactive_first = false; // wait for user input immediately
diff --git a/examples/main/README.md b/examples/main/README.md
index 35f87bcd594ed..7c03f92c897d9 100644
--- a/examples/main/README.md
+++ b/examples/main/README.md
@@ -270,9 +270,9 @@ These options help improve the performance and memory usage of the LLaMA models.

 - `-b N, --batch_size N`: Set the batch size for prompt processing (default: 512). This large batch size benefits users who have BLAS installed and enabled it during the build. If you don't have BLAS enabled ("BLAS=0"), you can use a smaller number, such as 8, to see the prompt progress as it's evaluated in some situations.

-### Session Caching
+### Prompt Caching

-- `--session FNAME`: Specify a file to load/save the session, which caches the model state after the initial prompt. This can significantly speed up the startup time when you're using longer prompts. The session file is created during the first run and is reused in subsequent runs. If you change your prompt such that 75% or less of the session is reusable, the existing session file will be overwritten with a new, updated version to maintain optimal performance.
+- `--prompt-cache FNAME`: Specify a file to cache the model state after the initial prompt. This can significantly speed up the startup time when you're using longer prompts. The file is created during the first run and is reused and updated in subsequent runs.

 ### Quantization

diff --git a/examples/main/main.cpp b/examples/main/main.cpp
index ff9db5fb865a3..106110e55a0d6 100644
--- a/examples/main/main.cpp
+++ b/examples/main/main.cpp
@@ -139,8 +139,10 @@ int main(int argc, char ** argv) {
     // Add a space in front of the first character to match OG llama tokenizer behavior
     params.prompt.insert(0, 1, ' ');

-    std::string path_session = params.path_session;
+    std::string path_session =
+        !params.path_session.empty() ? params.path_session : params.path_prompt_cache;
     std::vector<llama_token> session_tokens;
+    bool resume_session = !params.path_session.empty();

     if (!path_session.empty()) {
         fprintf(stderr, "%s: attempting to load saved session from '%s'\n", __func__, path_session.c_str());
@@ -323,8 +325,8 @@ int main(int argc, char ** argv) {
             // insert n_left/2 tokens at the start of embd from last_n_tokens
             embd.insert(embd.begin(), last_n_tokens.begin() + n_ctx - n_left/2 - embd.size(), last_n_tokens.end() - embd.size());

-            // stop saving session if we run out of context
-            if (!path_session.empty() && params.session_full) {
+            // stop saving session if we run out of context, saving whatever was evaled
+            if (!path_session.empty() && resume_session) {
                 llama_save_session_file(ctx, path_session.c_str(),
                     session_tokens.data(), session_tokens.size());
@@ -603,7 +605,7 @@ int main(int argc, char ** argv) {
         }
     }

-    if (!path_session.empty() && params.session_full) {
+    if (!path_session.empty() && resume_session) {
         fprintf(stderr, "\n%s: saving final output to session file '%s'\n", __func__, path_session.c_str());
         llama_save_session_file(ctx, path_session.c_str(), session_tokens.data(), session_tokens.size());
     }

From e4429e912badc0cb6e85b874eff9ff7e1b5db1ce Mon Sep 17 00:00:00 2001
From: Evan Jones
Date: Sun, 7 May 2023 22:46:06 -0400
Subject: [PATCH 3/6] restore original implementation with new names

---
 examples/common.cpp    | 15 +++------------
 examples/common.h      | 10 +++++-----
 examples/main/main.cpp | 11 +++++------
 3 files changed, 13 insertions(+), 23 deletions(-)

diff --git a/examples/common.cpp b/examples/common.cpp
index c932914e9fd78..53afe3cc28d7d 100644
--- a/examples/common.cpp
+++ b/examples/common.cpp
@@ -124,12 +124,8 @@ bool gpt_params_parse(int argc, char ** argv, gpt_params & params) {
                 break;
             }
             params.path_prompt_cache = argv[i];
-        } else if (arg == "--session") {
-            if (++i >= argc) {
-                invalid_param = true;
-                break;
-            }
-            params.path_session = argv[i];
+        } else if (arg == "--prompt-cache-all") {
+            params.prompt_cache_save_all = true;
         } else if (arg == "-f" || arg == "--file") {
             if (++i >= argc) {
                 invalid_param = true;
                 break;
             }
@@ -348,11 +344,6 @@ bool gpt_params_parse(int argc, char ** argv, gpt_params & params) {
         gpt_print_usage(argc, argv, default_params);
         exit(1);
     }
-    if (!params.path_session.empty() && !params.path_prompt_cache.empty()) {
-        fprintf(stderr, "error: only one of --prompt-cache or --session may be specified\n");
-        gpt_print_usage(argc, argv, default_params);
-        exit(1);
-    }
     if (escape_prompt) {
         process_escapes(params.prompt);
     }
@@ -379,7 +370,7 @@ void gpt_print_usage(int /*argc*/, char ** argv, const gpt_params & params) {
     fprintf(stderr, "                        prompt to start generation with (default: empty)\n");
     fprintf(stderr, "  -e                    process prompt escapes sequences (\\n, \\r, \\t, \\', \\\", \\\\)\n");
     fprintf(stderr, "  --prompt-cache FNAME  file to cache prompt state for faster startup (default: none)\n");
-    fprintf(stderr, "  --session FNAME       file to store prompt and generations, allowing continuation (default: none)\n");
+    fprintf(stderr, "  --prompt-cache-all    if specified, saves user input and generations to cache as well\n");
     fprintf(stderr, "  --random-prompt       start with a randomized prompt.\n");
     fprintf(stderr, "  --in-prefix STRING    string to prefix user inputs with (default: empty)\n");
     fprintf(stderr, "  --in-suffix STRING    string to suffix after user inputs with (default: empty)\n");
diff --git a/examples/common.h b/examples/common.h
index 0c721bad0e45f..00ddd773224ce 100644
--- a/examples/common.h
+++ b/examples/common.h
@@ -47,7 +47,6 @@ struct gpt_params {
     std::string model  = "models/lamma-7B/ggml-model.bin"; // model path
     std::string prompt = "";
     std::string path_prompt_cache = "";  // path to file for saving/loading prompt eval state
-    std::string path_session      = "";  // file for saving/loading prompt and generations
     std::string input_prefix      = "";  // string to prefix user inputs with
     std::string input_suffix      = "";  // string to suffix user inputs with
     std::vector<std::string> antiprompt; // string upon seeing which more user input is prompted
@@ -55,10 +54,11 @@ struct gpt_params {
     std::string lora_adapter = "";  // lora adapter path
     std::string lora_base    = "";  // base model path for the lora adapter

-    bool memory_f16        = true;  // use f16 instead of f32 for memory kv
-    bool random_prompt     = false; // do not randomize prompt if none provided
-    bool use_color         = false; // use color to distinguish generations and inputs
-    bool interactive       = false; // interactive mode
+    bool memory_f16            = true;  // use f16 instead of f32 for memory kv
+    bool random_prompt         = false; // do not randomize prompt if none provided
+    bool use_color             = false; // use color to distinguish generations and inputs
+    bool interactive           = false; // interactive mode
+    bool prompt_cache_save_all = false; // save user input and generations to prompt cache
     bool embedding         = false; // get only sentence embedding
     bool interactive_first = false; // wait for user input immediately
diff --git a/examples/main/main.cpp b/examples/main/main.cpp
index 106110e55a0d6..bb172cce21058 100644
--- a/examples/main/main.cpp
+++ b/examples/main/main.cpp
@@ -139,10 +139,9 @@ int main(int argc, char ** argv) {
     // Add a space in front of the first character to match OG llama tokenizer behavior
     params.prompt.insert(0, 1, ' ');

-    std::string path_session =
-        !params.path_session.empty() ? params.path_session : params.path_prompt_cache;
+    std::string path_session = params.path_prompt_cache;
+    const bool session_save_all = params.prompt_cache_save_all;
     std::vector<llama_token> session_tokens;
-    bool resume_session = !params.path_session.empty();

     if (!path_session.empty()) {
         fprintf(stderr, "%s: attempting to load saved session from '%s'\n", __func__, path_session.c_str());
@@ -325,8 +324,8 @@ int main(int argc, char ** argv) {
             // insert n_left/2 tokens at the start of embd from last_n_tokens
             embd.insert(embd.begin(), last_n_tokens.begin() + n_ctx - n_left/2 - embd.size(), last_n_tokens.end() - embd.size());

-            // stop saving session if we run out of context, saving whatever was evaled
-            if (!path_session.empty() && resume_session) {
+            // stop saving session if we run out of context
+            if (!path_session.empty() && session_save_all) {
                 llama_save_session_file(ctx, path_session.c_str(),
                     session_tokens.data(), session_tokens.size());
@@ -605,7 +604,7 @@ int main(int argc, char ** argv) {
         }
     }

-    if (!path_session.empty() && resume_session) {
+    if (!path_session.empty() && session_save_all) {
         fprintf(stderr, "\n%s: saving final output to session file '%s'\n", __func__, path_session.c_str());
         llama_save_session_file(ctx, path_session.c_str(), session_tokens.data(), session_tokens.size());
     }

From 8c88b172c5ba779349ad9a491fda79d43921f4c6 Mon Sep 17 00:00:00 2001
From: Evan Jones
Date: Mon, 8 May 2023 23:10:51 -0400
Subject: [PATCH 4/6] PR comments

---
 examples/common.cpp    |  5 +++--
 examples/common.h      | 10 +++++-----
 examples/main/main.cpp | 17 ++++++++---------
 3 files changed, 16 insertions(+), 16 deletions(-)

diff --git a/examples/common.cpp b/examples/common.cpp
index 53afe3cc28d7d..9b8e2c39c65d0 100644
--- a/examples/common.cpp
+++ b/examples/common.cpp
@@ -125,7 +125,7 @@ bool gpt_params_parse(int argc, char ** argv, gpt_params & params) {
             }
             params.path_prompt_cache = argv[i];
         } else if (arg == "--prompt-cache-all") {
-            params.prompt_cache_save_all = true;
+            params.prompt_cache_all = true;
         } else if (arg == "-f" || arg == "--file") {
             if (++i >= argc) {
                 invalid_param = true;
                 break;
             }
@@ -370,7 +370,8 @@ void gpt_print_usage(int /*argc*/, char ** argv, const gpt_params & params) {
     fprintf(stderr, "                        prompt to start generation with (default: empty)\n");
     fprintf(stderr, "  -e                    process prompt escapes sequences (\\n, \\r, \\t, \\', \\\", \\\\)\n");
     fprintf(stderr, "  --prompt-cache FNAME  file to cache prompt state for faster startup (default: none)\n");
-    fprintf(stderr, "  --prompt-cache-all    if specified, saves user input and generations to cache as well\n");
+    fprintf(stderr, "  --prompt-cache-all    if specified, saves user input and generations to cache as well.\n");
+    fprintf(stderr, "                        not supported with --interactive or other interactive options\n");
     fprintf(stderr, "  --random-prompt       start with a randomized prompt.\n");
     fprintf(stderr, "  --in-prefix STRING    string to prefix user inputs with (default: empty)\n");
     fprintf(stderr, "  --in-suffix STRING    string to suffix after user inputs with (default: empty)\n");
diff --git a/examples/common.h b/examples/common.h
index 00ddd773224ce..499671b2e8d6d 100644
--- a/examples/common.h
+++ b/examples/common.h
@@ -54,11 +54,11 @@ struct gpt_params {
     std::string lora_adapter = "";  // lora adapter path
     std::string lora_base    = "";  // base model path for the lora adapter

-    bool memory_f16            = true;  // use f16 instead of f32 for memory kv
-    bool random_prompt         = false; // do not randomize prompt if none provided
-    bool use_color             = false; // use color to distinguish generations and inputs
-    bool interactive           = false; // interactive mode
-    bool prompt_cache_save_all = false; // save user input and generations to prompt cache
+    bool memory_f16       = true;  // use f16 instead of f32 for memory kv
+    bool random_prompt    = false; // do not randomize prompt if none provided
+    bool use_color        = false; // use color to distinguish generations and inputs
+    bool interactive      = false; // interactive mode
+    bool prompt_cache_all = false; // save user input and generations to prompt cache
     bool embedding         = false; // get only sentence embedding
     bool interactive_first = false; // wait for user input immediately
diff --git a/examples/main/main.cpp b/examples/main/main.cpp
index bb172cce21058..6604e60f20f0d 100644
--- a/examples/main/main.cpp
+++ b/examples/main/main.cpp
@@ -140,7 +140,6 @@ int main(int argc, char ** argv) {
     params.prompt.insert(0, 1, ' ');

     std::string path_session = params.path_prompt_cache;
-    const bool session_save_all = params.prompt_cache_save_all;
     std::vector<llama_token> session_tokens;

     if (!path_session.empty()) {
@@ -236,6 +235,11 @@ int main(int argc, char ** argv) {
     }

     if (params.interactive) {
+        if (params.prompt_cache_all) {
+            fprintf(stderr, "error: --prompt-cache-all not supported in interactive mode yet\n");
+            return 1;
+        }
+
 #if defined (__unix__) || (defined (__APPLE__) && defined (__MACH__))
         struct sigaction sigint_action;
         sigint_action.sa_handler = sigint_handler;
@@ -295,8 +299,7 @@ int main(int argc, char ** argv) {

     bool is_antiprompt        = false;
     bool input_echo           = true;
-    bool need_to_save_session = !path_session.empty();
-
+    bool need_to_save_session = !path_session.empty() && n_matching_session_tokens < embd_inp.size();
     int n_past   = 0;
     int n_remain = params.n_predict;
@@ -325,11 +328,7 @@ int main(int argc, char ** argv) {
             // insert n_left/2 tokens at the start of embd from last_n_tokens
             embd.insert(embd.begin(), last_n_tokens.begin() + n_ctx - n_left/2 - embd.size(), last_n_tokens.end() - embd.size());

             // stop saving session if we run out of context
-            if (!path_session.empty() && session_save_all) {
-                llama_save_session_file(ctx, path_session.c_str(),
-                    session_tokens.data(), session_tokens.size());
-            }
-            path_session = "";
+            path_session.clear();

             //printf("\n---\n");
             //printf("resetting: '");
@@ -604,7 +603,7 @@ int main(int argc, char ** argv) {
         }
     }

-    if (!path_session.empty() && session_save_all) {
+    if (!path_session.empty() && params.prompt_cache_all) {
         fprintf(stderr, "\n%s: saving final output to session file '%s'\n", __func__, path_session.c_str());
         llama_save_session_file(ctx, path_session.c_str(), session_tokens.data(), session_tokens.size());
     }

From 8826fb8e2f0c2e06f96a1e06f09278aa57d6ef6b Mon Sep 17 00:00:00 2001
From: Evan Jones
Date: Tue, 9 May 2023 22:28:40 -0400
Subject: [PATCH 5/6] move the check for incompatible parameters to gpt_params_parse

---
 examples/common.cpp    | 7 +++++++
 examples/main/main.cpp | 5 -----
 2 files changed, 7 insertions(+), 5 deletions(-)

diff --git a/examples/common.cpp b/examples/common.cpp
index 9b8e2c39c65d0..f3085b08e5b25 100644
--- a/examples/common.cpp
+++ b/examples/common.cpp
@@ -344,6 +344,13 @@ bool gpt_params_parse(int argc, char ** argv, gpt_params & params) {
         gpt_print_usage(argc, argv, default_params);
         exit(1);
     }
+    if (params.prompt_cache_all &&
+            (params.interactive || params.interactive_first ||
+             params.instruct || params.antiprompt.size())) {
+        fprintf(stderr, "error: --prompt-cache-all not supported in interactive mode yet\n");
+        gpt_print_usage(argc, argv, default_params);
+        exit(1);
+    }
     if (escape_prompt) {
         process_escapes(params.prompt);
     }
diff --git a/examples/main/main.cpp b/examples/main/main.cpp
index 6604e60f20f0d..468ce30c78574 100644
--- a/examples/main/main.cpp
+++ b/examples/main/main.cpp
@@ -235,11 +235,6 @@ int main(int argc, char ** argv) {
     }

     if (params.interactive) {
-        if (params.prompt_cache_all) {
-            fprintf(stderr, "error: --prompt-cache-all not supported in interactive mode yet\n");
-            return 1;
-        }
-
 #if defined (__unix__) || (defined (__APPLE__) && defined (__MACH__))
         struct sigaction sigint_action;
         sigint_action.sa_handler = sigint_handler;

From ac5584b3a8b75081a92eb958cb1e733ba8dbebbe Mon Sep 17 00:00:00 2001
From: Evan Jones
Date: Wed, 10 May 2023 11:32:44 -0400
Subject: [PATCH 6/6] Fix whitespace

Co-authored-by: DannyDaemonic
---
 examples/main/main.cpp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/examples/main/main.cpp b/examples/main/main.cpp
index 468ce30c78574..bd1c4ab558521 100644
--- a/examples/main/main.cpp
+++ b/examples/main/main.cpp
@@ -139,7 +139,7 @@ int main(int argc, char ** argv) {
     // Add a space in front of the first character to match OG llama tokenizer behavior
     params.prompt.insert(0, 1, ' ');

-    std::string path_session = params.path_prompt_cache;
+    std::string path_session = params.path_prompt_cache;
     std::vector<llama_token> session_tokens;

     if (!path_session.empty()) {
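
The series above leaves --prompt-cache persisting only the initial prompt state and
--prompt-cache-all additionally persisting user input and generations. As a reading aid
(not part of the patches), here is a minimal sketch of the load/eval/save round-trip that
--prompt-cache implements in examples/main/main.cpp, written against the llama.cpp C API
of this era (llama_init_from_file, llama_eval, llama_load_session_file,
llama_save_session_file). The model path, prompt text, cache file name, and thread count
are placeholder assumptions, and error handling is reduced to early returns.

    // prompt_cache_sketch.cpp - illustrative only, not the actual main.cpp logic
    #include <string>
    #include <vector>
    #include "llama.h"

    int main() {
        llama_context * ctx = llama_init_from_file("models/7B/ggml-model.bin", llama_context_default_params());
        if (ctx == NULL) return 1;

        // tokenize the prompt (main.cpp likewise prepends a space before tokenizing)
        const std::string prompt = " The quick brown fox";
        std::vector<llama_token> prompt_tokens(prompt.size() + 1);
        const int n_prompt = llama_tokenize(ctx, prompt.c_str(), prompt_tokens.data(), (int) prompt_tokens.size(), true);
        if (n_prompt < 0) return 1;
        prompt_tokens.resize(n_prompt);

        // try to restore a previously saved eval state; a miss just means a cold start
        const char * cache_path = "prompt.cache"; // hypothetical file name
        std::vector<llama_token> session_tokens(llama_n_ctx(ctx));
        size_t n_session = 0;
        if (!llama_load_session_file(ctx, cache_path, session_tokens.data(), session_tokens.size(), &n_session)) {
            n_session = 0;
        }
        session_tokens.resize(n_session);

        // reuse the longest cached prefix of the current prompt; eval only the tail
        size_t n_matched = 0;
        while (n_matched < session_tokens.size() && n_matched < prompt_tokens.size() &&
               session_tokens[n_matched] == prompt_tokens[n_matched]) {
            n_matched++;
        }
        for (size_t i = n_matched; i < prompt_tokens.size(); i++) {
            if (llama_eval(ctx, &prompt_tokens[i], 1, (int) i, /*n_threads=*/4) != 0) return 1;
        }

        // persist the state so the next run with this prompt can skip the eval above
        if (!llama_save_session_file(ctx, cache_path, prompt_tokens.data(), prompt_tokens.size())) return 1;

        llama_free(ctx);
        return 0;
    }

The prefix match is what makes the cache useful beyond exact repeats: two runs that share
a long fixed preamble but differ in their final lines still reuse the shared part. The real
main.cpp layers more on top of this sketch — it has to keep logits fresh when the cache
already covers the whole prompt, and, per patch 4, it skips re-saving in that full-match
case (need_to_save_session is only set when the match is partial).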