
Commit c0335b5

llama : fix session load / save

committed May 1, 2023
1 parent 90b19bd

3 files changed: +96, -69 lines


examples/main/main.cpp (+10, -10)
@@ -161,23 +161,22 @@ int main(int argc, char ** argv) {
     std::vector<llama_token> session_tokens;

     if (!path_session.empty()) {
-        fprintf(stderr, "%s: attempting to load saved session from %s..\n", __func__, path_session.c_str());
+        fprintf(stderr, "%s: attempting to load saved session from '%s'\n", __func__, path_session.c_str());

-        // REVIEW - fopen to check for existing session
+        // fopen to check for existing session
         FILE * fp = std::fopen(path_session.c_str(), "rb");
         if (fp != NULL) {
             std::fclose(fp);

             session_tokens.resize(params.n_ctx);
             size_t n_token_count_out = 0;
-            const size_t n_session_bytes = llama_load_session_file(ctx, path_session.c_str(), session_tokens.data(), session_tokens.capacity(), &n_token_count_out);
+            if (!llama_load_session_file(ctx, path_session.c_str(), session_tokens.data(), session_tokens.capacity(), &n_token_count_out)) {
+                fprintf(stderr, "%s: error: failed to load session file '%s'\n", __func__, path_session.c_str());
+                return 1;
+            }
             session_tokens.resize(n_token_count_out);

-            if (n_session_bytes > 0) {
-                fprintf(stderr, "%s: loaded %zu bytes of session data!\n", __func__, n_session_bytes);
-            } else {
-                fprintf(stderr, "%s: could not load session file, will recreate\n", __func__);
-            }
+            fprintf(stderr, "%s: loaded a session with prompt size of %d tokens\n", __func__, (int) session_tokens.size());
         } else {
             fprintf(stderr, "%s: session file does not exist, will create\n", __func__);
         }

@@ -214,7 +213,7 @@
     }

     // number of tokens to keep when resetting context
-    if (params.n_keep < 0 || params.n_keep > (int)embd_inp.size() || params.instruct) {
+    if (params.n_keep < 0 || params.n_keep > (int) embd_inp.size() || params.instruct) {
         params.n_keep = (int)embd_inp.size();
     }

@@ -329,7 +328,7 @@
                 // insert n_left/2 tokens at the start of embd from last_n_tokens
                 embd.insert(embd.begin(), last_n_tokens.begin() + n_ctx - n_left/2 - embd.size(), last_n_tokens.end() - embd.size());

-                // REVIEW - stop saving session if we run out of context
+                // stop saving session if we run out of context
                 path_session = "";

                 //printf("\n---\n");

@@ -355,6 +354,7 @@
                 n_session_consumed++;

                 if (n_session_consumed >= (int) session_tokens.size()) {
+                    ++i;
                     break;
                 }
             }
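With this change, llama_load_session_file reports success or failure directly instead of returning a byte count, and main.cpp now treats a failed load as fatal rather than silently recreating the session. The added ++i looks like an off-by-one fix: when the saved session covers the entire batch, the index must advance past the last matched token before breaking, so that token is not evaluated a second time. A minimal sketch of the new calling pattern (assuming an initialized llama_context * ctx; the helper name, session path, and n_ctx parameter are illustrative, not from the commit):

    #include <cstdio>
    #include <vector>
    #include "llama.h"

    // Sketch: load a saved session into `tokens`; returns false on failure.
    static bool load_session(llama_context * ctx, int n_ctx,
                             std::vector<llama_token> & tokens) {
        tokens.resize(n_ctx);             // reserve room for a full context
        size_t n_token_count_out = 0;
        if (!llama_load_session_file(ctx, "session.bin",
                                     tokens.data(), tokens.capacity(),
                                     &n_token_count_out)) {
            fprintf(stderr, "failed to load session file 'session.bin'\n");
            return false;
        }
        tokens.resize(n_token_count_out); // keep only the tokens actually read
        return true;
    }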

llama.cpp (+79, -54)
@@ -2567,6 +2567,85 @@ size_t llama_set_state_data(struct llama_context * ctx, const uint8_t * src) {
     return nread;
 }

+bool llama_load_session_file(struct llama_context * ctx, const char * path_session, llama_token * tokens_out, size_t n_token_capacity, size_t * n_token_count_out) {
+    llama_file file(path_session, "rb");
+
+    // sanity checks
+    {
+        const uint32_t magic   = file.read_u32();
+        const uint32_t version = file.read_u32();
+
+        if (!(magic == LLAMA_SESSION_MAGIC && version == LLAMA_SESSION_VERSION)) {
+            fprintf(stderr, "%s : unknown (magic, version) for session file: %08x, %08x\n", __func__, magic, version);
+            return false;
+        }
+
+        llama_hparams session_hparams;
+        file.read_raw(&session_hparams, sizeof(llama_hparams));
+
+        if (session_hparams != ctx->model.hparams) {
+            fprintf(stderr, "%s : model hparams didn't match from session file!\n", __func__);
+            return false;
+        }
+    }
+
+    // load the prompt
+    {
+        const uint32_t n_token_count = file.read_u32();
+
+        if (n_token_count > n_token_capacity) {
+            fprintf(stderr, "%s : token count in session file exceeded capacity! %u > %zu\n", __func__, n_token_count, n_token_capacity);
+            return false;
+        }
+
+        file.read_raw(tokens_out, sizeof(llama_token) * n_token_count);
+        *n_token_count_out = n_token_count;
+    }
+
+    // restore the context state
+    {
+        const size_t n_state_size_cur = file.size - file.tell();
+        const size_t n_state_size_exp = llama_get_state_size(ctx);
+
+        if (n_state_size_cur != n_state_size_exp) {
+            fprintf(stderr, "%s : the state size in session file didn't match! expected %zu, got %zu\n", __func__, n_state_size_exp, n_state_size_cur);
+            return false;
+        }
+
+        std::vector<uint8_t> state_data(n_state_size_cur);
+        file.read_raw(state_data.data(), n_state_size_cur);
+
+        llama_set_state_data(ctx, state_data.data());
+    }
+
+    return true;
+}
+
+bool llama_save_session_file(struct llama_context * ctx, const char * path_session, const llama_token * tokens, size_t n_token_count) {
+    llama_file file(path_session, "wb");
+
+    file.write_u32(LLAMA_SESSION_MAGIC);
+    file.write_u32(LLAMA_SESSION_VERSION);
+
+    file.write_raw(&ctx->model.hparams, sizeof(llama_hparams));
+
+    // save the prompt
+    file.write_u32((uint32_t) n_token_count);
+    file.write_raw(tokens, sizeof(llama_token) * n_token_count);
+
+    // save the context state
+    {
+        const size_t n_state_size = llama_get_state_size(ctx);
+
+        std::vector<uint8_t> state_data(n_state_size);
+        llama_copy_state_data(ctx, state_data.data());
+
+        file.write_raw(state_data.data(), n_state_size);
+    }
+
+    return true;
+}
+
 int llama_eval(
         struct llama_context * ctx,
         const llama_token * tokens,

@@ -2694,57 +2773,3 @@ const char * llama_print_system_info(void) {
 std::vector<std::pair<std::string, struct ggml_tensor *>>& llama_internal_get_tensor_map(struct llama_context * ctx) {
     return ctx->model.tensors_by_name;
 }
-
-size_t llama_load_session_file(struct llama_context * ctx, const char * path_session, llama_token * tokens_out, size_t n_token_capacity, size_t * n_token_count_out) {
-    // TODO leverage mmap
-    llama_file file(path_session, "rb");
-    const uint32_t magic = file.read_u32();
-    const uint32_t version = file.read_u32();
-
-    if (!(magic == 'ggsn' && version == 0)) {
-        fprintf(stderr, "%s : unknown (magic, version) for session file: %08x, %08x\n", __func__, magic, version);
-        return 0;
-    }
-
-    llama_hparams session_hparams;
-    file.read_raw(&session_hparams, sizeof(llama_hparams));
-
-    // REVIEW
-    if (session_hparams != ctx->model.hparams) {
-        fprintf(stderr, "%s : model hparams didn't match from session file!\n", __func__);
-        return 0;
-    }
-
-    const uint32_t n_token_count = file.read_u32();
-    LLAMA_ASSERT(n_token_capacity >= n_token_count);
-    file.read_raw(tokens_out, sizeof(llama_token) * n_token_count);
-    *n_token_count_out = n_token_count;
-
-    const size_t n_state_size = file.size - file.tell();
-    const size_t n_orig_state_size = llama_get_state_size(ctx);
-    if (n_state_size != n_orig_state_size) {
-        fprintf(stderr, "%s : failed to validate state size\n", __func__);
-    }
-    std::unique_ptr<uint8_t[]> state_data(new uint8_t[n_state_size]);
-    file.read_raw(state_data.get(), n_state_size);
-    return llama_set_state_data(ctx, state_data.get());
-}
-
-size_t llama_save_session_file(struct llama_context * ctx, const char * path_session, const llama_token * tokens, size_t n_token_count) {
-    // TODO save temp & swap
-    llama_file file(path_session, "wb");
-
-    const size_t n_state_size = llama_get_state_size(ctx);
-    std::unique_ptr<uint8_t[]> state_data(new uint8_t[n_state_size]);
-    llama_copy_state_data(ctx, state_data.get());
-
-    file.write_u32('ggsn'); // magic
-    file.write_u32(0); // version
-    file.write_raw(&ctx->model.hparams, sizeof(llama_hparams));
-
-    file.write_u32((uint32_t) n_token_count); // REVIEW
-    file.write_raw(tokens, sizeof(llama_token) * n_token_count);
-
-    file.write_raw(state_data.get(), n_state_size);
-    return n_state_size; // REVIEW
-}
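Read together, the save and load paths define a fixed on-disk layout for session files. The summary below is illustrative, inferred from the read/write calls above; no such struct exists in the code:

    // On-disk layout of a session file (illustrative only):
    //
    //   uint32_t      magic;          // LLAMA_SESSION_MAGIC ('ggsn')
    //   uint32_t      version;        // LLAMA_SESSION_VERSION (0)
    //   llama_hparams hparams;        // must match the loaded model exactly
    //   uint32_t      n_token_count;  // number of prompt tokens that follow
    //   llama_token   tokens[n_token_count];
    //   uint8_t       state[];        // llama_get_state_size(ctx) bytes,
    //                                 // consumed by llama_set_state_data()

Note that the rewrite also validates the state size before reading (the old code only warned on a mismatch) and bounds-checks n_token_count against the caller's capacity instead of asserting.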

llama.h (+7, -5)
@@ -19,9 +19,11 @@
 #    define LLAMA_API
 #endif

-#define LLAMA_FILE_VERSION 1
-#define LLAMA_FILE_MAGIC 0x67676a74 // 'ggjt' in hex
-#define LLAMA_FILE_MAGIC_UNVERSIONED 0x67676d6c // pre-versioned files
+#define LLAMA_FILE_VERSION 1
+#define LLAMA_FILE_MAGIC 'ggjt'
+#define LLAMA_FILE_MAGIC_UNVERSIONED 'ggml'
+#define LLAMA_SESSION_MAGIC 'ggsn'
+#define LLAMA_SESSION_VERSION 0

 #ifdef __cplusplus
 extern "C" {

@@ -138,8 +140,8 @@ extern "C" {
     LLAMA_API size_t llama_set_state_data(struct llama_context * ctx, const uint8_t * src);

     // Save/load session file
-    LLAMA_API size_t llama_load_session_file(struct llama_context * ctx, const char * path_session, llama_token * tokens_out, size_t n_token_capacity, size_t * n_token_count_out);
-    LLAMA_API size_t llama_save_session_file(struct llama_context * ctx, const char * path_session, const llama_token * tokens, size_t n_token_count);
+    LLAMA_API bool llama_load_session_file(struct llama_context * ctx, const char * path_session, llama_token * tokens_out, size_t n_token_capacity, size_t * n_token_count_out);
+    LLAMA_API bool llama_save_session_file(struct llama_context * ctx, const char * path_session, const llama_token * tokens, size_t n_token_count);

     // Run the llama inference to obtain the logits and probabilities for the next token.
     // tokens + n_tokens is the provided batch of new tokens to process
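A note on the magic constants: replacing the hex literals with multi-character literals such as 'ggjt' relies on implementation-defined behavior, but the common compilers used to build llama.cpp (GCC, Clang, MSVC) all pack the four characters most-significant-byte first, so the numeric values are unchanged. A quick standalone check (illustrative; expect -Wmultichar warnings):

    #include <cstdio>

    int main() {
        // Multi-character literals have type int; on the compilers above they
        // pack bytes MSB-first, matching the old hex constants.
        printf("'ggjt' = 0x%08x\n", (unsigned) 'ggjt'); // expected 0x67676a74
        printf("'ggsn' = 0x%08x\n", (unsigned) 'ggsn'); // expected 0x6767736e
        return 0;
    }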
