- 
                Notifications
    You must be signed in to change notification settings 
- Fork 13.4k
Closed
Labels
bug (Something isn't working), good first issue (Good for newcomers), help wanted (Needs help from the community), high priority (Very important issue)
Description
I may be doing something wrong or misunderstanding the purpose of the kv_cache API, but I believe the recent PR #685 by @chrfalch, which added the ability to get / set the kv_cache, is still insufficient to restore the state of the model, even when resetting external model state such as last_n_tokens_data and n_past.
Here is a minimal example
#include "llama.h"
#include <vector>
#include <iostream>
using namespace std;
int main() {
    // init
    auto params = llama_context_default_params();
    auto ctx = llama_init_from_file("../../models/ggml-model.bin", params);
    auto tokens = vector<llama_token>(params.n_ctx);
    auto prompt = "The quick brown fox";
    auto n_tokens = llama_tokenize(ctx, prompt, tokens.data(), tokens.size(), true);
    // evaluate prompt
    llama_eval(ctx, tokens.data(), n_tokens, 0, 12);
    auto last_n_tokens_size = 64;
    auto last_n_tokens_data = vector<llama_token>(last_n_tokens_size, 0);
    last_n_tokens_data.insert(last_n_tokens_data.end(), tokens.data(), tokens.data() + n_tokens);
    auto n_past = n_tokens;
    // save state
    auto kv_cache_size = llama_get_kv_cache_size(ctx);
    auto kv_cache_token_count = llama_get_kv_cache_token_count(ctx);
    auto kv_cache = llama_get_kv_cache(ctx);
    auto kv_cache_copy = vector<uint8_t>(kv_cache, kv_cache + kv_cache_size);
    auto n_past_copy = n_past;
    auto last_n_tokens_data_copy = vector<llama_token>(last_n_tokens_data);
    
    // first run
    cout << prompt;
    for (auto i = 0; i < 6; i++) {
        auto next_token = llama_sample_top_p_top_k(
            ctx,
            last_n_tokens_data.data() + last_n_tokens_data.size() - n_past,
            last_n_tokens_size,
            1,
            1.0,
            0.0,
            1.1
        );
        auto next_token_str = llama_token_to_str(ctx, next_token);
        last_n_tokens_data.push_back(next_token);
        cout << next_token_str;
        llama_eval(ctx, &next_token, 1, n_past, 12);
        n_past += 1;
    }
    cout << endl;
    //
    // restore state
    llama_set_kv_cache(ctx, kv_cache_copy.data(), kv_cache_size, kv_cache_token_count);
    last_n_tokens_data = last_n_tokens_data_copy;
    n_past = n_past_copy;
    //
    // second run
    cout << prompt;
    for (auto i = 0; i < 6; i++) {
        auto next_token = llama_sample_top_p_top_k(
            ctx,
            last_n_tokens_data.data() + last_n_tokens_data.size() - n_past,
            last_n_tokens_size,
            1,
            1.0,
            0.0,
            1.1
        );
        auto next_token_str = llama_token_to_str(ctx, next_token);
        last_n_tokens_data.push_back(next_token);
        cout << next_token_str;
        llama_eval(ctx, &next_token, 1, n_past, 12);
        n_past += 1;
    }
    cout << endl;
    //
    return 0;
}I'd expect the following output
The quick brown fox jumps over the lazy dog
The quick brown fox jumps over the lazy dog
But instead I get
The quick brown fox jumps over the lazy dog
The quick brown fox.
The quick brown fo
This implies that the model is still generating from the end of the first run.
Green-Sky and lin72h
Metadata
Metadata
Assignees
Labels
bug (Something isn't working), good first issue (Good for newcomers), help wanted (Needs help from the community), high priority (Very important issue)