From 0b2553e4f2220c381c0f12a8bdc5d989c0aa44b8 Mon Sep 17 00:00:00 2001
From: mudler
Date: Tue, 5 Sep 2023 19:01:58 +0200
Subject: [PATCH 1/2] feat(speculative-sampling): add grammar support

Signed-off-by: mudler
---
 binding.cpp | 78 ++++++++++++++++++++++++++++++++++++++++++++---------
 llama.cpp   |  2 +-
 2 files changed, 67 insertions(+), 13 deletions(-)

diff --git a/binding.cpp b/binding.cpp
index 2cec6a5..608f7de 100644
--- a/binding.cpp
+++ b/binding.cpp
@@ -619,14 +619,32 @@ int speculative_sampling(void* params_ptr, void* target_model, void* draft_model
     // used to determine end of generation
     bool has_eos = false;
 
+    // grammar stuff
+    struct llama_grammar * grammar_dft = NULL;
+    struct llama_grammar * grammar_tgt = NULL;
+
+    grammar_parser::parse_state parsed_grammar;
+
+    // if requested - load the grammar, error checking is omitted for brevity
+    if (!params.grammar.empty()) {
+        parsed_grammar = grammar_parser::parse(params.grammar.c_str());
+        // will be empty (default) if there are parse errors
+        if (parsed_grammar.rules.empty()) {
+            return 1;
+        }
+
+        std::vector<const llama_grammar_element *> grammar_rules(parsed_grammar.c_rules());
+        grammar_tgt = llama_grammar_init(grammar_rules.data(), grammar_rules.size(), parsed_grammar.symbol_ids.at("root"));
+    }
+
     const auto t_dec_start = ggml_time_us();
 
     while (true) {
-        // sample from the drafted tokens if any
         int i_dft = 0;
 
         while (true) {
-            const llama_token id = llama_sample_token(ctx_tgt, NULL, NULL, params, last_tokens, candidates, i_dft);
-
+            // sample from the target model
+            const llama_token id = llama_sample_token(ctx_tgt, NULL, grammar_tgt, params, last_tokens, candidates, i_dft);
+            // remember which tokens were sampled - used for repetition penalties during sampling
             last_tokens.erase(last_tokens.begin());
             last_tokens.push_back(id);
@@ -644,6 +662,7 @@ int speculative_sampling(void* params_ptr, void* target_model, void* draft_model
 
             ++n_predict;
 
+            // check if the draft matches the target
             if (i_dft < (int) drafted.size() && id == drafted[i_dft]) {
                 LOG("drafted token %d accepted\n", id);
                 ++n_accept;
@@ -654,6 +673,13 @@ int speculative_sampling(void* params_ptr, void* target_model, void* draft_model
                 continue;
             }
 
+            if (i_dft < (int) drafted.size()) {
+                LOG("the %dth drafted token (%d, '%s') does not match the sampled target token (%d, '%s') - rejected\n",
+                    i_dft, drafted[i_dft], llama_token_to_piece(ctx_dft, drafted[i_dft]).c_str(), id, token_str.c_str());
+            } else {
+                LOG("out of drafted tokens\n");
+            }
+
             // the drafted token was rejected or we are out of drafted tokens
             llama_eval(ctx_dft, &id, 1, n_past_dft, params.n_threads);
             ++n_past_dft;
@@ -668,7 +694,16 @@ int speculative_sampling(void* params_ptr, void* target_model, void* draft_model
             break;
         }
 
-        // sample n_draft tokens from the draft model picking the best token
+        if (grammar_tgt) {
+            if (grammar_dft) {
+                llama_grammar_free(grammar_dft);
+            }
+            grammar_dft = llama_grammar_copy(grammar_tgt);
+
+            LOG("copied target grammar to draft grammar\n");
+        }
+
+        // sample n_draft tokens from the draft model using greedy decoding
        int n_past_cur = n_past_dft;
         for (int i = 0; i < n_draft; ++i) {
             float * logits = llama_get_logits(ctx_dft);
@@ -680,6 +715,10 @@ int speculative_sampling(void* params_ptr, void* target_model, void* draft_model
 
             llama_token_data_array cur_p = { candidates.data(), candidates.size(), false };
 
+            if (grammar_dft != NULL) {
+                llama_sample_grammar(ctx_dft, &cur_p, grammar_dft);
+            }
+
             // computes softmax and sorts the candidates
             llama_sample_softmax(ctx_dft, &cur_p);
 
@@ -687,25 +726,37 @@ int speculative_sampling(void* params_ptr, void* target_model, void* draft_model
                 LOG(" - draft candidate %d: %d (%.3f)\n", i, cur_p.data[i].id, cur_p.data[i].p);
             }
 
-            // too low probability, stop drafting
+            // TODO: better logic?
             if (cur_p.data[0].p < 2*cur_p.data[1].p) {
+                LOG("stopping drafting, probability too low: %.3f < 2*%.3f\n", cur_p.data[0].p, cur_p.data[1].p);
                 break;
             }
 
-            drafted.push_back(cur_p.data[0].id);
+            // drafted token
+            const llama_token id = cur_p.data[0].id;
+
+            drafted.push_back(id);
             ++n_drafted;
 
-            if (i < n_draft - 1) {
-                // evaluate the drafted token on the draft model
-                llama_eval(ctx_dft, &drafted.back(), 1, n_past_cur, params.n_threads);
-                ++n_past_cur;
+            // no need to evaluate the last drafted token, since we won't use the result
+            if (i == n_draft - 1) {
+                break;
+            }
+
+            // evaluate the drafted token on the draft model
+            llama_eval(ctx_dft, &drafted.back(), 1, n_past_cur, params.n_threads);
+            ++n_past_cur;
+
+            if (grammar_dft != NULL) {
+                llama_grammar_accept_token(ctx_dft, grammar_dft, id);
             }
         }
 
         // evaluate the target model on the drafted tokens
         llama_eval(ctx_tgt, drafted.data(), drafted.size(), n_past_tgt, params.n_threads);
         ++n_past_tgt;
-
+
+        // the first token is always proposed by the target model before the speculation loop
         drafted.erase(drafted.begin());
     }
     if (debug) {
@@ -732,7 +783,10 @@ int speculative_sampling(void* params_ptr, void* target_model, void* draft_model
 
         fprintf(stderr, "\n\n");
     }
-
+    if (grammar_dft != NULL) {
+        llama_grammar_free(grammar_dft);
+        llama_grammar_free(grammar_tgt);
+    }
     strcpy(result, res.c_str());
     return 0;
 }
diff --git a/llama.cpp b/llama.cpp
index e4386f4..9217721 160000
--- a/llama.cpp
+++ b/llama.cpp
@@ -1 +1 @@
-Subproject commit e4386f417faf894f6706eec005e24d142b577fcb
+Subproject commit 921772104ba2219bfdc2b2980d05ebc0aa0c92a4

From 8b78c857a9da8a2b0268c052896007fe6748d0ac Mon Sep 17 00:00:00 2001
From: mudler
Date: Tue, 5 Sep 2023 19:27:21 +0200
Subject: [PATCH 2/2] ci: Explicitly disable Metal on non-metal tests

Signed-off-by: mudler
---
 .github/workflows/test.yaml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/.github/workflows/test.yaml b/.github/workflows/test.yaml
index 770d767..54305a6 100644
--- a/.github/workflows/test.yaml
+++ b/.github/workflows/test.yaml
@@ -46,7 +46,7 @@ jobs:
         run: go version
       - name: Test
         run: |
-          CMAKE_ARGS="-DLLAMA_F16C=OFF -DLLAMA_AVX512=OFF -DLLAMA_AVX2=OFF -DLLAMA_FMA=OFF" make test
+          CMAKE_ARGS="-DLLAMA_METAL=OFF -DLLAMA_F16C=OFF -DLLAMA_AVX512=OFF -DLLAMA_AVX2=OFF -DLLAMA_FMA=OFF" make test
 
   macOS-metal-latest:
     runs-on: macOS-latest
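
For review purposes, the snippet below condenses the drafting side of the grammar support added in patch 1/2 into one standalone function. It is an illustrative sketch, not code from the patch: the helper name draft_tokens and its parameters are hypothetical, and it assumes the September-2023 llama.cpp C API, mostly the same calls the patch itself uses (llama_eval, llama_get_logits, llama_sample_grammar, llama_sample_softmax, llama_grammar_copy, llama_grammar_accept_token, llama_grammar_free); llama_n_vocab is the one call assumed from that API that does not appear in the diff.

// Illustrative sketch (not part of the patch): grammar-constrained greedy drafting.
#include <vector>

#include "llama.h"

static std::vector<llama_token> draft_tokens(llama_context * ctx_dft, llama_grammar * grammar_tgt,
                                             int n_draft, int n_past, int n_threads) {
    std::vector<llama_token> drafted;

    // drafting works on a copy, so rejected drafts never advance the target grammar
    struct llama_grammar * grammar_dft = grammar_tgt ? llama_grammar_copy(grammar_tgt) : NULL;

    const int n_vocab = llama_n_vocab(ctx_dft);

    for (int i = 0; i < n_draft; ++i) {
        // build the candidate list from the draft model's logits at the current position
        const float * logits = llama_get_logits(ctx_dft);

        std::vector<llama_token_data> candidates;
        candidates.reserve(n_vocab);
        for (llama_token token_id = 0; token_id < n_vocab; ++token_id) {
            candidates.push_back({ token_id, logits[token_id], 0.0f });
        }
        llama_token_data_array cur_p = { candidates.data(), candidates.size(), false };

        // mask out candidates the grammar cannot accept at this position
        if (grammar_dft != NULL) {
            llama_sample_grammar(ctx_dft, &cur_p, grammar_dft);
        }
        llama_sample_softmax(ctx_dft, &cur_p);

        // greedy draft: stop when the top candidate is not clearly dominant
        if (cur_p.size < 2 || cur_p.data[0].p < 2*cur_p.data[1].p) {
            break;
        }

        const llama_token id = cur_p.data[0].id;
        drafted.push_back(id);

        // the last drafted token is never evaluated - its continuation is not needed
        if (i == n_draft - 1) {
            break;
        }

        // advance the draft model and its grammar past the token we just committed to
        llama_eval(ctx_dft, &id, 1, n_past + i, n_threads);
        if (grammar_dft != NULL) {
            llama_grammar_accept_token(ctx_dft, grammar_dft, id);
        }
    }

    if (grammar_dft != NULL) {
        llama_grammar_free(grammar_dft);
    }
    return drafted;
}

The design point mirrored here is the one the patch relies on: the draft model always samples against a copy of the target grammar, so draft tokens that the target model later rejects never advance the grammar state that constrains target sampling.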