Skip to content

Commit

Permalink
whisper : add grammar-based sampling (ggerganov#1229)
Browse files Browse the repository at this point in the history
* whisper : add grammar-based sampling

* build : fix after master merge

* command : fix exception when recognizing the command

* whisper : fine-tuning grammar functionality

* command : grammar-related improvements

- option to read grammar from file
- add sample grammars for colors and chess moves
- fine-tune the performance further

* grammars : add assistant + update comments

* command : enable beam-search, add "no_timestamps", add "context", add p

* whisper : remove comment

---------

Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>
  • Loading branch information
ejones and ggerganov committed Nov 13, 2023
1 parent 0bfcd89 commit e29dd12
Show file tree
Hide file tree
Showing 10 changed files with 1,289 additions and 69 deletions.
4 changes: 2 additions & 2 deletions Makefile
Original file line number Diff line number Diff line change
Expand Up @@ -362,8 +362,8 @@ quantize: examples/quantize/quantize.cpp $(WHISPER_OBJ) $(SRC_COMMON)
stream: examples/stream/stream.cpp $(SRC_COMMON) $(SRC_COMMON_SDL) $(WHISPER_OBJ)
$(CXX) $(CXXFLAGS) examples/stream/stream.cpp $(SRC_COMMON) $(SRC_COMMON_SDL) $(WHISPER_OBJ) -o stream $(CC_SDL) $(LDFLAGS)

command: examples/command/command.cpp $(SRC_COMMON) $(SRC_COMMON_SDL) $(WHISPER_OBJ)
$(CXX) $(CXXFLAGS) examples/command/command.cpp $(SRC_COMMON) $(SRC_COMMON_SDL) $(WHISPER_OBJ) -o command $(CC_SDL) $(LDFLAGS)
command: examples/command/command.cpp examples/grammar-parser.cpp $(SRC_COMMON) $(SRC_COMMON_SDL) $(WHISPER_OBJ)
$(CXX) $(CXXFLAGS) examples/command/command.cpp examples/grammar-parser.cpp $(SRC_COMMON) $(SRC_COMMON_SDL) $(WHISPER_OBJ) -o command $(CC_SDL) $(LDFLAGS)

lsp: examples/lsp/lsp.cpp $(SRC_COMMON) $(SRC_COMMON_SDL) $(WHISPER_OBJ)
$(CXX) $(CXXFLAGS) examples/lsp/lsp.cpp $(SRC_COMMON) $(SRC_COMMON_SDL) $(WHISPER_OBJ) -o lsp $(CC_SDL) $(LDFLAGS)
Expand Down
1 change: 1 addition & 0 deletions examples/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -23,6 +23,7 @@ add_library(${TARGET} STATIC
common.cpp
common-ggml.h
common-ggml.cpp
grammar-parser.cpp
)

include(DefaultTargetOptions)
Expand Down
176 changes: 140 additions & 36 deletions examples/command/command.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,7 @@
#include "common-sdl.h"
#include "common.h"
#include "whisper.h"
#include "grammar-parser.h"

#include <sstream>
#include <cassert>
Expand All @@ -21,6 +22,11 @@
#include <vector>
#include <map>

// Check whether a file can be opened for reading.
//
// fname: path of the file to probe.
//
// Returns true when an input stream opened on `fname` is in a good state
// right after construction, false otherwise (e.g. the path does not exist
// or cannot be opened). The stream is closed again before returning.
bool file_exists(const std::string & fname) {
    // a temporary stream is enough - we only care whether the open succeeded
    return std::ifstream(fname.c_str()).good();
}

// command-line parameters
struct whisper_params {
int32_t n_threads = std::min(4, (int32_t) std::thread::hardware_concurrency());
Expand All @@ -30,8 +36,12 @@ struct whisper_params {
int32_t max_tokens = 32;
int32_t audio_ctx = 0;

float vad_thold = 0.6f;
float freq_thold = 100.0f;
float vad_thold = 0.6f;
float freq_thold = 100.0f;

float grammar_penalty = 100.0f;

grammar_parser::parse_state grammar_parsed;

bool speed_up = false;
bool translate = false;
Expand All @@ -45,6 +55,8 @@ struct whisper_params {
std::string fname_out;
std::string commands;
std::string prompt;
std::string context;
std::string grammar;
};

void whisper_print_usage(int argc, char ** argv, const whisper_params & params);
Expand Down Expand Up @@ -75,6 +87,9 @@ bool whisper_params_parse(int argc, char ** argv, whisper_params & params) {
else if (arg == "-f" || arg == "--file") { params.fname_out = argv[++i]; }
else if (arg == "-cmd" || arg == "--commands") { params.commands = argv[++i]; }
else if (arg == "-p" || arg == "--prompt") { params.prompt = argv[++i]; }
else if (arg == "-ctx" || arg == "--context") { params.context = argv[++i]; }
else if ( arg == "--grammar") { params.grammar = argv[++i]; }
else if ( arg == "--grammar-penalty") { params.grammar_penalty = std::stof(argv[++i]); }
else {
fprintf(stderr, "error: unknown argument: %s\n", arg.c_str());
whisper_print_usage(argc, argv, params);
Expand Down Expand Up @@ -109,36 +124,72 @@ void whisper_print_usage(int /*argc*/, char ** argv, const whisper_params & para
fprintf(stderr, " -f FNAME, --file FNAME [%-7s] text output file name\n", params.fname_out.c_str());
fprintf(stderr, " -cmd FNAME, --commands FNAME [%-7s] text file with allowed commands\n", params.commands.c_str());
fprintf(stderr, " -p, --prompt [%-7s] the required activation prompt\n", params.prompt.c_str());
fprintf(stderr, " -ctx, --context [%-7s] sample text to help the transcription\n", params.context.c_str());
fprintf(stderr, " --grammar GRAMMAR [%-7s] GBNF grammar to guide decoding\n", params.grammar.c_str());
fprintf(stderr, " --grammar-penalty N [%-7.1f] scales down logits of nongrammar tokens\n", params.grammar_penalty);
fprintf(stderr, "\n");
}

std::string transcribe(whisper_context * ctx, const whisper_params & params, const std::vector<float> & pcmf32, float & prob, int64_t & t_ms) {
std::string transcribe(
whisper_context * ctx,
const whisper_params & params,
const std::vector<float> & pcmf32,
const std::string & grammar_rule,
float & logprob_min,
float & logprob_sum,
int & n_tokens,
int64_t & t_ms) {
const auto t_start = std::chrono::high_resolution_clock::now();

prob = 0.0f;
logprob_min = 0.0f;
logprob_sum = 0.0f;
n_tokens = 0;
t_ms = 0;

whisper_full_params wparams = whisper_full_default_params(WHISPER_SAMPLING_GREEDY);
//whisper_full_params wparams = whisper_full_default_params(WHISPER_SAMPLING_GREEDY);
whisper_full_params wparams = whisper_full_default_params(WHISPER_SAMPLING_BEAM_SEARCH);

wparams.print_progress = false;
wparams.print_special = params.print_special;
wparams.print_realtime = false;
wparams.print_timestamps = !params.no_timestamps;
wparams.translate = params.translate;
wparams.no_context = true;
wparams.no_timestamps = params.no_timestamps;
wparams.single_segment = true;
wparams.max_tokens = params.max_tokens;
wparams.language = params.language.c_str();
wparams.n_threads = params.n_threads;

wparams.audio_ctx = params.audio_ctx;
wparams.speed_up = params.speed_up;
wparams.audio_ctx = params.audio_ctx;
wparams.speed_up = params.speed_up;

wparams.temperature = 0.4f;
wparams.temperature_inc = 1.0f;
wparams.greedy.best_of = 5;

wparams.beam_search.beam_size = 5;

wparams.initial_prompt = params.context.data();

const auto & grammar_parsed = params.grammar_parsed;
auto grammar_rules = grammar_parsed.c_rules();

if (!params.grammar_parsed.rules.empty() && !grammar_rule.empty()) {
if (grammar_parsed.symbol_ids.find(grammar_rule) == grammar_parsed.symbol_ids.end()) {
fprintf(stderr, "%s: warning: grammar rule '%s' not found - skipping grammar sampling\n", __func__, grammar_rule.c_str());
} else {
wparams.grammar_rules = grammar_rules.data();
wparams.n_grammar_rules = grammar_rules.size();
wparams.i_start_rule = grammar_parsed.symbol_ids.at(grammar_rule);
wparams.grammar_penalty = params.grammar_penalty;
}
}

if (whisper_full(ctx, wparams, pcmf32.data(), pcmf32.size()) != 0) {
return "";
}

int prob_n = 0;
std::string result;

const int n_segments = whisper_full_n_segments(ctx);
Expand All @@ -147,19 +198,17 @@ std::string transcribe(whisper_context * ctx, const whisper_params & params, con

result += text;

const int n_tokens = whisper_full_n_tokens(ctx, i);
for (int j = 0; j < n_tokens; ++j) {
const int n = whisper_full_n_tokens(ctx, i);
for (int j = 0; j < n; ++j) {
const auto token = whisper_full_get_token_data(ctx, i, j);

prob += token.p;
++prob_n;
if(token.plog > 0.0f) exit(0);
logprob_min = std::min(logprob_min, token.plog);
logprob_sum += token.plog;
++n_tokens;
}
}

if (prob_n > 0) {
prob /= prob_n;
}

const auto t_end = std::chrono::high_resolution_clock::now();
t_ms = std::chrono::duration_cast<std::chrono::milliseconds>(t_end - t_start).count();

Expand Down Expand Up @@ -250,7 +299,7 @@ int process_command_list(struct whisper_context * ctx, audio_async &audio, const
fprintf(stderr, " ]\n");
}

std::string k_prompt = "select one from the available words: ";
std::string k_prompt = "select one from the available words: ";
for (int i = 0; i < (int) allowed_commands.size(); ++i) {
if (i > 0) {
k_prompt += ", ";
Expand Down Expand Up @@ -418,7 +467,9 @@ int always_prompt_transcription(struct whisper_context * ctx, audio_async & audi
bool is_running = true;
bool ask_prompt = true;

float prob = 0.0f;
float logprob_min = 0.0f;
float logprob_sum = 0.0f;
int n_tokens = 0;

std::vector<float> pcmf32_cur;

Expand Down Expand Up @@ -456,7 +507,7 @@ int always_prompt_transcription(struct whisper_context * ctx, audio_async & audi
// detect the commands
audio.get(params.command_ms, pcmf32_cur);

const auto txt = ::trim(::transcribe(ctx, params, pcmf32_cur, prob, t_ms));
const auto txt = ::trim(::transcribe(ctx, params, pcmf32_cur, "", logprob_min, logprob_sum, n_tokens, t_ms));

const auto words = get_words(txt);

Expand Down Expand Up @@ -492,18 +543,27 @@ int always_prompt_transcription(struct whisper_context * ctx, audio_async & audi

// general-purpose mode
// freely transcribe the voice into text
int process_general_transcription(struct whisper_context * ctx, audio_async &audio, const whisper_params &params) {
int process_general_transcription(struct whisper_context * ctx, audio_async & audio, const whisper_params & params) {
bool is_running = true;
bool have_prompt = false;
bool ask_prompt = true;

float prob0 = 0.0f;
float prob = 0.0f;
float logprob_min0 = 0.0f;
float logprob_min = 0.0f;

float logprob_sum0 = 0.0f;
float logprob_sum = 0.0f;

int n_tokens0 = 0;
int n_tokens = 0;

std::vector<float> pcmf32_cur;
std::vector<float> pcmf32_prompt;

const std::string k_prompt = "Ok Whisper, start listening for commands.";
std::string k_prompt = "Ok Whisper, start listening for commands.";
if (!params.prompt.empty()) {
k_prompt = params.prompt;
}

fprintf(stderr, "\n");
fprintf(stderr, "%s: general-purpose mode\n", __func__);
Expand Down Expand Up @@ -536,9 +596,11 @@ int process_general_transcription(struct whisper_context * ctx, audio_async &aud
// wait for activation phrase
audio.get(params.prompt_ms, pcmf32_cur);

const auto txt = ::trim(::transcribe(ctx, params, pcmf32_cur, prob0, t_ms));
const auto txt = ::trim(::transcribe(ctx, params, pcmf32_cur, "prompt", logprob_min0, logprob_sum0, n_tokens0, t_ms));

fprintf(stdout, "%s: Heard '%s%s%s', (t = %d ms)\n", __func__, "\033[1m", txt.c_str(), "\033[0m", (int) t_ms);
const float p = 100.0f * std::exp(logprob_min0);

fprintf(stdout, "%s: Heard '%s%s%s', (t = %d ms, p = %.2f%%)\n", __func__, "\033[1m", txt.c_str(), "\033[0m", (int) t_ms, p);

const float sim = similarity(txt, k_prompt);

Expand All @@ -559,19 +621,30 @@ int process_general_transcription(struct whisper_context * ctx, audio_async &aud
// we have heard the activation phrase, now detect the commands
audio.get(params.command_ms, pcmf32_cur);

//printf("len prompt: %.4f\n", pcmf32_prompt.size() / (float) WHISPER_SAMPLE_RATE);
//printf("len command: %.4f\n", pcmf32_cur.size() / (float) WHISPER_SAMPLE_RATE);

// prepend 3 seconds of silence
pcmf32_cur.insert(pcmf32_cur.begin(), 3.0f*WHISPER_SAMPLE_RATE, 0.0f);

// prepend the prompt audio
pcmf32_cur.insert(pcmf32_cur.begin(), pcmf32_prompt.begin(), pcmf32_prompt.end());

const auto txt = ::trim(::transcribe(ctx, params, pcmf32_cur, prob, t_ms));
const auto txt = ::trim(::transcribe(ctx, params, pcmf32_cur, "root", logprob_min, logprob_sum, n_tokens, t_ms));

prob = 100.0f*(prob - prob0);
//const float p = 100.0f * std::exp((logprob - logprob0) / (n_tokens - n_tokens0));
const float p = 100.0f * std::exp(logprob_min);

//fprintf(stdout, "%s: heard '%s'\n", __func__, txt.c_str());

// find the prompt in the text
float best_sim = 0.0f;
size_t best_len = 0;
for (int n = 0.8*k_prompt.size(); n <= 1.2*k_prompt.size(); ++n) {
for (size_t n = 0.8*k_prompt.size(); n <= 1.2*k_prompt.size(); ++n) {
if (n >= txt.size()) {
break;
}

const auto prompt = txt.substr(0, n);

const float sim = similarity(prompt, k_prompt);
Expand All @@ -584,9 +657,16 @@ int process_general_transcription(struct whisper_context * ctx, audio_async &aud
}
}

const std::string command = ::trim(txt.substr(best_len));
fprintf(stdout, "%s: DEBUG: txt = '%s', prob = %.2f%%\n", __func__, txt.c_str(), p);
if (best_len == 0) {
fprintf(stdout, "%s: WARNING: command not recognized, try again\n", __func__);
} else {
// cut the prompt from the decoded text
const std::string command = ::trim(txt.substr(best_len));

fprintf(stdout, "%s: Command '%s%s%s', (t = %d ms)\n", __func__, "\033[1m", command.c_str(), "\033[0m", (int) t_ms);
}

fprintf(stdout, "%s: Command '%s%s%s', (t = %d ms)\n", __func__, "\033[1m", command.c_str(), "\033[0m", (int) t_ms);
fprintf(stdout, "\n");
}

Expand Down Expand Up @@ -654,12 +734,36 @@ int main(int argc, char ** argv) {

int ret_val = 0;

if (!params.commands.empty()) {
ret_val = process_command_list(ctx, audio, params);
} else if (!params.prompt.empty()) {
ret_val = always_prompt_transcription(ctx, audio, params);
} else {
ret_val = process_general_transcription(ctx, audio, params);
if (!params.grammar.empty()) {
auto & grammar = params.grammar_parsed;
if (file_exists(params.grammar.c_str())) {
// read grammar from file
std::ifstream ifs(params.grammar.c_str());
const std::string txt = std::string((std::istreambuf_iterator<char>(ifs)), std::istreambuf_iterator<char>());
grammar = grammar_parser::parse(txt.c_str());
} else {
// read grammar from string
grammar = grammar_parser::parse(params.grammar.c_str());
}

// will be empty (default) if there are parse errors
if (grammar.rules.empty()) {
ret_val = 1;
} else {
fprintf(stderr, "%s: grammar:\n", __func__);
grammar_parser::print_grammar(stderr, grammar);
fprintf(stderr, "\n");
}
}

if (ret_val == 0) {
if (!params.commands.empty()) {
ret_val = process_command_list(ctx, audio, params);
} else if (!params.prompt.empty() && params.grammar_parsed.rules.empty()) {
ret_val = always_prompt_transcription(ctx, audio, params);
} else {
ret_val = process_general_transcription(ctx, audio, params);
}
}

audio.pause();
Expand Down
Loading

0 comments on commit e29dd12

Please sign in to comment.