From aff5a9ca77ac3bef79c60244186be1625220aa42 Mon Sep 17 00:00:00 2001 From: Emmanuel Schmidbauer Date: Fri, 12 Apr 2024 08:39:41 -0400 Subject: [PATCH 1/2] server.cpp: add dtw --- examples/server/server.cpp | 39 ++++++++++++++++++++++++++++++++++++++ 1 file changed, 39 insertions(+) diff --git a/examples/server/server.cpp b/examples/server/server.cpp index a11d0e78d15..425ab05743c 100644 --- a/examples/server/server.cpp +++ b/examples/server/server.cpp @@ -87,6 +87,8 @@ struct whisper_params { std::string tdrz_speaker_turn = " [SPEAKER_TURN]"; // TODO: set from command line std::string openvino_encode_device = "CPU"; + + std::string dtw = ""; }; void whisper_print_usage(int /*argc*/, char ** argv, const whisper_params & params, const server_params& sparams) { @@ -126,6 +128,7 @@ void whisper_print_usage(int /*argc*/, char ** argv, const whisper_params & para fprintf(stderr, " -m FNAME, --model FNAME [%-7s] model path\n", params.model.c_str()); fprintf(stderr, " -oved D, --ov-e-device DNAME [%-7s] the OpenVINO device used for encode inference\n", params.openvino_encode_device.c_str()); // server params + fprintf(stderr, " -dtw MODEL --dtw MODEL [%-7s] compute token-level timestamps\n", params.dtw.c_str()); fprintf(stderr, " --host HOST, [%-7s] Hostname/ip-adress for the server\n", sparams.hostname.c_str()); fprintf(stderr, " --port PORT, [%-7d] Port number for the server\n", sparams.port); fprintf(stderr, " --public PATH, [%-7s] Path to the public folder\n", sparams.public_path.c_str()); @@ -173,6 +176,7 @@ bool whisper_params_parse(int argc, char ** argv, whisper_params & params, serve else if ( arg == "--prompt") { params.prompt = argv[++i]; } else if (arg == "-m" || arg == "--model") { params.model = argv[++i]; } else if (arg == "-oved" || arg == "--ov-e-device") { params.openvino_encode_device = argv[++i]; } + else if (arg == "-dtw" || arg == "--dtw") { params.dtw = argv[++i]; } else if (arg == "-ng" || arg == "--no-gpu") { params.use_gpu = false; } // server params else if ( arg == "--port") { sparams.port = std::stoi(argv[++i]); } @@ -499,6 +503,40 @@ int main(int argc, char ** argv) { // whisper init struct whisper_context_params cparams = whisper_context_default_params(); cparams.use_gpu = params.use_gpu; + if (!params.dtw.empty()) + { + cparams.dtw_token_timestamps = true; + cparams.dtw_aheads_preset = WHISPER_AHEADS_NONE; + + if (params.dtw == "tiny") + cparams.dtw_aheads_preset = WHISPER_AHEADS_TINY; + if (params.dtw == "tiny.en") + cparams.dtw_aheads_preset = WHISPER_AHEADS_TINY_EN; + if (params.dtw == "base") + cparams.dtw_aheads_preset = WHISPER_AHEADS_BASE; + if (params.dtw == "base.en") + cparams.dtw_aheads_preset = WHISPER_AHEADS_BASE_EN; + if (params.dtw == "small") + cparams.dtw_aheads_preset = WHISPER_AHEADS_SMALL; + if (params.dtw == "small.en") + cparams.dtw_aheads_preset = WHISPER_AHEADS_SMALL_EN; + if (params.dtw == "medium") + cparams.dtw_aheads_preset = WHISPER_AHEADS_MEDIUM; + if (params.dtw == "medium.en") + cparams.dtw_aheads_preset = WHISPER_AHEADS_MEDIUM_EN; + if (params.dtw == "large.v1") + cparams.dtw_aheads_preset = WHISPER_AHEADS_LARGE_V1; + if (params.dtw == "large.v2") + cparams.dtw_aheads_preset = WHISPER_AHEADS_LARGE_V2; + if (params.dtw == "large.v3") + cparams.dtw_aheads_preset = WHISPER_AHEADS_LARGE_V3; + + if (cparams.dtw_aheads_preset == WHISPER_AHEADS_NONE) + { + fprintf(stderr, "error: unknown DTW preset '%s'\n", params.dtw.c_str()); + return 3; + } + } struct whisper_context * ctx = whisper_init_from_file_with_params(params.model.c_str(), cparams); @@ -865,6 +903,7 @@ int main(int argc, char ** argv) { if (!params.no_timestamps) { word["start"] = token.t0 * 0.01; word["end"] = token.t1 * 0.01; + word["t_dtw"] = token.t_dtw; } word["probability"] = token.p; total_logprob += token.plog; From 02bf9f3f86485eaf1a08f0b17919e1767b509969 Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Mon, 15 Apr 2024 22:16:38 +0300 Subject: [PATCH 2/2] Update examples/server/server.cpp --- examples/server/server.cpp | 39 +++++++++++++++++++++++--------------- 1 file changed, 24 insertions(+), 15 deletions(-) diff --git a/examples/server/server.cpp b/examples/server/server.cpp index 425ab05743c..a98e156c26b 100644 --- a/examples/server/server.cpp +++ b/examples/server/server.cpp @@ -503,36 +503,45 @@ int main(int argc, char ** argv) { // whisper init struct whisper_context_params cparams = whisper_context_default_params(); cparams.use_gpu = params.use_gpu; - if (!params.dtw.empty()) - { + if (!params.dtw.empty()) { cparams.dtw_token_timestamps = true; cparams.dtw_aheads_preset = WHISPER_AHEADS_NONE; - if (params.dtw == "tiny") + if (params.dtw == "tiny") { cparams.dtw_aheads_preset = WHISPER_AHEADS_TINY; - if (params.dtw == "tiny.en") + } + if (params.dtw == "tiny.en") { cparams.dtw_aheads_preset = WHISPER_AHEADS_TINY_EN; - if (params.dtw == "base") + } + if (params.dtw == "base") { cparams.dtw_aheads_preset = WHISPER_AHEADS_BASE; - if (params.dtw == "base.en") + } + if (params.dtw == "base.en") { cparams.dtw_aheads_preset = WHISPER_AHEADS_BASE_EN; - if (params.dtw == "small") + } + if (params.dtw == "small") { cparams.dtw_aheads_preset = WHISPER_AHEADS_SMALL; - if (params.dtw == "small.en") + } + if (params.dtw == "small.en") { cparams.dtw_aheads_preset = WHISPER_AHEADS_SMALL_EN; - if (params.dtw == "medium") + } + if (params.dtw == "medium") { cparams.dtw_aheads_preset = WHISPER_AHEADS_MEDIUM; - if (params.dtw == "medium.en") + } + if (params.dtw == "medium.en") { cparams.dtw_aheads_preset = WHISPER_AHEADS_MEDIUM_EN; - if (params.dtw == "large.v1") + } + if (params.dtw == "large.v1") { cparams.dtw_aheads_preset = WHISPER_AHEADS_LARGE_V1; - if (params.dtw == "large.v2") + } + if (params.dtw == "large.v2") { cparams.dtw_aheads_preset = WHISPER_AHEADS_LARGE_V2; - if (params.dtw == "large.v3") + } + if (params.dtw == "large.v3") { cparams.dtw_aheads_preset = WHISPER_AHEADS_LARGE_V3; + } - if (cparams.dtw_aheads_preset == WHISPER_AHEADS_NONE) - { + if (cparams.dtw_aheads_preset == WHISPER_AHEADS_NONE) { fprintf(stderr, "error: unknown DTW preset '%s'\n", params.dtw.c_str()); return 3; }