@@ -43,6 +43,7 @@ struct whisper_params {
43
43
44
44
bool speed_up = false ;
45
45
bool translate = false ;
46
+ bool no_fallback = false ;
46
47
bool print_special = false ;
47
48
bool no_context = true ;
48
49
bool no_timestamps = false ;
@@ -73,6 +74,7 @@ bool whisper_params_parse(int argc, char ** argv, whisper_params & params) {
73
74
else if (arg == " -fth" || arg == " --freq-thold" ) { params.freq_thold = std::stof (argv[++i]); }
74
75
else if (arg == " -su" || arg == " --speed-up" ) { params.speed_up = true ; }
75
76
else if (arg == " -tr" || arg == " --translate" ) { params.translate = true ; }
77
+ else if (arg == " -nf" || arg == " --no-fallback" ) { params.no_fallback = true ; }
76
78
else if (arg == " -ps" || arg == " --print-special" ) { params.print_special = true ; }
77
79
else if (arg == " -kc" || arg == " --keep-context" ) { params.no_context = false ; }
78
80
else if (arg == " -l" || arg == " --language" ) { params.language = argv[++i]; }
@@ -94,22 +96,23 @@ void whisper_print_usage(int /*argc*/, char ** argv, const whisper_params & para
94
96
fprintf (stderr, " \n " );
95
97
fprintf (stderr, " options:\n " );
96
98
fprintf (stderr, " -h, --help [default] show this help message and exit\n " );
97
- fprintf (stderr, " -t N, --threads N [%-7d] number of threads to use during computation\n " , params.n_threads );
98
- fprintf (stderr, " --step N [%-7d] audio step size in milliseconds\n " , params.step_ms );
99
- fprintf (stderr, " --length N [%-7d] audio length in milliseconds\n " , params.length_ms );
100
- fprintf (stderr, " --keep N [%-7d] audio to keep from previous step in ms\n " , params.keep_ms );
101
- fprintf (stderr, " -c ID, --capture ID [%-7d] capture device ID\n " , params.capture_id );
102
- fprintf (stderr, " -mt N, --max-tokens N [%-7d] maximum number of tokens per audio chunk\n " , params.max_tokens );
103
- fprintf (stderr, " -ac N, --audio-ctx N [%-7d] audio context size (0 - all)\n " , params.audio_ctx );
104
- fprintf (stderr, " -vth N, --vad-thold N [%-7.2f] voice activity detection threshold\n " , params.vad_thold );
105
- fprintf (stderr, " -fth N, --freq-thold N [%-7.2f] high-pass frequency cutoff\n " , params.freq_thold );
106
- fprintf (stderr, " -su, --speed-up [%-7s] speed up audio by x2 (reduced accuracy)\n " , params.speed_up ? " true" : " false" );
107
- fprintf (stderr, " -tr, --translate [%-7s] translate from source language to english\n " , params.translate ? " true" : " false" );
108
- fprintf (stderr, " -ps, --print-special [%-7s] print special tokens\n " , params.print_special ? " true" : " false" );
109
- fprintf (stderr, " -kc, --keep-context [%-7s] keep context between audio chunks\n " , params.no_context ? " false" : " true" );
110
- fprintf (stderr, " -l LANG, --language LANG [%-7s] spoken language\n " , params.language .c_str ());
111
- fprintf (stderr, " -m FNAME, --model FNAME [%-7s] model path\n " , params.model .c_str ());
112
- fprintf (stderr, " -f FNAME, --file FNAME [%-7s] text output file name\n " , params.fname_out .c_str ());
99
+ fprintf (stderr, " -t N, --threads N [%-7d] number of threads to use during computation\n " , params.n_threads );
100
+ fprintf (stderr, " --step N [%-7d] audio step size in milliseconds\n " , params.step_ms );
101
+ fprintf (stderr, " --length N [%-7d] audio length in milliseconds\n " , params.length_ms );
102
+ fprintf (stderr, " --keep N [%-7d] audio to keep from previous step in ms\n " , params.keep_ms );
103
+ fprintf (stderr, " -c ID, --capture ID [%-7d] capture device ID\n " , params.capture_id );
104
+ fprintf (stderr, " -mt N, --max-tokens N [%-7d] maximum number of tokens per audio chunk\n " , params.max_tokens );
105
+ fprintf (stderr, " -ac N, --audio-ctx N [%-7d] audio context size (0 - all)\n " , params.audio_ctx );
106
+ fprintf (stderr, " -vth N, --vad-thold N [%-7.2f] voice activity detection threshold\n " , params.vad_thold );
107
+ fprintf (stderr, " -fth N, --freq-thold N [%-7.2f] high-pass frequency cutoff\n " , params.freq_thold );
108
+ fprintf (stderr, " -su, --speed-up [%-7s] speed up audio by x2 (reduced accuracy)\n " , params.speed_up ? " true" : " false" );
109
+ fprintf (stderr, " -tr, --translate [%-7s] translate from source language to english\n " , params.translate ? " true" : " false" );
110
+ fprintf (stderr, " -nf, --no-fallback [%-7s] do not use temperature fallback while decoding\n " , params.no_fallback ? " true" : " false" );
111
+ fprintf (stderr, " -ps, --print-special [%-7s] print special tokens\n " , params.print_special ? " true" : " false" );
112
+ fprintf (stderr, " -kc, --keep-context [%-7s] keep context between audio chunks\n " , params.no_context ? " false" : " true" );
113
+ fprintf (stderr, " -l LANG, --language LANG [%-7s] spoken language\n " , params.language .c_str ());
114
+ fprintf (stderr, " -m FNAME, --model FNAME [%-7s] model path\n " , params.model .c_str ());
115
+ fprintf (stderr, " -f FNAME, --file FNAME [%-7s] text output file name\n " , params.fname_out .c_str ());
113
116
fprintf (stderr, " \n " );
114
117
}
115
118
@@ -297,7 +300,8 @@ int main(int argc, char ** argv) {
297
300
wparams.speed_up = params.speed_up ;
298
301
299
302
// disable temperature fallback only when requested (-nf / --no-fallback); otherwise keep the existing temperature increment
300
- wparams.temperature_inc = -1 .0f ;
303
+ // wparams.temperature_inc = -1.0f;
304
+ wparams.temperature_inc = params.no_fallback ? 0 .0f : wparams.temperature_inc ;
301
305
302
306
wparams.prompt_tokens = params.no_context ? nullptr : prompt_tokens.data ();
303
307
wparams.prompt_n_tokens = params.no_context ? 0 : prompt_tokens.size ();
0 commit comments