Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Remove speed_up and phase_vocoder* functions #2198

Merged
merged 3 commits into from
May 31, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
9 changes: 0 additions & 9 deletions bindings/go/examples/go-whisper/flags.go
Original file line number Diff line number Diff line change
Expand Up @@ -68,10 +68,6 @@ func (flags *Flags) GetOut() string {
return strings.ToLower(flags.Lookup("out").Value.String())
}

func (flags *Flags) IsSpeedup() bool {
return flags.Lookup("speedup").Value.String() == "true"
}

func (flags *Flags) IsTokens() bool {
return flags.Lookup("tokens").Value.String() == "true"
}
Expand Down Expand Up @@ -111,10 +107,6 @@ func (flags *Flags) SetParams(context whisper.Context) error {
fmt.Fprintf(flags.Output(), "Setting duration to %v\n", duration)
context.SetDuration(duration)
}
if flags.IsSpeedup() {
fmt.Fprintf(flags.Output(), "Setting speedup to true\n")
context.SetSpeedup(true)
}
if threads := flags.GetThreads(); threads != 0 {
fmt.Fprintf(flags.Output(), "Setting threads to %d\n", threads)
context.SetThreads(threads)
Expand Down Expand Up @@ -146,7 +138,6 @@ func registerFlags(flag *Flags) {
flag.Duration("offset", 0, "Time offset")
flag.Duration("duration", 0, "Duration of audio to process")
flag.Uint("threads", 0, "Number of threads to use")
flag.Bool("speedup", false, "Enable speedup")
flag.Uint("max-len", 0, "Maximum segment length in characters")
flag.Uint("max-tokens", 0, "Maximum tokens per segment")
flag.Float64("word-thold", 0, "Maximum segment score")
Expand Down
7 changes: 0 additions & 7 deletions bindings/go/params.go
Original file line number Diff line number Diff line change
Expand Up @@ -47,10 +47,6 @@ func (p *Params) SetPrintTimestamps(v bool) {
p.print_timestamps = toBool(v)
}

func (p *Params) SetSpeedup(v bool) {
p.speed_up = toBool(v)
}

// Set language id
func (p *Params) SetLanguage(lang int) error {
if lang == -1 {
Expand Down Expand Up @@ -177,9 +173,6 @@ func (p *Params) String() string {
if p.token_timestamps {
str += " token_timestamps"
}
if p.speed_up {
str += " speed_up"
}

return str + ">"
}
5 changes: 0 additions & 5 deletions bindings/go/pkg/whisper/context.go
Original file line number Diff line number Diff line change
Expand Up @@ -76,11 +76,6 @@ func (context *context) SetTranslate(v bool) {
context.params.SetTranslate(v)
}

// Set speedup flag
func (context *context) SetSpeedup(v bool) {
context.params.SetSpeedup(v)
}

func (context *context) SetSplitOnWord(v bool) {
context.params.SetSplitOnWord(v)
}
Expand Down
1 change: 0 additions & 1 deletion bindings/go/pkg/whisper/interface.go
Original file line number Diff line number Diff line change
Expand Up @@ -41,7 +41,6 @@ type Context interface {
SetOffset(time.Duration) // Set offset
SetDuration(time.Duration) // Set duration
SetThreads(uint) // Set number of threads to use
SetSpeedup(bool) // Set speedup flag
SetSplitOnWord(bool) // Set split on word flag
SetTokenThreshold(float32) // Set timestamp token probability threshold
SetTokenSumThreshold(float32) // Set timestamp token sum probability threshold
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -20,7 +20,7 @@ public interface WhisperCppJnaLibrary extends Library {
* @return Whisper context on success, null on failure
*/
Pointer whisper_init_from_file(String path_model);

/**
* Provides default params which can be used with `whisper_init_from_file_with_params()` etc.
* Because this function allocates memory for the params, the caller must call either:
Expand Down Expand Up @@ -304,14 +304,6 @@ public interface WhisperCppJnaLibrary extends Library {
/** Language id associated with the provided state */
int whisper_full_lang_id_from_state(Pointer state);

/**
* Convert RAW PCM audio to log mel spectrogram but applies a Phase Vocoder to speed up the audio x2.
* The resulting spectrogram is stored inside the default state of the provided whisper context.
* @return 0 on success
*/
int whisper_pcm_to_mel_phase_vocoder(Pointer ctx, final float[] samples, int n_samples, int n_threads);

int whisper_pcm_to_mel_phase_vocoder_with_state(Pointer ctx, Pointer state, final float[] samples, int n_samples, int n_threads);

/** Get the start time of the specified segment. */
long whisper_full_get_segment_t0(Pointer ctx, int i_segment);
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -129,14 +129,6 @@ public void splitOnWord(boolean enable) {
/** Maximum tokens per segment (0, default = no limit) */
public int max_tokens;

/** Flag to speed up the audio by 2x using Phase Vocoder. (default = false) */
public CBool speed_up;

/** Flag to speed up the audio by 2x using Phase Vocoder. (default = false) */
public void speedUp(boolean enable) {
speed_up = enable ? CBool.TRUE : CBool.FALSE;
}

/** Overwrite the audio context size (0 = use default). */
public int audio_ctx;

Expand Down Expand Up @@ -321,7 +313,7 @@ protected List<String> getFieldOrder() {
return Arrays.asList("strategy", "n_threads", "n_max_text_ctx", "offset_ms", "duration_ms", "translate",
"no_context", "single_segment", "no_timestamps",
"print_special", "print_progress", "print_realtime", "print_timestamps", "token_timestamps",
"thold_pt", "thold_ptsum", "max_len", "split_on_word", "max_tokens", "speed_up", "audio_ctx",
"thold_pt", "thold_ptsum", "max_len", "split_on_word", "max_tokens", "audio_ctx",
"tdrz_enable", "suppress_regex", "initial_prompt", "prompt_tokens", "prompt_n_tokens", "language", "detect_language",
"suppress_blank", "suppress_non_speech_tokens", "temperature", "max_initial_ts", "length_penalty",
"temperature_inc", "entropy_thold", "logprob_thold", "no_speech_thold", "greedy", "beam_search",
Expand Down
8 changes: 0 additions & 8 deletions bindings/ruby/ext/ruby_whisper.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -311,12 +311,6 @@ static VALUE ruby_whisper_params_get_split_on_word(VALUE self) {
static VALUE ruby_whisper_params_set_split_on_word(VALUE self, VALUE value) {
BOOL_PARAMS_SETTER(self, split_on_word, value)
}
static VALUE ruby_whisper_params_get_speed_up(VALUE self) {
BOOL_PARAMS_GETTER(self, speed_up)
}
static VALUE ruby_whisper_params_set_speed_up(VALUE self, VALUE value) {
BOOL_PARAMS_SETTER(self, speed_up, value)
}
static VALUE ruby_whisper_params_get_diarize(VALUE self) {
ruby_whisper_params *rwp;
Data_Get_Struct(self, ruby_whisper_params, rwp);
Expand Down Expand Up @@ -408,8 +402,6 @@ void Init_whisper() {
rb_define_method(cParams, "token_timestamps=", ruby_whisper_params_set_token_timestamps, 1);
rb_define_method(cParams, "split_on_word", ruby_whisper_params_get_split_on_word, 0);
rb_define_method(cParams, "split_on_word=", ruby_whisper_params_set_split_on_word, 1);
rb_define_method(cParams, "speed_up", ruby_whisper_params_get_speed_up, 0);
rb_define_method(cParams, "speed_up=", ruby_whisper_params_set_speed_up, 1);
rb_define_method(cParams, "diarize", ruby_whisper_params_get_diarize, 0);
rb_define_method(cParams, "diarize=", ruby_whisper_params_set_diarize, 1);

Expand Down
7 changes: 0 additions & 7 deletions bindings/ruby/tests/test_whisper.rb
Original file line number Diff line number Diff line change
Expand Up @@ -117,13 +117,6 @@ def test_split_on_word
assert !@params.split_on_word
end

def test_speed_up
@params.speed_up = true
assert @params.speed_up
@params.speed_up = false
assert !@params.speed_up
end

def test_whisper
@whisper = Whisper::Context.new(File.join(TOPDIR, '..', '..', 'models', 'ggml-base.en.bin'))
params = Whisper::Params.new
Expand Down
3 changes: 0 additions & 3 deletions examples/addon.node/addon.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -25,7 +25,6 @@ struct whisper_params {
float entropy_thold = 2.4f;
float logprob_thold = -1.0f;

bool speed_up = false;
bool translate = false;
bool diarize = false;
bool output_txt = false;
Expand Down Expand Up @@ -232,8 +231,6 @@ int run(whisper_params &params, std::vector<std::vector<std::string>> &result) {
wparams.max_len = params.output_wts && params.max_len == 0 ? 60 : params.max_len;
wparams.audio_ctx = params.audio_ctx;

wparams.speed_up = params.speed_up;

wparams.greedy.best_of = params.best_of;
wparams.beam_search.beam_size = params.beam_size;

Expand Down
5 changes: 0 additions & 5 deletions examples/command/command.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -38,7 +38,6 @@ struct whisper_params {

grammar_parser::parse_state grammar_parsed;

bool speed_up = false;
bool translate = false;
bool print_special = false;
bool print_energy = false;
Expand Down Expand Up @@ -76,7 +75,6 @@ bool whisper_params_parse(int argc, char ** argv, whisper_params & params) {
else if (arg == "-ac" || arg == "--audio-ctx") { params.audio_ctx = std::stoi(argv[++i]); }
else if (arg == "-vth" || arg == "--vad-thold") { params.vad_thold = std::stof(argv[++i]); }
else if (arg == "-fth" || arg == "--freq-thold") { params.freq_thold = std::stof(argv[++i]); }
else if (arg == "-su" || arg == "--speed-up") { params.speed_up = true; }
else if (arg == "-tr" || arg == "--translate") { params.translate = true; }
else if (arg == "-ps" || arg == "--print-special") { params.print_special = true; }
else if (arg == "-pe" || arg == "--print-energy") { params.print_energy = true; }
Expand Down Expand Up @@ -115,7 +113,6 @@ void whisper_print_usage(int /*argc*/, char ** argv, const whisper_params & para
fprintf(stderr, " -ac N, --audio-ctx N [%-7d] audio context size (0 - all)\n", params.audio_ctx);
fprintf(stderr, " -vth N, --vad-thold N [%-7.2f] voice activity detection threshold\n", params.vad_thold);
fprintf(stderr, " -fth N, --freq-thold N [%-7.2f] high-pass frequency cutoff\n", params.freq_thold);
fprintf(stderr, " -su, --speed-up [%-7s] speed up audio by x2 (reduced accuracy)\n", params.speed_up ? "true" : "false");
fprintf(stderr, " -tr, --translate [%-7s] translate from source language to english\n", params.translate ? "true" : "false");
fprintf(stderr, " -ps, --print-special [%-7s] print special tokens\n", params.print_special ? "true" : "false");
fprintf(stderr, " -pe, --print-energy [%-7s] print sound energy (for debugging)\n", params.print_energy ? "true" : "false");
Expand Down Expand Up @@ -165,7 +162,6 @@ std::string transcribe(
wparams.n_threads = params.n_threads;

wparams.audio_ctx = params.audio_ctx;
wparams.speed_up = params.speed_up;

wparams.temperature = 0.4f;
wparams.temperature_inc = 1.0f;
Expand Down Expand Up @@ -371,7 +367,6 @@ int process_command_list(struct whisper_context * ctx, audio_async &audio, const
wparams.n_threads = params.n_threads;

wparams.audio_ctx = params.audio_ctx;
wparams.speed_up = params.speed_up;

wparams.prompt_tokens = k_tokens.data();
wparams.prompt_n_tokens = k_tokens.size();
Expand Down
2 changes: 1 addition & 1 deletion examples/common.h
Original file line number Diff line number Diff line change
Expand Up @@ -185,7 +185,7 @@ class wav_writer {
// It is assumed that PCM data is normalized to a range from -1 to 1
bool write_audio(const float * data, size_t length) {
for (size_t i = 0; i < length; ++i) {
const int16_t intSample = data[i] * 32767;
const int16_t intSample = int16_t(data[i] * 32767);
file.write(reinterpret_cast<const char *>(&intSample), sizeof(int16_t));
dataSize += sizeof(int16_t);
}
Expand Down
5 changes: 0 additions & 5 deletions examples/lsp/lsp.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -26,7 +26,6 @@ struct whisper_params {
float vad_thold = 0.6f;
float freq_thold = 100.0f;

bool speed_up = false;
bool translate = false;
bool print_special = false;
bool print_energy = false;
Expand Down Expand Up @@ -70,7 +69,6 @@ bool whisper_params_parse(int argc, char ** argv, whisper_params & params) {
else if (arg == "-ac" || arg == "--audio-ctx") { params.audio_ctx = std::stoi(argv[++i]); }
else if (arg == "-vth" || arg == "--vad-thold") { params.vad_thold = std::stof(argv[++i]); }
else if (arg == "-fth" || arg == "--freq-thold") { params.freq_thold = std::stof(argv[++i]); }
else if (arg == "-su" || arg == "--speed-up") { params.speed_up = true; }
else if (arg == "-tr" || arg == "--translate") { params.translate = true; }
else if (arg == "-ps" || arg == "--print-special") { params.print_special = true; }
else if (arg == "-pe" || arg == "--print-energy") { params.print_energy = true; }
Expand Down Expand Up @@ -102,7 +100,6 @@ void whisper_print_usage(int /*argc*/, char ** argv, const whisper_params & para
fprintf(stderr, " -ac N, --audio-ctx N [%-7d] audio context size (0 - all)\n", params.audio_ctx);
fprintf(stderr, " -vth N, --vad-thold N [%-7.2f] voice activity detection threshold\n", params.vad_thold);
fprintf(stderr, " -fth N, --freq-thold N [%-7.2f] high-pass frequency cutoff\n", params.freq_thold);
fprintf(stderr, " -su, --speed-up [%-7s] speed up audio by x2 (reduced accuracy)\n", params.speed_up ? "true" : "false");
fprintf(stderr, " -tr, --translate [%-7s] translate from source language to english\n", params.translate ? "true" : "false");
fprintf(stderr, " -ps, --print-special [%-7s] print special tokens\n", params.print_special ? "true" : "false");
fprintf(stderr, " -pe, --print-energy [%-7s] print sound energy (for debugging)\n", params.print_energy ? "true" : "false");
Expand Down Expand Up @@ -184,7 +181,6 @@ json unguided_transcription(struct whisper_context * ctx, audio_async &audio, js
wparams.n_threads = params.n_threads;

wparams.audio_ctx = params.audio_ctx;
wparams.speed_up = params.speed_up;
wparams.suppress_non_speech_tokens = true;
// run the transformer and a single decoding pass
if (whisper_full(ctx, wparams, pcmf32.data(), pcmf32.size()) != 0) {
Expand Down Expand Up @@ -223,7 +219,6 @@ json guided_transcription(struct whisper_context * ctx, audio_async &audio, cons
wparams.n_threads = params.n_threads;

wparams.audio_ctx = params.audio_ctx;
wparams.speed_up = params.speed_up;

// TODO: Do some time testing. Does an overly long prompt slow down processing?
// Set up command sets/precompute prompts
Expand Down
4 changes: 0 additions & 4 deletions examples/main/main.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -47,7 +47,6 @@ struct whisper_params {
float temperature = 0.0f;
float temperature_inc = 0.2f;

bool speed_up = false;
bool debug_mode = false;
bool translate = false;
bool detect_language = false;
Expand Down Expand Up @@ -138,7 +137,6 @@ bool whisper_params_parse(int argc, char ** argv, whisper_params & params) {
else if (arg == "-lpt" || arg == "--logprob-thold") { params.logprob_thold = std::stof(argv[++i]); }
else if (arg == "-tp" || arg == "--temperature") { params.temperature = std::stof(argv[++i]); }
else if (arg == "-tpi" || arg == "--temperature-inc") { params.temperature_inc = std::stof(argv[++i]); }
// else if (arg == "-su" || arg == "--speed-up") { params.speed_up = true; }
else if (arg == "-debug"|| arg == "--debug-mode") { params.debug_mode = true; }
else if (arg == "-tr" || arg == "--translate") { params.translate = true; }
else if (arg == "-di" || arg == "--diarize") { params.diarize = true; }
Expand Down Expand Up @@ -206,7 +204,6 @@ void whisper_print_usage(int /*argc*/, char ** argv, const whisper_params & para
fprintf(stderr, " -lpt N, --logprob-thold N [%-7.2f] log probability threshold for decoder fail\n", params.logprob_thold);
fprintf(stderr, " -tp, --temperature N [%-7.2f] The sampling temperature, between 0 and 1\n", params.temperature);
fprintf(stderr, " -tpi, --temperature-inc N [%-7.2f] The increment of temperature, between 0 and 1\n",params.temperature_inc);
// fprintf(stderr, " -su, --speed-up [%-7s] speed up audio by x2 (reduced accuracy)\n", params.speed_up ? "true" : "false");
fprintf(stderr, " -debug, --debug-mode [%-7s] enable debug mode (eg. dump log_mel)\n", params.debug_mode ? "true" : "false");
fprintf(stderr, " -tr, --translate [%-7s] translate from source language to english\n", params.translate ? "true" : "false");
fprintf(stderr, " -di, --diarize [%-7s] stereo audio diarization\n", params.diarize ? "true" : "false");
Expand Down Expand Up @@ -1106,7 +1103,6 @@ int main(int argc, char ** argv) {
wparams.split_on_word = params.split_on_word;
wparams.audio_ctx = params.audio_ctx;

wparams.speed_up = params.speed_up;
wparams.debug_mode = params.debug_mode;

wparams.tdrz_enable = params.tinydiarize; // [TDRZ]
Expand Down
4 changes: 0 additions & 4 deletions examples/server/server.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -61,7 +61,6 @@ struct whisper_params {
float temperature = 0.00f;
float temperature_inc = 0.20f;

bool speed_up = false;
bool debug_mode = false;
bool translate = false;
bool detect_language = false;
Expand Down Expand Up @@ -112,7 +111,6 @@ void whisper_print_usage(int /*argc*/, char ** argv, const whisper_params & para
fprintf(stderr, " -wt N, --word-thold N [%-7.2f] word timestamp probability threshold\n", params.word_thold);
fprintf(stderr, " -et N, --entropy-thold N [%-7.2f] entropy threshold for decoder fail\n", params.entropy_thold);
fprintf(stderr, " -lpt N, --logprob-thold N [%-7.2f] log probability threshold for decoder fail\n", params.logprob_thold);
// fprintf(stderr, " -su, --speed-up [%-7s] speed up audio by x2 (reduced accuracy)\n", params.speed_up ? "true" : "false");
fprintf(stderr, " -debug, --debug-mode [%-7s] enable debug mode (eg. dump log_mel)\n", params.debug_mode ? "true" : "false");
fprintf(stderr, " -tr, --translate [%-7s] translate from source language to english\n", params.translate ? "true" : "false");
fprintf(stderr, " -di, --diarize [%-7s] stereo audio diarization\n", params.diarize ? "true" : "false");
Expand Down Expand Up @@ -159,7 +157,6 @@ bool whisper_params_parse(int argc, char ** argv, whisper_params & params, serve
else if (arg == "-wt" || arg == "--word-thold") { params.word_thold = std::stof(argv[++i]); }
else if (arg == "-et" || arg == "--entropy-thold") { params.entropy_thold = std::stof(argv[++i]); }
else if (arg == "-lpt" || arg == "--logprob-thold") { params.logprob_thold = std::stof(argv[++i]); }
// else if (arg == "-su" || arg == "--speed-up") { params.speed_up = true; }
else if (arg == "-debug"|| arg == "--debug-mode") { params.debug_mode = true; }
else if (arg == "-tr" || arg == "--translate") { params.translate = true; }
else if (arg == "-di" || arg == "--diarize") { params.diarize = true; }
Expand Down Expand Up @@ -768,7 +765,6 @@ int main(int argc, char ** argv) {
wparams.split_on_word = params.split_on_word;
wparams.audio_ctx = params.audio_ctx;

wparams.speed_up = params.speed_up;
wparams.debug_mode = params.debug_mode;

wparams.tdrz_enable = params.tinydiarize; // [TDRZ]
Expand Down
Loading
Loading