From 231825ad0a76bdc944aa3baccb84416af16aaf0a Mon Sep 17 00:00:00 2001
From: Roy Shilkrot
Date: Sun, 17 Sep 2023 00:29:08 +0300
Subject: [PATCH] add step processing

---
 src/transcription-filter-data.h |   2 +
 src/transcription-filter.cpp    |  26 +++++++-
 src/whisper-processing.cpp      | 112 +++++++++++++++-----------------
 src/whisper-processing.h        |   4 +-
 4 files changed, 79 insertions(+), 65 deletions(-)

diff --git a/src/transcription-filter-data.h b/src/transcription-filter-data.h
index d53af79..7fd10dc 100644
--- a/src/transcription-filter-data.h
+++ b/src/transcription-filter-data.h
@@ -30,6 +30,8 @@ struct transcription_filter_data {
 	size_t overlap_ms;
 	// How many frames were processed in the last whisper frame (this is dynamic)
 	size_t last_num_frames;
+	// Milliseconds per processing step (e.g. rest of the whisper buffer may be filled with silence)
+	size_t step_size_msec;
 
 	/* PCM buffers */
 	float *copy_buffers[MAX_PREPROC_CHANNELS];
diff --git a/src/transcription-filter.cpp b/src/transcription-filter.cpp
index 8d3a4ef..a9b5acd 100644
--- a/src/transcription-filter.cpp
+++ b/src/transcription-filter.cpp
@@ -77,9 +77,6 @@ struct obs_audio_data *transcription_filter_filter_audio(void *data, struct obs_
 	{
 		std::lock_guard lock(*gf->whisper_buf_mutex); // scoped lock
-		obs_log(gf->log_level,
-			"pushing %lu frames to input buffer. current size: %lu (bytes)",
-			(size_t)(audio->frames), gf->input_buffers[0].size);
 		// push back current audio data to input circlebuf
 		for (size_t c = 0; c < gf->channels; c++) {
 			circlebuf_push_back(&gf->input_buffers[c], audio->data[c],
@@ -229,6 +226,9 @@ void transcription_filter_update(void *data, obs_data_t *s)
 	gf->vad_enabled = obs_data_get_bool(s, "vad_enabled");
 	gf->log_words = obs_data_get_bool(s, "log_words");
 	gf->caption_to_stream = obs_data_get_bool(s, "caption_to_stream");
+	bool step_by_step_processing = obs_data_get_bool(s, "step_by_step_processing");
+	gf->step_size_msec = step_by_step_processing ? (int)obs_data_get_int(s, "step_size_msec")
+						     : BUFFER_SIZE_MSEC;
 
 	obs_log(gf->log_level, "transcription_filter: update text source");
 	// update the text source
@@ -382,6 +382,10 @@ void *transcription_filter_create(obs_data_t *settings, obs_source_t *filter)
 	gf->sample_rate = audio_output_get_sample_rate(obs_get_audio());
 	gf->frames = (size_t)((float)gf->sample_rate / (1000.0f / (float)BUFFER_SIZE_MSEC));
 	gf->last_num_frames = 0;
+	bool step_by_step_processing = obs_data_get_bool(settings, "step_by_step_processing");
+	gf->step_size_msec = step_by_step_processing
+				     ? (int)obs_data_get_int(settings, "step_size_msec")
+				     : BUFFER_SIZE_MSEC;
 
 	for (size_t i = 0; i < MAX_AUDIO_CHANNELS; i++) {
 		circlebuf_init(&gf->input_buffers[i]);
@@ -469,6 +473,8 @@ void transcription_filter_defaults(obs_data_t *s)
 	obs_data_set_default_string(s, "whisper_model_path", "models/ggml-tiny.en.bin");
 	obs_data_set_default_string(s, "whisper_language_select", "en");
 	obs_data_set_default_string(s, "subtitle_sources", "none");
+	obs_data_set_default_bool(s, "step_by_step_processing", false);
+	obs_data_set_default_int(s, "step_size_msec", 1000);
 
 	// Whisper parameters
 	obs_data_set_default_int(s, "whisper_sampling_method", WHISPER_SAMPLING_BEAM_SEARCH);
@@ -508,6 +514,20 @@ obs_properties_t *transcription_filter_properties(void *data)
 	obs_property_list_add_int(list, "WARNING", LOG_WARNING);
 	obs_properties_add_bool(ppts, "log_words", "Log output words");
 	obs_properties_add_bool(ppts, "caption_to_stream", "Stream captions");
+	obs_property_t *step_by_step_processing =
+		obs_properties_add_bool(ppts, "step_by_step_processing", "Step-by-step processing");
+	obs_properties_add_int_slider(ppts, "step_size_msec", "Step size (ms)", 500,
+				      BUFFER_SIZE_MSEC, 50);
+
+	obs_property_set_modified_callback(step_by_step_processing, [](obs_properties_t *props,
+								       obs_property_t *property,
+								       obs_data_t *settings) {
+		UNUSED_PARAMETER(property);
+		// Show/Hide the step size input
+		obs_property_set_visible(obs_properties_get(props, "step_size_msec"),
+					 obs_data_get_bool(settings, "step_by_step_processing"));
+		return true;
+	});
 
 	obs_property_t *subs_output =
 		obs_properties_add_list(ppts, "subtitle_sources", "Subtitles Output",
diff --git a/src/whisper-processing.cpp b/src/whisper-processing.cpp
index cf7ba7b..6a6041e 100644
--- a/src/whisper-processing.cpp
+++ b/src/whisper-processing.cpp
@@ -148,7 +148,7 @@ struct DetectionResultWithText run_whisper_inference(struct transcription_filter
 			to_timestamp(t1).c_str(), sentence_p, text_lower.c_str());
 	}
 
-	if (text_lower.empty()) {
+	if (text_lower.empty() || text_lower == ".") {
 		return {DETECTION_RESULT_SILENCE, ""};
 	}
 
@@ -160,68 +160,66 @@ void process_audio_from_buffer(struct transcription_filter_data *gf)
 {
 	uint32_t num_new_frames_from_infos = 0;
 	uint64_t start_timestamp = 0;
+	bool last_step_in_segment = false;
 
 	{
 		// scoped lock the buffer mutex
 		std::lock_guard lock(*gf->whisper_buf_mutex);
 
-		// We need (gf->frames - gf->overlap_frames) new frames to run inference,
-		// except for the first segment, where we need the whole gf->frames frames
-		size_t how_many_frames_needed = gf->frames - gf->overlap_frames;
-		if (gf->last_num_frames == 0) {
-			how_many_frames_needed = gf->frames;
-		}
+		// We need (gf->frames - gf->last_num_frames) new frames for a full segment,
+		const size_t remaining_frames_to_full_segment = gf->frames - gf->last_num_frames;
 
 		// pop infos from the info buffer and mark the beginning timestamp from the first
 		// info as the beginning timestamp of the segment
 		struct transcription_filter_audio_info info_from_buf = {0};
-		while (gf->info_buffer.size >= sizeof(struct transcription_filter_audio_info)) {
-			circlebuf_pop_front(&gf->info_buffer, &info_from_buf,
-					    sizeof(struct transcription_filter_audio_info));
+		const size_t size_of_audio_info = sizeof(struct transcription_filter_audio_info);
+		while (gf->info_buffer.size >= size_of_audio_info) {
+			circlebuf_pop_front(&gf->info_buffer, &info_from_buf, size_of_audio_info);
 			num_new_frames_from_infos += info_from_buf.frames;
 			if (start_timestamp == 0) {
 				start_timestamp = info_from_buf.timestamp;
 			}
-			obs_log(gf->log_level, "popped %d frames from info buffer, %lu needed",
-				num_new_frames_from_infos, how_many_frames_needed);
 			// Check if we're within the needed segment length
-			if (num_new_frames_from_infos > how_many_frames_needed) {
+			if (num_new_frames_from_infos > remaining_frames_to_full_segment) {
 				// too big, push the last info into the buffer's front where it was
 				num_new_frames_from_infos -= info_from_buf.frames;
-				circlebuf_push_front(
-					&gf->info_buffer, &info_from_buf,
-					sizeof(struct transcription_filter_audio_info));
+				circlebuf_push_front(&gf->info_buffer, &info_from_buf,
						     size_of_audio_info);
+				last_step_in_segment =
+					true; // this is the final step in the segment
 				break;
 			}
 		}
 
+		obs_log(gf->log_level,
+			"with %lu remaining to full segment, popped %d info-frames, pushing into buffer at %lu",
+			remaining_frames_to_full_segment, num_new_frames_from_infos,
+			gf->last_num_frames);
+
 		/* Pop from input circlebuf */
 		for (size_t c = 0; c < gf->channels; c++) {
-			if (gf->last_num_frames > 0) {
-				// move overlap frames from the end of the last copy_buffers to the beginning
-				memcpy(gf->copy_buffers[c],
-				       gf->copy_buffers[c] + gf->last_num_frames -
-					       gf->overlap_frames,
-				       gf->overlap_frames * sizeof(float));
-				// copy new data to the end of copy_buffers[c]
-				circlebuf_pop_front(&gf->input_buffers[c],
-						    gf->copy_buffers[c] + gf->overlap_frames,
-						    num_new_frames_from_infos * sizeof(float));
-			} else {
-				// Very first time, just copy data to copy_buffers[c]
-				circlebuf_pop_front(&gf->input_buffers[c], gf->copy_buffers[c],
-						    num_new_frames_from_infos * sizeof(float));
-			}
+			// Push the new data to the end of the existing buffer copy_buffers[c]
+			circlebuf_pop_front(&gf->input_buffers[c],
+					    gf->copy_buffers[c] + gf->last_num_frames,
+					    num_new_frames_from_infos * sizeof(float));
 		}
-		obs_log(gf->log_level,
-			"popped %u frames from input buffer. input_buffer[0] size is %lu",
-			num_new_frames_from_infos, gf->input_buffers[0].size);
+	}
 
-		if (gf->last_num_frames > 0) {
-			gf->last_num_frames = num_new_frames_from_infos + gf->overlap_frames;
+	if (gf->last_num_frames > 0) {
+		gf->last_num_frames += num_new_frames_from_infos;
+		if (!last_step_in_segment) {
+			// Mid-segment process
+			obs_log(gf->log_level, "mid-segment, now %d frames left to full segment",
+				(int)(gf->frames - gf->last_num_frames));
 		} else {
-			gf->last_num_frames = num_new_frames_from_infos;
+			// Final step in segment
+			obs_log(gf->log_level, "full segment, %d frames to process",
+				(int)(gf->last_num_frames));
 		}
+	} else {
+		gf->last_num_frames = num_new_frames_from_infos;
+		obs_log(gf->log_level, "first segment, %d frames to process",
+			(int)(gf->last_num_frames));
 	}
 
 	obs_log(gf->log_level, "processing %d frames (%d ms), start timestamp %llu ",
@@ -271,28 +269,21 @@ void process_audio_from_buffer(struct transcription_filter_data *gf)
 	// end of timer
 	auto end = std::chrono::high_resolution_clock::now();
 	auto duration = std::chrono::duration_cast<std::chrono::milliseconds>(end - start).count();
-	const uint32_t new_frames_from_infos_ms =
-		num_new_frames_from_infos * 1000 /
-		gf->sample_rate; // number of frames in this packet
-	obs_log(gf->log_level, "audio processing of %u ms new data took %d ms",
-		new_frames_from_infos_ms, (int)duration);
-
-	if (duration > new_frames_from_infos_ms) {
-		// try to decrease overlap down to minimum of 100 ms
-		gf->overlap_ms = std::max((uint64_t)gf->overlap_ms - 10, (uint64_t)100);
-		gf->overlap_frames = gf->overlap_ms * gf->sample_rate / 1000;
-		obs_log(gf->log_level,
-			"audio processing took too long (%d ms), reducing overlap to %lu ms",
-			(int)duration, gf->overlap_ms);
-	} else if (!skipped_inference) {
-		if (gf->overlap_ms < OVERLAP_SIZE_MSEC) {
-			// try to increase overlap up to OVERLAP_SIZE_MSEC
-			gf->overlap_ms = std::min((uint64_t)gf->overlap_ms + 10,
-						  (uint64_t)OVERLAP_SIZE_MSEC);
-			gf->overlap_frames = gf->overlap_ms * gf->sample_rate / 1000;
-			obs_log(gf->log_level,
-				"audio processing took %d ms, increasing overlap to %lu ms",
-				(int)duration, gf->overlap_ms);
+	const uint64_t last_num_frames_ms = gf->last_num_frames * 1000 / gf->sample_rate;
+	obs_log(gf->log_level, "audio processing of %lu ms data took %d ms", last_num_frames_ms,
+		(int)duration);
+
+	if (last_step_in_segment) {
+		for (size_t c = 0; c < gf->channels; c++) {
+			// This is the last step in the segment - reset the copy buffer (include overlap frames)
+			// move overlap frames from the end of the last copy_buffers to the beginning
+			memcpy(gf->copy_buffers[c],
+			       gf->copy_buffers[c] + gf->last_num_frames - gf->overlap_frames,
+			       gf->overlap_frames * sizeof(float));
+			// zero out the rest of the buffer, just in case
+			memset(gf->copy_buffers[c] + gf->overlap_frames, 0,
+			       (gf->frames - gf->overlap_frames) * sizeof(float));
+			gf->last_num_frames = gf->overlap_frames;
 		}
 	}
 }
@@ -306,7 +297,6 @@ void whisper_loop(void *data)
 	struct transcription_filter_data *gf =
 		static_cast<struct transcription_filter_data *>(data);
-	const size_t segment_size = gf->frames * sizeof(float);
 
 	obs_log(LOG_INFO, "starting whisper thread");
@@ -327,6 +317,8 @@
 			std::lock_guard lock(*gf->whisper_buf_mutex);
 			input_buf_size = gf->input_buffers[0].size;
 		}
+		const size_t step_size_frames = gf->step_size_msec * gf->sample_rate / 1000;
+		const size_t segment_size = step_size_frames * sizeof(float);
 
 		if (input_buf_size >= segment_size) {
 			obs_log(gf->log_level,
diff --git a/src/whisper-processing.h b/src/whisper-processing.h
index 5c5b28f..19b10fa 100644
--- a/src/whisper-processing.h
+++ b/src/whisper-processing.h
@@ -3,10 +3,10 @@
 // buffer size in msec
 #define BUFFER_SIZE_MSEC 3000
-// at 16Khz, 3000 msec is 48000 samples
+// at 16Khz, BUFFER_SIZE_MSEC is WHISPER_FRAME_SIZE samples
 #define WHISPER_FRAME_SIZE 48000
 // overlap in msec
-#define OVERLAP_SIZE_MSEC 200
+#define OVERLAP_SIZE_MSEC 100
 
 void whisper_loop(void *data);
 struct whisper_context *init_whisper_context(const std::string &model_path);
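
Reviewer note on the step arithmetic introduced above (a standalone sketch, not code from this patch): with the defaults added here (BUFFER_SIZE_MSEC 3000, step_size_msec 1000, OVERLAP_SIZE_MSEC 100) and the 16 kHz whisper input noted in whisper-processing.h, whisper_loop now triggers processing as soon as one step's worth of samples is buffered instead of waiting for a full 3-second segment. The program below only works through those numbers; the variable names are illustrative.

// Minimal sketch of the step/segment/overlap arithmetic, assuming 16 kHz mono
// float PCM and the defaults added in this patch. Illustrative only.
#include <cstddef>
#include <cstdio>

int main()
{
	const size_t sample_rate = 16000;     // whisper input rate (see whisper-processing.h)
	const size_t buffer_size_msec = 3000; // BUFFER_SIZE_MSEC, one full segment
	const size_t step_size_msec = 1000;   // default "step_size_msec" setting
	const size_t overlap_msec = 100;      // OVERLAP_SIZE_MSEC after this patch

	const size_t segment_frames = sample_rate * buffer_size_msec / 1000; // 48000
	const size_t step_frames = sample_rate * step_size_msec / 1000;      // 16000
	const size_t overlap_frames = sample_rate * overlap_msec / 1000;     // 1600

	// whisper_loop wakes up once this many bytes of float samples are buffered
	printf("step trigger: %zu frames, %zu bytes\n", step_frames, step_frames * sizeof(float));
	// roughly how many steps it takes to fill one whisper segment
	printf("steps per segment: %zu\n", (segment_frames + step_frames - 1) / step_frames);
	// frames kept in copy_buffers for the next segment after the last step
	printf("frames carried over: %zu\n", overlap_frames);
	return 0;
}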
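
The reworked process_audio_from_buffer appends each step at copy_buffers[c] + last_num_frames and, on the final step of a segment, moves the trailing overlap to the front and zeroes the remainder. Below is a simplified single-channel model of that behavior: it uses a std::vector in place of the OBS circlebuf, and it treats a segment as complete only when exactly full, whereas the patch decides this from the info-buffer bookkeeping. StepBuffer and push_step are made-up names for the sketch, not part of the plugin.

// Simplified, hypothetical model of the step-by-step copy-buffer handling.
// Single channel; std::vector stands in for gf->copy_buffers[c].
#include <algorithm>
#include <cstring>
#include <vector>

struct StepBuffer {
	std::vector<float> segment; // fixed-size whisper segment buffer
	size_t filled;              // analogous to gf->last_num_frames
	size_t overlap_frames;      // analogous to gf->overlap_frames

	StepBuffer(size_t segment_frames, size_t overlap)
		: segment(segment_frames, 0.0f), filled(0), overlap_frames(overlap) {}

	// Append one step of audio. Returns true when the segment completed and
	// the overlap was carried to the front for the next segment.
	bool push_step(const float *data, size_t frames)
	{
		const size_t take = std::min(frames, segment.size() - filled);
		std::memcpy(segment.data() + filled, data, take * sizeof(float));
		filled += take;
		if (filled < segment.size()) {
			return false; // mid-segment: inference still runs on the partial buffer
		}
		// last step: keep only the overlap, zero the rest, start the next segment
		std::memmove(segment.data(), segment.data() + filled - overlap_frames,
			     overlap_frames * sizeof(float));
		std::fill(segment.begin() + overlap_frames, segment.end(), 0.0f);
		filled = overlap_frames;
		return true;
	}
};

With segment_frames = 48000 and overlap = 1600, three 16000-frame steps fill the buffer; the third push_step returns true and leaves only the last 100 ms in place, mirroring the memcpy/memset block the patch adds at the end of process_audio_from_buffer.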