From 2293e00f4ae43307e0a360d3a2cc3e157b82bab5 Mon Sep 17 00:00:00 2001 From: Jalaj Gupta Date: Fri, 26 Jul 2024 17:53:24 +0530 Subject: [PATCH 1/3] Included option to add vocabulary for better transcription --- app_rvc.py | 9 +++++++++ soni_translate/languages_gui.py | 2 ++ soni_translate/speech_segmentation.py | 11 ++++++----- 3 files changed, 17 insertions(+), 5 deletions(-) diff --git a/app_rvc.py b/app_rvc.py index 47718cd..2d28325 100644 --- a/app_rvc.py +++ b/app_rvc.py @@ -434,6 +434,7 @@ def multilingual_media_conversion( custom_voices=False, custom_voices_workers=1, is_gui=False, + transcription_vocabulary="", progress=gr.Progress(), ): if not YOUR_HF_TOKEN: @@ -713,6 +714,7 @@ def multilingual_media_conversion( SOURCE_LANGUAGE, literalize_numbers, segment_duration_limit, + custom_vocab=transcription_vocabulary, ) logger.debug( "Transcript complete, " @@ -1866,6 +1868,11 @@ def submit(value): label=lg_conf["ctype_label"], info=lg_conf["ctype_info"], ) + transcription_vocabulary_gui = gr.Textbox( + label=lg_conf["transcription_custom_vocabulary_label"], + value="", + info=lg_conf["transcription_custom_vocabulary_info"], + ) batch_size = gr.Slider( minimum=1, maximum=32, @@ -2657,6 +2664,7 @@ def update_tts_list(): enable_cache_gui, enable_custom_voice, workers_custom_voice, + transcription_vocabulary_gui, is_gui_dummy_check, ], outputs=subs_edit_space, @@ -2724,6 +2732,7 @@ def update_tts_list(): enable_cache_gui, enable_custom_voice, workers_custom_voice, + transcription_vocabulary_gui, is_gui_dummy_check, ], outputs=video_output, diff --git a/soni_translate/languages_gui.py b/soni_translate/languages_gui.py index ec7338a..616467e 100644 --- a/soni_translate/languages_gui.py +++ b/soni_translate/languages_gui.py @@ -163,6 +163,8 @@ "srt_file_label": "Upload an SRT subtitle file (will be used instead of the transcription of Whisper)", "divide_text_label": "Redivide text segments by:", "divide_text_info": "(Experimental) Enter a separator to split existing 
text segments in the source language. The tool will identify occurrences and create new segments accordingly. Specify multiple separators using |, e.g.: !|?|...|。", + "transcription_custom_vocabulary_label": "Custom Vocabulary for transcription", + "transcription_custom_vocabulary_info": "Enter comma(,) separated vocabulary/keywords for better transcription quality (e.g. phishing, vishing)", "diarization_label": "Diarization model", "tr_process_label": "Translation process", "out_type_label": "Output type", diff --git a/soni_translate/speech_segmentation.py b/soni_translate/speech_segmentation.py index 9b0b446..82ae0d8 100644 --- a/soni_translate/speech_segmentation.py +++ b/soni_translate/speech_segmentation.py @@ -59,9 +59,7 @@ def openai_api_whisper( - input_audio_file, - source_lang=None, - chunk_duration=1800 + input_audio_file, source_lang=None, chunk_duration=1800, custom_vocab="" ): info = sf.info(input_audio_file) @@ -99,6 +97,7 @@ def openai_api_whisper( language=language, response_format="verbose_json", timestamp_granularities=["segment"], + prompt=custom_vocab, ) try: @@ -152,6 +151,7 @@ def transcribe_speech( SOURCE_LANGUAGE, literalize_numbers=True, segment_duration_limit=15, + custom_vocab="", ): """ Transcribe speech using a whisper model. @@ -162,6 +162,7 @@ def transcribe_speech( - compute_type (str): Type of compute to be used (e.g., 'int8', 'float16'). - batch_size (int): Batch size for transcription. - SOURCE_LANGUAGE (str): Source language for transcription. + - custom_vocab (str): Comma separated words for better transcription Returns: - Tuple containing: @@ -175,10 +176,10 @@ def transcribe_speech( "OpenAI's API Whisper does not support " "the literalization of numbers." 
) - return openai_api_whisper(audio_wav, SOURCE_LANGUAGE) + return openai_api_whisper(audio_wav, SOURCE_LANGUAGE, custom_vocab=custom_vocab) # https://github.com/openai/whisper/discussions/277 - prompt = "以下是普通话的句子。" if SOURCE_LANGUAGE == "zh" else None + prompt = "以下是普通话的句子。" if SOURCE_LANGUAGE == "zh" else custom_vocab SOURCE_LANGUAGE = ( SOURCE_LANGUAGE if SOURCE_LANGUAGE != "zh-TW" else "zh" ) From 34cc24646fbde45e48729dcaf57f12a205d1850c Mon Sep 17 00:00:00 2001 From: Jalaj Gupta Date: Thu, 22 Aug 2024 11:03:06 +0530 Subject: [PATCH 2/3] correction in param parsing for improved transcription --- app_rvc.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/app_rvc.py b/app_rvc.py index 2d28325..7143608 100644 --- a/app_rvc.py +++ b/app_rvc.py @@ -433,8 +433,8 @@ def multilingual_media_conversion( enable_cache=True, custom_voices=False, custom_voices_workers=1, - is_gui=False, transcription_vocabulary="", + is_gui=False, progress=gr.Progress(), ): if not YOUR_HF_TOKEN: From 4dd58ba0d2b3a0b8d2e8beb5a96a8ba2d3aa1e19 Mon Sep 17 00:00:00 2001 From: Jalaj Gupta Date: Thu, 22 Aug 2024 11:27:21 +0530 Subject: [PATCH 3/3] Added log for transcription vocabulary --- soni_translate/speech_segmentation.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/soni_translate/speech_segmentation.py b/soni_translate/speech_segmentation.py index 82ae0d8..36a5977 100644 --- a/soni_translate/speech_segmentation.py +++ b/soni_translate/speech_segmentation.py @@ -180,9 +180,9 @@ def transcribe_speech( # https://github.com/openai/whisper/discussions/277 prompt = "以下是普通话的句子。" if SOURCE_LANGUAGE == "zh" else custom_vocab - SOURCE_LANGUAGE = ( - SOURCE_LANGUAGE if SOURCE_LANGUAGE != "zh-TW" else "zh" - ) + SOURCE_LANGUAGE = SOURCE_LANGUAGE if SOURCE_LANGUAGE != "zh-TW" else "zh" + + logger.debug(f"transcription vocabulary: {prompt}, type: {type(prompt)}") asr_options = { "initial_prompt": prompt, "suppress_numerals": literalize_numbers