diff --git a/app_rvc.py b/app_rvc.py
index 47718cd..7143608 100644
--- a/app_rvc.py
+++ b/app_rvc.py
@@ -433,6 +433,7 @@ def multilingual_media_conversion(
     enable_cache=True,
     custom_voices=False,
     custom_voices_workers=1,
+    transcription_vocabulary="",
     is_gui=False,
     progress=gr.Progress(),
 ):
@@ -713,6 +714,7 @@ def multilingual_media_conversion(
         SOURCE_LANGUAGE,
         literalize_numbers,
         segment_duration_limit,
+        custom_vocab=transcription_vocabulary,
     )
     logger.debug(
         "Transcript complete, "
@@ -1866,6 +1868,11 @@ def submit(value):
                         label=lg_conf["ctype_label"],
                         info=lg_conf["ctype_info"],
                     )
+                    transcription_vocabulary_gui = gr.Textbox(
+                        label=lg_conf["transcription_custom_vocabulary_label"],
+                        value="",
+                        info=lg_conf["transcription_custom_vocabulary_info"],
+                    )
                     batch_size = gr.Slider(
                         minimum=1,
                         maximum=32,
@@ -2657,6 +2664,7 @@ def update_tts_list():
                 enable_cache_gui,
                 enable_custom_voice,
                 workers_custom_voice,
+                transcription_vocabulary_gui,
                 is_gui_dummy_check,
             ],
             outputs=subs_edit_space,
@@ -2724,6 +2732,7 @@ def update_tts_list():
                 enable_cache_gui,
                 enable_custom_voice,
                 workers_custom_voice,
+                transcription_vocabulary_gui,
                 is_gui_dummy_check,
             ],
             outputs=video_output,
diff --git a/soni_translate/languages_gui.py b/soni_translate/languages_gui.py
index ec7338a..616467e 100644
--- a/soni_translate/languages_gui.py
+++ b/soni_translate/languages_gui.py
@@ -163,6 +163,8 @@
         "srt_file_label": "Upload an SRT subtitle file (will be used instead of the transcription of Whisper)",
         "divide_text_label": "Redivide text segments by:",
         "divide_text_info": "(Experimental) Enter a separator to split existing text segments in the source language. The tool will identify occurrences and create new segments accordingly. Specify multiple separators using |, e.g.: !|?|...|。",
+        "transcription_custom_vocabulary_label": "Custom Vocabulary for transcription",
+        "transcription_custom_vocabulary_info": "Enter comma-separated vocabulary/keywords for better transcription quality (e.g. phishing, vishing)",
         "diarization_label": "Diarization model",
         "tr_process_label": "Translation process",
         "out_type_label": "Output type",
diff --git a/soni_translate/speech_segmentation.py b/soni_translate/speech_segmentation.py
index 9b0b446..36a5977 100644
--- a/soni_translate/speech_segmentation.py
+++ b/soni_translate/speech_segmentation.py
@@ -59,9 +59,7 @@
 def openai_api_whisper(
-    input_audio_file,
-    source_lang=None,
-    chunk_duration=1800
+    input_audio_file, source_lang=None, chunk_duration=1800, custom_vocab=""
 ):
 
     info = sf.info(input_audio_file)
 
 
@@ -99,6 +97,7 @@ def openai_api_whisper(
             language=language,
             response_format="verbose_json",
             timestamp_granularities=["segment"],
+            prompt=custom_vocab,
         )
 
         try:
@@ -152,6 +151,7 @@ def transcribe_speech(
     SOURCE_LANGUAGE,
     literalize_numbers=True,
     segment_duration_limit=15,
+    custom_vocab="",
 ):
     """
     Transcribe speech using a whisper model.
@@ -162,6 +162,7 @@ def transcribe_speech(
     - compute_type (str): Type of compute to be used (e.g., 'int8', 'float16').
     - batch_size (int): Batch size for transcription.
     - SOURCE_LANGUAGE (str): Source language for transcription.
+    - custom_vocab (str): Comma-separated words for better transcription
 
     Returns:
     - Tuple containing:
@@ -175,13 +176,13 @@ def transcribe_speech(
                 "OpenAI's API Whisper does not support "
                 "the literalization of numbers."
             )
-        return openai_api_whisper(audio_wav, SOURCE_LANGUAGE)
+        return openai_api_whisper(audio_wav, SOURCE_LANGUAGE, custom_vocab=custom_vocab)
 
     # https://github.com/openai/whisper/discussions/277
-    prompt = "以下是普通话的句子。" if SOURCE_LANGUAGE == "zh" else None
-    SOURCE_LANGUAGE = (
-        SOURCE_LANGUAGE if SOURCE_LANGUAGE != "zh-TW" else "zh"
-    )
+    prompt = "以下是普通话的句子。" if SOURCE_LANGUAGE == "zh" else custom_vocab
+    SOURCE_LANGUAGE = SOURCE_LANGUAGE if SOURCE_LANGUAGE != "zh-TW" else "zh"
+
+    logger.debug(f"transcription vocabulary: {prompt}, type: {type(prompt)}")
     asr_options = {
         "initial_prompt": prompt,
         "suppress_numerals": literalize_numbers