From 2293e00f4ae43307e0a360d3a2cc3e157b82bab5 Mon Sep 17 00:00:00 2001
From: Jalaj Gupta <jalaj.gupta@kpoint.com>
Date: Fri, 26 Jul 2024 17:53:24 +0530
Subject: [PATCH 1/3] Included option to add vocabulary for better
 transcription

---
 app_rvc.py                            |  9 +++++++++
 soni_translate/languages_gui.py       |  2 ++
 soni_translate/speech_segmentation.py | 11 ++++++-----
 3 files changed, 17 insertions(+), 5 deletions(-)

diff --git a/app_rvc.py b/app_rvc.py
index 47718cd..2d28325 100644
--- a/app_rvc.py
+++ b/app_rvc.py
@@ -434,6 +434,7 @@ def multilingual_media_conversion(
         custom_voices=False,
         custom_voices_workers=1,
         is_gui=False,
+        transcription_vocabulary="",
         progress=gr.Progress(),
     ):
         if not YOUR_HF_TOKEN:
@@ -713,6 +714,7 @@ def multilingual_media_conversion(
                         SOURCE_LANGUAGE,
                         literalize_numbers,
                         segment_duration_limit,
+                        custom_vocab=transcription_vocabulary,
                     )
                 logger.debug(
                     "Transcript complete, "
@@ -1866,6 +1868,11 @@ def submit(value):
                                 label=lg_conf["ctype_label"],
                                 info=lg_conf["ctype_info"],
                             )
+                            transcription_vocabulary_gui = gr.Textbox(
+                                label=lg_conf["transcription_custom_vocabulary_label"],
+                                value="",
+                                info=lg_conf["transcription_custom_vocabulary_info"],
+                            )
                             batch_size = gr.Slider(
                                 minimum=1,
                                 maximum=32,
@@ -2657,6 +2664,7 @@ def update_tts_list():
                 enable_cache_gui,
                 enable_custom_voice,
                 workers_custom_voice,
+                transcription_vocabulary_gui,
                 is_gui_dummy_check,
             ],
             outputs=subs_edit_space,
@@ -2724,6 +2732,7 @@ def update_tts_list():
                 enable_cache_gui,
                 enable_custom_voice,
                 workers_custom_voice,
+                transcription_vocabulary_gui,
                 is_gui_dummy_check,
             ],
             outputs=video_output,
diff --git a/soni_translate/languages_gui.py b/soni_translate/languages_gui.py
index ec7338a..616467e 100644
--- a/soni_translate/languages_gui.py
+++ b/soni_translate/languages_gui.py
@@ -163,6 +163,8 @@
         "srt_file_label": "Upload an SRT subtitle file (will be used instead of the transcription of Whisper)",
         "divide_text_label": "Redivide text segments by:",
         "divide_text_info": "(Experimental) Enter a separator to split existing text segments in the source language. The tool will identify occurrences and create new segments accordingly. Specify multiple separators using |, e.g.: !|?|...|。",
+        "transcription_custom_vocabulary_label": "Custom Vocabulary for transcription",
+        "transcription_custom_vocabulary_info": "Enter comma(,) separated vocabulary/keywords for better transcription quality (e.g. phishing, vishing)",
         "diarization_label": "Diarization model",
         "tr_process_label": "Translation process",
         "out_type_label": "Output type",
diff --git a/soni_translate/speech_segmentation.py b/soni_translate/speech_segmentation.py
index 9b0b446..82ae0d8 100644
--- a/soni_translate/speech_segmentation.py
+++ b/soni_translate/speech_segmentation.py
@@ -59,9 +59,7 @@
 
 
 def openai_api_whisper(
-    input_audio_file,
-    source_lang=None,
-    chunk_duration=1800
+    input_audio_file, source_lang=None, chunk_duration=1800, custom_vocab=""
 ):
 
     info = sf.info(input_audio_file)
@@ -99,6 +97,7 @@ def openai_api_whisper(
           language=language,
           response_format="verbose_json",
           timestamp_granularities=["segment"],
+            prompt=custom_vocab,
         )
 
         try:
@@ -152,6 +151,7 @@ def transcribe_speech(
     SOURCE_LANGUAGE,
     literalize_numbers=True,
     segment_duration_limit=15,
+    custom_vocab="",
 ):
     """
     Transcribe speech using a whisper model.
@@ -162,6 +162,7 @@ def transcribe_speech(
     - compute_type (str): Type of compute to be used (e.g., 'int8', 'float16').
     - batch_size (int): Batch size for transcription.
     - SOURCE_LANGUAGE (str): Source language for transcription.
+    - custom_vocab (str): Comma-separated words passed to Whisper as the prompt to bias transcription (the non-API path ignores it when SOURCE_LANGUAGE is "zh")
 
     Returns:
     - Tuple containing:
@@ -175,10 +176,10 @@ def transcribe_speech(
                 "OpenAI's API Whisper does not support "
                 "the literalization of numbers."
             )
-        return openai_api_whisper(audio_wav, SOURCE_LANGUAGE)
+        return openai_api_whisper(audio_wav, SOURCE_LANGUAGE, custom_vocab=custom_vocab)
 
     # https://github.com/openai/whisper/discussions/277
-    prompt = "以下是普通话的句子。" if SOURCE_LANGUAGE == "zh" else None
+    prompt = "以下是普通话的句子。" if SOURCE_LANGUAGE == "zh" else custom_vocab
     SOURCE_LANGUAGE = (
         SOURCE_LANGUAGE if SOURCE_LANGUAGE != "zh-TW" else "zh"
     )

From 34cc24646fbde45e48729dcaf57f12a205d1850c Mon Sep 17 00:00:00 2001
From: Jalaj Gupta <jalaj.gupta@kpoint.com>
Date: Thu, 22 Aug 2024 11:03:06 +0530
Subject: [PATCH 2/3] Fix transcription_vocabulary parameter order to match GUI inputs

---
 app_rvc.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/app_rvc.py b/app_rvc.py
index 2d28325..7143608 100644
--- a/app_rvc.py
+++ b/app_rvc.py
@@ -433,8 +433,8 @@ def multilingual_media_conversion(
         enable_cache=True,
         custom_voices=False,
         custom_voices_workers=1,
-        is_gui=False,
         transcription_vocabulary="",
+        is_gui=False,
         progress=gr.Progress(),
     ):
         if not YOUR_HF_TOKEN:

From 4dd58ba0d2b3a0b8d2e8beb5a96a8ba2d3aa1e19 Mon Sep 17 00:00:00 2001
From: Jalaj Gupta <jalaj.gupta@kpoint.com>
Date: Thu, 22 Aug 2024 11:27:21 +0530
Subject: [PATCH 3/3] Added log for transcription vocabulary

---
 soni_translate/speech_segmentation.py | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/soni_translate/speech_segmentation.py b/soni_translate/speech_segmentation.py
index 82ae0d8..36a5977 100644
--- a/soni_translate/speech_segmentation.py
+++ b/soni_translate/speech_segmentation.py
@@ -180,9 +180,9 @@ def transcribe_speech(
 
     # https://github.com/openai/whisper/discussions/277
     prompt = "以下是普通话的句子。" if SOURCE_LANGUAGE == "zh" else custom_vocab
-    SOURCE_LANGUAGE = (
-        SOURCE_LANGUAGE if SOURCE_LANGUAGE != "zh-TW" else "zh"
-    )
+    SOURCE_LANGUAGE = SOURCE_LANGUAGE if SOURCE_LANGUAGE != "zh-TW" else "zh"
+
+    logger.debug(f"transcription vocabulary: {prompt}, type: {type(prompt)}")
     asr_options = {
         "initial_prompt": prompt,
         "suppress_numerals": literalize_numbers