From 42915830b553700402288ecb440b7e2d0aa1eccb Mon Sep 17 00:00:00 2001 From: Roger Condori <114810545+R3gm@users.noreply.github.com> Date: Fri, 15 Mar 2024 17:27:16 -0400 Subject: [PATCH 01/36] Valid text srt --- soni_translate/text_multiformat_processor.py | 53 +++++++++++++++++--- 1 file changed, 46 insertions(+), 7 deletions(-) diff --git a/soni_translate/text_multiformat_processor.py b/soni_translate/text_multiformat_processor.py index 3d8569b..4f56187 100644 --- a/soni_translate/text_multiformat_processor.py +++ b/soni_translate/text_multiformat_processor.py @@ -5,6 +5,12 @@ import re import os import copy +import string + +punctuation_list = list( + string.punctuation + "¡¿«»„”“”‚‘’「」『』《》()【】〈〉〔〕〖〗〘〙〚〛⸤⸥⸨⸩" +) +symbol_list = punctuation_list + ["", "..", "..."] def extract_from_srt(file_path): @@ -17,18 +23,51 @@ def extract_from_srt(file_path): return srt_content_list +def clean_text(text): + + # Remove content within square brackets + text = re.sub(r'\[.*?\]', '', text) + # Add pattern to remove content within tags + text = re.sub(r'.*?', '', text) + # Remove HTML tags + text = re.sub(r'<.*?>', '', text) + # Remove "♫" and "♪" content + text = re.sub(r'♫.*?♫', '', text) + text = re.sub(r'♪.*?♪', '', text) + # Replace newline characters with an empty string + text = text.replace("\n", ". ") + # Remove double quotation marks + text = text.replace('"', '') + # Collapse multiple spaces and replace with a single space + text = re.sub(r"\s+", " ", text) + # Normalize spaces around periods + text = re.sub(r"[\s\.]+(?=\s)", ". ", text) + # Check if there are ♫ or ♪ symbols present + if '♫' in text or '♪' in text: + return "" + + text = text.strip() + + # Valid text + return text if text not in symbol_list else "" + + def srt_file_to_segments(file_path, speaker=False): srt_content_list = extract_from_srt(file_path) segments = [] for segment in srt_content_list: - segments.append( - { - "text": str(segment.content), - "start": float(segment.start.total_seconds()), - "end": float(segment.end.total_seconds()), - } - ) + + text = clean_text(str(segment.content)) + + if text: + segments.append( + { + "text": text, + "start": float(segment.start.total_seconds()), + "end": float(segment.end.total_seconds()), + } + ) if not segments: raise Exception("No data found in srt subtitle file") From 36bd2ab7245a6082b12ab88e908fbbd574dd19a6 Mon Sep 17 00:00:00 2001 From: Roger Condori <114810545+R3gm@users.noreply.github.com> Date: Fri, 15 Mar 2024 17:32:25 -0400 Subject: [PATCH 02/36] If only an SRT file is available, disable diarization --- app_rvc.py | 1 + 1 file changed, 1 insertion(+) diff --git a/app_rvc.py b/app_rvc.py index eb2a693..6c40323 100644 --- a/app_rvc.py +++ b/app_rvc.py @@ -406,6 +406,7 @@ def multilingual_media_conversion( ) if not media_file and subtitle_file: + diarization_model = "disable" media_file = "audio_support.wav" if not get_video_from_text_json: remove_files(media_file) From c529ed0e4a8cc69e972c258fb66f430e69e5f2b4 Mon Sep 17 00:00:00 2001 From: Roger Condori <114810545+R3gm@users.noreply.github.com> Date: Sun, 17 Mar 2024 18:09:23 -0400 Subject: [PATCH 03/36] Audio extensions --- soni_translate/utils.py | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/soni_translate/utils.py b/soni_translate/utils.py index 4200e32..be738b6 100644 --- a/soni_translate/utils.py +++ b/soni_translate/utils.py @@ -203,6 +203,12 @@ def is_audio_file(string_path): ".m4a", ".alac", ".pcm", + ".opus", + ".ape", + ".amr", + ".ac3", + ".vox", + ".caf", ] # Check if the string_path ends with any 
audio extension From 086088bf80003b4ccbb1556639cf9634209a41f8 Mon Sep 17 00:00:00 2001 From: Roger Condori <114810545+R3gm@users.noreply.github.com> Date: Mon, 18 Mar 2024 21:59:37 +0000 Subject: [PATCH 04/36] windows install --- docs/windows_install.md | 130 ++++++++++++++++++++++++++++++++++++++++ 1 file changed, 130 insertions(+) create mode 100644 docs/windows_install.md diff --git a/docs/windows_install.md b/docs/windows_install.md new file mode 100644 index 0000000..cf36b34 --- /dev/null +++ b/docs/windows_install.md @@ -0,0 +1,130 @@ +## Install Locally Windows + +### Before You Start + +Before you start installing and using SoniTranslate, there are a few things you need to do: + +1. Install Microsoft Visual C++ Build Tools, MSVC and Windows 10 SDK: + + * Go to the [Visual Studio downloads page](https://visualstudio.microsoft.com/visual-cpp-build-tools/); Or maybe you already have **Visual Studio Installer**? Open it. If you have it already click modify. + * Download and install the "Build Tools for Visual Studio" if you don't have it. + * During installation, under "Workloads", select "C++ build tools" and ensure the latest versions of "MSVCv142 - VS 2019 C++ x64/x86 build tools" and "Windows 10 SDK" are selected ("Windows 11 SDK" if you are using Windows 11); OR go to individual components and find those two listed. + * Complete the installation. + +2. Verify the NVIDIA driver on Windows using the command line: + + * **Open Command Prompt:** Press `Win + R`, type `cmd`, then press `Enter`. + + * **Type the command:** `nvidia-smi` and press `Enter`. + + * **Look for "CUDA Version"** in the output. + +``` ++-----------------------------------------------------------------------------+ +| NVIDIA-SMI 522.25 Driver Version: 522.25 CUDA Version: 11.8 | +|-------------------------------+----------------------+----------------------+ +``` + +3. If you see that your CUDA version is less than 11.8, you should update your NVIDIA driver. Visit the NVIDIA website's driver download page (https://www.nvidia.com/Download/index.aspx) and enter your graphics card information. + +4. Accept the license agreement for using Pyannote. You need to have an account on Hugging Face and `accept the license to use the models`: https://huggingface.co/pyannote/speaker-diarization and https://huggingface.co/pyannote/segmentation +5. Create a [huggingface token](https://huggingface.co/settings/tokens). Hugging Face is a natural language processing platform that provides access to state-of-the-art models and tools. You will need to create a token in order to use some of the automatic model download features in SoniTranslate. Follow the instructions on the Hugging Face website to create a token. +6. Install [Anaconda](https://www.anaconda.com/) or [Miniconda](https://docs.anaconda.com/free/miniconda/miniconda-install/). Anaconda is a free and open-source distribution of Python and R. It includes a package manager called conda that makes it easy to install and manage Python environments and packages. Follow the instructions on the Anaconda website to download and install Anaconda on your system. +7. Install Git for your system. Git is a version control system that helps you track changes to your code and collaborate with other developers. You can install Git with Anaconda by running `conda install -c anaconda git -y` in your terminal (Do this after step 1 in the following section.). 
If you have trouble installing Git via Anaconda, you can use the following link instead: + - [Git for Windows](https://git-scm.com/download/win) + +Once you have completed these steps, you will be ready to install SoniTranslate. + +### Getting Started + +To install SoniTranslate, follow these steps: + +1. Create a suitable anaconda environment for SoniTranslate and activate it: + +``` +conda create -n sonitr python=3.10 -y +conda activate sonitr +``` + +2. Clone this github repository and navigate to it: +``` +git clone https://github.com/r3gm/SoniTranslate.git +cd SoniTranslate +``` +3. Install CUDA Toolkit 11.8.0 + +``` +conda install -c "nvidia/label/cuda-11.8.0" cuda-toolkit +``` + +4. Install required packages: + +``` +pip install -r requirements_base.txt -v +pip install -r requirements_extra.txt -v +pip install onnxruntime-gpu +``` + +5. Install [ffmpeg](https://ffmpeg.org/download.html). FFmpeg is a free software project that produces libraries and programs for handling multimedia data. You will need it to process audio and video files. You can install ffmpeg with Anaconda by running `conda install -y ffmpeg` in your terminal. If you have trouble installing ffmpeg via Anaconda, you can use the following link instead: (https://ffmpeg.org/ffmpeg.html). Once it is installed, make sure it is in your PATH by running `ffmpeg -h` in your terminal. If you don't get an error message, you're good to go. + +6. Optional install: + +After installing FFmpeg, you can install these optional packages. + + +[Piper TTS](https://github.com/rhasspy/piper) is a fast, local neural text to speech system that sounds great and is optimized for the Raspberry Pi 4. Piper is used in a variety of projects. Voices are trained with VITS and exported to the onnxruntime. + +``` +pip install -q piper-tts==1.2.0 +``` + +[Coqui XTTS](https://github.com/coqui-ai/TTS) is a text-to-speech (TTS) model that lets you generate realistic voices in different languages. It can clone voices with just a short audio clip, even speak in a different language! It's like having a personal voice mimic for any text you need spoken. + +``` +pip install -q -r requirements_xtts.txt +pip install -q TTS==0.21.1 --no-deps +``` + + +### Running SoniTranslate + +To run SoniTranslate locally, make sure the `sonitr` conda environment is active: + +``` +conda activate sonitr +``` + +Setting your Hugging Face token as an environment variable in Linux: + +``` +export YOUR_HF_TOKEN="YOUR_HUGGING_FACE_TOKEN" +``` + +Then navigate to the `SoniTranslate` folder and run either the `app_rvc.py` + +``` +python app_rvc.py +``` +When the `local URL` `http://127.0.0.1:7860` is displayed in the terminal, simply open this URL in your web browser to access the SoniTranslate interface. + +### Stop and close SoniTranslate. + +In most environments, you can stop the execution by pressing Ctrl+C in the terminal where you launched the script `app_rvc.py`. This will interrupt the program and stop the Gradio app. +To deactivate the Conda environment, you can use the following command: + +``` +conda deactivate +``` + +This will deactivate the currently active Conda environment sonitr, and you'll return to the base environment or the global Python environment. 
+ +### Starting Over + +If you need to start over from scratch, you can delete the `SoniTranslate` folder and remove the `sonitr` conda environment with the following set of commands: + +``` +conda deactivate +conda env remove -n sonitr +``` + +With the `sonitr` environment removed, you can start over with a fresh installation. \ No newline at end of file From 66f6fb967034e9e89345834917261d87a9bef26b Mon Sep 17 00:00:00 2001 From: Roger Condori <114810545+R3gm@users.noreply.github.com> Date: Mon, 18 Mar 2024 22:13:03 +0000 Subject: [PATCH 05/36] fix environment variable in windows #26 --- docs/windows_install.md | 13 +++++++------ 1 file changed, 7 insertions(+), 6 deletions(-) diff --git a/docs/windows_install.md b/docs/windows_install.md index cf36b34..fa290af 100644 --- a/docs/windows_install.md +++ b/docs/windows_install.md @@ -85,6 +85,13 @@ pip install -q -r requirements_xtts.txt pip install -q TTS==0.21.1 --no-deps ``` +7. Setting your [Hugging Face token](https://huggingface.co/settings/tokens) as an environment variable in quotes: + +``` +conda env config vars set YOUR_HF_TOKEN="YOUR_HUGGING_FACE_TOKEN_HERE" +conda deactivate +``` + ### Running SoniTranslate @@ -94,12 +101,6 @@ To run SoniTranslate locally, make sure the `sonitr` conda environment is active conda activate sonitr ``` -Setting your Hugging Face token as an environment variable in Linux: - -``` -export YOUR_HF_TOKEN="YOUR_HUGGING_FACE_TOKEN" -``` - Then navigate to the `SoniTranslate` folder and run either the `app_rvc.py` ``` From c4085352c02594a128b5b7fe8f7bcfae8867b279 Mon Sep 17 00:00:00 2001 From: Roger Condori <114810545+R3gm@users.noreply.github.com> Date: Mon, 18 Mar 2024 22:43:51 +0000 Subject: [PATCH 06/36] windows install piper-tts --- docs/windows_install.md | 15 +++++++++------ 1 file changed, 9 insertions(+), 6 deletions(-) diff --git a/docs/windows_install.md b/docs/windows_install.md index fa290af..8b4ba31 100644 --- a/docs/windows_install.md +++ b/docs/windows_install.md @@ -71,18 +71,21 @@ pip install onnxruntime-gpu After installing FFmpeg, you can install these optional packages. - -[Piper TTS](https://github.com/rhasspy/piper) is a fast, local neural text to speech system that sounds great and is optimized for the Raspberry Pi 4. Piper is used in a variety of projects. Voices are trained with VITS and exported to the onnxruntime. +[Coqui XTTS](https://github.com/coqui-ai/TTS) is a text-to-speech (TTS) model that lets you generate realistic voices in different languages. It can clone voices with just a short audio clip, even speak in a different language! It's like having a personal voice mimic for any text you need spoken. ``` -pip install -q piper-tts==1.2.0 +pip install -q -r requirements_xtts.txt +pip install -q TTS==0.21.1 --no-deps ``` -[Coqui XTTS](https://github.com/coqui-ai/TTS) is a text-to-speech (TTS) model that lets you generate realistic voices in different languages. It can clone voices with just a short audio clip, even speak in a different language! It's like having a personal voice mimic for any text you need spoken. +[Piper TTS](https://github.com/rhasspy/piper) is a fast, local neural text to speech system that sounds great and is optimized for the Raspberry Pi 4. Piper is used in a variety of projects. Voices are trained with VITS and exported to the onnxruntime. + +🚧 For Windows users, it's important to note that the Python module piper-tts is not fully supported on this operating system. While it works smoothly on Linux, Windows compatibility is currently experimental. 
If you still wish to install it on Windows, you can follow this experimental method: ``` -pip install -q -r requirements_xtts.txt -pip install -q TTS==0.21.1 --no-deps +pip install https://github.com/R3gm/piper-phonemize/releases/download/1.2.0/piper_phonemize-1.2.0-cp310-cp310-win_amd64.whl +pip install sherpa-onnx==1.9.12 +pip install piper-tts==1.2.0 --no-deps ``` 7. Setting your [Hugging Face token](https://huggingface.co/settings/tokens) as an environment variable in quotes: From e556febece6e298125b151b46dca78e1c4f039a5 Mon Sep 17 00:00:00 2001 From: Roger Condori <114810545+R3gm@users.noreply.github.com> Date: Mon, 18 Mar 2024 19:36:57 -0400 Subject: [PATCH 07/36] fix inference piper --- soni_translate/text_to_speech.py | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/soni_translate/text_to_speech.py b/soni_translate/text_to_speech.py index 6d1cad3..f858bdc 100644 --- a/soni_translate/text_to_speech.py +++ b/soni_translate/text_to_speech.py @@ -22,6 +22,7 @@ # from scipy.io.wavfile import write as write_wav import soundfile as sf +import platform import logging from .logging_setup import logger @@ -682,6 +683,10 @@ def load_piper_model( except Exception as error: raise TTS_OperationError(f"onnxruntime error: {str(error)}") + # Disable CUDA in Windows + if platform.system() == "Windows": + cuda = False + if not download_dir: # Download to first data directory by default download_dir = data_dir[0] From 4160a581efcc637600a56d6be80d33f134c59c72 Mon Sep 17 00:00:00 2001 From: Roger Condori <114810545+R3gm@users.noreply.github.com> Date: Tue, 19 Mar 2024 17:11:53 -0400 Subject: [PATCH 08/36] fix error aling --- app_rvc.py | 13 ++++++++----- 1 file changed, 8 insertions(+), 5 deletions(-) diff --git a/app_rvc.py b/app_rvc.py index 6c40323..08000e6 100644 --- a/app_rvc.py +++ b/app_rvc.py @@ -510,11 +510,14 @@ def multilingual_media_conversion( self.align_language = self.result["language"] if not subtitle_file: prog_disp("Aligning...", 0.45, is_gui, progress=progress) - self.result = align_speech(audio, self.result) - logger.debug( - "Align complete, " - f"segments count {len(self.result['segments'])}" - ) + try: + self.result = align_speech(audio, self.result) + logger.debug( + "Align complete, " + f"segments count {len(self.result['segments'])}" + ) + except Exception as error: + logger.error(str(error)) if self.result["segments"] == []: raise ValueError("No active speech found in audio") From 54165cdb26fa13176ed1b2a9e8303207617540d3 Mon Sep 17 00:00:00 2001 From: Roger Condori <114810545+R3gm@users.noreply.github.com> Date: Fri, 22 Mar 2024 16:29:56 -0400 Subject: [PATCH 09/36] conda cudnn --- docs/windows_install.md | 21 +++++++++++++++------ 1 file changed, 15 insertions(+), 6 deletions(-) diff --git a/docs/windows_install.md b/docs/windows_install.md index 8b4ba31..98eedb7 100644 --- a/docs/windows_install.md +++ b/docs/windows_install.md @@ -54,10 +54,16 @@ cd SoniTranslate 3. Install CUDA Toolkit 11.8.0 ``` -conda install -c "nvidia/label/cuda-11.8.0" cuda-toolkit +conda install -c "nvidia/label/cuda-11.8.0" cuda-toolkit -y ``` -4. Install required packages: +4. Install cuDNN + +``` +conda install -c conda-forge cudnn -y +``` + +5. Install required packages: ``` pip install -r requirements_base.txt -v @@ -65,9 +71,9 @@ pip install -r requirements_extra.txt -v pip install onnxruntime-gpu ``` -5. Install [ffmpeg](https://ffmpeg.org/download.html). FFmpeg is a free software project that produces libraries and programs for handling multimedia data. 
You will need it to process audio and video files. You can install ffmpeg with Anaconda by running `conda install -y ffmpeg` in your terminal. If you have trouble installing ffmpeg via Anaconda, you can use the following link instead: (https://ffmpeg.org/ffmpeg.html). Once it is installed, make sure it is in your PATH by running `ffmpeg -h` in your terminal. If you don't get an error message, you're good to go. +6. Install [ffmpeg](https://ffmpeg.org/download.html). FFmpeg is a free software project that produces libraries and programs for handling multimedia data. You will need it to process audio and video files. You can install ffmpeg with Anaconda by running `conda install -y ffmpeg` in your terminal. If you have trouble installing ffmpeg via Anaconda, you can use the following link instead: (https://ffmpeg.org/ffmpeg.html). Once it is installed, make sure it is in your PATH by running `ffmpeg -h` in your terminal. If you don't get an error message, you're good to go. -6. Optional install: +7. Optional install: After installing FFmpeg, you can install these optional packages. @@ -88,7 +94,7 @@ pip install sherpa-onnx==1.9.12 pip install piper-tts==1.2.0 --no-deps ``` -7. Setting your [Hugging Face token](https://huggingface.co/settings/tokens) as an environment variable in quotes: +8. Setting your [Hugging Face token](https://huggingface.co/settings/tokens) as an environment variable in quotes: ``` conda env config vars set YOUR_HF_TOKEN="YOUR_HUGGING_FACE_TOKEN_HERE" @@ -131,4 +137,7 @@ conda deactivate conda env remove -n sonitr ``` -With the `sonitr` environment removed, you can start over with a fresh installation. \ No newline at end of file +With the `sonitr` environment removed, you can start over with a fresh installation. + +### Notes +If for any reason the installation fails or gets stuck when cuDNN is being installed, it will be necessary to directly install [CUDA Toolkit 11.8.0](https://developer.nvidia.com/cuda-11-8-0-download-archive) and [cuDNN for CUDA 11.x](https://developer.nvidia.com/rdp/cudnn-archive), [info](https://docs.nvidia.com/deeplearning/cudnn/installation/windows.html) provided by NVIDIA. 
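A quick way to confirm that the CUDA Toolkit and cuDNN steps documented above took effect is to probe the environment from Python. The snippet below is a minimal, illustrative check (it is not part of any patch in this series); it assumes the `sonitr` environment is active and that `torch` and `onnxruntime-gpu` were installed as described:

```
# Sanity check for the GPU setup described in the Windows install guide.
# Assumes the "sonitr" conda environment is active.
import torch
import onnxruntime as ort

print("CUDA available :", torch.cuda.is_available())      # should be True on a GPU machine
print("CUDA runtime   :", torch.version.cuda)              # e.g. "11.8"
print("cuDNN version  :", torch.backends.cudnn.version())  # None means cuDNN was not found
print("ORT providers  :", ort.get_available_providers())   # look for "CUDAExecutionProvider"
```

If `CUDAExecutionProvider` is missing or the cuDNN version reports `None`, the manual CUDA 11.8 / cuDNN installation linked in the Notes above is the usual fallback.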
From c9ff2693a6234911922e9219568a3e1b81a56c6c Mon Sep 17 00:00:00 2001 From: Roger Condori <114810545+R3gm@users.noreply.github.com> Date: Fri, 22 Mar 2024 21:42:11 +0000 Subject: [PATCH 10/36] option overlap reduction #18 --- app_rvc.py | 17 +++++++- soni_translate/audio_segments.py | 75 ++++++++++++++++++++++++++++++-- 2 files changed, 86 insertions(+), 6 deletions(-) diff --git a/app_rvc.py b/app_rvc.py index 08000e6..2216059 100644 --- a/app_rvc.py +++ b/app_rvc.py @@ -325,6 +325,7 @@ def multilingual_media_conversion( get_translated_text=False, get_video_from_text_json=False, text_json="{}", + avoid_overlap=False, diarization_model="pyannote_2.1", translate_process="google_translator_batch", subtitle_file=None, @@ -712,7 +713,8 @@ def multilingual_media_conversion( os.getenv("VOICES_MODELS"), cc_model_paths, cc_index_values, - cc_transpose_values + cc_transpose_values, + avoid_overlap ], { "valid_speakers": self.valid_speakers }): @@ -765,7 +767,11 @@ def multilingual_media_conversion( ) remove_files(dub_audio_file) create_translated_audio( - self.result_diarize, audio_files, dub_audio_file + self.result_diarize, + audio_files, + dub_audio_file, + False, + avoid_overlap, ) # Voiceless track, change with file @@ -1270,6 +1276,11 @@ def submit(value): label=lg_conf["acc_rate_label"], info=lg_conf["acc_rate_info"], ) + avoid_overlap_gui = gr.Checkbox( + False, + label="Overlap Reduction", + info="Overlap Reduction: Ensures segments don't overlap by adjusting start times based on previous end times; could disrupt synchronization.", + ) gr.HTML("
") @@ -2219,6 +2230,7 @@ def update_tts_list(): edit_sub_check, # TRUE BY DEFAULT dummy_false_check, # dummy false subs_edit_space, + avoid_overlap_gui, diarization_process_dropdown, translate_process_dropdown, input_srt, @@ -2273,6 +2285,7 @@ def update_tts_list(): dummy_false_check, edit_sub_check, subs_edit_space, + avoid_overlap_gui, diarization_process_dropdown, translate_process_dropdown, input_srt, diff --git a/soni_translate/audio_segments.py b/soni_translate/audio_segments.py index 63d8cd2..83926f3 100644 --- a/soni_translate/audio_segments.py +++ b/soni_translate/audio_segments.py @@ -2,10 +2,60 @@ from tqdm import tqdm from .utils import run_command from .logging_setup import logger +from pydub import AudioSegment +import numpy as np + + +class Mixer: + def __init__(self): + self.parts = [] + + def __len__(self): + parts = self._sync() + seg = parts[0][1] + frame_count = max(offset + seg.frame_count() for offset, seg in parts) + return int(1000.0 * frame_count / seg.frame_rate) + + def overlay(self, sound, position=0): + self.parts.append((position, sound)) + return self + + def _sync(self): + positions, segs = zip(*self.parts) + + frame_rate = segs[0].frame_rate + array_type = segs[0].array_type + + offsets = [int(frame_rate * pos / 1000.0) for pos in positions] + segs = AudioSegment.empty()._sync(*segs) + return list(zip(offsets, segs)) + + def append(self, sound): + self.overlay(sound, position=len(self)) + + def to_audio_segment(self): + parts = self._sync() + seg = parts[0][1] + channels = seg.channels + + frame_count = max(offset + seg.frame_count() for offset, seg in parts) + sample_count = int(frame_count * seg.channels) + + output = np.zeros(sample_count, dtype="int32") + for offset, seg in parts: + sample_offset = offset * channels + samples = np.frombuffer(seg.get_array_of_samples(), dtype="int32") + samples = np.int16(samples/np.max(np.abs(samples)) * 32767) + start = sample_offset + end = start + len(samples) + output[start:end] += samples + + return seg._spawn( + output, overrides={"sample_width": 4}).normalize(headroom=0.0) def create_translated_audio( - result_diarize, audio_files, final_file, concat=False + result_diarize, audio_files, final_file, concat=False, avoid_overlap=False, ): total_duration = result_diarize["segments"][-1]["end"] # in seconds @@ -34,14 +84,18 @@ def create_translated_audio( else: # silent audio with total_duration - combined_audio = AudioSegment.silent( - duration=int(total_duration * 1000) + base_audio = AudioSegment.silent( + duration=int(total_duration * 1000), frame_rate=41000 ) + combined_audio = Mixer() + combined_audio.overlay(base_audio) + logger.info( f"Audio duration: {total_duration // 60} " f"minutes and {int(total_duration % 60)} seconds" ) + last_end_time = 0 for line, audio_file in tqdm( zip(result_diarize["segments"], audio_files) ): @@ -51,6 +105,18 @@ def create_translated_audio( try: audio = AudioSegment.from_file(audio_file) # audio_a = audio.speedup(playback_speed=1.5) + + if avoid_overlap: + if (last_end_time - 0.500) > start: + start = (last_end_time - 0.250) + logger.debug( + f"Avoid overlap for {str(audio_file)} " + f"with {str(start)}" + ) + + duration_tts_seconds = len(audio) / 1000.0 # to sec + last_end_time = (start + duration_tts_seconds) + start_time = start * 1000 # to ms combined_audio = combined_audio.overlay( audio, position=start_time @@ -60,6 +126,7 @@ def create_translated_audio( logger.error(f"Error audio file {audio_file}") # combined audio as a file - combined_audio.export( + combined_audio_data 
= combined_audio.to_audio_segment() + combined_audio_data.export( final_file, format="wav" ) # best than ogg, change if the audio is anomalous From 5998ca053a9c197046204c55bea549f6d4e581bf Mon Sep 17 00:00:00 2001 From: Roger Condori <114810545+R3gm@users.noreply.github.com> Date: Mon, 25 Mar 2024 19:38:25 +0000 Subject: [PATCH 11/36] fix format edge tts, fix excessive pad with overlap reduction --- soni_translate/audio_segments.py | 12 ++++- soni_translate/text_to_speech.py | 84 ++++++++++++++++++++++++++------ 2 files changed, 79 insertions(+), 17 deletions(-) diff --git a/soni_translate/audio_segments.py b/soni_translate/audio_segments.py index 83926f3..8f8bad3 100644 --- a/soni_translate/audio_segments.py +++ b/soni_translate/audio_segments.py @@ -96,6 +96,7 @@ def create_translated_audio( ) last_end_time = 0 + previous_speaker = "" for line, audio_file in tqdm( zip(result_diarize["segments"], audio_files) ): @@ -107,13 +108,22 @@ def create_translated_audio( # audio_a = audio.speedup(playback_speed=1.5) if avoid_overlap: + speaker = line["speaker"] if (last_end_time - 0.500) > start: - start = (last_end_time - 0.250) + overlap_time = last_end_time - start + if previous_speaker and previous_speaker != speaker: + start = (last_end_time - 0.600) + else: + start = (last_end_time - 0.250) + if overlap_time > 2.5: + start = start - 0.3 logger.debug( f"Avoid overlap for {str(audio_file)} " f"with {str(start)}" ) + previous_speaker = speaker + duration_tts_seconds = len(audio) / 1000.0 # to sec last_end_time = (start + duration_tts_seconds) diff --git a/soni_translate/text_to_speech.py b/soni_translate/text_to_speech.py index f858bdc..0b67873 100644 --- a/soni_translate/text_to_speech.py +++ b/soni_translate/text_to_speech.py @@ -19,8 +19,6 @@ import numpy as np from typing import Any, Dict from pathlib import Path - -# from scipy.io.wavfile import write as write_wav import soundfile as sf import platform import logging @@ -81,6 +79,22 @@ def error_handling_in_tts(error, segment, TRANSLATE_AUDIO_TO, filename): verify_saved_file_and_size(filename) +def pad_array(array, sr): + + valid_indices = np.where(np.abs(array) > 0.001)[0] + + if len(valid_indices) == 0: + return array + + pad_indice = int(0.1 * sr) + start_pad = max(0, valid_indices[0] - pad_indice) + end_pad = min(len(array), valid_indices[-1] + 1 + pad_indice) + + padded_array = array[start_pad:end_pad] + + return padded_array + + # ===================================== # EDGE TTS # ===================================== @@ -136,6 +150,7 @@ def segments_egde_tts(filtered_edge_segments, TRANSLATE_AUDIO_TO, is_gui): # make the tts audio filename = f"audio/{start}.ogg" + temp_file = filename[:-3] + "mp3" logger.info(f"{text} >> {filename}") try: @@ -143,9 +158,24 @@ def segments_egde_tts(filtered_edge_segments, TRANSLATE_AUDIO_TO, is_gui): asyncio.run( edge_tts.Communicate( text, "-".join(tts_name.split("-")[:-1]) - ).save(filename) + ).save(temp_file) + ) + verify_saved_file_and_size(temp_file) + + data, sample_rate = sf.read(temp_file) + data = pad_array(data, sample_rate) + # os.remove(temp_file) + + # Save file + sf.write( + file=filename, + samplerate=sample_rate, + data=data, + format="ogg", + subtype="vorbis", ) verify_saved_file_and_size(filename) + except Exception as error: error_handling_in_tts(error, segment, TRANSLATE_AUDIO_TO, filename) @@ -205,10 +235,14 @@ def segments_bark_tts( pad_token_id=processor.tokenizer.pad_token_id, ) # Save file + data_tts = pad_array( + speech_output.cpu().numpy().squeeze().astype(np.float32), + 
sampling_rate, + ) sf.write( file=filename, samplerate=sampling_rate, - data=speech_output.cpu().numpy().squeeze().astype(np.float32), + data=data_tts, format="ogg", subtype="vorbis", ) @@ -305,11 +339,16 @@ def segments_vits_tts(filtered_vits_segments, TRANSLATE_AUDIO_TO): # Infer with torch.no_grad(): speech_output = model(**inputs).waveform + + data_tts = pad_array( + speech_output.cpu().numpy().squeeze().astype(np.float32), + sampling_rate, + ) # Save file sf.write( file=filename, samplerate=sampling_rate, - data=speech_output.cpu().numpy().squeeze().astype(np.float32), + data=data_tts, format="ogg", subtype="vorbis", ) @@ -403,7 +442,8 @@ def convert_to_xtts_good_sample(audio_path: str = "", destination: str = ""): def sanitize_file_name(file_name): import unicodedata - # Normalize the string to NFKD form to separate combined characters into base characters and diacritics + # Normalize the string to NFKD form to separate combined characters into + # base characters and diacritics normalized_name = unicodedata.normalize("NFKD", file_name) # Replace any non-ASCII characters or special symbols with an underscore sanitized_name = re.sub(r"[^\w\s.-]", "_", normalized_name) @@ -459,7 +499,11 @@ def create_wav_file_vc( raise Exception(f"Error wav: {final_sample}") -def create_new_files_for_vc(speakers_coqui, segments_base, dereverb_automatic=True): +def create_new_files_for_vc( + speakers_coqui, + segments_base, + dereverb_automatic=True +): # before function delete automatic delete_previous_automatic output_dir = os.path.join(".", "clean_song_output") # remove content remove_directory_contents(output_dir) @@ -557,7 +601,7 @@ def segments_coqui_tts( raise TTS_OperationError( f"'{TRANSLATE_AUDIO_TO}' is not a supported language for Coqui XTTS" ) - # Emotion and speed can only be used with Coqui Studio models. Which is discontinued + # Emotion and speed can only be used with Coqui Studio models. 
discontinued # emotions = ["Neutral", "Happy", "Sad", "Angry", "Dull"] if delete_previous_automatic: @@ -597,11 +641,15 @@ def segments_coqui_tts( wav = model.tts( text=text, speaker_wav=tts_name, language=TRANSLATE_AUDIO_TO ) + data_tts = pad_array( + wav, + sampling_rate, + ) # Save file sf.write( file=filename, samplerate=sampling_rate, - data=wav, + data=data_tts, format="ogg", subtype="vorbis", ) @@ -780,11 +828,15 @@ def segments_vits_onnx_tts(filtered_onnx_vits_segments, TRANSLATE_AUDIO_TO): speech_output = synthesize_text_to_audio_np_array( model, text, synthesize_args ) + data_tts = pad_array( + speech_output, # .cpu().numpy().squeeze().astype(np.float32), + sampling_rate, + ) # Save file sf.write( file=filename, samplerate=sampling_rate, - data=speech_output, # .cpu().numpy().squeeze().astype(np.float32), + data=data_tts, format="ogg", subtype="vorbis", ) @@ -972,7 +1024,7 @@ def accelerate_segments( acc_percentage = duration_tts / duration_true # Smoth - if acceleration_rate_regulation and acc_percentage >= 1.4: + if acceleration_rate_regulation and acc_percentage >= 1.3: try: next_segment = result_diarize["segments"][ min(max_count_segments_idx, i + 1) @@ -986,22 +1038,22 @@ def accelerate_segments( if speaker == next_speaker: # half - smoth_duration = duration_true + (extra_time * 1/2) + smoth_duration = duration_true + (extra_time * 0.5) else: - # 2/3 - smoth_duration = duration_true + (extra_time * 2/3) + # 7/10 + smoth_duration = duration_true + (extra_time * 0.7) logger.debug( f"Base acc: {acc_percentage}, " f"smoth acc: {duration_tts / smoth_duration}" ) - acc_percentage = max(1.21, (duration_tts / smoth_duration)) + acc_percentage = max(1.2, (duration_tts / smoth_duration)) except Exception as error: logger.error(str(error)) if acc_percentage > max_accelerate_audio: acc_percentage = max_accelerate_audio - elif acc_percentage <= 1.2 and acc_percentage >= 0.8: + elif acc_percentage <= 1.15 and acc_percentage >= 0.8: acc_percentage = 1.0 elif acc_percentage <= 0.79: acc_percentage = 0.8 From 3fe3bab94e51f5025749360b7d328aeab9a00d68 Mon Sep 17 00:00:00 2001 From: Roger Condori <114810545+R3gm@users.noreply.github.com> Date: Wed, 27 Mar 2024 20:44:49 +0000 Subject: [PATCH 12/36] Added languages 1/2 --- app_rvc.py | 2 +- soni_translate/language_configuration.py | 101 +++++++++++++++++-- soni_translate/logging_setup.py | 22 +++- soni_translate/speech_segmentation.py | 79 ++++++++++----- soni_translate/text_multiformat_processor.py | 6 +- 5 files changed, 168 insertions(+), 42 deletions(-) diff --git a/app_rvc.py b/app_rvc.py index 2216059..9205653 100644 --- a/app_rvc.py +++ b/app_rvc.py @@ -530,7 +530,7 @@ def multilingual_media_conversion( "result": self.result, "align_language": self.align_language }): - if self.align_language in ["ja", "zh"]: + if self.align_language in ["ja", "zh", "zh-TW"]: divide_text_segments_by += "|!|?|...|。" if divide_text_segments_by: try: diff --git a/soni_translate/language_configuration.py b/soni_translate/language_configuration.py index c54f342..8759ed3 100644 --- a/soni_translate/language_configuration.py +++ b/soni_translate/language_configuration.py @@ -3,7 +3,7 @@ LANGUAGES = { "Automatic detection": "Automatic detection", "Arabic (ar)": "ar", - "Chinese (zh)": "zh", + "Chinese - Simplified (zh-CN)": "zh", "Czech (cs)": "cs", "Danish (da)": "da", "Dutch (nl)": "nl", @@ -39,6 +39,7 @@ "Swedish (sv)": "sv", "Amharic (am)": "am", "Welsh (cy)": "cy", # no aux gTTS + "Estonian (et)": "et", "Croatian (hr)": "hr", "Icelandic (is)": "is", "Georgian 
(ka)": "ka", # no aux gTTS @@ -54,17 +55,44 @@ "Kannada (kn)": "kn", "Lithuanian (lt)": "lt", # no aux gTTS "Latvian (lv)": "lv", - # "Macedonian (mk)": "mk", # no aux gTTS # error get align model + "Macedonian (mk)": "mk", # no aux gTTS # error get align model "Malayalam (ml)": "ml", - # "Malay (ms)": "ms", # error get align model + "Malay (ms)": "ms", # error get align model "Romanian (ro)": "ro", "Sinhala (si)": "si", "Sundanese (su)": "su", - # "Swahili (sw)": "sw", # error aling + "Swahili (sw)": "sw", # error aling + "Afrikaans (af)": "af", + "Bosnian (bs)": "bs", + "Latin (la)": "la", + "Myanmar Burmese (my)": "my", + "Norwegian (no|nb)": "no", + "Chinese - Traditional (zh-TW)": "zh-TW", + "Assamese (as)": "as", + "Basque (eu)": "eu", + "Hausa (ha)": "ha", + "Haitian Creole (ht)": "ht", + "Armenian (hy)": "hy", + "Lao (lo)": "lo", + "Malagasy (mg)": "mg", + "Mongolian (mn)": "mn", + "Maltese (mt)": "mt", + "Punjabi (pa)": "pa", + "Pashto (ps)": "ps", + "Slovenian (sl)": "sl", + "Shona (sn)": "sn", + "Somali (so)": "so", + "Tajik (tg)": "tg", + "Turkmen (tk)": "tk", + "Tatar (tt)": "tt", + "Uzbek (uz)": "uz", + "Yoruba (yo)": "yo", } BASE_L_LIST = LANGUAGES.keys() LANGUAGES_LIST = [list(BASE_L_LIST)[0]] + sorted(list(BASE_L_LIST)[1:]) +INVERTED_LANGUAGES = {value: key for key, value in LANGUAGES.items()} + EXTRA_ALIGN = { "id": "indonesian-nlp/wav2vec2-large-xlsr-indonesian", @@ -93,13 +121,38 @@ "kn": "Harveenchadha/vakyansh-wav2vec2-kannada-knm-560", "lt": "DeividasM/wav2vec2-large-xlsr-53-lithuanian", "lv": "anton-l/wav2vec2-large-xlsr-53-latvian", - "mk": "Konstantin-Bogdanoski/wav2vec2-macedonian-base", + "mk": "", # Konstantin-Bogdanoski/wav2vec2-macedonian-base "ml": "gvs/wav2vec2-large-xlsr-malayalam", - "ms": "Duy/wav2vec2_malay", + "ms": "", # Duy/wav2vec2_malay "ro": "anton-l/wav2vec2-large-xlsr-53-romanian", "si": "IAmNotAnanth/wav2vec2-large-xls-r-300m-sinhala", "su": "cahya/wav2vec2-large-xlsr-sundanese", - "sw": "Lians/fine-tune-wav2vec2-large-swahili", + "sw": "", # Lians/fine-tune-wav2vec2-large-swahili + "af": "", # ylacombe/wav2vec2-common_voice-af-demo + "bs": "", + "la": "", + "my": "", + "no": "NbAiLab/wav2vec2-xlsr-300m-norwegian", + "zh-TW": "jonatasgrosman/wav2vec2-large-xlsr-53-chinese-zh-cn", + "as": "", + "eu": "", # cahya/wav2vec2-large-xlsr-basque # verify + "ha": "infinitejoy/wav2vec2-large-xls-r-300m-hausa", + "ht": "", + "hy": "infinitejoy/wav2vec2-large-xls-r-300m-armenian", # no (.) 
+ "lo": "", + "mg": "", + "mn": "tugstugi/wav2vec2-large-xlsr-53-mongolian", + "mt": "carlosdanielhernandezmena/wav2vec2-large-xlsr-53-maltese-64h", + "pa": "kingabzpro/wav2vec2-large-xlsr-53-punjabi", + "ps": "aamirhs/wav2vec2-large-xls-r-300m-pashto-colab", + "sl": "anton-l/wav2vec2-large-xlsr-53-slovenian", + "sn": "", + "so": "", + "tg": "", + "tk": "", # Ragav/wav2vec2-tk + "tt": "anton-l/wav2vec2-large-xlsr-53-tatar", + "uz": "", # Mekhriddin/wav2vec2-large-xls-r-300m-uzbek-colab + "yo": "ogbi/wav2vec2-large-mms-1b-yoruba-test", } @@ -109,7 +162,7 @@ def fix_code_language(translate_to, syntax="google"): replace_lang_code = {"zh": "zh-CN", "he": "iw", "zh-cn": "zh-CN"} elif syntax == "coqui": # coqui-xtts - replace_lang_code = {"zh": "zh-cn", "zh-CN": "zh-cn"} + replace_lang_code = {"zh": "zh-cn", "zh-CN": "zh-cn", "zh-TW": "zh-cn"} new_code_lang = replace_lang_code.get(translate_to, translate_to) logger.debug(f"Fix code {translate_to} -> {new_code_lang}") @@ -314,10 +367,40 @@ def fix_code_language(translate_to, syntax="google"): "lv-facebook-mms VITS": "facebook/mms-tts-lav", # "mk-facebook-mms VITS": "facebook/mms-tts-mkd", "ml-facebook-mms VITS": "facebook/mms-tts-mal", - "ms-facebook-mms VITS": "facebook/mms-tts-zlm", # add 2 variant + "ms-facebook-mms VITS": "facebook/mms-tts-zlm", + "ms_Central-facebook-mms VITS": "facebook/mms-tts-pse", + "ms_Manado-facebook-mms VITS": "facebook/mms-tts-xmm", "ro-facebook-mms VITS": "facebook/mms-tts-ron", # "si-facebook-mms VITS": "facebook/mms-tts-sin", "sw-facebook-mms VITS": "facebook/mms-tts-swh", + # "af-facebook-mms VITS": "facebook/mms-tts-afr", + # "bs-facebook-mms VITS": "facebook/mms-tts-bos", + "la-facebook-mms VITS": "facebook/mms-tts-lat", + "my-facebook-mms VITS": "facebook/mms-tts-mya", + # "no_Bokmål-facebook-mms VITS": "thomasht86/mms-tts-nob", # verify + "as-facebook-mms VITS": "facebook/mms-tts-asm", + "as_Nagamese-facebook-mms VITS": "facebook/mms-tts-nag", + "eu-facebook-mms VITS": "facebook/mms-tts-eus", + "ha-facebook-mms VITS": "facebook/mms-tts-hau", + "ht-facebook-mms VITS": "facebook/mms-tts-hat", + "hy_Western-facebook-mms VITS": "facebook/mms-tts-hyw", + "lo-facebook-mms VITS": "facebook/mms-tts-lao", + "mg-facebook-mms VITS": "facebook/mms-tts-mlg", + "mn-facebook-mms VITS": "facebook/mms-tts-mon", + # "mt-facebook-mms VITS": "facebook/mms-tts-mlt", + "pa_Eastern-facebook-mms VITS": "facebook/mms-tts-pan", + # "pa_Western-facebook-mms VITS": "facebook/mms-tts-pnb", + # "ps-facebook-mms VITS": "facebook/mms-tts-pus", + # "sl-facebook-mms VITS": "facebook/mms-tts-slv", + "sn-facebook-mms VITS": "facebook/mms-tts-sna", + "so-facebook-mms VITS": "facebook/mms-tts-son", + "tg-facebook-mms VITS": "facebook/mms-tts-tgk", + "tk_script_arabic-facebook-mms VITS": "facebook/mms-tts-tuk-script_arabic", + "tk_script_latin-facebook-mms VITS": "facebook/mms-tts-tuk-script_latin", + "tt-facebook-mms VITS": "facebook/mms-tts-tat", + "tt_Crimean-facebook-mms VITS": "facebook/mms-tts-crh", + "uz_script_cyrillic-facebook-mms VITS": "facebook/mms-tts-uzb-script_cyrillic", + "yo-facebook-mms VITS": "facebook/mms-tts-yor", } LANGUAGE_CODE_IN_THREE_LETTERS = { diff --git a/soni_translate/logging_setup.py b/soni_translate/logging_setup.py index 93de968..bb17bef 100644 --- a/soni_translate/logging_setup.py +++ b/soni_translate/logging_setup.py @@ -9,11 +9,25 @@ def configure_logging_libs(debug=False): action="ignore", category=UserWarning, module="pyannote" ) modules = [ - "numba", "httpx", "markdown_it", "speechbrain", "fairseq", 
"pyannote" + "numba", "httpx", "markdown_it", "speechbrain", "fairseq", "pyannote", + "pytorch_lightning.utilities.migration.utils", + "pytorch_lightning.utilities.migration", + "pytorch_lightning", + "lightning", + "lightning.pytorch.utilities.migration.utils", ] - for module in modules: - logging.getLogger(module).setLevel(logging.WARNING) - os.environ['TF_CPP_MIN_LOG_LEVEL'] = "3" if not debug else "1" + try: + for module in modules: + logging.getLogger(module).setLevel(logging.WARNING) + os.environ['TF_CPP_MIN_LOG_LEVEL'] = "3" if not debug else "1" + + # fix verbose pyannote audio + def fix_verbose_pyannote(*args, what=""): + pass + import pyannote.audio.core.model # noqa + pyannote.audio.core.model.check_version = fix_verbose_pyannote + except Exception as error: + logger.error(str(error)) def setup_logger(name_log): diff --git a/soni_translate/speech_segmentation.py b/soni_translate/speech_segmentation.py index 0599f44..30c37f4 100644 --- a/soni_translate/speech_segmentation.py +++ b/soni_translate/speech_segmentation.py @@ -30,16 +30,31 @@ def transcribe_speech( - audio: Loaded audio file. - result: Transcription result as a dictionary. """ - with capture.capture_output() as cap: - model = whisperx.load_model( - WHISPER_MODEL_SIZE, - device, - compute_type=compute_type, - language=SOURCE_LANGUAGE, - ) - del cap + + # https://github.com/openai/whisper/discussions/277 + prompt = "以下是普通话的句子。" if SOURCE_LANGUAGE == "zh" else None + SOURCE_LANGUAGE = ( + SOURCE_LANGUAGE if SOURCE_LANGUAGE != "zh-TW" else "zh" + ) + asr_options = { + "initial_prompt": prompt + } + + model = whisperx.load_model( + WHISPER_MODEL_SIZE, + device, + compute_type=compute_type, + language=SOURCE_LANGUAGE, + asr_options=asr_options, + ) + audio = whisperx.load_audio(audio_wav) result = model.transcribe(audio, batch_size=batch_size) + + if result["language"] == "zh" and not prompt: + result["language"] = "zh-TW" + logger.info("Chinese - Traditional (zh-TW)") + del model gc.collect() torch.cuda.empty_cache() # noqa @@ -52,11 +67,12 @@ def align_speech(audio, result): Parameters: - audio (array): The audio data in a suitable format for alignment. - - result (dict): Metadata containing information about the segments and language. + - result (dict): Metadata containing information about the segments + and language. Returns: - - result (dict): Updated metadata after aligning the segments with the audio. - This includes character-level alignments if + - result (dict): Updated metadata after aligning the segments with + the audio. This includes character-level alignments if 'return_char_alignments' is set to True. Notes: @@ -70,7 +86,6 @@ def align_speech(audio, result): not result["language"] in DAMHF.keys() and not result["language"] in EXTRA_ALIGN.keys() ): - audio = result = None logger.warning( "Automatic detection: Source language not compatible with align" ) @@ -78,6 +93,15 @@ def align_speech(audio, result): f"Detected language {result['language']} incompatible, " "you can select the source language to avoid this error." ) + if ( + result["language"] in EXTRA_ALIGN.keys() + and EXTRA_ALIGN[result["language"]] == "" + ): + logger.warning( + "No compatible wav2vec2 model found " + "for this language, skipping alignment." + ) + return result model_a, metadata = whisperx.load_align_model( language_code=result["language"], @@ -119,12 +143,14 @@ def diarize_speech( Performs speaker diarization on speech segments. Parameters: - - audio_wav (array): Audio data in WAV format to perform speaker diarization. 
+ - audio_wav (array): Audio data in WAV format to perform speaker + diarization. - result (dict): Metadata containing information about speech segments and alignments. - min_speakers (int): Minimum number of speakers expected in the audio. - max_speakers (int): Maximum number of speakers expected in the audio. - - YOUR_HF_TOKEN (str): Your Hugging Face API token for model authentication. + - YOUR_HF_TOKEN (str): Your Hugging Face API token for model + authentication. - model_name (str): Name of the speaker diarization model to be used (default: "pyannote/speaker-diarization@2.1"). @@ -137,19 +163,19 @@ def diarize_speech( segments in the audio. - It assigns speakers to word-level segments based on diarization results. - Cleans up memory by releasing resources after diarization. - - If only one speaker is specified, each segment is automatically assigned as - the first speaker, eliminating the need for diarization inference. + - If only one speaker is specified, each segment is automatically assigned + as the first speaker, eliminating the need for diarization inference. """ if max(min_speakers, max_speakers) > 1 and model_name: try: - with capture.capture_output() as cap: - diarize_model = whisperx.DiarizationPipeline( - model_name=model_name, - use_auth_token=YOUR_HF_TOKEN, - device=device, - ) - del cap + + diarize_model = whisperx.DiarizationPipeline( + model_name=model_name, + use_auth_token=YOUR_HF_TOKEN, + device=device, + ) + except Exception as error: error_str = str(error) gc.collect() @@ -162,14 +188,15 @@ def diarize_speech( "accept the license to use the models: " "https://huggingface.co/pyannote/speaker-diarization " "and https://huggingface.co/pyannote/segmentation " - "Get your KEY TOKEN here: https://hf.co/settings/tokens" + "Get your KEY TOKEN here: " + "https://hf.co/settings/tokens " ) elif model_name == diarization_models["pyannote_3.1"]: raise ValueError( "New Licence Pyannote 3.1: You need to have an account" " on Hugging Face and accept the license to use the " - "models: https://huggingface.co/pyannote/speaker-diarization-3.1" - " and https://huggingface.co/pyannote/segmentation-3.0 " + "models: https://huggingface.co/pyannote/speaker-diarization-3.1 " # noqa + "and https://huggingface.co/pyannote/segmentation-3.0 " ) else: raise error diff --git a/soni_translate/text_multiformat_processor.py b/soni_translate/text_multiformat_processor.py index 4f56187..041b0e3 100644 --- a/soni_translate/text_multiformat_processor.py +++ b/soni_translate/text_multiformat_processor.py @@ -252,7 +252,9 @@ def process_subtitles( # original lang subs_copy_result = copy.deepcopy(deep_copied_result) - subs_copy_result["language"] = align_language + subs_copy_result["language"] = ( + "zh" if align_language == "zh-TW" else align_language + ) for segment in subs_copy_result["segments"]: segment.pop("speaker", None) @@ -279,7 +281,7 @@ def process_subtitles( # translated lang subs_tra_copy_result = copy.deepcopy(result_diarize) subs_tra_copy_result["language"] = ( - "ja" if TRANSLATE_AUDIO_TO in ["ja", "zh"] else align_language + "ja" if TRANSLATE_AUDIO_TO in ["ja", "zh", "zh-TW"] else align_language ) subs_tra_copy_result.pop("word_segments", None) for segment in subs_tra_copy_result["segments"]: From b9b4cc3643a66609409433f0ef0b4837cbabf5e7 Mon Sep 17 00:00:00 2001 From: Roger Condori <114810545+R3gm@users.noreply.github.com> Date: Sat, 30 Mar 2024 03:35:16 +0000 Subject: [PATCH 13/36] Added languages 2/2 (unidirectional) --- app_rvc.py | 7 +++ soni_translate/language_configuration.py | 
78 +++++++++++++++++++++++- soni_translate/speech_segmentation.py | 9 ++- soni_translate/text_to_speech.py | 11 +++- 4 files changed, 100 insertions(+), 5 deletions(-) diff --git a/app_rvc.py b/app_rvc.py index 9205653..2e55b5f 100644 --- a/app_rvc.py +++ b/app_rvc.py @@ -24,6 +24,7 @@ from soni_translate.postprocessor import media_out from soni_translate.language_configuration import ( LANGUAGES, + UNIDIRECTIONAL_L_LIST, LANGUAGES_LIST, bark_voices_list, vits_voices_list, @@ -354,6 +355,12 @@ def multilingual_media_conversion( else: os.environ["YOUR_HF_TOKEN"] = YOUR_HF_TOKEN + if SOURCE_LANGUAGE in UNIDIRECTIONAL_L_LIST and not subtitle_file: + raise ValueError( + f"The language '{SOURCE_LANGUAGE}' " + "is not supported for transcription (ASR)." + ) + if get_translated_text: self.edit_subs_complete = False if get_video_from_text_json: diff --git a/soni_translate/language_configuration.py b/soni_translate/language_configuration.py index 8759ed3..1116c78 100644 --- a/soni_translate/language_configuration.py +++ b/soni_translate/language_configuration.py @@ -1,5 +1,33 @@ from .logging_setup import logger +LANGUAGES_UNIDIRECTIONAL = { + "Aymara (ay)": "ay", + "Bambara (bm)": "bm", + "Cebuano (ceb)": "ceb", + "Chichewa (ny)": "ny", + "Divehi (dv)": "dv", + "Dogri (doi)": "doi", + "Ewe (ee)": "ee", + "Guarani (gn)": "gn", + "Iloko (ilo)": "ilo", + "Kinyarwanda (rw)": "rw", + "Krio (kri)": "kri", + "Kurdish (ku)": "ku", + "Kirghiz (ky)": "ky", + "Ganda (lg)": "lg", + "Maithili (mai)": "mai", + "Oriya (or)": "or", + "Oromo (om)": "om", + "Quechua (qu)": "qu", + "Samoan (sm)": "sm", + "Tigrinya (ti)": "ti", + "Tsonga (ts)": "ts", + "Akan (ak)": "ak", + "Uighur (ug)": "ug" +} + +UNIDIRECTIONAL_L_LIST = LANGUAGES_UNIDIRECTIONAL.keys() + LANGUAGES = { "Automatic detection": "Automatic detection", "Arabic (ar)": "ar", @@ -87,13 +115,13 @@ "Tatar (tt)": "tt", "Uzbek (uz)": "uz", "Yoruba (yo)": "yo", + **LANGUAGES_UNIDIRECTIONAL } BASE_L_LIST = LANGUAGES.keys() LANGUAGES_LIST = [list(BASE_L_LIST)[0]] + sorted(list(BASE_L_LIST)[1:]) INVERTED_LANGUAGES = {value: key for key, value in LANGUAGES.items()} - EXTRA_ALIGN = { "id": "indonesian-nlp/wav2vec2-large-xlsr-indonesian", "bn": "arijitx/wav2vec2-large-xlsr-bengali", @@ -401,6 +429,54 @@ def fix_code_language(translate_to, syntax="google"): "tt_Crimean-facebook-mms VITS": "facebook/mms-tts-crh", "uz_script_cyrillic-facebook-mms VITS": "facebook/mms-tts-uzb-script_cyrillic", "yo-facebook-mms VITS": "facebook/mms-tts-yor", + "ay-facebook-mms VITS": "facebook/mms-tts-ayr", + "bm-facebook-mms VITS": "facebook/mms-tts-bam", + "ceb-facebook-mms VITS": "facebook/mms-tts-ceb", + "ny-facebook-mms VITS": "facebook/mms-tts-nya", + "dv-facebook-mms VITS": "facebook/mms-tts-div", + "doi-facebook-mms VITS": "facebook/mms-tts-dgo", + "ee-facebook-mms VITS": "facebook/mms-tts-ewe", + "gn-facebook-mms VITS": "facebook/mms-tts-grn", + "ilo-facebook-mms VITS": "facebook/mms-tts-ilo", + "rw-facebook-mms VITS": "facebook/mms-tts-kin", + "kri-facebook-mms VITS": "facebook/mms-tts-kri", + "ku_script_arabic-facebook-mms VITS": "facebook/mms-tts-kmr-script_arabic", + "ku_script_cyrillic-facebook-mms VITS": "facebook/mms-tts-kmr-script_cyrillic", + "ku_script_latin-facebook-mms VITS": "facebook/mms-tts-kmr-script_latin", + "ckb-facebook-mms VITS": "razhan/mms-tts-ckb", # Verify w + "ky-facebook-mms VITS": "facebook/mms-tts-kir", + "lg-facebook-mms VITS": "facebook/mms-tts-lug", + "mai-facebook-mms VITS": "facebook/mms-tts-mai", + "or-facebook-mms VITS": "facebook/mms-tts-ory", + 
"om-facebook-mms VITS": "facebook/mms-tts-orm", + "qu_Huallaga-facebook-mms VITS": "facebook/mms-tts-qub", + "qu_Lambayeque-facebook-mms VITS": "facebook/mms-tts-quf", + "qu_South_Bolivian-facebook-mms VITS": "facebook/mms-tts-quh", + "qu_North_Bolivian-facebook-mms VITS": "facebook/mms-tts-qul", + "qu_Tena_Lowland-facebook-mms VITS": "facebook/mms-tts-quw", + "qu_Ayacucho-facebook-mms VITS": "facebook/mms-tts-quy", + "qu_Cusco-facebook-mms VITS": "facebook/mms-tts-quz", + "qu_Cajamarca-facebook-mms VITS": "facebook/mms-tts-qvc", + "qu_Eastern_Apurímac-facebook-mms VITS": "facebook/mms-tts-qve", + "qu_Huamalíes_Dos_de_Mayo_Huánuco-facebook-mms VITS": "facebook/mms-tts-qvh", + "qu_Margos_Yarowilca_Lauricocha-facebook-mms VITS": "facebook/mms-tts-qvm", + "qu_North_Junín-facebook-mms VITS": "facebook/mms-tts-qvn", + "qu_Napo-facebook-mms VITS": "facebook/mms-tts-qvo", + "qu_San_Martín-facebook-mms VITS": "facebook/mms-tts-qvs", + "qu_Huaylla_Wanca-facebook-mms VITS": "facebook/mms-tts-qvw", + "qu_Northern_Pastaza-facebook-mms VITS": "facebook/mms-tts-qvz", + "qu_Huaylas_Ancash-facebook-mms VITS": "facebook/mms-tts-qwh", + "qu_Panao-facebook-mms VITS": "facebook/mms-tts-qxh", + "qu_Salasaca_Highland-facebook-mms VITS": "facebook/mms-tts-qxl", + "qu_Northern_Conchucos_Ancash-facebook-mms VITS": "facebook/mms-tts-qxn", + "qu_Southern_Conchucos-facebook-mms VITS": "facebook/mms-tts-qxo", + "qu_Cañar_Highland-facebook-mms VITS": "facebook/mms-tts-qxr", + "sm-facebook-mms VITS": "facebook/mms-tts-smo", + "ti-facebook-mms VITS": "facebook/mms-tts-tir", + "ts-facebook-mms VITS": "facebook/mms-tts-tso", + "ak-facebook-mms VITS": "facebook/mms-tts-aka", + "ug_script_arabic-facebook-mms VITS": "facebook/mms-tts-uig-script_arabic", + "ug_script_cyrillic-facebook-mms VITS": "facebook/mms-tts-uig-script_cyrillic", } LANGUAGE_CODE_IN_THREE_LETTERS = { diff --git a/soni_translate/speech_segmentation.py b/soni_translate/speech_segmentation.py index 30c37f4..8bb730b 100644 --- a/soni_translate/speech_segmentation.py +++ b/soni_translate/speech_segmentation.py @@ -6,7 +6,7 @@ import torch import gc from IPython.utils import capture -from .language_configuration import EXTRA_ALIGN +from .language_configuration import EXTRA_ALIGN, INVERTED_LANGUAGES from .logging_setup import logger device = "cuda" if torch.cuda.is_available() else "cpu" @@ -97,9 +97,14 @@ def align_speech(audio, result): result["language"] in EXTRA_ALIGN.keys() and EXTRA_ALIGN[result["language"]] == "" ): + lang_name = ( + INVERTED_LANGUAGES[result["language"]] + if result["language"] in INVERTED_LANGUAGES.keys() + else result["language"] + ) logger.warning( "No compatible wav2vec2 model found " - "for this language, skipping alignment." + f"for the language '{lang_name}', skipping alignment." 
) return result diff --git a/soni_translate/text_to_speech.py b/soni_translate/text_to_speech.py index 0b67873..bb54923 100644 --- a/soni_translate/text_to_speech.py +++ b/soni_translate/text_to_speech.py @@ -1383,10 +1383,17 @@ def toneconverter_freevc( ) = f"audio2/audio/{str(seg['start'])}.ogg" # overwrite logger.debug(f"{src_path} - {original_wav_audio_segment}") - tts.voice_conversion_to_file( + wav = tts.voice_conversion( source_wav=src_path, target_wav=original_wav_audio_segment, - file_path=save_path + ) + + sf.write( + file=save_path, + samplerate=tts.voice_converter.vc_config.audio.output_sample_rate, + data=wav, + format="ogg", + subtype="vorbis", ) global_progress_bar.update(1) From ad96bd2694fc4c968eb93882eae983a8b26b711f Mon Sep 17 00:00:00 2001 From: Roger Condori <114810545+R3gm@users.noreply.github.com> Date: Mon, 1 Apr 2024 21:05:27 +0000 Subject: [PATCH 14/36] perf: Improve translation accuracy --- app_rvc.py | 7 ++ soni_translate/translate_segments.py | 113 +++++++++++++++++++-------- 2 files changed, 87 insertions(+), 33 deletions(-) diff --git a/app_rvc.py b/app_rvc.py index 2e55b5f..1a233d5 100644 --- a/app_rvc.py +++ b/app_rvc.py @@ -576,11 +576,17 @@ def multilingual_media_conversion( "result_diarize": self.result_diarize }): prog_disp("Translating...", 0.70, is_gui, progress=progress) + lang_source = ( + self.align_language + if self.align_language + else SOURCE_LANGUAGE + ) self.result_diarize["segments"] = translate_text( self.result_diarize["segments"], TRANSLATE_AUDIO_TO, translate_process, chunk_size=1800, + source=lang_source, ) logger.debug("Translation complete") logger.debug(self.result_diarize) @@ -958,6 +964,7 @@ def multilingual_docs_conversion( TRANSLATE_AUDIO_TO, translate_process, chunk_size=0, + source=SOURCE_LANGUAGE, ) txt_file_path, result_text = segments_to_plain_text(result_diarize) diff --git a/soni_translate/translate_segments.py b/soni_translate/translate_segments.py index a7f94d2..8d2881b 100644 --- a/soni_translate/translate_segments.py +++ b/soni_translate/translate_segments.py @@ -2,34 +2,41 @@ from deep_translator import GoogleTranslator from itertools import chain import copy -from .language_configuration import fix_code_language +from tqdm import tqdm +from .language_configuration import fix_code_language, INVERTED_LANGUAGES from .logging_setup import logger -def translate_iterative(segments, TRANSLATE_AUDIO_TO): +def translate_iterative(segments, target, source=None): """ Translate text segments individually to the specified language. Parameters: - - segments (list): A list of dictionaries, each containing 'text' as a key - with the segment text to be translated. - - TRANSLATE_AUDIO_TO (str): The language code to which the text should be - translated. + - segments (list): A list of dictionaries with 'text' as a key for + segment text. + - target (str): Target language code. + - source (str, optional): Source language code. Defaults to None. Returns: - - list: A list of dictionaries with translated text segments in the specified language. + - list: Translated text segments in the target language. Notes: - - This function translates each text segment individually using the Google Translator. + - Translates each segment using Google Translate. 
Example: - segments = [{'text': 'This is the first segment.'}, {'text': 'And this is the second segment.'}] + segments = [{'text': 'first segment.'}, {'text': 'second segment.'}] translated_segments = translate_iterative(segments, 'es') """ segments_ = copy.deepcopy(segments) - translator = GoogleTranslator(source="auto", target=TRANSLATE_AUDIO_TO) + if ( + not source + ): + logger.debug("No source language") + source = "auto" + + translator = GoogleTranslator(source=source, target=target) for line in tqdm(range(len(segments_))): text = segments_[line]["text"] @@ -39,30 +46,42 @@ def translate_iterative(segments, TRANSLATE_AUDIO_TO): return segments_ -def translate_batch(segments, TRANSLATE_AUDIO_TO, chunk_size=2000): +def translate_batch(segments, target, chunk_size=2000, source=None): """ - Translate a batch of text segments into the specified language in chunks respecting the character limit. + Translate a batch of text segments into the specified language in chunks, + respecting the character limit. Parameters: - - segments (list): A list of dictionaries, each containing 'text' as a key with the segment text to be translated. - - TRANSLATE_AUDIO_TO (str): The language code to which the text should be translated. - - chunk_size (int, optional): The maximum character limit for each translation chunk (default is 2000); max 5000. + - segments (list): List of dictionaries with 'text' as a key for segment + text. + - target (str): Target language code. + - chunk_size (int, optional): Maximum character limit for each translation + chunk (default is 2000; max 5000). + - source (str, optional): Source language code. Defaults to None. Returns: - - list: A list of dictionaries with translated text segments in the specified language. + - list: Translated text segments in the target language. Notes: - - This function splits the input segments into chunks respecting the character limit for translation. - - It translates the chunks using the Google Translator. - - If the chunked translation fails, it switches to iterative translation using `translate_iterative()`. + - Splits input segments into chunks respecting the character limit for + translation. + - Translates the chunks using Google Translate. + - If chunked translation fails, switches to iterative translation using + `translate_iterative()`. 
Example: - segments = [{'text': 'This is the first segment.'}, {'text': 'And this is the second segment.'}] - translated_segments = translate_batch(segments, 'es', chunk_size=4000) + segments = [{'text': 'first segment.'}, {'text': 'second segment.'}] + translated = translate_batch(segments, 'es', chunk_size=4000, source='en') """ segments_copy = copy.deepcopy(segments) + if ( + not source + ): + logger.debug("No source language") + source = "auto" + # Get text text_lines = [] for line in range(len(segments_copy)): @@ -72,63 +91,91 @@ def translate_batch(segments, TRANSLATE_AUDIO_TO, chunk_size=2000): # chunk limit text_merge = [] actual_chunk = "" + global_text_list = [] + actual_text_list = [] for one_line in text_lines: + one_line = " " if not one_line else one_line if (len(actual_chunk) + len(one_line)) <= chunk_size: if actual_chunk: actual_chunk += " ||||| " actual_chunk += one_line + actual_text_list.append(one_line) else: text_merge.append(actual_chunk) - one_line = " " if not one_line else one_line actual_chunk = one_line + global_text_list.append(actual_text_list) + actual_text_list = [one_line] if actual_chunk: text_merge.append(actual_chunk) + global_text_list.append(actual_text_list) # translate chunks - translator = GoogleTranslator(source="auto", target=TRANSLATE_AUDIO_TO) + progress_bar = tqdm(total=len(segments), desc="Translating") + translator = GoogleTranslator(source=source, target=target) + split_list = [] try: - translated_lines = translator.translate_batch(text_merge) + for text, text_iterable in zip(text_merge, global_text_list): + translated_line = translator.translate(text.strip()) + split_text = translated_line.split("|||||") + if len(split_text) == len(text_iterable): + progress_bar.update(len(split_text)) + else: + logger.debug( + "Chunk fixing iteratively. Len chunk: " + f"{len(split_text)}, expected: {len(text_iterable)}" + ) + split_text = [] + for txt_iter in text_iterable: + translated_txt = translator.translate(txt_iter.strip()) + split_text.append(translated_txt) + progress_bar.update(1) + split_list.append(split_text) + progress_bar.close() except Exception as error: + progress_bar.close() logger.error(str(error)) logger.warning( - "The translation in chunks failed, switching to iterative. Related: too many request" + "The translation in chunks failed, switching to iterative." + " Related: too many request" ) # use proxy or less chunk size - return translate_iterative(segments, TRANSLATE_AUDIO_TO) + return translate_iterative(segments, target, source) # un chunk - split_list = [sentence.split("|||||") for sentence in translated_lines] translated_lines = list(chain.from_iterable(split_list)) # verify integrity ok if len(segments) == len(translated_lines): for line in range(len(segments_copy)): logger.debug( - f"{segments_copy[line]['text']} >> {translated_lines[line].strip()}" + f"{segments_copy[line]['text']} >> " + f"{translated_lines[line].strip()}" ) segments_copy[line]["text"] = translated_lines[line].strip() return segments_copy else: logger.error( - f"The translation in chunks failed, switching to iterative. {len(segments), len(translated_lines)}" + "The translation in chunks failed, switching to iterative. 
" + f"{len(segments), len(translated_lines)}" ) - return translate_iterative(segments, TRANSLATE_AUDIO_TO) + return translate_iterative(segments, target, source) def translate_text( segments, - TRANSLATE_AUDIO_TO, + target, translation_process="google_translator_batch", chunk_size=4500, + source=None, ): """Translates text segments using a specified process.""" match translation_process: case "google_translator_batch": return translate_batch( - segments, fix_code_language(TRANSLATE_AUDIO_TO), chunk_size + segments, fix_code_language(target), chunk_size, source ) case "google_translator_iterative": return translate_iterative( - segments, fix_code_language(TRANSLATE_AUDIO_TO) + segments, fix_code_language(target), source ) case "disable_translation": return segments From 108d10f0420476a420ef33850be0f0ea7aefb19f Mon Sep 17 00:00:00 2001 From: Roger Condori <114810545+R3gm@users.noreply.github.com> Date: Wed, 3 Apr 2024 03:05:17 +0000 Subject: [PATCH 15/36] feat(translation): Added gpt's api openai translation #30 --- app_rvc.py | 41 ++-- requirements_extra.txt | 2 + soni_translate/audio_segments.py | 5 +- soni_translate/speech_segmentation.py | 11 +- soni_translate/text_to_speech.py | 13 +- soni_translate/translate_segments.py | 310 ++++++++++++++++++++++++-- 6 files changed, 341 insertions(+), 41 deletions(-) diff --git a/app_rvc.py b/app_rvc.py index 1a233d5..de9fc04 100644 --- a/app_rvc.py +++ b/app_rvc.py @@ -16,7 +16,11 @@ create_wav_file_vc, accelerate_segments, ) -from soni_translate.translate_segments import translate_text +from soni_translate.translate_segments import ( + translate_text, + TRANSLATION_PROCESS_OPTIONS, + DOCS_TRANSLATION_PROCESS_OPTIONS +) from soni_translate.preprocessor import ( audio_video_preprocessor, audio_preprocessor, @@ -235,6 +239,16 @@ def get_hash(filepath): return file_hash.hexdigest()[:18] +def check_openai_api_key(): + if not os.environ.get("OPENAI_API_KEY"): + raise ValueError( + "To use GPT for translation, please set up your OpenAI API key " + "as an environment variable in Linux as follows: " + "export OPENAI_API_KEY='your-api-key-here'. Or change the " + "translation process in Advanced settings." 
+ ) + + class SoniTranslate(SoniTrCache): def __init__(self, dev=False): super().__init__() @@ -355,6 +369,9 @@ def multilingual_media_conversion( else: os.environ["YOUR_HF_TOKEN"] = YOUR_HF_TOKEN + if "gpt" in translate_process: + check_openai_api_key() + if SOURCE_LANGUAGE in UNIDIRECTIONAL_L_LIST and not subtitle_file: raise ValueError( f"The language '{SOURCE_LANGUAGE}' " @@ -898,12 +915,15 @@ def multilingual_docs_conversion( TRANSLATE_AUDIO_TO="English (en)", tts_voice00="en-AU-WilliamNeural-Male", name_final_file="sample", - translate_process="google_translator_iterative", + translate_process="google_translator", output_type="audio", chunk_size=None, is_gui=False, progress=gr.Progress(), ): + if "gpt" in translate_process: + check_openai_api_key() + SOURCE_LANGUAGE = LANGUAGES[SOURCE_LANGUAGE] if translate_process != "disable_translation": TRANSLATE_AUDIO_TO = LANGUAGES[TRANSLATE_AUDIO_TO] @@ -1420,14 +1440,9 @@ def get_subs_path(type_subs): value=pyannote_models_list[1], label=lg_conf["diarization_label"], ) - valid_translate_process = [ - "google_translator_batch", - "google_translator_iterative", - "disable_translation", - ] translate_process_dropdown = gr.Dropdown( - valid_translate_process, - value=valid_translate_process[0], + TRANSLATION_PROCESS_OPTIONS, + value=TRANSLATION_PROCESS_OPTIONS[0], label=lg_conf["tr_process_label"], ) @@ -1723,13 +1738,9 @@ def swap_visibility(data_type): with gr.Accordion( lg_conf["extra_setting"], open=False ): - docs_valid_translate_process = [ - "google_translator_iterative", - "disable_translation", - ] docs_translate_process_dropdown = gr.Dropdown( - docs_valid_translate_process, - value=docs_valid_translate_process[ + DOCS_TRANSLATION_PROCESS_OPTIONS, + value=DOCS_TRANSLATION_PROCESS_OPTIONS[ 0 ], label="Translation process", diff --git a/requirements_extra.txt b/requirements_extra.txt index ab705ff..5cb027f 100644 --- a/requirements_extra.txt +++ b/requirements_extra.txt @@ -12,6 +12,8 @@ optimum sentencepiece srt git+https://github.com/R3gm/openvoice_package.git@lite +openai==1.14.3 +tiktoken==0.6.0 # Documents PyPDF2 python-docx \ No newline at end of file diff --git a/soni_translate/audio_segments.py b/soni_translate/audio_segments.py index 8f8bad3..941c244 100644 --- a/soni_translate/audio_segments.py +++ b/soni_translate/audio_segments.py @@ -2,7 +2,6 @@ from tqdm import tqdm from .utils import run_command from .logging_setup import logger -from pydub import AudioSegment import numpy as np @@ -24,7 +23,7 @@ def _sync(self): positions, segs = zip(*self.parts) frame_rate = segs[0].frame_rate - array_type = segs[0].array_type + array_type = segs[0].array_type # noqa offsets = [int(frame_rate * pos / 1000.0) for pos in positions] segs = AudioSegment.empty()._sync(*segs) @@ -117,7 +116,7 @@ def create_translated_audio( start = (last_end_time - 0.250) if overlap_time > 2.5: start = start - 0.3 - logger.debug( + logger.info( f"Avoid overlap for {str(audio_file)} " f"with {str(start)}" ) diff --git a/soni_translate/speech_segmentation.py b/soni_translate/speech_segmentation.py index 8bb730b..f52cd16 100644 --- a/soni_translate/speech_segmentation.py +++ b/soni_translate/speech_segmentation.py @@ -5,7 +5,7 @@ import whisperx import torch import gc -from IPython.utils import capture +from IPython.utils import capture # noqa from .language_configuration import EXTRA_ALIGN, INVERTED_LANGUAGES from .logging_setup import logger @@ -212,6 +212,15 @@ def diarize_speech( result_diarize = whisperx.assign_word_speakers( diarize_segments, result ) + + 
for segment in result_diarize["segments"]: + if "speaker" not in segment: + segment["speaker"] = "SPEAKER_00" + logger.warning( + f"No speaker detected in {segment['start']}. First TTS " + f"will be used for the segment text: {segment['text']} " + ) + del diarize_model gc.collect() torch.cuda.empty_cache() # noqa diff --git a/soni_translate/text_to_speech.py b/soni_translate/text_to_speech.py index bb54923..0312150 100644 --- a/soni_translate/text_to_speech.py +++ b/soni_translate/text_to_speech.py @@ -39,7 +39,8 @@ def verify_saved_file_and_size(filename): raise TTS_OperationError(f"File '{filename}' was not saved.") if os.path.getsize(filename) == 0: raise TTS_OperationError( - f"File '{filename}' has a zero size. Related to incorrect TTS for the target language" + f"File '{filename}' has a zero size. " + "Related to incorrect TTS for the target language" ) @@ -64,7 +65,8 @@ def error_handling_in_tts(error, segment, TRANSLATE_AUDIO_TO, filename): ) logger.warning( - f'TTS auxiliary will be utilized rather than TTS: {segment["tts_name"]}' + 'TTS auxiliary will be utilized ' + f'rather than TTS: {segment["tts_name"]}' ) verify_saved_file_and_size(filename) except Exception as error: @@ -1080,7 +1082,9 @@ def accelerate_segments( filename=f"{folder_output}/{filename}" ) logger.debug( - f"acc_percen is {acc_percentage}, tts duration is {duration_tts}, new duration is {duration_create}, for {filename}" + f"acc_percen is {acc_percentage}, tts duration " + f"is {duration_tts}, new duration is {duration_create}" + f", for {filename}" ) audio_files.append(f"{folder_output}/{filename}") @@ -1166,7 +1170,8 @@ def create_wav_vc( if os.path.exists(check_segment_audio_target_file): logger.debug( - f"Segment vc source exists: {check_segment_audio_target_file}" + "Segment vc source exists: " + f"{check_segment_audio_target_file}" ) pass else: diff --git a/soni_translate/translate_segments.py b/soni_translate/translate_segments.py index 8d2881b..52bea8f 100644 --- a/soni_translate/translate_segments.py +++ b/soni_translate/translate_segments.py @@ -2,9 +2,27 @@ from deep_translator import GoogleTranslator from itertools import chain import copy -from tqdm import tqdm from .language_configuration import fix_code_language, INVERTED_LANGUAGES from .logging_setup import logger +import re +import json +import time + +TRANSLATION_PROCESS_OPTIONS = [ + "google_translator_batch", + "google_translator", + "gpt-3.5-turbo-0125_batch", + "gpt-3.5-turbo-0125", + "gpt-4-turbo-preview_batch", + "gpt-4-turbo-preview", + "disable_translation", +] +DOCS_TRANSLATION_PROCESS_OPTIONS = [ + "google_translator", + "gpt-3.5-turbo-0125", + "gpt-4-turbo-preview", + "disable_translation", +] def translate_iterative(segments, target, source=None): @@ -46,6 +64,34 @@ def translate_iterative(segments, target, source=None): return segments_ +def verify_translate( + segments, + segments_copy, + translated_lines, + target, + source +): + """ + Verify integrity and translate segments if lengths match, otherwise + switch to iterative translation. + """ + if len(segments) == len(translated_lines): + for line in range(len(segments_copy)): + logger.debug( + f"{segments_copy[line]['text']} >> " + f"{translated_lines[line].strip()}" + ) + segments_copy[line]["text"] = translated_lines[ + line].replace("\t", "").replace("\n", "").strip() + return segments_copy + else: + logger.error( + "The translation failed, switching to google_translate iterative. 
" + f"{len(segments), len(translated_lines)}" + ) + return translate_iterative(segments, target, source) + + def translate_batch(segments, target, chunk_size=2000, source=None): """ Translate a batch of text segments into the specified language in chunks, @@ -143,21 +189,233 @@ def translate_batch(segments, target, chunk_size=2000, source=None): # un chunk translated_lines = list(chain.from_iterable(split_list)) - # verify integrity ok - if len(segments) == len(translated_lines): - for line in range(len(segments_copy)): - logger.debug( - f"{segments_copy[line]['text']} >> " - f"{translated_lines[line].strip()}" - ) - segments_copy[line]["text"] = translated_lines[line].strip() - return segments_copy - else: - logger.error( - "The translation in chunks failed, switching to iterative. " - f"{len(segments), len(translated_lines)}" + return verify_translate( + segments, segments_copy, translated_lines, target, source + ) + + +def call_gpt_translate( + client, + model, + system_prompt, + user_prompt, + original_text=None, + batch_lines=None, +): + + # https://platform.openai.com/docs/guides/text-generation/json-mode + response = client.chat.completions.create( + model=model, + response_format={"type": "json_object"}, + messages=[ + {"role": "system", "content": system_prompt}, + {"role": "user", "content": user_prompt} + ] + ) + result = response.choices[0].message.content + logger.debug(f"Result: {str(result)}") + + try: + translation = json.loads(result) + except Exception as error: + match_result = re.search(r'\{.*?\}', result) + if match_result: + logger.error(str(error)) + json_str = match_result.group(0) + translation = json.loads(json_str) + else: + raise error + + # Get valid data + if batch_lines: + for conversation in translation.values(): + if isinstance(conversation, dict): + conversation = list(conversation.values())[0] + if ( + list( + original_text["conversation"][0].values() + )[0].strip() == + list(conversation[0].values())[0].strip() + ): + continue + if len(conversation) == batch_lines: + break + + fix_conversation_length = [] + for line in conversation: + for speaker_code, text_tr in line.items(): + fix_conversation_length.append({speaker_code: text_tr}) + + logger.debug(f"Data batch: {str(fix_conversation_length)}") + logger.debug( + f"Lines Received: {len(fix_conversation_length)}," + f" expected: {batch_lines}" ) - return translate_iterative(segments, target, source) + + return fix_conversation_length + + else: + if isinstance(translation, dict): + translation = list(translation.values())[0] + if isinstance(translation, list): + translation = translation[0] + if isinstance(translation, set): + translation = list(translation)[0] + if not isinstance(translation, str): + raise ValueError(f"No valid response received: {str(translation)}") + + return translation + + +def gpt_sequential(segments, model, target, source=None): + from openai import OpenAI + + translated_segments = copy.deepcopy(segments) + + client = OpenAI() + progress_bar = tqdm(total=len(segments), desc="Translating") + + lang_tg = re.sub(r'\([^)]*\)', '', INVERTED_LANGUAGES[target]).strip() + lang_sc = "" + if source: + lang_sc = re.sub(r'\([^)]*\)', '', INVERTED_LANGUAGES[source]).strip() + + fixed_target = fix_code_language(target) + fixed_source = fix_code_language(source) if source else "auto" + + system_prompt = "Machine translation designed to output the translated_text JSON." 
+ + for i, line in enumerate(translated_segments): + text = line["text"].strip() + start = line["start"] + user_prompt = f"Translate the following {lang_sc} text into {lang_tg}, write the fully translated text and nothing more:\n{text}" + + time.sleep(0.5) + + try: + translated_text = call_gpt_translate( + client, + model, + system_prompt, + user_prompt, + ) + + except Exception as error: + logger.error( + f"{str(error)} >> The text of segment {start} " + "is being corrected with Google Translate" + ) + translator = GoogleTranslator( + source=fixed_source, target=fixed_target + ) + translated_text = translator.translate(text.strip()) + + translated_segments[i]["text"] = translated_text.strip() + progress_bar.update(1) + + progress_bar.close() + + return translated_segments + + +def gpt_batch(segments, model, target, token_batch_limit=900, source=None): + from openai import OpenAI + import tiktoken + + token_batch_limit = max(100, (token_batch_limit - 40) // 2) + progress_bar = tqdm(total=len(segments), desc="Translating") + segments_copy = copy.deepcopy(segments) + encoding = tiktoken.get_encoding("cl100k_base") + client = OpenAI() + + lang_tg = re.sub(r'\([^)]*\)', '', INVERTED_LANGUAGES[target]).strip() + lang_sc = "" + if source: + lang_sc = re.sub(r'\([^)]*\)', '', INVERTED_LANGUAGES[source]).strip() + + fixed_target = fix_code_language(target) + fixed_source = fix_code_language(source) if source else "auto" + + name_speaker = "ABCDEF" + + translated_lines = [] + text_data_dict = [] + num_tokens = 0 + count_sk = {char: 0 for char in "ABCDEF"} + + for i, line in enumerate(segments_copy): + text = line["text"] + speaker = line["speaker"] + last_start = line["start"] + # text_data_dict.append({str(int(speaker[-1])+1): text}) + index_sk = int(speaker[-1]) + character_sk = name_speaker[index_sk] + count_sk[character_sk] += 1 + code_sk = character_sk+str(count_sk[character_sk]) + text_data_dict.append({code_sk: text}) + num_tokens += len(encoding.encode(text)) + 7 + if num_tokens >= token_batch_limit or i == len(segments_copy)-1: + try: + batch_lines = len(text_data_dict) + batch_conversation = {"conversation": copy.deepcopy(text_data_dict)} + # Reset vars + num_tokens = 0 + text_data_dict = [] + count_sk = {char: 0 for char in "ABCDEF"} + # Process translation + # https://arxiv.org/pdf/2309.03409.pdf + system_prompt = f"Machine translation designed to output the translated_conversation key JSON containing a list of {batch_lines} items." + user_prompt = f"Translate each of the following text values in conversation{' from' if lang_sc else ''} {lang_sc} to {lang_tg}:\n{batch_conversation}" + logger.debug(f"Prompt: {str(user_prompt)}") + + conversation = call_gpt_translate( + client, + model, + system_prompt, + user_prompt, + original_text=batch_conversation, + batch_lines=batch_lines, + ) + + if len(conversation) < batch_lines: + raise ValueError( + "Incomplete result received. 
Batch lines: " + f"{len(conversation)}, expected: {batch_lines}" + ) + + for i, translated_text in enumerate(conversation): + if i+1 > batch_lines: + break + translated_lines.append(list(translated_text.values())[0]) + + progress_bar.update(batch_lines) + + except Exception as error: + logger.error(str(error)) + + first_start = segments_copy[max(0, i-(batch_lines-1))]["start"] + logger.warning( + f"The batch from {first_start} to {last_start} " + "failed, is being corrected with Google Translate" + ) + + translator = GoogleTranslator( + source=fixed_source, + target=fixed_target + ) + + for txt_source in batch_conversation["conversation"]: + translated_txt = translator.translate( + list(txt_source.values())[0].strip() + ) + translated_lines.append(translated_txt.strip()) + progress_bar.update(1) + + progress_bar.close() + + return verify_translate( + segments, segments_copy, translated_lines, fixed_target, fixed_source + ) def translate_text( @@ -166,16 +424,32 @@ def translate_text( translation_process="google_translator_batch", chunk_size=4500, source=None, + token_batch_limit=1000, ): """Translates text segments using a specified process.""" match translation_process: case "google_translator_batch": return translate_batch( - segments, fix_code_language(target), chunk_size, source + segments, + fix_code_language(target), + chunk_size, + fix_code_language(source) ) - case "google_translator_iterative": + case "google_translator": return translate_iterative( - segments, fix_code_language(target), source + segments, + fix_code_language(target), + fix_code_language(source) + ) + case model if model in ["gpt-3.5-turbo-0125", "gpt-4-turbo-preview"]: + return gpt_sequential(segments, model, target, source) + case model if model in ["gpt-3.5-turbo-0125_batch", "gpt-4-turbo-preview_batch",]: + return gpt_batch( + segments, + translation_process.replace("_batch", ""), + target, + token_batch_limit, + source ) case "disable_translation": return segments From 51f41d0eaeec2c48ff143905ce4a433679f450cc Mon Sep 17 00:00:00 2001 From: Roger Condori <114810545+R3gm@users.noreply.github.com> Date: Wed, 3 Apr 2024 03:46:16 +0000 Subject: [PATCH 16/36] docs(Windows): Set openai api key --- docs/windows_install.md | 17 ++++++++++++----- 1 file changed, 12 insertions(+), 5 deletions(-) diff --git a/docs/windows_install.md b/docs/windows_install.md index 98eedb7..7bad658 100644 --- a/docs/windows_install.md +++ b/docs/windows_install.md @@ -57,10 +57,9 @@ cd SoniTranslate conda install -c "nvidia/label/cuda-11.8.0" cuda-toolkit -y ``` -4. Install cuDNN - +4. Install PyTorch using conda ``` -conda install -c conda-forge cudnn -y +conda install pytorch torchvision torchaudio pytorch-cuda=11.8 -c pytorch -c nvidia -y ``` 5. Install required packages: @@ -71,7 +70,7 @@ pip install -r requirements_extra.txt -v pip install onnxruntime-gpu ``` -6. Install [ffmpeg](https://ffmpeg.org/download.html). FFmpeg is a free software project that produces libraries and programs for handling multimedia data. You will need it to process audio and video files. You can install ffmpeg with Anaconda by running `conda install -y ffmpeg` in your terminal. If you have trouble installing ffmpeg via Anaconda, you can use the following link instead: (https://ffmpeg.org/ffmpeg.html). Once it is installed, make sure it is in your PATH by running `ffmpeg -h` in your terminal. If you don't get an error message, you're good to go. +6. Install [ffmpeg](https://ffmpeg.org/download.html). 
FFmpeg is a free software project that produces libraries and programs for handling multimedia data. You will need it to process audio and video files. You can install ffmpeg with Anaconda by running `conda install -y ffmpeg` in your terminal (recommended). If you have trouble installing ffmpeg via Anaconda, you can use the following link instead: (https://ffmpeg.org/ffmpeg.html). Once it is installed, make sure it is in your PATH by running `ffmpeg -h` in your terminal. If you don't get an error message, you're good to go. 7. Optional install: @@ -140,4 +139,12 @@ conda env remove -n sonitr With the `sonitr` environment removed, you can start over with a fresh installation. ### Notes -If for any reason the installation fails or gets stuck when cuDNN is being installed, it will be necessary to directly install [CUDA Toolkit 11.8.0](https://developer.nvidia.com/cuda-11-8-0-download-archive) and [cuDNN for CUDA 11.x](https://developer.nvidia.com/rdp/cudnn-archive), [info](https://docs.nvidia.com/deeplearning/cudnn/installation/windows.html) provided by NVIDIA. +- To use OpenAI's GPT API for translation, set up your OpenAI API key as an environment variable in quotes: + +``` +conda activate sonitr +conda env config vars set OPENAI_API_KEY="your-api-key-here" +conda deactivate +``` + +- Alternatively, you can install the CUDA Toolkit 11.8.0 directly on your system [CUDA Toolkit 11.8.0](https://developer.nvidia.com/cuda-11-8-0-download-archive). \ No newline at end of file From 576b4b89f138715ab01f25effc2512fc5709d1db Mon Sep 17 00:00:00 2001 From: Roger Condori <114810545+R3gm@users.noreply.github.com> Date: Fri, 5 Apr 2024 04:05:59 +0000 Subject: [PATCH 17/36] feat: python bindings, which allow it to be used without an interface, mainly for test purposes --- .gitignore | 1 + app_rvc.py | 167 ++++++++++++++++--------------- soni_translate/text_to_speech.py | 16 +-- voice_main.py | 14 +-- 4 files changed, 107 insertions(+), 91 deletions(-) diff --git a/.gitignore b/.gitignore index 3cb07f4..375d3fe 100644 --- a/.gitignore +++ b/.gitignore @@ -179,3 +179,4 @@ clean_song_output/ audio2/ audio/ outputs/ +PIPER_MODELS/ \ No newline at end of file diff --git a/app_rvc.py b/app_rvc.py index de9fc04..c505e8c 100644 --- a/app_rvc.py +++ b/app_rvc.py @@ -260,8 +260,51 @@ def __init__(self, dev=False): self.voiceless_id = None self.burn_subs_id = None + os.environ["VOICES_MODELS"] = "DISABLE" + self.vci = ClassVoices() + + self.tts_voices = self.get_tts_voice_list() + logger.info(f"Working in: {self.device}") + def get_tts_voice_list(self): + try: + from piper import PiperVoice # noqa + + piper_enabled = True + logger.info("PIPER TTS enabled") + except Exception as error: + logger.debug(str(error)) + piper_enabled = False + logger.info("PIPER TTS disabled") + try: + from TTS.api import TTS # noqa + + xtts_enabled = True + logger.info("Coqui XTTS enabled") + logger.info( + "In this app, by using Coqui TTS (text-to-speech), you " + "acknowledge and agree to the license.\n" + "You confirm that you have read, understood, and agreed " + "to the Terms and Conditions specified at the following " + "link:\nhttps://coqui.ai/cpml.txt." 
+ ) + os.environ["COQUI_TOS_AGREED"] = "1" + except Exception as error: + logger.debug(str(error)) + xtts_enabled = False + logger.info("Coqui XTTS disabled") + + self.tts_info = TTS_Info(piper_enabled, xtts_enabled) + + return self.tts_info.tts_list() + + def enable_custom_model_voice(self): + os.environ["VOICES_MODELS"] = "ENABLE" + + def disable_custom_model_voice(self): + os.environ["VOICES_MODELS"] = "DISABLE" + def batch_multilingual_media_conversion(self, *kwargs): # logger.debug(str(kwargs)) @@ -312,10 +355,10 @@ def batch_multilingual_media_conversion(self, *kwargs): def multilingual_media_conversion( self, - media_file, - link_media, - directory_input, - YOUR_HF_TOKEN, + media_file=None, + link_media="", + directory_input="", + YOUR_HF_TOKEN="", preview=False, WHISPER_MODEL_SIZE="large-v3", batch_size=16, @@ -688,7 +731,7 @@ def multilingual_media_conversion( self.valid_speakers = audio_segmentation_to_voice( self.result_diarize, TRANSLATE_AUDIO_TO, - True, + is_gui, tts_voice00, tts_voice01, tts_voice02, @@ -698,38 +741,38 @@ def multilingual_media_conversion( dereverb_automatic_xtts, ) - if not hasattr(vci, 'model_voice_path00'): + if not hasattr(self.vci, 'model_voice_path00'): cc_transpose_values = cc_index_values = cc_model_paths = None else: cc_model_paths = [ - vci.model_voice_path00, - vci.model_voice_path01, - vci.model_voice_path02, - vci.model_voice_path03, - vci.model_voice_path04, - vci.model_voice_path05, - vci.model_voice_path99 + self.vci.model_voice_path00, + self.vci.model_voice_path01, + self.vci.model_voice_path02, + self.vci.model_voice_path03, + self.vci.model_voice_path04, + self.vci.model_voice_path05, + self.vci.model_voice_path99 ] cc_index_values = [ - vci.file_index200, - vci.file_index201, - vci.file_index202, - vci.file_index203, - vci.file_index204, - vci.file_index205, - vci.file_index299 + self.vci.file_index200, + self.vci.file_index201, + self.vci.file_index202, + self.vci.file_index203, + self.vci.file_index204, + self.vci.file_index205, + self.vci.file_index299 ] cc_transpose_values = [ - vci.f0method, - vci.transpose00, - vci.transpose01, - vci.transpose02, - vci.transpose03, - vci.transpose04, - vci.transpose05, - vci.transpose99 + self.vci.f0method, + self.vci.transpose00, + self.vci.transpose01, + self.vci.transpose02, + self.vci.transpose03, + self.vci.transpose04, + self.vci.transpose05, + self.vci.transpose99 ] if not self.task_in_cache("acc_and_vc", [ @@ -785,7 +828,7 @@ def multilingual_media_conversion( logger.error("Apply the configuration!") try: - vci(speakers_list, audio_files) + self.vci(speakers_list, audio_files) except Exception as error: logger.error(str(error)) @@ -908,7 +951,7 @@ def multilingual_media_conversion( def multilingual_docs_conversion( self, - string_text, # string + string_text="", # string document=None, # doc path gui directory_input="", # doc path SOURCE_LANGUAGE="English (en)", @@ -1027,7 +1070,7 @@ def multilingual_docs_conversion( is_gui, progress=progress, ) - vci(speakers_list, audio_files) + self.vci(speakers_list, audio_files) prog_disp( "Creating final audio file...", 0.90, is_gui, progress=progress @@ -1144,42 +1187,42 @@ def submit(value): return [value for value in visibility_dict.values()] tts_voice00 = gr.Dropdown( - tts_info.tts_list(), + SoniTr.tts_info.tts_list(), value="en-AU-WilliamNeural-Male", label=lg_conf["sk1"], visible=True, interactive=True, ) tts_voice01 = gr.Dropdown( - tts_info.tts_list(), + SoniTr.tts_info.tts_list(), value="en-CA-ClaraNeural-Female", label=lg_conf["sk2"], 
visible=True, interactive=True, ) tts_voice02 = gr.Dropdown( - tts_info.tts_list(), + SoniTr.tts_info.tts_list(), value="en-GB-ThomasNeural-Male", label=lg_conf["sk3"], visible=False, interactive=True, ) tts_voice03 = gr.Dropdown( - tts_info.tts_list(), + SoniTr.tts_info.tts_list(), value="en-GB-SoniaNeural-Female", label=lg_conf["sk4"], visible=False, interactive=True, ) tts_voice04 = gr.Dropdown( - tts_info.tts_list(), + SoniTr.tts_info.tts_list(), value="en-NZ-MitchellNeural-Male", label=lg_conf["sk4"], visible=False, interactive=True, ) tts_voice05 = gr.Dropdown( - tts_info.tts_list(), + SoniTr.tts_info.tts_list(), value="en-GB-MaisieNeural-Female", label=lg_conf["sk6"], visible=False, @@ -1211,7 +1254,7 @@ def submit(value): ) voice_imitation_method_options = ( ["freevc", "openvoice"] - if xtts_enabled + if SoniTr.tts_info.xtts_enabled else ["openvoice"] ) voice_imitation_method_gui = gr.Dropdown( @@ -1241,7 +1284,7 @@ def submit(value): info=lg_conf["vc_remove_info"], ) - if xtts_enabled: + if SoniTr.tts_info.xtts_enabled: with gr.Column(): with gr.Accordion( lg_conf["xtts_title"], @@ -1710,7 +1753,7 @@ def swap_visibility(data_type): list( filter( lambda x: x != "_XTTS_/AUTOMATIC.wav", - tts_info.tts_list(), + SoniTr.tts_info.tts_list(), ) ), value="en-GB-ThomasNeural-Male", @@ -2002,7 +2045,7 @@ def update_models(): confirm_conf = gr.HTML() button_config.click( - vci.apply_conf, + SoniTr.vci.apply_conf, inputs=[ f0_method_global, model_voice_path00, @@ -2042,7 +2085,7 @@ def update_models(): ) with gr.Column(): tts_test = gr.Dropdown( - sorted(tts_info.list_edge), + sorted(SoniTr.tts_info.list_edge), value="en-GB-ThomasNeural-Male", label="TTS", visible=True, @@ -2083,7 +2126,7 @@ def update_models(): ttsvoice = gr.Audio() button_test.click( - vci.make_test, + SoniTr.vci.make_test, inputs=[ text_test, tts_test, @@ -2181,18 +2224,18 @@ def read_logs(): logs = gr.Textbox(label=">>>") app.load(read_logs, None, logs, every=1) - if xtts_enabled: + if SoniTr.tts_info.xtts_enabled: # Update tts list def update_tts_list(): update_dict = { - f"tts_voice{i:02d}": gr.update(choices=tts_info.tts_list()) + f"tts_voice{i:02d}": gr.update(choices=SoniTr.tts_info.tts_list()) for i in range(6) } update_dict["tts_documents"] = gr.update( choices=list( filter( lambda x: x != "_XTTS_/AUTOMATIC.wav", - tts_info.tts_list(), + SoniTr.tts_info.tts_list(), ) ) ) @@ -2437,41 +2480,9 @@ def create_parser(): ) models, index_paths = upload_model_list() - os.environ["VOICES_MODELS"] = "DISABLE" - vci = ClassVoices() SoniTr = SoniTranslate() - try: - from piper import PiperVoice # noqa - - piper_enabled = True - logger.info("PIPER TTS enabled") - except Exception as error: - logger.warning(str(error)) - piper_enabled = False - logger.info("PIPER TTS disabled") - try: - from TTS.api import TTS # noqa - - xtts_enabled = True - logger.info("Coqui XTTS enabled") - logger.info( - "In this app, by using Coqui TTS (text-to-speech), you " - "acknowledge and agree to the license.\n" - "You confirm that you have read, understood, and agreed " - "to the Terms and Conditions specified at the following link:\n" - "https://coqui.ai/cpml.txt." 
- ) - os.environ["COQUI_TOS_AGREED"] = "1" - except Exception as error: - logger.warning(str(error)) - xtts_enabled = False - logger.info("Coqui XTTS disabled") - - tts_info = TTS_Info(piper_enabled, xtts_enabled) - # list_tts = tts_info.tts_list() - lg_conf = get_language_config(language_data, language=args.language) app = create_gui(args.theme, logs_in_gui=args.logs_in_gui) diff --git a/soni_translate/text_to_speech.py b/soni_translate/text_to_speech.py index 0312150..4e1ddc0 100644 --- a/soni_translate/text_to_speech.py +++ b/soni_translate/text_to_speech.py @@ -156,12 +156,16 @@ def segments_egde_tts(filtered_edge_segments, TRANSLATE_AUDIO_TO, is_gui): logger.info(f"{text} >> {filename}") try: - # nest_asyncio.apply() if not is_gui else None - asyncio.run( - edge_tts.Communicate( - text, "-".join(tts_name.split("-")[:-1]) - ).save(temp_file) - ) + if is_gui: + asyncio.run( + edge_tts.Communicate( + text, "-".join(tts_name.split("-")[:-1]) + ).save(temp_file) + ) + else: + # nest_asyncio.apply() if not is_gui else None + command = f'edge-tts -t "{text}" -v "{tts_name.replace("-Male", "").replace("-Female", "")}" --write-media "{temp_file}"' + run_command(command) verify_saved_file_and_size(temp_file) data, sample_rate = sf.read(temp_file) diff --git a/voice_main.py b/voice_main.py index e83149c..a6df058 100644 --- a/voice_main.py +++ b/voice_main.py @@ -318,15 +318,15 @@ def __init__(self): def apply_conf(self, f0method, model_voice_path00, transpose00, file_index2_00, - model_voice_path01, transpose01, file_index2_01, - model_voice_path02, transpose02, file_index2_02, - model_voice_path03, transpose03, file_index2_03, - model_voice_path04, transpose04, file_index2_04, - model_voice_path05, transpose05, file_index2_05, - model_voice_path99, transpose99, file_index2_99): + model_voice_path01="", transpose01=0, file_index2_01="", + model_voice_path02="", transpose02=0, file_index2_02="", + model_voice_path03="", transpose03=0, file_index2_03="", + model_voice_path04="", transpose04=0, file_index2_04="", + model_voice_path05="", transpose05=0, file_index2_05="", + model_voice_path99="", transpose99=0, file_index2_99=""): #self.filename = filename - self.f0method = f0method # pm + self.f0method = f0method self.model_voice_path00 = model_voice_path00 self.transpose00 = transpose00 From 021efd8da7f4d6c50461403baf67999cda879564 Mon Sep 17 00:00:00 2001 From: Roger Condori <114810545+R3gm@users.noreply.github.com> Date: Sat, 6 Apr 2024 19:09:33 +0000 Subject: [PATCH 18/36] feat(output type): Subtitles can be obtained for each speaker --- .gitignore | 10 ++++ app_rvc.py | 45 ++++++++------- soni_translate/postprocessor.py | 60 ++++++++++++++++++++ soni_translate/text_multiformat_processor.py | 43 ++++++++++++++ 4 files changed, 139 insertions(+), 19 deletions(-) diff --git a/.gitignore b/.gitignore index 375d3fe..2dce161 100644 --- a/.gitignore +++ b/.gitignore @@ -161,6 +161,13 @@ cython_debug/ # Ignore sub_tra.* sub_ori.* +SPEAKER_00.* +SPEAKER_01.* +SPEAKER_02.* +SPEAKER_03.* +SPEAKER_04.* +SPEAKER_05.* +task_subtitle.* *.mp3 *.mp4 *.ogg @@ -170,6 +177,9 @@ list.txt text_preprocessor.txt text_translation.txt *.srt +*.vtt +*.tsv +*.aud mdx_models/*.onnx _XTTS_/ downloads/ diff --git a/app_rvc.py b/app_rvc.py index c505e8c..5fdb9c7 100644 --- a/app_rvc.py +++ b/app_rvc.py @@ -25,7 +25,12 @@ audio_video_preprocessor, audio_preprocessor, ) -from soni_translate.postprocessor import media_out +from soni_translate.postprocessor import ( + OUTPUT_TYPE_OPTIONS, + DOCS_OUTPUT_TYPE_OPTIONS, + media_out, 
+ get_subtitle_speaker, +) from soni_translate.language_configuration import ( LANGUAGES, UNIDIRECTIONAL_L_LIST, @@ -346,10 +351,13 @@ def batch_multilingual_media_conversion(self, *kwargs): output_file = self.multilingual_media_conversion( media, "", "", *kwargs ) - result.append(output_file) + + if isinstance(output_file, str): + output_file = [output_file] + result.extend(output_file) if is_gui_arg and len(media_batch) > 1: - gr.Info(f"Done: {os.path.basename(output_file)}") + gr.Info(f"Done: {os.path.basename(output_file[0])}") return result @@ -715,6 +723,17 @@ def multilingual_media_conversion( logger.info(f"Done: {output}") return output + if output_type == "subtitle [by speaker]": + output = get_subtitle_speaker( + media_file, + result=self.result_diarize, + language=TRANSLATE_AUDIO_TO, + extension=output_format_subtitle, + base_name=video_output_name, + ) + logger.info(f"Done: {str(output)}") + return output + if not self.task_in_cache("tts", [ TRANSLATE_AUDIO_TO, tts_voice00, @@ -1490,17 +1509,9 @@ def get_subs_path(type_subs): ) gr.HTML("
") - main_output_type_opt = [ - "video (mp4)", - "video (mkv)", - "audio (mp3)", - "audio (ogg)", - "audio (wav)", - "subtitle", - ] main_output_type = gr.Dropdown( - main_output_type_opt, - value=main_output_type_opt[0], + OUTPUT_TYPE_OPTIONS, + value=OUTPUT_TYPE_OPTIONS[0], label=lg_conf["out_type_label"], ) VIDEO_OUTPUT_NAME = gr.Textbox( @@ -1791,13 +1802,9 @@ def swap_visibility(data_type): gr.HTML("
") - docs_output_type_opt = [ - "audio", - "text", - ] # Add DOCX and etc. docs_output_type = gr.Dropdown( - docs_output_type_opt, - value=docs_output_type_opt[0], + DOCS_OUTPUT_TYPE_OPTIONS, + value=DOCS_OUTPUT_TYPE_OPTIONS[0], label="Output type", ) docs_OUTPUT_NAME = gr.Textbox( diff --git a/soni_translate/postprocessor.py b/soni_translate/postprocessor.py index 2f79964..44b70e5 100644 --- a/soni_translate/postprocessor.py +++ b/soni_translate/postprocessor.py @@ -1,10 +1,27 @@ from .utils import remove_files, run_command +from .text_multiformat_processor import get_subtitle from .logging_setup import logger import unicodedata import shutil +import copy import os import re +OUTPUT_TYPE_OPTIONS = [ + "video (mp4)", + "video (mkv)", + "audio (mp3)", + "audio (ogg)", + "audio (wav)", + "subtitle", + "subtitle [by speaker]", +] + +DOCS_OUTPUT_TYPE_OPTIONS = [ + "audio", + "text", +] # Add DOCX and etc. + def get_no_ext_filename(file_path): file_name_with_extension = os.path.basename(rf"{file_path}") @@ -104,3 +121,46 @@ def media_out( f_name = f"{sanitize_file_name(media_out_name)}.{extension}" return get_output_file(file_obj, f_name, soft_subtitles) + + +def get_subtitle_speaker(media_file, result, language, extension, base_name): + + segments_base = copy.deepcopy(result) + + # Sub segments by speaker + segments_by_speaker = {} + for segment in segments_base["segments"]: + if segment["speaker"] not in segments_by_speaker.keys(): + segments_by_speaker[segment["speaker"]] = [segment] + else: + segments_by_speaker[segment["speaker"]].append(segment) + + if not base_name: + if os.path.exists(media_file): + base_name = get_no_ext_filename(media_file) + else: + base_name, _ = get_video_info(media_file) + + files_subs = [] + for name_sk, segments in segments_by_speaker.items(): + + subtitle_speaker = get_subtitle( + language, + {"segments": segments}, + extension, + filename=name_sk, + ) + + media_out_name = f"{base_name}_{language}_{name_sk}" + + output = media_out( + media_file, # no need + language, + media_out_name, + extension, + file_obj=subtitle_speaker, + ) + + files_subs.append(output) + + return files_subs \ No newline at end of file diff --git a/soni_translate/text_multiformat_processor.py b/soni_translate/text_multiformat_processor.py index 041b0e3..fa43796 100644 --- a/soni_translate/text_multiformat_processor.py +++ b/soni_translate/text_multiformat_processor.py @@ -230,6 +230,49 @@ def segments_to_plain_text(result_diarize): # subtitles +def get_subtitle( + language, + segments_data, + extension, + filename=None, + highlight_words=False, +): + if not filename: + filename = "task_subtitle" + + sub_file = filename + "." 
+ extension + support_name = filename + ".mp3" + remove_files(sub_file) + + writer = get_writer(extension, output_dir=".") + word_options = { + "highlight_words": highlight_words, + "max_line_count": None, + "max_line_width": None, + } + + # Get data subs + subtitle_data = copy.deepcopy(segments_data) + subtitle_data["language"] = ( + "ja" if language in ["ja", "zh", "zh-TW"] else language + ) + + # Clean + if not highlight_words: + subtitle_data.pop("word_segments", None) + for segment in subtitle_data["segments"]: + for key in ["speaker", "chars", "words"]: + segment.pop(key, None) + + writer( + subtitle_data, + support_name, + word_options, + ) + + return sub_file + + def process_subtitles( deep_copied_result, align_language, From 4765174893b923386005cc6e185d422ab430e5a8 Mon Sep 17 00:00:00 2001 From: Roger Condori <114810545+R3gm@users.noreply.github.com> Date: Sat, 6 Apr 2024 22:47:11 +0000 Subject: [PATCH 19/36] feat: Text segmentation: sentence, word or character --- app_rvc.py | 53 ++++++++++++++++++-- soni_translate/text_multiformat_processor.py | 37 ++++++++++++++ 2 files changed, 87 insertions(+), 3 deletions(-) diff --git a/app_rvc.py b/app_rvc.py index 5fdb9c7..1c9ea61 100644 --- a/app_rvc.py +++ b/app_rvc.py @@ -68,6 +68,7 @@ plain_text_to_segments, segments_to_plain_text, process_subtitles, + linguistic_level_segments, break_aling_segments, ) from soni_translate.languages_gui import language_data, news @@ -403,6 +404,7 @@ def multilingual_media_conversion( voice_imitation_remove_previous=True, voice_imitation_method="freevc", dereverb_automatic_xtts=True, + text_segmentation_scale="sentence", divide_text_segments_by="", soft_subtitles_to_video=False, burn_subtitles_to_video=False, @@ -438,6 +440,19 @@ def multilingual_media_conversion( TRANSLATE_AUDIO_TO = LANGUAGES[TRANSLATE_AUDIO_TO] SOURCE_LANGUAGE = LANGUAGES[SOURCE_LANGUAGE] + if ( + text_segmentation_scale in ["word", "character"] + and "subtitle" not in output_type + ): + wrn_lang = ( + "Text segmentation by words or characters is typically" + " used for generating subtitles. If subtitles are not the" + " intended output, consider selecting 'sentence' " + "segmentation method to ensure optimal results." 
+ + ) + warn_disp(wrn_lang, is_gui) + if tts_voice00[:2].lower() != TRANSLATE_AUDIO_TO[:2].lower(): wrn_lang = ( "Make sure to select a 'TTS Speaker' suitable for" @@ -548,7 +563,13 @@ def multilingual_media_conversion( SOURCE_LANGUAGE, WHISPER_MODEL_SIZE, compute_type, - batch_size + batch_size, + ( + "l_unit" + if text_segmentation_scale in ["word", "character"] + and subtitle_file + else "sentence" + ) ], {}): if subtitle_file: prog_disp( @@ -584,7 +605,10 @@ def multilingual_media_conversion( ) self.align_language = self.result["language"] - if not subtitle_file: + if ( + not subtitle_file + or text_segmentation_scale in ["word", "character"] + ): prog_disp("Aligning...", 0.45, is_gui, progress=progress) try: self.result = align_speech(audio, self.result) @@ -600,6 +624,7 @@ def multilingual_media_conversion( if not self.task_in_cache("break_align", [ divide_text_segments_by, + text_segmentation_scale, self.align_language ], { "result": self.result, @@ -607,7 +632,12 @@ def multilingual_media_conversion( }): if self.align_language in ["ja", "zh", "zh-TW"]: divide_text_segments_by += "|!|?|...|。" - if divide_text_segments_by: + if text_segmentation_scale in ["word", "character"]: + self.result = linguistic_level_segments( + self.result, + text_segmentation_scale, + ) + elif divide_text_segments_by: try: self.result = break_aling_segments( self.result, @@ -1489,11 +1519,26 @@ def get_subs_path(type_subs): file_types=[".srt", ".ass"], height=130, ) + + gr.HTML("
") + text_segmentation_options = [ + "sentence", + "word", + "character" + ] + text_segmentation_scale_gui = gr.Dropdown( + text_segmentation_options, + value=text_segmentation_options[0], + label="Text Segmentation Scale", + info="Divide text into segments by sentences, words, or characters. Word and character segmentation offer finer granularity, useful for subtitles; disabling translation preserves original structure.", + ) divide_text_segments_by_gui = gr.Textbox( label=lg_conf["divide_text_label"], value="", info=lg_conf["divide_text_info"], ) + + gr.HTML("
") pyannote_models_list = list( diarization_models.keys() ) @@ -2317,6 +2362,7 @@ def update_tts_list(): voice_imitation_remove_previous_gui, voice_imitation_method_gui, wav_speaker_dereverb, + text_segmentation_scale_gui, divide_text_segments_by_gui, soft_subtitles_to_video_gui, burn_subtitles_to_video_gui, @@ -2372,6 +2418,7 @@ def update_tts_list(): voice_imitation_remove_previous_gui, voice_imitation_method_gui, wav_speaker_dereverb, + text_segmentation_scale_gui, divide_text_segments_by_gui, soft_subtitles_to_video_gui, burn_subtitles_to_video_gui, diff --git a/soni_translate/text_multiformat_processor.py b/soni_translate/text_multiformat_processor.py index fa43796..560b59e 100644 --- a/soni_translate/text_multiformat_processor.py +++ b/soni_translate/text_multiformat_processor.py @@ -340,6 +340,43 @@ def process_subtitles( return name_tra + output_format_subtitle +def linguistic_level_segments( + result_base, + linguistic_unit="word", # word or char +): + linguistic_unit = linguistic_unit[:4] + linguistic_unit_key = linguistic_unit + "s" + result = copy.deepcopy(result_base) + + if linguistic_unit_key not in result["segments"][0].keys(): + raise ValueError("No alignment detected, can't process") + + segments_by_unit = [] + for segment in result["segments"]: + segment_units = segment[linguistic_unit_key] + # segment_speaker = segment.get("speaker", "SPEAKER_00") + + for unit in segment_units: + + text = unit[linguistic_unit] + + if "start" in unit.keys(): + segments_by_unit.append( + { + "start": unit["start"], + "end": unit["end"], + "text": text, + # "speaker": segment_speaker, + } + ) + elif not segments_by_unit: + pass + else: + segments_by_unit[-1]["text"] += text + + return {"segments": segments_by_unit} + + def break_aling_segments( result: dict, break_characters: str = "", # ":|,|.|" From bd30289f96f4af1d2a206be60064159a318dae2d Mon Sep 17 00:00:00 2001 From: Roger Condori <114810545+R3gm@users.noreply.github.com> Date: Wed, 10 Apr 2024 22:42:16 +0000 Subject: [PATCH 20/36] feat(transcription): fine-tuned Whisper models and adjustment of more parameters --- .gitignore | 3 +- app_rvc.py | 70 ++++++++++----- soni_translate/audio_segments.py | 4 +- soni_translate/speech_segmentation.py | 118 ++++++++++++++++++++++++-- 4 files changed, 164 insertions(+), 31 deletions(-) diff --git a/.gitignore b/.gitignore index 2dce161..e63bd72 100644 --- a/.gitignore +++ b/.gitignore @@ -189,4 +189,5 @@ clean_song_output/ audio2/ audio/ outputs/ -PIPER_MODELS/ \ No newline at end of file +PIPER_MODELS/ +WHISPER_MODELS/ \ No newline at end of file diff --git a/app_rvc.py b/app_rvc.py index 1c9ea61..022ec49 100644 --- a/app_rvc.py +++ b/app_rvc.py @@ -56,6 +56,10 @@ mdxnet_models_dir, ) from soni_translate.speech_segmentation import ( + ASR_MODEL_OPTIONS, + COMPUTE_TYPE_GPU, + COMPUTE_TYPE_CPU, + find_whisper_models, transcribe_speech, align_speech, diarize_speech, @@ -393,6 +397,8 @@ def multilingual_media_conversion( get_video_from_text_json=False, text_json="{}", avoid_overlap=False, + literalize_numbers=True, + segment_duration_limit=15, diarization_model="pyannote_2.1", translate_process="google_translator_batch", subtitle_file=None, @@ -523,7 +529,9 @@ def multilingual_media_conversion( ) # Check GPU - compute_type = "float32" if self.device == "cpu" else compute_type + if self.device == "cpu" and compute_type not in COMPUTE_TYPE_CPU: + logger.info("Compute type changed to float32") + compute_type = "float32" base_video_file = "Video.mp4" base_audio_wav = "audio.wav" @@ -564,6 +572,8 @@ def 
multilingual_media_conversion( WHISPER_MODEL_SIZE, compute_type, batch_size, + literalize_numbers, + segment_duration_limit, ( "l_unit" if text_segmentation_scale in ["word", "character"] @@ -598,6 +608,8 @@ def multilingual_media_conversion( compute_type, batch_size, SOURCE_LANGUAGE, + literalize_numbers, + segment_duration_limit, ) logger.debug( "Transcript complete, " @@ -1482,41 +1494,49 @@ def get_subs_path(type_subs): gr.HTML("
") gr.Markdown(lg_conf["whisper_title"]) - whisper_model_options = [ - "tiny", - "base", - "small", - "medium", - "large-v1", - "large-v2", - "large-v3", - ] + literalize_numbers_gui = gr.Checkbox( + True, + label="Literalize Numbers", + info="Literalize Numbers: Replace numerical representations with their written equivalents in the transcript.", + ) + segment_duration_limit_gui = gr.Slider( + label="Segment Duration Limit", + info="Specify the maximum duration (in seconds) for each segment. The audio will be processed using VAD, limiting the duration for each segment chunk.", + value=15, + step=1, + minimum=1, + maximum=30, + ) whisper_model_default = ( "large-v3" if torch.cuda.is_available() else "medium" ) + WHISPER_MODEL_SIZE = gr.Dropdown( - whisper_model_options, + ASR_MODEL_OPTIONS + find_whisper_models(), value=whisper_model_default, - label="Whisper model", + label="Whisper ASR model", + info="It converts spoken language to text using the Whisper model by default. Use a custom model, for example, by inputting the repository name 'BELLE-2/Belle-whisper-large-v3-zh' in the dropdown to utilize a Chinese language finetuned model. Find finetuned models on Hugging Face.", + allow_custom_value=True, ) - batch_size = gr.Slider( - 1, 32, value=16, label="Batch size", step=1 - ) - list_compute_type = ( - ["int8", "float16", "float32"] + com_t_opt, com_t_default = ( + [COMPUTE_TYPE_GPU, "float16"] if torch.cuda.is_available() - else ["int8", "float32"] + else [COMPUTE_TYPE_CPU, "float32"] ) compute_type = gr.Dropdown( - list_compute_type, - value=list_compute_type[1], + com_t_opt, + value=com_t_default, label="Compute type", + info="Choosing smaller types like int8 or float16 can improve performance by reducing memory usage and increasing computational throughput, but may sacrifice precision compared to larger data types like float32.", + ) + batch_size = gr.Slider( + 1, 32, value=16, label="Batch size", step=1 ) input_srt = gr.File( label=lg_conf["srt_file_label"], - file_types=[".srt", ".ass"], + file_types=[".srt", ".ass" ".vtt"], height=130, ) @@ -1671,7 +1691,7 @@ def visible_component_subs(input_bool): False, whisper_model_default, 16, - list_compute_type[1], + com_t_default, "Spanish (es)", "English (en)", 1, @@ -1693,7 +1713,7 @@ def visible_component_subs(input_bool): False, whisper_model_default, 16, - list_compute_type[1], + com_t_default, "Japanese (ja)", "English (en)", 1, @@ -2351,6 +2371,8 @@ def update_tts_list(): dummy_false_check, # dummy false subs_edit_space, avoid_overlap_gui, + literalize_numbers_gui, + segment_duration_limit_gui, diarization_process_dropdown, translate_process_dropdown, input_srt, @@ -2407,6 +2429,8 @@ def update_tts_list(): edit_sub_check, subs_edit_space, avoid_overlap_gui, + literalize_numbers_gui, + segment_duration_limit_gui, diarization_process_dropdown, translate_process_dropdown, input_srt, diff --git a/soni_translate/audio_segments.py b/soni_translate/audio_segments.py index 941c244..3a51e52 100644 --- a/soni_translate/audio_segments.py +++ b/soni_translate/audio_segments.py @@ -111,9 +111,9 @@ def create_translated_audio( if (last_end_time - 0.500) > start: overlap_time = last_end_time - start if previous_speaker and previous_speaker != speaker: - start = (last_end_time - 0.600) + start = (last_end_time - 0.500) else: - start = (last_end_time - 0.250) + start = (last_end_time - 0.200) if overlap_time > 2.5: start = start - 0.3 logger.info( diff --git a/soni_translate/speech_segmentation.py b/soni_translate/speech_segmentation.py index 
f52cd16..7d3837e 100644 --- a/soni_translate/speech_segmentation.py +++ b/soni_translate/speech_segmentation.py @@ -5,22 +5,84 @@ import whisperx import torch import gc +import os from IPython.utils import capture # noqa from .language_configuration import EXTRA_ALIGN, INVERTED_LANGUAGES from .logging_setup import logger +from .postprocessor import sanitize_file_name + +ASR_MODEL_OPTIONS = [ + "tiny", + "base", + "small", + "medium", + "large", + "large-v1", + "large-v2", + "large-v3", + "distil-large-v2", + "Systran/faster-distil-whisper-large-v3", + "tiny.en", + "base.en", + "small.en", + "medium.en", + "distil-small.en", + "distil-medium.en", +] + +COMPUTE_TYPE_GPU = [ + "int8", + "int8_float32", + "int8_float16", + "int8_bfloat16", + "int16", + "float16", + "bfloat16", + "float32" +] + +COMPUTE_TYPE_CPU = [ + "int8", + "int8_float32", + "int16", + "float32", +] + +WHISPER_MODELS_PATH = './WHISPER_MODELS' device = "cuda" if torch.cuda.is_available() else "cpu" +def find_whisper_models(): + path = WHISPER_MODELS_PATH + folders = [] + + if os.path.exists(path): + for folder in os.listdir(path): + folder_path = os.path.join(path, folder) + if ( + os.path.isdir(folder_path) + and 'model.bin' in os.listdir(folder_path) + ): + folders.append(folder) + return folders + + def transcribe_speech( - audio_wav, WHISPER_MODEL_SIZE, compute_type, batch_size, SOURCE_LANGUAGE + audio_wav, + asr_model, + compute_type, + batch_size, + SOURCE_LANGUAGE, + literalize_numbers=True, + segment_duration_limit=15, ): """ Transcribe speech using a whisper model. Parameters: - audio_wav (str): Path to the audio file in WAV format. - - WHISPER_MODEL_SIZE (str): The whisper model to be loaded. + - asr_model (str): The whisper model to be loaded. - compute_type (str): Type of compute to be used (e.g., 'int8', 'float16'). - batch_size (int): Batch size for transcription. - SOURCE_LANGUAGE (str): Source language for transcription. 
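The hunk below converts a fine-tuned Whisper checkpoint from Hugging Face into CTranslate2 format the first time it is requested, so whisperx can load it like the built-in sizes. A minimal standalone sketch of that same conversion, where the repository name is taken from the dropdown hint above and the output directory is only illustrative:

```
from ctranslate2.converters import TransformersConverter

converter = TransformersConverter(
    "BELLE-2/Belle-whisper-large-v3-zh",  # fine-tuned Whisper repo on Hugging Face
    low_cpu_mem_usage=True,
    copy_files=["tokenizer_config.json", "preprocessor_config.json"],
)
# Some repos ship tokenizer.json instead of tokenizer_config.json; the hunk
# below retries the copy with that file when the first attempt fails.
converter.convert(
    "WHISPER_MODELS/BELLE-2_Belle-whisper-large-v3-zh",  # illustrative output dir
    quantization="float32",
    force=False,
)
```

Once the converted folder (it must contain model.bin) sits under ./WHISPER_MODELS, find_whisper_models() lists it in the ASR model dropdown.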
@@ -37,11 +99,53 @@ def transcribe_speech( SOURCE_LANGUAGE if SOURCE_LANGUAGE != "zh-TW" else "zh" ) asr_options = { - "initial_prompt": prompt + "initial_prompt": prompt, + "suppress_numerals": literalize_numbers } + if asr_model not in ASR_MODEL_OPTIONS: + + base_dir = WHISPER_MODELS_PATH + if not os.path.exists(base_dir): + os.makedirs(base_dir) + model_dir = os.path.join(base_dir, sanitize_file_name(asr_model)) + + if not os.path.exists(model_dir): + from ctranslate2.converters import TransformersConverter + + quantization = "float32" + # Download new model + try: + converter = TransformersConverter( + asr_model, + low_cpu_mem_usage=True, + copy_files=[ + "tokenizer_config.json", "preprocessor_config.json" + ] + ) + converter.convert( + model_dir, + quantization=quantization, + force=False + ) + except Exception as error: + if "File tokenizer_config.json does not exist" in str(error): + converter._copy_files = [ + "tokenizer.json", "preprocessor_config.json" + ] + converter.convert( + model_dir, + quantization=quantization, + force=False + ) + else: + raise error + + asr_model = model_dir + logger.info(f"ASR Model: {str(model_dir)}") + model = whisperx.load_model( - WHISPER_MODEL_SIZE, + asr_model, device, compute_type=compute_type, language=SOURCE_LANGUAGE, @@ -49,7 +153,11 @@ def transcribe_speech( ) audio = whisperx.load_audio(audio_wav) - result = model.transcribe(audio, batch_size=batch_size) + result = model.transcribe( + audio, + batch_size=batch_size, + chunk_size=segment_duration_limit, + ) if result["language"] == "zh" and not prompt: result["language"] = "zh-TW" From f8eae9bde2e8b7b682fb473af9e096d6bca39470 Mon Sep 17 00:00:00 2001 From: Roger Condori <114810545+R3gm@users.noreply.github.com> Date: Sun, 14 Apr 2024 02:53:09 +0000 Subject: [PATCH 21/36] feat: ass subtitles, custom voices with extra settings #7 --- .gitignore | 3 + app_rvc.py | 508 ++++---- configs/32k.json | 46 - configs/32k_v2.json | 46 - configs/40k.json | 46 - configs/48k.json | 46 - configs/48k_v2.json | 46 - lib/infer_pack/models_dml.py | 1124 ----------------- lib/infer_pack/models_onnx.py | 819 ------------ .../modules/F0Predictor/DioF0Predictor.py | 90 -- .../modules/F0Predictor/F0Predictor.py | 16 - .../modules/F0Predictor/HarvestF0Predictor.py | 86 -- .../modules/F0Predictor/PMF0Predictor.py | 97 -- .../modules/F0Predictor/__init__.py | 0 lib/infer_pack/onnx_inference.py | 145 --- soni_translate/logging_setup.py | 1 + soni_translate/text_multiformat_processor.py | 24 +- soni_translate/text_to_speech.py | 16 +- soni_translate/utils.py | 8 +- voice_main.py | 1093 +++++++++------- 20 files changed, 872 insertions(+), 3388 deletions(-) delete mode 100644 configs/32k.json delete mode 100644 configs/32k_v2.json delete mode 100644 configs/40k.json delete mode 100644 configs/48k.json delete mode 100644 configs/48k_v2.json delete mode 100644 lib/infer_pack/models_dml.py delete mode 100644 lib/infer_pack/models_onnx.py delete mode 100644 lib/infer_pack/modules/F0Predictor/DioF0Predictor.py delete mode 100644 lib/infer_pack/modules/F0Predictor/F0Predictor.py delete mode 100644 lib/infer_pack/modules/F0Predictor/HarvestF0Predictor.py delete mode 100644 lib/infer_pack/modules/F0Predictor/PMF0Predictor.py delete mode 100644 lib/infer_pack/modules/F0Predictor/__init__.py delete mode 100644 lib/infer_pack/onnx_inference.py diff --git a/.gitignore b/.gitignore index e63bd72..a76cd30 100644 --- a/.gitignore +++ b/.gitignore @@ -180,6 +180,8 @@ text_translation.txt *.vtt *.tsv *.aud +*.ass +*.pt mdx_models/*.onnx 
_XTTS_/ downloads/ @@ -189,5 +191,6 @@ clean_song_output/ audio2/ audio/ outputs/ +processed/ PIPER_MODELS/ WHISPER_MODELS/ \ No newline at end of file diff --git a/app_rvc.py b/app_rvc.py index 022ec49..7b0db9f 100644 --- a/app_rvc.py +++ b/app_rvc.py @@ -133,6 +133,11 @@ def custom_model_voice_enable(enable_custom_voice): ) +def custom_model_voice_workers(workers): + # os.environ["VOICES_MODELS_WORKERS"] = str(workers) + pass + + def prog_disp(msg, percent, is_gui, progress=None): logger.info(msg) if is_gui: @@ -271,6 +276,7 @@ def __init__(self, dev=False): self.burn_subs_id = None os.environ["VOICES_MODELS"] = "DISABLE" + os.environ["VOICES_MODELS_WORKERS"] = "1" self.vci = ClassVoices() self.tts_voices = self.get_tts_voice_list() @@ -309,8 +315,9 @@ def get_tts_voice_list(self): return self.tts_info.tts_list() - def enable_custom_model_voice(self): + def enable_custom_model_voice(self, workers=1): os.environ["VOICES_MODELS"] = "ENABLE" + os.environ["VOICES_MODELS_WORKERS"] = str(workers) def disable_custom_model_voice(self): os.environ["VOICES_MODELS"] = "DISABLE" @@ -502,6 +509,12 @@ def multilingual_media_conversion( else subtitle_file.name ) + if subtitle_file and SOURCE_LANGUAGE == "Automatic detection": + raise Exception( + "To use an SRT file, you need to specify its " + "original language (Source language)" + ) + if not media_file and subtitle_file: diarization_model = "disable" media_file = "audio_support.wav" @@ -585,11 +598,6 @@ def multilingual_media_conversion( prog_disp( "From SRT file...", 0.30, is_gui, progress=progress ) - if SOURCE_LANGUAGE == "Automatic detection": - raise Exception( - "To use an SRT file, you need to specify its " - "original language (Source language)" - ) audio = whisperx.load_audio(base_audio_wav) self.result = srt_file_to_segments(subtitle_file) self.result["language"] = SOURCE_LANGUAGE @@ -738,13 +746,14 @@ def multilingual_media_conversion( ], { "result_diarize": self.result_diarize }): - self.sub_file = process_subtitles( - self.result_source_lang, - self.align_language, - self.result_diarize, - output_format_subtitle, - TRANSLATE_AUDIO_TO, - ) + if output_format_subtitle != "ass": + self.sub_file = process_subtitles( + self.result_source_lang, + self.align_language, + self.result_diarize, + output_format_subtitle, + TRANSLATE_AUDIO_TO, + ) if output_format_subtitle != "srt": _ = process_subtitles( self.result_source_lang, @@ -753,6 +762,12 @@ def multilingual_media_conversion( "srt", TRANSLATE_AUDIO_TO, ) + if output_format_subtitle == "ass": + convert_ori = "ffmpeg -i sub_ori.srt sub_ori.ass -y" + convert_tra = "ffmpeg -i sub_tra.srt sub_tra.ass -y" + self.sub_file = "sub_tra.ass" + run_command(convert_ori) + run_command(convert_tra) if output_type == "subtitle": output = media_out( @@ -802,40 +817,6 @@ def multilingual_media_conversion( dereverb_automatic_xtts, ) - if not hasattr(self.vci, 'model_voice_path00'): - cc_transpose_values = cc_index_values = cc_model_paths = None - else: - cc_model_paths = [ - self.vci.model_voice_path00, - self.vci.model_voice_path01, - self.vci.model_voice_path02, - self.vci.model_voice_path03, - self.vci.model_voice_path04, - self.vci.model_voice_path05, - self.vci.model_voice_path99 - ] - - cc_index_values = [ - self.vci.file_index200, - self.vci.file_index201, - self.vci.file_index202, - self.vci.file_index203, - self.vci.file_index204, - self.vci.file_index205, - self.vci.file_index299 - ] - - cc_transpose_values = [ - self.vci.f0method, - self.vci.transpose00, - self.vci.transpose01, - 
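The new "ass" subtitle option reuses the SRT files that are always written and converts them with ffmpeg instead of adding a dedicated writer. A standalone equivalent of the two run_command calls above (same file names as in the patch; subprocess is used here purely for illustration):

import subprocess

# ffmpeg picks the output format from the .ass extension; -y overwrites
# any result left over from a previous run.
for srt_name, ass_name in [("sub_ori.srt", "sub_ori.ass"),
                           ("sub_tra.srt", "sub_tra.ass")]:
    subprocess.run(["ffmpeg", "-y", "-i", srt_name, ass_name], check=True)

sub_file = "sub_tra.ass"  # the translated track is the one handed back to the app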
self.vci.transpose02, - self.vci.transpose03, - self.vci.transpose04, - self.vci.transpose05, - self.vci.transpose99 - ] - if not self.task_in_cache("acc_and_vc", [ max_accelerate_audio, acceleration_rate_regulation, @@ -845,9 +826,8 @@ def multilingual_media_conversion( voice_imitation_vocals_dereverb, voice_imitation_method, os.getenv("VOICES_MODELS"), - cc_model_paths, - cc_index_values, - cc_transpose_values, + os.getenv("VOICES_MODELS_WORKERS"), + copy.deepcopy(self.vci.model_config), avoid_overlap ], { "valid_speakers": self.valid_speakers @@ -885,11 +865,16 @@ def multilingual_media_conversion( is_gui, progress=progress, ) - if cc_model_paths is None: - logger.error("Apply the configuration!") try: - self.vci(speakers_list, audio_files) + self.vci( + audio_files, + speakers_list, + overwrite=True, + parallel_workers=int( + os.getenv("VOICES_MODELS_WORKERS") + ), + ) except Exception as error: logger.error(str(error)) @@ -1131,7 +1116,14 @@ def multilingual_docs_conversion( is_gui, progress=progress, ) - self.vci(speakers_list, audio_files) + self.vci( + audio_files, + speakers_list, + overwrite=True, + parallel_workers=int( + os.getenv("VOICES_MODELS_WORKERS") + ), + ) prog_disp( "Creating final audio file...", 0.90, is_gui, progress=progress @@ -1461,6 +1453,7 @@ def submit(value): sub_type_options = [ "srt", "vtt", + "ass", "txt", "tsv", "json", @@ -1536,7 +1529,7 @@ def get_subs_path(type_subs): ) input_srt = gr.File( label=lg_conf["srt_file_label"], - file_types=[".srt", ".ass" ".vtt"], + file_types=[".srt", ".ass", ".vtt"], height=130, ) @@ -1900,6 +1893,7 @@ def swap_visibility(data_type): ) with gr.Tab("Custom voice R.V.C. (Optional)"): + with gr.Column(): with gr.Accordion("Get the R.V.C. Models", open=True): url_links = gr.Textbox( @@ -1913,22 +1907,22 @@ def swap_visibility(data_type): download_button = gr.Button("DOWNLOAD MODELS") def update_models(): - models, index_paths = upload_model_list() - for i in range(8): - dict_models = { - f"model_voice_path{i:02d}": gr.update( - choices=models - ) - for i in range(8) - } - dict_index = { - f"file_index2_{i:02d}": gr.update( - choices=index_paths - ) - for i in range(8) - } - dict_changes = {**dict_models, **dict_index} - return [value for value in dict_changes.values()] + models_path, index_path = upload_model_list() + + dict_models = { + f"fmodel{i:02d}": gr.update( + choices=models_path + ) + for i in range(7) + } + dict_index = { + f"findex{i:02d}": gr.update( + choices=index_path, value=None + ) + for i in range(7) + } + dict_changes = {**dict_models, **dict_index} + return [value for value in dict_changes.values()] with gr.Column(): with gr.Accordion(lg_conf["replace_title"], open=False): @@ -1943,207 +1937,168 @@ def update_models(): [enable_custom_voice], [], ) - + workers_custom_voice = gr.Number( + step=1, + value=1, + minimum=1, + maximum=50, + label="workers", + visible=False, + ) + workers_custom_voice.change( + custom_model_voice_workers, + [workers_custom_voice], + [], + ) gr.Markdown(lg_conf["sec2_title"]) gr.Markdown(lg_conf["sec2_subtitle"]) - gr.Markdown(lg_conf["cv_tts1"]) - with gr.Row(): - model_voice_path00 = gr.Dropdown( - models, - label="Model-1", - visible=True, - interactive=True, - ) - file_index2_00 = gr.Dropdown( - index_paths, - label="Index-1", - visible=True, - interactive=True, - ) - name_transpose00 = gr.Number( - label="Transpose-1", - value=0, - visible=True, - interactive=True, - ) - gr.HTML("
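With the refactor above, the R.V.C. pass now receives the synthesized audio files first and spreads the work over the number of workers stored in VOICES_MODELS_WORKERS (set by enable_custom_model_voice). A condensed sketch of how the pieces fit together; audio_files and speakers_list stand in for the values the pipeline builds, and ClassVoices internals are not shown in this diff:

import os

# Enable custom voices and allow 4 parallel conversion workers.
SoniTr.enable_custom_model_voice(workers=4)   # exports VOICES_MODELS_WORKERS=4

if os.getenv("VOICES_MODELS") == "ENABLE":
    SoniTr.vci(
        audio_files,                          # paths of the TTS segments
        speakers_list,                        # matching speaker tags
        overwrite=True,
        parallel_workers=int(os.getenv("VOICES_MODELS_WORKERS")),
    )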
") - gr.Markdown(lg_conf["cv_tts2"]) - with gr.Row(): - model_voice_path01 = gr.Dropdown( - models, - label="Model-2", - visible=True, - interactive=True, - ) - file_index2_01 = gr.Dropdown( - index_paths, - label="Index-2", - visible=True, - interactive=True, - ) - name_transpose01 = gr.Number( - label="Transpose-2", - value=0, - visible=True, - interactive=True, - ) - gr.HTML("
") - gr.Markdown(lg_conf["cv_tts3"]) - with gr.Row(): - model_voice_path02 = gr.Dropdown( - models, - label="Model-3", - visible=True, - interactive=True, - ) - file_index2_02 = gr.Dropdown( - index_paths, - label="Index-3", - visible=True, - interactive=True, - ) - name_transpose02 = gr.Number( - label="Transpose-3", - value=0, - visible=True, - interactive=True, - ) - gr.HTML("
") - gr.Markdown(lg_conf["cv_tts4"]) - with gr.Row(): - model_voice_path03 = gr.Dropdown( - models, - label="Model-4", - visible=True, - interactive=True, - ) - file_index2_03 = gr.Dropdown( - index_paths, - label="Index-4", - visible=True, - interactive=True, - ) - name_transpose03 = gr.Number( - label="Transpose-4", - value=0, - visible=True, - interactive=True, - ) - gr.HTML("
") - gr.Markdown(lg_conf["cv_tts5"]) - with gr.Row(): - model_voice_path04 = gr.Dropdown( - models, - label="Model-5", + + PITCH_ALGO_OPT = [ + "pm", + "harvest", + "crepe", + "rmvpe", + ] + + def model_conf(): + return gr.Dropdown( + models_path, + # value="", + label="Model", visible=True, interactive=True, ) - file_index2_04 = gr.Dropdown( - index_paths, - label="Index-5", + + def pitch_algo_conf(): + return gr.Dropdown( + PITCH_ALGO_OPT, + value=PITCH_ALGO_OPT[3], + label="Pitch algorithm", visible=True, interactive=True, ) - name_transpose04 = gr.Number( - label="Transpose-5", + + def pitch_lvl_conf(): + return gr.Slider( + label="Pitch level", + minimum=-24, + maximum=24, + step=1, value=0, visible=True, interactive=True, ) - gr.HTML("
") - gr.Markdown(lg_conf["cv_tts6"]) - with gr.Row(): - model_voice_path05 = gr.Dropdown( - models, - label="Model-6", - visible=True, - interactive=True, - ) - file_index2_05 = gr.Dropdown( - index_paths, - label="Index-6", + + def index_conf(): + return gr.Dropdown( + index_path, + value=None, + label="Index", visible=True, interactive=True, ) - name_transpose05 = gr.Number( - label="Transpose-6", - value=0, - visible=True, - interactive=True, + + def index_inf_conf(): + return gr.Slider( + minimum=0, + maximum=1, + label="Index influence", + value=0.75, ) - gr.HTML("
") - gr.Markdown(lg_conf["cv_aux"]) - with gr.Row(): - model_voice_path06 = gr.Dropdown( - models, - label="Model-Aux", - visible=True, + + def respiration_filter_conf(): + return gr.Slider( + minimum=0, + maximum=7, + label="Respiration median filtering", + value=3, + step=1, interactive=True, ) - file_index2_06 = gr.Dropdown( - index_paths, - label="Index-Aux", - visible=True, + + def envelope_ratio_conf(): + return gr.Slider( + minimum=0, + maximum=1, + label="Envelope ratio", + value=0.25, interactive=True, ) - name_transpose06 = gr.Number( - label="Transpose-Aux", - value=0, - visible=True, + + def consonant_protec_conf(): + return gr.Slider( + minimum=0, + maximum=0.5, + label="Consonant breath protection", + value=0.5, interactive=True, ) - gr.HTML("
") - with gr.Row(): - f0_methods_voice = [ - "pm", - "harvest", - "crepe", - "rmvpe", - ] - f0_method_global = gr.Dropdown( - f0_methods_voice, - value="pm", - label="Global F0 method", - visible=True, - interactive=True, + + def button_conf(tts_name): + return gr.Button( + lg_conf["cv_button_apply"]+" "+tts_name, + variant="primary", ) - with gr.Row(variant="compact"): - button_config = gr.Button( - lg_conf["cv_button_apply"], - variant="primary", - ) + TTS_TABS = [ + 'TTS Speaker {}'.format(i) for i in range(1, 7) + ] - confirm_conf = gr.HTML() + CV_SUBTITLES = [ + lg_conf["cv_tts1"], + lg_conf["cv_tts2"], + lg_conf["cv_tts3"], + lg_conf["cv_tts4"], + lg_conf["cv_tts5"], + lg_conf["cv_tts6"], + ] - button_config.click( - SoniTr.vci.apply_conf, - inputs=[ - f0_method_global, - model_voice_path00, - name_transpose00, - file_index2_00, - model_voice_path01, - name_transpose01, - file_index2_01, - model_voice_path02, - name_transpose02, - file_index2_02, - model_voice_path03, - name_transpose03, - file_index2_03, - model_voice_path04, - name_transpose04, - file_index2_04, - model_voice_path05, - name_transpose05, - file_index2_05, - model_voice_path06, - name_transpose06, - file_index2_06, - ], - outputs=[confirm_conf], - ) + configs_storage = [] + + for i in range(6): # Loop from 00 to 05 + with gr.Accordion(CV_SUBTITLES[i], open=False): + gr.Markdown(TTS_TABS[i]) + with gr.Column(): + tag_gui = gr.Textbox( + value=TTS_TABS[i], visible=False + ) + model_gui = model_conf() + pitch_algo_gui = pitch_algo_conf() + pitch_lvl_gui = pitch_lvl_conf() + index_gui = index_conf() + index_inf_gui = index_inf_conf() + rmf_gui = respiration_filter_conf() + er_gui = envelope_ratio_conf() + cbp_gui = consonant_protec_conf() + + with gr.Row(variant="compact"): + button_config = button_conf( + TTS_TABS[i] + ) + + confirm_conf = gr.HTML() + + button_config.click( + SoniTr.vci.apply_conf, + inputs=[ + tag_gui, + model_gui, + pitch_algo_gui, + pitch_lvl_gui, + index_gui, + index_inf_gui, + rmf_gui, + er_gui, + cbp_gui, + ], + outputs=[confirm_conf], + ) + + configs_storage.append({ + "tag": tag_gui, + "model": model_gui, + "index": index_gui, + }) with gr.Column(): with gr.Accordion("Test R.V.C.", open=False): @@ -2163,32 +2118,10 @@ def update_models(): visible=True, interactive=True, ) - model_voice_path07 = gr.Dropdown( - models, - label="Model", - visible=True, - interactive=True, - ) # value='' - file_index2_07 = gr.Dropdown( - index_paths, - label="Index", - visible=True, - interactive=True, - ) # value='' - transpose_test = gr.Number( - label="Transpose", - value=0, - visible=True, - interactive=True, - info="integer, number of semitones, raise by an octave: 12, lower by an octave: -12", - ) - f0method_test = gr.Dropdown( - f0_methods_voice, - value="pm", - label="F0 method", - visible=True, - interactive=True, - ) + model_test = model_conf() + index_test = index_conf() + pitch_test = pitch_lvl_conf() + pitch_alg_test = pitch_algo_conf() with gr.Row(variant="compact"): button_test = gr.Button("Test audio") @@ -2202,10 +2135,10 @@ def update_models(): inputs=[ text_test, tts_test, - model_voice_path07, - file_index2_07, - transpose_test, - f0method_test, + model_test, + index_test, + pitch_test, + pitch_alg_test, ], outputs=[ttsvoice, original_ttsvoice], ) @@ -2219,23 +2152,10 @@ def update_models(): update_models, [], [ - model_voice_path00, - model_voice_path01, - model_voice_path02, - model_voice_path03, - model_voice_path04, - model_voice_path05, - model_voice_path06, - model_voice_path07, - 
file_index2_00, - file_index2_01, - file_index2_02, - file_index2_03, - file_index2_04, - file_index2_05, - file_index2_06, - file_index2_07, - ], + elem["model"] for elem in configs_storage + ] + [model_test] + [ + elem["index"] for elem in configs_storage + ] + [index_test], ) with gr.Tab(lg_conf["tab_help"]): @@ -2557,7 +2477,7 @@ def create_parser(): os.path.join(MDX_DOWNLOAD_LINK, id_model), mdxnet_models_dir ) - models, index_paths = upload_model_list() + models_path, index_path = upload_model_list() SoniTr = SoniTranslate() diff --git a/configs/32k.json b/configs/32k.json deleted file mode 100644 index 400b6be..0000000 --- a/configs/32k.json +++ /dev/null @@ -1,46 +0,0 @@ -{ - "train": { - "log_interval": 200, - "seed": 1234, - "epochs": 20000, - "learning_rate": 1e-4, - "betas": [0.8, 0.99], - "eps": 1e-9, - "batch_size": 4, - "fp16_run": false, - "lr_decay": 0.999875, - "segment_size": 12800, - "init_lr_ratio": 1, - "warmup_epochs": 0, - "c_mel": 45, - "c_kl": 1.0 - }, - "data": { - "max_wav_value": 32768.0, - "sampling_rate": 32000, - "filter_length": 1024, - "hop_length": 320, - "win_length": 1024, - "n_mel_channels": 80, - "mel_fmin": 0.0, - "mel_fmax": null - }, - "model": { - "inter_channels": 192, - "hidden_channels": 192, - "filter_channels": 768, - "n_heads": 2, - "n_layers": 6, - "kernel_size": 3, - "p_dropout": 0, - "resblock": "1", - "resblock_kernel_sizes": [3,7,11], - "resblock_dilation_sizes": [[1,3,5], [1,3,5], [1,3,5]], - "upsample_rates": [10,4,2,2,2], - "upsample_initial_channel": 512, - "upsample_kernel_sizes": [16,16,4,4,4], - "use_spectral_norm": false, - "gin_channels": 256, - "spk_embed_dim": 109 - } -} diff --git a/configs/32k_v2.json b/configs/32k_v2.json deleted file mode 100644 index 36adb8a..0000000 --- a/configs/32k_v2.json +++ /dev/null @@ -1,46 +0,0 @@ -{ - "train": { - "log_interval": 200, - "seed": 1234, - "epochs": 20000, - "learning_rate": 1e-4, - "betas": [0.8, 0.99], - "eps": 1e-9, - "batch_size": 4, - "fp16_run": false, - "lr_decay": 0.999875, - "segment_size": 12800, - "init_lr_ratio": 1, - "warmup_epochs": 0, - "c_mel": 45, - "c_kl": 1.0 - }, - "data": { - "max_wav_value": 32768.0, - "sampling_rate": 32000, - "filter_length": 1024, - "hop_length": 320, - "win_length": 1024, - "n_mel_channels": 80, - "mel_fmin": 0.0, - "mel_fmax": null - }, - "model": { - "inter_channels": 192, - "hidden_channels": 192, - "filter_channels": 768, - "n_heads": 2, - "n_layers": 6, - "kernel_size": 3, - "p_dropout": 0, - "resblock": "1", - "resblock_kernel_sizes": [3,7,11], - "resblock_dilation_sizes": [[1,3,5], [1,3,5], [1,3,5]], - "upsample_rates": [10,8,2,2], - "upsample_initial_channel": 512, - "upsample_kernel_sizes": [20,16,4,4], - "use_spectral_norm": false, - "gin_channels": 256, - "spk_embed_dim": 109 - } -} diff --git a/configs/40k.json b/configs/40k.json deleted file mode 100644 index cb30b8b..0000000 --- a/configs/40k.json +++ /dev/null @@ -1,46 +0,0 @@ -{ - "train": { - "log_interval": 200, - "seed": 1234, - "epochs": 20000, - "learning_rate": 1e-4, - "betas": [0.8, 0.99], - "eps": 1e-9, - "batch_size": 4, - "fp16_run": false, - "lr_decay": 0.999875, - "segment_size": 12800, - "init_lr_ratio": 1, - "warmup_epochs": 0, - "c_mel": 45, - "c_kl": 1.0 - }, - "data": { - "max_wav_value": 32768.0, - "sampling_rate": 40000, - "filter_length": 2048, - "hop_length": 400, - "win_length": 2048, - "n_mel_channels": 125, - "mel_fmin": 0.0, - "mel_fmax": null - }, - "model": { - "inter_channels": 192, - "hidden_channels": 192, - "filter_channels": 768, - "n_heads": 
2, - "n_layers": 6, - "kernel_size": 3, - "p_dropout": 0, - "resblock": "1", - "resblock_kernel_sizes": [3,7,11], - "resblock_dilation_sizes": [[1,3,5], [1,3,5], [1,3,5]], - "upsample_rates": [10,10,2,2], - "upsample_initial_channel": 512, - "upsample_kernel_sizes": [16,16,4,4], - "use_spectral_norm": false, - "gin_channels": 256, - "spk_embed_dim": 109 - } -} diff --git a/configs/48k.json b/configs/48k.json deleted file mode 100644 index 6875991..0000000 --- a/configs/48k.json +++ /dev/null @@ -1,46 +0,0 @@ -{ - "train": { - "log_interval": 200, - "seed": 1234, - "epochs": 20000, - "learning_rate": 1e-4, - "betas": [0.8, 0.99], - "eps": 1e-9, - "batch_size": 4, - "fp16_run": false, - "lr_decay": 0.999875, - "segment_size": 11520, - "init_lr_ratio": 1, - "warmup_epochs": 0, - "c_mel": 45, - "c_kl": 1.0 - }, - "data": { - "max_wav_value": 32768.0, - "sampling_rate": 48000, - "filter_length": 2048, - "hop_length": 480, - "win_length": 2048, - "n_mel_channels": 128, - "mel_fmin": 0.0, - "mel_fmax": null - }, - "model": { - "inter_channels": 192, - "hidden_channels": 192, - "filter_channels": 768, - "n_heads": 2, - "n_layers": 6, - "kernel_size": 3, - "p_dropout": 0, - "resblock": "1", - "resblock_kernel_sizes": [3,7,11], - "resblock_dilation_sizes": [[1,3,5], [1,3,5], [1,3,5]], - "upsample_rates": [10,6,2,2,2], - "upsample_initial_channel": 512, - "upsample_kernel_sizes": [16,16,4,4,4], - "use_spectral_norm": false, - "gin_channels": 256, - "spk_embed_dim": 109 - } -} diff --git a/configs/48k_v2.json b/configs/48k_v2.json deleted file mode 100644 index 73ee363..0000000 --- a/configs/48k_v2.json +++ /dev/null @@ -1,46 +0,0 @@ -{ - "train": { - "log_interval": 200, - "seed": 1234, - "epochs": 20000, - "learning_rate": 1e-4, - "betas": [0.8, 0.99], - "eps": 1e-9, - "batch_size": 4, - "fp16_run": false, - "lr_decay": 0.999875, - "segment_size": 17280, - "init_lr_ratio": 1, - "warmup_epochs": 0, - "c_mel": 45, - "c_kl": 1.0 - }, - "data": { - "max_wav_value": 32768.0, - "sampling_rate": 48000, - "filter_length": 2048, - "hop_length": 480, - "win_length": 2048, - "n_mel_channels": 128, - "mel_fmin": 0.0, - "mel_fmax": null - }, - "model": { - "inter_channels": 192, - "hidden_channels": 192, - "filter_channels": 768, - "n_heads": 2, - "n_layers": 6, - "kernel_size": 3, - "p_dropout": 0, - "resblock": "1", - "resblock_kernel_sizes": [3,7,11], - "resblock_dilation_sizes": [[1,3,5], [1,3,5], [1,3,5]], - "upsample_rates": [12,10,2,2], - "upsample_initial_channel": 512, - "upsample_kernel_sizes": [24,20,4,4], - "use_spectral_norm": false, - "gin_channels": 256, - "spk_embed_dim": 109 - } -} diff --git a/lib/infer_pack/models_dml.py b/lib/infer_pack/models_dml.py deleted file mode 100644 index 958d7b2..0000000 --- a/lib/infer_pack/models_dml.py +++ /dev/null @@ -1,1124 +0,0 @@ -import math, pdb, os -from time import time as ttime -import torch -from torch import nn -from torch.nn import functional as F -from lib.infer_pack import modules -from lib.infer_pack import attentions -from lib.infer_pack import commons -from lib.infer_pack.commons import init_weights, get_padding -from torch.nn import Conv1d, ConvTranspose1d, AvgPool1d, Conv2d -from torch.nn.utils import weight_norm, remove_weight_norm, spectral_norm -from lib.infer_pack.commons import init_weights -import numpy as np -from lib.infer_pack import commons - - -class TextEncoder256(nn.Module): - def __init__( - self, - out_channels, - hidden_channels, - filter_channels, - n_heads, - n_layers, - kernel_size, - p_dropout, - f0=True, - ): - 
super().__init__() - self.out_channels = out_channels - self.hidden_channels = hidden_channels - self.filter_channels = filter_channels - self.n_heads = n_heads - self.n_layers = n_layers - self.kernel_size = kernel_size - self.p_dropout = p_dropout - self.emb_phone = nn.Linear(256, hidden_channels) - self.lrelu = nn.LeakyReLU(0.1, inplace=True) - if f0 == True: - self.emb_pitch = nn.Embedding(256, hidden_channels) # pitch 256 - self.encoder = attentions.Encoder( - hidden_channels, filter_channels, n_heads, n_layers, kernel_size, p_dropout - ) - self.proj = nn.Conv1d(hidden_channels, out_channels * 2, 1) - - def forward(self, phone, pitch, lengths): - if pitch == None: - x = self.emb_phone(phone) - else: - x = self.emb_phone(phone) + self.emb_pitch(pitch) - x = x * math.sqrt(self.hidden_channels) # [b, t, h] - x = self.lrelu(x) - x = torch.transpose(x, 1, -1) # [b, h, t] - x_mask = torch.unsqueeze(commons.sequence_mask(lengths, x.size(2)), 1).to( - x.dtype - ) - x = self.encoder(x * x_mask, x_mask) - stats = self.proj(x) * x_mask - - m, logs = torch.split(stats, self.out_channels, dim=1) - return m, logs, x_mask - - -class TextEncoder768(nn.Module): - def __init__( - self, - out_channels, - hidden_channels, - filter_channels, - n_heads, - n_layers, - kernel_size, - p_dropout, - f0=True, - ): - super().__init__() - self.out_channels = out_channels - self.hidden_channels = hidden_channels - self.filter_channels = filter_channels - self.n_heads = n_heads - self.n_layers = n_layers - self.kernel_size = kernel_size - self.p_dropout = p_dropout - self.emb_phone = nn.Linear(768, hidden_channels) - self.lrelu = nn.LeakyReLU(0.1, inplace=True) - if f0 == True: - self.emb_pitch = nn.Embedding(256, hidden_channels) # pitch 256 - self.encoder = attentions.Encoder( - hidden_channels, filter_channels, n_heads, n_layers, kernel_size, p_dropout - ) - self.proj = nn.Conv1d(hidden_channels, out_channels * 2, 1) - - def forward(self, phone, pitch, lengths): - if pitch == None: - x = self.emb_phone(phone) - else: - x = self.emb_phone(phone) + self.emb_pitch(pitch) - x = x * math.sqrt(self.hidden_channels) # [b, t, h] - x = self.lrelu(x) - x = torch.transpose(x, 1, -1) # [b, h, t] - x_mask = torch.unsqueeze(commons.sequence_mask(lengths, x.size(2)), 1).to( - x.dtype - ) - x = self.encoder(x * x_mask, x_mask) - stats = self.proj(x) * x_mask - - m, logs = torch.split(stats, self.out_channels, dim=1) - return m, logs, x_mask - - -class ResidualCouplingBlock(nn.Module): - def __init__( - self, - channels, - hidden_channels, - kernel_size, - dilation_rate, - n_layers, - n_flows=4, - gin_channels=0, - ): - super().__init__() - self.channels = channels - self.hidden_channels = hidden_channels - self.kernel_size = kernel_size - self.dilation_rate = dilation_rate - self.n_layers = n_layers - self.n_flows = n_flows - self.gin_channels = gin_channels - - self.flows = nn.ModuleList() - for i in range(n_flows): - self.flows.append( - modules.ResidualCouplingLayer( - channels, - hidden_channels, - kernel_size, - dilation_rate, - n_layers, - gin_channels=gin_channels, - mean_only=True, - ) - ) - self.flows.append(modules.Flip()) - - def forward(self, x, x_mask, g=None, reverse=False): - if not reverse: - for flow in self.flows: - x, _ = flow(x, x_mask, g=g, reverse=reverse) - else: - for flow in reversed(self.flows): - x = flow(x, x_mask, g=g, reverse=reverse) - return x - - def remove_weight_norm(self): - for i in range(self.n_flows): - self.flows[i * 2].remove_weight_norm() - - -class PosteriorEncoder(nn.Module): - def 
__init__( - self, - in_channels, - out_channels, - hidden_channels, - kernel_size, - dilation_rate, - n_layers, - gin_channels=0, - ): - super().__init__() - self.in_channels = in_channels - self.out_channels = out_channels - self.hidden_channels = hidden_channels - self.kernel_size = kernel_size - self.dilation_rate = dilation_rate - self.n_layers = n_layers - self.gin_channels = gin_channels - - self.pre = nn.Conv1d(in_channels, hidden_channels, 1) - self.enc = modules.WN( - hidden_channels, - kernel_size, - dilation_rate, - n_layers, - gin_channels=gin_channels, - ) - self.proj = nn.Conv1d(hidden_channels, out_channels * 2, 1) - - def forward(self, x, x_lengths, g=None): - x_mask = torch.unsqueeze(commons.sequence_mask(x_lengths, x.size(2)), 1).to( - x.dtype - ) - x = self.pre(x) * x_mask - x = self.enc(x, x_mask, g=g) - stats = self.proj(x) * x_mask - m, logs = torch.split(stats, self.out_channels, dim=1) - z = (m + torch.randn_like(m) * torch.exp(logs)) * x_mask - return z, m, logs, x_mask - - def remove_weight_norm(self): - self.enc.remove_weight_norm() - - -class Generator(torch.nn.Module): - def __init__( - self, - initial_channel, - resblock, - resblock_kernel_sizes, - resblock_dilation_sizes, - upsample_rates, - upsample_initial_channel, - upsample_kernel_sizes, - gin_channels=0, - ): - super(Generator, self).__init__() - self.num_kernels = len(resblock_kernel_sizes) - self.num_upsamples = len(upsample_rates) - self.conv_pre = Conv1d( - initial_channel, upsample_initial_channel, 7, 1, padding=3 - ) - resblock = modules.ResBlock1 if resblock == "1" else modules.ResBlock2 - - self.ups = nn.ModuleList() - for i, (u, k) in enumerate(zip(upsample_rates, upsample_kernel_sizes)): - self.ups.append( - weight_norm( - ConvTranspose1d( - upsample_initial_channel // (2**i), - upsample_initial_channel // (2 ** (i + 1)), - k, - u, - padding=(k - u) // 2, - ) - ) - ) - - self.resblocks = nn.ModuleList() - for i in range(len(self.ups)): - ch = upsample_initial_channel // (2 ** (i + 1)) - for j, (k, d) in enumerate( - zip(resblock_kernel_sizes, resblock_dilation_sizes) - ): - self.resblocks.append(resblock(ch, k, d)) - - self.conv_post = Conv1d(ch, 1, 7, 1, padding=3, bias=False) - self.ups.apply(init_weights) - - if gin_channels != 0: - self.cond = nn.Conv1d(gin_channels, upsample_initial_channel, 1) - - def forward(self, x, g=None): - x = self.conv_pre(x) - if g is not None: - x = x + self.cond(g) - - for i in range(self.num_upsamples): - x = F.leaky_relu(x, modules.LRELU_SLOPE) - x = self.ups[i](x) - xs = None - for j in range(self.num_kernels): - if xs is None: - xs = self.resblocks[i * self.num_kernels + j](x) - else: - xs += self.resblocks[i * self.num_kernels + j](x) - x = xs / self.num_kernels - x = F.leaky_relu(x) - x = self.conv_post(x) - x = torch.tanh(x) - - return x - - def remove_weight_norm(self): - for l in self.ups: - remove_weight_norm(l) - for l in self.resblocks: - l.remove_weight_norm() - - -class SineGen(torch.nn.Module): - """Definition of sine generator - SineGen(samp_rate, harmonic_num = 0, - sine_amp = 0.1, noise_std = 0.003, - voiced_threshold = 0, - flag_for_pulse=False) - samp_rate: sampling rate in Hz - harmonic_num: number of harmonic overtones (default 0) - sine_amp: amplitude of sine-wavefrom (default 0.1) - noise_std: std of Gaussian noise (default 0.003) - voiced_thoreshold: F0 threshold for U/V classification (default 0) - flag_for_pulse: this SinGen is used inside PulseGen (default False) - Note: when flag_for_pulse is True, the first time step of a voiced - 
segment is always sin(np.pi) or cos(0) - """ - - def __init__( - self, - samp_rate, - harmonic_num=0, - sine_amp=0.1, - noise_std=0.003, - voiced_threshold=0, - flag_for_pulse=False, - ): - super(SineGen, self).__init__() - self.sine_amp = sine_amp - self.noise_std = noise_std - self.harmonic_num = harmonic_num - self.dim = self.harmonic_num + 1 - self.sampling_rate = samp_rate - self.voiced_threshold = voiced_threshold - - def _f02uv(self, f0): - # generate uv signal - uv = torch.ones_like(f0) - uv = uv * (f0 > self.voiced_threshold) - return uv.float() - - def forward(self, f0, upp): - """sine_tensor, uv = forward(f0) - input F0: tensor(batchsize=1, length, dim=1) - f0 for unvoiced steps should be 0 - output sine_tensor: tensor(batchsize=1, length, dim) - output uv: tensor(batchsize=1, length, 1) - """ - with torch.no_grad(): - f0 = f0[:, None].transpose(1, 2) - f0_buf = torch.zeros(f0.shape[0], f0.shape[1], self.dim, device=f0.device) - # fundamental component - f0_buf[:, :, 0] = f0[:, :, 0] - for idx in np.arange(self.harmonic_num): - f0_buf[:, :, idx + 1] = f0_buf[:, :, 0] * ( - idx + 2 - ) # idx + 2: the (idx+1)-th overtone, (idx+2)-th harmonic - rad_values = (f0_buf / self.sampling_rate) % 1 ###%1意味着n_har的乘积无法后处理优化 - rand_ini = torch.rand( - f0_buf.shape[0], f0_buf.shape[2], device=f0_buf.device - ) - rand_ini[:, 0] = 0 - rad_values[:, 0, :] = rad_values[:, 0, :] + rand_ini - tmp_over_one = torch.cumsum(rad_values, 1) # % 1 #####%1意味着后面的cumsum无法再优化 - tmp_over_one *= upp - tmp_over_one = F.interpolate( - tmp_over_one.transpose(2, 1), - scale_factor=upp, - mode="linear", - align_corners=True, - ).transpose(2, 1) - rad_values = F.interpolate( - rad_values.transpose(2, 1), scale_factor=upp, mode="nearest" - ).transpose( - 2, 1 - ) ####### - tmp_over_one %= 1 - tmp_over_one_idx = (tmp_over_one[:, 1:, :] - tmp_over_one[:, :-1, :]) < 0 - cumsum_shift = torch.zeros_like(rad_values) - cumsum_shift[:, 1:, :] = tmp_over_one_idx * -1.0 - sine_waves = torch.sin( - torch.cumsum(rad_values + cumsum_shift, dim=1) * 2 * np.pi - ) - sine_waves = sine_waves * self.sine_amp - uv = self._f02uv(f0) - uv = F.interpolate( - uv.transpose(2, 1), scale_factor=upp, mode="nearest" - ).transpose(2, 1) - noise_amp = uv * self.noise_std + (1 - uv) * self.sine_amp / 3 - noise = noise_amp * torch.randn_like(sine_waves) - sine_waves = sine_waves * uv + noise - return sine_waves, uv, noise - - -class SourceModuleHnNSF(torch.nn.Module): - """SourceModule for hn-nsf - SourceModule(sampling_rate, harmonic_num=0, sine_amp=0.1, - add_noise_std=0.003, voiced_threshod=0) - sampling_rate: sampling_rate in Hz - harmonic_num: number of harmonic above F0 (default: 0) - sine_amp: amplitude of sine source signal (default: 0.1) - add_noise_std: std of additive Gaussian noise (default: 0.003) - note that amplitude of noise in unvoiced is decided - by sine_amp - voiced_threshold: threhold to set U/V given F0 (default: 0) - Sine_source, noise_source = SourceModuleHnNSF(F0_sampled) - F0_sampled (batchsize, length, 1) - Sine_source (batchsize, length, 1) - noise_source (batchsize, length 1) - uv (batchsize, length, 1) - """ - - def __init__( - self, - sampling_rate, - harmonic_num=0, - sine_amp=0.1, - add_noise_std=0.003, - voiced_threshod=0, - is_half=True, - ): - super(SourceModuleHnNSF, self).__init__() - - self.sine_amp = sine_amp - self.noise_std = add_noise_std - self.is_half = is_half - # to produce sine waveforms - self.l_sin_gen = SineGen( - sampling_rate, harmonic_num, sine_amp, add_noise_std, voiced_threshod - ) - - # to 
merge source harmonics into a single excitation - self.l_linear = torch.nn.Linear(harmonic_num + 1, 1) - self.l_tanh = torch.nn.Tanh() - - def forward(self, x, upp=None): - sine_wavs, uv, _ = self.l_sin_gen(x, upp) - if self.is_half: - sine_wavs = sine_wavs.half() - sine_merge = self.l_tanh(self.l_linear(sine_wavs)) - return sine_merge, None, None # noise, uv - - -class GeneratorNSF(torch.nn.Module): - def __init__( - self, - initial_channel, - resblock, - resblock_kernel_sizes, - resblock_dilation_sizes, - upsample_rates, - upsample_initial_channel, - upsample_kernel_sizes, - gin_channels, - sr, - is_half=False, - ): - super(GeneratorNSF, self).__init__() - self.num_kernels = len(resblock_kernel_sizes) - self.num_upsamples = len(upsample_rates) - - self.f0_upsamp = torch.nn.Upsample(scale_factor=np.prod(upsample_rates)) - self.m_source = SourceModuleHnNSF( - sampling_rate=sr, harmonic_num=0, is_half=is_half - ) - self.noise_convs = nn.ModuleList() - self.conv_pre = Conv1d( - initial_channel, upsample_initial_channel, 7, 1, padding=3 - ) - resblock = modules.ResBlock1 if resblock == "1" else modules.ResBlock2 - - self.ups = nn.ModuleList() - for i, (u, k) in enumerate(zip(upsample_rates, upsample_kernel_sizes)): - c_cur = upsample_initial_channel // (2 ** (i + 1)) - self.ups.append( - weight_norm( - ConvTranspose1d( - upsample_initial_channel // (2**i), - upsample_initial_channel // (2 ** (i + 1)), - k, - u, - padding=(k - u) // 2, - ) - ) - ) - if i + 1 < len(upsample_rates): - stride_f0 = np.prod(upsample_rates[i + 1 :]) - self.noise_convs.append( - Conv1d( - 1, - c_cur, - kernel_size=stride_f0 * 2, - stride=stride_f0, - padding=stride_f0 // 2, - ) - ) - else: - self.noise_convs.append(Conv1d(1, c_cur, kernel_size=1)) - - self.resblocks = nn.ModuleList() - for i in range(len(self.ups)): - ch = upsample_initial_channel // (2 ** (i + 1)) - for j, (k, d) in enumerate( - zip(resblock_kernel_sizes, resblock_dilation_sizes) - ): - self.resblocks.append(resblock(ch, k, d)) - - self.conv_post = Conv1d(ch, 1, 7, 1, padding=3, bias=False) - self.ups.apply(init_weights) - - if gin_channels != 0: - self.cond = nn.Conv1d(gin_channels, upsample_initial_channel, 1) - - self.upp = np.prod(upsample_rates) - - def forward(self, x, f0, g=None): - har_source, noi_source, uv = self.m_source(f0, self.upp) - har_source = har_source.transpose(1, 2) - x = self.conv_pre(x) - if g is not None: - x = x + self.cond(g) - - for i in range(self.num_upsamples): - x = F.leaky_relu(x, modules.LRELU_SLOPE) - x = self.ups[i](x) - x_source = self.noise_convs[i](har_source) - x = x + x_source - xs = None - for j in range(self.num_kernels): - if xs is None: - xs = self.resblocks[i * self.num_kernels + j](x) - else: - xs += self.resblocks[i * self.num_kernels + j](x) - x = xs / self.num_kernels - x = F.leaky_relu(x) - x = self.conv_post(x) - x = torch.tanh(x) - return x - - def remove_weight_norm(self): - for l in self.ups: - remove_weight_norm(l) - for l in self.resblocks: - l.remove_weight_norm() - - -sr2sr = { - "32k": 32000, - "40k": 40000, - "48k": 48000, -} - - -class SynthesizerTrnMs256NSFsid(nn.Module): - def __init__( - self, - spec_channels, - segment_size, - inter_channels, - hidden_channels, - filter_channels, - n_heads, - n_layers, - kernel_size, - p_dropout, - resblock, - resblock_kernel_sizes, - resblock_dilation_sizes, - upsample_rates, - upsample_initial_channel, - upsample_kernel_sizes, - spk_embed_dim, - gin_channels, - sr, - **kwargs - ): - super().__init__() - if type(sr) == type("strr"): - sr = sr2sr[sr] - 
self.spec_channels = spec_channels - self.inter_channels = inter_channels - self.hidden_channels = hidden_channels - self.filter_channels = filter_channels - self.n_heads = n_heads - self.n_layers = n_layers - self.kernel_size = kernel_size - self.p_dropout = p_dropout - self.resblock = resblock - self.resblock_kernel_sizes = resblock_kernel_sizes - self.resblock_dilation_sizes = resblock_dilation_sizes - self.upsample_rates = upsample_rates - self.upsample_initial_channel = upsample_initial_channel - self.upsample_kernel_sizes = upsample_kernel_sizes - self.segment_size = segment_size - self.gin_channels = gin_channels - # self.hop_length = hop_length# - self.spk_embed_dim = spk_embed_dim - self.enc_p = TextEncoder256( - inter_channels, - hidden_channels, - filter_channels, - n_heads, - n_layers, - kernel_size, - p_dropout, - ) - self.dec = GeneratorNSF( - inter_channels, - resblock, - resblock_kernel_sizes, - resblock_dilation_sizes, - upsample_rates, - upsample_initial_channel, - upsample_kernel_sizes, - gin_channels=gin_channels, - sr=sr, - is_half=kwargs["is_half"], - ) - self.enc_q = PosteriorEncoder( - spec_channels, - inter_channels, - hidden_channels, - 5, - 1, - 16, - gin_channels=gin_channels, - ) - self.flow = ResidualCouplingBlock( - inter_channels, hidden_channels, 5, 1, 3, gin_channels=gin_channels - ) - self.emb_g = nn.Embedding(self.spk_embed_dim, gin_channels) - print("gin_channels:", gin_channels, "self.spk_embed_dim:", self.spk_embed_dim) - - def remove_weight_norm(self): - self.dec.remove_weight_norm() - self.flow.remove_weight_norm() - self.enc_q.remove_weight_norm() - - def forward( - self, phone, phone_lengths, pitch, pitchf, y, y_lengths, ds - ): # 这里ds是id,[bs,1] - # print(1,pitch.shape)#[bs,t] - g = self.emb_g(ds).unsqueeze(-1) # [b, 256, 1]##1是t,广播的 - m_p, logs_p, x_mask = self.enc_p(phone, pitch, phone_lengths) - z, m_q, logs_q, y_mask = self.enc_q(y, y_lengths, g=g) - z_p = self.flow(z, y_mask, g=g) - z_slice, ids_slice = commons.rand_slice_segments( - z, y_lengths, self.segment_size - ) - # print(-1,pitchf.shape,ids_slice,self.segment_size,self.hop_length,self.segment_size//self.hop_length) - pitchf = commons.slice_segments2(pitchf, ids_slice, self.segment_size) - # print(-2,pitchf.shape,z_slice.shape) - o = self.dec(z_slice, pitchf, g=g) - return o, ids_slice, x_mask, y_mask, (z, z_p, m_p, logs_p, m_q, logs_q) - - def infer(self, phone, phone_lengths, pitch, nsff0, sid, max_len=None): - g = self.emb_g(sid).unsqueeze(-1) - m_p, logs_p, x_mask = self.enc_p(phone, pitch, phone_lengths) - z_p = (m_p + torch.exp(logs_p) * torch.randn_like(m_p) * 0.66666) * x_mask - z = self.flow(z_p, x_mask, g=g, reverse=True) - o = self.dec((z * x_mask)[:, :, :max_len], nsff0, g=g) - return o, x_mask, (z, z_p, m_p, logs_p) - - -class SynthesizerTrnMs768NSFsid(nn.Module): - def __init__( - self, - spec_channels, - segment_size, - inter_channels, - hidden_channels, - filter_channels, - n_heads, - n_layers, - kernel_size, - p_dropout, - resblock, - resblock_kernel_sizes, - resblock_dilation_sizes, - upsample_rates, - upsample_initial_channel, - upsample_kernel_sizes, - spk_embed_dim, - gin_channels, - sr, - **kwargs - ): - super().__init__() - if type(sr) == type("strr"): - sr = sr2sr[sr] - self.spec_channels = spec_channels - self.inter_channels = inter_channels - self.hidden_channels = hidden_channels - self.filter_channels = filter_channels - self.n_heads = n_heads - self.n_layers = n_layers - self.kernel_size = kernel_size - self.p_dropout = p_dropout - self.resblock = resblock 
- self.resblock_kernel_sizes = resblock_kernel_sizes - self.resblock_dilation_sizes = resblock_dilation_sizes - self.upsample_rates = upsample_rates - self.upsample_initial_channel = upsample_initial_channel - self.upsample_kernel_sizes = upsample_kernel_sizes - self.segment_size = segment_size - self.gin_channels = gin_channels - # self.hop_length = hop_length# - self.spk_embed_dim = spk_embed_dim - self.enc_p = TextEncoder768( - inter_channels, - hidden_channels, - filter_channels, - n_heads, - n_layers, - kernel_size, - p_dropout, - ) - self.dec = GeneratorNSF( - inter_channels, - resblock, - resblock_kernel_sizes, - resblock_dilation_sizes, - upsample_rates, - upsample_initial_channel, - upsample_kernel_sizes, - gin_channels=gin_channels, - sr=sr, - is_half=kwargs["is_half"], - ) - self.enc_q = PosteriorEncoder( - spec_channels, - inter_channels, - hidden_channels, - 5, - 1, - 16, - gin_channels=gin_channels, - ) - self.flow = ResidualCouplingBlock( - inter_channels, hidden_channels, 5, 1, 3, gin_channels=gin_channels - ) - self.emb_g = nn.Embedding(self.spk_embed_dim, gin_channels) - print("gin_channels:", gin_channels, "self.spk_embed_dim:", self.spk_embed_dim) - - def remove_weight_norm(self): - self.dec.remove_weight_norm() - self.flow.remove_weight_norm() - self.enc_q.remove_weight_norm() - - def forward( - self, phone, phone_lengths, pitch, pitchf, y, y_lengths, ds - ): # 这里ds是id,[bs,1] - # print(1,pitch.shape)#[bs,t] - g = self.emb_g(ds).unsqueeze(-1) # [b, 256, 1]##1是t,广播的 - m_p, logs_p, x_mask = self.enc_p(phone, pitch, phone_lengths) - z, m_q, logs_q, y_mask = self.enc_q(y, y_lengths, g=g) - z_p = self.flow(z, y_mask, g=g) - z_slice, ids_slice = commons.rand_slice_segments( - z, y_lengths, self.segment_size - ) - # print(-1,pitchf.shape,ids_slice,self.segment_size,self.hop_length,self.segment_size//self.hop_length) - pitchf = commons.slice_segments2(pitchf, ids_slice, self.segment_size) - # print(-2,pitchf.shape,z_slice.shape) - o = self.dec(z_slice, pitchf, g=g) - return o, ids_slice, x_mask, y_mask, (z, z_p, m_p, logs_p, m_q, logs_q) - - def infer(self, phone, phone_lengths, pitch, nsff0, sid, max_len=None): - g = self.emb_g(sid).unsqueeze(-1) - m_p, logs_p, x_mask = self.enc_p(phone, pitch, phone_lengths) - z_p = (m_p + torch.exp(logs_p) * torch.randn_like(m_p) * 0.66666) * x_mask - z = self.flow(z_p, x_mask, g=g, reverse=True) - o = self.dec((z * x_mask)[:, :, :max_len], nsff0, g=g) - return o, x_mask, (z, z_p, m_p, logs_p) - - -class SynthesizerTrnMs256NSFsid_nono(nn.Module): - def __init__( - self, - spec_channels, - segment_size, - inter_channels, - hidden_channels, - filter_channels, - n_heads, - n_layers, - kernel_size, - p_dropout, - resblock, - resblock_kernel_sizes, - resblock_dilation_sizes, - upsample_rates, - upsample_initial_channel, - upsample_kernel_sizes, - spk_embed_dim, - gin_channels, - sr=None, - **kwargs - ): - super().__init__() - self.spec_channels = spec_channels - self.inter_channels = inter_channels - self.hidden_channels = hidden_channels - self.filter_channels = filter_channels - self.n_heads = n_heads - self.n_layers = n_layers - self.kernel_size = kernel_size - self.p_dropout = p_dropout - self.resblock = resblock - self.resblock_kernel_sizes = resblock_kernel_sizes - self.resblock_dilation_sizes = resblock_dilation_sizes - self.upsample_rates = upsample_rates - self.upsample_initial_channel = upsample_initial_channel - self.upsample_kernel_sizes = upsample_kernel_sizes - self.segment_size = segment_size - self.gin_channels = gin_channels - # 
self.hop_length = hop_length# - self.spk_embed_dim = spk_embed_dim - self.enc_p = TextEncoder256( - inter_channels, - hidden_channels, - filter_channels, - n_heads, - n_layers, - kernel_size, - p_dropout, - f0=False, - ) - self.dec = Generator( - inter_channels, - resblock, - resblock_kernel_sizes, - resblock_dilation_sizes, - upsample_rates, - upsample_initial_channel, - upsample_kernel_sizes, - gin_channels=gin_channels, - ) - self.enc_q = PosteriorEncoder( - spec_channels, - inter_channels, - hidden_channels, - 5, - 1, - 16, - gin_channels=gin_channels, - ) - self.flow = ResidualCouplingBlock( - inter_channels, hidden_channels, 5, 1, 3, gin_channels=gin_channels - ) - self.emb_g = nn.Embedding(self.spk_embed_dim, gin_channels) - print("gin_channels:", gin_channels, "self.spk_embed_dim:", self.spk_embed_dim) - - def remove_weight_norm(self): - self.dec.remove_weight_norm() - self.flow.remove_weight_norm() - self.enc_q.remove_weight_norm() - - def forward(self, phone, phone_lengths, y, y_lengths, ds): # 这里ds是id,[bs,1] - g = self.emb_g(ds).unsqueeze(-1) # [b, 256, 1]##1是t,广播的 - m_p, logs_p, x_mask = self.enc_p(phone, None, phone_lengths) - z, m_q, logs_q, y_mask = self.enc_q(y, y_lengths, g=g) - z_p = self.flow(z, y_mask, g=g) - z_slice, ids_slice = commons.rand_slice_segments( - z, y_lengths, self.segment_size - ) - o = self.dec(z_slice, g=g) - return o, ids_slice, x_mask, y_mask, (z, z_p, m_p, logs_p, m_q, logs_q) - - def infer(self, phone, phone_lengths, sid, max_len=None): - g = self.emb_g(sid).unsqueeze(-1) - m_p, logs_p, x_mask = self.enc_p(phone, None, phone_lengths) - z_p = (m_p + torch.exp(logs_p) * torch.randn_like(m_p) * 0.66666) * x_mask - z = self.flow(z_p, x_mask, g=g, reverse=True) - o = self.dec((z * x_mask)[:, :, :max_len], g=g) - return o, x_mask, (z, z_p, m_p, logs_p) - - -class SynthesizerTrnMs768NSFsid_nono(nn.Module): - def __init__( - self, - spec_channels, - segment_size, - inter_channels, - hidden_channels, - filter_channels, - n_heads, - n_layers, - kernel_size, - p_dropout, - resblock, - resblock_kernel_sizes, - resblock_dilation_sizes, - upsample_rates, - upsample_initial_channel, - upsample_kernel_sizes, - spk_embed_dim, - gin_channels, - sr=None, - **kwargs - ): - super().__init__() - self.spec_channels = spec_channels - self.inter_channels = inter_channels - self.hidden_channels = hidden_channels - self.filter_channels = filter_channels - self.n_heads = n_heads - self.n_layers = n_layers - self.kernel_size = kernel_size - self.p_dropout = p_dropout - self.resblock = resblock - self.resblock_kernel_sizes = resblock_kernel_sizes - self.resblock_dilation_sizes = resblock_dilation_sizes - self.upsample_rates = upsample_rates - self.upsample_initial_channel = upsample_initial_channel - self.upsample_kernel_sizes = upsample_kernel_sizes - self.segment_size = segment_size - self.gin_channels = gin_channels - # self.hop_length = hop_length# - self.spk_embed_dim = spk_embed_dim - self.enc_p = TextEncoder768( - inter_channels, - hidden_channels, - filter_channels, - n_heads, - n_layers, - kernel_size, - p_dropout, - f0=False, - ) - self.dec = Generator( - inter_channels, - resblock, - resblock_kernel_sizes, - resblock_dilation_sizes, - upsample_rates, - upsample_initial_channel, - upsample_kernel_sizes, - gin_channels=gin_channels, - ) - self.enc_q = PosteriorEncoder( - spec_channels, - inter_channels, - hidden_channels, - 5, - 1, - 16, - gin_channels=gin_channels, - ) - self.flow = ResidualCouplingBlock( - inter_channels, hidden_channels, 5, 1, 3, 
gin_channels=gin_channels - ) - self.emb_g = nn.Embedding(self.spk_embed_dim, gin_channels) - print("gin_channels:", gin_channels, "self.spk_embed_dim:", self.spk_embed_dim) - - def remove_weight_norm(self): - self.dec.remove_weight_norm() - self.flow.remove_weight_norm() - self.enc_q.remove_weight_norm() - - def forward(self, phone, phone_lengths, y, y_lengths, ds): # 这里ds是id,[bs,1] - g = self.emb_g(ds).unsqueeze(-1) # [b, 256, 1]##1是t,广播的 - m_p, logs_p, x_mask = self.enc_p(phone, None, phone_lengths) - z, m_q, logs_q, y_mask = self.enc_q(y, y_lengths, g=g) - z_p = self.flow(z, y_mask, g=g) - z_slice, ids_slice = commons.rand_slice_segments( - z, y_lengths, self.segment_size - ) - o = self.dec(z_slice, g=g) - return o, ids_slice, x_mask, y_mask, (z, z_p, m_p, logs_p, m_q, logs_q) - - def infer(self, phone, phone_lengths, sid, max_len=None): - g = self.emb_g(sid).unsqueeze(-1) - m_p, logs_p, x_mask = self.enc_p(phone, None, phone_lengths) - z_p = (m_p + torch.exp(logs_p) * torch.randn_like(m_p) * 0.66666) * x_mask - z = self.flow(z_p, x_mask, g=g, reverse=True) - o = self.dec((z * x_mask)[:, :, :max_len], g=g) - return o, x_mask, (z, z_p, m_p, logs_p) - - -class MultiPeriodDiscriminator(torch.nn.Module): - def __init__(self, use_spectral_norm=False): - super(MultiPeriodDiscriminator, self).__init__() - periods = [2, 3, 5, 7, 11, 17] - # periods = [3, 5, 7, 11, 17, 23, 37] - - discs = [DiscriminatorS(use_spectral_norm=use_spectral_norm)] - discs = discs + [ - DiscriminatorP(i, use_spectral_norm=use_spectral_norm) for i in periods - ] - self.discriminators = nn.ModuleList(discs) - - def forward(self, y, y_hat): - y_d_rs = [] # - y_d_gs = [] - fmap_rs = [] - fmap_gs = [] - for i, d in enumerate(self.discriminators): - y_d_r, fmap_r = d(y) - y_d_g, fmap_g = d(y_hat) - # for j in range(len(fmap_r)): - # print(i,j,y.shape,y_hat.shape,fmap_r[j].shape,fmap_g[j].shape) - y_d_rs.append(y_d_r) - y_d_gs.append(y_d_g) - fmap_rs.append(fmap_r) - fmap_gs.append(fmap_g) - - return y_d_rs, y_d_gs, fmap_rs, fmap_gs - - -class MultiPeriodDiscriminatorV2(torch.nn.Module): - def __init__(self, use_spectral_norm=False): - super(MultiPeriodDiscriminatorV2, self).__init__() - # periods = [2, 3, 5, 7, 11, 17] - periods = [2, 3, 5, 7, 11, 17, 23, 37] - - discs = [DiscriminatorS(use_spectral_norm=use_spectral_norm)] - discs = discs + [ - DiscriminatorP(i, use_spectral_norm=use_spectral_norm) for i in periods - ] - self.discriminators = nn.ModuleList(discs) - - def forward(self, y, y_hat): - y_d_rs = [] # - y_d_gs = [] - fmap_rs = [] - fmap_gs = [] - for i, d in enumerate(self.discriminators): - y_d_r, fmap_r = d(y) - y_d_g, fmap_g = d(y_hat) - # for j in range(len(fmap_r)): - # print(i,j,y.shape,y_hat.shape,fmap_r[j].shape,fmap_g[j].shape) - y_d_rs.append(y_d_r) - y_d_gs.append(y_d_g) - fmap_rs.append(fmap_r) - fmap_gs.append(fmap_g) - - return y_d_rs, y_d_gs, fmap_rs, fmap_gs - - -class DiscriminatorS(torch.nn.Module): - def __init__(self, use_spectral_norm=False): - super(DiscriminatorS, self).__init__() - norm_f = weight_norm if use_spectral_norm == False else spectral_norm - self.convs = nn.ModuleList( - [ - norm_f(Conv1d(1, 16, 15, 1, padding=7)), - norm_f(Conv1d(16, 64, 41, 4, groups=4, padding=20)), - norm_f(Conv1d(64, 256, 41, 4, groups=16, padding=20)), - norm_f(Conv1d(256, 1024, 41, 4, groups=64, padding=20)), - norm_f(Conv1d(1024, 1024, 41, 4, groups=256, padding=20)), - norm_f(Conv1d(1024, 1024, 5, 1, padding=2)), - ] - ) - self.conv_post = norm_f(Conv1d(1024, 1, 3, 1, padding=1)) - - def forward(self, 
x): - fmap = [] - - for l in self.convs: - x = l(x) - x = F.leaky_relu(x, modules.LRELU_SLOPE) - fmap.append(x) - x = self.conv_post(x) - fmap.append(x) - x = torch.flatten(x, 1, -1) - - return x, fmap - - -class DiscriminatorP(torch.nn.Module): - def __init__(self, period, kernel_size=5, stride=3, use_spectral_norm=False): - super(DiscriminatorP, self).__init__() - self.period = period - self.use_spectral_norm = use_spectral_norm - norm_f = weight_norm if use_spectral_norm == False else spectral_norm - self.convs = nn.ModuleList( - [ - norm_f( - Conv2d( - 1, - 32, - (kernel_size, 1), - (stride, 1), - padding=(get_padding(kernel_size, 1), 0), - ) - ), - norm_f( - Conv2d( - 32, - 128, - (kernel_size, 1), - (stride, 1), - padding=(get_padding(kernel_size, 1), 0), - ) - ), - norm_f( - Conv2d( - 128, - 512, - (kernel_size, 1), - (stride, 1), - padding=(get_padding(kernel_size, 1), 0), - ) - ), - norm_f( - Conv2d( - 512, - 1024, - (kernel_size, 1), - (stride, 1), - padding=(get_padding(kernel_size, 1), 0), - ) - ), - norm_f( - Conv2d( - 1024, - 1024, - (kernel_size, 1), - 1, - padding=(get_padding(kernel_size, 1), 0), - ) - ), - ] - ) - self.conv_post = norm_f(Conv2d(1024, 1, (3, 1), 1, padding=(1, 0))) - - def forward(self, x): - fmap = [] - - # 1d to 2d - b, c, t = x.shape - if t % self.period != 0: # pad first - n_pad = self.period - (t % self.period) - x = F.pad(x, (0, n_pad), "reflect") - t = t + n_pad - x = x.view(b, c, t // self.period, self.period) - - for l in self.convs: - x = l(x) - x = F.leaky_relu(x, modules.LRELU_SLOPE) - fmap.append(x) - x = self.conv_post(x) - fmap.append(x) - x = torch.flatten(x, 1, -1) - - return x, fmap diff --git a/lib/infer_pack/models_onnx.py b/lib/infer_pack/models_onnx.py deleted file mode 100644 index 963e67b..0000000 --- a/lib/infer_pack/models_onnx.py +++ /dev/null @@ -1,819 +0,0 @@ -import math, pdb, os -from time import time as ttime -import torch -from torch import nn -from torch.nn import functional as F -from lib.infer_pack import modules -from lib.infer_pack import attentions -from lib.infer_pack import commons -from lib.infer_pack.commons import init_weights, get_padding -from torch.nn import Conv1d, ConvTranspose1d, AvgPool1d, Conv2d -from torch.nn.utils import weight_norm, remove_weight_norm, spectral_norm -from lib.infer_pack.commons import init_weights -import numpy as np -from lib.infer_pack import commons - - -class TextEncoder256(nn.Module): - def __init__( - self, - out_channels, - hidden_channels, - filter_channels, - n_heads, - n_layers, - kernel_size, - p_dropout, - f0=True, - ): - super().__init__() - self.out_channels = out_channels - self.hidden_channels = hidden_channels - self.filter_channels = filter_channels - self.n_heads = n_heads - self.n_layers = n_layers - self.kernel_size = kernel_size - self.p_dropout = p_dropout - self.emb_phone = nn.Linear(256, hidden_channels) - self.lrelu = nn.LeakyReLU(0.1, inplace=True) - if f0 == True: - self.emb_pitch = nn.Embedding(256, hidden_channels) # pitch 256 - self.encoder = attentions.Encoder( - hidden_channels, filter_channels, n_heads, n_layers, kernel_size, p_dropout - ) - self.proj = nn.Conv1d(hidden_channels, out_channels * 2, 1) - - def forward(self, phone, pitch, lengths): - if pitch == None: - x = self.emb_phone(phone) - else: - x = self.emb_phone(phone) + self.emb_pitch(pitch) - x = x * math.sqrt(self.hidden_channels) # [b, t, h] - x = self.lrelu(x) - x = torch.transpose(x, 1, -1) # [b, h, t] - x_mask = torch.unsqueeze(commons.sequence_mask(lengths, x.size(2)), 1).to( - x.dtype 
- ) - x = self.encoder(x * x_mask, x_mask) - stats = self.proj(x) * x_mask - - m, logs = torch.split(stats, self.out_channels, dim=1) - return m, logs, x_mask - - -class TextEncoder768(nn.Module): - def __init__( - self, - out_channels, - hidden_channels, - filter_channels, - n_heads, - n_layers, - kernel_size, - p_dropout, - f0=True, - ): - super().__init__() - self.out_channels = out_channels - self.hidden_channels = hidden_channels - self.filter_channels = filter_channels - self.n_heads = n_heads - self.n_layers = n_layers - self.kernel_size = kernel_size - self.p_dropout = p_dropout - self.emb_phone = nn.Linear(768, hidden_channels) - self.lrelu = nn.LeakyReLU(0.1, inplace=True) - if f0 == True: - self.emb_pitch = nn.Embedding(256, hidden_channels) # pitch 256 - self.encoder = attentions.Encoder( - hidden_channels, filter_channels, n_heads, n_layers, kernel_size, p_dropout - ) - self.proj = nn.Conv1d(hidden_channels, out_channels * 2, 1) - - def forward(self, phone, pitch, lengths): - if pitch == None: - x = self.emb_phone(phone) - else: - x = self.emb_phone(phone) + self.emb_pitch(pitch) - x = x * math.sqrt(self.hidden_channels) # [b, t, h] - x = self.lrelu(x) - x = torch.transpose(x, 1, -1) # [b, h, t] - x_mask = torch.unsqueeze(commons.sequence_mask(lengths, x.size(2)), 1).to( - x.dtype - ) - x = self.encoder(x * x_mask, x_mask) - stats = self.proj(x) * x_mask - - m, logs = torch.split(stats, self.out_channels, dim=1) - return m, logs, x_mask - - -class ResidualCouplingBlock(nn.Module): - def __init__( - self, - channels, - hidden_channels, - kernel_size, - dilation_rate, - n_layers, - n_flows=4, - gin_channels=0, - ): - super().__init__() - self.channels = channels - self.hidden_channels = hidden_channels - self.kernel_size = kernel_size - self.dilation_rate = dilation_rate - self.n_layers = n_layers - self.n_flows = n_flows - self.gin_channels = gin_channels - - self.flows = nn.ModuleList() - for i in range(n_flows): - self.flows.append( - modules.ResidualCouplingLayer( - channels, - hidden_channels, - kernel_size, - dilation_rate, - n_layers, - gin_channels=gin_channels, - mean_only=True, - ) - ) - self.flows.append(modules.Flip()) - - def forward(self, x, x_mask, g=None, reverse=False): - if not reverse: - for flow in self.flows: - x, _ = flow(x, x_mask, g=g, reverse=reverse) - else: - for flow in reversed(self.flows): - x = flow(x, x_mask, g=g, reverse=reverse) - return x - - def remove_weight_norm(self): - for i in range(self.n_flows): - self.flows[i * 2].remove_weight_norm() - - -class PosteriorEncoder(nn.Module): - def __init__( - self, - in_channels, - out_channels, - hidden_channels, - kernel_size, - dilation_rate, - n_layers, - gin_channels=0, - ): - super().__init__() - self.in_channels = in_channels - self.out_channels = out_channels - self.hidden_channels = hidden_channels - self.kernel_size = kernel_size - self.dilation_rate = dilation_rate - self.n_layers = n_layers - self.gin_channels = gin_channels - - self.pre = nn.Conv1d(in_channels, hidden_channels, 1) - self.enc = modules.WN( - hidden_channels, - kernel_size, - dilation_rate, - n_layers, - gin_channels=gin_channels, - ) - self.proj = nn.Conv1d(hidden_channels, out_channels * 2, 1) - - def forward(self, x, x_lengths, g=None): - x_mask = torch.unsqueeze(commons.sequence_mask(x_lengths, x.size(2)), 1).to( - x.dtype - ) - x = self.pre(x) * x_mask - x = self.enc(x, x_mask, g=g) - stats = self.proj(x) * x_mask - m, logs = torch.split(stats, self.out_channels, dim=1) - z = (m + torch.randn_like(m) * torch.exp(logs)) * 
x_mask - return z, m, logs, x_mask - - def remove_weight_norm(self): - self.enc.remove_weight_norm() - - -class Generator(torch.nn.Module): - def __init__( - self, - initial_channel, - resblock, - resblock_kernel_sizes, - resblock_dilation_sizes, - upsample_rates, - upsample_initial_channel, - upsample_kernel_sizes, - gin_channels=0, - ): - super(Generator, self).__init__() - self.num_kernels = len(resblock_kernel_sizes) - self.num_upsamples = len(upsample_rates) - self.conv_pre = Conv1d( - initial_channel, upsample_initial_channel, 7, 1, padding=3 - ) - resblock = modules.ResBlock1 if resblock == "1" else modules.ResBlock2 - - self.ups = nn.ModuleList() - for i, (u, k) in enumerate(zip(upsample_rates, upsample_kernel_sizes)): - self.ups.append( - weight_norm( - ConvTranspose1d( - upsample_initial_channel // (2**i), - upsample_initial_channel // (2 ** (i + 1)), - k, - u, - padding=(k - u) // 2, - ) - ) - ) - - self.resblocks = nn.ModuleList() - for i in range(len(self.ups)): - ch = upsample_initial_channel // (2 ** (i + 1)) - for j, (k, d) in enumerate( - zip(resblock_kernel_sizes, resblock_dilation_sizes) - ): - self.resblocks.append(resblock(ch, k, d)) - - self.conv_post = Conv1d(ch, 1, 7, 1, padding=3, bias=False) - self.ups.apply(init_weights) - - if gin_channels != 0: - self.cond = nn.Conv1d(gin_channels, upsample_initial_channel, 1) - - def forward(self, x, g=None): - x = self.conv_pre(x) - if g is not None: - x = x + self.cond(g) - - for i in range(self.num_upsamples): - x = F.leaky_relu(x, modules.LRELU_SLOPE) - x = self.ups[i](x) - xs = None - for j in range(self.num_kernels): - if xs is None: - xs = self.resblocks[i * self.num_kernels + j](x) - else: - xs += self.resblocks[i * self.num_kernels + j](x) - x = xs / self.num_kernels - x = F.leaky_relu(x) - x = self.conv_post(x) - x = torch.tanh(x) - - return x - - def remove_weight_norm(self): - for l in self.ups: - remove_weight_norm(l) - for l in self.resblocks: - l.remove_weight_norm() - - -class SineGen(torch.nn.Module): - """Definition of sine generator - SineGen(samp_rate, harmonic_num = 0, - sine_amp = 0.1, noise_std = 0.003, - voiced_threshold = 0, - flag_for_pulse=False) - samp_rate: sampling rate in Hz - harmonic_num: number of harmonic overtones (default 0) - sine_amp: amplitude of sine-wavefrom (default 0.1) - noise_std: std of Gaussian noise (default 0.003) - voiced_thoreshold: F0 threshold for U/V classification (default 0) - flag_for_pulse: this SinGen is used inside PulseGen (default False) - Note: when flag_for_pulse is True, the first time step of a voiced - segment is always sin(np.pi) or cos(0) - """ - - def __init__( - self, - samp_rate, - harmonic_num=0, - sine_amp=0.1, - noise_std=0.003, - voiced_threshold=0, - flag_for_pulse=False, - ): - super(SineGen, self).__init__() - self.sine_amp = sine_amp - self.noise_std = noise_std - self.harmonic_num = harmonic_num - self.dim = self.harmonic_num + 1 - self.sampling_rate = samp_rate - self.voiced_threshold = voiced_threshold - - def _f02uv(self, f0): - # generate uv signal - uv = torch.ones_like(f0) - uv = uv * (f0 > self.voiced_threshold) - return uv - - def forward(self, f0, upp): - """sine_tensor, uv = forward(f0) - input F0: tensor(batchsize=1, length, dim=1) - f0 for unvoiced steps should be 0 - output sine_tensor: tensor(batchsize=1, length, dim) - output uv: tensor(batchsize=1, length, 1) - """ - with torch.no_grad(): - f0 = f0[:, None].transpose(1, 2) - f0_buf = torch.zeros(f0.shape[0], f0.shape[1], self.dim, device=f0.device) - # fundamental component - 
f0_buf[:, :, 0] = f0[:, :, 0] - for idx in np.arange(self.harmonic_num): - f0_buf[:, :, idx + 1] = f0_buf[:, :, 0] * ( - idx + 2 - ) # idx + 2: the (idx+1)-th overtone, (idx+2)-th harmonic - rad_values = (f0_buf / self.sampling_rate) % 1 ###%1意味着n_har的乘积无法后处理优化 - rand_ini = torch.rand( - f0_buf.shape[0], f0_buf.shape[2], device=f0_buf.device - ) - rand_ini[:, 0] = 0 - rad_values[:, 0, :] = rad_values[:, 0, :] + rand_ini - tmp_over_one = torch.cumsum(rad_values, 1) # % 1 #####%1意味着后面的cumsum无法再优化 - tmp_over_one *= upp - tmp_over_one = F.interpolate( - tmp_over_one.transpose(2, 1), - scale_factor=upp, - mode="linear", - align_corners=True, - ).transpose(2, 1) - rad_values = F.interpolate( - rad_values.transpose(2, 1), scale_factor=upp, mode="nearest" - ).transpose( - 2, 1 - ) ####### - tmp_over_one %= 1 - tmp_over_one_idx = (tmp_over_one[:, 1:, :] - tmp_over_one[:, :-1, :]) < 0 - cumsum_shift = torch.zeros_like(rad_values) - cumsum_shift[:, 1:, :] = tmp_over_one_idx * -1.0 - sine_waves = torch.sin( - torch.cumsum(rad_values + cumsum_shift, dim=1) * 2 * np.pi - ) - sine_waves = sine_waves * self.sine_amp - uv = self._f02uv(f0) - uv = F.interpolate( - uv.transpose(2, 1), scale_factor=upp, mode="nearest" - ).transpose(2, 1) - noise_amp = uv * self.noise_std + (1 - uv) * self.sine_amp / 3 - noise = noise_amp * torch.randn_like(sine_waves) - sine_waves = sine_waves * uv + noise - return sine_waves, uv, noise - - -class SourceModuleHnNSF(torch.nn.Module): - """SourceModule for hn-nsf - SourceModule(sampling_rate, harmonic_num=0, sine_amp=0.1, - add_noise_std=0.003, voiced_threshod=0) - sampling_rate: sampling_rate in Hz - harmonic_num: number of harmonic above F0 (default: 0) - sine_amp: amplitude of sine source signal (default: 0.1) - add_noise_std: std of additive Gaussian noise (default: 0.003) - note that amplitude of noise in unvoiced is decided - by sine_amp - voiced_threshold: threhold to set U/V given F0 (default: 0) - Sine_source, noise_source = SourceModuleHnNSF(F0_sampled) - F0_sampled (batchsize, length, 1) - Sine_source (batchsize, length, 1) - noise_source (batchsize, length 1) - uv (batchsize, length, 1) - """ - - def __init__( - self, - sampling_rate, - harmonic_num=0, - sine_amp=0.1, - add_noise_std=0.003, - voiced_threshod=0, - is_half=True, - ): - super(SourceModuleHnNSF, self).__init__() - - self.sine_amp = sine_amp - self.noise_std = add_noise_std - self.is_half = is_half - # to produce sine waveforms - self.l_sin_gen = SineGen( - sampling_rate, harmonic_num, sine_amp, add_noise_std, voiced_threshod - ) - - # to merge source harmonics into a single excitation - self.l_linear = torch.nn.Linear(harmonic_num + 1, 1) - self.l_tanh = torch.nn.Tanh() - - def forward(self, x, upp=None): - sine_wavs, uv, _ = self.l_sin_gen(x, upp) - if self.is_half: - sine_wavs = sine_wavs.half() - sine_merge = self.l_tanh(self.l_linear(sine_wavs)) - return sine_merge, None, None # noise, uv - - -class GeneratorNSF(torch.nn.Module): - def __init__( - self, - initial_channel, - resblock, - resblock_kernel_sizes, - resblock_dilation_sizes, - upsample_rates, - upsample_initial_channel, - upsample_kernel_sizes, - gin_channels, - sr, - is_half=False, - ): - super(GeneratorNSF, self).__init__() - self.num_kernels = len(resblock_kernel_sizes) - self.num_upsamples = len(upsample_rates) - - self.f0_upsamp = torch.nn.Upsample(scale_factor=np.prod(upsample_rates)) - self.m_source = SourceModuleHnNSF( - sampling_rate=sr, harmonic_num=0, is_half=is_half - ) - self.noise_convs = nn.ModuleList() - self.conv_pre = 
Conv1d( - initial_channel, upsample_initial_channel, 7, 1, padding=3 - ) - resblock = modules.ResBlock1 if resblock == "1" else modules.ResBlock2 - - self.ups = nn.ModuleList() - for i, (u, k) in enumerate(zip(upsample_rates, upsample_kernel_sizes)): - c_cur = upsample_initial_channel // (2 ** (i + 1)) - self.ups.append( - weight_norm( - ConvTranspose1d( - upsample_initial_channel // (2**i), - upsample_initial_channel // (2 ** (i + 1)), - k, - u, - padding=(k - u) // 2, - ) - ) - ) - if i + 1 < len(upsample_rates): - stride_f0 = np.prod(upsample_rates[i + 1 :]) - self.noise_convs.append( - Conv1d( - 1, - c_cur, - kernel_size=stride_f0 * 2, - stride=stride_f0, - padding=stride_f0 // 2, - ) - ) - else: - self.noise_convs.append(Conv1d(1, c_cur, kernel_size=1)) - - self.resblocks = nn.ModuleList() - for i in range(len(self.ups)): - ch = upsample_initial_channel // (2 ** (i + 1)) - for j, (k, d) in enumerate( - zip(resblock_kernel_sizes, resblock_dilation_sizes) - ): - self.resblocks.append(resblock(ch, k, d)) - - self.conv_post = Conv1d(ch, 1, 7, 1, padding=3, bias=False) - self.ups.apply(init_weights) - - if gin_channels != 0: - self.cond = nn.Conv1d(gin_channels, upsample_initial_channel, 1) - - self.upp = np.prod(upsample_rates) - - def forward(self, x, f0, g=None): - har_source, noi_source, uv = self.m_source(f0, self.upp) - har_source = har_source.transpose(1, 2) - x = self.conv_pre(x) - if g is not None: - x = x + self.cond(g) - - for i in range(self.num_upsamples): - x = F.leaky_relu(x, modules.LRELU_SLOPE) - x = self.ups[i](x) - x_source = self.noise_convs[i](har_source) - x = x + x_source - xs = None - for j in range(self.num_kernels): - if xs is None: - xs = self.resblocks[i * self.num_kernels + j](x) - else: - xs += self.resblocks[i * self.num_kernels + j](x) - x = xs / self.num_kernels - x = F.leaky_relu(x) - x = self.conv_post(x) - x = torch.tanh(x) - return x - - def remove_weight_norm(self): - for l in self.ups: - remove_weight_norm(l) - for l in self.resblocks: - l.remove_weight_norm() - - -sr2sr = { - "32k": 32000, - "40k": 40000, - "48k": 48000, -} - - -class SynthesizerTrnMsNSFsidM(nn.Module): - def __init__( - self, - spec_channels, - segment_size, - inter_channels, - hidden_channels, - filter_channels, - n_heads, - n_layers, - kernel_size, - p_dropout, - resblock, - resblock_kernel_sizes, - resblock_dilation_sizes, - upsample_rates, - upsample_initial_channel, - upsample_kernel_sizes, - spk_embed_dim, - gin_channels, - sr, - version, - **kwargs - ): - super().__init__() - if type(sr) == type("strr"): - sr = sr2sr[sr] - self.spec_channels = spec_channels - self.inter_channels = inter_channels - self.hidden_channels = hidden_channels - self.filter_channels = filter_channels - self.n_heads = n_heads - self.n_layers = n_layers - self.kernel_size = kernel_size - self.p_dropout = p_dropout - self.resblock = resblock - self.resblock_kernel_sizes = resblock_kernel_sizes - self.resblock_dilation_sizes = resblock_dilation_sizes - self.upsample_rates = upsample_rates - self.upsample_initial_channel = upsample_initial_channel - self.upsample_kernel_sizes = upsample_kernel_sizes - self.segment_size = segment_size - self.gin_channels = gin_channels - # self.hop_length = hop_length# - self.spk_embed_dim = spk_embed_dim - if version == "v1": - self.enc_p = TextEncoder256( - inter_channels, - hidden_channels, - filter_channels, - n_heads, - n_layers, - kernel_size, - p_dropout, - ) - else: - self.enc_p = TextEncoder768( - inter_channels, - hidden_channels, - filter_channels, - n_heads, - 
n_layers, - kernel_size, - p_dropout, - ) - self.dec = GeneratorNSF( - inter_channels, - resblock, - resblock_kernel_sizes, - resblock_dilation_sizes, - upsample_rates, - upsample_initial_channel, - upsample_kernel_sizes, - gin_channels=gin_channels, - sr=sr, - is_half=kwargs["is_half"], - ) - self.enc_q = PosteriorEncoder( - spec_channels, - inter_channels, - hidden_channels, - 5, - 1, - 16, - gin_channels=gin_channels, - ) - self.flow = ResidualCouplingBlock( - inter_channels, hidden_channels, 5, 1, 3, gin_channels=gin_channels - ) - self.emb_g = nn.Embedding(self.spk_embed_dim, gin_channels) - self.speaker_map = None - print("gin_channels:", gin_channels, "self.spk_embed_dim:", self.spk_embed_dim) - - def remove_weight_norm(self): - self.dec.remove_weight_norm() - self.flow.remove_weight_norm() - self.enc_q.remove_weight_norm() - - def construct_spkmixmap(self, n_speaker): - self.speaker_map = torch.zeros((n_speaker, 1, 1, self.gin_channels)) - for i in range(n_speaker): - self.speaker_map[i] = self.emb_g(torch.LongTensor([[i]])) - self.speaker_map = self.speaker_map.unsqueeze(0) - - def forward(self, phone, phone_lengths, pitch, nsff0, g, rnd, max_len=None): - if self.speaker_map is not None: # [N, S] * [S, B, 1, H] - g = g.reshape((g.shape[0], g.shape[1], 1, 1, 1)) # [N, S, B, 1, 1] - g = g * self.speaker_map # [N, S, B, 1, H] - g = torch.sum(g, dim=1) # [N, 1, B, 1, H] - g = g.transpose(0, -1).transpose(0, -2).squeeze(0) # [B, H, N] - else: - g = g.unsqueeze(0) - g = self.emb_g(g).transpose(1, 2) - - m_p, logs_p, x_mask = self.enc_p(phone, pitch, phone_lengths) - z_p = (m_p + torch.exp(logs_p) * rnd) * x_mask - z = self.flow(z_p, x_mask, g=g, reverse=True) - o = self.dec((z * x_mask)[:, :, :max_len], nsff0, g=g) - return o - - -class MultiPeriodDiscriminator(torch.nn.Module): - def __init__(self, use_spectral_norm=False): - super(MultiPeriodDiscriminator, self).__init__() - periods = [2, 3, 5, 7, 11, 17] - # periods = [3, 5, 7, 11, 17, 23, 37] - - discs = [DiscriminatorS(use_spectral_norm=use_spectral_norm)] - discs = discs + [ - DiscriminatorP(i, use_spectral_norm=use_spectral_norm) for i in periods - ] - self.discriminators = nn.ModuleList(discs) - - def forward(self, y, y_hat): - y_d_rs = [] # - y_d_gs = [] - fmap_rs = [] - fmap_gs = [] - for i, d in enumerate(self.discriminators): - y_d_r, fmap_r = d(y) - y_d_g, fmap_g = d(y_hat) - # for j in range(len(fmap_r)): - # print(i,j,y.shape,y_hat.shape,fmap_r[j].shape,fmap_g[j].shape) - y_d_rs.append(y_d_r) - y_d_gs.append(y_d_g) - fmap_rs.append(fmap_r) - fmap_gs.append(fmap_g) - - return y_d_rs, y_d_gs, fmap_rs, fmap_gs - - -class MultiPeriodDiscriminatorV2(torch.nn.Module): - def __init__(self, use_spectral_norm=False): - super(MultiPeriodDiscriminatorV2, self).__init__() - # periods = [2, 3, 5, 7, 11, 17] - periods = [2, 3, 5, 7, 11, 17, 23, 37] - - discs = [DiscriminatorS(use_spectral_norm=use_spectral_norm)] - discs = discs + [ - DiscriminatorP(i, use_spectral_norm=use_spectral_norm) for i in periods - ] - self.discriminators = nn.ModuleList(discs) - - def forward(self, y, y_hat): - y_d_rs = [] # - y_d_gs = [] - fmap_rs = [] - fmap_gs = [] - for i, d in enumerate(self.discriminators): - y_d_r, fmap_r = d(y) - y_d_g, fmap_g = d(y_hat) - # for j in range(len(fmap_r)): - # print(i,j,y.shape,y_hat.shape,fmap_r[j].shape,fmap_g[j].shape) - y_d_rs.append(y_d_r) - y_d_gs.append(y_d_g) - fmap_rs.append(fmap_r) - fmap_gs.append(fmap_g) - - return y_d_rs, y_d_gs, fmap_rs, fmap_gs - - -class DiscriminatorS(torch.nn.Module): - def 
__init__(self, use_spectral_norm=False): - super(DiscriminatorS, self).__init__() - norm_f = weight_norm if use_spectral_norm == False else spectral_norm - self.convs = nn.ModuleList( - [ - norm_f(Conv1d(1, 16, 15, 1, padding=7)), - norm_f(Conv1d(16, 64, 41, 4, groups=4, padding=20)), - norm_f(Conv1d(64, 256, 41, 4, groups=16, padding=20)), - norm_f(Conv1d(256, 1024, 41, 4, groups=64, padding=20)), - norm_f(Conv1d(1024, 1024, 41, 4, groups=256, padding=20)), - norm_f(Conv1d(1024, 1024, 5, 1, padding=2)), - ] - ) - self.conv_post = norm_f(Conv1d(1024, 1, 3, 1, padding=1)) - - def forward(self, x): - fmap = [] - - for l in self.convs: - x = l(x) - x = F.leaky_relu(x, modules.LRELU_SLOPE) - fmap.append(x) - x = self.conv_post(x) - fmap.append(x) - x = torch.flatten(x, 1, -1) - - return x, fmap - - -class DiscriminatorP(torch.nn.Module): - def __init__(self, period, kernel_size=5, stride=3, use_spectral_norm=False): - super(DiscriminatorP, self).__init__() - self.period = period - self.use_spectral_norm = use_spectral_norm - norm_f = weight_norm if use_spectral_norm == False else spectral_norm - self.convs = nn.ModuleList( - [ - norm_f( - Conv2d( - 1, - 32, - (kernel_size, 1), - (stride, 1), - padding=(get_padding(kernel_size, 1), 0), - ) - ), - norm_f( - Conv2d( - 32, - 128, - (kernel_size, 1), - (stride, 1), - padding=(get_padding(kernel_size, 1), 0), - ) - ), - norm_f( - Conv2d( - 128, - 512, - (kernel_size, 1), - (stride, 1), - padding=(get_padding(kernel_size, 1), 0), - ) - ), - norm_f( - Conv2d( - 512, - 1024, - (kernel_size, 1), - (stride, 1), - padding=(get_padding(kernel_size, 1), 0), - ) - ), - norm_f( - Conv2d( - 1024, - 1024, - (kernel_size, 1), - 1, - padding=(get_padding(kernel_size, 1), 0), - ) - ), - ] - ) - self.conv_post = norm_f(Conv2d(1024, 1, (3, 1), 1, padding=(1, 0))) - - def forward(self, x): - fmap = [] - - # 1d to 2d - b, c, t = x.shape - if t % self.period != 0: # pad first - n_pad = self.period - (t % self.period) - x = F.pad(x, (0, n_pad), "reflect") - t = t + n_pad - x = x.view(b, c, t // self.period, self.period) - - for l in self.convs: - x = l(x) - x = F.leaky_relu(x, modules.LRELU_SLOPE) - fmap.append(x) - x = self.conv_post(x) - fmap.append(x) - x = torch.flatten(x, 1, -1) - - return x, fmap diff --git a/lib/infer_pack/modules/F0Predictor/DioF0Predictor.py b/lib/infer_pack/modules/F0Predictor/DioF0Predictor.py deleted file mode 100644 index b5a8e3e..0000000 --- a/lib/infer_pack/modules/F0Predictor/DioF0Predictor.py +++ /dev/null @@ -1,90 +0,0 @@ -from lib.infer_pack.modules.F0Predictor.F0Predictor import F0Predictor -import pyworld -import numpy as np - - -class DioF0Predictor(F0Predictor): - def __init__(self, hop_length=512, f0_min=50, f0_max=1100, sampling_rate=44100): - self.hop_length = hop_length - self.f0_min = f0_min - self.f0_max = f0_max - self.sampling_rate = sampling_rate - - def interpolate_f0(self, f0): - """ - 对F0进行插值处理 - """ - - data = np.reshape(f0, (f0.size, 1)) - - vuv_vector = np.zeros((data.size, 1), dtype=np.float32) - vuv_vector[data > 0.0] = 1.0 - vuv_vector[data <= 0.0] = 0.0 - - ip_data = data - - frame_number = data.size - last_value = 0.0 - for i in range(frame_number): - if data[i] <= 0.0: - j = i + 1 - for j in range(i + 1, frame_number): - if data[j] > 0.0: - break - if j < frame_number - 1: - if last_value > 0.0: - step = (data[j] - data[i - 1]) / float(j - i) - for k in range(i, j): - ip_data[k] = data[i - 1] + step * (k - i + 1) - else: - for k in range(i, j): - ip_data[k] = data[j] - else: - for k in range(i, frame_number): 
- ip_data[k] = last_value - else: - ip_data[i] = data[i] # 这里可能存在一个没有必要的拷贝 - last_value = data[i] - - return ip_data[:, 0], vuv_vector[:, 0] - - def resize_f0(self, x, target_len): - source = np.array(x) - source[source < 0.001] = np.nan - target = np.interp( - np.arange(0, len(source) * target_len, len(source)) / target_len, - np.arange(0, len(source)), - source, - ) - res = np.nan_to_num(target) - return res - - def compute_f0(self, wav, p_len=None): - if p_len is None: - p_len = wav.shape[0] // self.hop_length - f0, t = pyworld.dio( - wav.astype(np.double), - fs=self.sampling_rate, - f0_floor=self.f0_min, - f0_ceil=self.f0_max, - frame_period=1000 * self.hop_length / self.sampling_rate, - ) - f0 = pyworld.stonemask(wav.astype(np.double), f0, t, self.sampling_rate) - for index, pitch in enumerate(f0): - f0[index] = round(pitch, 1) - return self.interpolate_f0(self.resize_f0(f0, p_len))[0] - - def compute_f0_uv(self, wav, p_len=None): - if p_len is None: - p_len = wav.shape[0] // self.hop_length - f0, t = pyworld.dio( - wav.astype(np.double), - fs=self.sampling_rate, - f0_floor=self.f0_min, - f0_ceil=self.f0_max, - frame_period=1000 * self.hop_length / self.sampling_rate, - ) - f0 = pyworld.stonemask(wav.astype(np.double), f0, t, self.sampling_rate) - for index, pitch in enumerate(f0): - f0[index] = round(pitch, 1) - return self.interpolate_f0(self.resize_f0(f0, p_len)) diff --git a/lib/infer_pack/modules/F0Predictor/F0Predictor.py b/lib/infer_pack/modules/F0Predictor/F0Predictor.py deleted file mode 100644 index 0d81b05..0000000 --- a/lib/infer_pack/modules/F0Predictor/F0Predictor.py +++ /dev/null @@ -1,16 +0,0 @@ -class F0Predictor(object): - def compute_f0(self, wav, p_len): - """ - input: wav:[signal_length] - p_len:int - output: f0:[signal_length//hop_length] - """ - pass - - def compute_f0_uv(self, wav, p_len): - """ - input: wav:[signal_length] - p_len:int - output: f0:[signal_length//hop_length],uv:[signal_length//hop_length] - """ - pass diff --git a/lib/infer_pack/modules/F0Predictor/HarvestF0Predictor.py b/lib/infer_pack/modules/F0Predictor/HarvestF0Predictor.py deleted file mode 100644 index f8dae30..0000000 --- a/lib/infer_pack/modules/F0Predictor/HarvestF0Predictor.py +++ /dev/null @@ -1,86 +0,0 @@ -from lib.infer_pack.modules.F0Predictor.F0Predictor import F0Predictor -import pyworld -import numpy as np - - -class HarvestF0Predictor(F0Predictor): - def __init__(self, hop_length=512, f0_min=50, f0_max=1100, sampling_rate=44100): - self.hop_length = hop_length - self.f0_min = f0_min - self.f0_max = f0_max - self.sampling_rate = sampling_rate - - def interpolate_f0(self, f0): - """ - 对F0进行插值处理 - """ - - data = np.reshape(f0, (f0.size, 1)) - - vuv_vector = np.zeros((data.size, 1), dtype=np.float32) - vuv_vector[data > 0.0] = 1.0 - vuv_vector[data <= 0.0] = 0.0 - - ip_data = data - - frame_number = data.size - last_value = 0.0 - for i in range(frame_number): - if data[i] <= 0.0: - j = i + 1 - for j in range(i + 1, frame_number): - if data[j] > 0.0: - break - if j < frame_number - 1: - if last_value > 0.0: - step = (data[j] - data[i - 1]) / float(j - i) - for k in range(i, j): - ip_data[k] = data[i - 1] + step * (k - i + 1) - else: - for k in range(i, j): - ip_data[k] = data[j] - else: - for k in range(i, frame_number): - ip_data[k] = last_value - else: - ip_data[i] = data[i] # 这里可能存在一个没有必要的拷贝 - last_value = data[i] - - return ip_data[:, 0], vuv_vector[:, 0] - - def resize_f0(self, x, target_len): - source = np.array(x) - source[source < 0.001] = np.nan - target = np.interp( - 
np.arange(0, len(source) * target_len, len(source)) / target_len, - np.arange(0, len(source)), - source, - ) - res = np.nan_to_num(target) - return res - - def compute_f0(self, wav, p_len=None): - if p_len is None: - p_len = wav.shape[0] // self.hop_length - f0, t = pyworld.harvest( - wav.astype(np.double), - fs=self.hop_length, - f0_ceil=self.f0_max, - f0_floor=self.f0_min, - frame_period=1000 * self.hop_length / self.sampling_rate, - ) - f0 = pyworld.stonemask(wav.astype(np.double), f0, t, self.fs) - return self.interpolate_f0(self.resize_f0(f0, p_len))[0] - - def compute_f0_uv(self, wav, p_len=None): - if p_len is None: - p_len = wav.shape[0] // self.hop_length - f0, t = pyworld.harvest( - wav.astype(np.double), - fs=self.sampling_rate, - f0_floor=self.f0_min, - f0_ceil=self.f0_max, - frame_period=1000 * self.hop_length / self.sampling_rate, - ) - f0 = pyworld.stonemask(wav.astype(np.double), f0, t, self.sampling_rate) - return self.interpolate_f0(self.resize_f0(f0, p_len)) diff --git a/lib/infer_pack/modules/F0Predictor/PMF0Predictor.py b/lib/infer_pack/modules/F0Predictor/PMF0Predictor.py deleted file mode 100644 index b70de29..0000000 --- a/lib/infer_pack/modules/F0Predictor/PMF0Predictor.py +++ /dev/null @@ -1,97 +0,0 @@ -from lib.infer_pack.modules.F0Predictor.F0Predictor import F0Predictor -import parselmouth -import numpy as np - - -class PMF0Predictor(F0Predictor): - def __init__(self, hop_length=512, f0_min=50, f0_max=1100, sampling_rate=44100): - self.hop_length = hop_length - self.f0_min = f0_min - self.f0_max = f0_max - self.sampling_rate = sampling_rate - - def interpolate_f0(self, f0): - """ - 对F0进行插值处理 - """ - - data = np.reshape(f0, (f0.size, 1)) - - vuv_vector = np.zeros((data.size, 1), dtype=np.float32) - vuv_vector[data > 0.0] = 1.0 - vuv_vector[data <= 0.0] = 0.0 - - ip_data = data - - frame_number = data.size - last_value = 0.0 - for i in range(frame_number): - if data[i] <= 0.0: - j = i + 1 - for j in range(i + 1, frame_number): - if data[j] > 0.0: - break - if j < frame_number - 1: - if last_value > 0.0: - step = (data[j] - data[i - 1]) / float(j - i) - for k in range(i, j): - ip_data[k] = data[i - 1] + step * (k - i + 1) - else: - for k in range(i, j): - ip_data[k] = data[j] - else: - for k in range(i, frame_number): - ip_data[k] = last_value - else: - ip_data[i] = data[i] # 这里可能存在一个没有必要的拷贝 - last_value = data[i] - - return ip_data[:, 0], vuv_vector[:, 0] - - def compute_f0(self, wav, p_len=None): - x = wav - if p_len is None: - p_len = x.shape[0] // self.hop_length - else: - assert abs(p_len - x.shape[0] // self.hop_length) < 4, "pad length error" - time_step = self.hop_length / self.sampling_rate * 1000 - f0 = ( - parselmouth.Sound(x, self.sampling_rate) - .to_pitch_ac( - time_step=time_step / 1000, - voicing_threshold=0.6, - pitch_floor=self.f0_min, - pitch_ceiling=self.f0_max, - ) - .selected_array["frequency"] - ) - - pad_size = (p_len - len(f0) + 1) // 2 - if pad_size > 0 or p_len - len(f0) - pad_size > 0: - f0 = np.pad(f0, [[pad_size, p_len - len(f0) - pad_size]], mode="constant") - f0, uv = self.interpolate_f0(f0) - return f0 - - def compute_f0_uv(self, wav, p_len=None): - x = wav - if p_len is None: - p_len = x.shape[0] // self.hop_length - else: - assert abs(p_len - x.shape[0] // self.hop_length) < 4, "pad length error" - time_step = self.hop_length / self.sampling_rate * 1000 - f0 = ( - parselmouth.Sound(x, self.sampling_rate) - .to_pitch_ac( - time_step=time_step / 1000, - voicing_threshold=0.6, - pitch_floor=self.f0_min, - pitch_ceiling=self.f0_max, 
- ) - .selected_array["frequency"] - ) - - pad_size = (p_len - len(f0) + 1) // 2 - if pad_size > 0 or p_len - len(f0) - pad_size > 0: - f0 = np.pad(f0, [[pad_size, p_len - len(f0) - pad_size]], mode="constant") - f0, uv = self.interpolate_f0(f0) - return f0, uv diff --git a/lib/infer_pack/modules/F0Predictor/__init__.py b/lib/infer_pack/modules/F0Predictor/__init__.py deleted file mode 100644 index e69de29..0000000 diff --git a/lib/infer_pack/onnx_inference.py b/lib/infer_pack/onnx_inference.py deleted file mode 100644 index b4aba75..0000000 --- a/lib/infer_pack/onnx_inference.py +++ /dev/null @@ -1,145 +0,0 @@ -import onnxruntime -import librosa -import numpy as np -import soundfile - - -class ContentVec: - def __init__(self, vec_path="pretrained/vec-768-layer-12.onnx", device=None): - print("load model(s) from {}".format(vec_path)) - if device == "cpu" or device is None: - providers = ["CPUExecutionProvider"] - elif device == "cuda": - providers = ["CUDAExecutionProvider", "CPUExecutionProvider"] - elif device == "dml": - providers = ["DmlExecutionProvider"] - else: - raise RuntimeError("Unsportted Device") - self.model = onnxruntime.InferenceSession(vec_path, providers=providers) - - def __call__(self, wav): - return self.forward(wav) - - def forward(self, wav): - feats = wav - if feats.ndim == 2: # double channels - feats = feats.mean(-1) - assert feats.ndim == 1, feats.ndim - feats = np.expand_dims(np.expand_dims(feats, 0), 0) - onnx_input = {self.model.get_inputs()[0].name: feats} - logits = self.model.run(None, onnx_input)[0] - return logits.transpose(0, 2, 1) - - -def get_f0_predictor(f0_predictor, hop_length, sampling_rate, **kargs): - if f0_predictor == "pm": - from lib.infer_pack.modules.F0Predictor.PMF0Predictor import PMF0Predictor - - f0_predictor_object = PMF0Predictor( - hop_length=hop_length, sampling_rate=sampling_rate - ) - elif f0_predictor == "harvest": - from lib.infer_pack.modules.F0Predictor.HarvestF0Predictor import ( - HarvestF0Predictor, - ) - - f0_predictor_object = HarvestF0Predictor( - hop_length=hop_length, sampling_rate=sampling_rate - ) - elif f0_predictor == "dio": - from lib.infer_pack.modules.F0Predictor.DioF0Predictor import DioF0Predictor - - f0_predictor_object = DioF0Predictor( - hop_length=hop_length, sampling_rate=sampling_rate - ) - else: - raise Exception("Unknown f0 predictor") - return f0_predictor_object - - -class OnnxRVC: - def __init__( - self, - model_path, - sr=40000, - hop_size=512, - vec_path="vec-768-layer-12", - device="cpu", - ): - vec_path = f"pretrained/{vec_path}.onnx" - self.vec_model = ContentVec(vec_path, device) - if device == "cpu" or device is None: - providers = ["CPUExecutionProvider"] - elif device == "cuda": - providers = ["CUDAExecutionProvider", "CPUExecutionProvider"] - elif device == "dml": - providers = ["DmlExecutionProvider"] - else: - raise RuntimeError("Unsportted Device") - self.model = onnxruntime.InferenceSession(model_path, providers=providers) - self.sampling_rate = sr - self.hop_size = hop_size - - def forward(self, hubert, hubert_length, pitch, pitchf, ds, rnd): - onnx_input = { - self.model.get_inputs()[0].name: hubert, - self.model.get_inputs()[1].name: hubert_length, - self.model.get_inputs()[2].name: pitch, - self.model.get_inputs()[3].name: pitchf, - self.model.get_inputs()[4].name: ds, - self.model.get_inputs()[5].name: rnd, - } - return (self.model.run(None, onnx_input)[0] * 32767).astype(np.int16) - - def inference( - self, - raw_path, - sid, - f0_method="dio", - f0_up_key=0, - pad_time=0.5, - 
cr_threshold=0.02, - ): - f0_min = 50 - f0_max = 1100 - f0_mel_min = 1127 * np.log(1 + f0_min / 700) - f0_mel_max = 1127 * np.log(1 + f0_max / 700) - f0_predictor = get_f0_predictor( - f0_method, - hop_length=self.hop_size, - sampling_rate=self.sampling_rate, - threshold=cr_threshold, - ) - wav, sr = librosa.load(raw_path, sr=self.sampling_rate) - org_length = len(wav) - if org_length / sr > 50.0: - raise RuntimeError("Reached Max Length") - - wav16k = librosa.resample(wav, orig_sr=self.sampling_rate, target_sr=16000) - wav16k = wav16k - - hubert = self.vec_model(wav16k) - hubert = np.repeat(hubert, 2, axis=2).transpose(0, 2, 1).astype(np.float32) - hubert_length = hubert.shape[1] - - pitchf = f0_predictor.compute_f0(wav, hubert_length) - pitchf = pitchf * 2 ** (f0_up_key / 12) - pitch = pitchf.copy() - f0_mel = 1127 * np.log(1 + pitch / 700) - f0_mel[f0_mel > 0] = (f0_mel[f0_mel > 0] - f0_mel_min) * 254 / ( - f0_mel_max - f0_mel_min - ) + 1 - f0_mel[f0_mel <= 1] = 1 - f0_mel[f0_mel > 255] = 255 - pitch = np.rint(f0_mel).astype(np.int64) - - pitchf = pitchf.reshape(1, len(pitchf)).astype(np.float32) - pitch = pitch.reshape(1, len(pitch)) - ds = np.array([sid]).astype(np.int64) - - rnd = np.random.randn(1, 192, hubert_length).astype(np.float32) - hubert_length = np.array([hubert_length]).astype(np.int64) - - out_wav = self.forward(hubert, hubert_length, pitch, pitchf, ds, rnd).squeeze() - out_wav = np.pad(out_wav, (0, 2 * self.hop_size), "constant") - return out_wav[0:org_length] diff --git a/soni_translate/logging_setup.py b/soni_translate/logging_setup.py index bb17bef..28f0ce6 100644 --- a/soni_translate/logging_setup.py +++ b/soni_translate/logging_setup.py @@ -10,6 +10,7 @@ def configure_logging_libs(debug=False): ) modules = [ "numba", "httpx", "markdown_it", "speechbrain", "fairseq", "pyannote", + "faiss", "pytorch_lightning.utilities.migration.utils", "pytorch_lightning.utilities.migration", "pytorch_lightning", diff --git a/soni_translate/text_multiformat_processor.py b/soni_translate/text_multiformat_processor.py index 560b59e..9ba6871 100644 --- a/soni_translate/text_multiformat_processor.py +++ b/soni_translate/text_multiformat_processor.py @@ -1,6 +1,6 @@ from .logging_setup import logger from whisperx.utils import get_writer -from .utils import remove_files +from .utils import remove_files, run_command import srt import re import os @@ -53,7 +53,15 @@ def clean_text(text): def srt_file_to_segments(file_path, speaker=False): - srt_content_list = extract_from_srt(file_path) + try: + srt_content_list = extract_from_srt(file_path) + except Exception as error: + logger.error(str(error)) + fixed_file = "fixed_sub.srt" + remove_files(fixed_file) + fix_sub = f'ffmpeg -i "{file_path}" "{fixed_file}" -y' + run_command(fix_sub) + srt_content_list = extract_from_srt(fixed_file) segments = [] for segment in srt_content_list: @@ -240,6 +248,11 @@ def get_subtitle( if not filename: filename = "task_subtitle" + is_ass_extension = False + if extension == "ass": + is_ass_extension = True + extension = "srt" + sub_file = filename + "." 
+ extension support_name = filename + ".mp3" remove_files(sub_file) @@ -270,6 +283,13 @@ def get_subtitle( word_options, ) + if is_ass_extension: + temp_name = filename + ".ass" + remove_files(temp_name) + convert_sub = f'ffmpeg -i "{sub_file}" "{temp_name}" -y' + run_command(convert_sub) + sub_file = temp_name + return sub_file diff --git a/soni_translate/text_to_speech.py b/soni_translate/text_to_speech.py index 4e1ddc0..7e67e65 100644 --- a/soni_translate/text_to_speech.py +++ b/soni_translate/text_to_speech.py @@ -484,13 +484,16 @@ def create_wav_file_vc( from .mdx_net import process_uvr_task - _, _, _, _, vocals_dereverb_path = process_uvr_task( - orig_song_path=audio_segment, - main_vocals=True, - dereverb=get_vocals_dereverb, - ) + try: + _, _, _, _, audio_segment = process_uvr_task( + orig_song_path=audio_segment, + main_vocals=True, + dereverb=get_vocals_dereverb, + ) + except Exception as error: + logger.error(str(error)) - sample = convert_to_xtts_good_sample(vocals_dereverb_path) + sample = convert_to_xtts_good_sample(audio_segment) sample_name = f"{sample_name}.wav" sample_rename = rename_file(sample, sample_name) @@ -1092,6 +1095,7 @@ def accelerate_segments( ) audio_files.append(f"{folder_output}/{filename}") + speaker = "TTS Speaker " + str(int(speaker[-1]) + 1) speakers_list.append(speaker) return audio_files, speakers_list diff --git a/soni_translate/utils.py b/soni_translate/utils.py index be738b6..ad70780 100644 --- a/soni_translate/utils.py +++ b/soni_translate/utils.py @@ -60,17 +60,17 @@ def upload_model_list(): models = [] for name in os.listdir(weight_root): if name.endswith(".pth"): - models.append(name) + models.append("weights/" + name) if models: - logger.info(models) + logger.debug(models) index_root = "logs" - index_paths = [] + index_paths = [None] for name in os.listdir(index_root): if name.endswith(".index"): index_paths.append("logs/" + name) if index_paths: - logger.info(index_paths) + logger.debug(index_paths) return models, index_paths diff --git a/voice_main.py b/voice_main.py index a6df058..f6b33de 100644 --- a/voice_main.py +++ b/voice_main.py @@ -1,244 +1,48 @@ -import torch from soni_translate.logging_setup import logger +import torch +import gc +import numpy as np +import os +import shutil +import warnings +import threading +from tqdm import tqdm from lib.infer_pack.models import ( SynthesizerTrnMs256NSFsid, SynthesizerTrnMs256NSFsid_nono, SynthesizerTrnMs768NSFsid, SynthesizerTrnMs768NSFsid_nono, ) -from vci_pipeline import VC -import traceback, pdb from lib.audio import load_audio -import numpy as np -import os, shutil -from fairseq import checkpoint_utils import soundfile as sf -from gtts import gTTS import edge_tts import asyncio from soni_translate.utils import remove_directory_contents, create_directories +from scipy import signal +from time import time as ttime +import faiss +from vci_pipeline import VC, change_rms, bh, ah +import librosa +warnings.filterwarnings("ignore") -def generate_inference(sid, to_return_protect0, to_return_protect1): - global n_spk, tgt_sr, net_g, vc, cpt, version - if sid == "" or sid == []: - global hubert_model - if hubert_model is not None: # change model or not - logger.debug("Clean empty cache") - del net_g, n_spk, vc, hubert_model, tgt_sr # ,cpt - hubert_model = net_g = n_spk = vc = hubert_model = tgt_sr = None - if torch.cuda.is_available(): - torch.cuda.empty_cache() - # if clean - if_f0 = cpt.get("f0", 1) - version = cpt.get("version", "v1") - if version == "v1": - if if_f0 == 1: - net_g = 
SynthesizerTrnMs256NSFsid( - *cpt["config"], is_half=config.is_half - ) - else: - net_g = SynthesizerTrnMs256NSFsid_nono(*cpt["config"]) - elif version == "v2": - if if_f0 == 1: - net_g = SynthesizerTrnMs768NSFsid( - *cpt["config"], is_half=config.is_half - ) - else: - net_g = SynthesizerTrnMs768NSFsid_nono(*cpt["config"]) - del net_g, cpt - if torch.cuda.is_available(): - torch.cuda.empty_cache() - return {"visible": False, "__type__": "update"} - person = "%s/%s" % (weight_root, sid) - logger.info("Loading %s" % person) - cpt = torch.load(person, map_location="cpu") - tgt_sr = cpt["config"][-1] - cpt["config"][-3] = cpt["weight"]["emb_g.weight"].shape[0] # n_spk - if_f0 = cpt.get("f0", 1) - if if_f0 == 0: - to_return_protect0 = to_return_protect1 = { - "visible": False, - "value": 0.5, - "__type__": "update", - } - else: - to_return_protect0 = { - "visible": True, - "value": to_return_protect0, - "__type__": "update", - } - to_return_protect1 = { - "visible": True, - "value": to_return_protect1, - "__type__": "update", - } - version = cpt.get("version", "v1") - if version == "v1": - if if_f0 == 1: - net_g = SynthesizerTrnMs256NSFsid(*cpt["config"], is_half=config.is_half) - else: - net_g = SynthesizerTrnMs256NSFsid_nono(*cpt["config"]) - elif version == "v2": - if if_f0 == 1: - net_g = SynthesizerTrnMs768NSFsid(*cpt["config"], is_half=config.is_half) - else: - net_g = SynthesizerTrnMs768NSFsid_nono(*cpt["config"]) - del net_g.enc_q - print(net_g.load_state_dict(cpt["weight"], strict=False)) - net_g.eval().to(config.device) - if config.is_half: - net_g = net_g.half() - else: - net_g = net_g.float() - vc = VC(tgt_sr, config) - n_spk = cpt["config"][-3] - return ( - {"visible": True, "maximum": n_spk, "__type__": "update"}, - to_return_protect0, - to_return_protect1, - ) - - -# inference -def vc_single( - sid, - input_audio_path, - f0_up_key, - f0_file, - f0_method, - file_index, - file_index2, - # file_big_npy, - index_rate, - filter_radius, - resample_sr, - rms_mix_rate, - protect, -): - global tgt_sr, net_g, vc, hubert_model, version, cpt - if input_audio_path is None: - return "You need to upload an audio", None - f0_up_key = int(f0_up_key) - try: - audio = load_audio(input_audio_path, 16000) - audio_max = np.abs(audio).max() / 0.95 - if audio_max > 1: - audio /= audio_max - times = [0, 0, 0] - if not hubert_model: - load_hubert() - if_f0 = cpt.get("f0", 1) - file_index = ( - ( - file_index.strip(" ") - .strip('"') - .strip("\n") - .strip('"') - .strip(" ") - .replace("trained", "added") - ) - if file_index != "" - else file_index2 - ) # reemplace for 2 - # file_big_npy = ( - # file_big_npy.strip(" ").strip('"').strip("\n").strip('"').strip(" ") - # ) - audio_opt = vc.pipeline( - hubert_model, - net_g, - sid, - audio, - input_audio_path, - times, - f0_up_key, - f0_method, - file_index, - # file_big_npy, - index_rate, - if_f0, - filter_radius, - tgt_sr, - resample_sr, - rms_mix_rate, - version, - protect, - f0_file=f0_file, - ) - if tgt_sr != resample_sr >= 16000: - tgt_sr = resample_sr - index_info = ( - "Using index:%s." % file_index - if os.path.exists(file_index) - else "Index not used." 
- ) - return "Success.\n %s\nTime:\n npy:%ss, f0:%ss, infer:%ss" % ( - index_info, - times[0], - times[1], - times[2], - ), (tgt_sr, audio_opt) - except: - info = traceback.format_exc() - logger.error(str(info)) - return info, (None, None) - - -BASE_DOWNLOAD_LINK = "https://huggingface.co/r3gm/sonitranslate_voice_models/resolve/main/" -BASE_MODELS = [ - "hubert_base.pt", - "rmvpe.pt" -] -BASE_DIR = "." - -# hubert model -def load_hubert(): - global hubert_model - - from soni_translate.utils import download_manager - for id_model in BASE_MODELS: - download_manager( - os.path.join(BASE_DOWNLOAD_LINK, id_model), BASE_DIR - ) - models, _, _ = checkpoint_utils.load_model_ensemble_and_task( - ["hubert_base.pt"], - suffix="", - ) - hubert_model = models[0] - hubert_model = hubert_model.to(config.device) - if config.is_half: - hubert_model = hubert_model.half() - else: - hubert_model = hubert_model.float() - hubert_model.eval() - -# config cpu -def use_fp32_config(): - for config_file in [ - "32k.json", - "40k.json", - "48k.json", - "48k_v2.json", - "32k_v2.json", - ]: - with open(f"configs/{config_file}", "r") as f: - strr = f.read().replace("true", "false") - with open(f"configs/{config_file}", "w") as f: - f.write(strr) - -# config device and torch type class Config: - def __init__(self, device, is_half): - self.device = device - self.is_half = is_half - self.n_cpu = 2 # set cpu cores #################### + def __init__(self, only_cpu=False): + self.device = "cuda:0" + self.is_half = True + self.n_cpu = 0 self.gpu_name = None self.gpu_mem = None - self.x_pad, self.x_query, self.x_center, self.x_max = self.device_config() - - def device_config(self) -> tuple: - if torch.cuda.is_available(): + ( + self.x_pad, + self.x_query, + self.x_center, + self.x_max + ) = self.device_config(only_cpu) + + def device_config(self, only_cpu) -> tuple: + if torch.cuda.is_available() and not only_cpu: i_device = int(self.device.split(":")[-1]) self.gpu_name = torch.cuda.get_device_name(i_device) if ( @@ -248,17 +52,11 @@ def device_config(self) -> tuple: or "1070" in self.gpu_name or "1080" in self.gpu_name ): - logger.info("16/10 Series GPUs and P40 excel in single-precision tasks.") + logger.info( + "16/10 Series GPUs and P40 excel " + "in single-precision tasks." 
+ ) self.is_half = False - for config_file in ["32k.json", "40k.json", "48k.json"]: - with open(f"configs/{config_file}", "r") as f: - strr = f.read().replace("true", "false") - with open(f"configs/{config_file}", "w") as f: - f.write(strr) - with open("trainset_preprocess_pipeline_print.py", "r") as f: - strr = f.read().replace("3.7", "3.0") - with open("trainset_preprocess_pipeline_print.py", "w") as f: - f.write(strr) else: self.gpu_name = None self.gpu_mem = int( @@ -268,22 +66,16 @@ def device_config(self) -> tuple: / 1024 + 0.4 ) - if self.gpu_mem <= 4: - with open("trainset_preprocess_pipeline_print.py", "r") as f: - strr = f.read().replace("3.7", "3.0") - with open("trainset_preprocess_pipeline_print.py", "w") as f: - f.write(strr) - elif torch.backends.mps.is_available(): + elif torch.backends.mps.is_available() and not only_cpu: logger.info("Supported N-card not found, using MPS for inference") self.device = "mps" else: logger.info("No supported N-card found, using CPU for inference") self.device = "cpu" self.is_half = False - use_fp32_config() if self.n_cpu == 0: - self.n_cpu = cpu_count() + self.n_cpu = os.cpu_count() if self.is_half: # 6GB VRAM configuration @@ -298,270 +90,621 @@ def device_config(self) -> tuple: x_center = 38 x_max = 41 - if self.gpu_mem != None and self.gpu_mem <= 4: + if self.gpu_mem is not None and self.gpu_mem <= 4: x_pad = 1 x_query = 5 x_center = 30 x_max = 32 + logger.info( + f"Config: Device is {self.device}, " + f"half precision is {self.is_half}" + ) + return x_pad, x_query, x_center, x_max - logger.info(f"Config: Device is {self.device}, half precision is {self.is_half}") +BASE_DOWNLOAD_LINK = "https://huggingface.co/r3gm/sonitranslate_voice_models/resolve/main/" +BASE_MODELS = [ + "hubert_base.pt", + "rmvpe.pt" +] +BASE_DIR = "." + + +def load_hu_bert(config): + from fairseq import checkpoint_utils + from soni_translate.utils import download_manager + + for id_model in BASE_MODELS: + download_manager( + os.path.join(BASE_DOWNLOAD_LINK, id_model), BASE_DIR + ) + + models, _, _ = checkpoint_utils.load_model_ensemble_and_task( + ["hubert_base.pt"], + suffix="", + ) + hubert_model = models[0] + hubert_model = hubert_model.to(config.device) + if config.is_half: + hubert_model = hubert_model.half() + else: + hubert_model = hubert_model.float() + hubert_model.eval() + + return hubert_model + + +def load_trained_model(model_path, config): + + if not model_path: + raise ValueError("No model found") + + logger.info("Loading %s" % model_path) + cpt = torch.load(model_path, map_location="cpu") + tgt_sr = cpt["config"][-1] + cpt["config"][-3] = cpt["weight"]["emb_g.weight"].shape[0] # n_spk + if_f0 = cpt.get("f0", 1) + if if_f0 == 0: + # protect to 0.5 need? 
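+        # Note: the removed generate_inference() forced the protection value
+        # to 0.5 (and hid the sliders) for models without f0 data; in this
+        # refactor that adjustment is presumably left to the caller, so the
+        # branch is intentionally a no-op.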
+ pass + + version = cpt.get("version", "v1") + if version == "v1": + if if_f0 == 1: + net_g = SynthesizerTrnMs256NSFsid( + *cpt["config"], is_half=config.is_half + ) + else: + net_g = SynthesizerTrnMs256NSFsid_nono(*cpt["config"]) + elif version == "v2": + if if_f0 == 1: + net_g = SynthesizerTrnMs768NSFsid( + *cpt["config"], is_half=config.is_half + ) + else: + net_g = SynthesizerTrnMs768NSFsid_nono(*cpt["config"]) + del net_g.enc_q + + net_g.load_state_dict(cpt["weight"], strict=False) + net_g.eval().to(config.device) + + if config.is_half: + net_g = net_g.half() + else: + net_g = net_g.float() + + vc = VC(tgt_sr, config) + n_spk = cpt["config"][-3] + + return n_spk, tgt_sr, net_g, vc, cpt, version - return x_pad, x_query, x_center, x_max -# call inference class ClassVoices: - def __init__(self): - self.file_index = "" # root - - def apply_conf(self, f0method, - model_voice_path00, transpose00, file_index2_00, - model_voice_path01="", transpose01=0, file_index2_01="", - model_voice_path02="", transpose02=0, file_index2_02="", - model_voice_path03="", transpose03=0, file_index2_03="", - model_voice_path04="", transpose04=0, file_index2_04="", - model_voice_path05="", transpose05=0, file_index2_05="", - model_voice_path99="", transpose99=0, file_index2_99=""): - - #self.filename = filename - self.f0method = f0method - - self.model_voice_path00 = model_voice_path00 - self.transpose00 = transpose00 - self.file_index200 = file_index2_00 - - self.model_voice_path01 = model_voice_path01 - self.transpose01 = transpose01 - self.file_index201 = file_index2_01 - - self.model_voice_path02 = model_voice_path02 - self.transpose02 = transpose02 - self.file_index202 = file_index2_02 - - self.model_voice_path03 = model_voice_path03 - self.transpose03 = transpose03 - self.file_index203 = file_index2_03 - - self.model_voice_path04 = model_voice_path04 - self.transpose04 = transpose04 - self.file_index204 = file_index2_04 - - self.model_voice_path05 = model_voice_path05 - self.transpose05 = transpose05 - self.file_index205 = file_index2_05 - - self.model_voice_path99 = model_voice_path99 - self.transpose99 = transpose99 - self.file_index299 = file_index2_99 - return "CONFIGURATION APPLIED" - - def custom_voice(self, - _values, # filter indices - audio_files, # all audio files - model_voice_path='', - transpose=0, - f0method='pm', - file_index='', - file_index2='', - ): - - #hubert_model = None - - generate_inference( - sid=model_voice_path, # model path - to_return_protect0=0.33, - to_return_protect1=0.33 + def __init__(self, only_cpu=False): + self.model_config = {} + self.config = None + self.only_cpu = only_cpu + + def apply_conf( + self, + tag="base_model", + file_model="", + pitch_algo="pm", + pitch_lvl=0, + file_index="", + index_influence=0.66, + respiration_median_filtering=3, + envelope_ratio=0.25, + consonant_breath_protection=0.33, + resample_sr=0, + file_pitch_algo="", + ): + + if not file_model: + raise ValueError("Model not found") + + if file_index is None: + file_index = "" + + if file_pitch_algo is None: + file_pitch_algo = "" + + if not self.config: + self.config = Config(self.only_cpu) + self.hu_bert_model = None + + self.model_config[tag] = { + "file_model": file_model, + "pitch_algo": pitch_algo, + "pitch_lvl": pitch_lvl, # no decimal + "file_index": file_index, + "index_influence": index_influence, + "respiration_median_filtering": respiration_median_filtering, + "envelope_ratio": envelope_ratio, + "consonant_breath_protection": consonant_breath_protection, + "resample_sr": 
resample_sr, + "file_pitch_algo": file_pitch_algo, + } + return f"CONFIGURATION APPLIED FOR {tag}: {file_model}" + + def infer( + self, + params, + # load model + n_spk, + tgt_sr, + net_g, + pipe, + cpt, + version, + if_f0, + # load index + index_rate, + index, + big_npy, + # load f0 file + inp_f0, + # audio file + input_audio_path, + overwrite, + ): + + f0_method = params["pitch_algo"] + f0_up_key = params["pitch_lvl"] + filter_radius = params["respiration_median_filtering"] + resample_sr = params["resample_sr"] + rms_mix_rate = params["envelope_ratio"] + protect = params["consonant_breath_protection"] + + if not os.path.exists(input_audio_path): + raise ValueError( + "The audio file was not found or is not " + f"a valid file: {input_audio_path}" + ) + + f0_up_key = int(f0_up_key) + + audio = load_audio(input_audio_path, 16000) + + # Normalize audio + audio_max = np.abs(audio).max() / 0.95 + if audio_max > 1: + audio /= audio_max + + times = [0, 0, 0] + + # filters audio signal, pads it, computes sliding window sums, + # and extracts optimized time indices + audio = signal.filtfilt(bh, ah, audio) + audio_pad = np.pad( + audio, (pipe.window // 2, pipe.window // 2), mode="reflect" ) + opt_ts = [] + if audio_pad.shape[0] > pipe.t_max: + audio_sum = np.zeros_like(audio) + for i in range(pipe.window): + audio_sum += audio_pad[i:i - pipe.window] + for t in range(pipe.t_center, audio.shape[0], pipe.t_center): + opt_ts.append( + t + - pipe.t_query + + np.where( + np.abs(audio_sum[t - pipe.t_query: t + pipe.t_query]) + == np.abs(audio_sum[t - pipe.t_query: t + pipe.t_query]).min() + )[0][0] + ) - for _value_item in _values: - filename = audio_files[_value_item] if _value_item != "test" else audio_files[0] - #filename = audio_files[_value_item] - try: - logger.info(f"{audio_files[_value_item]}, {model_voice_path}") - except: - pass - - info_, (sample_, audio_output_) = vc_single( - sid=0, - input_audio_path=filename, # Original file - f0_up_key=transpose, # transpose for m to f and reverse 0 12 - f0_file=None, - f0_method= f0method, - file_index= file_index, # dir pwd? - file_index2= file_index2, - # file_big_npy1, - index_rate= float(0.66), - filter_radius= int(3), - resample_sr= int(0), - rms_mix_rate= float(0.25), - protect= float(0.33), + s = 0 + audio_opt = [] + t = None + t1 = ttime() + + sid_value = 0 + sid = torch.tensor(sid_value, device=pipe.device).unsqueeze(0).long() + + # Pads audio symmetrically, calculates length divided by window size. 
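+        # Reflection padding by pipe.t_pad gives the model context at both
+        # edges of the clip; the padded samples are trimmed again after
+        # conversion by the [pipe.t_pad_tgt:-pipe.t_pad_tgt] slices below.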
+ audio_pad = np.pad(audio, (pipe.t_pad, pipe.t_pad), mode="reflect") + p_len = audio_pad.shape[0] // pipe.window + + # Estimates pitch from audio signal + pitch, pitchf = None, None + if if_f0 == 1: + pitch, pitchf = pipe.get_f0( + input_audio_path, + audio_pad, + p_len, + f0_up_key, + f0_method, + filter_radius, + inp_f0, + ) + pitch = pitch[:p_len] + pitchf = pitchf[:p_len] + if pipe.device == "mps": + pitchf = pitchf.astype(np.float32) + pitch = torch.tensor( + pitch, device=pipe.device + ).unsqueeze(0).long() + pitchf = torch.tensor( + pitchf, device=pipe.device + ).unsqueeze(0).float() + + t2 = ttime() + times[1] += t2 - t1 + for t in opt_ts: + t = t // pipe.window * pipe.window + if if_f0 == 1: + pitch_slice = pitch[ + :, s // pipe.window: (t + pipe.t_pad2) // pipe.window + ] + pitchf_slice = pitchf[ + :, s // pipe.window: (t + pipe.t_pad2) // pipe.window + ] + else: + pitch_slice = None + pitchf_slice = None + + audio_slice = audio_pad[s:t + pipe.t_pad2 + pipe.window] + audio_opt.append( + pipe.vc( + self.hu_bert_model, + net_g, + sid, + audio_slice, + pitch_slice, + pitchf_slice, + times, + index, + big_npy, + index_rate, + version, + protect, + )[pipe.t_pad_tgt:-pipe.t_pad_tgt] ) + s = t + + pitch_end_slice = pitch[ + :, t // pipe.window: + ] if t is not None else pitch + pitchf_end_slice = pitchf[ + :, t // pipe.window: + ] if t is not None else pitchf + + audio_opt.append( + pipe.vc( + self.hu_bert_model, + net_g, + sid, + audio_pad[t:], + pitch_end_slice, + pitchf_end_slice, + times, + index, + big_npy, + index_rate, + version, + protect, + )[pipe.t_pad_tgt:-pipe.t_pad_tgt] + ) - sf.write( - file= filename, # Overwrite - samplerate=sample_, - data=audio_output_ + audio_opt = np.concatenate(audio_opt) + if rms_mix_rate != 1: + audio_opt = change_rms( + audio, 16000, audio_opt, tgt_sr, rms_mix_rate + ) + if resample_sr >= 16000 and tgt_sr != resample_sr: + audio_opt = librosa.resample( + audio_opt, orig_sr=tgt_sr, target_sr=resample_sr ) + audio_max = np.abs(audio_opt).max() / 0.99 + max_int16 = 32768 + if audio_max > 1: + max_int16 /= audio_max + audio_opt = (audio_opt * max_int16).astype(np.int16) + del pitch, pitchf, sid + if torch.cuda.is_available(): + torch.cuda.empty_cache() + + if tgt_sr != resample_sr >= 16000: + final_sr = resample_sr + else: + final_sr = tgt_sr + + """ + "Success.\n %s\nTime:\n npy:%ss, f0:%ss, infer:%ss" % ( + times[0], + times[1], + times[2], + ), (final_sr, audio_opt) + + """ + + if overwrite: + output_audio_path = input_audio_path # Overwrite + else: + basename = os.path.basename(input_audio_path) + dirname = os.path.dirname(input_audio_path) - # detele the model + new_basename = basename.split( + '.')[0] + "_edited." 
+ basename.split('.')[-1] + new_path = os.path.join(dirname, new_basename) + logger.info(str(new_path)) - def make_test(self, - tts_text, - tts_voice, + output_audio_path = new_path + + # Save file + sf.write( + file=output_audio_path, + samplerate=final_sr, + data=audio_opt + ) + + self.output_list.append(output_audio_path) + + def make_test( + self, + tts_text, + tts_voice, model_path, index_path, transpose, f0_method, - ): + ): - create_directories("test") - remove_directory_contents("test") - filename = "test/test.wav" + folder_test = "test" + tag = "test_edge" + tts_file = "test/test.wav" + tts_edited = "test/test_edited.wav" + + create_directories(folder_test) + remove_directory_contents(folder_test) if "SET_LIMIT" == os.getenv("DEMO"): - if len(tts_text) > 60: - tts_text = tts_text[:60] - logger.warning("DEMO; limit to 60 characters") + if len(tts_text) > 60: + tts_text = tts_text[:60] + logger.warning("DEMO; limit to 60 characters") - language = tts_voice[:2] try: - asyncio.run(edge_tts.Communicate(tts_text, "-".join(tts_voice.split('-')[:-1])).save(filename)) - except: - try: - tts = gTTS(tts_text, lang=language) - tts.save(filename) - tts.save - logger.warning(f'No audio was received. Please change the tts voice for {tts_voice}. USING gTTS.') - except: - tts = gTTS('a', lang=language) - tts.save(filename) - logger.error('Audio will be replaced.') - - shutil.copy("test/test.wav", "test/real_test.wav") - - self([],[]) # start modules - - self.custom_voice( - ["test"], # filter indices - ["test/test.wav"], # all audio files - model_voice_path=model_path, - transpose=transpose, - f0method=f0_method, - file_index='', - file_index2=index_path, + asyncio.run(edge_tts.Communicate( + tts_text, "-".join(tts_voice.split('-')[:-1]) + ).save(tts_file)) + except Exception as e: + raise ValueError( + "No audio was received. Please change the " + f"tts voice for {tts_voice}. 
Error: {str(e)}" + ) + + shutil.copy(tts_file, tts_edited) + + self.apply_conf( + tag=tag, + file_model=model_path, + pitch_algo=f0_method, + pitch_lvl=transpose, + file_index=index_path, + index_influence=0.66, + respiration_median_filtering=3, + envelope_ratio=0.25, + consonant_breath_protection=0.33, ) - return "test/test.wav", "test/real_test.wav" - def __call__(self, speakers_list, audio_files): + self( + audio_files=tts_edited, + tag_list=tag, + overwrite=True + ) - speakers_indices = {} + return tts_edited, tts_file - for index, speak_ in enumerate(speakers_list): - if speak_ in speakers_indices: - speakers_indices[speak_].append(index) - else: - speakers_indices[speak_] = [index] - - - # find models and index - global weight_root, index_root, config, hubert_model - weight_root = "weights" - names = [] - for name in os.listdir(weight_root): - if name.endswith(".pth"): - names.append(name) - - index_root = "logs" - index_paths = [] - for name in os.listdir(index_root): - if name.endswith(".index"): - index_paths.append(name) - - logger.info(f"{names}, {index_paths}") - # config machine - hubert_model = None - config = Config('cuda:0', is_half=True) # config = Config('cpu', is_half=False) # cpu - - # filter by speaker - for _speak, _values in speakers_indices.items(): - logger.debug(f"{_speak}, {_values}") - #for _value_item in _values: - # self.filename = "audio2/"+audio_files[_value_item] - ###print(audio_files[_value_item]) - - #vc(_speak, _values, audio_files) - - if _speak == "SPEAKER_00": - self.custom_voice( - _values, # filteredd - audio_files, - model_voice_path=self.model_voice_path00, - file_index2=self.file_index200, - transpose=self.transpose00, - f0method=self.f0method, - file_index=self.file_index, - ) - elif _speak == "SPEAKER_01": - self.custom_voice( - _values, - audio_files, - model_voice_path=self.model_voice_path01, - file_index2=self.file_index201, - transpose=self.transpose01, - f0method=self.f0method, - file_index=self.file_index, - ) - elif _speak == "SPEAKER_02": - self.custom_voice( - _values, - audio_files, - model_voice_path=self.model_voice_path02, - file_index2=self.file_index202, - transpose=self.transpose02, - f0method=self.f0method, - file_index=self.file_index, - ) - elif _speak == "SPEAKER_03": - self.custom_voice( - _values, - audio_files, - model_voice_path=self.model_voice_path03, - file_index2=self.file_index203, - transpose=self.transpose03, - f0method=self.f0method, - file_index=self.file_index, - ) - elif _speak == "SPEAKER_04": - self.custom_voice( - _values, - audio_files, - model_voice_path=self.model_voice_path04, - file_index2=self.file_index204, - transpose=self.transpose04, - f0method=self.f0method, - file_index=self.file_index, - ) - elif _speak == "SPEAKER_05": - self.custom_voice( - _values, - audio_files, - model_voice_path=self.model_voice_path05, - file_index2=self.file_index205, - transpose=self.transpose05, - f0method=self.f0method, - file_index=self.file_index, + def run_threads(self, threads): + # Start threads + for thread in threads: + thread.start() + + # Wait for all threads to finish + for thread in threads: + thread.join() + + gc.collect() + torch.cuda.empty_cache() + + def __call__( + self, + audio_files=[], + tag_list=[], + overwrite=False, + parallel_workers=1, + ): + logger.info(f"Parallel workers: {str(parallel_workers)}") + + self.output_list = [] + + if not self.model_config: + raise ValueError("No model has been configured for inference") + + if isinstance(audio_files, str): + audio_files = [audio_files] + if 
isinstance(tag_list, str): + tag_list = [tag_list] + + if not audio_files: + raise ValueError("No audio found to convert") + if not tag_list: + tag_list = [list(self.model_config.keys())[-1]] * len(audio_files) + + if len(audio_files) > len(tag_list): + logger.info("Extend tag list to match audio files") + extend_number = len(audio_files) - len(tag_list) + tag_list.extend([tag_list[0]] * extend_number) + + if len(audio_files) < len(tag_list): + logger.info("Cut list tags") + tag_list = tag_list[:len(audio_files)] + + tag_file_pairs = list(zip(tag_list, audio_files)) + sorted_tag_file = sorted(tag_file_pairs, key=lambda x: x[0]) + + # Base params + if not self.hu_bert_model: + self.hu_bert_model = load_hu_bert(self.config) + + cache_params = None + threads = [] + progress_bar = tqdm(total=len(tag_list), desc="Progress") + for i, (id_tag, input_audio_path) in enumerate(sorted_tag_file): + + if id_tag not in self.model_config.keys(): + logger.info( + f"No configured model for {id_tag} with {input_audio_path}" ) - elif _speak == "SPEAKER_99": - self.custom_voice( - _values, - audio_files, - model_voice_path=self.model_voice_path99, - file_index2=self.file_index299, - transpose=self.transpose99, - f0method=self.f0method, - file_index=self.file_index, + continue + + if ( + len(threads) >= parallel_workers + or cache_params != id_tag + and cache_params is not None + ): + + self.run_threads(threads) + progress_bar.update(len(threads)) + + threads = [] + + if cache_params != id_tag: + + # Unload previous + ( + n_spk, + tgt_sr, + net_g, + pipe, + cpt, + version, + if_f0, + index_rate, + index, + big_npy, + inp_f0, + ) = [None] * 11 + gc.collect() + torch.cuda.empty_cache() + + # Model params + params = self.model_config[id_tag] + + model_path = params["file_model"] + f0_method = params["pitch_algo"] + file_index = params["file_index"] + index_rate = params["index_influence"] + f0_file = params["file_pitch_algo"] + + # Load model + ( + n_spk, + tgt_sr, + net_g, + pipe, + cpt, + version + ) = load_trained_model(model_path, self.config) + if_f0 = cpt.get("f0", 1) # pitch data + + # Load index + if os.path.exists(file_index) and index_rate != 0: + try: + index = faiss.read_index(file_index) + big_npy = index.reconstruct_n(0, index.ntotal) + except Exception as error: + logger.error(f"Index: {str(error)}") + index_rate = 0 + index = big_npy = None + else: + logger.warning("File index not found") + index_rate = 0 + index = big_npy = None + + # Load f0 file + inp_f0 = None + if os.path.exists(f0_file): + try: + with open(f0_file, "r") as f: + lines = f.read().strip("\n").split("\n") + inp_f0 = [] + for line in lines: + inp_f0.append([float(i) for i in line.split(",")]) + inp_f0 = np.array(inp_f0, dtype="float32") + except Exception as error: + logger.error(f"f0 file: {str(error)}") + + if "rmvpe" in f0_method: + if not hasattr(self, "model_pitch_estimator"): + from lib.rmvpe import RMVPE + + logger.info("Loading vocal pitch estimator model") + self.model_pitch_estimator = RMVPE( + "rmvpe.pt", + is_half=self.config.is_half, + device=self.config.device + ) + + pipe.model_rmvpe = self.model_pitch_estimator + + cache_params = id_tag + + # self.infer( + # params, + # # load model + # n_spk, + # tgt_sr, + # net_g, + # pipe, + # cpt, + # version, + # if_f0, + # # load index + # index_rate, + # index, + # big_npy, + # # load f0 file + # inp_f0, + # # output file + # input_audio_path, + # overwrite, + # ) + + thread = threading.Thread( + target=self.infer, + args=( + params, + # loaded model + n_spk, + tgt_sr, + 
net_g, + pipe, + cpt, + version, + if_f0, + # loaded index + index_rate, + index, + big_npy, + # loaded f0 file + inp_f0, + # audio file + input_audio_path, + overwrite, ) - else: - pass + ) + + threads.append(thread) + + # Run last + if threads: + self.run_threads(threads) + + progress_bar.update(len(threads)) + progress_bar.close() + + return self.output_list From 741b788f80cd7ee8bb37b65ebd4eb790067a4698 Mon Sep 17 00:00:00 2001 From: Roger Condori <114810545+R3gm@users.noreply.github.com> Date: Tue, 16 Apr 2024 16:43:59 +0000 Subject: [PATCH 22/36] feat: enhance vocals before transcription. New output type. Batch subtitles --- app_rvc.py | 125 +++++++++++++++++++++++--- soni_translate/mdx_net.py | 1 + soni_translate/postprocessor.py | 52 ++++++++++- soni_translate/speech_segmentation.py | 2 +- soni_translate/utils.py | 111 ++++++++++++----------- 5 files changed, 221 insertions(+), 70 deletions(-) diff --git a/app_rvc.py b/app_rvc.py index 7b0db9f..23b36bd 100644 --- a/app_rvc.py +++ b/app_rvc.py @@ -28,6 +28,8 @@ from soni_translate.postprocessor import ( OUTPUT_TYPE_OPTIONS, DOCS_OUTPUT_TYPE_OPTIONS, + sound_separate, + get_no_ext_filename, media_out, get_subtitle_speaker, ) @@ -45,6 +47,7 @@ download_manager, run_command, is_audio_file, + is_subtitle_file, copy_files, get_valid_files, get_link_list, @@ -154,6 +157,7 @@ class SoniTrCache: def __init__(self): self.cache = { 'media': [[]], + 'refine_vocals': [], 'transcript_align': [], 'break_align': [], 'diarize': [], @@ -167,6 +171,7 @@ def __init__(self): self.cache_data = { 'media': [], + 'refine_vocals': [], 'transcript_align': [], 'break_align': [], 'diarize': [], @@ -404,6 +409,7 @@ def multilingual_media_conversion( get_video_from_text_json=False, text_json="{}", avoid_overlap=False, + vocal_refinement=False, literalize_numbers=True, segment_duration_limit=15, diarization_model="pyannote_2.1", @@ -438,6 +444,23 @@ def multilingual_media_conversion( if "gpt" in translate_process: check_openai_api_key() + if media_file is None: + media_file = ( + directory_input + if os.path.exists(directory_input) + else link_media + ) + media_file = ( + media_file if isinstance(media_file, str) else media_file.name + ) + + if is_subtitle_file(media_file): + subtitle_file = media_file + media_file = "" + + if media_file is None: + media_file = "" + if SOURCE_LANGUAGE in UNIDIRECTIONAL_L_LIST and not subtitle_file: raise ValueError( f"The language '{SOURCE_LANGUAGE}' " @@ -450,6 +473,15 @@ def multilingual_media_conversion( if not self.edit_subs_complete: raise ValueError("Generate the transcription first.") + if ( + ("sound" in output_type or output_type == "raw media") + and (get_translated_text or get_video_from_text_json) + ): + raise ValueError( + "Please disable 'edit generate subtitles' " + f"first to acquire the {output_type}." 
+ ) + TRANSLATE_AUDIO_TO = LANGUAGES[TRANSLATE_AUDIO_TO] SOURCE_LANGUAGE = LANGUAGES[SOURCE_LANGUAGE] @@ -487,16 +519,6 @@ def multilingual_media_conversion( ) warn_disp(wrn_lang, is_gui) - if media_file is None: - media_file = ( - directory_input - if os.path.exists(directory_input) - else link_media - ) - media_file = ( - media_file if isinstance(media_file, str) else media_file.name - ) - if not media_file and not subtitle_file: raise ValueError( "Specifify a media or SRT file in advanced settings" @@ -549,6 +571,7 @@ def multilingual_media_conversion( base_video_file = "Video.mp4" base_audio_wav = "audio.wav" dub_audio_file = "audio_dub_solo.ogg" + vocals_audio_file = "audio_Vocals_DeReverb.wav" voiceless_audio_file = "audio_Voiceless.wav" mix_audio_file = "audio_mix.mp3" vid_subs = "video_subs_file.mp4" @@ -579,6 +602,55 @@ def multilingual_media_conversion( ) logger.debug("Set file complete.") + if "sound" in output_type: + prog_disp( + "Separating sounds in the file...", + 0.50, + is_gui, + progress=progress + ) + separate_out = sound_separate(base_audio_wav, output_type) + final_outputs = [] + for out in separate_out: + final_name = media_out( + media_file, + f"{get_no_ext_filename(out)}", + video_output_name, + "wav", + file_obj=out, + ) + final_outputs.append(final_name) + logger.info(f"Done: {str(final_outputs)}") + return final_outputs + + if output_type == "raw media": + output = media_out( + media_file, + "raw_media", + video_output_name, + "wav" if is_audio_file(media_file) else "mp4", + file_obj=base_audio_wav if is_audio_file(media_file) else base_video_file, + ) + logger.info(f"Done: {output}") + return output + + if not self.task_in_cache("refine_vocals", [vocal_refinement], {}): + self.vocals = None + if vocal_refinement: + try: + from soni_translate.mdx_net import process_uvr_task + _, _, _, _, file_vocals = process_uvr_task( + orig_song_path=base_audio_wav, + main_vocals=False, + dereverb=True, + remove_files_output_dir=True, + ) + remove_files(vocals_audio_file) + copy_files(file_vocals, ".") + self.vocals = vocals_audio_file + except Exception as error: + logger.error(str(error)) + if not self.task_in_cache("transcript_align", [ subtitle_file, SOURCE_LANGUAGE, @@ -593,12 +665,14 @@ def multilingual_media_conversion( and subtitle_file else "sentence" ) - ], {}): + ], {"vocals": self.vocals}): if subtitle_file: prog_disp( "From SRT file...", 0.30, is_gui, progress=progress ) - audio = whisperx.load_audio(base_audio_wav) + audio = whisperx.load_audio( + base_audio_wav if not self.vocals else self.vocals + ) self.result = srt_file_to_segments(subtitle_file) self.result["language"] = SOURCE_LANGUAGE else: @@ -611,7 +685,7 @@ def multilingual_media_conversion( else SOURCE_LANGUAGE ) audio, self.result = transcribe_speech( - base_audio_wav, + base_audio_wav if not self.vocals else self.vocals, WHISPER_MODEL_SIZE, compute_type, batch_size, @@ -677,7 +751,7 @@ def multilingual_media_conversion( prog_disp("Diarizing...", 0.60, is_gui, progress=progress) diarize_model_select = diarization_models[diarization_model] self.result_diarize = diarize_speech( - base_audio_wav, + base_audio_wav if not self.vocals else self.vocals, self.result, min_speakers, max_speakers, @@ -791,6 +865,20 @@ def multilingual_media_conversion( logger.info(f"Done: {str(output)}") return output + if "video [subtitled]" in output_type: + output = media_out( + media_file, + TRANSLATE_AUDIO_TO + "_subtitled", + video_output_name, + "wav" if is_audio_file(media_file) else ( + "mkv" if "mkv" in output_type 
else "mp4" + ), + file_obj=base_audio_wav if is_audio_file(media_file) else base_video_file, + soft_subtitles=False if is_audio_file(media_file) else True, + ) + logger.info(f"Done: {output}") + return output + if not self.task_in_cache("tts", [ TRANSLATE_AUDIO_TO, tts_voice00, @@ -1133,6 +1221,8 @@ def multilingual_docs_conversion( result_diarize, audio_files, final_wav_file, True ) + logger.info(f"Done: {final_wav_file}") + return final_wav_file @@ -1492,6 +1582,11 @@ def get_subs_path(type_subs): label="Literalize Numbers", info="Literalize Numbers: Replace numerical representations with their written equivalents in the transcript.", ) + vocal_refinement_gui = gr.Checkbox( + False, + label="Sound Cleanup", + info="Sound Cleanup: Enhance vocals, remove background noise before transcription for utmost timestamp precision. This operation may take time, especially with lengthy audio files.", + ) segment_duration_limit_gui = gr.Slider( label="Segment Duration Limit", info="Specify the maximum duration (in seconds) for each segment. The audio will be processed using VAD, limiting the duration for each segment chunk.", @@ -2291,6 +2386,7 @@ def update_tts_list(): dummy_false_check, # dummy false subs_edit_space, avoid_overlap_gui, + vocal_refinement_gui, literalize_numbers_gui, segment_duration_limit_gui, diarization_process_dropdown, @@ -2349,6 +2445,7 @@ def update_tts_list(): edit_sub_check, subs_edit_space, avoid_overlap_gui, + vocal_refinement_gui, literalize_numbers_gui, segment_duration_limit_gui, diarization_process_dropdown, diff --git a/soni_translate/mdx_net.py b/soni_translate/mdx_net.py index 23ecc91..ce67600 100644 --- a/soni_translate/mdx_net.py +++ b/soni_translate/mdx_net.py @@ -426,6 +426,7 @@ def run_mdx( del mdx_sess, wave_processed, wave gc.collect() + torch.cuda.empty_cache() return main_filepath, invert_filepath diff --git a/soni_translate/postprocessor.py b/soni_translate/postprocessor.py index 44b70e5..1628d21 100644 --- a/soni_translate/postprocessor.py +++ b/soni_translate/postprocessor.py @@ -15,6 +15,14 @@ "audio (wav)", "subtitle", "subtitle [by speaker]", + "video [subtitled] (mp4)", + "video [subtitled] (mkv)", + "audio [original vocal sound]", + "audio [original background sound]", + "audio [original vocal and background sound]", + "audio [original vocal-dereverb sound]", + "audio [original vocal-dereverb and background sound]", + "raw media", ] DOCS_OUTPUT_TYPE_OPTIONS = [ @@ -68,12 +76,12 @@ def get_output_file( soft_subtitles, output_directory="", ): - directory, filename = os.path.split(original_file) + directory_base = "." 
# default directory if output_directory and os.path.isdir(output_directory): new_file_path = os.path.join(output_directory, new_file_name) else: - new_file_path = os.path.join(directory, "outputs", new_file_name) + new_file_path = os.path.join(directory_base, "outputs", new_file_name) remove_files(new_file_path) cm = None @@ -84,7 +92,7 @@ def get_output_file( cm = f'ffmpeg -y -i "{original_file}" -i sub_tra.srt -i sub_ori.srt -map 0:v -map 0:a -map 1 -map 2 -c:v copy -c:a copy -c:s srt -movflags use_metadata_tags -map_metadata 0 "{new_file_path}"' elif new_file_path.endswith(".mkv"): cm = f'ffmpeg -i "{original_file}" -c:v copy -c:a copy "{new_file_path}"' - elif new_file_path.endswith(".wav"): + elif new_file_path.endswith(".wav") and not original_file.endswith(".wav"): cm = f'ffmpeg -y -i "{original_file}" -acodec pcm_s16le -ar 44100 -ac 2 "{new_file_path}"' elif new_file_path.endswith(".ogg"): cm = f'ffmpeg -i "{original_file}" -c:a libvorbis "{new_file_path}"' @@ -163,4 +171,40 @@ def get_subtitle_speaker(media_file, result, language, extension, base_name): files_subs.append(output) - return files_subs \ No newline at end of file + return files_subs + + +def sound_separate(media_file, task_uvr): + from .mdx_net import process_uvr_task + + outputs = [] + + if "vocal" in task_uvr: + try: + _, _, _, _, vocal_audio = process_uvr_task( + orig_song_path=media_file, + main_vocals=False, + dereverb=True if "dereverb" in task_uvr else False, + remove_files_output_dir=True, + ) + outputs.append(vocal_audio) + except Exception as error: + logger.error(str(error)) + + if "background" in task_uvr: + try: + background_audio, _ = process_uvr_task( + orig_song_path=media_file, + song_id="voiceless", + only_voiceless=True, + remove_files_output_dir=False if "vocal" in task_uvr else True, + ) + # copy_files(background_audio, ".") + outputs.append(background_audio) + except Exception as error: + logger.error(str(error)) + + if not outputs: + raise Exception("Error in uvr process") + + return outputs diff --git a/soni_translate/speech_segmentation.py b/soni_translate/speech_segmentation.py index 7d3837e..2d7996e 100644 --- a/soni_translate/speech_segmentation.py +++ b/soni_translate/speech_segmentation.py @@ -136,7 +136,7 @@ def transcribe_speech( converter.convert( model_dir, quantization=quantization, - force=False + force=True ) else: raise error diff --git a/soni_translate/utils.py b/soni_translate/utils.py index ad70780..d699b3e 100644 --- a/soni_translate/utils.py +++ b/soni_translate/utils.py @@ -4,6 +4,46 @@ from IPython.utils import capture import re +VIDEO_EXTENSIONS = [ + ".mp4", + ".avi", + ".mov", + ".mkv", + ".wmv", + ".flv", + ".webm", + ".m4v", + ".mpeg", + ".mpg", + ".3gp" +] + +AUDIO_EXTENSIONS = [ + ".mp3", + ".wav", + ".aiff", + ".aif", + ".flac", + ".aac", + ".ogg", + ".wma", + ".m4a", + ".alac", + ".pcm", + ".opus", + ".ape", + ".amr", + ".ac3", + ".vox", + ".caf" +] + +SUBTITLE_EXTENSIONS = [ + ".srt", + ".vtt", + ".ass" +] + def run_command(command): logger.debug(command) @@ -167,62 +207,26 @@ def move_files_with_extension(src_dir, extension, destination_dir): return "Download complete" +def is_file_with_extensions(string_path, extensions): + return any(string_path.lower().endswith(ext) for ext in extensions) + + def is_video_file(string_path): - video_extensions = [ - ".mp4", - ".avi", - ".mov", - ".mkv", - ".wmv", - ".flv", - ".webm", - ".m4v", - ".mpeg", - ".mpg", - ".3gp", - ] - - if any( - string_path.lower().endswith(ext) for ext in video_extensions - ) and 
os.path.exists(string_path): - return True - else: - return False + return is_file_with_extensions(string_path, VIDEO_EXTENSIONS) def is_audio_file(string_path): - audio_extensions = [ - ".mp3", - ".wav", - ".aiff", - ".aif", - ".flac", - ".aac", - ".ogg", - ".wma", - ".m4a", - ".alac", - ".pcm", - ".opus", - ".ape", - ".amr", - ".ac3", - ".vox", - ".caf", - ] - - # Check if the string_path ends with any audio extension - if any( - string_path.lower().endswith(ext) for ext in audio_extensions - ) and os.path.exists(string_path): - return True - else: - return False + return is_file_with_extensions(string_path, AUDIO_EXTENSIONS) + +def is_subtitle_file(string_path): + return is_file_with_extensions(string_path, SUBTITLE_EXTENSIONS) -def get_audio_and_video_files(directory): + +def get_directory_files(directory): audio_files = [] video_files = [] + sub_files = [] for item in os.listdir(directory): item_path = os.path.join(directory, item) @@ -235,20 +239,25 @@ def get_audio_and_video_files(directory): elif is_video_file(item_path): video_files.append(item_path) + elif is_subtitle_file(item_path): + sub_files.append(item_path) + logger.info( - f"Files in path ({directory}): {str(audio_files + video_files)}" + f"Files in path ({directory}): " + f"{str(audio_files + video_files + sub_files)}" ) - return audio_files, video_files + return audio_files, video_files, sub_files def get_valid_files(paths): valid_paths = [] for path in paths: if os.path.isdir(path): - audio_files, video_files = get_audio_and_video_files(path) + audio_files, video_files, sub_files = get_directory_files(path) valid_paths.extend(audio_files) valid_paths.extend(video_files) + valid_paths.extend(sub_files) else: valid_paths.append(path) From d916723e9964e766ea3d8b2e38402eafbe08dd70 Mon Sep 17 00:00:00 2001 From: Roger Condori <114810545+R3gm@users.noreply.github.com> Date: Sun, 21 Apr 2024 21:59:38 +0000 Subject: [PATCH 23/36] perf(preprocessor): No format conversion needed for H.264, H.265, VP9, MPEG-4, MPEG-2 and MJPEG video files --- .gitignore | 8 ++++++ soni_translate/preprocessor.py | 49 +++++++++++++++++++++++++++++++--- 2 files changed, 54 insertions(+), 3 deletions(-) diff --git a/.gitignore b/.gitignore index a76cd30..871f0d8 100644 --- a/.gitignore +++ b/.gitignore @@ -173,6 +173,12 @@ task_subtitle.* *.ogg *.wav *.mkv +*.webm +*.avi +*.mpg +*.mov +*.ogv +*.wmv list.txt text_preprocessor.txt text_translation.txt @@ -182,6 +188,7 @@ text_translation.txt *.aud *.ass *.pt +.vscode/ mdx_models/*.onnx _XTTS_/ downloads/ @@ -192,5 +199,6 @@ audio2/ audio/ outputs/ processed/ +OPENVOICE_MODELS/ PIPER_MODELS/ WHISPER_MODELS/ \ No newline at end of file diff --git a/soni_translate/preprocessor.py b/soni_translate/preprocessor.py index 893c2e1..9eb115d 100644 --- a/soni_translate/preprocessor.py +++ b/soni_translate/preprocessor.py @@ -1,6 +1,24 @@ from .utils import remove_files import os, shutil, subprocess, time, shlex, sys # noqa from .logging_setup import logger +import json + +ERROR_INCORRECT_CODEC_PARAMETERS = [ + "prores", # mov + "ffv1", # mkv + "msmpeg4v3", # avi + "wmv2", # wmv + "theora", # ogv +] # fix final merge + +TESTED_CODECS = [ + "h264", # mp4 + "h265", # mp4 + "vp9", # webm + "mpeg4", # mp4 + "mpeg2video", # mpg + "mjpeg", # avi +] class OperationFailedError(Exception): @@ -9,6 +27,24 @@ def __init__(self, message="The operation did not complete successfully."): super().__init__(self.message) +def get_video_codec(video_file): + command_base = rf'ffprobe -v error -select_streams v:0 -show_entries 
stream=codec_name -of json "{video_file}"' + command = shlex.split(command_base) + try: + process = subprocess.Popen( + command, + stdout=subprocess.PIPE, + creationflags=subprocess.CREATE_NO_WINDOW if sys.platform == "win32" else 0, + ) + output, _ = process.communicate() + codec_info = json.loads(output.decode('utf-8')) + codec_name = codec_info['streams'][0]['codec_name'] + return codec_name + except Exception as error: + logger.debug(str(error)) + return None + + def audio_preprocessor(preview, base_audio, audio_wav, use_cuda=False): base_audio = base_audio.strip() previous_files_to_remove = [audio_wav] @@ -57,8 +93,14 @@ def audio_video_preprocessor( ) mp4_ = f'ffmpeg -y -i "{video}" -ss 00:00:20 -t 00:00:10 -c:v libx264 -c:a aac -strict experimental Video.mp4' else: - # Check if the file ends with ".mp4" extension - if video.endswith(".mp4"): + video_codec = get_video_codec(video) + if not video_codec: + logger.debug("No video codec found in video") + else: + logger.info(f"Video codec: {video_codec}") + + # Check if the file ends with ".mp4" extension or is valid codec + if video.endswith(".mp4") or video_codec in TESTED_CODECS: destination_path = os.path.join(os.getcwd(), "Video.mp4") shutil.copy(video, destination_path) time.sleep(0.5) @@ -68,7 +110,8 @@ def audio_video_preprocessor( mp4_ = f'ffmpeg -y -i "{video}" -c copy Video.mp4' else: logger.warning( - "File does not have the '.mp4' extension. Converting video." + "File does not have the '.mp4' extension or a " + "supported codec. Converting video to mp4 (codec: h264)." ) mp4_ = f'ffmpeg -y -i "{video}" -c:v libx264 -c:a aac -strict experimental Video.mp4' else: From bf199f28f085e81ba7d8678dc6eebd3fb778e702 Mon Sep 17 00:00:00 2001 From: Roger Condori <114810545+R3gm@users.noreply.github.com> Date: Sun, 21 Apr 2024 22:36:09 +0000 Subject: [PATCH 24/36] feat: Set parameter --cpu_mode to use the CPU mode #39 --- app_rvc.py | 25 +++++++++++++++++++------ soni_translate/mdx_net.py | 20 +++++++++++++++----- soni_translate/speech_segmentation.py | 10 ++++------ soni_translate/text_to_speech.py | 21 +++++++++++++-------- 4 files changed, 51 insertions(+), 25 deletions(-) diff --git a/app_rvc.py b/app_rvc.py index 23b36bd..ea9c2eb 100644 --- a/app_rvc.py +++ b/app_rvc.py @@ -270,9 +270,16 @@ def check_openai_api_key(): class SoniTranslate(SoniTrCache): - def __init__(self, dev=False): + def __init__(self, cpu_mode=False): super().__init__() - self.device = "cuda" if torch.cuda.is_available() else "cpu" + if cpu_mode: + os.environ["SONITR_DEVICE"] = "cpu" + else: + os.environ["SONITR_DEVICE"] = ( + "cuda" if torch.cuda.is_available() else "cpu" + ) + + self.device = os.environ.get("SONITR_DEVICE") self.result_diarize = None self.align_language = None self.result_source_lang = None @@ -282,7 +289,7 @@ def __init__(self, dev=False): os.environ["VOICES_MODELS"] = "DISABLE" os.environ["VOICES_MODELS_WORKERS"] = "1" - self.vci = ClassVoices() + self.vci = ClassVoices(only_cpu=cpu_mode) self.tts_voices = self.get_tts_voice_list() @@ -1597,7 +1604,7 @@ def get_subs_path(type_subs): ) whisper_model_default = ( "large-v3" - if torch.cuda.is_available() + if SoniTr.device == "cuda" else "medium" ) @@ -1610,7 +1617,7 @@ def get_subs_path(type_subs): ) com_t_opt, com_t_default = ( [COMPUTE_TYPE_GPU, "float16"] - if torch.cuda.is_available() + if SoniTr.device == "cuda" else [COMPUTE_TYPE_CPU, "float32"] ) compute_type = gr.Dropdown( @@ -2555,6 +2562,12 @@ def create_parser(): default="english", help=" Select the language of the interface: english, 
spanish", ) + parser.add_argument( + "--cpu_mode", + action="store_true", + default=False, + help="Enable CPU mode to run the program without utilizing GPU acceleration.", + ) return parser @@ -2576,7 +2589,7 @@ def create_parser(): models_path, index_path = upload_model_list() - SoniTr = SoniTranslate() + SoniTr = SoniTranslate(cpu_mode=args.cpu_mode) lg_conf = get_language_config(language_data, language=args.language) diff --git a/soni_translate/mdx_net.py b/soni_translate/mdx_net.py index ce67600..1623ecd 100644 --- a/soni_translate/mdx_net.py +++ b/soni_translate/mdx_net.py @@ -119,10 +119,8 @@ class MDX: DEFAULT_CHUNK_SIZE = 0 * DEFAULT_SR DEFAULT_MARGIN_SIZE = 1 * DEFAULT_SR - DEFAULT_PROCESSOR = 0 if torch.cuda.is_available() else -1 - def __init__( - self, model_path: str, params: MDXModel, processor=DEFAULT_PROCESSOR + self, model_path: str, params: MDXModel, processor=0 ): # Set the device and the provider (CPU or CUDA) self.device = ( @@ -356,14 +354,17 @@ def run_mdx( denoise=False, keep_orig=True, m_threads=2, + device_base="cuda", ): - if torch.cuda.is_available(): + if device_base == "cuda": device = torch.device("cuda:0") + processor_num = 0 device_properties = torch.cuda.get_device_properties(device) vram_gb = device_properties.total_memory / 1024**3 m_threads = 1 if vram_gb < 8 else 2 else: device = torch.device("cpu") + processor_num = -1 m_threads = 1 model_hash = MDX.get_hash(model_path) @@ -377,7 +378,7 @@ def run_mdx( compensation=mp["compensate"], ) - mdx_sess = MDX(model_path, model) + mdx_sess = MDX(model_path, model, processor=processor_num) wave, sr = librosa.load(filename, mono=False, sr=44100) # normalizing input wave gives better output peak = max(np.max(wave), abs(np.min(wave))) @@ -478,6 +479,11 @@ def process_uvr_task( only_voiceless: bool = False, remove_files_output_dir: bool = False, ): + if os.environ.get("SONITR_DEVICE") == "cpu": + device_base = "cpu" + else: + device_base = "cuda" if torch.cuda.is_available() else "cpu" + if remove_files_output_dir: remove_directory_contents(output_dir) @@ -501,6 +507,7 @@ def process_uvr_task( denoise=False, keep_orig=True, exclude_inversion=True, + device_base=device_base, ) logger.info("Vocal Track Isolation and Voiceless Track Separation...") @@ -511,6 +518,7 @@ def process_uvr_task( orig_song_path, denoise=True, keep_orig=True, + device_base=device_base, ) if main_vocals: @@ -523,6 +531,7 @@ def process_uvr_task( suffix="Backup", invert_suffix="Main", denoise=True, + device_base=device_base, ) else: backup_vocals_path, main_vocals_path = None, vocals_path @@ -537,6 +546,7 @@ def process_uvr_task( invert_suffix="DeReverb", exclude_main=True, denoise=True, + device_base=device_base, ) else: vocals_dereverb_path = main_vocals_path diff --git a/soni_translate/speech_segmentation.py b/soni_translate/speech_segmentation.py index 2d7996e..810ab85 100644 --- a/soni_translate/speech_segmentation.py +++ b/soni_translate/speech_segmentation.py @@ -50,8 +50,6 @@ WHISPER_MODELS_PATH = './WHISPER_MODELS' -device = "cuda" if torch.cuda.is_available() else "cpu" - def find_whisper_models(): path = WHISPER_MODELS_PATH @@ -146,7 +144,7 @@ def transcribe_speech( model = whisperx.load_model( asr_model, - device, + os.environ.get("SONITR_DEVICE"), compute_type=compute_type, language=SOURCE_LANGUAGE, asr_options=asr_options, @@ -218,7 +216,7 @@ def align_speech(audio, result): model_a, metadata = whisperx.load_align_model( language_code=result["language"], - device=device, + device=os.environ.get("SONITR_DEVICE"), model_name=None if 
result["language"] in DAMHF.keys() else EXTRA_ALIGN[result["language"]], @@ -228,7 +226,7 @@ def align_speech(audio, result): model_a, metadata, audio, - device, + os.environ.get("SONITR_DEVICE"), return_char_alignments=True, ) del model_a @@ -286,7 +284,7 @@ def diarize_speech( diarize_model = whisperx.DiarizationPipeline( model_name=model_name, use_auth_token=YOUR_HF_TOKEN, - device=device, + device=os.environ.get("SONITR_DEVICE"), ) except Exception as error: diff --git a/soni_translate/text_to_speech.py b/soni_translate/text_to_speech.py index 7e67e65..1e619e6 100644 --- a/soni_translate/text_to_speech.py +++ b/soni_translate/text_to_speech.py @@ -24,9 +24,6 @@ import logging from .logging_setup import logger -device = "cuda:0" if torch.cuda.is_available() else "cpu" -torch_dtype_env = torch.float16 if torch.cuda.is_available() else torch.float32 - class TTS_OperationError(Exception): def __init__(self, message="The operation did not complete successfully."): @@ -197,6 +194,9 @@ def segments_bark_tts( from transformers import AutoProcessor, BarkModel from optimum.bettertransformer import BetterTransformer + device = os.environ.get("SONITR_DEVICE") + torch_dtype_env = torch.float16 if device == "cuda" else torch.float32 + # load model bark model = BarkModel.from_pretrained( model_id_bark, torch_dtype=torch_dtype_env @@ -205,7 +205,7 @@ def segments_bark_tts( processor = AutoProcessor.from_pretrained( model_id_bark, return_tensors="pt" ) # , padding=True - if torch.cuda.is_available(): + if device == "cuda": # convert to bettertransformer model = BetterTransformer.transform(model, keep_original_model=False) # enable CPU offload @@ -626,6 +626,7 @@ def segments_coqui_tts( ) # Init TTS + device = os.environ.get("SONITR_DEVICE") model = TTS(model_id_coqui).to(device) sampling_rate = 24000 @@ -729,7 +730,7 @@ def load_piper_model( try: import onnxruntime as rt - if rt.get_device() == "GPU" and torch.cuda.is_available(): + if rt.get_device() == "GPU" and os.environ.get("SONITR_DEVICE") == "cuda": logger.debug("onnxruntime device > GPU") cuda = True else: @@ -742,6 +743,7 @@ def load_piper_model( # Disable CUDA in Windows if platform.system() == "Windows": + logger.info("Employing CPU exclusivity with Piper TTS") cuda = False if not download_dir: @@ -1107,7 +1109,7 @@ def accelerate_segments( def se_process_audio_segments( - source_seg, tone_color_converter, remove_previous_processed=True + source_seg, tone_color_converter, device, remove_previous_processed=True ): # list wav seg source_audio_segs = glob.glob(f"{source_seg}/*.wav") @@ -1280,6 +1282,7 @@ def toneconverter_openvoice( url=checkpoint_url, path=model_path_openvoice ) + device = os.environ.get("SONITR_DEVICE") tone_color_converter = ToneColorConverter(config_path, device=device) tone_color_converter.load_ckpt(checkpoint_path) @@ -1290,9 +1293,9 @@ def toneconverter_openvoice( path_source_segments, path_target_segments, valid_speakers ): # source_se_path = os.path.join(source_seg, 'se.pth') - source_se = se_process_audio_segments(source_seg, tone_color_converter) + source_se = se_process_audio_segments(source_seg, tone_color_converter, device) # target_se_path = os.path.join(target_seg, 'se.pth') - target_se = se_process_audio_segments(target_seg, tone_color_converter) + target_se = se_process_audio_segments(target_seg, tone_color_converter, device) # Iterate throw segments encode_message = "@MyShell" @@ -1361,6 +1364,8 @@ def toneconverter_freevc( ) logger.info("FreeVC loading model...") + device_id = os.environ.get("SONITR_DEVICE") 
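# Illustrative sketch only, with an assumed helper name (resolve_sonitr_device):
# the SONITR_DEVICE convention that the --cpu_mode changes rely on. The device
# is decided once (see SoniTranslate.__init__ in this patch) and exported via
# an environment variable, so modules such as this one read it back with
# os.environ.get("SONITR_DEVICE") instead of probing CUDA themselves.
import os
import torch

def resolve_sonitr_device(cpu_mode: bool = False) -> str:
    # Force CPU when requested; otherwise use CUDA only when it is available.
    device = "cpu" if cpu_mode or not torch.cuda.is_available() else "cuda"
    os.environ["SONITR_DEVICE"] = device
    return device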
+ device = None if device_id == "cpu" else device_id try: from TTS.api import TTS tts = TTS( From 53e5f81bccba6b239542ea822af6014c90ad1e88 Mon Sep 17 00:00:00 2001 From: Roger Condori <114810545+R3gm@users.noreply.github.com> Date: Thu, 25 Apr 2024 22:54:12 +0000 Subject: [PATCH 25/36] feat(transcription): audio transcription generated by the OpenAI API --- .gitignore | 3 +- app_rvc.py | 23 +++++++- soni_translate/speech_segmentation.py | 83 +++++++++++++++++++++++++++ 3 files changed, 105 insertions(+), 4 deletions(-) diff --git a/.gitignore b/.gitignore index 871f0d8..851ef07 100644 --- a/.gitignore +++ b/.gitignore @@ -201,4 +201,5 @@ outputs/ processed/ OPENVOICE_MODELS/ PIPER_MODELS/ -WHISPER_MODELS/ \ No newline at end of file +WHISPER_MODELS/ +whisper_api_audio_parts/ \ No newline at end of file diff --git a/app_rvc.py b/app_rvc.py index ea9c2eb..a7bbba8 100644 --- a/app_rvc.py +++ b/app_rvc.py @@ -87,6 +87,7 @@ import argparse import time import hashlib +import sys directories = [ "downloads", @@ -448,7 +449,10 @@ def multilingual_media_conversion( else: os.environ["YOUR_HF_TOKEN"] = YOUR_HF_TOKEN - if "gpt" in translate_process: + if ( + "gpt" in translate_process + or WHISPER_MODEL_SIZE == "OpenAI_API_Whisper" + ): check_openai_api_key() if media_file is None: @@ -492,6 +496,15 @@ def multilingual_media_conversion( TRANSLATE_AUDIO_TO = LANGUAGES[TRANSLATE_AUDIO_TO] SOURCE_LANGUAGE = LANGUAGES[SOURCE_LANGUAGE] + if ( + WHISPER_MODEL_SIZE == "OpenAI_API_Whisper" + and SOURCE_LANGUAGE == "zh-TW" + ): + logger.warning( + "OpenAI API Whisper only supports Chinese (Simplified)." + ) + SOURCE_LANGUAGE = "zh" + if ( text_segmentation_scale in ["word", "character"] and "subtitle" not in output_type @@ -1627,7 +1640,12 @@ def get_subs_path(type_subs): info="Choosing smaller types like int8 or float16 can improve performance by reducing memory usage and increasing computational throughput, but may sacrifice precision compared to larger data types like float32.", ) batch_size = gr.Slider( - 1, 32, value=16, label="Batch size", step=1 + minimum=1, + maximum=32, + value=8, + label="Batch size", + info="Reducing the batch size saves memory if your GPU has less VRAM and helps manage Out of Memory issues.", + step=1, ) input_srt = gr.File( label=lg_conf["srt_file_label"], @@ -2289,7 +2307,6 @@ def play_sound_alert(play_sound): if logs_in_gui: logger.info("Logs in gui need public url") - import sys class Logger: def __init__(self, filename): diff --git a/soni_translate/speech_segmentation.py b/soni_translate/speech_segmentation.py index 810ab85..dd0dce8 100644 --- a/soni_translate/speech_segmentation.py +++ b/soni_translate/speech_segmentation.py @@ -2,14 +2,17 @@ DEFAULT_ALIGN_MODELS_TORCH as DAMT, DEFAULT_ALIGN_MODELS_HF as DAMHF, ) +from whisperx.utils import TO_LANGUAGE_CODE import whisperx import torch import gc import os +import soundfile as sf from IPython.utils import capture # noqa from .language_configuration import EXTRA_ALIGN, INVERTED_LANGUAGES from .logging_setup import logger from .postprocessor import sanitize_file_name +from .utils import remove_directory_contents, run_command ASR_MODEL_OPTIONS = [ "tiny", @@ -28,6 +31,7 @@ "medium.en", "distil-small.en", "distil-medium.en", + "OpenAI_API_Whisper", ] COMPUTE_TYPE_GPU = [ @@ -51,6 +55,77 @@ WHISPER_MODELS_PATH = './WHISPER_MODELS' +def openai_api_whisper( + input_audio_file, + source_lang=None, + chunk_duration=1800 +): + + info = sf.info(input_audio_file) + duration = info.duration + + output_directory = 
"./whisper_api_audio_parts" + os.makedirs(output_directory, exist_ok=True) + remove_directory_contents(output_directory) + + if duration > chunk_duration: + # Split the audio file into smaller chunks with 30-minute duration + cm = f'ffmpeg -i "{input_audio_file}" -f segment -segment_time {chunk_duration} -c:a libvorbis "{output_directory}/output%03d.ogg"' + run_command(cm) + # Get list of generated chunk files + chunk_files = sorted( + [f"{output_directory}/{f}" for f in os.listdir(output_directory) if f.endswith('.ogg')] + ) + else: + one_file = f"{output_directory}/output000.ogg" + cm = f'ffmpeg -i "{input_audio_file}" -c:a libvorbis {one_file}' + run_command(cm) + chunk_files = [one_file] + + # Transcript + segments = [] + language = source_lang if source_lang else None + for i, chunk in enumerate(chunk_files): + from openai import OpenAI + client = OpenAI() + + audio_file = open(chunk, "rb") + transcription = client.audio.transcriptions.create( + model="whisper-1", + file=audio_file, + language=language, + response_format="verbose_json", + timestamp_granularities=["segment"], + ) + + try: + transcript_dict = transcription.model_dump() + except: # noqa + transcript_dict = transcription.to_dict() + + if language is None: + logger.info(f'Language detected: {transcript_dict["language"]}') + language = TO_LANGUAGE_CODE[transcript_dict["language"]] + + chunk_time = chunk_duration * (i) + + for seg in transcript_dict["segments"]: + + if "start" in seg.keys(): + segments.append( + { + "text": seg["text"], + "start": seg["start"] + chunk_time, + "end": seg["end"] + chunk_time, + } + ) + + audio = whisperx.load_audio(input_audio_file) + result = {"segments": segments, "language": language} + + return audio, result + + def find_whisper_models(): path = WHISPER_MODELS_PATH folders = [] @@ -91,6 +166,14 @@ def transcribe_speech( - result: Transcription result as a dictionary. """ + if asr_model == "OpenAI_API_Whisper": + if literalize_numbers: + logger.info( + "OpenAI's API Whisper does not support " + "the literalization of numbers." 
+ ) + return openai_api_whisper(audio_wav, SOURCE_LANGUAGE) + # https://github.com/openai/whisper/discussions/277 prompt = "以下是普通话的句子。" if SOURCE_LANGUAGE == "zh" else None SOURCE_LANGUAGE = ( From ee60f34b480d32f386d1417dace0352e5dd91943 Mon Sep 17 00:00:00 2001 From: Roger Condori <114810545+R3gm@users.noreply.github.com> Date: Fri, 26 Apr 2024 22:43:06 +0000 Subject: [PATCH 26/36] feat(tts): Added openai tts --- app_rvc.py | 62 ++++++----- soni_translate/language_configuration.py | 27 ++++- soni_translate/postprocessor.py | 2 +- soni_translate/speech_segmentation.py | 2 + soni_translate/text_multiformat_processor.py | 3 +- soni_translate/text_to_speech.py | 103 ++++++++++++++++--- 6 files changed, 154 insertions(+), 45 deletions(-) diff --git a/app_rvc.py b/app_rvc.py index a7bbba8..9b79c04 100644 --- a/app_rvc.py +++ b/app_rvc.py @@ -37,8 +37,9 @@ LANGUAGES, UNIDIRECTIONAL_L_LIST, LANGUAGES_LIST, - bark_voices_list, - vits_voices_list, + BARK_VOICES_LIST, + VITS_VOICES_LIST, + OPENAI_TTS_MODELS, ) from soni_translate.utils import ( remove_files, @@ -109,8 +110,9 @@ class TTS_Info: def __init__(self, piper_enabled, xtts_enabled): self.list_edge = edge_tts_voices_list() - self.list_bark = list(bark_voices_list.keys()) - self.list_vits = list(vits_voices_list.keys()) + self.list_bark = list(BARK_VOICES_LIST.keys()) + self.list_vits = list(VITS_VOICES_LIST.keys()) + self.list_openai_tts = OPENAI_TTS_MODELS self.piper_enabled = piper_enabled self.list_vits_onnx = ( piper_tts_voices_list() if self.piper_enabled else [] @@ -121,12 +123,12 @@ def tts_list(self): self.list_coqui_xtts = ( coqui_xtts_voices_list() if self.xtts_enabled else [] ) - list_tts = sorted( + list_tts = self.list_coqui_xtts + sorted( self.list_edge + self.list_bark + self.list_vits + + self.list_openai_tts + self.list_vits_onnx - + self.list_coqui_xtts ) return list_tts @@ -393,11 +395,11 @@ def multilingual_media_conversion( directory_input="", YOUR_HF_TOKEN="", preview=False, - WHISPER_MODEL_SIZE="large-v3", + transcriber_model="large-v3", batch_size=16, compute_type="float16", - SOURCE_LANGUAGE="Automatic detection", - TRANSLATE_AUDIO_TO="English (en)", + origin_language="Automatic detection", + target_language="English (en)", min_speakers=1, max_speakers=2, tts_voice00="en-AU-WilliamNeural-Male", @@ -407,7 +409,7 @@ def multilingual_media_conversion( tts_voice04="en-NZ-MitchellNeural-Male", tts_voice05="en-GB-MaisieNeural-Female", video_output_name="", - AUDIO_MIX_METHOD="Adjusting volumes and mixing audio", + mix_method_audio="Adjusting volumes and mixing audio", max_accelerate_audio=2.1, acceleration_rate_regulation=False, volume_original_audio=0.25, @@ -451,7 +453,8 @@ def multilingual_media_conversion( if ( "gpt" in translate_process - or WHISPER_MODEL_SIZE == "OpenAI_API_Whisper" + or transcriber_model == "OpenAI_API_Whisper" + or "OpenAI-TTS" in tts_voice00 ): check_openai_api_key() @@ -472,9 +475,12 @@ def multilingual_media_conversion( if media_file is None: media_file = "" - if SOURCE_LANGUAGE in UNIDIRECTIONAL_L_LIST and not subtitle_file: + if not origin_language: + origin_language = "Automatic detection" + + if origin_language in UNIDIRECTIONAL_L_LIST and not subtitle_file: raise ValueError( - f"The language '{SOURCE_LANGUAGE}' " + f"The language '{origin_language}' " "is not supported for transcription (ASR)." ) @@ -493,11 +499,11 @@ def multilingual_media_conversion( f"first to acquire the {output_type}." 
) - TRANSLATE_AUDIO_TO = LANGUAGES[TRANSLATE_AUDIO_TO] - SOURCE_LANGUAGE = LANGUAGES[SOURCE_LANGUAGE] + TRANSLATE_AUDIO_TO = LANGUAGES[target_language] + SOURCE_LANGUAGE = LANGUAGES[origin_language] if ( - WHISPER_MODEL_SIZE == "OpenAI_API_Whisper" + transcriber_model == "OpenAI_API_Whisper" and SOURCE_LANGUAGE == "zh-TW" ): logger.warning( @@ -574,8 +580,8 @@ def multilingual_media_conversion( if "SET_LIMIT" == os.getenv("DEMO"): preview = True - AUDIO_MIX_METHOD = "Adjusting volumes and mixing audio" - WHISPER_MODEL_SIZE = "medium" + mix_method_audio = "Adjusting volumes and mixing audio" + transcriber_model = "medium" logger.info( "DEMO; set preview=True; Generation is limited to " "10 seconds to prevent CPU errors. No limitations with GPU.\n" @@ -674,7 +680,7 @@ def multilingual_media_conversion( if not self.task_in_cache("transcript_align", [ subtitle_file, SOURCE_LANGUAGE, - WHISPER_MODEL_SIZE, + transcriber_model, compute_type, batch_size, literalize_numbers, @@ -706,7 +712,7 @@ def multilingual_media_conversion( ) audio, self.result = transcribe_speech( base_audio_wav if not self.vocals else self.vocals, - WHISPER_MODEL_SIZE, + transcriber_model, compute_type, batch_size, SOURCE_LANGUAGE, @@ -1026,7 +1032,7 @@ def multilingual_media_conversion( base_audio_wav = voiceless_audio_file if not self.task_in_cache("mix_aud", [ - AUDIO_MIX_METHOD, + mix_method_audio, volume_original_audio, volume_translated_audio, voiceless_track @@ -1035,7 +1041,7 @@ def multilingual_media_conversion( remove_files(mix_audio_file) command_volume_mix = f'ffmpeg -y -i {base_audio_wav} -i {dub_audio_file} -filter_complex "[0:0]volume={volume_original_audio}[a];[1:0]volume={volume_translated_audio}[b];[a][b]amix=inputs=2:duration=longest" -c:a libmp3lame {mix_audio_file}' command_background_mix = f'ffmpeg -i {base_audio_wav} -i {dub_audio_file} -filter_complex "[1:a]asplit=2[sc][mix];[0:a][sc]sidechaincompress=threshold=0.003:ratio=20[bg]; [bg][mix]amerge[final]" -map [final] {mix_audio_file}' - if AUDIO_MIX_METHOD == "Adjusting volumes and mixing audio": + if mix_method_audio == "Adjusting volumes and mixing audio": # volume mix run_command(command_volume_mix) else: @@ -1108,8 +1114,8 @@ def multilingual_docs_conversion( string_text="", # string document=None, # doc path gui directory_input="", # doc path - SOURCE_LANGUAGE="English (en)", - TRANSLATE_AUDIO_TO="English (en)", + origin_language="English (en)", + target_language="English (en)", tts_voice00="en-AU-WilliamNeural-Male", name_final_file="sample", translate_process="google_translator", @@ -1121,9 +1127,9 @@ def multilingual_docs_conversion( if "gpt" in translate_process: check_openai_api_key() - SOURCE_LANGUAGE = LANGUAGES[SOURCE_LANGUAGE] + SOURCE_LANGUAGE = LANGUAGES[origin_language] if translate_process != "disable_translation": - TRANSLATE_AUDIO_TO = LANGUAGES[TRANSLATE_AUDIO_TO] + TRANSLATE_AUDIO_TO = LANGUAGES[target_language] else: TRANSLATE_AUDIO_TO = SOURCE_LANGUAGE logger.info("No translation") @@ -1803,7 +1809,7 @@ def visible_component_subs(input_bool): "", False, whisper_model_default, - 16, + 8, com_t_default, "Spanish (es)", "English (en)", @@ -1825,7 +1831,7 @@ def visible_component_subs(input_bool): "", False, whisper_model_default, - 16, + 8, com_t_default, "Japanese (ja)", "English (en)", diff --git a/soni_translate/language_configuration.py b/soni_translate/language_configuration.py index 1116c78..e697e28 100644 --- a/soni_translate/language_configuration.py +++ b/soni_translate/language_configuration.py @@ -197,7 +197,7 @@ def 
fix_code_language(translate_to, syntax="google"): return new_code_lang -bark_voices_list = { +BARK_VOICES_LIST = { "de_speaker_0-Male BARK": "v2/de_speaker_0", "de_speaker_1-Male BARK": "v2/de_speaker_1", "de_speaker_2-Male BARK": "v2/de_speaker_2", @@ -330,7 +330,7 @@ def fix_code_language(translate_to, syntax="google"): "zh_speaker_9-Female BARK": "v2/zh_speaker_9", } -vits_voices_list = { +VITS_VOICES_LIST = { "ar-facebook-mms VITS": "facebook/mms-tts-ara", # 'zh-facebook-mms VITS': 'facebook/mms-tts-cmn', "zh_Hakka-facebook-mms VITS": "facebook/mms-tts-hak", @@ -479,6 +479,29 @@ def fix_code_language(translate_to, syntax="google"): "ug_script_cyrillic-facebook-mms VITS": "facebook/mms-tts-uig-script_cyrillic", } +OPENAI_TTS_CODES = [ + "af", "ar", "hy", "az", "be", "bs", "bg", "ca", "zh", "hr", "cs", "da", + "nl", "en", "et", "fi", "fr", "gl", "de", "el", "he", "hi", "hu", "is", + "id", "it", "ja", "kn", "kk", "ko", "lv", "lt", "mk", "ms", "mr", "mi", + "ne", "no", "fa", "pl", "pt", "ro", "ru", "sr", "sk", "sl", "es", "sw", + "sv", "tl", "ta", "th", "tr", "uk", "ur", "vi", "cy", "zh-TW" +] + +OPENAI_TTS_MODELS = [ + ">alloy OpenAI-TTS", + ">echo OpenAI-TTS", + ">fable OpenAI-TTS", + ">onyx OpenAI-TTS", + ">nova OpenAI-TTS", + ">shimmer OpenAI-TTS", + ">alloy HD OpenAI-TTS", + ">echo HD OpenAI-TTS", + ">fable HD OpenAI-TTS", + ">onyx HD OpenAI-TTS", + ">nova HD OpenAI-TTS", + ">shimmer HD OpenAI-TTS" +] + LANGUAGE_CODE_IN_THREE_LETTERS = { "Automatic detection": "aut", "ar": "ara", diff --git a/soni_translate/postprocessor.py b/soni_translate/postprocessor.py index 1628d21..9753857 100644 --- a/soni_translate/postprocessor.py +++ b/soni_translate/postprocessor.py @@ -107,7 +107,7 @@ def get_output_file( else: shutil.copy2(original_file, new_file_path) - return os.path.join(os.getcwd(), new_file_path) + return os.path.abspath(new_file_path) def media_out( diff --git a/soni_translate/speech_segmentation.py b/soni_translate/speech_segmentation.py index dd0dce8..0cb9d65 100644 --- a/soni_translate/speech_segmentation.py +++ b/soni_translate/speech_segmentation.py @@ -238,6 +238,7 @@ def transcribe_speech( audio, batch_size=batch_size, chunk_size=segment_duration_limit, + print_progress=True, ) if result["language"] == "zh" and not prompt: @@ -311,6 +312,7 @@ def align_speech(audio, result): audio, os.environ.get("SONITR_DEVICE"), return_char_alignments=True, + print_progress=False, ) del model_a gc.collect() diff --git a/soni_translate/text_multiformat_processor.py b/soni_translate/text_multiformat_processor.py index 9ba6871..a795c00 100644 --- a/soni_translate/text_multiformat_processor.py +++ b/soni_translate/text_multiformat_processor.py @@ -181,13 +181,14 @@ def split_text_into_chunks(text, chunk_size): def determine_chunk_size(file_name): patterns = { - re.compile(r".*-(Male|Female)$"): 600, # by character + re.compile(r".*-(Male|Female)$"): 1024, # by character re.compile(r".* BARK$"): 100, # t 64 256 re.compile(r".* VITS$"): 500, re.compile( r".+\.(wav|mp3|ogg|m4a)$" ): 150, # t 250 400 api automatic split re.compile(r".* VITS-onnx$"): 250, # automatic sentence split + re.compile(r".* OpenAI-TTS$"): 1024 # max charaters 4096 } for pattern, chunk_size in patterns.items(): diff --git a/soni_translate/text_to_speech.py b/soni_translate/text_to_speech.py index 1e619e6..9b2e967 100644 --- a/soni_translate/text_to_speech.py +++ b/soni_translate/text_to_speech.py @@ -4,8 +4,8 @@ import librosa, os, re, torch, gc, subprocess # noqa from .language_configuration import ( fix_code_language, - 
bark_voices_list, - vits_voices_list, + BARK_VOICES_LIST, + VITS_VOICES_LIST, ) from .utils import ( download_manager, @@ -80,18 +80,24 @@ def error_handling_in_tts(error, segment, TRANSLATE_AUDIO_TO, filename): def pad_array(array, sr): + if not array.shape[0]: + raise ValueError("The generated audio does not contain any data") + valid_indices = np.where(np.abs(array) > 0.001)[0] if len(valid_indices) == 0: + logger.debug(f"No valid indices: {array}") return array - pad_indice = int(0.1 * sr) - start_pad = max(0, valid_indices[0] - pad_indice) - end_pad = min(len(array), valid_indices[-1] + 1 + pad_indice) - - padded_array = array[start_pad:end_pad] - - return padded_array + try: + pad_indice = int(0.1 * sr) + start_pad = max(0, valid_indices[0] - pad_indice) + end_pad = min(len(array), valid_indices[-1] + 1 + pad_indice) + padded_array = array[start_pad:end_pad] + return padded_array + except Exception as error: + logger.error(str(error)) + return array # ===================================== @@ -223,7 +229,7 @@ def segments_bark_tts( start = segment["start"] tts_name = segment["tts_name"] - inputs = processor(text, voice_preset=bark_voices_list[tts_name]).to( + inputs = processor(text, voice_preset=BARK_VOICES_LIST[tts_name]).to( device ) @@ -325,9 +331,9 @@ def segments_vits_tts(filtered_vits_segments, TRANSLATE_AUDIO_TO): if tts_name != model_name_key: model_name_key = tts_name - model = VitsModel.from_pretrained(vits_voices_list[tts_name]) + model = VitsModel.from_pretrained(VITS_VOICES_LIST[tts_name]) tokenizer = AutoTokenizer.from_pretrained( - vits_voices_list[tts_name] + VITS_VOICES_LIST[tts_name] ) sampling_rate = model.config.sampling_rate @@ -866,6 +872,67 @@ def segments_vits_onnx_tts(filtered_onnx_vits_segments, TRANSLATE_AUDIO_TO): torch.cuda.empty_cache() +# ===================================== +# CLOSEAI TTS +# ===================================== + + +def segments_openai_tts( + filtered_openai_tts_segments, TRANSLATE_AUDIO_TO +): + from openai import OpenAI + + client = OpenAI() + sampling_rate = 24000 + + # filtered_segments = filtered_openai_tts_segments['segments'] + # Sorting the segments by 'tts_name' + # sorted_segments = sorted(filtered_segments, key=lambda x: x['tts_name']) + + for segment in tqdm(filtered_openai_tts_segments["segments"]): + speaker = segment["speaker"] # noqa + text = segment["text"].strip() + start = segment["start"] + tts_name = segment["tts_name"] + + # make the tts audio + filename = f"audio/{start}.ogg" + logger.info(f"{text} >> {filename}") + + try: + # Request + response = client.audio.speech.create( + model="tts-1-hd" if "HD" in tts_name else "tts-1", + voice=tts_name.split()[0][1:], + response_format="wav", + input=text + ) + + audio_bytes = b'' + for data in response.iter_bytes(chunk_size=4096): + audio_bytes += data + + speech_output = np.frombuffer(audio_bytes, dtype=np.int16) + + # Save file + data_tts = pad_array( + speech_output[240:], + sampling_rate, + ) + + sf.write( + file=filename, + samplerate=sampling_rate, + data=data_tts, + format="ogg", + subtype="vorbis", + ) + verify_saved_file_and_size(filename) + + except Exception as error: + error_handling_in_tts(error, segment, TRANSLATE_AUDIO_TO, filename) + + # ===================================== # Select task TTS # ===================================== @@ -936,6 +1003,7 @@ def audio_segmentation_to_voice( pattern_vits = re.compile(r".* VITS$") pattern_coqui = re.compile(r".+\.(wav|mp3|ogg|m4a)$") pattern_vits_onnx = re.compile(r".* VITS-onnx$") + pattern_openai_tts = 
re.compile(r".* OpenAI-TTS$") all_segments = result_diarize["segments"] @@ -946,6 +1014,9 @@ def audio_segmentation_to_voice( speakers_vits_onnx = find_spkr( pattern_vits_onnx, speaker_to_voice, all_segments ) + speakers_openai_tts = find_spkr( + pattern_openai_tts, speaker_to_voice, all_segments + ) # Filter method in segments filtered_edge = filter_by_speaker(speakers_edge, all_segments) @@ -953,6 +1024,7 @@ def audio_segmentation_to_voice( filtered_vits = filter_by_speaker(speakers_vits, all_segments) filtered_coqui = filter_by_speaker(speakers_coqui, all_segments) filtered_vits_onnx = filter_by_speaker(speakers_vits_onnx, all_segments) + filtered_openai_tts = filter_by_speaker(speakers_openai_tts, all_segments) # Infer if filtered_edge["segments"]: @@ -979,6 +1051,9 @@ def audio_segmentation_to_voice( if filtered_vits_onnx["segments"]: logger.info(f"PIPER TTS: {speakers_vits_onnx}") segments_vits_onnx_tts(filtered_vits_onnx, TRANSLATE_AUDIO_TO) # wav + if filtered_openai_tts["segments"]: + logger.info(f"OpenAI TTS: {speakers_openai_tts}") + segments_openai_tts(filtered_openai_tts, TRANSLATE_AUDIO_TO) # wav [result.pop("tts_name", None) for result in result_diarize["segments"]] return [ @@ -987,6 +1062,7 @@ def audio_segmentation_to_voice( speakers_vits, speakers_coqui, speakers_vits_onnx, + speakers_openai_tts ] @@ -1004,7 +1080,8 @@ def accelerate_segments( speakers_bark, speakers_vits, speakers_coqui, - speakers_vits_onnx + speakers_vits_onnx, + speakers_openai_tts ) = valid_speakers create_directories(f"{folder_output}/audio/") From 14d4c3e1a89ca590edee09185e75fbb89d214316 Mon Sep 17 00:00:00 2001 From: Roger Condori <114810545+R3gm@users.noreply.github.com> Date: Tue, 30 Apr 2024 19:48:50 +0000 Subject: [PATCH 27/36] refactor: custom voices without env var --- .gitignore | 3 +- app_rvc.py | 78 +++++++++++---------------- lib/rmvpe.py | 11 ++++ soni_translate/speech_segmentation.py | 1 - vci_pipeline.py | 9 +++- voice_main.py | 26 ++++++++- 6 files changed, 74 insertions(+), 54 deletions(-) diff --git a/.gitignore b/.gitignore index 851ef07..cfb27e0 100644 --- a/.gitignore +++ b/.gitignore @@ -202,4 +202,5 @@ processed/ OPENVOICE_MODELS/ PIPER_MODELS/ WHISPER_MODELS/ -whisper_api_audio_parts/ \ No newline at end of file +whisper_api_audio_parts/ +uroman/ \ No newline at end of file diff --git a/app_rvc.py b/app_rvc.py index 9b79c04..350f8cc 100644 --- a/app_rvc.py +++ b/app_rvc.py @@ -133,17 +133,6 @@ def tts_list(self): return list_tts -def custom_model_voice_enable(enable_custom_voice): - os.environ["VOICES_MODELS"] = ( - "ENABLE" if enable_custom_voice else "DISABLE" - ) - - -def custom_model_voice_workers(workers): - # os.environ["VOICES_MODELS_WORKERS"] = str(workers) - pass - - def prog_disp(msg, percent, is_gui, progress=None): logger.info(msg) if is_gui: @@ -290,8 +279,6 @@ def __init__(self, cpu_mode=False): self.voiceless_id = None self.burn_subs_id = None - os.environ["VOICES_MODELS"] = "DISABLE" - os.environ["VOICES_MODELS_WORKERS"] = "1" self.vci = ClassVoices(only_cpu=cpu_mode) self.tts_voices = self.get_tts_voice_list() @@ -330,13 +317,6 @@ def get_tts_voice_list(self): return self.tts_info.tts_list() - def enable_custom_model_voice(self, workers=1): - os.environ["VOICES_MODELS"] = "ENABLE" - os.environ["VOICES_MODELS_WORKERS"] = str(workers) - - def disable_custom_model_voice(self): - os.environ["VOICES_MODELS"] = "DISABLE" - def batch_multilingual_media_conversion(self, *kwargs): # logger.debug(str(kwargs)) @@ -396,12 +376,12 @@ def multilingual_media_conversion( 
YOUR_HF_TOKEN="", preview=False, transcriber_model="large-v3", - batch_size=16, + batch_size=4, compute_type="float16", origin_language="Automatic detection", target_language="English (en)", min_speakers=1, - max_speakers=2, + max_speakers=1, tts_voice00="en-AU-WilliamNeural-Male", tts_voice01="en-CA-ClaraNeural-Female", tts_voice02="en-GB-ThomasNeural-Male", @@ -435,9 +415,11 @@ def multilingual_media_conversion( dereverb_automatic_xtts=True, text_segmentation_scale="sentence", divide_text_segments_by="", - soft_subtitles_to_video=False, + soft_subtitles_to_video=True, burn_subtitles_to_video=False, - enable_cache=False, + enable_cache=True, + custom_voices=False, + custom_voices_workers=1, is_gui=False, progress=gr.Progress(), ): @@ -538,7 +520,7 @@ def multilingual_media_conversion( ) warn_disp(wrn_lang, is_gui) - if os.getenv("VOICES_MODELS") == "ENABLE" and voice_imitation: + if custom_voices and voice_imitation: wrn_lang = ( "When you use R.V.C. models, it is advisable" " to disable Voice Imitation." @@ -939,8 +921,8 @@ def multilingual_media_conversion( voice_imitation_remove_previous, voice_imitation_vocals_dereverb, voice_imitation_method, - os.getenv("VOICES_MODELS"), - os.getenv("VOICES_MODELS_WORKERS"), + custom_voices, + custom_voices_workers, copy.deepcopy(self.vci.model_config), avoid_overlap ], { @@ -972,7 +954,7 @@ def multilingual_media_conversion( logger.error(str(error)) # custom voice - if os.getenv("VOICES_MODELS") == "ENABLE": + if custom_voices: prog_disp( "Applying customized voices...", 0.90, @@ -985,10 +967,9 @@ def multilingual_media_conversion( audio_files, speakers_list, overwrite=True, - parallel_workers=int( - os.getenv("VOICES_MODELS_WORKERS") - ), + parallel_workers=custom_voices_workers, ) + self.vci.unload_models() except Exception as error: logger.error(str(error)) @@ -1121,6 +1102,8 @@ def multilingual_docs_conversion( translate_process="google_translator", output_type="audio", chunk_size=None, + custom_voices=False, + custom_voices_workers=1, is_gui=False, progress=gr.Progress(), ): @@ -1223,7 +1206,7 @@ def multilingual_docs_conversion( ) # custom voice - if os.getenv("VOICES_MODELS") == "ENABLE": + if custom_voices: prog_disp( "Applying customized voices...", 0.80, @@ -1234,10 +1217,9 @@ def multilingual_docs_conversion( audio_files, speakers_list, overwrite=True, - parallel_workers=int( - os.getenv("VOICES_MODELS_WORKERS") - ), + parallel_workers=custom_voices_workers, ) + self.vci.unload_models() prog_disp( "Creating final audio file...", 0.90, is_gui, progress=progress @@ -1809,7 +1791,7 @@ def visible_component_subs(input_bool): "", False, whisper_model_default, - 8, + 4, com_t_default, "Spanish (es)", "English (en)", @@ -1831,7 +1813,7 @@ def visible_component_subs(input_bool): "", False, whisper_model_default, - 8, + 4, com_t_default, "Japanese (ja)", "English (en)", @@ -2056,12 +2038,9 @@ def update_models(): with gr.Column(): gr.Markdown(lg_conf["sec1_title"]) enable_custom_voice = gr.Checkbox( - label="ENABLE", info=lg_conf["enable_replace"] - ) - enable_custom_voice.change( - custom_model_voice_enable, - [enable_custom_voice], - [], + False, + label="ENABLE", + info=lg_conf["enable_replace"] ) workers_custom_voice = gr.Number( step=1, @@ -2071,11 +2050,7 @@ def update_models(): label="workers", visible=False, ) - workers_custom_voice.change( - custom_model_voice_workers, - [workers_custom_voice], - [], - ) + gr.Markdown(lg_conf["sec2_title"]) gr.Markdown(lg_conf["sec2_subtitle"]) @@ -2084,6 +2059,7 @@ def update_models(): "harvest", 
"crepe", "rmvpe", + "rmvpe+", ] def model_conf(): @@ -2435,6 +2411,8 @@ def update_tts_list(): soft_subtitles_to_video_gui, burn_subtitles_to_video_gui, enable_cache_gui, + enable_custom_voice, + workers_custom_voice, is_gui_dummy_check, ], outputs=subs_edit_space, @@ -2494,6 +2472,8 @@ def update_tts_list(): soft_subtitles_to_video_gui, burn_subtitles_to_video_gui, enable_cache_gui, + enable_custom_voice, + workers_custom_voice, is_gui_dummy_check, ], outputs=video_output, @@ -2518,6 +2498,8 @@ def update_tts_list(): docs_translate_process_dropdown, docs_output_type, docs_chunk_size, + enable_custom_voice, + workers_custom_voice, docs_dummy_check, ], outputs=docs_output, diff --git a/lib/rmvpe.py b/lib/rmvpe.py index 04c41c7..38e8bc4 100644 --- a/lib/rmvpe.py +++ b/lib/rmvpe.py @@ -384,6 +384,17 @@ def infer_from_audio(self, audio, thred=0.03): # print("hmvpe:%s\t%s\t%s\t%s"%(t1-t0,t2-t1,t3-t2,t3-t0)) return f0 + def pitch_based_audio_inference(self, audio, thred=0.03, f0_min=50, f0_max=1100): + audio = torch.from_numpy(audio).float().to(self.device).unsqueeze(0) + mel = self.mel_extractor(audio, center=True) + hidden = self.mel2hidden(mel) + hidden = hidden.squeeze(0).cpu().numpy() + if self.is_half == True: + hidden = hidden.astype("float32") + f0 = self.decode(hidden, thred=thred) + f0[(f0 < f0_min) | (f0 > f0_max)] = 0 + return f0 + def to_local_average_cents(self, salience, thred=0.05): # t0 = ttime() center = np.argmax(salience, axis=1) # frame length#index diff --git a/soni_translate/speech_segmentation.py b/soni_translate/speech_segmentation.py index 0cb9d65..9ef5f6b 100644 --- a/soni_translate/speech_segmentation.py +++ b/soni_translate/speech_segmentation.py @@ -39,7 +39,6 @@ "int8_float32", "int8_float16", "int8_bfloat16", - "int16", "float16", "bfloat16", "float32" diff --git a/vci_pipeline.py b/vci_pipeline.py index ab68bc7..f67e3ec 100644 --- a/vci_pipeline.py +++ b/vci_pipeline.py @@ -128,7 +128,7 @@ def get_f0( f0 = torchcrepe.filter.mean(f0, 3) f0[pd < 0.1] = 0 f0 = f0[0].cpu().numpy() - elif f0_method == "rmvpe": + elif "rmvpe" in f0_method: if hasattr(self, "model_rmvpe") == False: from lib.rmvpe import RMVPE @@ -136,7 +136,12 @@ def get_f0( self.model_rmvpe = RMVPE( "rmvpe.pt", is_half=self.is_half, device=self.device ) - f0 = self.model_rmvpe.infer_from_audio(x, thred=0.03) + thred = 0.03 + if "+" in f0_method: + f0 = self.model_rmvpe.pitch_based_audio_inference(x, thred, f0_min, f0_max) + else: + f0 = self.model_rmvpe.infer_from_audio(x, thred) + f0 *= pow(2, f0_up_key / 12) # with open("test.txt","w")as f:f.write("\n".join([str(i)for i in f0.tolist()])) tf0 = self.sr // self.window # f0 points per second diff --git a/voice_main.py b/voice_main.py index f6b33de..5e9c1ee 100644 --- a/voice_main.py +++ b/voice_main.py @@ -214,6 +214,7 @@ def apply_conf( if not self.config: self.config = Config(self.only_cpu) self.hu_bert_model = None + self.model_pitch_estimator = None self.model_config[tag] = { "file_model": file_model, @@ -231,6 +232,7 @@ def apply_conf( def infer( self, + task_id, params, # load model n_spk, @@ -441,6 +443,7 @@ def infer( data=audio_opt ) + self.model_config[task_id]["result"].append(output_audio_path) self.output_list.append(output_audio_path) def make_test( @@ -510,6 +513,12 @@ def run_threads(self, threads): gc.collect() torch.cuda.empty_cache() + def unload_models(self): + self.hu_bert_model = None + self.model_pitch_estimator = None + gc.collect() + torch.cuda.empty_cache() + def __call__( self, audio_files=[], @@ -574,6 +583,8 @@ def __call__( 
if cache_params != id_tag: + self.model_config[id_tag]["result"] = [] + # Unload previous ( n_spk, @@ -639,7 +650,7 @@ def __call__( logger.error(f"f0 file: {str(error)}") if "rmvpe" in f0_method: - if not hasattr(self, "model_pitch_estimator"): + if not self.model_pitch_estimator: from lib.rmvpe import RMVPE logger.info("Loading vocal pitch estimator model") @@ -654,6 +665,7 @@ def __call__( cache_params = id_tag # self.infer( + # id_tag, # params, # # load model # n_spk, @@ -677,6 +689,7 @@ def __call__( thread = threading.Thread( target=self.infer, args=( + id_tag, params, # loaded model n_spk, @@ -707,4 +720,13 @@ def __call__( progress_bar.update(len(threads)) progress_bar.close() - return self.output_list + final_result = [] + valid_tags = set(tag_list) + for tag in valid_tags: + if ( + tag in self.model_config.keys() + and "result" in self.model_config[tag].keys() + ): + final_result.extend(self.model_config[tag]["result"]) + + return final_result From e466fd0a83adc1c8690a1945c79f4d87e4b6d82c Mon Sep 17 00:00:00 2001 From: Roger Condori <114810545+R3gm@users.noreply.github.com> Date: Tue, 30 Apr 2024 23:07:28 +0000 Subject: [PATCH 28/36] feat(diarization): 12 speakers #42 --- .gitignore | 6 + app_rvc.py | 136 +++++++++++++-- soni_translate/languages_gui.py | 240 +++++++++++++++++++++++++++ soni_translate/text_to_speech.py | 14 +- soni_translate/translate_segments.py | 8 +- 5 files changed, 388 insertions(+), 16 deletions(-) diff --git a/.gitignore b/.gitignore index cfb27e0..69a138c 100644 --- a/.gitignore +++ b/.gitignore @@ -167,6 +167,12 @@ SPEAKER_02.* SPEAKER_03.* SPEAKER_04.* SPEAKER_05.* +SPEAKER_06.* +SPEAKER_07.* +SPEAKER_08.* +SPEAKER_09.* +SPEAKER_10.* +SPEAKER_11.* task_subtitle.* *.mp3 *.mp4 diff --git a/app_rvc.py b/app_rvc.py index 350f8cc..566b497 100644 --- a/app_rvc.py +++ b/app_rvc.py @@ -330,8 +330,8 @@ def batch_multilingual_media_conversion(self, *kwargs): path_arg = [x.strip() for x in path_arg.split(',')] path_arg = get_valid_files(path_arg) - edit_text_arg = kwargs[25] - get_text_arg = kwargs[26] + edit_text_arg = kwargs[31] + get_text_arg = kwargs[32] is_gui_arg = kwargs[-1] @@ -388,6 +388,12 @@ def multilingual_media_conversion( tts_voice03="en-GB-SoniaNeural-Female", tts_voice04="en-NZ-MitchellNeural-Male", tts_voice05="en-GB-MaisieNeural-Female", + tts_voice06="en-AU-WilliamNeural-Male", + tts_voice07="en-CA-ClaraNeural-Female", + tts_voice08="en-GB-ThomasNeural-Male", + tts_voice09="en-GB-SoniaNeural-Female", + tts_voice10="en-NZ-MitchellNeural-Male", + tts_voice11="en-GB-MaisieNeural-Female", video_output_name="", mix_method_audio="Adjusting volumes and mixing audio", max_accelerate_audio=2.1, @@ -797,7 +803,7 @@ def multilingual_media_conversion( for segment in self.result_diarize["segments"]: start = segment["start"] text = segment["text"] - speaker = int(segment.get("speaker", "SPEAKER_00")[-1]) + 1 + speaker = int(segment.get("speaker", "SPEAKER_00")[-2:]) + 1 json_data.append( {"start": start, "text": text, "speaker": speaker} ) @@ -816,7 +822,7 @@ def multilingual_media_conversion( text_json_loaded = json.loads(text_json) for i, segment in enumerate(self.result_diarize["segments"]): segment["text"] = text_json_loaded[i]["text"] - segment["speaker"] = "SPEAKER_0" + str( + segment["speaker"] = "SPEAKER_{:02d}".format( int(text_json_loaded[i]["speaker"]) - 1 ) @@ -895,6 +901,12 @@ def multilingual_media_conversion( tts_voice03, tts_voice04, tts_voice05, + tts_voice06, + tts_voice07, + tts_voice08, + tts_voice09, + tts_voice10, + tts_voice11, 
dereverb_automatic_xtts ], { "sub_file": self.sub_file @@ -910,6 +922,12 @@ def multilingual_media_conversion( tts_voice03, tts_voice04, tts_voice05, + tts_voice06, + tts_voice07, + tts_voice08, + tts_voice09, + tts_voice10, + tts_voice11, dereverb_automatic_xtts, ) @@ -1196,6 +1214,12 @@ def multilingual_docs_conversion( "", "", "", + "", + "", + "", + "", + "", + "", ) # fix format and set folder output @@ -1312,7 +1336,7 @@ def swap_visibility(data_type): gr.HTML("
") gr.Markdown(lg_conf["num_speakers"]) - MAX_TTS = 6 + MAX_TTS = 12 min_speakers = gr.Slider( 1, MAX_TTS, @@ -1333,7 +1357,7 @@ def swap_visibility(data_type): def submit(value): visibility_dict = { f"tts_voice{i:02d}": gr.update(visible=i < value) - for i in range(6) + for i in range(MAX_TTS) } return [value for value in visibility_dict.values()] @@ -1379,6 +1403,48 @@ def submit(value): visible=False, interactive=True, ) + tts_voice06 = gr.Dropdown( + SoniTr.tts_info.tts_list(), + value="en-GB-MaisieNeural-Female", + label=lg_conf["sk7"], + visible=False, + interactive=True, + ) + tts_voice07 = gr.Dropdown( + SoniTr.tts_info.tts_list(), + value="en-GB-MaisieNeural-Female", + label=lg_conf["sk8"], + visible=False, + interactive=True, + ) + tts_voice08 = gr.Dropdown( + SoniTr.tts_info.tts_list(), + value="en-GB-MaisieNeural-Female", + label=lg_conf["sk9"], + visible=False, + interactive=True, + ) + tts_voice09 = gr.Dropdown( + SoniTr.tts_info.tts_list(), + value="en-GB-MaisieNeural-Female", + label=lg_conf["sk10"], + visible=False, + interactive=True, + ) + tts_voice10 = gr.Dropdown( + SoniTr.tts_info.tts_list(), + value="en-GB-MaisieNeural-Female", + label=lg_conf["sk11"], + visible=False, + interactive=True, + ) + tts_voice11 = gr.Dropdown( + SoniTr.tts_info.tts_list(), + value="en-GB-MaisieNeural-Female", + label=lg_conf["sk12"], + visible=False, + interactive=True, + ) max_speakers.change( submit, max_speakers, @@ -1389,6 +1455,12 @@ def submit(value): tts_voice03, tts_voice04, tts_voice05, + tts_voice06, + tts_voice07, + tts_voice08, + tts_voice09, + tts_voice10, + tts_voice11, ], ) @@ -1803,6 +1875,12 @@ def visible_component_subs(input_bool): "en-GB-SoniaNeural-Female", "en-NZ-MitchellNeural-Male", "en-GB-MaisieNeural-Female", + "en-AU-WilliamNeural-Male", + "en-CA-ClaraNeural-Female", + "en-GB-ThomasNeural-Male", + "en-GB-SoniaNeural-Female", + "en-NZ-MitchellNeural-Male", + "en-GB-MaisieNeural-Female", "", "Adjusting volumes and mixing audio", ], @@ -1825,6 +1903,12 @@ def visible_component_subs(input_bool): "en-GB-SoniaNeural-Female", "en-NZ-MitchellNeural-Male", "en-GB-MaisieNeural-Female", + "en-AU-WilliamNeural-Male", + "en-CA-ClaraNeural-Female", + "en-GB-ThomasNeural-Male", + "en-GB-SoniaNeural-Female", + "en-NZ-MitchellNeural-Male", + "en-GB-MaisieNeural-Female", "", "Adjusting volumes and mixing audio", ], @@ -1849,6 +1933,12 @@ def visible_component_subs(input_bool): tts_voice03, tts_voice04, tts_voice05, + tts_voice06, + tts_voice07, + tts_voice08, + tts_voice09, + tts_voice10, + tts_voice11, VIDEO_OUTPUT_NAME, AUDIO_MIX, audio_accelerate, @@ -2021,13 +2111,13 @@ def update_models(): f"fmodel{i:02d}": gr.update( choices=models_path ) - for i in range(7) + for i in range(MAX_TTS+1) } dict_index = { f"findex{i:02d}": gr.update( choices=index_path, value=None ) - for i in range(7) + for i in range(MAX_TTS+1) } dict_changes = {**dict_models, **dict_index} return [value for value in dict_changes.values()] @@ -2143,7 +2233,7 @@ def button_conf(tts_name): ) TTS_TABS = [ - 'TTS Speaker {}'.format(i) for i in range(1, 7) + 'TTS Speaker {:02d}'.format(i) for i in range(1, MAX_TTS+1) ] CV_SUBTITLES = [ @@ -2153,11 +2243,17 @@ def button_conf(tts_name): lg_conf["cv_tts4"], lg_conf["cv_tts5"], lg_conf["cv_tts6"], + lg_conf["cv_tts7"], + lg_conf["cv_tts8"], + lg_conf["cv_tts9"], + lg_conf["cv_tts10"], + lg_conf["cv_tts11"], + lg_conf["cv_tts12"], ] configs_storage = [] - for i in range(6): # Loop from 00 to 05 + for i in range(MAX_TTS): # Loop from 00 to 11 with 
gr.Accordion(CV_SUBTITLES[i], open=False): gr.Markdown(TTS_TABS[i]) with gr.Column(): @@ -2322,7 +2418,7 @@ def read_logs(): def update_tts_list(): update_dict = { f"tts_voice{i:02d}": gr.update(choices=SoniTr.tts_info.tts_list()) - for i in range(6) + for i in range(MAX_TTS) } update_dict["tts_documents"] = gr.update( choices=list( @@ -2355,6 +2451,12 @@ def update_tts_list(): tts_voice03, tts_voice04, tts_voice05, + tts_voice06, + tts_voice07, + tts_voice08, + tts_voice09, + tts_voice10, + tts_voice11, tts_documents, ], ) @@ -2381,6 +2483,12 @@ def update_tts_list(): tts_voice03, tts_voice04, tts_voice05, + tts_voice06, + tts_voice07, + tts_voice08, + tts_voice09, + tts_voice10, + tts_voice11, VIDEO_OUTPUT_NAME, AUDIO_MIX, audio_accelerate, @@ -2442,6 +2550,12 @@ def update_tts_list(): tts_voice03, tts_voice04, tts_voice05, + tts_voice06, + tts_voice07, + tts_voice08, + tts_voice09, + tts_voice10, + tts_voice11, VIDEO_OUTPUT_NAME, AUDIO_MIX, audio_accelerate, diff --git a/soni_translate/languages_gui.py b/soni_translate/languages_gui.py index 73b39fc..e9be02a 100644 --- a/soni_translate/languages_gui.py +++ b/soni_translate/languages_gui.py @@ -93,6 +93,12 @@ "sk4": "TTS Speaker 4", "sk5": "TTS Speaker 5", "sk6": "TTS Speaker 6", + "sk7": "TTS Speaker 7", + "sk8": "TTS Speaker 8", + "sk9": "TTS Speaker 9", + "sk10": "TTS Speaker 10", + "sk11": "TTS Speaker 11", + "sk12": "TTS Speaker 12", "vc_title": "Voice Imitation in Different Languages", "vc_subtitle": """ ### Replicate a person's voice across various languages. @@ -180,6 +186,12 @@ "cv_tts4": "Choose the voice to apply for Speaker 4.", "cv_tts5": "Choose the voice to apply for Speaker 5.", "cv_tts6": "Choose the voice to apply for Speaker 6.", + "cv_tts7": "Choose the voice to apply for Speaker 7.", + "cv_tts8": "Choose the voice to apply for Speaker 8.", + "cv_tts9": "Choose the voice to apply for Speaker 9.", + "cv_tts10": "Choose the voice to apply for Speaker 10.", + "cv_tts11": "Choose the voice to apply for Speaker 11.", + "cv_tts12": "Choose the voice to apply for Speaker 12.", "cv_aux": "- Voice to apply in case a Speaker is not detected successfully.", "cv_button_apply": "APPLY CONFIGURATION", "tab_help": "Help", @@ -255,6 +267,12 @@ "sk4": "TTS Hablante 4", "sk5": "TTS Hablante 5", "sk6": "TTS Hablante 6", + "sk7": "TTS Hablante 7", + "sk8": "TTS Hablante 8", + "sk9": "TTS Hablante 9", + "sk10": "TTS Hablante 10", + "sk11": "TTS Hablante 11", + "sk12": "TTS Hablante 12", "vc_title": "Imitación de voz en diferentes idiomas", "vc_subtitle": """ ### Replicar la voz de una persona en varios idiomas. 
@@ -342,6 +360,12 @@ "cv_tts4": "Voz a aplicar al TTS Hablante 4.", "cv_tts5": "Voz a aplicar al TTS Hablante 5.", "cv_tts6": "Voz a aplicar al TTS Hablante 6.", + "cv_tts7": "Voz a aplicar al TTS Hablante 7.", + "cv_tts8": "Voz a aplicar al TTS Hablante 8.", + "cv_tts9": "Voz a aplicar al TTS Hablante 9.", + "cv_tts10": "Voz a aplicar al TTS Hablante 10.", + "cv_tts11": "Voz a aplicar al TTS Hablante 11.", + "cv_tts12": "Voz a aplicar al TTS Hablante 12.", "cv_aux": "- Voz a aplicar en caso de que un hablante no sea detectado correctamente.", "cv_button_apply": "APLICAR CONFIGURACIÓN", "tab_help": "Ayuda", @@ -417,6 +441,12 @@ "sk4": "Locuteur TTS 4", "sk5": "Locuteur TTS 5", "sk6": "Locuteur TTS 6", + "sk7": "Locuteur TTS 7", + "sk8": "Locuteur TTS 8", + "sk9": "Locuteur TTS 9", + "sk10": "Locuteur TTS 10", + "sk11": "Locuteur TTS 11", + "sk12": "Locuteur TTS 12", "vc_title": "Imitation de voix dans différentes langues", "vc_subtitle": """ ### Répliquez la voix d'une personne dans différentes langues. @@ -504,6 +534,12 @@ "cv_tts4": "Choisissez la voix à appliquer pour le Locuteur 4.", "cv_tts5": "Choisissez la voix à appliquer pour le Locuteur 5.", "cv_tts6": "Choisissez la voix à appliquer pour le Locuteur 6.", + "cv_tts7": "Choisissez la voix à appliquer pour le Locuteur 7.", + "cv_tts8": "Choisissez la voix à appliquer pour le Locuteur 8.", + "cv_tts9": "Choisissez la voix à appliquer pour le Locuteur 9.", + "cv_tts10": "Choisissez la voix à appliquer pour le Locuteur 10.", + "cv_tts11": "Choisissez la voix à appliquer pour le Locuteur 11.", + "cv_tts12": "Choisissez la voix à appliquer pour le Locuteur 12.", "cv_aux": "- Voix à appliquer en cas de détection incorrecte d'un locuteur.", "cv_button_apply": "APPLIQUER LA CONFIGURATION", "tab_help": "Aide", @@ -579,6 +615,12 @@ "sk4": "TTS-Sprecher 4", "sk5": "TTS-Sprecher 5", "sk6": "TTS-Sprecher 6", + "sk7": "TTS-Sprecher 7", + "sk8": "TTS-Sprecher 8", + "sk9": "TTS-Sprecher 9", + "sk10": "TTS-Sprecher 10", + "sk11": "TTS-Sprecher 11", + "sk12": "TTS-Sprecher 12", "vc_title": "Stimmenimitation in verschiedenen Sprachen", "vc_subtitle": """ ### Reproduzieren Sie die Stimme einer Person in verschiedenen Sprachen. @@ -666,6 +708,12 @@ "cv_tts4": "Wählen Sie die Stimme für Sprecher 4 aus.", "cv_tts5": "Wählen Sie die Stimme für Sprecher 5 aus.", "cv_tts6": "Wählen Sie die Stimme für Sprecher 6 aus.", + "cv_tts7": "Wählen Sie die Stimme für Sprecher 7 aus.", + "cv_tts8": "Wählen Sie die Stimme für Sprecher 8 aus.", + "cv_tts9": "Wählen Sie die Stimme für Sprecher 9 aus.", + "cv_tts10": "Wählen Sie die Stimme für Sprecher 10 aus.", + "cv_tts11": "Wählen Sie die Stimme für Sprecher 11 aus.", + "cv_tts12": "Wählen Sie die Stimme für Sprecher 12 aus.", "cv_aux": "- Stimme, die angewendet wird, falls ein Sprecher nicht erfolgreich erkannt wird.", "cv_button_apply": "KONFIGURATION ANWENDEN", "tab_help": "Hilfe", @@ -741,6 +789,12 @@ "sk4": "Altoparlante TTS 4", "sk5": "Altoparlante TTS 5", "sk6": "Altoparlante TTS 6", + "sk7": "Altoparlante TTS 7", + "sk8": "Altoparlante TTS 8", + "sk9": "Altoparlante TTS 9", + "sk10": "Altoparlante TTS 10", + "sk11": "Altoparlante TTS 11", + "sk12": "Altoparlante TTS 12", "vc_title": "Imitazione della voce in diverse lingue", "vc_subtitle": """ ### Replica la voce di una persona in varie lingue. 
@@ -828,6 +882,12 @@ "cv_tts4": "Scegli la voce da applicare per l'Altoparlante 4.", "cv_tts5": "Scegli la voce da applicare per l'Altoparlante 5.", "cv_tts6": "Scegli la voce da applicare per l'Altoparlante 6.", + "cv_tts7": "Scegli la voce da applicare per l'Altoparlante 7.", + "cv_tts8": "Scegli la voce da applicare per l'Altoparlante 8.", + "cv_tts9": "Scegli la voce da applicare per l'Altoparlante 9.", + "cv_tts10": "Scegli la voce da applicare per l'Altoparlante 10.", + "cv_tts11": "Scegli la voce da applicare per l'Altoparlante 11.", + "cv_tts12": "Scegli la voce da applicare per l'Altoparlante 12.", "cv_aux": "- Voce da applicare nel caso in cui un altoparlante non venga rilevato correttamente.", "cv_button_apply": "APPLICA CONFIGURAZIONE", "tab_help": "Aiuto", @@ -903,6 +963,12 @@ "sk4": "TTSスピーカー4", "sk5": "TTSスピーカー5", "sk6": "TTSスピーカー6", + "sk7": "TTSスピーカー7", + "sk8": "TTSスピーカー8", + "sk9": "TTSスピーカー9", + "sk10": "TTSスピーカー10", + "sk11": "TTSスピーカー11", + "sk12": "TTSスピーカー12", "vc_title": "異なる言語での音声模倣", "vc_subtitle": """ ### さまざまな言語で人の声を再現します。 @@ -990,6 +1056,12 @@ "cv_tts4": "スピーカー4に適用する音声を選択してください。", "cv_tts5": "スピーカー5に適用する音声を選択してください。", "cv_tts6": "スピーカー6に適用する音声を選択してください。", + "cv_tts7": "スピーカー7に適用する音声を選択してください。", + "cv_tts8": "スピーカー8に適用する音声を選択してください。", + "cv_tts9": "スピーカー9に適用する音声を選択してください。", + "cv_tts10": "スピーカー10に適用する音声を選択してください。", + "cv_tts11": "スピーカー11に適用する音声を選択してください。", + "cv_tts12": "スピーカー12に適用する音声を選択してください。", "cv_aux": "- スピーカーが正常に検出されない場合に適用する音声。", "cv_button_apply": "設定を適用", "tab_help": "ヘルプ", @@ -1065,6 +1137,12 @@ "sk4": "TTS发言者 4", "sk5": "TTS发言者 5", "sk6": "TTS发言者 6", + "sk7": "TTS发言者 7", + "sk8": "TTS发言者 8", + "sk9": "TTS发言者 9", + "sk10": "TTS发言者 10", + "sk11": "TTS发言者 11", + "sk12": "TTS发言者 12", "vc_title": "不同语言的语音模仿", "vc_subtitle": """ ### 在各种语言中复制一个人的声音。 @@ -1152,6 +1230,12 @@ "cv_tts4": "选择要为发言者 4 应用的声音。", "cv_tts5": "选择要为发言者 5 应用的声音。", "cv_tts6": "选择要为发言者 6 应用的声音。", + "cv_tts7": "选择要为发言者 7 应用的声音。", + "cv_tts8": "选择要为发言者 8 应用的声音。", + "cv_tts9": "选择要为发言者 9 应用的声音。", + "cv_tts10": "选择要为发言者 10 应用的声音。", + "cv_tts11": "选择要为发言者 11 应用的声音。", + "cv_tts12": "选择要为发言者 12 应用的声音。", "cv_aux": "- 在某种原因下未成功检测到发言者时应用的声音。", "cv_button_apply": "应用配置", "tab_help": "帮助", @@ -1227,6 +1311,12 @@ "sk4": "Говорець TTS 4", "sk5": "Говорець TTS 5", "sk6": "Говорець TTS 6", + "sk7": "Говорець TTS 7", + "sk8": "Говорець TTS 8", + "sk9": "Говорець TTS 9", + "sk10": "Говорець TTS 10", + "sk11": "Говорець TTS 11", + "sk12": "Говорець TTS 12", "vc_title": "Імітація голосу на різних мовах", "vc_subtitle": """ ### Відтворення голосу людини на різних мовах. 
@@ -1314,6 +1404,12 @@ "cv_tts4": "Виберіть голос для застосування до говорця 4.", "cv_tts5": "Виберіть голос для застосування до говорця 5.", "cv_tts6": "Виберіть голос для застосування до говорця 6.", + "cv_tts7": "Виберіть голос для застосування до говорця 7.", + "cv_tts8": "Виберіть голос для застосування до говорця 8.", + "cv_tts9": "Виберіть голос для застосування до говорця 9.", + "cv_tts10": "Виберіть голос для застосування до говорця 10.", + "cv_tts11": "Виберіть голос для застосування до говорця 11.", + "cv_tts12": "Виберіть голос для застосування до говорця 12.", "cv_aux": "- Голос, який застосовується у разі невдалого розпізнавання говорця.", "cv_button_apply": "ЗАСТОСУВАТИ КОНФІГУРАЦІЮ", "tab_help": "Довідка", @@ -1389,6 +1485,12 @@ "sk4": "متحدث TTS 4", "sk5": "متحدث TTS 5", "sk6": "متحدث TTS 6", + "sk7": "متحدث TTS 7", + "sk8": "متحدث TTS 8", + "sk9": "متحدث TTS 9", + "sk10": "متحدث TTS 10", + "sk11": "متحدث TTS 11", + "sk12": "متحدث TTS 12", "vc_title": "تقليد صوت في لغات مختلفة", "vc_subtitle": """ ### استنساخ صوت الشخص عبر لغات متعددة. @@ -1476,6 +1578,12 @@ "cv_tts4": "اختر الصوت المراد تطبيقه على المتحدث 4.", "cv_tts5": "اختر الصوت المراد تطبيقه على المتحدث 5.", "cv_tts6": "اختر الصوت المراد تطبيقه على المتحدث 6.", + "cv_tts7": "اختر الصوت المراد تطبيقه على المتحدث 7.", + "cv_tts8": "اختر الصوت المراد تطبيقه على المتحدث 8.", + "cv_tts9": "اختر الصوت المراد تطبيقه على المتحدث 9.", + "cv_tts10": "اختر الصوت المراد تطبيقه على المتحدث 10.", + "cv_tts11": "اختر الصوت المراد تطبيقه على المتحدث 11.", + "cv_tts12": "اختر الصوت المراد تطبيقه على المتحدث 12.", "cv_aux": "- الصوت المراد تطبيقه في حالة عدم اكتشاف المتحدث بنجاح.", "cv_button_apply": "تطبيق التكوين", "tab_help": "مساعدة", @@ -1551,6 +1659,12 @@ "sk4": "Говорящий 4 (TTS)", "sk5": "Говорящий 5 (TTS)", "sk6": "Говорящий 6 (TTS)", + "sk7": "Говорящий 7 (TTS)", + "sk8": "Говорящий 8 (TTS)", + "sk9": "Говорящий 9 (TTS)", + "sk10": "Говорящий 10 (TTS)", + "sk11": "Говорящий 11 (TTS)", + "sk12": "Говорящий 12 (TTS)", "vc_title": "Имитация голоса на разных языках", "vc_subtitle": """ ### Воспроизведение голоса человека на разных языках. @@ -1638,6 +1752,12 @@ "cv_tts4": "Выберите голос для применения для Говорящего 4.", "cv_tts5": "Выберите голос для применения для Говорящего 5.", "cv_tts6": "Выберите голос для применения для Говорящего 6.", + "cv_tts7": "Выберите голос для применения для Говорящего 7.", + "cv_tts8": "Выберите голос для применения для Говорящего 8.", + "cv_tts9": "Выберите голос для применения для Говорящего 9.", + "cv_tts10": "Выберите голос для применения для Говорящего 10.", + "cv_tts11": "Выберите голос для применения для Говорящего 11.", + "cv_tts12": "Выберите голос для применения для Говорящего 12.", "cv_aux": "- Голос, который будет применен в случае успешного неопределения говорящего.", "cv_button_apply": "ПРИМЕНИТЬ КОНФИГУРАЦИЮ", "tab_help": "Помощь", @@ -1713,6 +1833,12 @@ "sk4": "TTS Konuşmacı 4", "sk5": "TTS Konuşmacı 5", "sk6": "TTS Konuşmacı 6", + "sk7": "TTS Konuşmacı 7", + "sk8": "TTS Konuşmacı 8", + "sk9": "TTS Konuşmacı 9", + "sk10": "TTS Konuşmacı 10", + "sk11": "TTS Konuşmacı 11", + "sk12": "TTS Konuşmacı 12", "vc_title": "Farklı Dillerde Ses Taklidi", "vc_subtitle": """ ### Bir kişinin sesini çeşitli dillere yayın. 
@@ -1801,6 +1927,12 @@ "cv_tts4": "Konuşmacı 4 için uygulanacak sesi seçin.", "cv_tts5": "Konuşmacı 5 için uygulanacak sesi seçin.", "cv_tts6": "Konuşmacı 6 için uygulanacak sesi seçin.", + "cv_tts7": "Konuşmacı 7 için uygulanacak sesi seçin.", + "cv_tts8": "Konuşmacı 8 için uygulanacak sesi seçin.", + "cv_tts9": "Konuşmacı 9 için uygulanacak sesi seçin.", + "cv_tts10": "Konuşmacı 10 için uygulanacak sesi seçin.", + "cv_tts11": "Konuşmacı 11 için uygulanacak sesi seçin.", + "cv_tts12": "Konuşmacı 12 için uygulanacak sesi seçin.", "cv_aux": "- Konuşmacı doğru şekilde algılanamadığında uygulanacak ses.", "cv_button_apply": "AYARLARI UYGULA", "tab_help": "Yardım", @@ -1876,6 +2008,12 @@ "sk4": "Pembicara TTS 4", "sk5": "Pembicara TTS 5", "sk6": "Pembicara TTS 6", + "sk7": "Pembicara TTS 7", + "sk8": "Pembicara TTS 8", + "sk9": "Pembicara TTS 9", + "sk10": "Pembicara TTS 10", + "sk11": "Pembicara TTS 11", + "sk12": "Pembicara TTS 12", "vc_title": "Imitasi Suara dalam Berbagai Bahasa", "vc_subtitle": """ ### Reproduksi suara seseorang di berbagai bahasa. @@ -1963,6 +2101,12 @@ "cv_tts4": "Pilih suara yang akan diterapkan untuk Pembicara 4.", "cv_tts5": "Pilih suara yang akan diterapkan untuk Pembicara 5.", "cv_tts6": "Pilih suara yang akan diterapkan untuk Pembicara 6.", + "cv_tts7": "Pilih suara yang akan diterapkan untuk Pembicara 7.", + "cv_tts8": "Pilih suara yang akan diterapkan untuk Pembicara 8.", + "cv_tts9": "Pilih suara yang akan diterapkan untuk Pembicara 9.", + "cv_tts10": "Pilih suara yang akan diterapkan untuk Pembicara 10.", + "cv_tts11": "Pilih suara yang akan diterapkan untuk Pembicara 11.", + "cv_tts12": "Pilih suara yang akan diterapkan untuk Pembicara 12.", "cv_aux": "- Suara yang akan diterapkan jika Pembicara tidak terdeteksi dengan sukses.", "cv_button_apply": "TERAPKAN KONFIGURASI", "tab_help": "Bantuan", @@ -2038,6 +2182,12 @@ "sk4": "Falante TTS 4", "sk5": "Falante TTS 5", "sk6": "Falante TTS 6", + "sk7": "Falante TTS 7", + "sk8": "Falante TTS 8", + "sk9": "Falante TTS 9", + "sk10": "Falante TTS 10", + "sk11": "Falante TTS 11", + "sk12": "Falante TTS 12", "vc_title": "Imitação de Voz em Diferentes Idiomas", "vc_subtitle": """ ### Reproduza a voz de uma pessoa em vários idiomas. 
@@ -2125,6 +2275,12 @@ "cv_tts4": "Escolha a voz para aplicar ao Falante 4.", "cv_tts5": "Escolha a voz para aplicar ao Falante 5.", "cv_tts6": "Escolha a voz para aplicar ao Falante 6.", + "cv_tts7": "Escolha a voz para aplicar ao Falante 7.", + "cv_tts8": "Escolha a voz para aplicar ao Falante 8.", + "cv_tts9": "Escolha a voz para aplicar ao Falante 9.", + "cv_tts10": "Escolha a voz para aplicar ao Falante 10.", + "cv_tts11": "Escolha a voz para aplicar ao Falante 11.", + "cv_tts12": "Escolha a voz para aplicar ao Falante 12.", "cv_aux": "- Voz para aplicar caso um Falante não seja detectado com sucesso.", "cv_button_apply": "APLICAR CONFIGURAÇÃO", "tab_help": "Ajuda", @@ -2200,6 +2356,12 @@ "sk4": "TTS बोलने वाला 4", "sk5": "TTS बोलने वाला 5", "sk6": "TTS बोलने वाला 6", + "sk7": "TTS बोलने वाला 7", + "sk8": "TTS बोलने वाला 8", + "sk9": "TTS बोलने वाला 9", + "sk10": "TTS बोलने वाला 10", + "sk11": "TTS बोलने वाला 11", + "sk12": "TTS बोलने वाला 12", "vc_title": "विभिन्न भाषाओं में आवाज़ का नकल", "vc_subtitle": """ ### विभिन्न भाषाओं में एक व्यक्ति की आवाज़ का नकल। @@ -2287,6 +2449,12 @@ "cv_tts4": "बोलने वाले 4 के लिए लागू करने के लिए आवाज़ चुनें।", "cv_tts5": "बोलने वाले 5 के लिए लागू करने के लिए आवाज़ चुनें।", "cv_tts6": "बोलने वाले 6 के लिए लागू करने के लिए आवाज़ चुनें।", + "cv_tts7": "बोलने वाले 7 के लिए लागू करने के लिए आवाज़ चुनें।", + "cv_tts8": "बोलने वाले 8 के लिए लागू करने के लिए आवाज़ चुनें।", + "cv_tts9": "बोलने वाले 9 के लिए लागू करने के लिए आवाज़ चुनें।", + "cv_tts10": "बोलने वाले 10 के लिए लागू करने के लिए आवाज़ चुनें।", + "cv_tts11": "बोलने वाले 11 के लिए लागू करने के लिए आवाज़ चुनें।", + "cv_tts12": "बोलने वाले 12 के लिए लागू करने के लिए आवाज़ चुनें।", "cv_aux": "- यदि किसी कारणवश स्पीकर सही ढंग से पहचाना नहीं गया है, तो लागू करने के लिए आवाज़।", "cv_button_apply": "आवेदन को लागू करें", "tab_help": "सहायता", @@ -2362,6 +2530,12 @@ "sk4": "Người Nói TTS 4", "sk5": "Người Nói TTS 5", "sk6": "Người Nói TTS 6", + "sk7": "Người Nói TTS 7", + "sk8": "Người Nói TTS 8", + "sk9": "Người Nói TTS 9", + "sk10": "Người Nói TTS 10", + "sk11": "Người Nói TTS 11", + "sk12": "Người Nói TTS 12", "vc_title": "Sao chép giọng nói trong các ngôn ngữ khác nhau", "vc_subtitle": """ ### Sao chép giọng nói của một người qua các ngôn ngữ khác nhau. @@ -2449,6 +2623,12 @@ "cv_tts4": "Chọn giọng nói áp dụng cho Người Nói 4.", "cv_tts5": "Chọn giọng nói áp dụng cho Người Nói 5.", "cv_tts6": "Chọn giọng nói áp dụng cho Người Nói 6.", + "cv_tts7": "Chọn giọng nói áp dụng cho Người Nói 7.", + "cv_tts8": "Chọn giọng nói áp dụng cho Người Nói 8.", + "cv_tts9": "Chọn giọng nói áp dụng cho Người Nói 9.", + "cv_tts10": "Chọn giọng nói áp dụng cho Người Nói 10.", + "cv_tts11": "Chọn giọng nói áp dụng cho Người Nói 11.", + "cv_tts12": "Chọn giọng nói áp dụng cho Người Nói 12.", "cv_aux": "- Giọng nói được áp dụng trong trường hợp không nhận diện được người nói thành công.", "cv_button_apply": "ÁP DỤNG CẤU HÌNH", "tab_help": "Trợ giúp", @@ -2524,6 +2704,12 @@ "sk4": "Głos TTS Mówca 4", "sk5": "Głos TTS Mówca 5", "sk6": "Głos TTS Mówca 6", + "sk7": "Głos TTS Mówca 7", + "sk8": "Głos TTS Mówca 8", + "sk9": "Głos TTS Mówca 9", + "sk10": "Głos TTS Mówca 10", + "sk11": "Głos TTS Mówca 11", + "sk12": "Głos TTS Mówca 12", "vc_title": "Imitacja głosu w różnych językach", "vc_subtitle": """ ### Odtwórz głos osoby w różnych językach. 
@@ -2611,6 +2797,12 @@ "cv_tts4": "Wybierz głos, który ma być stosowany dla Mówcy 4.", "cv_tts5": "Wybierz głos, który ma być stosowany dla Mówcy 5.", "cv_tts6": "Wybierz głos, który ma być stosowany dla Mówcy 6.", + "cv_tts7": "Wybierz głos, który ma być stosowany dla Mówcy 7.", + "cv_tts8": "Wybierz głos, który ma być stosowany dla Mówcy 8.", + "cv_tts9": "Wybierz głos, który ma być stosowany dla Mówcy 9.", + "cv_tts10": "Wybierz głos, który ma być stosowany dla Mówcy 10.", + "cv_tts11": "Wybierz głos, który ma być stosowany dla Mówcy 11.", + "cv_tts12": "Wybierz głos, który ma być stosowany dla Mówcy 12.", "cv_aux": "- Głos do zastosowania w przypadku niepowodzenia wykrycia Mówcy.", "cv_button_apply": "ZASTOSUJ KONFIGURACJĘ", "tab_help": "Pomoc", @@ -2686,6 +2878,12 @@ "sk4": "TTS Högtalare 4", "sk5": "TTS Högtalare 5", "sk6": "TTS Högtalare 6", + "sk7": "TTS Högtalare 7", + "sk8": "TTS Högtalare 8", + "sk9": "TTS Högtalare 9", + "sk10": "TTS Högtalare 10", + "sk11": "TTS Högtalare 11", + "sk12": "TTS Högtalare 12", "vc_title": "Röstimitation på olika språk", "vc_subtitle": """ ### Replicera en persons röst över olika språk. @@ -2773,6 +2971,12 @@ "cv_tts4": "Välj röst att tillämpa för Högtalare 4.", "cv_tts5": "Välj röst att tillämpa för Högtalare 5.", "cv_tts6": "Välj röst att tillämpa för Högtalare 6.", + "cv_tts7": "Välj röst att tillämpa för Högtalare 7.", + "cv_tts8": "Välj röst att tillämpa för Högtalare 8.", + "cv_tts9": "Välj röst att tillämpa för Högtalare 9.", + "cv_tts10": "Välj röst att tillämpa för Högtalare 10.", + "cv_tts11": "Välj röst att tillämpa för Högtalare 11.", + "cv_tts12": "Välj röst att tillämpa för Högtalare 12.", "cv_aux": "- Röst att tillämpa om en högtalare inte upptäcks framgångsrikt.", "cv_button_apply": "TILLÄMPA KONFIGURATION", "tab_help": "Hjälp", @@ -2848,6 +3052,12 @@ "sk4": "TTS 스피커 4", "sk5": "TTS 스피커 5", "sk6": "TTS 스피커 6", + "sk7": "TTS 스피커 7", + "sk8": "TTS 스피커 8", + "sk9": "TTS 스피커 9", + "sk10": "TTS 스피커 10", + "sk11": "TTS 스피커 11", + "sk12": "TTS 스피커 12", "vc_title": "다른 언어에서 음성 모방", "vc_subtitle": """ ### 여러 언어로 사람의 음성을 복제합니다. @@ -2935,6 +3145,12 @@ "cv_tts4": "스피커 4에 적용할 음성을 선택하세요.", "cv_tts5": "스피커 5에 적용할 음성을 선택하세요.", "cv_tts6": "스피커 6에 적용할 음성을 선택하세요.", + "cv_tts7": "스피커 7에 적용할 음성을 선택하세요.", + "cv_tts8": "스피커 8에 적용할 음성을 선택하세요.", + "cv_tts9": "스피커 9에 적용할 음성을 선택하세요.", + "cv_tts10": "스피커 10에 적용할 음성을 선택하세요.", + "cv_tts11": "스피커 11에 적용할 음성을 선택하세요.", + "cv_tts12": "스피커 12에 적용할 음성을 선택하세요.", "cv_aux": "- 스피커가 올바르게 감지되지 않은 경우 적용할 음성.", "cv_button_apply": "구성 적용", "tab_help": "도움말", @@ -3010,6 +3226,12 @@ "sk4": "TTS वक्त्य 4", "sk5": "TTS वक्त्य 5", "sk6": "TTS वक्त्य 6", + "sk7": "TTS वक्त्य 7", + "sk8": "TTS वक्त्य 8", + "sk9": "TTS वक्त्य 9", + "sk10": "TTS वक्त्य 10", + "sk11": "TTS वक्त्य 11", + "sk12": "TTS वक्त्य 12", "vc_title": "विविध भाषांमध्ये आवाज नक्कल", "vc_subtitle": """ ### विविध भाषांमध्ये व्यक्तीचा आवाज पुनर्निर्मित करा. 
@@ -3095,6 +3317,12 @@ "cv_tts4": "स्पीकर 4 साठी लागू करण्यासाठी आवाज निवडा.", "cv_tts5": "स्पीकर 5 साठी लागू करण्यासाठी आवाज निवडा.", "cv_tts6": "स्पीकर 6 साठी लागू करण्यासाठी आवाज निवडा.", + "cv_tts7": "स्पीकर 7 साठी लागू करण्यासाठी आवाज निवडा.", + "cv_tts8": "स्पीकर 8 साठी लागू करण्यासाठी आवाज निवडा.", + "cv_tts9": "स्पीकर 9 साठी लागू करण्यासाठी आवाज निवडा.", + "cv_tts10": "स्पीकर 10 साठी लागू करण्यासाठी आवाज निवडा.", + "cv_tts11": "स्पीकर 11 साठी लागू करण्यासाठी आवाज निवडा.", + "cv_tts12": "स्पीकर 12 साठी लागू करण्यासाठी आवाज निवडा.", "cv_aux": "- जर कारणाने वक्ता सही ओळखले जात नाही तर लागू करण्यासाठी आवाज.", "cv_button_apply": "सेटिंग्ज लागू करा", "tab_help": "मदत", @@ -3170,6 +3398,12 @@ "sk4": "TTS Səsçi 4", "sk5": "TTS Səsçi 5", "sk6": "TTS Səsçi 6", + "sk7": "TTS Səsçi 7", + "sk8": "TTS Səsçi 8", + "sk9": "TTS Səsçi 9", + "sk10": "TTS Səsçi 10", + "sk11": "TTS Səsçi 11", + "sk12": "TTS Səsçi 12", "vc_title": "Fərqli dillərdə Səs İmələsi", "vc_subtitle": """ ### Bir insanın səsini müxtəlif dillərdə çoğaldın. @@ -3257,6 +3491,12 @@ "cv_tts4": "4-cü Səsçi üçün tətbiq olunacaq səsi seçin.", "cv_tts5": "5-ci Səsçi üçün tətbiq olunacaq səsi seçin.", "cv_tts6": "6-cı Səsçi üçün tətbiq olunacaq səsi seçin.", + "cv_tts7": "7-ci Səsçi üçün tətbiq olunacaq səsi seçin.", + "cv_tts8": "8-ci Səsçi üçün tətbiq olunacaq səsi seçin.", + "cv_tts9": "9-cu Səsçi üçün tətbiq olunacaq səsi seçin.", + "cv_tts10": "10-cu Səsçi üçün tətbiq olunacaq səsi seçin.", + "cv_tts11": "11-ci Səsçi üçün tətbiq olunacaq səsi seçin.", + "cv_tts12": "12-ci Səsçi üçün tətbiq olunacaq səsi seçin.", "cv_aux": "- Səsçi doğru şəkildə aşkar edilmirsə tətbiq ediləcək səs.", "cv_button_apply": "KONFiQURASiYANI TƏTBiQ EDiN", "tab_help": "Kömək", diff --git a/soni_translate/text_to_speech.py b/soni_translate/text_to_speech.py index 9b2e967..118b50c 100644 --- a/soni_translate/text_to_speech.py +++ b/soni_translate/text_to_speech.py @@ -968,6 +968,12 @@ def audio_segmentation_to_voice( tts_voice03, tts_voice04, tts_voice05, + tts_voice06, + tts_voice07, + tts_voice08, + tts_voice09, + tts_voice10, + tts_voice11, dereverb_automatic=True, model_id_bark="suno/bark-small", model_id_coqui="tts_models/multilingual/multi-dataset/xtts_v2", @@ -984,6 +990,12 @@ def audio_segmentation_to_voice( "SPEAKER_03": tts_voice03, "SPEAKER_04": tts_voice04, "SPEAKER_05": tts_voice05, + "SPEAKER_06": tts_voice06, + "SPEAKER_07": tts_voice07, + "SPEAKER_08": tts_voice08, + "SPEAKER_09": tts_voice09, + "SPEAKER_10": tts_voice10, + "SPEAKER_11": tts_voice11, } # Assign 'SPEAKER_00' to segments without a 'speaker' key @@ -1174,7 +1186,7 @@ def accelerate_segments( ) audio_files.append(f"{folder_output}/{filename}") - speaker = "TTS Speaker " + str(int(speaker[-1]) + 1) + speaker = "TTS Speaker {:02d}".format(int(speaker[-2:]) + 1) speakers_list.append(speaker) return audio_files, speakers_list diff --git a/soni_translate/translate_segments.py b/soni_translate/translate_segments.py index 52bea8f..0ee87db 100644 --- a/soni_translate/translate_segments.py +++ b/soni_translate/translate_segments.py @@ -336,19 +336,19 @@ def gpt_batch(segments, model, target, token_batch_limit=900, source=None): fixed_target = fix_code_language(target) fixed_source = fix_code_language(source) if source else "auto" - name_speaker = "ABCDEF" + name_speaker = "ABCDEFGHIJKL" translated_lines = [] text_data_dict = [] num_tokens = 0 - count_sk = {char: 0 for char in "ABCDEF"} + count_sk = {char: 0 for char in "ABCDEFGHIJKL"} for i, line in enumerate(segments_copy): text = 
line["text"] speaker = line["speaker"] last_start = line["start"] # text_data_dict.append({str(int(speaker[-1])+1): text}) - index_sk = int(speaker[-1]) + index_sk = int(speaker[-2:]) character_sk = name_speaker[index_sk] count_sk[character_sk] += 1 code_sk = character_sk+str(count_sk[character_sk]) @@ -361,7 +361,7 @@ def gpt_batch(segments, model, target, token_batch_limit=900, source=None): # Reset vars num_tokens = 0 text_data_dict = [] - count_sk = {char: 0 for char in "ABCDEF"} + count_sk = {char: 0 for char in "ABCDEFGHIJKL"} # Process translation # https://arxiv.org/pdf/2309.03409.pdf system_prompt = f"Machine translation designed to output the translated_conversation key JSON containing a list of {batch_lines} items." From 36450b681e62c899ae08378a50ac70801d340242 Mon Sep 17 00:00:00 2001 From: Roger Condori <114810545+R3gm@users.noreply.github.com> Date: Sat, 4 May 2024 14:38:00 +0000 Subject: [PATCH 29/36] Addefeat(voice imitation): d OpenVoiceV2 --- app_rvc.py | 5 +++-- soni_translate/text_to_speech.py | 20 ++++++++++++++++---- 2 files changed, 19 insertions(+), 6 deletions(-) diff --git a/app_rvc.py b/app_rvc.py index 566b497..6958aad 100644 --- a/app_rvc.py +++ b/app_rvc.py @@ -1475,10 +1475,11 @@ def submit(value): label=lg_conf["vc_active_label"], info=lg_conf["vc_active_info"], ) + openvoice_models = ["openvoice", "openvoice_v2"] voice_imitation_method_options = ( - ["freevc", "openvoice"] + ["freevc"] + openvoice_models if SoniTr.tts_info.xtts_enabled - else ["openvoice"] + else openvoice_models ) voice_imitation_method_gui = gr.Dropdown( voice_imitation_method_options, diff --git a/soni_translate/text_to_speech.py b/soni_translate/text_to_speech.py index 118b50c..f55f281 100644 --- a/soni_translate/text_to_speech.py +++ b/soni_translate/text_to_speech.py @@ -22,6 +22,7 @@ import soundfile as sf import platform import logging +import traceback from .logging_setup import logger @@ -42,6 +43,7 @@ def verify_saved_file_and_size(filename): def error_handling_in_tts(error, segment, TRANSLATE_AUDIO_TO, filename): + traceback.print_exc() logger.error(f"Error: {str(error)}") try: from tempfile import TemporaryFile @@ -1327,6 +1329,7 @@ def toneconverter_openvoice( preprocessor_max_segments, remove_previous_process=True, get_vocals_dereverb=False, + model="openvoice", ): audio_path = "audio.wav" # se_path = "se.pth" @@ -1359,16 +1362,24 @@ def toneconverter_openvoice( ) logger.info("Openvoice loading model...") - model_path_openvoice = "./OPENVOICE_MODELS" url_model_openvoice = "https://huggingface.co/myshell-ai/OpenVoice/resolve/main/checkpoints/converter" + if "v2" in model: + model_path = os.path.join(model_path_openvoice, "v2") + url_model_openvoice = url_model_openvoice.replace( + "OpenVoice", "OpenVoiceV2" + ).replace("checkpoints/", "") + else: + model_path = os.path.join(model_path_openvoice, "v1") + create_directories(model_path) + config_url = f"{url_model_openvoice}/config.json" checkpoint_url = f"{url_model_openvoice}/checkpoint.pth" - config_path = download_manager(url=config_url, path=model_path_openvoice) + config_path = download_manager(url=config_url, path=model_path) checkpoint_path = download_manager( - url=checkpoint_url, path=model_path_openvoice + url=checkpoint_url, path=model_path ) device = os.environ.get("SONITR_DEVICE") @@ -1533,12 +1544,13 @@ def toneconverter( remove_previous_process=remove_previous_process, get_vocals_dereverb=get_vocals_dereverb, ) - elif method_vc == "openvoice": + elif "openvoice" in method_vc: return toneconverter_openvoice( 
result_diarize, preprocessor_max_segments, remove_previous_process=remove_previous_process, get_vocals_dereverb=get_vocals_dereverb, + model=method_vc, ) From 53e3043fa4aafc165a9150e9a4edabfe3dd22142 Mon Sep 17 00:00:00 2001 From: Roger Condori <114810545+R3gm@users.noreply.github.com> Date: Sat, 4 May 2024 14:43:30 +0000 Subject: [PATCH 30/36] fix(tts): xtts output list to numpy array #42 --- soni_translate/text_to_speech.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/soni_translate/text_to_speech.py b/soni_translate/text_to_speech.py index f55f281..4275f1b 100644 --- a/soni_translate/text_to_speech.py +++ b/soni_translate/text_to_speech.py @@ -82,6 +82,9 @@ def error_handling_in_tts(error, segment, TRANSLATE_AUDIO_TO, filename): def pad_array(array, sr): + if isinstance(array, list): + array = np.array(array) + if not array.shape[0]: raise ValueError("The generated audio does not contain any data") From f64a79d4768ce6bb60a1943aa0617ffb085aa2c8 Mon Sep 17 00:00:00 2001 From: Roger Condori <114810545+R3gm@users.noreply.github.com> Date: Mon, 6 May 2024 22:14:19 +0000 Subject: [PATCH 31/36] fix: reencode speakers #42 skip aling vi lang #18 --- app_rvc.py | 201 +++++++++++--------------- soni_translate/audio_segments.py | 2 +- soni_translate/postprocessor.py | 23 ++- soni_translate/speech_segmentation.py | 26 +++- 4 files changed, 129 insertions(+), 123 deletions(-) diff --git a/app_rvc.py b/app_rvc.py index 6958aad..5d2c7b2 100644 --- a/app_rvc.py +++ b/app_rvc.py @@ -377,7 +377,7 @@ def multilingual_media_conversion( preview=False, transcriber_model="large-v3", batch_size=4, - compute_type="float16", + compute_type="auto", origin_language="Automatic detection", target_language="English (en)", min_speakers=1, @@ -719,11 +719,19 @@ def multilingual_media_conversion( ): prog_disp("Aligning...", 0.45, is_gui, progress=progress) try: - self.result = align_speech(audio, self.result) - logger.debug( - "Align complete, " - f"segments count {len(self.result['segments'])}" - ) + if self.align_language in ["vi"]: + logger.info( + "Deficient alignment for the " + f"{self.align_language} language, skipping the" + " process. It is suggested to reduce the " + "duration of the segments as an alternative." 
+ ) + else: + self.result = align_speech(audio, self.result) + logger.debug( + "Align complete, " + f"segments count {len(self.result['segments'])}" + ) except Exception as error: logger.error(str(error)) @@ -834,7 +842,9 @@ def multilingual_media_conversion( ], { "result_diarize": self.result_diarize }): - if output_format_subtitle != "ass": + if output_format_subtitle == "disable": + self.sub_file = "sub_tra.srt" + elif output_format_subtitle != "ass": self.sub_file = process_subtitles( self.result_source_lang, self.align_language, @@ -842,6 +852,8 @@ def multilingual_media_conversion( output_format_subtitle, TRANSLATE_AUDIO_TO, ) + + # Need task if output_format_subtitle != "srt": _ = process_subtitles( self.result_source_lang, @@ -850,6 +862,7 @@ def multilingual_media_conversion( "srt", TRANSLATE_AUDIO_TO, ) + if output_format_subtitle == "ass": convert_ori = "ffmpeg -i sub_ori.srt sub_ori.ass -y" convert_tra = "ffmpeg -i sub_tra.srt sub_tra.ass -y" @@ -857,23 +870,41 @@ def multilingual_media_conversion( run_command(convert_ori) run_command(convert_tra) + format_sub = ( + output_format_subtitle + if output_format_subtitle != "disable" + else "srt" + ) + if output_type == "subtitle": - output = media_out( + + out_subs = [] + tra_subs = media_out( media_file, TRANSLATE_AUDIO_TO, video_output_name, - output_format_subtitle, + format_sub, file_obj=self.sub_file, ) - logger.info(f"Done: {output}") - return output + out_subs.append(tra_subs) + + ori_subs = media_out( + media_file, + self.align_language, + video_output_name, + format_sub, + file_obj=f"sub_ori.{format_sub}", + ) + out_subs.append(ori_subs) + logger.info(f"Done: {out_subs}") + return out_subs if output_type == "subtitle [by speaker]": output = get_subtitle_speaker( media_file, result=self.result_diarize, language=TRANSLATE_AUDIO_TO, - extension=output_format_subtitle, + extension=format_sub, base_name=video_output_name, ) logger.info(f"Done: {str(output)}") @@ -889,8 +920,10 @@ def multilingual_media_conversion( ), file_obj=base_audio_wav if is_audio_file(media_file) else base_video_file, soft_subtitles=False if is_audio_file(media_file) else True, + subtitle_files=output_format_subtitle, ) - logger.info(f"Done: {output}") + msg_out = output[0] if isinstance(output, list) else output + logger.info(f"Done: {msg_out}") return output if not self.task_in_cache("tts", [ @@ -1061,8 +1094,10 @@ def multilingual_media_conversion( "ogg" if "ogg" in output_type else "mp3" ), file_obj=mix_audio_file, + subtitle_files=output_format_subtitle, ) - logger.info(f"Done: {output}") + msg_out = output[0] if isinstance(output, list) else output + logger.info(f"Done: {msg_out}") return output hash_base_video_file = get_hash(base_video_file) @@ -1076,7 +1111,7 @@ def multilingual_media_conversion( try: logger.info("Burn subtitles") remove_files(vid_subs) - command = f"ffmpeg -i {base_video_file} -y -vf subtitles=sub_tra.srt {vid_subs}" + command = f"ffmpeg -i {base_video_file} -y -vf subtitles=sub_tra.srt -max_muxing_queue_size 9999 {vid_subs}" run_command(command) base_video_file = vid_subs self.burn_subs_id = hashvideo_text @@ -1103,8 +1138,10 @@ def multilingual_media_conversion( "mkv" if "mkv" in output_type else "mp4", file_obj=video_output_file, soft_subtitles=soft_subtitles_to_video, + subtitle_files=output_format_subtitle, ) - logger.info(f"Done: {output}") + msg_out = output[0] if isinstance(output, list) else output + logger.info(f"Done: {msg_out}") return output @@ -1116,7 +1153,7 @@ def multilingual_docs_conversion( 
origin_language="English (en)", target_language="English (en)", tts_voice00="en-AU-WilliamNeural-Male", - name_final_file="sample", + name_final_file="", translate_process="google_translator", output_type="audio", chunk_size=None, @@ -1154,11 +1191,7 @@ def multilingual_docs_conversion( raise Exception("No data found") # audio_wav = "audio.wav" - final_wav_file = ( - "audio_book.wav" - if not name_final_file - else f"{name_final_file}.wav" - ) + final_wav_file = "audio_book.wav" prog_disp("Processing text...", 0.15, is_gui, progress=progress) result_file_path, result_text = document_preprocessor( @@ -1253,9 +1286,19 @@ def multilingual_docs_conversion( result_diarize, audio_files, final_wav_file, True ) - logger.info(f"Done: {final_wav_file}") + output = media_out( + result_file_path if is_string else document, + TRANSLATE_AUDIO_TO, + name_final_file, + "mp3" if "mp3" in output_type else ( + "ogg" if "ogg" in output_type else "wav" + ), + file_obj=final_wav_file, + ) + + logger.info(f"Done: {output}") - return final_wav_file + return output title = "
📽️ SoniTranslate 🈷️
" @@ -1363,84 +1406,84 @@ def submit(value): tts_voice00 = gr.Dropdown( SoniTr.tts_info.tts_list(), - value="en-AU-WilliamNeural-Male", + value="en-US-EmmaMultilingualNeural-Female", label=lg_conf["sk1"], visible=True, interactive=True, ) tts_voice01 = gr.Dropdown( SoniTr.tts_info.tts_list(), - value="en-CA-ClaraNeural-Female", + value="en-US-AndrewMultilingualNeural-Male", label=lg_conf["sk2"], visible=True, interactive=True, ) tts_voice02 = gr.Dropdown( SoniTr.tts_info.tts_list(), - value="en-GB-ThomasNeural-Male", + value="en-US-AvaMultilingualNeural-Female", label=lg_conf["sk3"], visible=False, interactive=True, ) tts_voice03 = gr.Dropdown( SoniTr.tts_info.tts_list(), - value="en-GB-SoniaNeural-Female", + value="en-US-BrianMultilingualNeural-Male", label=lg_conf["sk4"], visible=False, interactive=True, ) tts_voice04 = gr.Dropdown( SoniTr.tts_info.tts_list(), - value="en-NZ-MitchellNeural-Male", + value="de-DE-SeraphinaMultilingualNeural-Female", label=lg_conf["sk4"], visible=False, interactive=True, ) tts_voice05 = gr.Dropdown( SoniTr.tts_info.tts_list(), - value="en-GB-MaisieNeural-Female", + value="de-DE-FlorianMultilingualNeural-Male", label=lg_conf["sk6"], visible=False, interactive=True, ) tts_voice06 = gr.Dropdown( SoniTr.tts_info.tts_list(), - value="en-GB-MaisieNeural-Female", + value="fr-FR-VivienneMultilingualNeural-Female", label=lg_conf["sk7"], visible=False, interactive=True, ) tts_voice07 = gr.Dropdown( SoniTr.tts_info.tts_list(), - value="en-GB-MaisieNeural-Female", + value="fr-FR-RemyMultilingualNeural-Male", label=lg_conf["sk8"], visible=False, interactive=True, ) tts_voice08 = gr.Dropdown( SoniTr.tts_info.tts_list(), - value="en-GB-MaisieNeural-Female", + value="en-US-EmmaMultilingualNeural-Female", label=lg_conf["sk9"], visible=False, interactive=True, ) tts_voice09 = gr.Dropdown( SoniTr.tts_info.tts_list(), - value="en-GB-MaisieNeural-Female", + value="en-US-AndrewMultilingualNeural-Male", label=lg_conf["sk10"], visible=False, interactive=True, ) tts_voice10 = gr.Dropdown( SoniTr.tts_info.tts_list(), - value="en-GB-MaisieNeural-Female", + value="en-US-EmmaMultilingualNeural-Female", label=lg_conf["sk11"], visible=False, interactive=True, ) tts_voice11 = gr.Dropdown( SoniTr.tts_info.tts_list(), - value="en-GB-MaisieNeural-Female", + value="en-US-AndrewMultilingualNeural-Male", label=lg_conf["sk12"], visible=False, interactive=True, @@ -1622,6 +1665,7 @@ def submit(value): gr.HTML("
") sub_type_options = [ + "disable", "srt", "vtt", "ass", @@ -1631,20 +1675,9 @@ def submit(value): "aud", ] - def get_subs_path(type_subs): - if os.path.exists( - f"sub_ori.{type_subs}" - ) and os.path.exists(f"sub_tra.{type_subs}"): - return ( - f"sub_ori.{type_subs}", - f"sub_tra.{type_subs}", - ) - else: - return None, None - sub_type_output = gr.Dropdown( sub_type_options, - value=sub_type_options[0], + value=sub_type_options[1], label=lg_conf["sub_type"], ) soft_subtitles_to_video_gui = gr.Checkbox( @@ -1825,15 +1858,6 @@ def visible_component_subs(input_bool): interactive=False, ) # gr.Video() - with gr.Row(): - sub_ori_output = gr.File( - label=lg_conf["sub_ori"], - interactive=False, - ) - sub_tra_output = gr.File( - label=lg_conf["sub_tra"], - interactive=False, - ) gr.HTML("
") @@ -1870,48 +1894,8 @@ def visible_component_subs(input_bool): "English (en)", 1, 2, - "en-AU-WilliamNeural-Male", "en-CA-ClaraNeural-Female", - "en-GB-ThomasNeural-Male", - "en-GB-SoniaNeural-Female", - "en-NZ-MitchellNeural-Male", - "en-GB-MaisieNeural-Female", "en-AU-WilliamNeural-Male", - "en-CA-ClaraNeural-Female", - "en-GB-ThomasNeural-Male", - "en-GB-SoniaNeural-Female", - "en-NZ-MitchellNeural-Male", - "en-GB-MaisieNeural-Female", - "", - "Adjusting volumes and mixing audio", - ], - [ - None, - "https://www.youtube.com/watch?v=5ZeHtRKHl7Y", - "", - "", - False, - whisper_model_default, - 4, - com_t_default, - "Japanese (ja)", - "English (en)", - 1, - 1, - "en-CA-ClaraNeural-Female", - "en-AU-WilliamNeural-Male", - "en-GB-ThomasNeural-Male", - "en-GB-SoniaNeural-Female", - "en-NZ-MitchellNeural-Male", - "en-GB-MaisieNeural-Female", - "en-AU-WilliamNeural-Male", - "en-CA-ClaraNeural-Female", - "en-GB-ThomasNeural-Male", - "en-GB-SoniaNeural-Female", - "en-NZ-MitchellNeural-Male", - "en-GB-MaisieNeural-Female", - "", - "Adjusting volumes and mixing audio", ], ], # no update fn=SoniTr.batch_multilingual_media_conversion, @@ -1930,23 +1914,6 @@ def visible_component_subs(input_bool): max_speakers, tts_voice00, tts_voice01, - tts_voice02, - tts_voice03, - tts_voice04, - tts_voice05, - tts_voice06, - tts_voice07, - tts_voice08, - tts_voice09, - tts_voice10, - tts_voice11, - VIDEO_OUTPUT_NAME, - AUDIO_MIX, - audio_accelerate, - acceleration_rate_regulation_gui, - volume_original_mix, - volume_translated_mix, - sub_type_output, ], outputs=[video_output], cache_examples=False, @@ -2024,7 +1991,7 @@ def swap_visibility(data_type): SoniTr.tts_info.tts_list(), ) ), - value="en-GB-ThomasNeural-Male", + value="en-US-EmmaMultilingualNeural-Female", label="TTS", visible=True, interactive=True, @@ -2066,7 +2033,7 @@ def swap_visibility(data_type): ) docs_OUTPUT_NAME = gr.Textbox( label="Final file name", - value="final_sample", + value="", info=lg_conf["out_name_info"], ) docs_chunk_size = gr.Number( @@ -2593,8 +2560,6 @@ def update_tts_list(): ], outputs=video_output, trigger_mode="multiple", - ).then( - get_subs_path, [sub_type_output], [sub_ori_output, sub_tra_output] ).then( play_sound_alert, [play_sound_gui], [sound_alert_notification] ) diff --git a/soni_translate/audio_segments.py b/soni_translate/audio_segments.py index 3a51e52..105c6ba 100644 --- a/soni_translate/audio_segments.py +++ b/soni_translate/audio_segments.py @@ -89,7 +89,7 @@ def create_translated_audio( combined_audio = Mixer() combined_audio.overlay(base_audio) - logger.info( + logger.debug( f"Audio duration: {total_duration // 60} " f"minutes and {int(total_duration % 60)} seconds" ) diff --git a/soni_translate/postprocessor.py b/soni_translate/postprocessor.py index 9753857..a77b5bc 100644 --- a/soni_translate/postprocessor.py +++ b/soni_translate/postprocessor.py @@ -26,7 +26,9 @@ ] DOCS_OUTPUT_TYPE_OPTIONS = [ - "audio", + "audio (wav)", + "audio (mp3)", + "audio (ogg)", "text", ] # Add DOCX and etc. @@ -76,7 +78,7 @@ def get_output_file( soft_subtitles, output_directory="", ): - directory_base = "." # default directory + directory_base = "." 
# default directory if output_directory and os.path.isdir(output_directory): new_file_path = os.path.join(output_directory, new_file_name) @@ -96,6 +98,8 @@ def get_output_file( cm = f'ffmpeg -y -i "{original_file}" -acodec pcm_s16le -ar 44100 -ac 2 "{new_file_path}"' elif new_file_path.endswith(".ogg"): cm = f'ffmpeg -i "{original_file}" -c:a libvorbis "{new_file_path}"' + elif new_file_path.endswith(".mp3") and not original_file.endswith(".mp3"): + cm = f'ffmpeg -y -i "{original_file}" -codec:a libmp3lame -qscale:a 2 "{new_file_path}"' if cm: try: @@ -117,6 +121,7 @@ def media_out( extension="mp4", file_obj="video_dub.mp4", soft_subtitles=False, + subtitle_files="disable", ): if not media_out_name: if os.path.exists(media_file): @@ -128,7 +133,19 @@ def media_out( f_name = f"{sanitize_file_name(media_out_name)}.{extension}" - return get_output_file(file_obj, f_name, soft_subtitles) + if subtitle_files != "disable": + final_media = [get_output_file(file_obj, f_name, soft_subtitles)] + name_tra = f"{sanitize_file_name(media_out_name)}.{subtitle_files}" + name_ori = f"{sanitize_file_name(base_name)}.{subtitle_files}" + tgt_subs = f"sub_tra.{subtitle_files}" + ori_subs = f"sub_ori.{subtitle_files}" + final_subtitles = [ + get_output_file(tgt_subs, name_tra, False), + get_output_file(ori_subs, name_ori, False) + ] + return final_media + final_subtitles + else: + return get_output_file(file_obj, f_name, soft_subtitles) def get_subtitle_speaker(media_file, result, language, extension, base_name): diff --git a/soni_translate/speech_segmentation.py b/soni_translate/speech_segmentation.py index 9ef5f6b..9b0b446 100644 --- a/soni_translate/speech_segmentation.py +++ b/soni_translate/speech_segmentation.py @@ -35,6 +35,8 @@ ] COMPUTE_TYPE_GPU = [ + "default", + "auto", "int8", "int8_float32", "int8_float16", @@ -45,6 +47,8 @@ ] COMPUTE_TYPE_CPU = [ + "default", + "auto", "int8", "int8_float32", "int16", @@ -326,6 +330,26 @@ def align_speech(audio, result): } +def reencode_speakers(result): + + if result["segments"][0]["speaker"] == "SPEAKER_00": + return result + + speaker_mapping = {} + counter = 0 + + logger.debug("Reencode speakers") + + for segment in result["segments"]: + old_speaker = segment["speaker"] + if old_speaker not in speaker_mapping: + speaker_mapping[old_speaker] = f"SPEAKER_{counter:02d}" + counter += 1 + segment["speaker"] = speaker_mapping[old_speaker] + + return result + + def diarize_speech( audio_wav, result, @@ -420,4 +444,4 @@ def diarize_speech( {**item, "speaker": "SPEAKER_00"} for item in result_diarize["segments"] ] - return result_diarize + return reencode_speakers(result_diarize) From ff1a52da7fd310bf05861cbbc29b720385f65eb7 Mon Sep 17 00:00:00 2001 From: Roger Condori <114810545+R3gm@users.noreply.github.com> Date: Thu, 9 May 2024 03:26:22 +0000 Subject: [PATCH 32/36] feat(document translation): output type videobook with illustrations #42 --- .gitignore | 4 +- app_rvc.py | 234 +++++++++++-- requirements_extra.txt | 2 +- soni_translate/postprocessor.py | 10 +- soni_translate/text_multiformat_processor.py | 343 ++++++++++++++++++- soni_translate/text_to_speech.py | 22 +- 6 files changed, 561 insertions(+), 54 deletions(-) diff --git a/.gitignore b/.gitignore index 69a138c..c2f3033 100644 --- a/.gitignore +++ b/.gitignore @@ -185,6 +185,7 @@ task_subtitle.* *.mov *.ogv *.wmv +test.py list.txt text_preprocessor.txt text_translation.txt @@ -209,4 +210,5 @@ OPENVOICE_MODELS/ PIPER_MODELS/ WHISPER_MODELS/ whisper_api_audio_parts/ -uroman/ \ No newline at end of file 
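The reencode_speakers() helper added above exists because the diarization step can hand back non-contiguous labels (for example the first detected voice tagged SPEAKER_03), while the rest of the pipeline derives the TTS slot from int(speaker[-2:]). Remapping labels to SPEAKER_00 ... SPEAKER_11 in order of first appearance keeps that indexing valid for up to 12 speakers. A self-contained sketch of the same remapping (the sample segments are invented for illustration):

def reencode_speakers_demo(segments):
    # Map arbitrary diarization labels to SPEAKER_00, SPEAKER_01, ...
    # in order of first appearance, mirroring reencode_speakers().
    mapping, counter = {}, 0
    for seg in segments:
        label = seg["speaker"]
        if label not in mapping:
            mapping[label] = f"SPEAKER_{counter:02d}"
            counter += 1
        seg["speaker"] = mapping[label]
    return segments

segs = [{"speaker": "SPEAKER_03"}, {"speaker": "SPEAKER_07"}, {"speaker": "SPEAKER_03"}]
print(reencode_speakers_demo(segs))
# labels become SPEAKER_00, SPEAKER_01, SPEAKER_00
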
+uroman/ +pdf_images/ \ No newline at end of file diff --git a/app_rvc.py b/app_rvc.py index 5d2c7b2..658f64c 100644 --- a/app_rvc.py +++ b/app_rvc.py @@ -70,6 +70,7 @@ diarization_models, ) from soni_translate.text_multiformat_processor import ( + BORDER_COLORS, srt_file_to_segments, document_preprocessor, determine_chunk_size, @@ -78,6 +79,12 @@ process_subtitles, linguistic_level_segments, break_aling_segments, + doc_to_txtximg_pages, + page_data_to_segments, + update_page_data, + fix_timestamps_docs, + create_video_from_images, + merge_video_and_audio, ) from soni_translate.languages_gui import language_data, news import copy @@ -382,18 +389,18 @@ def multilingual_media_conversion( target_language="English (en)", min_speakers=1, max_speakers=1, - tts_voice00="en-AU-WilliamNeural-Male", - tts_voice01="en-CA-ClaraNeural-Female", - tts_voice02="en-GB-ThomasNeural-Male", - tts_voice03="en-GB-SoniaNeural-Female", - tts_voice04="en-NZ-MitchellNeural-Male", - tts_voice05="en-GB-MaisieNeural-Female", - tts_voice06="en-AU-WilliamNeural-Male", - tts_voice07="en-CA-ClaraNeural-Female", - tts_voice08="en-GB-ThomasNeural-Male", - tts_voice09="en-GB-SoniaNeural-Female", - tts_voice10="en-NZ-MitchellNeural-Male", - tts_voice11="en-GB-MaisieNeural-Female", + tts_voice00="en-US-EmmaMultilingualNeural-Female", + tts_voice01="en-US-AndrewMultilingualNeural-Male", + tts_voice02="en-US-AvaMultilingualNeural-Female", + tts_voice03="en-US-BrianMultilingualNeural-Male", + tts_voice04="de-DE-SeraphinaMultilingualNeural-Female", + tts_voice05="de-DE-FlorianMultilingualNeural-Male", + tts_voice06="fr-FR-VivienneMultilingualNeural-Female", + tts_voice07="fr-FR-RemyMultilingualNeural-Male", + tts_voice08="en-US-EmmaMultilingualNeural-Female", + tts_voice09="en-US-AndrewMultilingualNeural-Male", + tts_voice10="en-US-EmmaMultilingualNeural-Female", + tts_voice11="en-US-AndrewMultilingualNeural-Male", video_output_name="", mix_method_audio="Adjusting volumes and mixing audio", max_accelerate_audio=2.1, @@ -1145,6 +1152,109 @@ def multilingual_media_conversion( return output + def hook_beta_processor( + self, + document, + tgt_lang, + translate_process, + ori_lang, + tts, + name_final_file, + custom_voices, + custom_voices_workers, + output_type, + chunk_size, + width, + height, + start_page, + end_page, + bcolor, + is_gui, + progress + ): + prog_disp("Processing pages...", 0.10, is_gui, progress=progress) + doc_data = doc_to_txtximg_pages(document, width, height, start_page, end_page, bcolor) + result_diarize = page_data_to_segments(doc_data, 1700) + + prog_disp("Translating...", 0.20, is_gui, progress=progress) + result_diarize["segments"] = translate_text( + result_diarize["segments"], + tgt_lang, + translate_process, + chunk_size=0, + source=ori_lang, + ) + chunk_size = ( + chunk_size if chunk_size else determine_chunk_size(tts) + ) + doc_data = update_page_data(result_diarize, doc_data) + + prog_disp("Text to speech...", 0.30, is_gui, progress=progress) + result_diarize = page_data_to_segments(doc_data, chunk_size) + valid_speakers = audio_segmentation_to_voice( + result_diarize, + tgt_lang, + is_gui, + tts, + ) + + # fix format and set folder output + audio_files, speakers_list = accelerate_segments( + result_diarize, + 1.0, + valid_speakers, + ) + + # custom voice + if custom_voices: + prog_disp( + "Applying customized voices...", + 0.60, + is_gui, + progress=progress, + ) + self.vci( + audio_files, + speakers_list, + overwrite=True, + parallel_workers=custom_voices_workers, + ) + self.vci.unload_models() + + # 
Update time segments and not concat + result_diarize = fix_timestamps_docs(result_diarize, audio_files) + final_wav_file = "audio_book.wav" + remove_files(final_wav_file) + + prog_disp("Creating audio file...", 0.70, is_gui, progress=progress) + create_translated_audio( + result_diarize, audio_files, final_wav_file, False + ) + + prog_disp("Creating video file...", 0.80, is_gui, progress=progress) + video_doc = create_video_from_images( + document, + width, + height, + doc_data, + result_diarize + ) + + # Merge video and audio + prog_disp("Merging...", 0.90, is_gui, progress=progress) + vid_out = merge_video_and_audio(video_doc, final_wav_file) + + # End + output = media_out( + document, + tgt_lang, + name_final_file, + "mkv" if "mkv" in output_type else "mp4", + file_obj=vid_out, + ) + logger.info(f"Done: {output}") + return output + def multilingual_docs_conversion( self, string_text="", # string @@ -1152,13 +1262,18 @@ def multilingual_docs_conversion( directory_input="", # doc path origin_language="English (en)", target_language="English (en)", - tts_voice00="en-AU-WilliamNeural-Male", + tts_voice00="en-US-EmmaMultilingualNeural-Female", name_final_file="", translate_process="google_translator", output_type="audio", chunk_size=None, custom_voices=False, custom_voices_workers=1, + start_page=1, + end_page=99999, + width=1280, + height=720, + bcolor="dynamic", is_gui=False, progress=gr.Progress(), ): @@ -1190,16 +1305,42 @@ def multilingual_docs_conversion( if not document: raise Exception("No data found") + if "videobook" in output_type: + if not document.lower().endswith(".pdf"): + raise ValueError( + "Videobooks are only compatible with PDF files." + ) + + return self.hook_beta_processor( + document, + TRANSLATE_AUDIO_TO, + translate_process, + SOURCE_LANGUAGE, + tts_voice00, + name_final_file, + custom_voices, + custom_voices_workers, + output_type, + chunk_size, + width, + height, + start_page, + end_page, + bcolor, + is_gui, + progress + ) + # audio_wav = "audio.wav" final_wav_file = "audio_book.wav" prog_disp("Processing text...", 0.15, is_gui, progress=progress) result_file_path, result_text = document_preprocessor( - document, is_string + document, is_string, start_page, end_page ) if ( - output_type == "text" + output_type == "book (txt)" and translate_process == "disable_translation" ): return result_file_path @@ -1226,8 +1367,14 @@ def multilingual_docs_conversion( txt_file_path, result_text = segments_to_plain_text(result_diarize) - if output_type == "text": - return txt_file_path + if output_type == "book (txt)": + return media_out( + result_file_path if is_string else document, + TRANSLATE_AUDIO_TO, + name_final_file, + "txt", + file_obj=txt_file_path, + ) # (TTS limits) plain text to result_diarize chunk_size = ( @@ -1242,17 +1389,6 @@ def multilingual_docs_conversion( TRANSLATE_AUDIO_TO, is_gui, tts_voice00, - "", - "", - "", - "", - "", - "", - "", - "", - "", - "", - "", ) # fix format and set folder output @@ -2028,7 +2164,7 @@ def swap_visibility(data_type): docs_output_type = gr.Dropdown( DOCS_OUTPUT_TYPE_OPTIONS, - value=DOCS_OUTPUT_TYPE_OPTIONS[0], + value=DOCS_OUTPUT_TYPE_OPTIONS[2], label="Output type", ) docs_OUTPUT_NAME = gr.Textbox( @@ -2043,6 +2179,41 @@ def swap_visibility(data_type): interactive=True, info=lg_conf["chunk_size_info"], ) + gr.HTML("
") + start_page_gui = gr.Number( + step=1, + value=1, + minimum=1, + maximum=99999, + label="Start page", + ) + end_page_gui = gr.Number( + step=1, + value=99999, + minimum=1, + maximum=99999, + label="End page", + ) + gr.HTML("
Videobook") + videobook_width_gui = gr.Number( + step=1, + value=1280, + minimum=100, + maximum=4096, + label="Width", + ) + videobook_height_gui = gr.Number( + step=1, + value=720, + minimum=100, + maximum=4096, + label="Height", + ) + videobook_bcolor_gui = gr.Dropdown( + BORDER_COLORS, + value=BORDER_COLORS[0], + label="Border color", + ) docs_dummy_check = gr.Checkbox( True, visible=False ) @@ -2580,6 +2751,11 @@ def update_tts_list(): docs_chunk_size, enable_custom_voice, workers_custom_voice, + start_page_gui, + end_page_gui, + videobook_width_gui, + videobook_height_gui, + videobook_bcolor_gui, docs_dummy_check, ], outputs=docs_output, diff --git a/requirements_extra.txt b/requirements_extra.txt index 5cb027f..058fb3f 100644 --- a/requirements_extra.txt +++ b/requirements_extra.txt @@ -15,5 +15,5 @@ git+https://github.com/R3gm/openvoice_package.git@lite openai==1.14.3 tiktoken==0.6.0 # Documents -PyPDF2 +pypdf==4.2.0 python-docx \ No newline at end of file diff --git a/soni_translate/postprocessor.py b/soni_translate/postprocessor.py index a77b5bc..61a54eb 100644 --- a/soni_translate/postprocessor.py +++ b/soni_translate/postprocessor.py @@ -26,10 +26,12 @@ ] DOCS_OUTPUT_TYPE_OPTIONS = [ - "audio (wav)", - "audio (mp3)", - "audio (ogg)", - "text", + "videobook (mp4)", + "videobook (mkv)", + "audiobook (wav)", + "audiobook (mp3)", + "audiobook (ogg)", + "book (txt)", ] # Add DOCX and etc. diff --git a/soni_translate/text_multiformat_processor.py b/soni_translate/text_multiformat_processor.py index a795c00..226169c 100644 --- a/soni_translate/text_multiformat_processor.py +++ b/soni_translate/text_multiformat_processor.py @@ -1,11 +1,14 @@ from .logging_setup import logger from whisperx.utils import get_writer -from .utils import remove_files, run_command +from .utils import remove_files, run_command, remove_directory_contents +from typing import List import srt import re import os import copy import string +import soundfile as sf +from PIL import Image, ImageOps punctuation_list = list( string.punctuation + "¡¿«»„”“”‚‘’「」『』《》()【】〈〉〔〕〖〗〘〙〚〛⸤⸥⸨⸩" @@ -89,18 +92,62 @@ def srt_file_to_segments(file_path, speaker=False): # documents -def pdf_to_txt(pdf_file): - import PyPDF2 +def dehyphenate(lines: List[str], line_no: int) -> List[str]: + next_line = lines[line_no + 1] + word_suffix = next_line.split(" ")[0] + + lines[line_no] = lines[line_no][:-1] + word_suffix + lines[line_no + 1] = lines[line_no + 1][len(word_suffix):] + return lines + + +def remove_hyphens(text: str) -> str: + """ + + This fails for: + * Natural dashes: well-known, self-replication, use-cases, non-semantic, + Post-processing, Window-wise, viewpoint-dependent + * Trailing math operands: 2 - 4 + * Names: Lopez-Ferreras, VGG-19, CIFAR-100 + """ + lines = [line.rstrip() for line in text.split("\n")] + + # Find dashes + line_numbers = [] + for line_no, line in enumerate(lines[:-1]): + if line.endswith("-"): + line_numbers.append(line_no) + + # Replace + for line_no in line_numbers: + lines = dehyphenate(lines, line_no) + + return "\n".join(lines) + + +def pdf_to_txt(pdf_file, start_page, end_page): + from pypdf import PdfReader with open(pdf_file, "rb") as file: - reader = PyPDF2.PdfReader(file) + reader = PdfReader(file) + logger.debug(f"Total pages: {reader.get_num_pages()}") text = "" - for page in reader.pages: - text += page.extract_text() + + start_page_idx = max((start_page-1), 0) + end_page_inx = min((end_page), (reader.get_num_pages())) + document_pages = reader.pages[start_page_idx:end_page_inx] + logger.info( + 
f"Selected pages from {start_page_idx} to {end_page_inx}: " + f"{len(document_pages)}" + ) + + for page in document_pages: + text += remove_hyphens(page.extract_text()) return text def docx_to_txt(docx_file): + # https://github.com/AlJohri/docx2pdf update from docx import Document doc = Document(docx_file) @@ -122,14 +169,14 @@ def replace_multiple_elements(text, replacements): return replaced_text -def document_preprocessor(file_path, is_string): +def document_preprocessor(file_path, is_string, start_page, end_page): if not is_string: file_ext = os.path.splitext(file_path)[1].lower() if is_string: text = file_path elif file_ext == ".pdf": - text = pdf_to_txt(file_path) + text = pdf_to_txt(file_path, start_page, end_page) elif file_ext == ".docx": text = docx_to_txt(file_path) elif file_ext == ".txt": @@ -236,6 +283,286 @@ def segments_to_plain_text(result_diarize): return txt_file_path, complete_text +# doc to video + +COLORS = { + "black": (0, 0, 0), + "white": (255, 255, 255), + "red": (255, 0, 0), + "green": (0, 255, 0), + "blue": (0, 0, 255), + "yellow": (255, 255, 0), + "light_gray": (200, 200, 200), + "light_blue": (173, 216, 230), + "light_green": (144, 238, 144), + "light_yellow": (255, 255, 224), + "light_pink": (255, 182, 193), + "lavender": (230, 230, 250), + "peach": (255, 218, 185), + "light_cyan": (224, 255, 255), + "light_salmon": (255, 160, 122), + "light_green_yellow": (173, 255, 47), +} + +BORDER_COLORS = ["dynamic"] + list(COLORS.keys()) + + +def calculate_average_color(img): + # Resize the image to a small size for faster processing + img_small = img.resize((50, 50)) + # Calculate the average color + average_color = img_small.convert("RGB").resize((1, 1)).getpixel((0, 0)) + return average_color + + +def add_border_to_image(image_path, target_width, target_height, border_color=None): + + img = Image.open(image_path) + + # Calculate the width and height for the new image with borders + original_width, original_height = img.size + original_aspect_ratio = original_width / original_height + target_aspect_ratio = target_width / target_height + + # Resize the image to fit the target resolution while retaining aspect ratio + if original_aspect_ratio > target_aspect_ratio: + # Image is wider, calculate new height + new_height = int(target_width / original_aspect_ratio) + resized_img = img.resize((target_width, new_height)) + else: + # Image is taller, calculate new width + new_width = int(target_height * original_aspect_ratio) + resized_img = img.resize((new_width, target_height)) + + # Calculate padding for borders + padding = (0, 0, 0, 0) + if resized_img.size[0] != target_width or resized_img.size[1] != target_height: + if original_aspect_ratio > target_aspect_ratio: + # Add borders vertically + padding = (0, (target_height - resized_img.size[1]) // 2, 0, (target_height - resized_img.size[1]) // 2) + else: + # Add borders horizontally + padding = ((target_width - resized_img.size[0]) // 2, 0, (target_width - resized_img.size[0]) // 2, 0) + + # Add borders with specified color + if not border_color or border_color == "dynamic": + border_color = calculate_average_color(resized_img) + else: + border_color = COLORS.get(border_color, (0, 0, 0)) + + bordered_img = ImageOps.expand(resized_img, padding, fill=border_color) + + bordered_img.save(image_path) + + return image_path + + +def doc_to_txtximg_pages(document, width, height, start_page, end_page, bcolor): + from pypdf import PdfReader + + reader = PdfReader(document) + logger.debug(f"Total pages: {reader.get_num_pages()}") + 
images_folder = "pdf_images/" + os.makedirs(images_folder, exist_ok=True) + remove_directory_contents(images_folder) + + start_page_idx = max((start_page-1), 0) + end_page_inx = min((end_page), (reader.get_num_pages())) + document_pages = reader.pages[start_page_idx:end_page_inx] + + logger.info( + f"Selected pages from {start_page_idx} to {end_page_inx}: " + f"{len(document_pages)}" + ) + + data_doc = {} + for i, page in enumerate(document_pages): + + count = 0 + images = [] + for image_file_object in page.images: + img_name = f"{images_folder}{i:04d}_{count:02d}_{image_file_object.name}" + images.append(img_name) + with open(img_name, "wb") as fp: + fp.write(image_file_object.data) + count += 1 + img_name = add_border_to_image(img_name, width, height, bcolor) + + data_doc[i] = { + "text": remove_hyphens(page.extract_text()), + "images": images + } + + return data_doc + + +def page_data_to_segments(result_text=None, chunk_size=None): + + if not chunk_size: + chunk_size = 100 + + segments_chunks = [] + time_global = 0 + for page, result_data in result_text.items(): + # result_image = result_data["images"] + result_text = result_data["text"] + text_chunks = split_text_into_chunks(result_text, chunk_size) + if not text_chunks: + text_chunks = [" "] + + for chunk in text_chunks: + chunk_dict = { + "text": chunk, + "start": (1.0 + time_global), + "end": (2.0 + time_global), + "speaker": "SPEAKER_00", + "page": page, + } + segments_chunks.append(chunk_dict) + time_global += 1 + + result_diarize = {"segments": segments_chunks} + + return result_diarize + + +def update_page_data(result_diarize, doc_data): + complete_text = "" + current_page = result_diarize["segments"][0]["page"] + text_page = "" + + for seg in result_diarize["segments"]: + text = seg["text"] + " " # issue + complete_text += text + + page = seg["page"] + + if page == current_page: + text_page += text + else: + doc_data[current_page]["text"] = text_page + + # Next + text_page = text + current_page = page + + if doc_data[current_page]["text"] != text_page: + doc_data[current_page]["text"] = text_page + + return doc_data + + +def fix_timestamps_docs(result_diarize, audio_files): + current_start = 0.0 + + for seg, audio in zip(result_diarize["segments"], audio_files): + duration = round(sf.info(audio).duration, 2) + + seg["start"] = current_start + current_start += duration + seg["end"] = current_start + + return result_diarize + + +def create_video_from_images( + document, + width, + height, + doc_data, + result_diarize +): + + # First image + text = os.path.basename(document)[:-4] + first_image = "pdf_images/0000_00_aaa.png" + cm = f"ffmpeg -f lavfi -i color=c=black:s={width}x{height} -vf \"drawtext=text='{text}':x=(w-text_w)/2:y=(h-text_h)/2:fontsize=24:fontcolor=white\" -frames:v 1 {first_image}" + run_command(cm) + + # Time segments and images + max_pages_idx = len(doc_data) - 1 + current_page = result_diarize["segments"][0]["page"] + duration_page = 0.0 + last_image = None + + for seg in result_diarize["segments"]: + start = seg["start"] + end = seg["end"] + duration_seg = end - start + + page = seg["page"] + + if page == current_page: + duration_page += duration_seg + else: + + images = doc_data[current_page]["images"] + + if first_image: + images = [first_image] + images + first_image = None + if not doc_data[min(max_pages_idx, (current_page+1))]["text"].strip(): + images = images + doc_data[min(max_pages_idx, (current_page+1))]["images"] + if not images and last_image: + images = [last_image] + + # Calculate images duration 
+ time_duration_per_image = round((duration_page / len(images)), 2) + doc_data[current_page]["time_per_image"] = time_duration_per_image + + # Next values + doc_data[current_page]["images"] = images + last_image = images[-1] + duration_page = duration_seg + current_page = page + + if "time_per_image" not in doc_data[current_page].keys(): + images = doc_data[current_page]["images"] + if first_image: + images = [first_image] + images + if not images: + images = [last_image] + time_duration_per_image = round((duration_page / len(images)), 2) + doc_data[current_page]["time_per_image"] = time_duration_per_image + + # Timestamped image video. + with open("list.txt", "w") as file: + + for i, page in enumerate(doc_data.values()): + + duration = page["time_per_image"] + for img in page["images"]: + if i == len(doc_data) - 1 and img == page["images"][-1]: # Check if it's the last item + file.write(f"file {img}\n") + file.write(f"outpoint {duration}") + else: + file.write(f"file {img}\n") + file.write(f"outpoint {duration}\n") + + out_video = "video_from_images.mp4" + remove_files(out_video) + + cm = f"ffmpeg -y -f concat -i list.txt -c:v libx264 -preset veryfast -crf 18 -pix_fmt yuv420p {out_video}" + run_command(cm) + + return out_video + + +def merge_video_and_audio(video_doc, final_wav_file): + + fixed_audio = "fixed_audio.mp3" + remove_files(fixed_audio) + cm = f"ffmpeg -i {final_wav_file} -c:a libmp3lame {fixed_audio}" + run_command(cm) + + vid_out = "video_book.mp4" + remove_files(vid_out) + cm = f"ffmpeg -i {video_doc} -i {fixed_audio} -c:v copy -c:a copy -map 0:v -map 1:a -shortest {vid_out}" + run_command(cm) + + return vid_out + + # subtitles diff --git a/soni_translate/text_to_speech.py b/soni_translate/text_to_speech.py index 4275f1b..07afd35 100644 --- a/soni_translate/text_to_speech.py +++ b/soni_translate/text_to_speech.py @@ -968,17 +968,17 @@ def audio_segmentation_to_voice( TRANSLATE_AUDIO_TO, is_gui, tts_voice00, - tts_voice01, - tts_voice02, - tts_voice03, - tts_voice04, - tts_voice05, - tts_voice06, - tts_voice07, - tts_voice08, - tts_voice09, - tts_voice10, - tts_voice11, + tts_voice01="", + tts_voice02="", + tts_voice03="", + tts_voice04="", + tts_voice05="", + tts_voice06="", + tts_voice07="", + tts_voice08="", + tts_voice09="", + tts_voice10="", + tts_voice11="", dereverb_automatic=True, model_id_bark="suno/bark-small", model_id_coqui="tts_models/multilingual/multi-dataset/xtts_v2", From 1888ce79bbd4b58e0abf8973d7b5e56e76ee2a63 Mon Sep 17 00:00:00 2001 From: Roger Condori <114810545+R3gm@users.noreply.github.com> Date: Sat, 11 May 2024 22:47:10 +0000 Subject: [PATCH 33/36] fix(videobook): First frame with IPython, not ffmpeg #42 --- app_rvc.py | 3 - assets/logo.jpeg | Bin 0 -> 73059 bytes soni_translate/text_multiformat_processor.py | 148 +++++++++++++++++-- 3 files changed, 133 insertions(+), 18 deletions(-) create mode 100644 assets/logo.jpeg diff --git a/app_rvc.py b/app_rvc.py index 658f64c..bc9b5fc 100644 --- a/app_rvc.py +++ b/app_rvc.py @@ -1233,9 +1233,6 @@ def hook_beta_processor( prog_disp("Creating video file...", 0.80, is_gui, progress=progress) video_doc = create_video_from_images( - document, - width, - height, doc_data, result_diarize ) diff --git a/assets/logo.jpeg b/assets/logo.jpeg new file mode 100644 index 0000000000000000000000000000000000000000..7c4e0375add91a53f6e509833b19e5ce3c322fd9 GIT binary patch literal 73059 zcmeFZ2Ut_xwk{kxB1Jk<5|u6jN|7QYq6P#6q)S&3Y0`U*fEtJt0Rg2-GxSK8E*+$I z=@5EPr~yLw>I>j=9DhbI$jDN6vqp&jBu}DXS_2 
zNJszx65$YM^|pNp0BC3ct^)u7YT`Kt04Z^gmRLB61)%!E%E{XGnZp$&RlxZq z;68wYjGUaDjDq-!f`WpQik6y+xG`M3c!8FYfr*Kcfsv7!jhmgBg_D($k>e@{Cl3z@ z1Y%~t#&-?K#|;Dlf8T_J_)#iKDtc;adLRoU3-IsW&VK+HsmTH405Xzm08&O0GDec~ z766E77X`^5+wZFX-bhHv$ceS2zCd%4ctQCk04WI>87Z;;L<5Lt1BmAVr#%(zj(~<>c=_fGVpz zR8`Z}(>E}DVq|Rj!s?~9jjf%VyN9Qjw~uego44=ap<#%a*pG4X37$l;IlOce=C23-`qx8Y`OI%OU;%v8h5}gr$BX*i3s6zw!*vFTJ7T}X5;69ZXT--4M}mz1*+O&# zVjWl>ll%`C(g6g8=pRi_8B^@9fo`D3Io_P;&3CeEv9c6&kJ#7l#OH@J;5Oz%O$$?# z^2~zpCTurhJDF*NB`J2!gO4`SgIdN6k(6U|zICjN6&5_TayNn;%Tjd4wXlFlszo=m z*|&JB`39zNTh1&=_3%$ziyvV>w50CU$<;t3E6A9Y%(FO^&semLlb<2(t3Ui}MED&c zO|=mK;&?=SCB$g_=fxPHMhr>BlfQR}S7sA~)*n0n?s5{FtpDVVYOAUV{g;N$zbgu5 zc|OqT{h22p<9z+1$thRWi+7`kGd{e4>h>$otJad;zZk0DYh?|Q3CD}jt(F9M_H9vW ziF7f)}Zrj$6yeHnVo-U@f_X1R^FX=hX2 z-Zv{(6hJK1vwI^pS_mu}YPbht4+f!EBAjc1vL}pY8jmv2Q$~f?qw^_bH(Rs64h|<$ z+IIv>_-iLTOw=H2c12CX*=770NeJtQqU(MEzMkiROSQJyI7!WS=re&-d-ynAt?Uf$ zv^^C7tVh=pCK%q-%kN1GKQj|pXPB9489e(`awHm34*$Mfwb5DLN%hjV%7eN!?_jO! zey4>6gtuwiQJ}ge2EFW+ioRd1$`{bvJ{;R4$-2E_gi|eil*$_Q2*LI8Y>NH;t7{r( zl;dCwINV#B0WZ=3z=;~416qAcXWv;KdJ7)A-eC}xbK4o3sNKn{%!{$FI=Cnst~fsQ z(Q3fNaMe3Nl{V(SggQ$EcaCdhxcosHBMYFRF@tFL!1~>%P{$@K&F@NT*5ICGucD**l?bUmj`68Z77w46jH*j3zYwv~elFm-n|lE=7ofa5j~B}IEavR_v+vNYs4x{OK#-x&`` zLeuWPZJM5J7axADcMe#U-b|wv2TxQL5~$v(?4kY2Pc_IShtY7h|?dLYCry60jM4mor;`Cu_`t0ammZ z6|ivStJPzSF#w+zl>FKkT?!WPu56k>g%p(Wu-SJmmc&T1#5CzS=wEl$lxy5C98O-M zj2#M~zy5JvEybov)N{n$OJ01uX~*+(!4tM~fT8@O%<%-ObAS@c2t{$0f>J&da%MJQ=Pof|NEX_is6o@*VIb){qla11grv0_+-5$5Zqc?;r-e4bh6vMQgSGCm+*b2cIzj~h}20nF(dS~Lh zpyy<=6kaR0>F4!ZksxMU6Kl-zetB;^?9I%jTRhk30-qH+OkL~^Z{MJ92cha-{_+P7 zFW34tFyr2_mWMv=dep3>S;l7;*4me+bq+EVxe^jeiiyeHt;6q`mQIH9XHQ9iidjIA#u4&%xg|e6tRr z$3uhlTCWF`UTt^g*Hq~b!ReMoz3~O6J%zXW&Y9Fi#Y-bR!WipF98J#w$p#MXukl`( zGu~1BoxsXL*48Pu{@5VG{5jxB{T!*H2-{>M|p^CMBO6%ssU$7^IIy~fLI56zAq#(4|Sk6ocB9g1&g(9rN*)3j;4-cm!odD&DIIc zH%D@~^XA~~H=NjUmQ9B8X3Ob$tDNpL4;VxqKBGAYT#r3^$6$ASm41$kqfd7K5XD(X zrJ6N<0yO)q!gE=+HJ^t*VnK_|mJ13G6<>g^MhSjuh<#ybo9XY2g%*trzbZR}Qt7a? 
zoC7{}oQb4jj+!l4yhEBlY$QH1C{>b_FL1iu3DynYy*2DRCRen^84u3|r{o)#7Y+(p za@|w_!mmWDsufE6)sU~wx#&9EiKz#k1D5?p-(#Dby=2Wan@+VXM}os12a5Eqnbp5v z5Wi@D1!iqup;JX5yP|Rqs5hakzsG{4iU**}BX4KS>IPTR&m-}-7jUSJ<$w}u-Ybj1 zeM4akVo)9O?&O9Wn}lLRq8E2say;4Ywd(4FYsVR%vE{jW9{l7xFZ;Ejtjf>+%`}sh zb*ZO3rDke+3hG2dTNd+OAEaZ$Y3TW&tbBQf7uPi}db7-c&LzLQVQ>3;6lAA@1TSey zS&nUP8RJh-|MqR@!~PB0zT;$u!7AkYY#=iCK7X6R$i1#UEu7#0OZl#5HkPWbItXHe z<=@REa6EYc!S;*Lujq}4smjP_9+^qDFO9wBB`BaGIOc-n+e`mxstF&ldUj?9D}&VF63+p7=YahqVss7;D24jF zSfOD|MvRAAm+U?)d|}sASyW{mShHaBcYAu*!Qjp?sM&+G0>;`{$Qu8(Kg{GQB{Wm_$i4?2`0pUh&cUQ_~rM~`oCw%^{)Yz zFCR$wUQ_A#8GiC!KyRmgRp*a_&ub_zT`)+Ayn25t(gL$EQV4o3>4i&TBU6K>58B9CPAnLl& zmdF~F<$9jl@kTR@4X=Te7yE*>8=Y?Xa^usX2dJC#(23H!Mj30Co}l@uV3jJxL6Ax1 zfT#UC`ddCcMbW(7ll@kDnCJV_&7KpXjBj^>5yaJ?H)`wZD`jM+`PQ-dD@LAW#a9B@yzXx%%}E-`#$)B8zsMJ<*oU(K_jurEyvuZ<4SKxh=L zdtD}we}0WmV2!w5BFURCLiWypLT_YJQ-nz$27Fz0XE(;Dz&|j;$317Xh_bWuM(G1Zx>GdINY##TJ(d$61k7Q)?}`PJbRI53l4v1rp^1Wo%Rm7 zQuklUu+uk=_))gcwihX(zTlOzp4e9_w#DV%6Da>~RwU23seV}=v$cn?`&;L{{x&sfTYO^Nx(C8B zzaTs*A`QBiFIDhKoeci*c#8FmLBk>fC}?(RJRrUkRk6k1!o4uowN^8WEO&1xZA;3T zonAK=A1Nwwe35h;(+j<%G%*XGXz8t=b~U;FyuwKJ(0 zrPfG;8s?0qV?A-?T^h%fW0B?iDCMUNYl*uqQHh1oM@AIV>J+^v6@)Fw{SAh9i7tb9 zvWzF+nzJ(`&jGc~E@vA4V_3__gdtp1;S}Z1?Dh|74@h92qz^p0qo1tpb~_iFG?}cj zzz*`!ok+i$&qY~sK8v;{>n&*;#tVwSl!<&G(xA?G=cJ}#)mQKbgl?^#56hO3*!t1S zGyc%$*Kyn7Q<8=BAJS{`4UOW|)9-HUtaJJ(W+wR(w9;0LW=q-97I9vB!{a_x)r`XD z04sL|G z(C^49_g)#Q&qFR5;}X;egC&*K??{*Q`Z&yR+qKD2z8I&sGqee3y^SPC-h=gEt){G*)QaSUkH?iWCkaKMuy?#JIRlz^uL+Y zvfoh8IS0Jd%*1w}yfNZY3J~rz(eO>?`W*n@xK_j(P@)TZqs=`|Zg^G;4Eb_qP8MS`%kk+hWa~_6Pi7dl{drE#y zS?whaR!)KNUt{0K@-?&UL|Smk=JZ8ONPg%!fU2Je&h#%4hvNHpDfmVC|8uOd>|Dj)Vfi|x8HDj9NM93G8WWNr zWh9O1W7HUV?3@c~@1Md5hKbhd&aZmN0#pN+T8xM*+$-zgl=kAn_2bh+*l1!`>sEH@ zIUsZW<_Ajey-QYFOWzIqr8`5jHDM`PU0d`v#t2!&I$|-2E!F}a;%>1=d);kAF`bWDx%YZ z4s|>BaBe#G*L+T;J0z6B5l8TaM6@G;!F%}vb>z>VODf8UO}BGE0EvbAfaze9ZbJ0N zK;JF*xFnqeuw60k&VnDD)ZShMx;;1*64m${^R2&N>lt%{aHSyt+75njx1D_fud#F{i-nuu;o*H`(*D$Qc3x>zw;Lj>W2eq&H{ zi9%oRa}Q*N`sRx%;YC=y?>Aq}Ys^zgf70qlwi9WCrv5ooYp}$UtUJw4QwndI5b{Pe zX&ByZ7`pcTRC0Q<_L^Bug(FArow1m+gRlE+DR~dh)-ApZ}iF7fV+Xd%{^3$ z=hHn-+?_nn((Opgd2_hS{4D9(iPhrsJLR-{IZvH8TcV@K3f=d$jg_z8(l(=4E9W)w*A`9kB4*^ zd`$0?W~SA5vQ*Z;!`o7;l0R)#vwKy-__~6#D`a;C$-E z*d%UXxk)9hnm<`j;_Yl~%Y=(uhRn|3UR#A$j;=$$`MH%&rcokWi%sMR#NML`e zlzik@e-UI40sivGj6^E`M_1<6hQ2k6JLZ+8?_;OVK%zes^q&m83v*g*D2y~jM@2d* zSm(!6%nug+I*XYiMgs?~v&<8EMf(p~8MSg6J=5EP(aPaMO1fVQJ}^UBn5cB9Mze+~ zC==u^Q%QaPN}fq{q{G;nru6|Ba0$C*6)i5H|ofd16w#pmAnuT;~h8LMSp`J1!=zZtCm9YygUQ5S!H{5OGNJXf!vHh{nLU0*@fjYek36PE2& z-j09%&F>rlPfTpfPNgt;O?66p;Y`w4+cOuWo5YdS+#}YFhP~|Im9(#Xz;Kh=lIqvC z;6ahhLO^#cUU}qV?CG0GzP_iIK+A2FFzX(JJ6bH$YcV%T2>{DJuX~!1Yq0AO8%0uk zr%Nhm>IG4IKh9PWk|woGGKhZrl6iQ*lzM}nKR!0q1ZW;*(T*QjlYi6RWI^+2;*n~7 zyViYbUS1cnWpHgwTP+iW1s5lMHK(X1Cm`qc*4C}-TIm)wQLmoY8P3C_q#Sgc2fUqH zV(-&+y9A`kkW+74wowU2{F)0nVA)7}H_PB4Aaf+&5~3)4pId_|c|H zIb$S6HdLLRL-S2R%*0{wkQr{^2AFHYyJC6Bua8z^QDm#Do{~hdIxXAATg1qT#ao9? 
z$4|eK{T}od75`Pa-AO7tvaAo6|IL-6{vECj&~yMAze0W4+fkn2sem0>n$R5F#r z^>8yhLqG;jT;I*Qr`_5-^&RlAoho zB0@K|`uT1u7q?}W1O=IMVJ<#m54@G|*yHDuNhTz>6YOJPQvQsFyCu>m=THHUfQzzg zAK(-4%%54_Fce$ZCK#bY7J(TES7f3E~mTHO<3JdeMY`BopeJGL~0Cp?1+*xG+M z(u5OH&bKH93$)OFzTvJVS@-wk@X$;i>{RM&!$BYr=QVf6H1aG2TK@e5t-{zt{^-73 z(V^9wB0*>IX!xe_J`M;=VPc+j98_7?@YC@9rFLsk$KJpn>=#vlr4KjJjzYTtMR8{H zqQ1S*_nPoWwe8#64wtU#ZfCv&w$6?7e)!UxFM()9e|Ko_NqIhBl%~?N`Jn z^siWeOr*T`Kw9_36urKIvNa_>>iI-hGFaxTm8Q$Mk4;;46NB3xL$Nge>KAidQp@dX z=}0!!BvD@4(EflGh|+Pg;Si7&=QXdzFv0iO_OY3%H-{ITXi)d-bnTT2#sk$y;{*7MCpZPGQg3ZL;dU-K69~UnZ;h>dw^J*Kz7xTT3!`~9|B3cOCd31dbXIvz7Z7I7aH@NQOsAjQ zvhQX$*O-lP)NrDAwf5q&$@?qq1}9DFg5EXw!rTSoD!|8j;up~(N|ub6pu0TQO=(OG zZ}GOa-rEMfuA#73zV+F^T z6}ymKX)OR4=e{$mObA##%sGF$MO@{C$Kz@?5fxs1L-hRrl%f6akr#b6Rf|8|)e_i> zPWPP1js)}pT~^hWQ^{!A@4#UysYwoSN3`7ND#U zNg6L~$OoROiAXjZA10th#T3;aR;!1CLLx77RexJ&C^x{O5bmf8Ha)p^G0|!99J-$c zVB0wa>n}S7i!safaQRH_;X_-$Nt4{~HmEB&FKo!Eg?WM&mc`&uds>z7_8KT&0mJL+ z8#`X7DT}TP{F?G`*f6X}B7on<4J77`QXN)c zUJwXo_fDM9Rt!G}%sj#)XF={!s6s#Z8~pur{5{N6MS^q7ki@IH61SgeJL0ZYi;P1X z#$ZKEWEy%)a5LdWvkRN?#FDbH03pv0alF?8f}dO<9-}|C&|iflNaR zeQ&w6vCfhr2LME#zTf<+gRqbA`+@yR-_xYdMWeTSaUJ& zNtgSRKqaVMc35QmqZOAf>m4(c_!^{pmB>Fx7Gr9gpNwp0)D82JXIj3xO~|s&B|>Za zTZs93N!&WzE95+ezw4~uOKsCqBJ{1S>2__-mKSWFuxF-C-YuFuLOGtbTH2$`9l{=~>6?T0;4*+;dO9J&75m;C4OoPJHIax+z_lc9< zFTeh7G1b2oekI+oy`jZ|xXhzglKKXik)nQ=JE0kN0oEiWZPQGp{=TrEdvA$kH)LC) zsd`9Vbtcc%&FiImuJT8gt0iC+TyCI;cxd&~z^lAz4&Ae15=eT{%Od;87o!K4xnjm* zna_zGG}o}Vb4DYrpg;JnP^-Htg-}*T*6Si+=Q8BP0`(|+@s;7{oL8G> zKtzBN(tN`1cJ%bGsa7|qD?e44DRsaZ^};tYi0C*2D~o zq8KDQuZE|Ug;!Y}r#4GrbX#hZ)yG$##u^#Qu(Rv>j2SYUs-GRd5S#rh%>(sUFZ|jubmQmMpjuRq;IQd{ zvQUxRm^=e~Mj)*zP^fYQegB3UtrsI%l!W9kgG(b2U7w4YX~^=1YpHKtI}!Qx>BZ~7 z7vS(ZrIWF?Vcg5a3kn?q@$7_sM?J#xYMbX~qN48J^v6~`GurPCxq&A>%lD^G^$O~<{0*w(~s@x;K7?{{dZ=cjq=d!8B zjlQ!r?kaqb1OkWDHy%}rU);xN^wirDy`9DI)+TY?@drKMit;*3K)qk99{toq$g^2d zTD|NH(??OUZlsmcX8RlFHKzhwO?F{~I`?Nhd6}KPbI72yxHNPC1@Sgzfu2`0LjHv7 z|L-Q@U*MXxho#c!EbKT9l20NQZ?Pe!wPwKeS@hqzlUC8eBqndRnMn9;Lf9wT*fw69NVqEoY8Iyn0mp3u}=tWiV{a}uR zTWRyWr**PhqcvLW-s1OgLD@u9TJy*d@u`jp z`AIoBfeGdCz`G0kdAI!eFm~Vtip`LLq9jnZSD8UxITR@=mwJ&w(2L@TV5B~0iuK$_@arA{VyxvNC##RLSzdN)*qv(LL*yG zp0)ol3(rhf*3r~=2yVCBg>_CZ>P8|&r)ijo0@&wSd?3O% zRBFWPe4r6_3yS@b#acW!`NYZJBGYKQr?z_*`@OFm0s^TWB3F1VrO#Y#iH_AW6>@_;YeCp;t zy3lgZU)dH>sGF98;@gbgoo$7|6U;AGM=sA;*9?k*Ug_REUT~}daR)H?8(y?9z}p31 zAh2(wdXuXFqAzT&|3Gb62pfJzWaajDoN^~zRxk%1$Y_pjvuJC4h~{D(qhjs$qsqer zanC?u4$3tssobxQBIJw}qu(1FpZmV;<93CmeM2_)k@HuMx4)Ab{M7cy$mw(gMI7=k z24A$GwmCffK^^n+()V+Kv%wFPT?F+#k^6fEp4Oo#V-3};)UZ213asBev_1lK`og(6(xSywHVik5zk$RoZKVcz&>@h|q{KYrb zDOJY}#gvY@k`G2|!KEVI5kW+8_P3L z1JYMGgNmpjcG#RQ4|H20w>MPw@ndMM1t+Qv0@aA6eG1{R|AfV4Oy0w_K0PN(+{-$d z`GhvT572Dy#!)%hCkK~q7OpLm^b-+N*@IRs%pThOGrC3`r>x*WSDfmxJ8Hz@BV*nz zFG7Gc-p1v)TQC5SVLqSXA3XL`t7HTJsVC@P(LXGE0nH@ zMGX*={QEK%++(C;WDJ!r8|Z#e$&o7Gooxn%C8FMBsjGI)t}gC5nk>Jck?@7p46Ru_ z#5tJ{)%U+5vb#ro7!SA{4rZp0KWsqU8 z;e_$_XZwQHF<UPkqAoX z)z&(QsHb>glH;qQmlY|2 zn;$x~;b;+q(^i5=eZ}puK;Ge{H{mQYTzRRgdN1s^-5Cqh0RVn*u#~qNv00A&4YT3m+TyafzE5qK^+NsOY!M6OG4KqV-#%(g4|mI0 zI}#Bq^|~BocRR2*qS13-c2$05qTZgEug5`#&{2Kf5~JG5w-~!_Q)D-iZD44PN?5}D z7+QQ^<5y3ld1+d!MREzHegZLBAQV%DJtgkjt4bEoJuC9uzG{C4Q+#5})cTnspYnc(tRfL>_)&MQt3}4Q4}M?|0slmgTQgMB zu^=Yzi@+{4gMQXk&L4%8%~GyuS-;p)7y=(}zW^w}zb$+rs~j7xa{iWr8c3b-F=1ds zXWB)VYWQrxHge}ImAHFCCBctX6aj@A`V11PSmNNKR-l<)=@6i84jhN9?j>6m~yTMrm@i@BJwnSzIf zE{Mo~TJP@XrCiYQ-|uLCspm3GKt&ku!$$is0UD^#2Acf_=E)SlrWm4_vAh1|Fz@D( z(TYOAC^$1@cJ%vfa4HWrpfYkXQqnD}`NiGgH}7}9Zvo|RDJpHs5fzf-Z9}aW_=lsU zKG;wj(K4qWf8ALMIdYk;tgxT--jvIKD>Q7EU0@m3KHAvkt->!oa7bui_(8|dVraD2 
z_%HJ<;^?O{*iU9_OBB(!a{NvJ3;!c6;&*<;U(~(7bHa#HCN*QB-#HThB=h}?`2Jsj z%0Kk?Lcj5F;uS)_Z&N$F01sk7NWC4A>IfRW@pw~v6b=3gLAmS@Cr62o^z?$0eT?e4 z7!{=T4Y)GVOw@c5aaTSlaa@V~g1BXVjLn>PvvZOO#Y+QDUAT(2*rOn4W#<5=VHo=u zD)`dR?(l+wtb7sLODd05EX_rYb+Sby_g~S=r%I$LfbG9c2-~#2@SIX3Qb`D5TMAK9 z;=Jy+9e>pL+wUkHoJQ-ae*Hd+17BpLHp`1rAb@N%F_X_S#?P?9pAqLc@#K9=HJ+pX zS(3UnXq|*``yBA<#Pejd%vL_8MAikj?k%tBHk+~SVL|QBS$_%Bul`mycN%R{0c!HQ z0)CI*GfNTW_j5+P(bGQe@f|{G4=0TfyQ!9@)pSZIUY zwC26e`PmC5CMa9+?MD#@6R#QYzVCw$3~XUEI1{6@FZI{$OoHtzPJBW0rKQkz^kmeA zX=)~xxqTusaC|`{@Ud@@?<$1D8Z2M@YeJJ~m2Gf}-4SuL<#KR;;qc=ZC zQ%jeBI0E0JZ_DRYcO&A!)#m$?Ga2S4Uk%XNq3~R=?8=(Or)KWyk2a4tTv>bZ>S4S=AjBFlE^(!$Ge#r*;G%Ai;#k(@!WqQBlyC>c@o z`AQov<5{fw8=5p0V*_J)Sa${Br+VH+vvwOm?m_Y4faPG7ps1DahUwmJhv^u_HN8=( z9*c)=Belf}me~6>MD!i$0^EbT*0bT@b5YE5Hi1GzVvCU7mPbq$r}N)tvpwX#P-KJ; z{AJp}PwaE={<*7VXAsko{@Ts5iBuCQ(R)eC%HAAF6H#!s1{)MT&fKUBMGFOmO51d* zc=#JPO*gt6b?jTTAOJ@|n~q)yNsF}lo46V*bE`zvu)l_n0~t`XfMfCj6|##cjAE2- zPr3hC!Y3lfJYnPeR6R^#kl6B{2VXYasl9^*wWosOEYz@$D5l)L1pdZE$pa$^k%?N8 zQ|+0ifeGBKvic_+<&hSbsbATHIU*JL@I>3W_yEcpG}pj^vu`u@n#9ZGPg=BrPtseW zK`Qt6gijE&EG?`nvrP5uw!j+eU92H}9&O_8nzM~;&>kZDlxrPwV!<DcMf9G5>*-UZ!Ho9$m`i}k`LlTG1&0~nhR}d zb>huC=K$VNoDn8tvU!IMg*rA&{BoQT>c(1iKvX@{oq8Uibh0w))Z7;|j0IcKF8VII zD|y>Z@iBTYHM{d{d?zpI7{ z_FXWvXeWIYMzgn$UO6;?56|t&)6YzJCA=8J%WdDn`}B`vPUyepuM<-nZj5Qq_aOr5 zMwgy2Yv0x_{t<~)s7-r$irhoR-&v6Ij5!+RAtH1HJ)sHiKc;Mg|JKgPQ%(f)C zY5w{VBYx`~U}~vIb*h0A_dRacuB{z*T5cG2T&b;1-j4TK6z5Iz{M}WF>|P6KBE-DG zR@9$8yESWapWW-Vx^wfe;FLwXhqrZ@0uh^v!A#>qp45*B^xik^HWfe3AAkWwBoKFr zqY&BeG4XdH`)k$@rA-gZV}SERKH5n5V2$_%{A29V+n-`l=oF=^n(=!LY3xJxE1-8G z5gD(+(U_<$-8`id5G&4c21`Zfg6yz_6}5Ljj`~qn7^k*bI8aEKwW5J7%^iQy;ch_6 z-s(mFLL9RN`SQoP@JrN+5qA*nYg3Lf*E+fKGimdMJ9sxVdjef?D(&VsG0BJ-AeJrt z`RkQVBblba;tIok)JqI(O9N&63f?MD@im)c9$Df>-e88j4wKH9x7<(@>8j<&Zp=T%^;@$2 zTeb5$t%;ZgMLc9fY#3CIx$)Ye|GAc=`ROsGuj9IU_kpkfe{pZ6oJ+UaCHWE)m1d&s)` zL#m9TC@}Gf*~IL4tB<7{-ALI_PPugF)rDck#@vnNs9!pM@k6zSuZDh(7Hhm3c<|(- z@-qQ+=9gYdmoe7;I&HxoSXns1x<&szWc}4Oi=I#C0OLI+v$6In|5=3Zmo08*qVT!l zDe>q$)Z1fvU48gWTr^6IDRKAS08tM$?VO(oj?LYWdD*A>?R)+Ai{IO69IpmwJq4Fn zVNM%4+r!Hlu6HWD7n)>ek8I!zY(DMTIvyJ`r^D1o&nxn~p9xF}_{6_lR!utXh9Q;A z^llF3<_`eHtdn&cPoqZzYKSVBXd=(d+opYX4!F49yymF2n;rFoAn{?|Up$X+laWY3 zb`K=3pSo_IlhC4S&os^(qZZi^_NUy`2_; z5t07>{W$GLU6b`^x}jKC+^c$v6}K)aND$$2+4Ym`rR}c^ICUS@V{szo#RgBencxO$ zMt?OP7ld8EdEq1#i7n(rY%k9YP8W$G5!8>ZQdMX&vC;!HkKs*GB8-tJ3tXu~Ajq?& z%rMa#wLl3+I)y;P4orgDFrN1qPNz*Upa>TN6D|`qgWWH1-&z5z_Qi7AiaN-pQnMR!i$%eL7k^g8gpG zp>N&y{mP->v_7iiG4(m0@|r&S4PwR`b6L&%6wcABuCaDEtr{=?j9e+EB4!Rj+Rg#& z{?H6uzRuu{SNo&y1}&Ksa+ab1i*rFU>6nBz4aZZ9(r=>$?nk9HaZlZtBuX4*sbZ8a zDjz(1XJcawE&RlXZIc(k#!ywb z1)0hGxZF@CFjgz<>NUQ8oA#99itMo}{ysy8o>{ME^1$faw|z@Qx}XD>My31FZ0^#v zy8J_1EMM^QUeg#8Gna5x8Tr{zvV2sL5`Ke$&lwSS`4} z6>@{!P~JJ1(n$&`!e(_{3E%+CK(ORBWq8uIvxl@iLo(PTRix%yPSy}Xuk|!~;za6q z#B^<7srAahD`Lv3m%lCjN)sC$q23d-YNu}KPJQiaLuwI9kR>MPy!KZ(>22^RmaL#= z{_F?>xv&!j&VWU9Js)v;s?w6iBrS3BvEjvFhX2Icx2oODeQQV?VFmeAgbNJ{KbYmg z#XESJSIvs}f4lqU*kE8KwBOe^HETDZY9jhA(MOy9Sj89{0~}Zi?u8cvd}DRTjm&2& zCb>lNOZ=3`RL=olLhiDz7A5u(C(n^PfZH<{v9z!EA>+G3XLk!Z2#k2m4_5vc3oXxb zMqo*gsTiT2yZ*em7=8TOhAgpF;JV+W zdJWrTfVG=VfY?#SGb>AeHCO94Lpv1Ar@QT@FvZsqt(rcar>sF!iSHS=m> z9pUp8xt_vim9!ixDr~{J5ZNzFC< zPr*=s5kI@xR?&}L62!qxl(;UIKh{J%FFr!T>*f4!js_PHInCm@hNYAf=_>p$hco-S zArD0volp&=9a}EzwXhw=&%9Fu#doMLPcOkNE z=JJ)HVl8i7D-H*io>l1Lg&KOI1RI{Rwp*GkfLiJN(3+(y7HnmMPhDS?ExyWvZB-HE zsCW9^fyj`Im=CQzEFWnA#zMJUn>%9KsS|@}D~sBJmM07?x>K5}B19@njy#z9%3SbG zo2*+<(JR?3v?rN|%;6DSttAWZYAQ|7crDND*W{0n4td@PzLs9Yb|+TMo-T8o1KR3? 
zcJxB6JI?`^XJM^8?kA7lA1lXoAl3+?OtZ}X_h4(7s7`~DTDf$)nXALZ1cA-0n&h*s z+UpDkjtFr1yq=~$r&-^Sb4i544epc}1U`H!s8g189Yh<*Y{z;0UUw|sgidHaUc5&7 z$01?;^kDx0br0u)iBiLyLUT4=Dr)JPN(47l5!1a&kUJnZ9Eo|7S=mj`Bjge2&Sw{j z9WA<}b3npzoO{%Dc9jPIR} zV{z>cppquS(QQsqfmg9vv9`WsTIn`i#i6qfJsk(p?bD#m+>_YBnnSRDrJ!d(amX4p z(7q83VZ1w;j-@ZXe4u`;Lg+Cu_G+08_4T0~YMWRO>Q|gpg9vsIr+oYJ2pp;zRF=@< zAojpaQ{#^5>wu;2uc23geL%UBT+|mZtG_D-7KwbX;T6Dc6#KTM=2@6lmaDrHl5Xpf z3erNfd0aC-rofPni#jFYstGsKojnT??@aA6rvu@Pu^ll-v8U0|3vu;&Po3{-|G-lO zn2WSD*f7wZ#S_GtjoVL?Vy@Ky0>*FcR-!4vUwp?LX>7IP4isK3hp(t=iaQL1%~`yi zK!u538Qw0M|2FiMD;mDT9~}T!!lgCUB{YGlF+1-70rD-6 z0+TlgY}J8WlH)7IzggjNIG(2NVypRW%nqET@a&|ngWKf(UI&b12;0i>+2G?8xT7`=H8bPsEq9zl+3FJIlgX9%2RnUtUDl4r$|m~gMhXmXZ}Yv= zqYsy7<-Ibds3wVYw5nQtR{XJ8G|%ynk4Ojb=KNkt{w?K?ue15PzQu$TK}U)J99JI3 zRm6ygYHR+yr)3Fj-YEYBS~E>(1`@Noyw3qcc#rK_)}-cy>eh;|g^McXL(A_2;`6dl zakn475RFQRK&CSIuDGYYBhdCsbAc{FB1wX0^i!F3n^+H%&$buV%wMfm8h-WyH(&g8 zkAq(`!?xTiduYEB?NixmHHj6p0PP=Y5#-B1&)XzAG27n!!k>Bqfb5{h3USD02m{_4 zHD#CR6|v>u`y##}N6UeOv3iCi+iZEE76}7Ub$xDdkAg7H9 ztR)#9b$nBmT+Ji3$7%P*)@W~B#AxPFPAh8YLZHxW&^h47+*^h-EHRe~RJzXP;AA}Z z;@H7`OS(JKmy+)igioNnyrZ5Kmrq=_yvf(aYP5z|J}h*3!A(WWLThQOSmUup%mR<7 zb;^jMmwDq)hpdbIup(nfPaU%`ndGH5IMJbh=9~c7nI&>M0yQXL_|8Fcj+aehLFU!+~@3j z&bjy8d-r?e{gJW82n+~|xxV?$U;FmebLRlpgz`I}IoJV_0o1D~nJumwIzkX%nwj%A zYh*oFY>QqShh`CHRv)OBvgxTxI|P0X z3D8lwgVn;+CBD&($ougyCiY-d@C6}#VI#N8n>;WjAfL-pgMPwlX8NyAows;$7l2e(u@co(PMAx%HKvCJ*hZi%}W)H1L%* z>BDRllJ!gaUgqH6g87E*uMMe>sibSJ{d?`x%&N-5g3wUlk1BvI5(y5saTNqbuzRp$2nA_qmSe2tXOpnXOH>w=!%&=Mp zjSEo;tt~3t(k7p>*y4Ki*c4lD*Fn)nY1oLcn-j*ByELlB+gYu&03cN@I$(Ll zsH_|=j-x`*yeZ?T%%SuN|MruF#-zJPmppIxPDG@&A0fNd*Wn_5Ya z=oHDi7VvOFCf0h#Y*Wf2^7bQL+t1Dxhr^Vrr+81~Qw%A(z@Ed;)PYrE+8l|={*5S6 z);g8RF|JiS7Zumb(jR-hR8tdjrC<6N1B0fUp7wjgm>Uj`o{(=7NcK(_^;}D7QAjf3 z6%M^L7%{9>)NW*gC5I+_6F9RL$VrQs<6iK=dn0*qqvx7)jw)V}nB(8oHNW}BGhFZF zkv^83hZ6G!v$=1R#x3B zyw31Y#WWkb_br3j|5&-NGAK&u9}CDp*HHB$BlXR2MKx=$aD|&!Sn;ZlC>{1Jr}NfQlOr&UKWWqK~3Mu!z$;)Tk9@}e4(00D9I5nW_>@0$YDU;T{>6_dv8PQQ3G%j zFD8+^mw0p!A7V9FpHzI<<-ThBHcZJZ&-fhaGD?UW?aZP+dZPFeKUPsXOXtNP|^Op93ouM(;>R@qBc!8=}`WK*zQ!f3hC<1^WXaX@@ z{|+Yp6IA>^T>*N|OE)~i4-(}eSmA6Ps@Gi8SDkV% z*m%(RCj2rf0PUh>Bq$^zbND#YL;APw#E9v^8SLKy#C#+b~J4XfeWL)8vdw!&q-_7S^mlyXVl?(NHTFE zQ1{0B&05D{hcl;uAD84c9#JIdl5uGv^Pc1Q7LNk@WJ@HD6;A{^vqhjQPWcCpg&HfeFW7ovBlCM)94@L|}Op2~S zQi4WEN5i?@xn%#0Qp} z)TIo+Ukc}&jxSy$6MWD}6>;P&et*RID3+u1%8eVIGt3f|EP=$nJw1Np@H1{+EL}#< zeU8okpIwrjS-z-t|Mc|Q7aDY)TnWM%@?!}xEK^8Rv)*VAWr?pkUi@i(z0~t??k=p% z+Wc+otBP_d9&P6e#3So_7yUDf&io-@KvwJe`3X{t8!ShHqH~VeouYLnQv2X(8ypec zX^|o?3UAX5D6W#!2r$O|UT&VOdo6Kf{_592C4H5SQJB)T$!Qde=E3NJvYD?JN9GVz z87^f4W!N4GBw8P05X!Jycbnm{2Dy!veuKWK)(qiTnHtC}^a;#Vueo%A z{`~dAipgy(_`NmH>naad;u=^X+G^@Xk*-iqT_)#9%+@OzLm0>feepeT^k66m~DA{|+HWPF-4oy}x~6m>=YKC{KN(FMq3&9!86mYV(j>$g2^KKZ(DX+M`+ zruA8mhTCc-wX`u~wHsC|+KTAtM}O*V)gE)PXh2jbu$mq4$V{%;(;(KS<}lyGniDn= zgoXG|FiR!nvb^ZJghekelQ^aP(>z465_)ss)pi?bsCM!y$9!zjmO*@Mz3jHYAv4|j zS_R7~=snmD5jI`&Dpa#!{sQiRfl&Mf zByM2US3w%cV!CtX#yyV=V=6Tl-zvn?P3XFXZZ)QVnH!N<>?38S&SfRyewXCB#~i$B zUm~`cIdEd7DpL*j@|l)8lVIWa&?T&j=b!}Y-v6Nf+4DjsDmMK*_ln#^%bn+a6<2Z* zhJNH$@k?+jGY54sq_*XJoNDjhGI%F+>^!TpJ})8zQrJr;1=68uUH6{BPj+D7q1@}X zr0N@*RP%9@m=XW_+;dNvji}PnQs5;~nny-|U&tO1Zvu5C5{qlGef;#SkjVCsxgj@d z?Dtgz05nslb$?Vf;S^hO&m8weA--&!DgTOt1bZWSH>*m{^`3=~#{PN8_TNPwsB% zn9pcE{PN`bS$wY=k$Uek%W+m0a@wvFANPrwTPn`^hN1cet=zDJx+I{qc>PXQdFQ$X zTL&}YZZD5#u4~e}yNtE#zSJu^jo3b}y!ZC9;XmLNRU}7&{D@bA3W`3mAQB-X=e~nv zCylCH9ND$-2~r#gIt1O$e{%6AASv7QMN<@i$iQryghbB^N(oU%ph!3s z!4|+kH6kGB)F=KzcGswC8^w&j^Io6{{YB3svp#keU~CnF$)P@^pK`m!0d$xZMOORU 
z36qI`$R4lSg0fjKZ;*8;8-U=zT&y;W#?+B*Vq|7ci<=+vApgta7pkgnniF-K z4kDMO=Z0sf8+!Qn0Pr61=*z68dbm>DdKMuN)qo6(t=UfZX7>LZ+J( zM8Mr<VUOO(XTlL6F>C<8QJv0ACuBcSfn|e2^C)^UT)J ze1DuYvh-B_nOOdHb!BF_&pBLS2`N@7&Ubh2@d4^wGvtWtlA>XYKk)&qesG_Cmpghg z$k?i_i0WM|wW#u5QGs@PVN3T-z~3igq=r6jW$4=EsQsNn=hZ7T8B`i%bY^17GW*Fa zw}^efOnc+lg`V&R%a=KM>@T9MD>;rny&GgB48LU+k`rKu3Cd;>x7yZDI%Z?o(>vdE z11muWp5`{ah%V!({kZbuO6s&`EHwNTDohc-NwS;p zNM1A(?{&7~Dl6NlONGtt&UP=ve@8yVc=v#=;1bpprPIx?Owlo3$!7CCG64yxObvK+ zG3fg7`?ALru|+_Vr1xqSqs(*z1uF)n!}mrec4;i9JKjW;_Orjhqs7YtByrxQRj@x~ zmp0t0w)*Xh7t(`amAS!6c?QHx73O(HBh`j#0a75cKp%Xe8=_ns)UR^sW1?+^&)dNv z-{J3I?CRjQ)q#-x7m)OPP~QX<|KPXNV*ktK1_05($`bE>F0c(0x;&pzJ=Z^EPwsHR z6Vvf^QFo~Ns7{Qq`bs`!>O7GJl3?DBkH&HLL{QE^==>%UjWP?(|sBcP<2nf@WGLjqkRy3N2+=Z1lZ z2)9k&ml1F8^TE$bMwk@~%vv;mMWFk0%(ou})?T71_qgsKSIV|U;(q@j<6S!>HLC%i z^HRVbY`Akg+0^|JM(VMQl8h~lzp1vqU_oWLZ~5QT|2q#99YNp^+2x=Tcbk(HyVVDL z^E?a9Yu_Z}Etd(5p*Vu3+>#^s+gFqw^o-+J9FGOz*^wSdIhOY&G;xxCAib35633V= z_t^Q?XTKQ&m`2HoYpnqbd)=V~M@0ZkJ#Y7mTqvnH24d~TUz*MW(xs$Vt*<|zIk(5s zYtpt2+m|*~>@|KKd;(2vFFmQc)B=|210BxeUkxs@(=3bvT+!Vxw<-wqxv#mdrgW;E z)xqj=zbgFP1wR-%TD%kv$ zv@`A9g0;5++K+VS+N5bnSEq}(1+%lR>GyDkZI1f8ayd7}(FJ7gz#XrTb*`m_Ji(~H z>lZlj|I~WFxcB2N?%dg0icGp;AyIk;<%RvX@^gg?e2-yPq+REV-RJ<{Smm^pAV zA?PM#`{Zje9|^Zdb@cRlcsbB=6GcEn9~#7MO;B{L0hh_+!{BeC3>3*Pgm zAu{5OmTEAmT^02AjW6iH9v=QuIjT5(1mDkvEy3S$+()+w`PgZD&T&r1Jh^%NP{k(b zJ?!W~Ium0{*>Yz2>*In^z?M6O_(&J7hCW~iW%z}yZo=$jerCA|P7MxSMTb}n@(ui~ zzx%mQB-Lh57q*5=WnR*dhDa8+*cEl=++ZeVZjlO1 z(Oaa9&&b~HK*1-sB9Kz(q!;&E^PFO*3aHAUS})EH)GPN(Z@tI=o)Us7GAgUH44SlNjs!#EaIA&J7yB_Od=7-1d-83XTe10Avy_aL)UxS}!1tSfW#OIL!O?z?C z_YavP96_L1FN`uiAO4Pddiztan-o+n*=Sr9ABB{OjagxOX5N>z|BzPbP4l@7oF2#X z48Kj^OCn!n&uy*S?#$$B98AlfDN$azM8Elr4MkU@<>p-2T&Miyym#9o`JOu)D6HsE z;YG&MZ!AATh3HbyO>(QU^SXe_*)TPLVLEqYAslQ#L*wn2+v2NZqwQ3!?#X3@_2ilrErW*1K zGLD35X_q^8=ql?4F&RM~yh=!A214}!(We<1ka?o7!)!Q0mz4W4t#b>%AL(KFdTqYH zyJ?mh@$tlwF-GV*sAnJ1iF5$;pFw<+D?22)iT7L=t1oMkRA^t|Ce#zV(dO{r+n{2D z^lwPF#MhVGde=<9e`AmQrpOPVjuQZtG!0s=$R;~TEv(K%)oGjlT)J>^)72P-1H zudkc>GxP+#T#qqXf2U-*483f11xW|Z&+eQOfBT6Kycf)4Zm3-8VNA$tZg@K~t@EiQf!*f1w@vUR zUv#9Bx2M|xWempzU8X<%Ja`CBO$y4JHyHYf>X;AL^oSO3O~?j&9u^sc<7N6CvseSDp3@U_!)EB0Y9sTB_j|W> zjbglCj&_-pVN_8MU8e`XS-NOp)f}c%!>wz=r=9C1uj~9Ysuo%b?AU-!v?y0>rjt^+ zi}L3R25K_K4vK*A2kIl4#RVqPEu45yT`nJHFOP5b{(Etqe6h@hp5zmrgUTzNK>`y5 z@gNruSQo{Y?y4_yoO#2kD0K00B}?RL#-}EgxMy+`f!pG+kFaQ`MOvQ{<~BE z-?KLU$<+wv0z$Ar_>y;+jDJbx{EcQ<UHCtQtlZD^ zcs=}mlD$vZS7s_*8$B~ddgUGn+~QTUGayNV0&wpT6aj51!`$S+?P^sQ;mDYLFLpBg zRrDtyi));ilH*OEQsscTye>9*rgMorQ1^*H{WnpYP72Ei9V~ZbiN&lMna`ec{^c;6 z@7H;hqLw`qm-<~I&|h(|bAA4tVl@}>nYZf|;6#dEXSzS3 zkM{`Dwy-|AQobXOOfadgYG^XOfA?dq_!sVXi>`WAW1h6KE=MBUW%5stqD*2C?GRhc z@(=S0$~|TAo-g)(E%ndK^~Ab_g`azGrl_&XoMnMpvKxHys=1fM2PCK;%| z^rR6+yf2b%1C3QaUF>qW6HL68-Xa3(l?Pa50jpl<&J2RcJVt(2eOZ)IQl3v$dg~>4 z%jR!hk#L}v>OVAre(O7t_Vr*+=`(h_oe6I#It8u}DpCa01=0(udLR;DigG_q)l79+ zx#M}FF3k_Ls@K+KN%W}aZ!90$kS?}EqN9!%qVf^cefvuAUoBB>ph(_%QL$m@Clw@5 z?3aO4mxKGj$R&XGLA*4uqzbM{8YieR8;NcEXnqnrrpPAfiez!YhDfyEnrr2&4y|Pf z@-iE?9z0MsY8@EK?DMlQhz_m1Cj4;d=>A{HEEn0XhLAc7-T<@ww-*EM!oOi{HA(uE zK@~Zh95s5Zc}3>4A~9TjeITpa-&S{U*X^W~d&51M?0e&#-Q;?YuF3=Wr>N))B^jaS z+LEd8-erkjN55UfvRn@|-?vaK?_R8iZq(D${2}Xyxvp!TA$hPD8e?MOXFa)-PmZ7C zLObT!(IE$qRaJp1~wEfUXM4O?#<25ulz_8s^1c!SCw+$e(+EVvCrKe z_(V>>t!8xj&_%#B;>Gx<4TbbW040VQIUa`8^f+H9&id{XSM~{Ie$rf;tG>@t?DQ^E zCbrP~8=e zH%Z*~x1Fo4>JzB9n6A5@Jr9x@HFW%Jk)`&dYuIz^+h=b(hLx;qZS}?M&)uYPHyFeT zN6Nr4QBT$4m>Dwo>$Tfj%KE}^9*xA)#To!$Wq!&L#$00!9EL3Z4)YNc>;d$&4SB#4 z%r*=$BQavWp_T^>JQnC2ARMuM_g*j(;?J8w$(mzBl=n#ikO|b+{f%d56|Lu4+pzqlub+WgD%=kz?wXZSy3_ 
zA89;1Tk_0WvKi62OC0g;5D0n*cW64!cy5wv7@=D|T8~OwZND5N>;8fLv->Xv|EG9l zw31#ay!Q{;dl~t7RK(y*HJsR`K*jt31&m2j$8Lb)s}VZH-8A#V0CZZ=n8H(im9UbT z&Ze%pBDSUNa zH!b>wOB3kmc~=j2P1B!*E^Mcy8~v!;cXg}#J^AvMXg#lY2Qq+|dJIzRIK{(2=_5y9 zKs!yLgf2*2fLbkpGwF<{$cUUctL;kCTR#jvn{X>&p&o1;RpTa3~vJ; zNSRx?NFRNCC3N;-Bs)=6W+1=`7nqCFQ5{9rJlZ4kPo(YAk#w6fgdizC`1zrSOdwZ0 zZ7=<_6OxAACQ=UoOA+MMhQ@Rm1O!YBa;}grtblW5rVPVmo*d+@OZXNF`xFZ&I$KQ~ z53Pc2`!UY719vh_9_=v)su}^3)TommXJsdDZL%naCzWsNQ`~?Uo~z(^WP!tyt)G3& zNh<$cz1FNv{3XgLHA{gw=A{2sjs0KL*@F26|Eci(vjF~Yt}JU%kPJ~r)O2FuIq)wvr1ZzbeC`EJvSZAE;7>Xy2u-yNq~RV5w2hs zx|$+FYBp;#q(5!0+-)i0;R)_e=H%|;as(20baWHl9n$B<8+Q`F@q#{fRF>#nT^%(c zwIIYYIDcF-e1{AIU_7Y|T*eBD&Ii?Hpyu+zld|aQ)^J)Lr+{Mc8$Is%&PG~|lW_t= z?xxp4c&6f~Pti$agz_F^sbd36l@g49k5?m1^9#y=*rzQ?V7EE0gcxD!-ygLQQEI3# zO{}kaDhY29HJ@JaGW!>&PLI62ad?Hf%?N=7gHlHauoO7TA-r9@?_THp)?QY501<8n zQsxz`TO*$ehZzJ@k|7h0c45y(O+~|zfsT;as#ysN62<4B1X%C(T#Jsw3Go7c=xWV} zFVM=nU^7ho_kjQilM#&J`YAN=?@R_=b;aQ%fACMW`vua;hjw*>m4}POk;0-td)=#5 zBFMH8cfxjP*)(0ASaO;9Lg(FKIT7M-C5AJ|E9iSur^mvri3C|YvAdzj2SBGkAI1XF zX{#G6w{ETepv4?|^OQRq&tOt^^u13@NPSLf$(D`Ht^p7)Jh(PuR3Zr!+Aga5k z7j+@X9AAFgr?i)n-1_!S0xm(G+u?rL66G8?UP=NYA83#rGfQ<}Z3rf*ZoP;uFR=F6 z;7JTlmA(rW&84Z?($GBYGM8)M8p_v_4Y;1k*sooty1uwBO};8^pnfY%jN$^lS_e>A z$nlpjSIh$a7x%+GFGB-5`<^E^r@sFk zG};DM^2f^~%OPe#EW<8Jn#)fVRYy;VW%`XeKGA{WcYJMjD7EnVl+NrjCxsE`vH`MH z)lte17WBu(j5X7w>*GRZ8fo50&F@?65!}(FX3xrJ0e+>nF^ziG z#D75!`MbPZxVrmZ(&vWi_q~azg+ZAryTt|Ssg<2f*5RAIPf{4@i7C;S27lc%E_D7< zrad1Y+j~?ItJrBJGWLj-n++}IKGa+ELvy{J1ux@b=insi;&72?8$dW3C=1u#C4kSxa3j0PwH0=nBP)A6;=m3500iHkUq}Fc zBLgODXK+zeD8czvpc*vojn|++GCdLVk=LIQHtF$%M`u|2X!Iyh0j*bq*z0n+7OS}^ zyvy7TnBVj%ih=SX@(b{I2P#<+q?)qGT9bw?=nHp?j%*A9EO3ax#Q}}!&Oxp4-+MRj zMiwT8aW2MYyT>V>B5z>rx{<6tr4vtt_k4J2KgLc>d|5=vlO?9`D3>L)`tUA|ow~Ik znS{EomFH%hZg{|}xwD-KFW^OtUUO6X6vX@&MD}r)X^0*xNhNAWyx;%jyYpl|z0Igi zl-*M5mR3F_<9?$t}pG2+`g5eD=xvuF4rv`?rw@Y#X9jmOScog zP&eyArA+&(q;IIURW3O#Tz2|Q{w!ZBAbc9VBr&FSP)AqCN=Da#+dlcW>Kl{SG6A$ zJK6ef|AH%=b*^=;YhuTH@>#*oDP8(b=4TqJx64*9G{|4Hj@ol%F!vS^DGD!%tPl}; zHV+2S%sgUNjkB6Q_IG8O3)2-5vRx(IncLIi`~Hbp-+`=Hore;#iO!;}S(7UXGOkpq zUkWweXi(BxR;Dv2-Iq0tr+u^Ie7$*lhIwg1oa4S$UtiLx0PHiakJ zx@R6d$S3&>xU)g8DBTjB^_G~nF#CEhF#fR@C+A{lrb_ZGkd9rxQY}#uo@ex|2mAr3 z!D&Dt{s$*aes+r8)>e@F24n@-!QLbNiFF=QwpgaeXaj15(U*g{ zs*VKKUeep`GsxSgF#s*je6=wjI#A#^;aAY8;+fp-&9HL*y(8iPIbwN3zka+XTG?hsLUv?(z}s-GwfhyCM|A6O8F zLxeV#YrMhY;qF=wY1O|yiR$Vg#;JIAS3$Aj{#Ws{b%Sr zY7nEOFvaH+Knrd^78Z%E!bRo|gaD4uGXF~K83I!=17fiEZeYb1R7;Xr1B(u1whewt zjJMH568?~>4VHu=^_D|r?DI{oHYbHsHOLoKc76OO)T6EK$qxWS{|yU&iDfyfF5m(* z^nu~s=i{QEL67&K#Prs3NPG}__$-UU6F{Cby-k3`lK3RJ85u4p$RN zY+2r60eS#Cs&vc_$e7x40quk>G7q)}n$(4+_F(LA6`-RJaBaL_I^XBNZoM;s)U4xa z5RqMEy%f*35=Fg1}ZliZAJ)m2Iuvt=%6Tw!Oawn~uqvZw3S= zD{HO8;0f_DNXUZh`FSeoXb+BEFs{289ddRX?vcf~I zAFQV9{kSYLpzep>NxYW0l9JR7FRN{W6T&r*vo&X>4GqVswf`IKBlBP5ESR&9_@vkZYm#x?2zCDN|zQcQ>3zPCmsL z1;Wg%Yf{`{SIkdG5=M0l0c9;;&rL|FR~jl_c}uSDuaZutQ7GXfVMaxq8)B_H6tw2iuqtAC1_+faF(^=2p>{4E2Zv{{7V z9FOgL0qMe3eOMdr%1YuQ=zX1E{sGIV7<^cUW%suvKn*R@hT~r!S6)b+9HQF%shgnj zRby*o>-sZY;K8BH^kqkW2IT(GD52ubDgqe&8a}#v4OmzGw*Zj`^&hg7XQTtKV`vUC zx5q#rG2kJl-vJU^JoPQ8OUgDFv}JLE=UzxHaFaRW8@o0;)7sh%<24k5m|VXIN%R@* zIOR}mol9uJ`^q2N2O^433ERunH4hjm@-3G6oXD%y0(7Bg)BZIK~K5y(W1_Jgye^ePS z9`C?Ayn^6y7Pvz#0ibi2x9x0I<~3>sOOBI|y* zH6>*i89K`iD~hKbG={6N@Gj{JRqh69PBhW)8ZR}b*-Q{*N~i3~>vj?aPrs>>Jh$er z;&K+jY)J?;EZ66H(LZEjqjHIIL<~*L#88&yuqOD)KErzHK1~60u11 zl3mVabw$rVf^5dLk|j4PPSKcqC&>Ju>!hJfTC7pei!xvsW$~(D+-ApHpyRc+qFS5; zn|ZS!UgoBkUsn^l$nZh)fzVj{-!Z>`@5z608DFu2oI@)2LljQWMV;UWN}gNEYoOYD zgY5{0^E*_4!mRf`h&o+R`e~9V;Qa(?2X!I{$Yf2*8f~WsU@T!9Y!c9?ye_S0RI*JB$CR8$D(L 
z%Ed%m(1*Pgjbkq`U@24JL=Wa=2EwlP?))v9Wf5qs3bYugW=W8bfr7cMh!bVYg&U%X zQ;+RL{7q8rN@#(<-i}iv?g#y;M{_7Jg+=pz+DWV^AIpRP>(2eRr~si$7ZQ?2{{Oad z+xS%Xa0rC$Tw1I4`y%4X`TTW)+D56ojCpU6fVwGqwo zYN7{nWS;0caDeGI_080?Qp~H@yR@jd5d%r7r(T?dUr-u z1a{4ehT?_j(OG;g*vX}1){n{@nPUpFYf#g(iZm{)j;zZNH zWMs?}+Sw2qt^)1GZ<3?zm02*8C|%vnOsS5=s>9Dh>OH-c3lRj#lVq0PsEaDM`j!QC zLn|K9m0frpWz##Y?>TsGgDPI1Xw_}r95d$miAd{tQTn1(!Bg%55cFr_{@0lN4|pbE z;M~!NB|m&{nKFy+@hzugC`ADqt0O`{0TZ7JV5ih(Lvg&nn(8+QC}LjF9jH+5ZIYI^wNOFRs0xr3DSZ!+xIZOV|0dGy7!#zmJ>t-ogzC{+F z{I?xMSGg$Y=!cGE_G>Tg7F6s|UTFqgLdOt`s562${cyLOg4GU+zPaf8i9}kDnMg`e z3yNtSFES>Bi0}*eoj)qZJ&Yh!!!y9$5S~&Wscu%@tSVxmB;k$h{wQ)2DjE4h(gz;J zW;ubv==p`)MpV?46OQD&)U~AD9{lEyU3Mji73gh0Mk+efZO@8i;L?h<3UExuO<_^m z*4Msu72lP349w@%5J|z++VY)t%^BP>9AwO2q+4Q_B$8Kt=7{b#GMwH%l*c|PmHItD zc%b6?jf^5Y+i)0K2%-Vhgc3Bs+>%WoUCR&JiHISIVXnrT^}g}6WYN8$*W~s9fqzNV zKX+>z2V!Y}<6fi{kPKCHBbM&ac~+ZtAO`nr5s!2w4Sh!rnl_=Ud<-Wz^bQh6wV;1gxjs!fU^P#hs zB*}AAKKwI(^4}KY6)W3wV4Llu#)=c&0ZJT5TS>FB`=8zd$}7;o^Zi}CpogR`Dv}6@ zNxU-SGN9v#g_=S7KwpM|+PczhgMJ(hu%!cKNnV@S!+6Z^KXzm96;pp zu|X_|bAdC+VBaa#NL$FK_b=Y09AOhAD{C_36@^__hZNZBu53N-Rr}C zwMhhMMt91t=#B<7)}|%Cq5UyabUNv+5{eorbk~Qx8k0DdWMNTo+|BTgY|+W_c|B_I{)LN_!qJg(jg}~F8#;7^!oXAXQxl(-)9iQ``-ACK=0fw{HxlR;?kcs z#BVCOL>`82XzB;XLXoL#s<l@`EeUyw_rI?mo|)A5N`x+_R8(`8^HpvC zu6Pw7HF2kgp35PI`q@&ccbq#=(}b%Tp1i`QS3Q2n8(#WYk3zQ2RqB4-n;Ud1I;-4 zwhnEVmaL(}B*#Dd*?JDI(ybs?;F<4O*HnKGg zO3Xd@TJ+_`XGl*OR6pQClRq+CmS$b$VM3=$$S-h)38hO~cYmayfVPZZkxR>d3D6u# zt-SqDdT)9$VlL^rCYHWa#rExOT<8;8zIVW!AUN}#z#E)?5>yjy?3qH(Gam^=J$e=S zLyI#yT!ZS;qa9UtS&zx4pGaab6L> z8FgSWd>@nzp!z|AK2p6OiieJ$vNgN&g#oi*m`E3bFsBw#yCR1iay|uG*4}ZCX517c zO^mjvS3w1S3&ALRCbPA}(4MZn*E>UbP`E)^7W0R1|EpMtkRP$)=v{+~uuHdkGzQBFQvoHe<=(|H-Ei zs&!57uv}#Sl8@WIz>bOOc%oVkLqNO_8rkQu`pN{N@zVBI za?)Z6=UeIx)&2X3(I$}|@GJ3XC?0 zweW<`fQJ5`P>9)~-gL0tqGseDvh5fsUP0lg=@2^YPkN;H_nmit-yy?E53AG9qHhmD5p7<6xsc4|l(!+O_Nb&%S-3+kU!1@4K zY~^%3(+Yx+*!)j?W9q-)8(W&d592=gLxz?mX|%x+L^I+gf8rp)57-MWvi^{%${Z4) zTj&juEXjE=cxY#dSQ>ybSWfw1h)xe=Uz%tgJP`q5J@?@|4C17ixE2XWQ&11|z*ro$ z!z2K_@iFZLI9)yd+g|?L@&7-r7e9kDyULWOSu->Gy#5^Jg9XP(gWhw1 zBWg1gAI8z@+Hwbeeyr59`bKLpG%l}I+j(2SQ=CU@^mna-mT=NBu6A!fw{a2An&w-+ z?d7MOZoRejf+igF<_{SIcrREB4;f}F4`8BB%5yG$woa_}nlSh9=SD1pShs@Mak))=2LplGjzZTi-)56Y{gF@+ zr=cnA-mz=p{oWb61nG&^Ip-`y_;|nLy7(}J7mi_6qjxgVB#16>v6{$DJdBIjfthBC zW8+$vzgOg;1?oSVOG#T;~%qQJw;VhzjuK3I`&?%-e zYNI1LT$QUwp>Z%AVrUhhGW6Qk`qzYHxIh~vxWL16OMQ+%z}D1XzCk6f(?B)9@cus^ z^8fA%v<&}(%NQ7;Lk^SbsYCiMIh%|~=nU8i3!VzzuTY>@c98M3&J%kkBo@(a5L)FX z;c8WTu3(QORs=3LSN%49b|27nc2;v+tPej0#!&r{)yb6^j*n+E3ctC{ZAI>e2X z_ZE)-l8T?GsfDu!+-b!u<6J!_YDUH%u-{opkoW<9_o1;Di3~u0(nVA!uTEszOSaaH43?G0yzfh)6?+Eov!PCu?3ZQ~%5c*;+7 zh}>rmCV$AbDsGylE>K*rY`Lnu&eOGQ)@KkaRLLSvoezf&QlkjbhcjUB1vNq@0q#Sx|^PyDYM?l2N1b{b#OtD ztKsw4v>^>1_f`#n0Av6A$X_U0f*N)(f(A&JO0TVfjqo?z1*ACvDnL@EDUg(@zFfxk zs6{W|bo&~8t0)SuGzA7)R2Q%LUgRE23TLJf%%%m=!#LtXDxYne zsdv=3RVV3O4Xxrrl*WJ1eDfqIm>`6$%v#yMG=&TAWoQZQZwi%{c#I09Kcs57iC%w$ zG$W>xZpytSz`3*hRpxG(+158+4Kgj*R4DvS^ikdTmOBHhvgA`Mc)K)N<^z&5`N zZ|~>1-}v75{bRoc<6>;P&N`0sIGji-Tjzg)zF&jxI{LZ8c}cCvZ74pJ)T{;^K*`Tt zli?fRe@N#8==X^K$Xk==IN_}!a=UEOw}b`|Pb0QS@&Z6Tpbb!klCfu-M$rARCJSU| zYA5kIl6ZQ|Ux`$V2>b|i%U{=lMkFM+XOHJZ?A$ANAPVhHj6nV@fCI0HSDVPr?WyzE zBl)Y~{j(=Ehsyo?$G{rqznFU`?7iQYoXpn%ng9qiEJ4eavZe?UX$E?T9#@*)sGd4I zD1Oh&F(Ji$4jO>F^TAZZ-7CkwsFy~grdvwjqQ~(JFjKFXzBTh;04-6N^Oe=PTzma& z2a>L02i=-#y@si3<1A)0zanI8cE_y~sIk*r>G!<|s0!yf-iAykoA4eZz^3MSvZxQ|U4A1&E*202b(*+2UDdNjVSE6lutEG_ z0uccy1X)}qPD+FLZKv6OKAyk$C?Ah&ys9>5I|k_NYiFBvgxs>7dlFvS^=K;EJ5YQy z8}|$Js>GLm)^mMpPjwH>Xns#mET(`$peRMmJTYW1Ey?Q9ez!H!;$RkNb=eEFG&jdkN@TU7sb2dFd;X 
zJ$AdU9<5T!+&w-uWiqYg?kL;o!Who?2ApKZ6|~puCxl67UgeP(kRa-}9wgvaZ)vd_ z4lX{uo`F?;%FrNIJd_8S8a{LoF@EMFl?x-&HO#ePiS^j5M9SBB$g#_T=SEjBk#Jmv zr9nk17&ed~UTBy20%Cgw3K6Trje*}1)|PwW8aT(!_H)8!GelI&zK*rQvGTyBiSend zsVMCy=vyNtoO9_NoDLfDeOcoVoi=`EJ((e$ai*-ino7aVa`1`fmYobbS?zh=bcLT> zgSH*s%lgP8KtlG~86F|FAo{xojD$5G34ahs2Sg5{8-{&|k4L@Azb#z@sePrU#PsEV zoOtVw#wPr)d%+aHzodZ00H*P#=Otgu_d+I5HI4d&?KeIb(wfeCoFb91&c6ojVWXp3C^|j&U=$Q#^8S6xb%(eMA^T~&9`ftC zQbz^*hVCd}T@iGM{_I`uE%3TL)yOK$anqpN?>wOh3DCBLs)*TccEtpJM|4Ns`qu0N zLh~1fE1rF|(JK8T@bkXxxM!(++|rH}wz3jhd~rOhGyTcy1TtcUM+W$`=T0lf>q-jW zI-kUsm7isVE(~$Mb=z7q>(3Q|X{mRXg|!dvm$nbY#>Xc&RwB#?X748Yi+RHe@rknu zbq@Kw%d9uy#_FB4vSB|ikjAWe=F{yrNIZm#gXL6n!|wTUhQsGWl*YpzLEZx%YfIZj z^DpzSnT*Of*m$bD9S)-ruaNjNHlMXYnzqYAR9)qCx4=peSydoQ@(Mg@I58v8h) z_Q zTJNl&^XJ#+{m!o232kkKgU|ZX&o}I>CO@bge&CX49K!@`zVn^|FMl5U27wZj33l(3 zM0}+sAbg+9qQk?U8Th=HH6@)wQY*?qBFgf=!|gnZt@6jszE59kRdi)kp2t3Ijtdfc6Hb)A5HR$C}-irM+G+uS$^*3=xHNsAo$E zcu2^VtX#;9%Rb;=+fz?9Px;YvcSBEa-)14oNc!e6*pOtn`Wx>*{rJL<+M8@g4<>xE zGX?9GuUZUR`V!y*Q^2Gn7cZ5ZQ`{Ft5VDC>XP(KQxYvZ+^>-Bv_yrPXawlPVGHia% z-T?)kl?xQwJy-)=O-8`9ux5k^;6dnAc6XHD+E90U((Jhf^vN@0bt{GDYWV}SfH$G1Jl>m9J&-+XH7d$qNoj~crcl0Xwn#Dxh&4O#>J3i|j43peGQ z=YUP3V_F?+^c^{>Az^lc@*Y9SFISVH4G=ng2*_yboPwp0hLGEL>vaLP#0rr7^nPd0 zl)tm5Hj(2E8P+&6Na;6rLD`S|I72{@F8u<1)I%;0z|CP_n*P%x`uEDh@IZ#+-~88R z*RI1dJPN$6we_1vChD-1Wzj*7*B(B+zVmyrEBXh}QpR@@UH+XY8%PHvxf19aFtT30 zQh(~kp2IG4h*dqp#)h6C37~j54b^K7y^z<=|IUFeU)T^8OUYx+P^urYYL=Hb-*)03#d*^=ml4(iOBnpP2lm+H4rm)5Jnz~Z zoY^DvrF?ET8A7AoCM)WPN|ws2wd<~n=!RCRIN9%q;Hh|if#TY>9S_RcfkAtrsS2JO z(_=@z3Okqxfd=tjh9*LbpKuAAu6t<9e%>a)w#$gx{8b}zKJ(#;iYWbI_nW^!2;pWc z^3l5>Hb6xiP9w)?H`{Rjb5YqUE1N^zYb%%a!sV>0CDWrTkvYo=j{s%xz;71_qF2N> zwe=7c-gWspKYa&1Y>!xL%ZjtC+xj>h(wR*t<621^0%5k+ORe;RZXGreV z&)pI3_CmKmZS&B2-vE%e3#=WXUYGc3CS`f1hX_L71WoO7s?ub`xUx$>o)iq*AKAKQ z*92n@53~9^EUm(`XGs>h{qDWK+SvH?C^_1E{4=Z|@x%4zYh0~jJDjL2GW=tss?dt7 zdyYI4BsqFXh$V59>%!s45mAIOQZ--2+E*eIBfwe<6>~o)TP&+nGqm;o=Jdo@Y#%#` zQcscos{!-IOeHiJb`PJzQs>0d%W+*Z>K~qY?nADW1Z!Y zKVD;+A{{%&AS-jz@+g9M3ENTZRD$==8ZCancB9y;`aTzSl@4k`7-imXS`R2mHJ#mS zc#Wzx=Up|uEBo@>E)XBh=v$H*<)i-Y0#3Rf=+9u!EKkEv-{#xL%1fKK?7i@x2Ca-!c~W@ntZY&AM>)2i*V-hMb1)0_B)og} zZ8Ty*j&i=2uS4HLDW}=%Lb-v>?X!i^ZS!mbgSEAMmnA(TirEKJf|aa+0g7(YH$eNu z3_;fqGfL>tXtNvT4#TKkr>bi;CIDo4 z&xgc51tsj%2B50?P-_;Pvw-;FE8UUx8xp-NM{{f?nbBW)FM(jEQ#(Ps8)lNL!;yw; z=^6>M&ha0g6&n^$G_3nM0b5W%A~qqmKz#^Uiv2DK#Bo9$6F2Qe0rLCK!V|U0qy5YAp_;eh!J^g$I760gsG;XNJrrV^4&}pkm6&xV#geM!U zejm&RBHK{YP|Y%JN#h@r6Ds(SkJ-?g*Q(Wz9k|8ZnT1y7Y2!FnoTY3(hK5bJeurlp zS@4!?4t>u3nJVK|y7bPbU|*m=2u?HF7bK_Thk-J!{NNi(O_*!36d^la8R$P2`-zZf z>0G&(S!h?~JhQStw!d_9s;x}f&(NQ5SJFD^(w`ansT0MmQzs5j*SHP)v+|x;24+9L zum#wUc%`Ed%*!&Uq`U{V-fm^6q*)&~*`{fJXTMrMp>OMMy->~T2{P`*FF;1K%DGh)tUx_IT(Ig3hI?K-VkgiqANFvRh4oMW8LIU%-`^FHG*HSbxq#UT!iAoNEnN;-=5I?LzSa1dM1A>lGpE|d7+jhdbH6>$mmGn^orZ67+RgQ8UB9P9GU7p+trZ=~mC0XI$B z3VHho%q+famK|=hzpU*g{8RTZz9dlzM;BTwvLOVB((gw+G~He^34_%9 zCyLbY&*TEde&iS_$!ZBH$}qH5-SCvt+fKH7caZ@KtCVQ(wHymc1Y>K*lL}`$nMj8L zWGm~Iup|1?w#dd@^(*H`u~(mbk*1{VAMQStX9yzu3^K9_-`fyx^_HtGDR#b8g$kHX ztrLv;c?9dT5XYC!nLo$O50!vbK6X)YQ63b@8y_u=yWj_l!-9nmgvb+M9Rv;~cOS5( zE}T8m!Iy?OqJ5>(n7Gt<}c z$<&G(7UsvQ1fU{#N&Y$3FXUnEVs7n>HuwDeQVLHPJ$QqesY}juzYd>HtSG$gU#{_b zrILX>Uo|v*iJQ0YKxf;}lsIv?8UIS0#oET2zf832kr#j3&p}|7Ei)Q$J5)Sn5Ulsr z6XY)NG3Qi6;=U=~4*FUS&Kg$1`CNjV$MVv3shVmFBhghRObV8JnS8D9)c4*Go&t(P zC9X>oEB#7WkoBehNRtUB4#9Aih$Y`U^C(KZw-cW!4iaa1o3N9-V)zTR^=>f-IDucw z3C*3E%=CPvJ`6ujI8faH#_d7Lpy|vwiSpkC#2dYSz(mmwFw|jIu_)72RH`p?iMf6~ z>-&7r;T&3z$KHlQs=6UbH}3VDM8&0tYUJ{^x!*u(Ix5{3=kItOji&4oDFnIIZ(jJuRC%lO#T-9Du90jO%?gF({~4wO8Nz05G@8J 
za>7l=0Qy)TA&LZ)XCEKKk0i!McL%P|A%Ie{OSS=+c7FuobNjy*#k97NYkx)Kf)fVW zU&Ha=RpNh$w12_hds_rAGXN211^QM^Pa5Rk@V9tbD2W5$k~y#aY4LUcjgyLS@Pyx8te*xg~!lxo0o&PWNWFI_zV1>Jux~RkS5;lg&*@^OV@8L-=L0nESW(cavs2G3GR+(;{ij;p4X(6m@fo! z>yL{=k`1?Jk!OKz_mSiy;6^z&LafoZI`v7yv=9eEpA24lev6^;{h9DNS^2xGgHEyMR)o{t zG{4JQQE2D?gjE`VZ5d72iynvg=8MJ%eCe#>lY_-MfLZrY^bTTnjTf<=fd)UlJiO?d ze(1qyg!0vNBlK;;W%R!a*64hQTPX4+;~0y4e@g-OF zYvq|4{d=hKD$}&F_S-Uy(znJwmKW4u9^ZcsDSSty@S}oDlD;K8HzO#U#+!B8*$CC_ zP6va(k_>lqeAuoK6LOnzG>=NtwhYi~Kw{G&i7F*QD}(>h{m`=^>9 zBnkW~G7JK(A3G=WUEm0kdVzG73G4E`v2S9r`foQ;1ITHrV0L zgL;t8w1SoF4_RuH<}RM>e_V4>$iEn2ycBho*Xu)kiwXx#S*a8?<#V-D{xZm5i?hRx z7d^Wt6^0H}*og}ZU)WjtgLb43q-!c5HAJUVl2!9V)o&-0gRV zE8>aprL<%0?frle35_ojP%iVXJloB?D#&ZolDQAf_T2UaV#fI0jyBo(lKB%@jrrVZ zqH-?<06eG&LMQV+eBEg-CDIAg-VyF~Ey0m)_70lzNb51NHCKHiNgzKe@h(TCaZRm$?-RB7Q$6`9WnCI_9iG-V35L3ErF- zTbVs8O?SDx*tXKuP61qYjs$F>bB0QbX~H>!N?51MoyMy0OW|P*{7wKMEZ^>l3@X-K zif~&Njai~}$|TGTHni=b%P?V*TG_|tjazuvMc0)d&Zx(LjspJS7Cadt^v>L~@IehF zulZd}^S)RqdH~I|dM?M)#$bdC^K^aTUHXAe`U*T*{cAV(q4h#zZn2-N922mZc9P3H zX3E;Yw)jbm6z;(3b^5{AXETG<7c9`nS4{j0=hLF`Un;9=-W@;UZR%Yn-@G#lkfBBx zP9iz;!b}B>5S@OWcoFCYaASF6ZxO?*ra(r+jbDTUISzzZ{#PrZNNR;{%X-F5sM}MskALRrdx86kt25rQvI z{?W^S?<6q&wu=5;ms64*D_Ho&VStV=3>d(Z4^`5n(kwY#eN3M&3b}Lq92n*}tglKb8Y*-<$3<3_xyr zsyon{Obb^y+-I>R?cVp+V28GNpGSn2qP6NimDSx%e35WK?Hz7K12y?JBzFU|aYNrx zLvv90>E<}KMD&&3Zm)|0&C9Ze`mNm@7Hg$xLbr0l%vQN7SbCv-R$UMp{B5It$kiE@ zf=MxANN;_9OS*{pI_L>h!E;q;D&J1X+h9a$JFB;2Uf@;!hCo9sFeWOOKJ|y|O7S~g z{bO+ScR%xcCOq+vAN-y!{kB>E1I7QzXk@lbdcb}*FgRj~xT6GmjBhos#Lk3lxu+B{ zRlbhDPdtzl1Xf4T(e6T%sVX|v!#5s_nN4*McK;~S-rm?{UW1lbg+m7N$MKJTXirZB zxVVw6T>|r_XLJ#iJU6`9y6cA)O!+pek(BGsq)`oHggNvpphgH|-vZ^*w;hx9O2Be_ z$*>t?^?L3O)m0Dd9z)n%_&Yc0TelE~IZ`sQy2@-t_ne1wjC}Y;8mT;`3Bs5-7eAh9 zZJo*MvwPtRhgJkJwSy~l@>aTKgAyqX`Z`ILN3X^I27%4&pQlPith#PWJuw=J4+ zPr+`-Dvtb}ty^hIwLCzFTtADTc9?p}#G8rg6=hxuD_GTj;oAoo2eGD!<38lhnND{f z#4$1fA*3M8g#$rNIiGIwIWxlAQi&-;=TyWIcwYwZCaGyo@F{4n?bCK;NiLF#MMMfz z-}wwIyQiz;z~p#9$D#d)@KF8j4?+8{Rsx+m?x~IZgmh=51Dj%qP`s()5J3_%NlU37 zretG9-O6zAc~!ec8Zak$WF7Be9P89ajXx{77y`aI zD;oKrv`>5fRHM^ik;tB=`i$1cSG_gW&N1kxN9hVOamzw0a2nhFqccGc*%PTV8-X`%C2ZuV-_-Q9sn${7n+uQkTOOG#e zK4*wr%{_yjT=VX2I=`A2M(HFKb2OHu9l;8$Q;64kojYbc9@sK~g{uIy48z5FBS4j; zz5tmchwp34wmR1B9FzcxmE~U`W9&@VjaI@}A~<^gdJit1yM4y=s**QeCDt~ zEcz9A>{&uN zbmKO>A5pO~5;@x|aLn1dzh5LOPA}$k(5H)cBM3blTah+m#(XRdn(aQRpqs6& z+C?jt=+85!Pm-AM1w-s&cDLDh=EPLCGw8p~kH*GmEzqutYy4(C=_&v?;W52B>l{zr z1-Y*5Kl)dC+xd80tkYH5Vhm_kUv;+S39b|K9e^<4L17S2|m^MQa6Eo zc&-r`dfce%HOHwzg+(XKLb+vzi$*5STuiFsgemEq6nP0k^pRYirM#i zg>Hb0Q=!dNIc@k9gSX1gUfW}f$i4+|q>}OWoi8gbmOpsw?L^QduEQO(;Z4a@>m8$7 zfp7ImA1qhmJ`>JRVhq|ZWx)I=ebRH41+QKH#I3}ncP--m+hC>ZUlk0w)#NGsT(Etu z;TYtZal2rJYN5rck6S{|XA-qr>M3Wf{Uv9>lXCgBV_IN`M83c~E%e80>3CW{Wrv~9 zLj%3(;I_AQ-BO;rBc!0qu~L~{Pn=^i6RfTLK|kZ*X<$}4Q-Mg*rBxEIh-Uqe*9)NM zp{?~TMYZ|P7)kC`ZJ5suVNk)WMlS~6ZdH6ZJ}o76iOt#weDAd116b>imb&rnW#HXZ zq;}3`u9=0>PUD+8WZ)s|Ra7@NPd}YS7NQE9W;4>e{UPzhR|M+oZn9|ZnO0Iv-r|o| z=LN@M!Z+xQ<>A#mbmr=Z$Xw$U4$Tkomej3wHq>{h=^`R-Qm$wZI|-K`pL=yYp1^5p zn{1bD)x!Ha7U6Hm0CaZ9`5nv*W$w$}@oszXi3_6oArZsQOW$h_RVugC@pHBCaFTE> zt}n_R$}qKQSQF=1?g;n=W?1lP_rz^#x*CK4CETT8DW6k*{@KhcP>TAilIOPYs_3lV z;_SDT%3Nw!NjEY9rS=?GD^Z=60e71roL?k%_;OfYerNu;uOQYN@1e^GMIOhc#PIGVR;QBRRAl5-_xt|VSf8S&_Nmrj#Vp2sL3Cad$+D}0xZv&dt)8Yve01qR}f*lTiZ&FJ=#ti=jvZamPQ+Z3**75ek+<(;Yv*U zzgG=23ml9x0r{*_KHj+>swe`LDyiaGfCBg}!c_6CndQ7Jn@<$|c4jT|6^Cv=NdTXs zK3VZ_c=4(zfo^ha*V3&0kKU#S0RLBJO7uk{SgWn6Kbfbz5FjPtC}W1&IO!lAL~UeJ+ekYF6{VA+>Vg70qG+f(-sDQ3T&=BqqkofUF8ws8j!^NLXk6E6UUq^-^z38p* zJW3Km+zV%b?iUcj{lhVTQx3F0m;b$^2z<9sc4hzBB>h?7{#VoVAMXS8dIU)QKNs-- 
z<$lf|s}qx}WVF8H$IA9IPB`q)NEuJc)6!X_AL#`ka5eZ;;dGq8^p!?&oJF=fl5sUT z3-Eg0RAVJhe~0%|V>;FQbvOOMw7QFJ>RHD^I3!lY8t_a&Up=-2^d}Rr8m@9i{WLw| zMYhlhs{o)bebzYuEqf8ZPfyWP_~mVi{@YKn(%W$h@UNTDNbm3hnKP*@G4H>xID`1g z19*Xj#>Z9?eGIFk6ZD}K+|O|SRUa#3=kBL{$)4Xud%uysn2E4_U4mm?0w`}XU~bZP zo@=CO{0BE%{%O1S3c^@=9#_myXac}s@lVEiR}|W_%2XD575pf5NE!bz_Xt_P{s8c( zE;M*}(t}<_=RqH7Q2jii*CYV!uI5bk?$}+S@#_QGz zT_`Ln70TDQX4df@du(3CKbhiF0iNQ~lyP=QPdf<3yz^gnw=sRzDWTqXmnrS8G-%8i!7UY#E{ zFe`OcQ&!2#zZ9YN_PS_&=m1mt!rfub?t#_A7O8YEOn$ToKJdu_>Lryi!rI#eapl3nj`JfOqb)%rl)89 zs=isdVf|6eglV(g(}d9hew~h+d08Op*oYnaC@7m7JoFZ=4SixBKEig#=}~c;*9w2O z-^l*zbc4!Jc4iTZ3XrQ!F9)}4z(XNiH;wD{JtGd!mR8Aybt;OZHqn*@6_lB;sF$mY zmxtu^)V7&fAD^1UC$93S&dcF@{3Cq~Ysu(v$;o0T(}vh>(#nB`5F#QAZ$Jm)qVCIa z3I81mbQMb00Aced_yF9gK2=(9d}(obpp;scN;2GPU|*PE7CY{>E_*2Vx!J@NVXB-u6yZk8o}U@-E$EwO06Hmvg%Jd-*AWu zf$Tey8b1+tvJe|01sjK#j=g3y#e0(2YZ7lyFdo`dEfi11vGaW1gyYYmBPra|sR#}} zvtZ6~&xjF=2hoSHXaJvvlUqF}DwFZd>hSH}t?3JcFjwL%!W{W6nTJ*U%JIjM%gAwg zT6<3)VC|jAFl0RVEOxbH`_gr>_n0@oK)f;+zKs|gcQ_l)!0^0pd=VoZ^c&>aphU7Q_ z!Y%p&49b3}-4%=5bI9Ny7Y9=phWhO*v!Wldy`gv8!do}kdltA`^~ketd%i1a1qT;=AKyG37QAoS7}q#5;X zoQ0vi)|(pRUA&xui^Q8zBFq}tj(m`MH)0a25+RAA^h_9o_ixIj)VX%YZQl>wQl76( zAIke_#C`PxhaTF{+9rgh5}W@)kmPqVIZ1V9~g?DfUkX+ zrYYOfB2CTk39fou1o@pCDD33?9Ms)%bSDHL0?gX9Joc~mX*g~=EPVfo8;f`_b$4Ph zJDa%N69zE6Es=u8j=rv#f&F~TF}FR7`zh(RN71T;wcJ%d>G(eJW19^@EnuFB-hpIf z)~kGOVtqE7EPI3L(cF00DI60eYWg!Kw0ddHJhUBf7@XjV%(ef)6ML(|j(+0ucH66R zRCKz6*V4*^9(t)>r9Ih8ZwFFYjpT2eYmp+b(~R@rFbmS?j~%U{r6}XrTehuymT&2# z`Q*lG+$5Q=siz!7e*9kh;LS87K#9r7M)fzR0LUXJx#WMm0IEDX|EudvxgIbHRR9t> za9Eu)0#RE)879YCG!qmO(ru)_RSqOm{DShxxn)tfzOQ%9F|=jcfILVv;sVf)$G9HB zdaUc~UF!!SLm_sP^wI~&SU~mkWHQ@=%AsB-owEbfMdPzOT16+t&7lKlR7i40)y)>| z$2T5$C2yAHi{WU?ov$6d;3q{0J(cq+;CHr;_e)^Nt}i}V_F3>u+=!dw%l5ry&U5Wn zU$J>k4LSNvx-1DSh{7}b03X<~{o2d~n4w5QYDU!V*2U_+Q^ryAzd%t12=`6#9Rp7# zCO~WLx7Gqr^D)0CoWwx?>ZZMTXYG*U%O4PLuiS+&#FG>fIoxp{eV+K5aGMy9Z8U%W zV85?;S|TH@@jj9l!A^&qHQZQC=HKp{K(*(#G*LMXPUv%!ijZX+VN!ga5G(r3GXzz1 zurmlmLuRpZY~%TEb%sH>UY;hYa=7V4?8pNzdyz##m5MyFpuj`2Fe4Vd{izMEgoYyetB5o}XfCW9^F(vT#`=g#w@ z*43IXguDw?x?N0SG-^!Gpc6Xw>gpWY&RXr#5~+fUNfMK{j#`>9y2yviNqzg8iNr&b zfy^>bZ*5O>^?+es zj6SPi)qgwxqD>cB!`6acNcVrPnY|l=cHt2PxRzGXBSr*h51lNs z`ON9YmHPVXgvCV@#G3Lc%`qu$42h% zhIW!JmtXS`v{9>w_X<7>yo%4e)?l|6TzhhUmo%w>7Vcjg>Aw~4n0ys;OdOJ#Ns&ml z*wLfujYYWO6X5fdA^gM{h5BI98CD_SVS`u25KM+Eaoul}=WDf7N0Yk4B6L#a^Ys-+ z+&Rrqh)Hot9utzzg)4ViYiJ!d7{>vD|3)D105f-ar+r_1lRI`xPsTcbaoW>41-!d!nlOJn4%?}Xpl>F3Dr zPIVWLTvsW)HAR`xyKA&}#ku$5fl@2*QwQha*q5gAs%CwR+6SCRu&{AC6{q_`0Zqyo zXd84T271u7zp~GL%Qh>GHPCiYHCwMFs>VnUSjTLqcQ~a65W%OfqOFM^AQw<%_Jt+O zqPGcr9@`riq7dx>X1GIa{Wf#oaTm?LrT-`pkZe?-qX?jQkVPK<&`bY2%9}`y7oSVV z-$!uR*6w_C2zPArc(5Vni#iHKauD{=MNrFL^Cc`g%VDQ%&o!*hm z2#EkVaOG`GA%L#gsX>lwWw}Y`I_L(wmHou7lDCnBeN|DqmFai-EtIOR>#eTkzNIvT z@B3+czrp}+3gtbt!oK?RR?H2V{#cW~&z;sTQ)xYiqrqr@H|^6|o}uj%?>tH;E7yD; zqfV)8G)r4;4Aqo?+c-$}&*xoG44BOM2{O5>8%xwWC4ZPHKDTg}L!|MXpre z#3Sa#eH5VcnW*3I4t-r~Sm?z8iIi+pYpMGf?6RK0i0V%IK(H%I(B&z7-+NlKRZc7% z#Ivf!y z{Bjgzd^@}Ia2i5nWnekBV zudw%h7$_CtUb)O-(2EK8_0ZdA3(mqe`B9|@r)|)Z<;85b*)lfSMRzv{6*n8+f{d?L8fV6*l;S(P0f4*F^{Yzjp zvEl!uPx+e($OH%%2Qvk}>8DIW3_fSZyM;lcNavBXoky_VirvtSt2vE4_geApXTu3^ zktT;N^{<_9+Vz;31852ncm*YjKgi&B5Iy_Ha>51ER!L5QHsiX;wz0mJMqpce$f>o9 z6O16~A6+HIj|IUS7MNkOgc=Bz!w9 z?Aan=-YT}O3Ize2w50dZBB#)`EfF*Ro63VHX({foc_$Z_e)bz$aYi?kiAPVu_t~#X>s{`G|>qGl74ebmaPg1KkOYs0|Y&2`7(hJ7{=j67MeUp>OB3-;|U@ zGkCr-edL~sG#qvvHk+c-ma1zPiu3# zlgJQ#-(pRHxK5F|;BxsxG+}m+9_o~dKm@H+oOsdDfw=SEo9WyH21Jj3qq_eb3KbXH 
z?R%Fbx__|0wc={y&ZsD%CC1J6Fv!R@gT62Qbrby-;G_1S}%@26%fPHV69Z(pNnJ4tNf$^fPTa`q-V^w%@uQsx9+4qW^=_OrfBi=rf> z$yPz03@S#VweV3o1$l=(*f7Hw$|=qgJL_xx#z!`N zV4+Ob=vT`hSRhsSUR)L?G@_Bkq4siGuH^K_bmgQiN4Fub&K>2RtK_H2-|eGgWd{^8 zC{A9ByC3T@jcRK<x^%7+Gk~}Km zCFwHmM-7dz#8Lcux{F$ybI|atbA&uH)th@XCzEh}Fhy-}R9K}g?&@vvn}=4=4R5k> zB%_g>IgV3dpyV3j`2s%pAa{$oA$ zqiN5&mMWL8gR2vdvk8qaW>M>kzQq|PV)!5 zTj%f5kC!lJr5(F|KTyQOC<>PmLfBg$Y_h86j##f#@+NE2zCt5CtbAA-AkC@YzF zW$9M{#TiEtQq!V93!D?5WsJ*7NJJK-)WOtf!UnE5pO1bHl6SF~H$w&1Ys^l&>c+~S zV#{i{km#}xy;k{?{G0U8sJ$^*dRVOMpj>WUVsl4Ah4kJOFh&+4uQvfCWB*$r>VGB_ zifo{v`@IV|`oHd!|NVOu8H2hCT*wYTK}|TOvH0{=3tKBz(Y)3Bk43@JlH1W2{Z%fG zO%}a>QwTD64G^vlk#h(B&*}vVdwgupslA_)wtcJ{^Gusi=5%M-PioBqArto^8G;TK zjfuo=dsn$1Wd&uv(Y3JKYyQkG0D_IcDaw=Fkf9up#ow#t8r8V-#nf+-Z?}6M0EJR0 zD}t{n2sYw4!Og-#NB){KQ$`TAd6Wil^sa!YnH50JAhZUB8rm=YQrm-Nt?!<)<3M*F z1Du%^q$0Piz^clENw{8Z;IL2R><9AcyEJEo7a2N zx@z*~lg)_Eb*nw24_AC5ecO-S*7y|xx*eb2LGnJpn07m@O{7{|8G>IReK&bO43RK1 zIn!-7debyuSyc>B+SO@8aRVoEc?7kY9<|=xz|IVI3DCR8{>m!4D@uMjvrU1db7PIP zc_<(<(Dr~ z4$OogGbSMAXAie9Oa8vP^Sv7D2(BTM$3|^qdcFx*WId^|RbZmA0WXc!Xhnb?>~Tic zHa0~$J-L5fpgNo$`1pJPH2twIR&VZk>>ikSgq{b2HmL-E|6++=tb`gre>L zH}GKY^o8NwfeJaSmU91D>4k{J>lLZ%3SO1pe1fL6-Q~cebsSg8cRJ^gtuqY1a-)sS zXwzqb%DkgT7kyvFtRsUPbhhO?CA(lNHeqh&0&Oz)j@~N>hZE%$@ z{8E}&ROqxe?|a}q&-#;sHGoYyYF#z03=nJrUb++tSH|D3&Bm>#nAAFyrZwu2@ex`Q zfNCGsJwr`k^_!pJH1_oFHM_UY-;kto-ZW^ZR6cxJUXA&PG1qv~Zo;o2_>Bf_MY#r( zC|XI0jeMJ_KS}=YR?%Q@61*YLaID4PWxYC$nL*XnsP?VFuD+MGzw#zN>8AxZ{${Cp zcI{>^ciY)J+rq;ofL?WHG$j6q=W2^9fI1iql|S&!WeL6%aS8Rt-LNz73&G)iPH~Qf zYwU9$u5zq|-Cdo)3ewIz=C#XUl#(CcoOUP=%b1A8rm32+$YW_QN4WL;D(pQ3U(^f5 z&<8u_GEB+uEmK0wKKbXam7hY&>>A0Wz-+$N$DPl4;@}~Mw22>1S07Gr-pEckMS*}J zWm}f2?(g^7d!}H(p%rRBmb7Bc1#P6Cgq;aXKhW0x9I>?_T)e(vRZ$Tlbus)!wr#Z` zkH101QqP4OwoBY1KD$_^k+Xw&HN`|*XE-g18`*NXKs!$(|5X=%SX)HtQ);Tyab10P z%r}^f+y&LMYjSD^?gq=8|M{UPFyd<7t^ZQN{Ii<*=WRg4TXIxQPRe4tpbqd}a(16Z>n96lc-l=r{0##8g zrQ*QKpJ-zobDa44>x+r!VF9pW#Mo`?rTK(hp69~Fj_-M9t^m8DY4r)w?M!;qCW@>X zU1(rQ%O(|iEIykX?AX0w^U?Ub&zJk5zd(16w7&V^>gK? 
[remaining GIT binary patch data omitted]
literal 0
HcmV?d00001

diff --git a/soni_translate/text_multiformat_processor.py b/soni_translate/text_multiformat_processor.py
index 226169c..4d7f522 100644
--- a/soni_translate/text_multiformat_processor.py
+++ b/soni_translate/text_multiformat_processor.py
@@ -8,7 +8,7 @@
 import copy
 import string
 import soundfile as sf
-from PIL import Image, ImageOps
+from PIL import Image, ImageOps,
ImageDraw, ImageFont punctuation_list = list( string.punctuation + "¡¿«»„”“”‚‘’「」『』《》()【】〈〉〔〕〖〗〘〙〚〛⸤⸥⸨⸩" @@ -315,7 +315,12 @@ def calculate_average_color(img): return average_color -def add_border_to_image(image_path, target_width, target_height, border_color=None): +def add_border_to_image( + image_path, + target_width, + target_height, + border_color=None +): img = Image.open(image_path) @@ -324,7 +329,7 @@ def add_border_to_image(image_path, target_width, target_height, border_color=No original_aspect_ratio = original_width / original_height target_aspect_ratio = target_width / target_height - # Resize the image to fit the target resolution while retaining aspect ratio + # Resize the image to fit the target resolution retaining aspect ratio if original_aspect_ratio > target_aspect_ratio: # Image is wider, calculate new height new_height = int(target_width / original_aspect_ratio) @@ -357,15 +362,134 @@ def add_border_to_image(image_path, target_width, target_height, border_color=No return image_path -def doc_to_txtximg_pages(document, width, height, start_page, end_page, bcolor): +def resize_and_position_subimage( + subimage, + max_width, + max_height, + subimage_position, + main_width, + main_height +): + subimage_width, subimage_height = subimage.size + + # Resize subimage if it exceeds maximum dimensions + if subimage_width > max_width or subimage_height > max_height: + # Calculate scaling factor + width_scale = max_width / subimage_width + height_scale = max_height / subimage_height + scale = min(width_scale, height_scale) + + # Resize subimage + subimage = subimage.resize( + (int(subimage_width * scale), int(subimage_height * scale)) + ) + + # Calculate position to place the subimage + if subimage_position == "top-left": + subimage_x = 0 + subimage_y = 0 + elif subimage_position == "top-right": + subimage_x = main_width - subimage.width + subimage_y = 0 + elif subimage_position == "bottom-left": + subimage_x = 0 + subimage_y = main_height - subimage.height + elif subimage_position == "bottom-right": + subimage_x = main_width - subimage.width + subimage_y = main_height - subimage.height + else: + raise ValueError( + "Invalid subimage_position. Choose from 'top-left', 'top-right'," + " 'bottom-left', or 'bottom-right'." 
+ ) + + return subimage, subimage_x, subimage_y + + +def create_image_with_text_and_subimages( + text, + subimages, + width, + height, + text_color, + background_color, + output_file +): + # Create an image with the specified resolution and background color + image = Image.new('RGB', (width, height), color=background_color) + + # Initialize ImageDraw object + draw = ImageDraw.Draw(image) + + # Load a font + font = ImageFont.load_default() # You can specify your font file here + + # Calculate text size and position + text_bbox = draw.textbbox((0, 0), text, font=font) + text_width = text_bbox[2] - text_bbox[0] + text_height = text_bbox[3] - text_bbox[1] + text_x = (width - text_width) / 2 + text_y = (height - text_height) / 2 + + # Draw text on the image + draw.text((text_x, text_y), text, fill=text_color, font=font) + + # Paste subimages onto the main image + for subimage_path, subimage_position in subimages: + # Open the subimage + subimage = Image.open(subimage_path) + + # Convert subimage to RGBA mode if it doesn't have an alpha channel + if subimage.mode != 'RGBA': + subimage = subimage.convert('RGBA') + + # Resize and position the subimage + subimage, subimage_x, subimage_y = resize_and_position_subimage( + subimage, width / 4, height / 4, subimage_position, width, height + ) + + # Paste the subimage onto the main image + image.paste(subimage, (int(subimage_x), int(subimage_y)), subimage) + + image.save(output_file) + + return output_file + + +def doc_to_txtximg_pages( + document, + width, + height, + start_page, + end_page, + bcolor +): from pypdf import PdfReader - reader = PdfReader(document) - logger.debug(f"Total pages: {reader.get_num_pages()}") images_folder = "pdf_images/" os.makedirs(images_folder, exist_ok=True) remove_directory_contents(images_folder) + # First image + text_image = os.path.basename(document)[:-4] + subimages = [("./assets/logo.jpeg", "top-left")] + text_color = (255, 255, 255) if bcolor == "black" else (0, 0, 0) # w|b + background_color = COLORS.get(bcolor, (255, 255, 255)) # dynamic white + first_image = "pdf_images/0000_00_aaa.png" + + create_image_with_text_and_subimages( + text_image, + subimages, + width, + height, + text_color, + background_color, + first_image + ) + + reader = PdfReader(document) + logger.debug(f"Total pages: {reader.get_num_pages()}") + start_page_idx = max((start_page-1), 0) end_page_inx = min((end_page), (reader.get_num_pages())) document_pages = reader.pages[start_page_idx:end_page_inx] @@ -466,18 +590,12 @@ def fix_timestamps_docs(result_diarize, audio_files): def create_video_from_images( - document, - width, - height, - doc_data, - result_diarize + doc_data, + result_diarize ): - # First image - text = os.path.basename(document)[:-4] + # First image path first_image = "pdf_images/0000_00_aaa.png" - cm = f"ffmpeg -f lavfi -i color=c=black:s={width}x{height} -vf \"drawtext=text='{text}':x=(w-text_w)/2:y=(h-text_h)/2:fontsize=24:fontcolor=white\" -frames:v 1 {first_image}" - run_command(cm) # Time segments and images max_pages_idx = len(doc_data) - 1 From 0e0f68e95fdf9963fb19f528a2a6d2a63c4e306b Mon Sep 17 00:00:00 2001 From: Roger Condori <114810545+R3gm@users.noreply.github.com> Date: Thu, 16 May 2024 22:10:04 +0000 Subject: [PATCH 34/36] chore: gui language persian and afrikaans --- app_rvc.py | 32 +- soni_translate/languages_gui.py | 706 +++++++++++++++++++++++++++++++- 2 files changed, 720 insertions(+), 18 deletions(-) diff --git a/app_rvc.py b/app_rvc.py index bc9b5fc..47718cd 100644 --- a/app_rvc.py +++ b/app_rvc.py @@ 
-1755,8 +1755,8 @@ def submit(value): ) avoid_overlap_gui = gr.Checkbox( False, - label="Overlap Reduction", - info="Overlap Reduction: Ensures segments don't overlap by adjusting start times based on previous end times; could disrupt synchronization.", + label=lg_conf["or_label"], + info=lg_conf["or_info"], ) gr.HTML("
") @@ -1826,17 +1826,17 @@ def submit(value): gr.Markdown(lg_conf["whisper_title"]) literalize_numbers_gui = gr.Checkbox( True, - label="Literalize Numbers", - info="Literalize Numbers: Replace numerical representations with their written equivalents in the transcript.", + label=lg_conf["lnum_label"], + info=lg_conf["lnum_info"], ) vocal_refinement_gui = gr.Checkbox( False, - label="Sound Cleanup", - info="Sound Cleanup: Enhance vocals, remove background noise before transcription for utmost timestamp precision. This operation may take time, especially with lengthy audio files.", + label=lg_conf["scle_label"], + info=lg_conf["scle_info"], ) segment_duration_limit_gui = gr.Slider( - label="Segment Duration Limit", - info="Specify the maximum duration (in seconds) for each segment. The audio will be processed using VAD, limiting the duration for each segment chunk.", + label=lg_conf["sd_limit_label"], + info=lg_conf["sd_limit_info"], value=15, step=1, minimum=1, @@ -1852,7 +1852,7 @@ def submit(value): ASR_MODEL_OPTIONS + find_whisper_models(), value=whisper_model_default, label="Whisper ASR model", - info="It converts spoken language to text using the Whisper model by default. Use a custom model, for example, by inputting the repository name 'BELLE-2/Belle-whisper-large-v3-zh' in the dropdown to utilize a Chinese language finetuned model. Find finetuned models on Hugging Face.", + info=lg_conf["asr_model_info"], allow_custom_value=True, ) com_t_opt, com_t_default = ( @@ -1863,15 +1863,15 @@ def submit(value): compute_type = gr.Dropdown( com_t_opt, value=com_t_default, - label="Compute type", - info="Choosing smaller types like int8 or float16 can improve performance by reducing memory usage and increasing computational throughput, but may sacrifice precision compared to larger data types like float32.", + label=lg_conf["ctype_label"], + info=lg_conf["ctype_info"], ) batch_size = gr.Slider( minimum=1, maximum=32, value=8, - label="Batch size", - info="Reducing the batch size saves memory if your GPU has less VRAM and helps manage Out of Memory issues.", + label=lg_conf["batchz_label"], + info=lg_conf["batchz_info"], step=1, ) input_srt = gr.File( @@ -1889,8 +1889,8 @@ def submit(value): text_segmentation_scale_gui = gr.Dropdown( text_segmentation_options, value=text_segmentation_options[0], - label="Text Segmentation Scale", - info="Divide text into segments by sentences, words, or characters. Word and character segmentation offer finer granularity, useful for subtitles; disabling translation preserves original structure.", + label=lg_conf["tsscale_label"], + info=lg_conf["tsscale_info"], ) divide_text_segments_by_gui = gr.Textbox( label=lg_conf["divide_text_label"], @@ -2191,7 +2191,7 @@ def swap_visibility(data_type): maximum=99999, label="End page", ) - gr.HTML("
Videobook") + gr.HTML("
Videobook config") videobook_width_gui = gr.Number( step=1, value=1280, diff --git a/soni_translate/languages_gui.py b/soni_translate/languages_gui.py index e9be02a..970cda6 100644 --- a/soni_translate/languages_gui.py +++ b/soni_translate/languages_gui.py @@ -26,14 +26,14 @@ "description": """ ### 🎥 **Translate videos easily with SoniTranslate!** 📽️ - Upload a video, audio file or provide a YouTube link. 📽️ **Gets the updated notebook from the official repository.: [SoniTranslate](https://github.com/R3gm/SoniTranslate)!** + Upload a video, subtitle, audio file or provide a URL video link. 📽️ **Gets the updated notebook from the official repository.: [SoniTranslate](https://github.com/R3gm/SoniTranslate)!** See the tab `Help` for instructions on how to use it. Let's start having fun with video translation! 🚀🎉 """, "tutorial": """ # 🔰 **Instructions for use:** - 1. 📤 Upload a **video**, **audio file** or provide a 🌐 **YouTube link.** + 1. 📤 Upload a **video**, **subtitle file**, **audio file**, or provide a 🌐 **URL link** to a video like YouTube. 2. 🌍 Choose the language in which you want to **translate the video**. @@ -48,6 +48,7 @@ - FACEBOOK MMS → format `en-facebook-mms VITS` → The voice is more natural; at the moment, it only uses CPU. - PIPER TTS → format `en_US-lessac-high VITS-onnx` → Same as the previous one, but it is optimized for both CPU and GPU. - BARK → format `en_speaker_0-Male BARK` → Good quality but slow, and it is prone to hallucinations. + - OpenAI TTS → format `>alloy OpenAI-TTS` → Multilingual but it needs an OpenAI API key. - Coqui XTTS → format `_XTTS_/AUTOMATIC.wav` → Only available for Chinese (Simplified), English, French, German, Italian, Portuguese, Polish, Turkish, Russian, Dutch, Czech, Arabic, Spanish, Hungarian, Korean and Japanese. --- @@ -130,6 +131,8 @@ "acc_max_info": "Maximum acceleration for translated audio segments to avoid overlapping. A value of 1.0 represents no acceleration", "acc_rate_label": "Acceleration Rate Regulation", "acc_rate_info": "Acceleration Rate Regulation: Adjusts acceleration to accommodate segments requiring less speed, maintaining continuity and considering next-start timing.", + "or_label": "Overlap Reduction", + "or_info": "Overlap Reduction: Ensures segments don't overlap by adjusting start times based on previous end times; could disrupt synchronization.", "aud_mix_label": "Audio Mixing Method", "aud_mix_info": "Mix original and translated audio files to create a customized, balanced output with two available mixing modes.", "vol_ori": "Volume original audio", @@ -142,6 +145,19 @@ "burn_subs_label": "Burn Subtitles", "burn_subs_info": "Burn Subtitles: Embed subtitles into the video, making them a permanent part of the visual content.", "whisper_title": "Config transcription.", + "lnum_label": "Literalize Numbers", + "lnum_info": "Literalize Numbers: Replace numerical representations with their written equivalents in the transcript.", + "scle_label": "Sound Cleanup", + "scle_info": "Sound Cleanup: Enhance vocals, remove background noise before transcription for utmost timestamp precision. This operation may take time, especially with lengthy audio files.", + "sd_limit_label": "Segment Duration Limit", + "sd_limit_info": "Specify the maximum duration (in seconds) for each segment. The audio will be processed using VAD, limiting the duration for each segment chunk.", + "asr_model_info": "It converts spoken language to text using the 'Whisper model' by default. 
Use a custom model, for example, by inputting the repository name 'BELLE-2/Belle-whisper-large-v3-zh' in the dropdown to utilize a Chinese language finetuned model. Find finetuned models on Hugging Face.", + "ctype_label": "Compute type", + "ctype_info": "Choosing smaller types like int8 or float16 can improve performance by reducing memory usage and increasing computational throughput, but may sacrifice precision compared to larger data types like float32.", + "batchz_label": "Batch size", + "batchz_info": "Reducing the batch size saves memory if your GPU has less VRAM and helps manage Out of Memory issues.", + "tsscale_label": "Text Segmentation Scale", + "tsscale_info": "Divide text into segments by sentences, words, or characters. Word and character segmentation offer finer granularity, useful for subtitles; disabling translation preserves original structure.", "srt_file_label": "Upload an SRT subtitle file (will be used instead of the transcription of Whisper)", "divide_text_label": "Redivide text segments by:", "divide_text_info": "(Experimental) Enter a separator to split existing text segments in the source language. The tool will identify occurrences and create new segments accordingly. Specify multiple separators using |, e.g.: !|?|...|。", @@ -222,6 +238,7 @@ - FACEBOOK MMS → formato `en-facebook-mms VITS` → Voz más natural, por el momento solo usa CPU. - PIPER TTS → formato `en_US-lessac-high VITS-onnx` → Igual que el anterior, pero está optimizado tanto para CPU como para GPU. - BARK → formato `en_speaker_0-Male BARK` → De buena calidad pero lento y propenso a alucinaciones. + - OpenAI TTS → formato `>alloy OpenAI-TTS` → Multilingüe pero necesita una OpenAI API key. - Coqui XTTS → formato `_XTTS_/AUTOMATIC.wav` → Solo disponible para Chinese (Simplified), English, French, German, Italian, Portuguese, Polish, Turkish, Russian, Dutch, Czech, Arabic, Spanish, Hungarian, Korean y Japanese. --- @@ -304,6 +321,8 @@ "acc_max_info": "Aceleración máxima para segmentos de audio traducidos para evitar superposiciones. Un valor de 1.0 representa ninguna aceleración.", "acc_rate_label": "Regulación de la Tasa de Aceleración", "acc_rate_info": "Regulación de la Tasa de Aceleración: Ajusta la aceleración para adaptarse a segmentos que requieren menos velocidad, manteniendo la continuidad y considerando el momento de inicio siguiente.", + "or_label": "Reducción de superposición", + "or_info": "Reducción de superposición: Asegura que los segmentos no se superpongan ajustando los tiempos de inicio en función de los tiempos de finalización anteriores; podría interrumpir la sincronización.", "aud_mix_label": "Método de Mezcla de Audio", "aud_mix_info": "Mezclar archivos de audio original y traducido para crear una salida personalizada y equilibrada con dos modos de mezcla disponibles.", "vol_ori": "Volumen audio original", @@ -316,6 +335,19 @@ "burn_subs_label": "Grabar subtítulos", "burn_subs_info": "Grabar subtítulos: Incrusta los subtítulos en el video, convirtiéndolos en una parte permanente del contenido visual.", "whisper_title": "Configuracion Transcripción.", + "lnum_label": "Literalizar Números", + "lnum_info": "Literalizar Números: Reemplazar representaciones numéricas con sus equivalentes escritos en la transcripción.", + "scle_label": "Limpieza de Sonido", + "scle_info": "Limpieza de Sonido: Mejora de vocales, elimina ruido de fondo antes de la transcripción para una precisión máxima en la marca de tiempo. 
Esta operación puede tomar tiempo, especialmente con archivos de audio extensos.", + "sd_limit_label": "Límite de Duración del Segmento", + "sd_limit_info": "Especifique la duración máxima (en segundos) para cada segmento. El audio se procesará utilizando VAD, limitando la duración para cada fragmento de segmento.", + "asr_model_info": "Convierte el lenguaje hablado a texto utilizando el modelo 'Whisper' de forma predeterminada. Utilice un modelo personalizado, por ejemplo, ingresando el nombre del repositorio 'BELLE-2/Belle-whisper-large-v3-zh' en el menú desplegable para utilizar un modelo en chino preajustado. Encuentre modelos preajustados en Hugging Face.", + "ctype_label": "Tipo de Cálculo", + "ctype_info": "Elegir tipos más pequeños como int8 o float16 puede mejorar el rendimiento al reducir el uso de memoria y aumentar el rendimiento computacional, pero puede sacrificar precisión en comparación con tipos de datos más grandes como float32.", + "batchz_label": "Tamaño del Lote", + "batchz_info": "Reducir el tamaño del lote ahorra memoria si su GPU tiene menos VRAM y ayuda a gestionar problemas de falta de memoria.", + "tsscale_label": "Escala de Segmentación de Texto", + "tsscale_info": "Divide el texto en segmentos por oraciones, palabras o caracteres. La segmentación por palabras y caracteres ofrece una granularidad más fina, útil para subtítulos; desactivar la traducción conserva la estructura original.", "srt_file_label": "Subir un archivo de subtítulos SRT (Se utilizará en lugar de la transcripción de Whisper)", "divide_text_label": "Redividir segmentos de texto por:", "divide_text_info": "(Experimental) Ingresa un separador para dividir los segmentos de texto existentes en el idioma origen. La herramienta identificará las ocurrencias y creará nuevos segmentos en consecuencia. Especifica múltiples separadores usando |, por ejemplo: !|?|...|。", @@ -396,6 +428,7 @@ - FACEBOOK MMS → format `en-facebook-mms VITS` → La voix est plus naturelle ; pour le moment, il utilise uniquement le CPU. - PIPER TTS → format `en_US-lessac-high VITS-onnx` → Identique au précédent, mais optimisé pour le CPU et le GPU. - BARK → format `en_speaker_0-Male BARK` → Bonne qualité mais lent, et sujet aux hallucinations. + - OpenAI TTS → format `>alloy OpenAI-TTS` → Multilingue mais nécessite une OpenAI API key. - Coqui XTTS → format `_XTTS_/AUTOMATIC.wav` → Disponible uniquement pour le chinois (simplifié), l'anglais, le français, l'allemand, l'italien, le portugais, le polonais, le turc, le russe, le néerlandais, le tchèque, l'arabe, l'espagnol, le hongrois, le coréen et le japonais. --- @@ -478,6 +511,8 @@ "acc_max_info": "Accélération maximale pour les segments audio traduits afin d'éviter les chevauchements. 
Une valeur de 1,0 représente aucune accélération", "acc_rate_label": "Régulation du taux d'accélération", "acc_rate_info": "Régulation du taux d'accélération : Ajuste l'accélération pour prendre en compte les segments nécessitant moins de vitesse, en maintenant la continuité et en tenant compte du timing du prochain démarrage.", + "or_label": "Réduction des chevauchements", + "or_info": "Réduction des chevauchements : Garantit que les segments ne se chevauchent pas en ajustant les heures de début en fonction des heures de fin précédentes ; pourrait perturber la synchronisation.", "aud_mix_label": "Méthode de mixage audio", "aud_mix_info": "Mixer les fichiers audio original et traduit pour créer une sortie équilibrée et personnalisée avec deux modes de mixage disponibles.", "vol_ori": "Volume audio original", @@ -490,6 +525,19 @@ "burn_subs_label": "Incorporer les sous-titres", "burn_subs_info": "Incorporer les sous-titres : Intégrer les sous-titres dans la vidéo, les rendant ainsi une partie permanente du contenu visuel.", "whisper_title": "Config transcription.", + "lnum_label": "Literaliser les Nombres", + "lnum_info": "Literaliser les Nombres: Remplacer les représentations numériques par leurs équivalents écrits dans la transcription.", + "scle_label": "Nettoyage du Son", + "scle_info": "Nettoyage du Son: Amélioration des voix, suppression du bruit de fond avant la transcription pour une précision maximale des horodatages. Cette opération peut prendre du temps, notamment avec des fichiers audio volumineux.", + "sd_limit_label": "Limite de Durée du Segment", + "sd_limit_info": "Spécifiez la durée maximale (en secondes) pour chaque segment. L'audio sera traité en utilisant VAD, limitant la durée pour chaque fragment de segment.", + "asr_model_info": "Il convertit la langue parlée en texte en utilisant le modèle 'Whisper' par défaut. Utilisez un modèle personnalisé, par exemple, en saisissant le nom du référentiel 'BELLE-2/Belle-whisper-large-v3-zh' dans la liste déroulante pour utiliser un modèle chinois préajusté. Trouvez des modèles préajustés sur Hugging Face.", + "ctype_label": "Type de Calcul", + "ctype_info": "Choisir des types plus petits comme int8 ou float16 peut améliorer les performances en réduisant l'utilisation de la mémoire et en augmentant le débit computationnel, mais peut sacrifier la précision par rapport à des types de données plus grands comme float32.", + "batchz_label": "Taille du Lot", + "batchz_info": "Réduire la taille du lot permet d'économiser de la mémoire si votre GPU dispose de moins de VRAM et aide à gérer les problèmes de mémoire insuffisante.", + "tsscale_label": "Échelle de Segmentation de Texte", + "tsscale_info": "Divisez le texte en segments par phrases, mots ou caractères. La segmentation par mots et caractères offre une granularité plus fine, utile pour les sous-titres; désactiver la traduction conserve la structure d'origine.", "srt_file_label": "Télécharger un fichier de sous-titres SRT (sera utilisé à la place de la transcription de Whisper)", "divide_text_label": "Rediviser les segments de texte par :", "divide_text_info": "(Expérimental) Entrez un séparateur pour diviser les segments de texte existants dans la langue source. L'outil identifiera les occurrences et créera de nouveaux segments en conséquence. Spécifiez plusieurs séparateurs en utilisant |, par ex. : !|?|...|。", @@ -570,6 +618,7 @@ - FACEBOOK MMS → Format `en-facebook-mms VITS` → Die Stimme ist natürlicher; derzeit nur CPU. 
- PIPER TTS → Format `en_US-lessac-high VITS-onnx` → Wie das vorherige, aber optimiert für CPU und GPU. - BARK → Format `en_speaker_0-Male BARK` → Gute Qualität, aber langsam und anfällig für Halluzinationen. + - OpenAI TTS → Format `>alloy OpenAI-TTS` → Multisprachig, erfordert jedoch einen OpenAI API key - Coqui XTTS → Format `_XTTS_/AUTOMATIC.wav` → Nur verfügbar für Chinesisch (vereinfacht), Englisch, Französisch, Deutsch, Italienisch, Portugiesisch, Polnisch, Türkisch, Russisch, Niederländisch, Tschechisch, Arabisch, Spanisch, Ungarisch, Koreanisch und Japanisch. --- @@ -652,6 +701,8 @@ "acc_max_info": "Maximale Beschleunigung für übersetzte Audiosegmente, um Überlappungen zu vermeiden. Ein Wert von 1,0 repräsentiert keine Beschleunigung", "acc_rate_label": "Beschleunigungsrate-Regelung", "acc_rate_info": "Beschleunigungsrate-Regelung: Passt die Beschleunigung an, um Segmente mit weniger Geschwindigkeit anzupassen, um die Kontinuität zu erhalten und den Zeitpunkt des nächsten Starts zu berücksichtigen.", + "or_label": "Überlappungsreduzierung", + "or_info": "Überlappungsreduzierung: Stellt sicher, dass Segmente sich nicht überschneiden, indem Startzeiten auf Grundlage vorheriger Endzeiten angepasst werden; könnte die Synchronisierung stören.", "aud_mix_label": "Audio-Mixing-Methode", "aud_mix_info": "Mischen Sie Original- und übersetzte Audiodateien, um eine individuelle, ausgewogene Ausgabe mit zwei verfügbaren Mischmodi zu erstellen.", "vol_ori": "Lautstärke des Originaltons", @@ -664,6 +715,19 @@ "burn_subs_label": "Untertitel einbetten", "burn_subs_info": "Untertitel einbetten: Untertitel in das Video einbetten und somit zu einem festen Bestandteil des visuellen Inhalts machen.", "whisper_title": "Konfiguration Transkription.", + "lnum_label": "Zahlen Literalisieren", + "lnum_info": "Zahlen Literalisieren: Ersetzen numerischer Darstellungen durch ihre geschriebenen Äquivalente in der Transkription.", + "scle_label": "Tonbereinigung", + "scle_info": "Tonbereinigung: Verbesserung der Stimme, Entfernen von Hintergrundgeräuschen vor der Transkription für maximale Zeitstempelgenauigkeit. Diese Operation kann Zeit in Anspruch nehmen, insbesondere bei längeren Audiodateien.", + "sd_limit_label": "Segmentdauerbegrenzung", + "sd_limit_info": "Geben Sie die maximale Dauer (in Sekunden) für jeden Abschnitt an. Der Ton wird unter Verwendung von VAD verarbeitet, wobei die Dauer für jeden Segmentabschnitt begrenzt wird.", + "asr_model_info": "Es wandelt gesprochene Sprache standardmäßig mit dem 'Whisper'-Modell in Text um. Verwenden Sie ein benutzerdefiniertes Modell, indem Sie beispielsweise den Repository-Namen 'BELLE-2/Belle-whisper-large-v3-zh' im Dropdown-Menü eingeben, um ein chinesisches Sprachmodell zu verwenden. Finden Sie feinabgestimmte Modelle auf Hugging Face.", + "ctype_label": "Berechnungstyp", + "ctype_info": "Die Auswahl kleinerer Typen wie int8 oder float16 kann die Leistung verbessern, indem der Speicherverbrauch reduziert und die Rechenleistung erhöht wird, kann jedoch im Vergleich zu größeren Datentypen wie float32 an Präzision verlieren.", + "batchz_label": "Batch-Größe", + "batchz_info": "Die Reduzierung der Batch-Größe spart Speicherplatz, wenn Ihre GPU weniger VRAM hat, und hilft bei der Verwaltung von Out-of-Memory-Problemen.", + "tsscale_label": "Textsegmentierungsskala", + "tsscale_info": "Teilen Sie den Text in Segmente nach Sätzen, Wörtern oder Zeichen auf. Die Segmentierung nach Wörtern und Zeichen bietet eine feinere Granularität, die für Untertitel nützlich ist. 
Das Deaktivieren der Übersetzung erhält die Originalstruktur.", "srt_file_label": "Laden Sie eine SRT-Untertiteldatei hoch (wird anstelle der Transkription von Whisper verwendet)", "divide_text_label": "Textsegmente neu aufteilen nach:", "divide_text_info": "(Experimentell) Geben Sie einen Separator ein, um vorhandene Textsegmente in der Ausgangssprache aufzuteilen. Das Tool erkennt Vorkommen und erstellt entsprechend neue Segmente. Geben Sie mehrere Trennzeichen mit | an, z. B.: !|?|...|。", @@ -744,6 +808,7 @@ - FACEBOOK MMS → formato `en-facebook-mms VITS` → La voce è più naturale; al momento utilizza solo la CPU. - PIPER TTS → formato `en_US-lessac-high VITS-onnx` → Come il precedente, ma ottimizzato sia per CPU che GPU. - BARK → formato `en_speaker_0-Male BARK` → Buona qualità ma lenta e soggetta ad allucinazioni. + - OpenAI TTS → formato `>alloy OpenAI-TTS` → Multilingue ma richiede una OpenAI API key. - Coqui XTTS → formato `_XTTS_/AUTOMATIC.wav` → Disponibile solo per cinese (semplificato), inglese, francese, tedesco, italiano, portoghese, polacco, turco, russo, olandese, ceco, arabo, spagnolo, ungherese, coreano e giapponese. --- @@ -826,6 +891,8 @@ "acc_max_info": "Massima accelerazione per i segmenti audio tradotti per evitare sovrapposizioni. Un valore di 1,0 rappresenta nessuna accelerazione", "acc_rate_label": "Regolazione del tasso di accelerazione", "acc_rate_info": "Regolazione del tasso di accelerazione: Regola l'accelerazione per adattarsi ai segmenti che richiedono una velocità inferiore, mantenendo la continuità e considerando il timing di avvio successivo.", + "or_label": "Riduzione Sovrapposizione", + "or_info": "Riduzione Sovrapposizione: Assicura che i segmenti non si sovrappongano regolando gli orari di inizio in base agli orari di fine precedenti; potrebbe interrompere la sincronizzazione.", "aud_mix_label": "Metodo di mixing audio", "aud_mix_info": "Mixa file audio originali e tradotti per creare un output personalizzato e bilanciato con due modalità di mixing disponibili.", "vol_ori": "Volume audio originale", @@ -838,6 +905,19 @@ "burn_subs_label": "Incorpora sottotitoli", "burn_subs_info": "Incorpora sottotitoli: Incorpora i sottotitoli nel video, rendendoli una parte permanente del contenuto visivo.", "whisper_title": "Configura la trascrizione.", + "lnum_label": "Literalizzare Numeri", + "lnum_info": "Literalizzare Numeri: Sostituisci le rappresentazioni numeriche con i loro equivalenti scritti nella trascrizione.", + "scle_label": "Pulizia del Suono", + "scle_info": "Pulizia del Suono: Migliora le voci, rimuovi il rumore di fondo prima della trascrizione per una massima precisione dei timestamp. Questa operazione può richiedere del tempo, specialmente con file audio lunghi.", + "sd_limit_label": "Limite Durata Segmento", + "sd_limit_info": "Specifica la durata massima (in secondi) per ogni segmento. L'audio verrà elaborato utilizzando VAD, limitando la durata per ciascun frammento di segmento.", + "asr_model_info": "Converte il linguaggio parlato in testo utilizzando il modello 'Whisper' per impostazione predefinita. Utilizza un modello personalizzato, ad esempio, inserendo il nome del repository 'BELLE-2/Belle-whisper-large-v3-zh' nel menu a discesa per utilizzare un modello pre-ottimizzato in cinese. 
Trova modelli pre-ottimizzati su Hugging Face.", + "ctype_label": "Tipo di Calcolo", + "ctype_info": "Scegliere tipi più piccoli come int8 o float16 può migliorare le prestazioni riducendo l'utilizzo della memoria e aumentando il throughput computazionale, ma può sacrificare la precisione rispetto a tipi di dati più grandi come float32.", + "batchz_label": "Dimensione Batch", + "batchz_info": "Ridurre la dimensione del batch consente di risparmiare memoria se la tua GPU ha meno VRAM e aiuta a gestire i problemi di memoria esaurita.", + "tsscale_label": "Scala di Segmentazione del Testo", + "tsscale_info": "Dividi il testo in segmenti per frasi, parole o caratteri. La segmentazione per parole e caratteri offre una granularità più fine, utile per i sottotitoli; disabilitare la traduzione conserva la struttura originale.", "srt_file_label": "Carica un file sottotitoli SRT (verrà utilizzato al posto della trascrizione di Whisper)", "divide_text_label": "Ridividi i segmenti di testo per:", "divide_text_info": "(Sperimentale) Inserisci un separatore per dividere i segmenti di testo esistenti nella lingua di origine. Lo strumento identificherà le occorrenze e creerà nuovi segmenti di conseguenza. Specifica più separatori usando |, ad esempio: !|?|...|。", @@ -918,6 +998,7 @@ - FACEBOOK MMS → 形式 `en-facebook-mms VITS` → 音声がより自然です。現時点ではCPUのみを使用します。 - PIPER TTS → 形式 `en_US-lessac-high VITS-onnx` → 前述のものと同じですが、CPUとGPUの両方に最適化されています。 - BARK → 形式 `en_speaker_0-Male BARK` → 品質は良好ですが、遅く、幻覚に陥りやすいです。 + - OpenAI TTS → フォーマット `>alloy OpenAI-TTS` → 多言語対応ですが、OpenAIのAPIキーが必要です - Coqui XTTS → 形式 `_XTTS_/AUTOMATIC.wav` → 中国語(簡体字)、英語、フランス語、ドイツ語、イタリア語、ポルトガル語、ポーランド語、トルコ語、ロシア語、オランダ語、チェコ語、アラビア語、スペイン語、ハンガリー語、韓国語、日本語のみ利用可能です。 --- @@ -1000,6 +1081,8 @@ "acc_max_info": "オーバーラップを回避するための翻訳されたオーディオセグメントの最大加速度。値が1.0の場合、加速度はありません", "acc_rate_label": "加速度調整", "acc_rate_info": "加速度調整:速度が低いセグメントに適応するために加速度を調整し、連続性を保ち、次の開始時刻を考慮します。", + "or_label": "重複削減", + "or_info": "重複削減:前の終了時間に基づいて開始時間を調整してセグメントが重複しないようにします。同期を妨げる可能性があります。", "aud_mix_label": "オーディオミキシング方法", "aud_mix_info": "オリジナルと翻訳されたオーディオファイルを混合してカスタマイズされたバランスの取れた出力を作成するための2つの利用可能なミキシングモード。", "vol_ori": "元のオーディオの音量", @@ -1012,6 +1095,19 @@ "burn_subs_label": "字幕を焼く", "burn_subs_info": "字幕を焼く:字幕をビデオに埋め込み、それを視覚コンテンツの恒久的な一部にします。", "whisper_title": "トランスクリプションの構成。", + "lnum_label": "数値の表現化", + "lnum_info": "数値の表現化:トランスクリプト内の数値表現を書き換えて、数値を文字列に変換します。", + "scle_label": "音声のクリーンアップ", + "scle_info": "音声のクリーンアップ:トランスクリプトの時間スタンプの精度を最大限に高めるために、ボーカルを強調し、背景ノイズを除去します。この操作には時間がかかる場合があります。特に長時間のオーディオファイルの場合。", + "sd_limit_label": "セグメントの長さ制限", + "sd_limit_info": "各セグメントの最大長(秒単位)を指定します。オーディオはVADを使用して処理され、各セグメントチャンクの長さが制限されます。", + "asr_model_info": "デフォルトでは、「Whisperモデル」を使用して、音声をテキストに変換します。カスタムモデルを使用するには、ドロップダウンでリポジトリ名「BELLE-2/Belle-whisper-large-v3-zh」を入力して、中国語の言語を微調整したモデルを利用します。 Hugging Faceで微調整されたモデルを見つけます。", + "ctype_label": "計算タイプ", + "ctype_info": "int8やfloat16などの小さなタイプを選択すると、メモリ使用量が減少し、計算スループットが増加してパフォーマンスが向上しますが、float32などの大きなデータタイプと比較して精度が低下する場合があります。", + "batchz_label": "バッチサイズ", + "batchz_info": "バッチサイズを減らすと、GPUのVRAMが少ない場合にメモリを節約し、メモリ不足の問題を管理するのに役立ちます。", + "tsscale_label": "テキストのセグメンテーションスケール", + "tsscale_info": "テキストを文、単語、または文字でセグメントに分割します。単語と文字のセグメンテーションは、字幕などの細かい粒度の処理に役立ちます。翻訳を無効にすると、元の構造が保持されます。", "srt_file_label": "SRT字幕ファイルをアップロードしてください(Whisperのトランスクリプションの代わりに使用されます)", "divide_text_label": "次のようにテキストセグメントを再分割します:", "divide_text_info": "(実験的) ソース言語の既存のテキストセグメントを分割するセパレーターを入力します。ツールは出現を識別し、適切な箇所で新しいセグメントを作成します。複数のセパレーターを | を使用して指定します。例: !|?|...|。", @@ -1092,6 +1188,7 @@ - FACEBOOK 
MMS → 格式 `en-facebook-mms VITS` → 声音更自然;目前仅使用CPU。 - PIPER TTS → 格式 `en_US-lessac-high VITS-onnx` → 与前一款相同,但针对CPU和GPU进行了优化。 - BARK → 格式 `en_speaker_0-Male BARK` → 质量良好但速度较慢,易产生幻觉。 + - OpenAI TTS → 格式 `>alloy OpenAI-TTS` → 多语言但需要 OpenAI API key - Coqui XTTS → 格式 `_XTTS_/AUTOMATIC.wav` → 仅支持简体中文、英文、法文、德文、意大利文、葡萄牙文、波兰文、土耳其文、俄文、荷兰文、捷克文、阿拉伯文、西班牙文、匈牙利文、韩文和日文。 --- @@ -1174,6 +1271,8 @@ "acc_max_info": "翻译音频段的最大加速度,以避免重叠。值为1.0表示无加速度", "acc_rate_label": "加速度调节", "acc_rate_info": "加速度调节:调整加速度以适应需要较低速度的片段,保持连续性并考虑下一个开始的时机。", + "or_label": "重叠减少", + "or_info": "重叠减少:通过根据先前的结束时间调整开始时间来确保片段不重叠;可能会干扰同步。", "aud_mix_label": "音频混合方法", "aud_mix_info": "混合原始和翻译音频文件,以创建平衡的定制输出,提供两种可用的混合模式。", "vol_ori": "原始音频音量", @@ -1186,6 +1285,19 @@ "burn_subs_label": "烧录字幕", "burn_subs_info": "烧录字幕:将字幕嵌入视频中,使其成为视觉内容的永久部分。", "whisper_title": "配置转录。", + "lnum_label": "数字文字化", + "lnum_info": "数字文字化:将数字表示替换为其在转录中的书面等价物。", + "scle_label": "声音清理", + "scle_info": "声音清理:增强语音,消除转录之前的背景噪音,以实现最大的时间戳精度。此操作可能需要一些时间,特别是对于较长的音频文件。", + "sd_limit_label": "段落时长限制", + "sd_limit_info": "指定每个段落的最大持续时间(以秒为单位)。将使用VAD处理音频,以限制每个段落块的持续时间。", + "asr_model_info": "默认情况下,它使用“Whisper模型”将口语转换为文本。使用自定义模型,例如,在下拉菜单中输入存储库名称“BELLE-2/Belle-whisper-large-v3-zh”以使用经过中文语言微调的模型。在Hugging Face上找到微调模型。", + "ctype_label": "计算类型", + "ctype_info": "选择较小的类型,如int8或float16,可以通过减少内存使用量和增加计算吞吐量来提高性能,但可能会牺牲与float32等较大数据类型相比的精度。", + "batchz_label": "批处理大小", + "batchz_info": "如果您的GPU的VRAM较少,则减小批处理大小可以节省内存,并有助于管理内存不足问题。", + "tsscale_label": "文本分段比例", + "tsscale_info": "按句子、单词或字符将文本分成段。按单词和字符进行分段可提供更精细的粒度,适用于字幕等用途;禁用翻译将保留原始结构。", "srt_file_label": "上传SRT字幕文件(将用于替代Whisper的转录)", "divide_text_label": "通过以下方式重新划分文本段:", "divide_text_info": "(实验性)输入用于拆分源语言中现有文本段的分隔符。该工具将识别出现并相应地创建新段。使用|指定多个分隔符,例如:!|?|...|。", @@ -1266,6 +1378,7 @@ - FACEBOOK MMS → формат `en-facebook-mms VITS` → Голос більш натуральний; наразі використовується лише ЦП. - PIPER TTS → формат `en_US-lessac-high VITS-onnx` → Те ж саме, що й попередній, але оптимізований як для ЦП, так і для ГПУ. - BARK → формат `en_speaker_0-Male BARK` → Хороша якість, але повільна, і вона схильна до галюцинацій. + - OpenAI TTS → формат `>alloy OpenAI-TTS` → Мультиязычный, але потребує OpenAI API key - Coqui XTTS → формат `_XTTS_/AUTOMATIC.wav` → Доступний лише для китайської (спрощеної), англійської, французької, німецької, італійської, португальської, польської, турецької, російської, голландської, чеської, арабської, іспанської, угорської, корейської та японської. --- @@ -1348,6 +1461,8 @@ "acc_max_info": "Максимальне прискорення для перекладених аудіосегментів для уникнення перекриття. 
Значення 1,0 означає відсутність прискорення", "acc_rate_label": "Регулювання швидкості прискорення", "acc_rate_info": "Регулювання швидкості прискорення: Налаштовує прискорення, щоб пристосуватися до сегментів, які потребують меншої швидкості, зберігаючи послідовність та враховуючи час наступного запуску.", + "or_label": "Зменшення перекриття", + "or_info": "Зменшення перекриття: Забезпечує відсутність перекриття сегментів за допомогою налаштування часу початку на основі попередніх часів завершення; може порушити синхронізацію.", "aud_mix_label": "Метод мікшування аудіо", "aud_mix_info": "Змішуйте оригінальні та перекладені аудіофайли, щоб створити налаштований, збалансований вихід з двома доступними режимами мікшування.", "vol_ori": "Гучність оригінального аудіо", @@ -1360,6 +1475,19 @@ "burn_subs_label": "Підпалити субтитри", "burn_subs_info": "Підпалити субтитри: Вбудувати субтитри у відео, зробивши їх постійною частиною візуального змісту.", "whisper_title": "Налаштування транскрипції.", + "lnum_label": "Літералізація Чисел", + "lnum_info": "Літералізація Чисел: Заміна числових представлень на їх письмові еквіваленти в транскрипції.", + "scle_label": "Очищення Звуку", + "scle_info": "Очищення Звуку: Покращення голосів, видалення фонового шуму перед транскрипцією для максимальної точності відміток часу. Ця операція може зайняти час, особливо з довгими аудіофайлами.", + "sd_limit_label": "Обмеження тривалості сегменту", + "sd_limit_info": "Вкажіть максимальну тривалість (у секундах) для кожного сегменту. Аудіо буде оброблено за допомогою VAD, обмежуючи тривалість для кожного фрагменту сегменту.", + "asr_model_info": "Він перетворює усну мову на текст за допомогою моделі 'Whisper' за замовчуванням. Використовуйте власну модель, наприклад, введіть ім'я репозиторію 'BELLE-2/Belle-whisper-large-v3-zh' у розкривному списку, щоб використовувати китайську мову з налаштованою моделлю. Знайдіть налаштовані моделі на Hugging Face.", + "ctype_label": "Тип обчислення", + "ctype_info": "Вибір менших типів, таких як int8 або float16, може покращити продуктивність, зменшивши використання пам'яті та збільшивши обчислювальну пропускну здатність, але може пожертвувати точністю порівняно з більшими типами даних, такими як float32.", + "batchz_label": "Розмір пакету", + "batchz_info": "Зменшення розміру пакета заощаджує пам'ять, якщо у вашої GPU менше VRAM, і допомагає керувати проблемами нестачі пам'яті.", + "tsscale_label": "Масштаб сегментації тексту", + "tsscale_info": "Розділіть текст на сегменти за допомогою речень, слів або символів. Сегментація за словами та символами надає більшу деталізацію, корисну для субтитрів; вимкнення перекладу зберігає вихідну структуру.", "srt_file_label": "Завантажте файл субтитрів SRT (використовуватиметься замість транскрипції Whisper)", "divide_text_label": "Розділити текстові сегменти за допомогою:", "divide_text_info": "(Експериментально) Введіть роздільник для розділення існуючих текстових сегментів на мові джерела. Інструмент ідентифікує випадки та створює нові сегменти відповідно. Вказуйте кілька роздільників, використовуючи |, наприклад: !|?|...|。", @@ -1440,6 +1568,7 @@ - FACEBOOK MMS → الصيغة `en-facebook-mms VITS` → الصوت أكثر طبيعية؛ في الوقت الحالي، يستخدم فقط وحدة المعالجة المركزية. - PIPER TTS → الصيغة `en_US-lessac-high VITS-onnx` → نفس الشيء كما السابق، ولكنه محسّن لكل من وحدة المعالجة المركزية ووحدة معالجة الرسومات. - BARK → الصيغة `en_speaker_0-Male BARK` → جودة جيدة ولكن بطيء، ويميل إلى التهليل. 
+ - OpenAI TTS → الصيغة `>alloy OpenAI-TTS` → متعدد اللغات ولكن يتطلب OpenAI API key - Coqui XTTS → الصيغة `_XTTS_/AUTOMATIC.wav` → متاحة فقط للصينية (المبسطة)، الإنجليزية، الفرنسية، الألمانية، الإيطالية، البرتغالية، البولندية، التركية، الروسية، الهولندية، التشيكية، العربية، الإسبانية، الهنغارية، الكورية واليابانية. --- @@ -1522,6 +1651,8 @@ "acc_max_info": "التسارع الأقصى لقطع الصوت المترجم لتجنب التداخل. قيمة 1.0 تمثل عدم وجود تسارع", "acc_rate_label": "تنظيم معدل التسارع", "acc_rate_info": "تنظيم معدل التسارع: يعدل التسارع لتوفير مقاطع تتطلب سرعة أقل، مع الحفاظ على الاستمرارية واعتبار توقيت البدء التالي.", + "or_label": "تقليل التداخل", + "or_info": "تقليل التداخل: يضمن عدم تداخل الشرائح عن طريق ضبط أوقات البدء استنادًا إلى الأوقات السابقة للنهاية ؛ قد يؤدي إلى إختلال التزامن.", "aud_mix_label": "طريقة مزج الصوت", "aud_mix_info": "مزج ملفات الصوت الأصلية والمترجمة لإنشاء إخراج مخصص ومتوازن بوجود طريقتي مزج متاحتين.", "vol_ori": "مستوى صوت الصوت الأصلي", @@ -1534,6 +1665,19 @@ "burn_subs_label": "حرق الترجمة الفرعية", "burn_subs_info": "حرق الترجمة الفرعية: تضمين الترجمة الفرعية في الفيديو، مما يجعلها جزءًا دائمًا من المحتوى البصري.", "whisper_title": "تكوين النص السريع.", + "lnum_label": "تحويل الأرقام إلى كلمات", + "lnum_info": "تحويل الأرقام إلى كلمات: استبدال التمثيلات الرقمية بمكافآتها المكتوبة في النص المكتوب.", + "scle_label": "تنظيف الصوت", + "scle_info": "تنظيف الصوت: تعزيز الأصوات، إزالة الضجيج الخلفي قبل التفريغ للحصول على أقصى دقة في الطابع الزمني. قد تستغرق هذه العملية وقتًا، خاصة مع ملفات الصوت الطويلة.", + "sd_limit_label": "حد مدة القطعة", + "sd_limit_info": "حدد المدة القصوى (بالثواني) لكل قطعة. سيتم معالجة الصوت باستخدام VAD، محددة مدة كل قطعة.", + "asr_model_info": "يحول اللغة الحية إلى نص باستخدام نموذج 'الهمس' افتراضيًا. استخدم نموذجًا مخصصًا، على سبيل المثال، عن طريق إدخال اسم المستودع 'BELLE-2/Belle-whisper-large-v3-zh' في القائمة المنسدلة لاستخدام نموذج معدل باللغة الصينية. العثور على النماذج المعدلة على Hugging Face.", + "ctype_label": "نوع الحساب", + "ctype_info": "اختيار أنواع أصغر مثل int8 أو float16 يمكن أن يحسن الأداء من خلال تقليل استخدام الذاكرة وزيادة الإخراج الحسابي، ولكن قد يضحي بالدقة مقارنة بأنواع البيانات الأكبر مثل float32.", + "batchz_label": "حجم الدفعة", + "batchz_info": "توفير الذاكرة عن طريق تقليل حجم الدفعة إذا كان لديك بطاقة رسومات GPU تحتوي على VRAM أقل وتساعد في إدارة مشكلات الذاكرة النفاد.", + "tsscale_label": "مقياس تقسيم النص", + "tsscale_info": "تقسيم النص إلى قطع حسب الجمل أو الكلمات أو الأحرف. يوفر تقسيم الكلمات والأحرف دقة أكبر، وهو مفيد للترجمات الفورية؛ يحافظ تعطيل الترجمة على الهيكل الأصلي.", "srt_file_label": "قم بتحميل ملف عنوان فرعي SRT (سيُستخدم بدلاً من النص السريع)", "divide_text_label": "إعادة تقسيم شرائح النص بواسطة:", "divide_text_info": "(تجريبي) أدخل فاصل لتقسيم شرائح النص الحالية في اللغة المصدر. ستحدد الأداة حدوث الحالات وإنشاء شرائح جديدة وفقًا لذلك. حدد علامات فاصلة متعددة باستخدام |، على سبيل المثال: !|؟|...|。", @@ -1614,6 +1758,7 @@ - FACEBOOK MMS → формат `en-facebook-mms VITS` → Голос более естественный; на данный момент используется только процессор. - PIPER TTS → формат `en_US-lessac-high VITS-onnx` → То же самое, что и предыдущее, но оптимизировано как для CPU, так и для GPU. - BARK → формат `en_speaker_0-Male BARK` → Хорошее качество, но медленное, и оно подвержено галлюцинациям. 
+ - OpenAI TTS → формат `>alloy OpenAI-TTS` → Многоязычный, но требуется OpenAI API key - Coqui XTTS → формат `_XTTS_/AUTOMATIC.wav` → Доступен только для китайского (упрощенного), английского, французского, немецкого, итальянского, португальского, польского, турецкого, русского, голландского, чешского, арабского, испанского, венгерского, корейского и японского языков. --- @@ -1696,6 +1841,8 @@ "acc_max_info": "Максимальное ускорение для переведенных аудиосегментов для избежания их перекрытия. Значение 1.0 означает отсутствие ускорения", "acc_rate_label": "Регулирование уровня ускорения", "acc_rate_info": "Регулирование уровня ускорения: Регулирует ускорение для адаптации к сегментам, требующим меньшей скорости, сохраняя непрерывность и учитывая временные параметры следующего запуска.", + "or_label": "Сокращение перекрытий", + "or_info": "Сокращение перекрытий: Обеспечивает отсутствие перекрытия сегментов путем корректировки времени начала на основе предыдущих времен завершения; может нарушить синхронизацию.", "aud_mix_label": "Метод смешивания аудио", "aud_mix_info": "Смешивание оригинальных и переведенных аудиофайлов для создания настраиваемого, сбалансированного вывода с двумя доступными режимами смешивания.", "vol_ori": "Громкость оригинального аудио", @@ -1708,6 +1855,19 @@ "burn_subs_label": "Вжечь субтитры", "burn_subs_info": "Вжечь субтитры: Внедрить субтитры в видео, сделав их постоянной частью визуального контента.", "whisper_title": "Конфигурация транскрипции.", + "lnum_label": "Литерализация Чисел", + "lnum_info": "Литерализация Чисел: Замена числовых представлений их письменными эквивалентами в транскрипции.", + "scle_label": "Очистка Звука", + "scle_info": "Очистка Звука: Улучшение голосов, удаление фонового шума перед транскрипцией для максимальной точности временных меток. Эта операция может занять время, особенно с длинными аудиофайлами.", + "sd_limit_label": "Ограничение Длительности Сегмента", + "sd_limit_info": "Укажите максимальную длительность (в секундах) для каждого сегмента. Аудио будет обработано с использованием VAD, ограничивая длительность для каждого фрагмента сегмента.", + "asr_model_info": "Он преобразует устную речь в текст с использованием модели 'Whisper' по умолчанию. Используйте пользовательскую модель, например, введите имя репозитория 'BELLE-2/Belle-whisper-large-v3-zh' в выпадающем списке, чтобы использовать китайскую модель. Найдите настроенные модели на Hugging Face.", + "ctype_label": "Тип вычисления", + "ctype_info": "Выбор меньших типов, таких как int8 или float16, может улучшить производительность за счет уменьшения использования памяти и увеличения вычислительного потока, но может пожертвовать точностью по сравнению с более крупными типами данных, такими как float32.", + "batchz_label": "Размер Пакета", + "batchz_info": "Уменьшение размера пакета экономит память, если у вашей GPU меньше VRAM, и помогает управлять проблемами с памятью.", + "tsscale_label": "Масштабирование сегментации текста", + "tsscale_info": "Разделите текст на сегменты по предложениям, словам или символам. Сегментация по словам и символам обеспечивает более точную гранулярность, полезную для субтитров; отключение перевода сохраняет исходную структуру.", "srt_file_label": "Загрузить файл субтитров в формате SRT (будет использоваться вместо транскрипции Whisper)", "divide_text_label": "Разделить текстовые сегменты по:", "divide_text_info": "(Экспериментально) Введите разделитель для разделения существующих текстовых сегментов на исходном языке. 
Инструмент определит вхождения и создаст новые сегменты в соответствии с ними. Укажите несколько разделителей, используя |, например: !|?|...|。", @@ -1788,6 +1948,7 @@ - FACEBOOK MMS → biçim `tr-facebook-mms VITS` → Ses daha doğal; şu anda yalnızca CPU kullanıyor. - PIPER TTS → biçim `tr_TR-lessac-high VITS-onnx` → Öncekiyle aynı, ancak hem CPU hem de GPU için optimize edilmiştir. - BARK → biçim `tr_speaker_0-Kadın BARK` → İyi kalite ancak yavaş ve halüsinasyonlara eğilimli. + - OpenAI TTS → biçim `>alloy OpenAI-TTS` → Çok dilli ancak bir OpenAI API key gerektirir - Coqui XTTS → biçim `_XTTS_/AUTOMATIC.wav` → Sadece Çince (Basitleştirilmiş), İngilizce, Fransızca, Almanca, İtalyanca, Portekizce, Lehçe, Türkçe, Rusça, Hollandaca, Çekçe, Arapça, İspanyolca, Macarca, Korece ve Japonca için mevcut. --- @@ -1871,6 +2032,8 @@ "acc_max_info": "Çakışmayı önlemek için çevrilen ses segmentlerinin maksimum hızlandırması. 1.0 değeri hiçbir hızlandırmayı temsil eder", "acc_rate_label": "Hızlanma Oranı Düzenlemesi", "acc_rate_info": "Hızlanma Oranı Düzenlemesi: Daha az hız gerektiren segmentlere uyum sağlamak için hızlanmayı ayarlar, sürekliliği korur ve sonraki başlangıç zamanını dikkate alır.", + "or_label": "Örtüşme Azaltma", + "or_info": "Örtüşme Azaltma: Önceki bitiş zamanlarına dayanarak başlangıç zamanlarını ayarlayarak segmentlerin örtüşmesini engeller; senkronizasyonu bozabilir.", "aud_mix_label": "Ses Karıştırma Yöntemi", "aud_mix_info": "Özgün ve çevrilmiş ses dosyalarını karıştırarak iki kullanılabilir karıştırma moduyla özelleştirilmiş, dengeli bir çıkış oluşturun.", "vol_ori": "Özgün ses seviyesi", @@ -1883,6 +2046,19 @@ "burn_subs_label": "Altyazıyı Yak", "burn_subs_info": "Altyazıyı Yak: Altyazıları videoya gömerek, bunları görsel içeriğin kalıcı bir parçası haline getirir.", "whisper_title": "Transkripsiyonu yapılandır.", + "lnum_label": "Sayıları Metinleştir", + "lnum_info": "Sayıları Metinleştir: Transkript içindeki sayısal temsilleri yazılı eşdeğerleriyle değiştirin.", + "scle_label": "Ses Temizliği", + "scle_info": "Ses Temizliği: Zaman damgası hassasiyeti için transkripsiyondan önce sesleri iyileştirin, arka plan gürültüsünü kaldırın. Bu işlem özellikle uzun ses dosyalarıyla zaman alabilir.", + "sd_limit_label": "Bölüm Süresi Sınırı", + "sd_limit_info": "Her bölüm için maksimum süreyi (saniye cinsinden) belirtin. Ses, her bölüm parçası için süreyi sınırlayarak VAD kullanılarak işlenecektir.", + "asr_model_info": "Varsayılan olarak 'Fısıldama modeli'ni kullanarak konuşma dilini metne dönüştürür. Özel bir model kullanın, örneğin, özel bir model kullanmak için açılan menüye 'BELLE-2/Belle-whisper-large-v3-zh' depo adını girin. Hugging Face'de ince ayarlı modeller bulun.", + "ctype_label": "Hesaplama Türü", + "ctype_info": "int8 veya float16 gibi daha küçük tipleri seçmek, bellek kullanımını azaltarak ve hesaplama verimliliğini artırarak performansı artırabilir, ancak float32 gibi daha büyük veri tiplerine göre hassasiyetten ödün verebilir.", + "batchz_label": "Toplu İş Boyutu", + "batchz_info": "GPU'nuzun daha az VRAM'a sahip olması durumunda toplu iş boyutunu azaltmak bellek tasarrufu sağlar ve Bellek Dışı Sorunları yönetmeye yardımcı olur.", + "tsscale_label": "Metin Bölme Ölçeği", + "tsscale_info": "Metni cümleler, kelimeler veya karakterler olarak bölümlere ayırın. 
Kelime ve karakter bölme, altyazılar için faydalı olan daha ince granülerlik sağlar; çeviriyi devre dışı bırakma, orijinal yapının korunmasını sağlar.", "srt_file_label": "Bir SRT altyazı dosyası yükleyin (Whisper'ın transkripsiyonu yerine kullanılacaktır)", "divide_text_label": "Metin bölümlerini yeniden böl:", "divide_text_info": "(Deneysel) Mevcut metin segmentlerini kaynak dildeki ayraçla bölmek için bir ayraç girin. Aracı, bu ayraçları tanımlayacak ve buna göre yeni segmentler oluşturacaktır. Birden çok ayıraç belirtmek için | kullanın, örn .: !|?|...|。", @@ -1963,6 +2139,7 @@ - FACEBOOK MMS → format `en-facebook-mms VITS` → Suara lebih alami; saat ini, hanya menggunakan CPU. - PIPER TTS → format `en_US-lessac-high VITS-onnx` → Sama seperti sebelumnya, tetapi dioptimalkan untuk CPU dan GPU. - BARK → format `en_speaker_0-Male BARK` → Kualitas bagus tetapi lambat, dan rentan terhadap halusinasi. + - OpenAI TTS → format `>alloy OpenAI-TTS` → Multibahasa tetapi membutuhkan OpenAI API key - Coqui XTTS → format `_XTTS_/AUTOMATIC.wav` → Hanya tersedia untuk Cina (Sederhana), Inggris, Prancis, Jerman, Italia, Portugis, Polandia, Turki, Rusia, Belanda, Ceko, Arab, Spanyol, Hungaria, Korea, dan Jepang. --- @@ -2045,6 +2222,8 @@ "acc_max_info": "Akselerasi maksimum untuk segmen audio yang diterjemahkan untuk menghindari tumpang tindih. Nilai 1.0 mewakili tidak ada akselerasi", "acc_rate_label": "Regulasi Tingkat Akselerasi", "acc_rate_info": "Regulasi Tingkat Akselerasi: Menyesuaikan akselerasi untuk mengakomodasi segmen yang membutuhkan kecepatan lebih rendah, menjaga kontinuitas, dan mempertimbangkan waktu mulai berikutnya.", + "or_label": "Pengurangan Tumpang Tindih", + "or_info": "Pengurangan Tumpang Tindih: Memastikan segmen tidak tumpang tindih dengan menyesuaikan waktu mulai berdasarkan waktu selesai sebelumnya; bisa mengganggu sinkronisasi.", "aud_mix_label": "Metode Penggabungan Audio", "aud_mix_info": "Gabungkan file audio asli dan diterjemahkan untuk membuat output yang seimbang dengan dua mode pencampuran yang tersedia.", "vol_ori": "Volume audio asli", @@ -2057,6 +2236,19 @@ "burn_subs_label": "Bakar Subtitle", "burn_subs_info": "Bakar Subtitle: Menyematkan subtitle ke dalam video, menjadikannya bagian permanen dari konten visual.", "whisper_title": "Konfigurasi transkripsi.", + "lnum_label": "Literalisasi Angka", + "lnum_info": "Literalisasi Angka: Gantikan representasi numerik dengan ekivalen tertulisnya dalam transkrip.", + "scle_label": "Pembersihan Suara", + "scle_info": "Pembersihan Suara: Tingkatkan vokal, hapus kebisingan latar belakang sebelum transkripsi untuk presisi timestamp maksimum. Operasi ini bisa memakan waktu, terutama dengan file audio yang panjang.", + "sd_limit_label": "Batas Durasi Segment", + "sd_limit_info": "Tentukan durasi maksimum (dalam detik) untuk setiap segmen. Audio akan diproses menggunakan VAD, membatasi durasi untuk setiap potongan segmen.", + "asr_model_info": "Ini mengubah bahasa yang diucapkan menjadi teks menggunakan model 'Whisper' secara default. Gunakan model kustom, misalnya, dengan memasukkan nama repositori 'BELLE-2/Belle-whisper-large-v3-zh' dalam dropdown untuk menggunakan model yang disesuaikan bahasa Cina. 
Temukan model yang disesuaikan di Hugging Face.", + "ctype_label": "Jenis Perhitungan", + "ctype_info": "Memilih tipe yang lebih kecil seperti int8 atau float16 dapat meningkatkan kinerja dengan mengurangi penggunaan memori dan meningkatkan throughput komputasi, tetapi dapat mengorbankan presisi dibandingkan dengan tipe data yang lebih besar seperti float32.", + "batchz_label": "Ukuran Batch", + "batchz_info": "Mengurangi ukuran batch menghemat memori jika GPU Anda memiliki VRAM yang lebih sedikit dan membantu mengelola masalah Out of Memory.", + "tsscale_label": "Skala Segmentasi Teks", + "tsscale_info": "Bagi teks menjadi segmen berdasarkan kalimat, kata, atau karakter. Segmentasi kata dan karakter menawarkan granularitas yang lebih halus, berguna untuk subjudul; menonaktifkan terjemahan mempertahankan struktur asli.", "srt_file_label": "Unggah file subtitle SRT (akan digunakan sebagai gantinya dari transkripsi Whisper)", "divide_text_label": "Bagi ulang segmen teks dengan:", "divide_text_info": "(Eksperimental) Masukkan pemisah untuk membagi segmen teks yang ada dalam bahasa sumber. Alat ini akan mengidentifikasi kejadian dan membuat segmen baru sesuai. Tentukan beberapa pemisah menggunakan |, misalnya: !|?|...|。", @@ -2137,6 +2329,7 @@ - FACEBOOK MMS → formato `en-facebook-mms VITS` → A voz é mais natural; no momento, usa apenas CPU. - PIPER TTS → formato `en_US-lessac-high VITS-onnx` → O mesmo que o anterior, mas é otimizado para CPU e GPU. - BARK → formato `en_speaker_0-Male BARK` → Boa qualidade, mas lento e propenso a alucinações. + - OpenAI TTS → formato `>alloy OpenAI-TTS` → Multilíngue mas requer uma OpenAI API key - Coqui XTTS → formato `_XTTS_/AUTOMATIC.wav` → Disponível apenas para Chinês (Simplificado), Inglês, Francês, Alemão, Italiano, Português, Polonês, Turco, Russo, Holandês, Tcheco, Árabe, Espanhol, Húngaro, Coreano e Japonês. --- @@ -2219,6 +2412,8 @@ "acc_max_info": "Aceleração máxima para segmentos de áudio traduzidos para evitar sobreposições. Um valor de 1.0 representa nenhuma aceleração", "acc_rate_label": "Regulação da Taxa de Aceleração", "acc_rate_info": "Regulação da Taxa de Aceleração: Ajusta a aceleração para acomodar segmentos que exigem menos velocidade, mantendo a continuidade e considerando o tempo de próximo início.", + "or_label": "Redução de sobreposição", + "or_info": "Redução de sobreposição: Garante que os segmentos não se sobreponham ajustando os horários de início com base nos horários de término anteriores; pode perturbar a sincronização.", "aud_mix_label": "Método de Mistura de Áudio", "aud_mix_info": "Misture arquivos de áudio original e traduzido para criar uma saída personalizada e equilibrada com dois modos de mistura disponíveis.", "vol_ori": "Volume do áudio original", @@ -2231,6 +2426,19 @@ "burn_subs_label": "Queimar Legendas", "burn_subs_info": "Queimar Legendas: Incorporar legendas no vídeo, tornando-as uma parte permanente do conteúdo visual.", "whisper_title": "Configurar transcrição.", + "lnum_label": "Literalizar Números", + "lnum_info": "Literalizar Números: Substituir representações numéricas por seus equivalentes escritos na transcrição.", + "scle_label": "Limpeza de Som", + "scle_info": "Limpeza de Som: Aprimorar vocais, remover ruído de fundo antes da transcrição para máxima precisão de marcação de tempo. Esta operação pode levar tempo, especialmente com arquivos de áudio longos.", + "sd_limit_label": "Limite de Duração do Segmento", + "sd_limit_info": "Especifique a duração máxima (em segundos) para cada segmento. 
O áudio será processado usando VAD, limitando a duração para cada fragmento de segmento.", + "asr_model_info": "Ele converte linguagem falada em texto usando o modelo 'Whisper' por padrão. Use um modelo personalizado, por exemplo, inserindo o nome do repositório 'BELLE-2/Belle-whisper-large-v3-zh' no menu suspenso para utilizar um modelo em chinês finetuned. Encontre modelos finetuned na Hugging Face.", + "ctype_label": "Tipo de Cálculo", + "ctype_info": "Escolher tipos menores como int8 ou float16 pode melhorar o desempenho, reduzindo o uso de memória e aumentando o throughput computacional, mas pode sacrificar a precisão em comparação com tipos de dados maiores como float32.", + "batchz_label": "Tamanho do Lote", + "batchz_info": "Reduzir o tamanho do lote economiza memória se sua GPU tiver menos VRAM e ajuda a gerenciar problemas de Memória Insuficiente.", + "tsscale_label": "Escala de Segmentação de Texto", + "tsscale_info": "Divida o texto em segmentos por frases, palavras ou caracteres. A segmentação por palavras e caracteres oferece granularidade mais fina, útil para legendas; desativar a tradução preserva a estrutura original.", "srt_file_label": "Carregar um arquivo de legenda SRT (será usado em vez da transcrição de Whisper)", "divide_text_label": "Redividir segmentos de texto por:", "divide_text_info": "(Experimental) Insira um separador para dividir os segmentos de texto existentes no idioma de origem. A ferramenta identificará as ocorrências e criará novos segmentos conforme necessário. Especifique vários separadores usando |, por exemplo: !|?|...|。", @@ -2311,6 +2519,7 @@ - FACEBOOK MMS → प्रारूप `en-facebook-mms VITS` → आवाज अधिक प्राकृतिक है; वर्तमान में, यह केवल CPU का उपयोग करता है। - PIPER TTS → प्रारूप `en_US-lessac-high VITS-onnx` → पिछले वाले के समान, लेकिन यह CPU और GPU दोनों के लिए अनुकूलित है। - BARK → प्रारूप `en_speaker_0-Male BARK` → अच्छी गुणवत्ता है लेकिन धीमी, और यह हैलुसिनेशन के लिए प्रवृत्त है। + - OpenAI TTS → प्रारूप `>alloy OpenAI-TTS` → बहुभाषी लेकिन इसमें एक OpenAI API key की आवश्यकता है - Coqui XTTS → प्रारूप `_XTTS_/AUTOMATIC.wav` → केवल चीनी (सरलीकृत), अंग्रेजी, फ्रेंच, जर्मन, इतालवी, पुर्तगाली, पोलिश, तुर्की, रूसी, डच, चेक, अरबी, स्पैनिश, हंगेरियन, कोरियाई और जापानी के लिए ही उपलब्ध है। --- @@ -2393,6 +2602,8 @@ "acc_max_info": "ओवरलैपिंग से बचने के लिए अनुवादित ऑडियो सेगमेंटों के लिए अधिकतम त्वरण। 1.0 का मान कोई त्वरण नहीं दर्शाता है।", "acc_rate_label": "त्वरण दर नियामक", "acc_rate_info": "त्वरण दर नियामक: त्वरण को समायोजित करता है ताकि उपभागों को उससे कम गति की आवश्यकता हो, सततता को बनाए रखते हुए और अगले आरंभ के समय को ध्यान में रखते हुए।", + "or_label": "ओवरलैप कमी करना", + "or_info": "ओवरलैप कमी करना: पिछले समाप्ति समयों के आधार पर शुरुआत समयों को समायोजित करके सेगमेंट को ओवरलैप नहीं होने देता है; समवारण को बिगाड़ सकता है।", "aud_mix_label": "ऑडियो मिश्रण विधि", "aud_mix_info": "मूल और अनुवादित ऑडियो फ़ाइलों को मिश्रित करें और दो उपलब्ध मिश्रण मोड के साथ एक अनुकूलित, संतुलित उत्पादन बनाएं।", "vol_ori": "मूल ऑडियो ध्वनि", @@ -2405,6 +2616,19 @@ "burn_subs_label": "उपशीर्षक जलाएं", "burn_subs_info": "उपशीर्षक जलाएं: वीडियो में उपशीर्षक एम्बेड करें, जिससे वे दृश्यीय सामग्री का स्थायी हिस्सा बन जाएं।", "whisper_title": "कॉन्फ़िगर ट्रांस्क्रिप्शन।", + "lnum_label": "संख्याओं का वाचक रूपांतरण", + "lnum_info": "संख्याओं का वाचक रूपांतरण: संख्यात्मक प्रतिनिधित्वों को उनके लेखित समकक्षों से प्रतिस्थापित करें ट्रांसक्रिप्ट में।", + "scle_label": "ध्वनि की सफाई", + "scle_info": "ध्वनि की सफाई: अधिकतम समयचिह्न सटीकता के लिए ध्वनि को बेहतर बनाएं, समय 
चिह्नों की अधिकता के लिए अधिकतम समयचिह्न सटीकता के लिए पीछे की ध्वनि हटाएं। इस ऑपरेशन में समय लग सकता है, खासकर लंबे ऑडियो फ़ाइलों के साथ।", + "sd_limit_label": "सेगमेंट अवधि सीमा", + "sd_limit_info": "प्रत्येक सेगमेंट की अधिकतम अवधि (सेकंड में) को निर्दिष्ट करें। ऑडियो को वैड का उपयोग करके प्रोसेस किया जाएगा, प्रत्येक सेगमेंट चंक की अवधि को सीमित करके।", + "asr_model_info": "यह डिफ़ॉल्ट रूप से बोली भाषा को पाठ में परिवर्तित करता है 'व्हिस्पर मॉडल' का उपयोग करके। अपना कस्टम मॉडल उपयोग करें, उदाहरण के लिए, ड्रॉपडाउन में रिपॉज़िटरी नाम 'BELLE-2/Belle-whisper-large-v3-zh' दर्ज करके एक चीनी भाषा फ़ाइन ट्यून मॉडल का उपयोग करें। Hugging Face पर फ़ाइन ट्यून मॉडल्स पाएँ।", + "ctype_label": "हिसाब प्रकार", + "ctype_info": "छोटे प्रकारों जैसे int8 या फ़्लोट16 का चयन करना प्रदर्शन को बढ़ावा दे सकता है, मेमोरी उपयोग को कम करके और गणनात्मक परिचालन बढ़ाकर प्रदर्शन को सुधार सकता है, लेकिन float32 जैसे बड़े डेटा प्रकारों की तुलना में निश्चितता को कट्टरता में बदल सकता है।", + "batchz_label": "बैच का आकार", + "batchz_info": "यदि आपके पास कम VRAM वाली जीपीयू है, तो बैच का आकार कम करने से मेमोरी बचाई जा सकती है और मेमोरी की कमी की समस्याओं का प्रबंधन किया जा सकता है।", + "tsscale_label": "पाठ के विभाजन का पैमाना", + "tsscale_info": "पाठ को वाक्य, शब्द या अक्षरों के आधार पर खंडों में विभाजित करें। शब्द और अक्षर विभाजन और लघु ग्रेन्युलरिटी प्रदान करता है, जो उपशीर्षकों के लिए उपयोगी है; अनुवाद को अक्षम करने से मूल संरचना को संरक्षित रखा जाता है।", "srt_file_label": "एक SRT उपशीर्षक फ़ाइल अपलोड करें (विस्पर की प्रतिलिपि के बजाय इस्तेमाल की जाएगी)", "divide_text_label": "पुनः विभाजित करें टेक्स्ट सेगमेंट द्वारा:", "divide_text_info": "(प्रयोगात्मक) मौजूदा पाठ सेगमेंट को विभाजित करने के लिए एक विभाजक दर्ज करें। उपकरण को घटनाओं को पहचानने और उन्हें अनुसार नए सेगमेंट बनाने के लिए। | का उपयोग करके एक से अधिक विभाजक निर्दिष्ट करें, उदा।: !|?|...|。", @@ -2485,6 +2709,7 @@ - FACEBOOK MMS → định dạng `en-facebook-mms VITS` → Giọng nói tự nhiên hơn; hiện tại chỉ sử dụng CPU. - PIPER TTS → định dạng `en_US-lessac-high VITS-onnx` → Giống như cái trước, nhưng được tối ưu hóa cho cả CPU và GPU. - BARK → định dạng `en_speaker_0-Male BARK` → Chất lượng tốt nhưng chậm, và dễ bị ảo giác. + - OpenAI TTS → định dạng `>alloy OpenAI-TTS` → Đa ngôn ngữ nhưng cần một OpenAI API key - Coqui XTTS → định dạng `_XTTS_/AUTOMATIC.wav` → Chỉ có sẵn cho tiếng Trung (Giản thể), tiếng Anh, tiếng Pháp, tiếng Đức, tiếng Ý, tiếng Bồ Đào Nha, tiếng Ba Lan, tiếng Thổ Nhĩ Kỳ, tiếng Nga, tiếng Hà Lan, tiếng Séc, tiếng Ả Rập, tiếng Tây Ban Nha, tiếng Hungary, tiếng Hàn và tiếng Nhật. --- @@ -2567,6 +2792,8 @@ "acc_max_info": "Tăng tốc tối đa cho các đoạn âm thanh dịch để tránh chồng chéo. 
Giá trị 1.0 đại diện cho không tăng tốc", "acc_rate_label": "Điều Chỉnh Tốc Độ Tăng Tốc", "acc_rate_info": "Điều Chỉnh Tốc Độ Tăng Tốc: Điều chỉnh tốc độ tăng tốc để phù hợp với các đoạn yêu cầu tốc độ thấp hơn, duy trì liên tục và xem xét thời gian bắt đầu tiếp theo.", + "or_label": "Giảm chồng chéo", + "or_info": "Giảm chồng chéo: Đảm bảo các đoạn không chồng chéo bằng cách điều chỉnh thời gian bắt đầu dựa trên thời gian kết thúc trước đó; có thể làm gián đoạn đồng bộ hóa.", "aud_mix_label": "Phương pháp Trộn Âm thanh", "aud_mix_info": "Trộn các tập tin âm thanh gốc và dịch để tạo ra một đầu ra cân bằng tùy chỉnh với hai chế độ trộn có sẵn.", "vol_ori": "Âm lượng âm thanh gốc", @@ -2579,6 +2806,19 @@ "burn_subs_label": "Đốt Phụ đề", "burn_subs_info": "Đốt Phụ đề: Nhúng phụ đề vào video, biến chúng thành một phần cố định của nội dung hình ảnh.", "whisper_title": "Cấu hình chuyển đổi.", + "lnum_label": "Biểu Diễn Số Bằng Chữ", + "lnum_info": "Biểu Diễn Số Bằng Chữ: Thay thế các biểu diễn số thành các tương đương viết của chúng trong bản ghi âm.", + "scle_label": "Dọn Dẹp Âm Thanh", + "scle_info": "Dọn Dẹp Âm Thanh: Nâng cao giọng nói, loại bỏ tiếng ồn nền trước khi chuyển đổi để đạt được độ chính xác cao nhất về dấu thời gian. Thao tác này có thể mất thời gian, đặc biệt là với các tệp âm thanh dài.", + "sd_limit_label": "Giới Hạn Thời Lượng Đoạn", + "sd_limit_info": "Chỉ định thời lượng tối đa (theo giây) cho mỗi đoạn. Âm thanh sẽ được xử lý bằng cách sử dụng VAD, giới hạn thời lượng cho mỗi đoạn.", + "asr_model_info": "Nó chuyển đổi ngôn ngữ nói thành văn bản bằng cách sử dụng mô hình 'Whisper' theo mặc định. Sử dụng một mô hình tùy chỉnh, ví dụ, bằng cách nhập tên kho 'BELLE-2/Belle-whisper-large-v3-zh' trong danh sách thả xuống để sử dụng một mô hình đã được điều chỉnh cho ngôn ngữ Trung Quốc. Tìm mô hình đã điều chỉnh trên Hugging Face.", + "ctype_label": "Loại Tính Toán", + "ctype_info": "Lựa chọn các loại nhỏ hơn như int8 hoặc float16 có thể cải thiện hiệu suất bằng cách giảm việc sử dụng bộ nhớ và tăng thông lượng tính toán, nhưng có thể hy sinh độ chính xác so với các loại dữ liệu lớn hơn như float32.", + "batchz_label": "Kích Thước Lô", + "batchz_info": "Giảm kích thước lô giúp tiết kiệm bộ nhớ nếu GPU của bạn có ít VRAM và giúp quản lý các vấn đề Cạn Kiệt Bộ Nhớ.", + "tsscale_label": "Thước Đo Phân Đoạn Văn Bản", + "tsscale_info": "Chia văn bản thành các đoạn theo câu, từ hoặc ký tự. Phân đoạn theo từng từ và ký tự cung cấp độ mịn hơn, hữu ích cho phụ đề; vô hiệu hóa dịch thuật bảo tồn cấu trúc gốc.", "srt_file_label": "Tải lên một tập tin phụ đề SRT (sẽ được sử dụng thay vì việc chuyển đổi của Whisper)", "divide_text_label": "Chia lại đoạn văn bản bằng:", "divide_text_info": "(Thử nghiệm) Nhập một bộ phân cách để chia các đoạn văn bản hiện có trong ngôn ngữ nguồn. Công cụ sẽ nhận dạng các xuất hiện và tạo ra các đoạn mới tương ứng. Chỉ định nhiều bộ phân cách bằng |, ví dụ: !|?|...|。", @@ -2659,6 +2899,7 @@ - FACEBOOK MMS → format `en-facebook-mms VITS` → Głos jest bardziej naturalny; obecnie wykorzystuje tylko CPU. - PIPER TTS → format `en_US-lessac-high VITS-onnx` → To samo co poprzednie, ale zoptymalizowane zarówno pod CPU, jak i GPU. - BARK → format `en_speaker_0-Male BARK` → Dobra jakość, ale wolne działanie, podatne na halucynacje. 
+ - OpenAI TTS → format `>alloy OpenAI-TTS` → Wielojęzyczne, ale wymaga klucza OpenAI API - Coqui XTTS → format `_XTTS_/AUTOMATIC.wav` → Dostępne tylko dla języka chińskiego (uproszczonego), angielskiego, francuskiego, niemieckiego, włoskiego, portugalskiego, polskiego, tureckiego, rosyjskiego, niderlandzkiego, czeskiego, arabskiego, hiszpańskiego, węgierskiego, koreańskiego i japońskiego. --- @@ -2741,6 +2982,8 @@ "acc_max_info": "Maksymalne przyspieszenie dla przetłumaczonych segmentów dźwiękowych, aby uniknąć nakładania się. Wartość 1.0 oznacza brak przyspieszenia", "acc_rate_label": "Regulacja prędkości przyśpieszania", "acc_rate_info": "Regulacja prędkości przyśpieszania: Dostosowuje przyśpieszenie, aby dostosować się do segmentów wymagających mniejszej prędkości, zachowując ciągłość i uwzględniając czas następnego startu.", + "or_label": "Redukcja Nakładania", + "or_info": "Redukcja Nakładania: Zapewnia, że segmenty się nie nakładają, poprzez dostosowanie czasów rozpoczęcia na podstawie wcześniejszych czasów zakończenia; może zakłócić synchronizację.", "aud_mix_label": "Metoda Mieszania Audio", "aud_mix_info": "Mieszaj pliki audio oryginalne i przetłumaczone, aby utworzyć spersonalizowane, zrównoważone wyjście z dwoma dostępnymi trybami mieszania.", "vol_ori": "Głośność oryginalnego dźwięku", @@ -2753,6 +2996,19 @@ "burn_subs_label": "Wypal napisy", "burn_subs_info": "Wypal napisy: Osadź napisy w wideo, stając się trwałą częścią treści wizualnej.", "whisper_title": "Konfiguracja transkrypcji.", + "lnum_label": "Zliteralizuj Liczby", + "lnum_info": "Zliteralizuj Liczby: Zastąp numeryczne reprezentacje ich pisemnymi odpowiednikami w transkrypcji.", + "scle_label": "Oczyszczanie Dźwięku", + "scle_info": "Oczyszczanie Dźwięku: Poprawa głosu, usuwanie szumów tła przed transkrypcją dla najwyższej precyzji znaczników czasowych. Ta operacja może zająć trochę czasu, szczególnie przy długich plikach dźwiękowych.", + "sd_limit_label": "Ograniczenie Czasu Trwania Segmentu", + "sd_limit_info": "Określ maksymalny czas trwania (w sekundach) dla każdego segmentu. Dźwięk będzie przetwarzany za pomocą VAD, ograniczając czas trwania dla każdego fragmentu segmentu.", + "asr_model_info": "Konwertuje mowę na tekst za pomocą modelu „Szept” domyślnie. Użyj niestandardowego modelu, na przykład, wpisując nazwę repozytorium „BELLE-2/Belle-whisper-large-v3-zh” w rozwijanej liście, aby użyć dostosowanego modelu w języku chińskim. Znajdź dostosowane modele na Hugging Face.", + "ctype_label": "Typ Obliczeń", + "ctype_info": "Wybór mniejszych typów, takich jak int8 lub float16, może poprawić wydajność poprzez zmniejszenie użycia pamięci i zwiększenie przepustowości obliczeniowej, ale może poświęcić precyzję w porównaniu do większych typów danych, takich jak float32.", + "batchz_label": "Rozmiar Partii", + "batchz_info": "Zmniejszenie rozmiaru partii oszczędza pamięć, jeśli Twój GPU ma mniej VRAM, i pomaga zarządzać problemami z brakiem pamięci.", + "tsscale_label": "Skala Segmentacji Tekstu", + "tsscale_info": "Podziel tekst na segmenty według zdań, słów lub znaków. Segmentacja według słów i znaków zapewnia drobniejszą granulację, przydatną dla napisów; wyłączenie tłumaczenia zachowuje pierwotną strukturę.", "srt_file_label": "Prześlij plik napisów SRT (będzie używany zamiast transkrypcji Whisper)", "divide_text_label": "Podziel segmenty tekstu przez:", "divide_text_info": "(Eksperymentalne) Wprowadź separator do podziału istniejących segmentów tekstu w języku źródłowym. 
Narzędzie zidentyfikuje wystąpienia i utworzy nowe segmenty zgodnie z nimi. Wprowadź kilka separatorów, używając |, np.: !|?|...|。", @@ -2833,6 +3089,7 @@ - FACEBOOK MMS → format `en-facebook-mms VITS` → Rösten är mer naturlig; för tillfället använder den endast CPU. - PIPER TTS → format `en_US-lessac-high VITS-onnx` → Samma som den föregående, men den är optimerad för både CPU och GPU. - BARK → format `en_speaker_0-Male BARK` → Bra kvalitet men långsam och benägen för hallucinationer. + - OpenAI TTS → format `>alloy OpenAI-TTS` → Multispråkigt men kräver en OpenAI API-nyckel - Coqui XTTS → format `_XTTS_/AUTOMATIC.wav` → Endast tillgängligt för kinesiska (förenklad), engelska, franska, tyska, italienska, portugisiska, polska, turkiska, ryska, nederländska, tjeckiska, arabiska, spanska, ungerska, koreanska och japanska. --- @@ -2915,6 +3172,8 @@ "acc_max_info": "Maximal acceleration för översatta ljudsegment för att undvika överlappning. En värde på 1,0 representerar ingen acceleration", "acc_rate_label": "Accelerationshastighetsreglering", "acc_rate_info": "Accelerationshastighetsreglering: Justerar accelerationen för att passa segment som kräver lägre hastighet, vilket bibehåller kontinuitet och överväger nästa starttid.", + "or_label": "Överlappningsreducering", + "or_info": "Överlappningsreducering: Säkerställer att segment inte överlappar genom att justera starttider baserat på tidigare sluttider; kan störa synkroniseringen.", "aud_mix_label": "Ljudmixningsmetod", "aud_mix_info": "Blanda original- och översatta ljudfiler för att skapa en anpassad, balanserad utdata med två tillgängliga blandningslägen.", "vol_ori": "Volym ursprungligt ljud", @@ -2927,6 +3186,19 @@ "burn_subs_label": "Bränn undertexter", "burn_subs_info": "Bränn undertexter: Bädda in undertexter i videon, vilket gör dem till en permanent del av det visuella innehållet.", "whisper_title": "Konfigurera transkription.", + "lnum_label": "Literalisera Siffror", + "lnum_info": "Literalisera Siffror: Ersätt numeriska representationer med deras skrivna motsvarigheter i transkriptet.", + "scle_label": "Ljudstädning", + "scle_info": "Ljudstädning: Förbättra röster, ta bort bakgrundsljud innan transkribering för högsta tidsstämpelprecision. Denna operation kan ta tid, särskilt med långa ljudfiler.", + "sd_limit_label": "Segmentvaraktighetsbegränsning", + "sd_limit_info": "Ange den maximala varaktigheten (i sekunder) för varje segment. Ljudet kommer att bearbetas med VAD och begränsa varaktigheten för varje segmentbit.", + "asr_model_info": "Det konverterar talat språk till text med hjälp av standardmodellen 'Whisper'. Använd en anpassad modell, till exempel genom att ange lagringsnamnet 'BELLE-2/Belle-whisper-large-v3-zh' i rullgardinsmenyn för att använda en anpassad modell för kinesiska. Hitta finjusterade modeller på Hugging Face.", + "ctype_label": "Beräkningstyp", + "ctype_info": "Att välja mindre typer som int8 eller float16 kan förbättra prestanda genom att minska minnesanvändningen och öka den beräkningsmässiga genomströmningen, men kan offra precisionen jämfört med större datatyper som float32.", + "batchz_label": "Batchstorlek", + "batchz_info": "Att minska batchstorleken sparar minne om din GPU har mindre VRAM och hjälper till att hantera minnesproblem.", + "tsscale_label": "Text segmenteringsskala", + "tsscale_info": "Dela upp texten i segment efter meningar, ord eller tecken. 
Ordet och teckensegmentering ger finare granularitet, användbart för undertexter; inaktivering av översättning bevarar den ursprungliga strukturen.", "srt_file_label": "Ladda upp en SRT-undertextsfil (kommer att användas istället för Whisper-transkriptionen)", "divide_text_label": "Dela upp textsegment med:", "divide_text_info": "(Experimentell) Ange en avgränsare för att dela upp befintliga textsegment på källspråket. Verktyget kommer att identifiera förekomster och skapa nya segment därefter. Ange flera avgränsare med |, t.ex.: !|?|...|。", @@ -3007,6 +3279,7 @@ - FACEBOOK MMS → 형식 `en-facebook-mms VITS` → 음성이 더 자연스럽지만 현재 CPU만 사용됩니다. - PIPER TTS → 형식 `en_US-lessac-high VITS-onnx` → 이전 것과 동일하지만 CPU와 GPU 모두 최적화되었습니다. - BARK → 형식 `en_speaker_0-Male BARK` → 품질은 좋지만 느리고 환각에 취약합니다. + - OpenAI TTS → 형식 `>alloy OpenAI-TTS` → 다국어지만 OpenAI API 키가 필요합니다 - Coqui XTTS → 형식 `_XTTS_/AUTOMATIC.wav` → 중국어 (간체), 영어, 프랑스어, 독일어, 이탈리아어, 포르투갈어, 폴란드어, 터키어, 러시아어, 네덜란드어, 체코어, 아랍어, 스페인어, 헝가리어, 한국어 및 일본어만 사용할 수 있습니다. --- @@ -3089,6 +3362,8 @@ "acc_max_info": "중첩을 피하기 위해 번역된 오디오 세그먼트에 대한 최대 가속도. 값이 1.0이면 가속도가 없음을 의미합니다", "acc_rate_label": "가속도 조절", "acc_rate_info": "가속도 조절: 속도가 느린 세그먼트에 대응하기 위해 가속도를 조절하여 연속성을 유지하고 다음 시작 시간을 고려합니다.", + "or_label": "중첩 감소", + "or_info": "중첩 감소: 이전 종료 시간을 기반으로 시작 시간을 조정하여 세그먼트가 겹치지 않도록 합니다. 동기화를 방해할 수 있습니다.", "aud_mix_label": "오디오 혼합 방법", "aud_mix_info": "원본 및 번역된 오디오 파일을 혼합하여 두 가지 사용 가능한 혼합 모드로 사용자 정의된 균형 잡힌 출력을 만듭니다.", "vol_ori": "원본 오디오 볼륨", @@ -3101,6 +3376,19 @@ "burn_subs_label": "자막 불러오기", "burn_subs_info": "자막 불러오기: 자막을 비디오에 임베드하여 시각 콘텐츠의 영구적인 부분으로 만듭니다.", "whisper_title": "전사 구성.", + "lnum_label": "숫자를 문자로 변환", + "lnum_info": "숫자를 문자로 변환: 텍스트에서 숫자 표현을 해당되는 글자로 대체하십시오.", + "scle_label": "소리 정리", + "scle_info": "소리 정리: 음성을 향상시키고 타임 스탬프 정확도를 위해 전사하기 전에 배경 소음을 제거하십시오. 이 작업은 특히 긴 오디오 파일의 경우 시간이 걸릴 수 있습니다.", + "sd_limit_label": "세그먼트 기간 제한", + "sd_limit_info": "각 세그먼트의 최대 기간(초)을 지정하십시오. 오디오는 VAD를 사용하여 각 세그먼트 조각의 기간을 제한하여 처리됩니다.", + "asr_model_info": "기본적으로 '속삭임 모델'을 사용하여 구어를 텍스트로 변환합니다. 예를 들어, 중국어 언어 파인튜닝 모델을 사용하려면 드롭다운에 'BELLE-2/Belle-whisper-large-v3-zh' 저장소 이름을 입력하십시오. Hugging Face에서 파인튜닝된 모델을 찾을 수 있습니다.", + "ctype_label": "계산 유형", + "ctype_info": "int8 또는 float16과 같은 더 작은 유형을 선택하면 메모리 사용을 줄이고 계산 처리량을 증가시켜 성능을 향상시킬 수 있지만 float32와 같은 큰 데이터 유형에 비해 정밀성을 희생할 수 있습니다.", + "batchz_label": "일괄 크기", + "batchz_info": "일괄 크기를 줄이면 GPU의 VRAM이 적은 경우 메모리를 절약할 수 있으며 메모리 부족 문제를 관리하는 데 도움이됩니다.", + "tsscale_label": "텍스트 분할 규모", + "tsscale_info": "문장, 단어 또는 문자별로 텍스트를 세그먼트로 나눕니다. 단어 및 문자 분할은 자막에 유용한 더 세밀한 세분성을 제공합니다. 번역 비활성화는 원래 구조를 보존합니다.", "srt_file_label": "SRT 자막 파일 업로드(전사 대신 사용됨)", "divide_text_label": "다음에 따라 텍스트 세그먼트를 분할:", "divide_text_info": "(실험적) 기존 텍스트 세그먼트를 분할하기 위해 구분 기호를 입력하세요. 도구는 발생한 사례를 식별하고 그에 따라 새 세그먼트를 생성합니다. |를 사용하여 여러 구분 기호를 지정하세요. 예: !|?|...|。", @@ -3181,6 +3469,7 @@ - FACEBOOK MMS → स्वरूप `en-facebook-mms VITS` → ध्वनी अधिक प्राकृतिक आहे; ह्या क्षणी, हे केवळ CPU वापरते. - PIPER TTS → स्वरूप `en_US-lessac-high VITS-onnx` → म्हणजे अखेरचा, परंतु ह्यात CPU आणि GPU दोन्हीत अनुकूलित केले आहे. - BARK → स्वरूप `en_speaker_0-Male BARK` → चांगली गुणवत्ता परंतु मंद, आणि हे हल्ल्यांसाठी आदर्श आहे. + - OpenAI TTS → स्वरूप `>alloy OpenAI-TTS` → बहुभाषिक आहे पण OpenAI API की आवश्यकता आहे - Coqui XTTS → स्वरूप `_XTTS_/AUTOMATIC.wav` → केवळ उपलब्ध आहे: चिनी (सरलीकृत), इंग्रजी, फ्रेंच, जर्मन, इटालियन, पोर्तुगीज, पोलिश, तुर्की, रशियन, डच, चेक, अरबी, स्पॅनिश, हंगेरियन, कोरियन आणि जपानी. 
--- @@ -3261,6 +3550,8 @@ "acc_max_info": "ओव्हरलॅपिंग टाळण्यासाठी अनुवादित ऑडियो सेगमेंटसाठी अधिकतम एक्सेलरेशन. 1.0 ची एक मूल्य अधिकतम एक्सेलरेशन प्रतिनिधित्व करते", "acc_rate_label": "वेगवर्धी दर व्यवस्थापन", "acc_rate_info": "वेगवर्धी दर व्यवस्थापन: अल्प गतीचे आवश्यक असलेले क्षेत्र समायोजित करण्यासाठी वेगवर्धी व्यवस्थापन करते, सततता ठेवते आणि पुढील सुरुवातीचा वेळ मलान घेतला जातो.", + "or_label": "ओव्हरलॅप कमी करा", + "or_info": "ओव्हरलॅप कमी करा: मागील समाप्तीच्या वेळेस आधारित सुरुवातीच्या वेळा समायोजित करून सेगमेंट ओव्हरलॅप होण्यास रोखते; समकालिकरण अडचणी उत्पन्न करू शकतो.", "aud_mix_label": "ऑडियो मिक्सिंग पद्धत", "aud_mix_info": "स्वच्छ आणि संतुलित आउटपुट सादर करण्यासाठी मूळ आणि अनुवादित ऑडियो फाईल्स एकत्रित करण्यासाठी आवश्यक दोन मिक्सिंग मोड्युल्या सोडल्या आहेत.", "vol_ori": "मूळ ऑडियोची व्हॉल्यूम", @@ -3273,6 +3564,19 @@ "burn_subs_label": "सबटायटल्स जळवा", "burn_subs_info": "सबटायटल्स जळवा: व्हिडिओमध्ये सबटायटल्स आजार करा, त्यांना दृश्यांतराचा कोणताही स्थायी भाग बनवून करा.", "whisper_title": "वाचन विक्रमण संरचना.", + "lnum_label": "संख्या शब्दांतर", + "lnum_info": "संख्या शब्दांतर: अंकांचे प्रतिनिधित्व लेखित सर्वकाशांमध्ये बदला करा.", + "scle_label": "आवाज स्वच्छता", + "scle_info": "आवाज स्वच्छता: वादला तयार करण्यापूर्वी आवाज आणि बॅकग्राऊंड ध्वनी काढा. हे काम वेगवेगळ्या आवाज फाईल्ससह करता येऊ शकते.", + "sd_limit_label": "सेगमेंट अवधी सीमा", + "sd_limit_info": "प्रत्येक सेगमेंटसाठी कोणत्याही अवधीचा महासूचीत (सेकंदांमध्ये) सुनिश्चित करा. ऑडिओ वाडचा वापर करून प्रत्येक सेगमेंटच्या तुकड्याची अवधी सीमित करण्यात येईल.", + "asr_model_info": "जीवनाचा मूळ 'फिस्फिंग' मॉडेल वापरून बोललेली भाषा ते टेक्स्टमध्ये बदलते. उदाहरणार्थ, चीनी भाषेतील फायनट्यून्ड मॉडेल वापरण्यासाठी ड्रॉपडाऊनमध्ये 'BELLE-2/Belle-whisper-large-v3-zh' संग्रह नाव नोंदवा. Hugging Face वर फायनट्यून्ड मॉडेल्स शोधा.", + "ctype_label": "गणना प्रकार", + "ctype_info": "int8 किंवा float16 आढळवून कमी डेटा प्रकारांमध्ये निर्देशन करणे कामाची वेगवेगळी प्रदर्शन करू शकते आणि गणना द्वारे अपेक्षित क्षमतेची वाढवू शकते, परंतु float32 आणि इतर मोठ्या डेटा प्रकारांपेक्षा निश्चितता कुठल्या प्रकारे कमी करू शकते.", + "batchz_label": "बॅच आकार", + "batchz_info": "आपल्याला कमी VRAM असलेले GPU असल्यास बॅच आकार कमी करणे मेमरी झटका आणू शकते आणि मेमरी नसलेली समस्या व्यवस्थापित करण्यास मदत करू शकते.", + "tsscale_label": "टेक्स्ट सेगमेंटेशन पैमाना", + "tsscale_info": "पाठाचे सेगमेंट वाक्य, शब्द किंवा अक्षरांमध्ये वागवा. शब्द आणि अक्षर सेगमेंटेशन उपशीर्षकसाठी उपयुक्त तंत्रज्ञान उपलब्ध करून देतात; अनुवाद बंद करणे मूल संरचना संरक्षित करते.", "srt_file_label": "एसआरटी उपशीर्षक फाईल अपलोड करा (व्हिस्परच्या विवेचनाच्या विरोधात वापरली जाईल)", "divide_text_label": "टेक्स्ट सेगमेंट्स पुनर्विभाजित करा:", "divide_text_info": "(प्रयोगशील) स्रोत भाषेतील विद्यमान टेक्स्ट सेगमेंट्सचा विभाग करण्यासाठी एक विभाजक प्रविष्ट करा. टूलला उपलब्धींना ओळखण्यासाठी आणि नुकसानकर्ता करण्यासाठी त्यामुळे नवीन सेगमेंट्स निर्मित करते. | चा वापर करून अनेक विभाजक स्पष्ट करा, उदा.: !|?|...|।", @@ -3353,6 +3657,7 @@ - FACEBOOK MMS → format `en-facebook-mms VITS` → Səsi daha doğaldır; ancaq ancaq CPU istifadə edir. - PIPER TTS → format `en_US-lessac-high VITS-onnx` → Əvvəlki ilə eynidir, ancaq hem CPU, hem də GPU üçün optimalaşdırılmışdır. - BARK → format `en_speaker_0-Male BARK` → Yaxşı keyfiyyətli, ancaq yavaş və halüsinasiyalara meyllidir. 
+ - OpenAI TTS → format `>alloy OpenAI-TTS` → Çoxdilli, lakin OpenAI API açarı tələb olunur - Coqui XTTS → format `_XTTS_/AUTOMATIC.wav` → Yalnız Çin (Sadələşdirilmiş), İngilis, Fransız, Alman, İtalyan, Portuqal, Poliş, Türk, Rus, Holland, Çex, Ərəb, İspan, Macar, Korey və Yapon dilində mövcuddur. --- @@ -3435,6 +3740,8 @@ "acc_max_info": "Üstünlük təşkil etməmək üçün tərcümə olunmuş audio segmentlərinin maksimum sürəti. 1.0 dəyəri heç bir sürəti təşkil etmir", "acc_rate_label": "Sürətin Artımının Tənzimlənməsi", "acc_rate_info": "Sürətin Artımının Tənzimlənməsi: Sürəti az olan segmentlərə uyğun olaraq sürəti tənzimləyir, davam etməni qoruyur və növbəti başlanğıcın vaxtını nəzərə alır.", + "or_label": "Üstünlüklərin Azaldılması", + "or_info": "Üstünlüklərin Azaldılması: Segmentlərin bir-birinin üstündə olmamasını təmin edir, əvvəlki bitiş vaxtlarına əsasən başlanğıc vaxtlarını tənzimləyərək; sinxronlaşmaya mane ola bilər.", "aud_mix_label": "Audio qarışdırma metodları", "aud_mix_info": "Orijinal və tərcümə olunmuş audio fayllarını qarışdıraraq iki mövcud qarışdırma rejimi ilə xüsusi, dengəli bir çıxış yaradın.", "vol_ori": "Orijinal səsin səsi", @@ -3447,6 +3754,19 @@ "burn_subs_label": "Altyazıları Yanma", "burn_subs_info": "Altyazıları Yanma: Altyazıları videoya ilave edərək, onları görünən məzmunun daimi bir hissəsi halına gətirin.", "whisper_title": "Tərcümə edilən mətnin konfiqurasiyası.", + "lnum_label": "Rəqəmləri Litarallarlaşdırmaq", + "lnum_info": "Rəqəmləri Litarallarlaşdırmaq: Sayısal təsvirləri onların yazılı müqabilələri ilə əvəzləyin.", + "scle_label": "Səs Təmizliyi", + "scle_info": "Səs Təmizliyi: Maksimum vaxt damğası dəqiqliyi üçün səsi yaxşılaşdırın, transkripsiyadan əvvəl fon gürültüsünü çıxarın. Bu əməliyyat uzun səs faylları ilə xüsusilə vaxt ala bilər.", + "sd_limit_label": "Segment Müddəti Məhdudiyyəti", + "sd_limit_info": "Hər bir segment üçün maksimum müddəti (saniyə) təyin edin. Səs VAD-dan istifadə edilərək hər bir segment parçasının müddəti məhdudlaşdırılacaq.", + "asr_model_info": "Bu, default olaraq danışılan dilə mətni 'Əfsus' modeli istifadə edərək mətnə çevirir. Xüsusi model istifadə edin, məsələn, çin dilində fayin-tuninq edilmiş model istifadə etmək üçün 'BELLE-2/Belle-whisper-large-v3-zh' depozit adını keçid menyusuna daxil edin. Hugging Face-də fayin-tuninq edilmiş modelləri tapın.", + "ctype_label": "Hesablama Növü", + "ctype_info": "int8 və ya float16 kimi kiçik növ seçmək yaddaş istifadəsini azaldaraq və hesablama nəzarətini artıraraq performansı yaxşılaşdıra bilər, lakin float32 kimi daha böyük veri növlərinə nisbətən dəqiqliyi fəda etmək olar.", + "batchz_label": "Toplu Ölçüsü", + "batchz_info": "Toplu ölçüsünü azaldaraq, əğer GPU-nuzun az VRAM varsa, yaddaş qənaət etmək mümkündür və Yaddaşsız Yaddaş problemə idarə edə bilər.", + "tsscale_label": "Mətn Segmentlərinin Masshtabı", + "tsscale_info": "Mətni cümlə, söz və ya simvollarla segmentlərə bölmək. Söz və simvol bölməsi, subtitrlər üçün faydalı olan daha dəqiqliyi təmin edir; tərcüməni söndürmək asal strukturu qoruyur.", "srt_file_label": "Bir SRT subtitri faylı yükləyin (Fısıldağın transkripsiyası əvəzinə istifadə olunacaq)", "divide_text_label": "Mətn segmentlərini bölmək üçün ayırıcı daxil edin:", "divide_text_info": "(Təcrübəli) Mövcud mətn segmentlərini böləcək bir ayırıcı daxil edin. Alətlər tez-tez yaradır və uyğun gələn yerlərdə yeni segmentlər yaradır. 
Birdən çox ayırıcı daxil edin, |, misal: !|?|...|。", @@ -3501,4 +3821,386 @@ "cv_button_apply": "KONFiQURASiYANI TƏTBiQ EDiN", "tab_help": "Kömək", }, + + "persian": { + "description": """ + ### 🎥 **با SoniTranslate به راحتی ویدئوها را ترجمه کنید!** 📽️ + + یک ویدئو، فایل زیرنویس، فایل صوتی را آپلود کنید یا یک لینک ویدئوی URL ارائه دهید. 📽️ **دفترچه یادداشت به‌روز شده را از مخزن رسمی دریافت کنید: [SoniTranslate](https://github.com/R3gm/SoniTranslate)!** + + دستورالعمل‌های استفاده را در تب `Help` ببینید. بیایید با ترجمه ویدئوها سرگرم شویم! 🚀🎉 + """, + "tutorial": """ + # 🔰 **دستورالعمل استفاده:** + + 1. 📤 یک **ویدئو**، **فایل زیرنویس**، **فایل صوتی** را آپلود کنید یا 🌐 **لینک URL** به یک ویدئو مانند یوتیوب ارائه دهید. + + 2. 🌍 زبانی را که می‌خواهید **ویدئو را به آن ترجمه کنید** انتخاب کنید. + + 3. 🗣️ تعداد **افراد گوینده** در ویدئو را مشخص کنید و **برای هرکدام یک صدای متن به گفتار مناسب** برای زبان ترجمه انتخاب کنید. + + 4. 🚀 دکمه '**ترجمه**' را فشار دهید تا نتایج را دریافت کنید. + + --- + + # 🧩 **SoniTranslate از موتورهای مختلف TTS (متن به گفتار) پشتیبانی می‌کند، که شامل:** + - EDGE-TTS → فرمت `en-AU-WilliamNeural-Male` → سریع و دقیق. + - FACEBOOK MMS → فرمت `en-facebook-mms VITS` → صدای طبیعی‌تر؛ در حال حاضر فقط از CPU استفاده می‌کند. + - PIPER TTS → فرمت `en_US-lessac-high VITS-onnx` → مانند قبلی، اما برای CPU و GPU بهینه‌سازی شده است. + - BARK → فرمت `en_speaker_0-Male BARK` → کیفیت خوب ولی کند و مستعد هذیان. + - OpenAI TTS → فرمت `>alloy OpenAI-TTS` → چندزبانه اما نیاز به کلید API OpenAI دارد. + - Coqui XTTS → فرمت `_XTTS_/AUTOMATIC.wav` → فقط برای چینی (ساده‌شده)، انگلیسی، فرانسوی، آلمانی، ایتالیایی، پرتغالی، لهستانی، ترکی، روسی، هلندی، چک، عربی، اسپانیایی، مجارستانی، کره‌ای و ژاپنی در دسترس است. + + --- + + # 🎤 چگونه از صداهای R.V.C. و R.V.C.2 استفاده کنیم (اختیاری) 🎶 + + هدف اعمال R.V.C. به TTS تولید شده است 🎙️ + + 1. در تب `Custom Voice R.V.C.` مدل‌های مورد نیاز را دانلود کنید 📥 می‌توانید از لینک‌های Hugging Face و Google Drive در قالب‌های zip، pth، یا index استفاده کنید. همچنین می‌توانید مخازن کامل HF را دانلود کنید، اما این گزینه خیلی پایدار نیست 😕 + + 2. حالا به `Replace voice: TTS to R.V.C.` بروید و جعبه `enable` را تیک بزنید ✅ پس از این، می‌توانید مدل‌هایی را که می‌خواهید به هر سخنگوی TTS اعمال کنید انتخاب کنید 👩‍🦰👨‍🦱👩‍🦳👨‍🦲 + + 3. روش F0 که برای همه R.V.C. اعمال خواهد شد تنظیم کنید 🎛️ + + 4. دکمه `APPLY CONFIGURATION` را فشار دهید تا تغییرات اعمال شود 🔄 + + 5. به تب ترجمه ویدئو بازگردید و بر روی 'Translate' کلیک کنید ▶️ حالا ترجمه با اعمال R.V.C. انجام خواهد شد 🗣️ + + نکته: می‌توانید از `Test R.V.C.` استفاده کنید تا بهترین TTS یا تنظیمات را برای اعمال به R.V.C. 
آزمایش و پیدا کنید 🧪🔍 + + --- + + """, + "tab_translate": "ترجمه ویدئو", + "video_source": "منبع ویدئو را انتخاب کنید", + "link_label": "لینک رسانه.", + "link_info": "مثال: www.youtube.com/watch?v=g_9rPvbENUw", + "link_ph": "لینک URL را اینجا وارد کنید...", + "dir_label": "مسیر ویدئو.", + "dir_info": "مثال: /usr/home/my_video.mp4", + "dir_ph": "مسیر را اینجا وارد کنید...", + "sl_label": "زبان مبدا", + "sl_info": "این زبان اصلی ویدئو است", + "tat_label": "ترجمه صوتی به", + "tat_info": "زبان مقصد را انتخاب کنید و همچنین مطمئن شوید که TTS مربوط به آن زبان را انتخاب کنید.", + "num_speakers": "تعداد افراد گوینده در ویدئو را انتخاب کنید.", + "min_sk": "حداقل گوینده‌ها", + "max_sk": "حداکثر گوینده‌ها", + "tts_select": "صدای مورد نظر برای هر گوینده را انتخاب کنید.", + "sk1": "گوینده TTS 1", + "sk2": "گوینده TTS 2", + "sk3": "گوینده TTS 3", + "sk4": "گوینده TTS 4", + "sk5": "گوینده TTS 5", + "sk6": "گوینده TTS 6", + "sk7": "گوینده TTS 7", + "sk8": "گوینده TTS 8", + "sk9": "گوینده TTS 9", + "sk10": "گوینده TTS 10", + "sk11": "گوینده TTS 11", + "sk12": "گوینده TTS 12", + "vc_title": "تقلید صدا در زبان‌های مختلف", + "vc_subtitle": """ + ### صدای یک فرد را در زبان‌های مختلف بازتولید کنید. + در حالی که با اکثر صداها به درستی کار می‌کند، ممکن است در هر مورد به صورت کامل عمل نکند. + تقلید صدا تنها لحن گوینده مرجع را بازتولید می‌کند، بدون لهجه و احساسات که توسط مدل پایه TTS تعیین می‌شوند و توسط مبدل بازتولید نمی‌شوند. + این کار نمونه‌های صوتی را از صدای اصلی هر گوینده گرفته و پردازش می‌کند. + """, + "vc_active_label": "تقلید صدا فعال است", + "vc_active_info": "تقلید صدا فعال: لحن گوینده اصلی را بازتولید می‌کند", + "vc_method_label": "روش", + "vc_method_info": "یک روش برای فرآیند تقلید صدا انتخاب کنید", + "vc_segments_label": "حداکثر نمونه‌ها", + "vc_segments_info": "حداکثر نمونه‌ها: تعداد نمونه‌های صوتی که برای فرآیند تولید خواهند شد، بیشتر بهتر است اما ممکن است نویز اضافه کند", + "vc_dereverb_label": "حذف اکو", + "vc_dereverb_info": "حذف اکو: حذف اکو صوتی از نمونه‌های صوتی.", + "vc_remove_label": "حذف نمونه‌های قبلی", + "vc_remove_info": "حذف نمونه‌های قبلی: حذف نمونه‌های قبلی تولید شده، بنابراین نمونه‌های جدید نیاز به تولید دارند.", + "xtts_title": "ایجاد TTS بر اساس یک فایل صوتی", + "xtts_subtitle": "یک فایل صوتی کوتاه با صدای حداکثر 10 ثانیه آپلود کنید. با استفاده از XTTS، یک TTS جدید با صدای مشابه به فایل صوتی ارائه شده ایجاد خواهد شد.", + "xtts_file_label": "یک فایل صوتی کوتاه با صدا آپلود کنید", + "xtts_name_label": "نام برای TTS", + "xtts_name_info": "یک نام ساده استفاده کنید", + "xtts_dereverb_label": "حذف اکو صوتی", + "xtts_dereverb_info": "حذف اکو صوتی: حذف اکو از صوت", + "xtts_button": "پردازش صوت و افزودن آن به انتخابگر TTS", + "xtts_footer": "تولید صدای XTTS به طور خودکار: می‌توانید از `_XTTS_/AUTOMATIC.wav` در انتخابگر TTS برای تولید خودکار بخش‌ها برای هر گوینده هنگام تولید ترجمه استفاده کنید.", + "extra_setting": "تنظیمات پیشرفته", + "acc_max_label": "حداکثر شتاب صوتی", + "acc_max_info": "حداکثر شتاب برای بخش‌های صوتی ترجمه شده برای جلوگیری از تداخل. 
مقدار 1.0 نمایانگر بدون شتاب است", + "acc_rate_label": "تنظیم نرخ شتاب", + "acc_rate_info": "تنظیم نرخ شتاب: تنظیم شتاب برای سازگاری با بخش‌هایی که نیاز به سرعت کمتری دارند، حفظ پیوستگی و در نظر گرفتن زمان شروع بعدی.", + "or_label": "کاهش تداخل", + "or_info": "کاهش تداخل: اطمینان از عدم تداخل بخش‌ها با تنظیم زمان شروع بر اساس زمان پایان قبلی؛ ممکن است همگام‌سازی را مختل کند.", + "aud_mix_label": "روش ترکیب صوتی", + "aud_mix_info": "میکس فایل‌های صوتی اصلی و ترجمه شده برای ایجاد خروجی سفارشی و متعادل با دو حالت میکس موجود.", + "vol_ori": "حجم صدای اصلی", + "vol_tra": "حجم صدای ترجمه شده", + "voiceless_tk_label": "مسیر بدون صدا", + "voiceless_tk_info": "مسیر بدون صدا: حذف صدای اصلی قبل از ترکیب آن با صدای ترجمه شده.", + "sub_type": "نوع زیرنویس", + "soft_subs_label": "زیرنویس نرم", + "soft_subs_info": "زیرنویس نرم: زیرنویس‌های اختیاری که بینندگان می‌توانند آنها را هنگام تماشا روشن یا خاموش کنند.", + "burn_subs_label": "زیرنویس سوخته", + "burn_subs_info": "زیرنویس سوخته: تعبیه زیرنویس‌ها در ویدئو، که آنها را به بخشی دائمی از محتوای بصری تبدیل می‌کند.", + "whisper_title": "پیکربندی رونوشت.", + "lnum_label": "نوشتاری اعداد", + "lnum_info": "نوشتاری اعداد: جایگزین نمایش عددی با معادل‌های نوشتاری آنها در رونوشت.", + "scle_label": "پاکسازی صدا", + "scle_info": "پاکسازی صدا: تقویت صداها، حذف نویز پس‌زمینه قبل از رونوشت برای دقت زمان‌بندی بالا. این عملیات ممکن است زمان ببرد، به ویژه با فایل‌های صوتی طولانی.", + "sd_limit_label": "حداکثر مدت زمان بخش", + "sd_limit_info": "حداکثر مدت زمان برای هر بخش را مشخص کنید. صوت با استفاده از VAD پردازش خواهد شد، و مدت زمان برای هر بخش محدود خواهد شد.", + "asr_model_info": "این مدل زبان گفتاری را به متن تبدیل می‌کند و از مدل 'Whisper' به‌صورت پیش‌فرض استفاده می‌کند. از یک مدل سفارشی استفاده کنید، برای مثال، با وارد کردن نام مخزن 'BELLE-2/Belle-whisper-large-v3-zh' در لیست کشویی برای استفاده از مدل چینی فاین‌تیون شده. مدل‌های فاین‌تیون شده را در Hugging Face پیدا کنید.", + "ctype_label": "نوع محاسبه", + "ctype_info": "انتخاب انواع کوچکتر مانند int8 یا float16 می‌تواند عملکرد را با کاهش استفاده از حافظه و افزایش توان محاسباتی بهبود بخشد، اما ممکن است دقت را نسبت به انواع داده‌های بزرگ‌تر مانند float32 فدا کند.", + "batchz_label": "اندازه دسته", + "batchz_info": "کاهش اندازه دسته حافظه را ذخیره می‌کند اگر GPU شما VRAM کمتری دارد و کمک می‌کند به مدیریت مشکلات کمبود حافظه.", + "tsscale_label": "مقیاس بخش‌بندی متن", + "tsscale_info": "تقسیم متن به بخش‌ها با جملات، کلمات، یا کاراکترها. بخش‌بندی کلمه و کاراکتر دانه‌بندی بیشتری ارائه می‌دهد که برای زیرنویس‌ها مفید است؛ غیرفعال کردن ترجمه ساختار اصلی را حفظ می‌کند.", + "srt_file_label": "یک فایل زیرنویس SRT آپلود کنید (به جای رونوشت Whisper استفاده خواهد شد)", + "divide_text_label": "تقسیم مجدد بخش‌های متن توسط:", + "divide_text_info": "(آزمایشی) یک جداکننده برای تقسیم بخش‌های موجود متن در زبان منبع وارد کنید. ابزار وقوع‌ها را شناسایی کرده و بخش‌های جدید را بر اساس آن ایجاد می‌کند. چندین جداکننده را با | مشخص کنید، به عنوان مثال: !|?|...|。", + "diarization_label": "مدل دیاریزیشن", + "tr_process_label": "فرآیند ترجمه", + "out_type_label": "نوع خروجی", + "out_name_label": "نام فایل", + "out_name_info": "نام فایل خروجی", + "task_sound_label": "صدای وضعیت کار", + "task_sound_info": "صدای وضعیت کار: پخش صدای هشدار نشان‌دهنده تکمیل کار یا خطاها در حین اجرا.", + "cache_label": "بازیابی پیشرفت", + "cache_info": "بازیابی پیشرفت: ادامه فرآیند از آخرین نقطه توقف.", + "preview_info": "پیش‌نمایش ویدئو را به 10 ثانیه برای آزمایش برش می‌دهد. 
لطفاً آن را غیرفعال کنید تا ویدئوی کامل را دریافت کنید.", + "edit_sub_label": "ویرایش زیرنویس‌های تولید شده", + "edit_sub_info": "ویرایش زیرنویس‌های تولید شده: به شما امکان می‌دهد ترجمه را در دو مرحله انجام دهید. ابتدا با دکمه 'GET SUBTITLES AND EDIT' زیرنویس‌ها را بگیرید و ویرایش کنید، و سپس با دکمه 'TRANSLATE' ویدئو را تولید کنید", + "button_subs": "GET SUBTITLES AND EDIT", + "editor_sub_label": "زیرنویس‌های تولید شده", + "editor_sub_info": "می‌توانید متن زیرنویس‌های تولید شده را اینجا ویرایش کنید. قبل از کلیک بر روی دکمه 'TRANSLATE' می‌توانید تغییرات را در گزینه‌های رابط ایجاد کنید، به جز 'زبان منبع'، 'ترجمه صوتی به' و 'حداکثر گوینده‌ها'، تا از بروز خطاها جلوگیری شود. پس از اتمام، دکمه 'TRANSLATE' را فشار دهید.", + "editor_sub_ph": "ابتدا دکمه 'GET SUBTITLES AND EDIT' را فشار دهید تا زیرنویس‌ها را دریافت کنید", + "button_translate": "TRANSLATE", + "output_result_label": "دانلود ویدئوی ترجمه شده", + "sub_ori": "زیرنویس‌ها", + "sub_tra": "زیرنویس‌های ترجمه شده", + "ht_token_info": "یکی از مراحل مهم قبول موافقتنامه مجوز برای استفاده از Pyannote است. شما نیاز به داشتن یک حساب کاربری در Hugging Face و قبول مجوز برای استفاده از مدل‌ها دارید: https://huggingface.co/pyannote/speaker-diarization و https://huggingface.co/pyannote/segmentation. کلید TOKEN خود را اینجا بگیرید: https://hf.co/settings/tokens", + "ht_token_ph": "کلید TOKEN را اینجا وارد کنید...", + "tab_docs": "ترجمه اسناد", + "docs_input_label": "منبع سند را انتخاب کنید", + "docs_input_info": "می‌تواند PDF، DOCX، TXT، یا متن باشد", + "docs_source_info": "این زبان اصلی متن است", + "chunk_size_label": "حداکثر تعداد کاراکترهایی که TTS در هر بخش پردازش خواهد کرد", + "chunk_size_info": "مقدار 0 یک مقدار پویا و سازگارتر برای TTS اختصاص می‌دهد.", + "docs_button": "شروع پل تبدیل زبان", + "cv_url_info": "مدل‌های R.V.C. را به صورت خودکار از URL دانلود کنید. می‌توانید از لینک‌های HuggingFace یا Drive استفاده کنید و می‌توانید چندین لینک را شامل کنید، هرکدام با کاما جدا شده باشند. مثال: https://huggingface.co/sail-rvc/yoimiya-jp/blob/main/model.pth, https://huggingface.co/sail-rvc/yoimiya-jp/blob/main/model.index", + "replace_title": "تعویض صدا: TTS به R.V.C.", + "sec1_title": "### 1. برای فعال‌سازی استفاده، آن را به عنوان فعال علامت بزنید.", + "enable_replace": "این را بررسی کنید تا استفاده از مدل‌ها فعال شود.", + "sec2_title": "### 2. صدایی را که به هر TTS هر گوینده اعمال خواهد شد انتخاب کنید و تنظیمات را اعمال کنید.", + "sec2_subtitle": "بسته به تعداد <گوینده TTS> که استفاده می‌کنید، هرکدام به مدل مربوطه خود نیاز دارند. علاوه بر این، یک مدل کمکی نیز وجود دارد که در صورت عدم تشخیص صحیح گوینده استفاده می‌شود.", + "cv_tts1": "صدایی را برای گوینده 1 انتخاب کنید.", + "cv_tts2": "صدایی را برای گوینده 2 انتخاب کنید.", + "cv_tts3": "صدایی را برای گوینده 3 انتخاب کنید.", + "cv_tts4": "صدایی را برای گوینده 4 انتخاب کنید.", + "cv_tts5": "صدایی را برای گوینده 5 انتخاب کنید.", + "cv_tts6": "صدایی را برای گوینده 6 انتخاب کنید.", + "cv_tts7": "صدایی را برای گوینده 7 انتخاب کنید.", + "cv_tts8": "صدایی را برای گوینده 8 انتخاب کنید.", + "cv_tts9": "صدایی را برای گوینده 9 انتخاب کنید.", + "cv_tts10": "صدایی را برای گوینده 10 انتخاب کنید.", + "cv_tts11": "صدایی را برای گوینده 11 انتخاب کنید.", + "cv_tts12": "صدایی را برای گوینده 12 انتخاب کنید.", + "cv_aux": "- صدایی که در صورت عدم تشخیص موفقیت‌آمیز گوینده اعمال خواهد شد.", + "cv_button_apply": "اعمال تنظیمات", + "tab_help": "کمک", + }, + + "afrikaans": { + "description": """ + ### 🎥 **Vertaal video's maklik met SoniTranslate!** 📽️ + + Laai 'n video, onderskrif, klanklêer op of verskaf 'n URL-videolink. 
📽️ **Kry die opgedateerde notaboek van die amptelike repository: [SoniTranslate](https://github.com/R3gm/SoniTranslate)!** + + Sien die tab 'Hulp' vir instruksies oor hoe om dit te gebruik. Kom ons begin pret hê met videovertaal! 🚀🎉 + """, + "tutorial": """ + # 🔰 **Instruksies vir gebruik:** + + 1. 📤 Laai 'n **video**, **onderskriflêer**, **klanklêer** op of verskaf 'n 🌐 **URL link** na 'n video soos YouTube. + + 2. 🌍 Kies die taal waarin jy die **video wil vertaal**. + + 3. 🗣️ Spesifiseer die **aantal mense wat praat** in die video en **ken elkeen 'n teks-na-spraak-stem toe** wat geskik is vir die vertalingstaal. + + 4. 🚀 Druk die '**Vertaal**' knoppie om die resultate te verkry. + + --- + + # 🧩 **SoniTranslate ondersteun verskillende TTS (Teks-na-Spraak) enjins, wat is:** + - EDGE-TTS → formaat `en-AU-WilliamNeural-Male` → Vinnig en akkuraat. + - FACEBOOK MMS → formaat `en-facebook-mms VITS` → Die stem is meer natuurlik; op die oomblik gebruik dit net CPU. + - PIPER TTS → formaat `en_US-lessac-high VITS-onnx` → Dieselfde as die vorige een, maar dit is geoptimaliseer vir beide CPU en GPU. + - BARK → formaat `en_speaker_0-Male BARK` → Goeie kwaliteit maar stadig, en dit is geneig tot hallusinasies. + - OpenAI TTS → formaat `>alloy OpenAI-TTS` → Veeltalig maar dit benodig 'n OpenAI API sleutel. + - Coqui XTTS → formaat `_XTTS_/AUTOMATIC.wav` → Slegs beskikbaar vir Vereenvoudigde Chinees, Engels, Frans, Duits, Italiaans, Portugees, Pools, Turks, Russies, Nederlands, Tsjeggies, Arabies, Spaans, Hongaars, Koreaans en Japanees. + + --- + + # 🎤 Hoe om R.V.C. en R.V.C.2 Stemmen te Gebruik (Opsioneel) 🎶 + + Die doel is om 'n R.V.C. toe te pas op die gegenereerde TTS (Teks-na-Spraak) 🎙️ + + 1. In die `Aangepaste Stem R.V.C.` tab, laai die modelle af wat jy benodig 📥 Jy kan skakels van Hugging Face en Google Drive in formate soos zip, pth, of index gebruik. Jy kan ook volledige HF-ruimte-repositories aflaai, maar hierdie opsie is nie baie stabiel nie 😕 + + 2. Gaan nou na `Vervang stem: TTS na R.V.C.` en merk die `aktiveer` boks ✅ Na dit, kan jy die modelle kies wat jy wil toepas op elke TTS spreker 👩‍🦰👨‍🦱👩‍🦳👨‍🦲 + + 3. Pas die F0 metode aan wat toegepas sal word op alle R.V.C. 🎛️ + + 4. Druk `PAS KONFIGURASIE TOE` om die veranderinge wat jy gemaak het toe te pas 🔄 + + 5. Gaan terug na die videovertaal tab en klik op 'Vertaal' ▶️ Nou sal die vertaling gedoen word met die toepassing van die R.V.C. 🗣️ + + Wenke: Jy kan `Toets R.V.C.` gebruik om te eksperimenteer en die beste TTS of konfigurasies te vind om op die R.V.C. 
toe te pas 🧪🔍 + + --- + + """, + "tab_translate": "Videovertaal", + "video_source": "Kies Video Bron", + "link_label": "Media link.", + "link_info": "Voorbeeld: www.youtube.com/watch?v=g_9rPvbENUw", + "link_ph": "URL gaan hier...", + "dir_label": "Video Pad.", + "dir_info": "Voorbeeld: /usr/home/my_video.mp4", + "dir_ph": "Pad gaan hier...", + "sl_label": "Bron taal", + "sl_info": "Dit is die oorspronklike taal van die video", + "tat_label": "Vertaal klank na", + "tat_info": "Kies die teikentaal en maak ook seker om die ooreenstemmende TTS vir daardie taal te kies.", + "num_speakers": "Kies hoeveel mense praat in die video.", + "min_sk": "Min sprekers", + "max_sk": "Max sprekers", + "tts_select": "Kies die stem wat jy vir elke spreker wil hê.", + "sk1": "TTS Spreker 1", + "sk2": "TTS Spreker 2", + "sk3": "TTS Spreker 3", + "sk4": "TTS Spreker 4", + "sk5": "TTS Spreker 5", + "sk6": "TTS Spreker 6", + "sk7": "TTS Spreker 7", + "sk8": "TTS Spreker 8", + "sk9": "TTS Spreker 9", + "sk10": "TTS Spreker 10", + "sk11": "TTS Spreker 11", + "sk12": "TTS Spreker 12", + "vc_title": "Stem Nabootsing in Verskillende Tale", + "vc_subtitle": """ + ### Herhaal 'n persoon se stem oor verskeie tale. + Terwyl effektief met die meeste stemme wanneer gepas gebruik, mag dit nie perfek wees in elke geval nie. + Stem Nabootsing herhaal slegs die verwysingspreker se toon, sonder aksent en emosie, wat deur die basispreker TTS model beheer word en nie deur die omskakelaar nageboots word nie. + Dit sal oudio monsters van die hoof oudio neem vir elke spreker en hulle verwerk. + """, + "vc_active_label": "Aktiewe Stem Nabootsing", + "vc_active_info": "Aktiewe Stem Nabootsing: Herhaal die oorspronklike spreker se toon", + "vc_method_label": "Metode", + "vc_method_info": "Kies 'n metode vir die Stem Nabootsing proses", + "vc_segments_label": "Max monsters", + "vc_segments_info": "Max monsters: Is die aantal oudio monsters wat gegenereer sal word vir die proses, meer is beter maar dit kan geraas byvoeg", + "vc_dereverb_label": "Dereverb", + "vc_dereverb_info": "Dereverb: Pas vokale dereverb toe op die oudio monsters.", + "vc_remove_label": "Verwyder vorige monsters", + "vc_remove_info": "Verwyder vorige monsters: Verwyder die vorige monsters wat gegenereer is, sodat nuwe monsters geskep moet word.", + "xtts_title": "Skep 'n TTS gebaseer op 'n oudio", + "xtts_subtitle": "Laai 'n oudio lêer van maksimum 10 sekondes op met 'n stem. Deur XTTS te gebruik, sal 'n nuwe TTS geskep word met 'n stem soortgelyk aan die verskafde oudio lêer.", + "xtts_file_label": "Laai 'n kort oudio op met die stem", + "xtts_name_label": "Naam vir die TTS", + "xtts_name_info": "Gebruik 'n eenvoudige naam", + "xtts_dereverb_label": "Dereverb oudio", + "xtts_dereverb_info": "Dereverb oudio: Pas vokale dereverb toe op die oudio", + "xtts_button": "Verwerk die oudio en sluit dit in die TTS keurder in", + "xtts_footer": "Genereer stem xtts outomaties: Jy kan `_XTTS_/AUTOMATIC.wav` gebruik in die TTS keurder om outomaties segmente te genereer vir elke spreker wanneer die vertaling gegenereer word.", + "extra_setting": "Gevorderde Instellings", + "acc_max_label": "Max Oudio versnelling", + "acc_max_info": "Maksimum versnelling vir vertaalde oudio segmente om oorvleueling te vermy. 
'n Waarde van 1.0 verteenwoordig geen versnelling nie", + "acc_rate_label": "Versnelling Reguleringskoers", + "acc_rate_info": "Versnelling Reguleringskoers: Pas versnelling aan om segmente wat minder spoed benodig te akkommodeer, handhaaf kontinuïteit en oorweeg volgende-begin tydsberekening.", + "or_label": "Oorvleueling Reduksie", + "or_info": "Oorvleueling Reduksie: Verseker segmente oorvleuel nie deur begin tye aan te pas gebaseer op vorige eind tye; kan sinkronisasie versteur.", + "aud_mix_label": "Oudio Meng Metode", + "aud_mix_info": "Meng oorspronklike en vertaalde oudio lêers om 'n aangepaste, gebalanseerde uitset te skep met twee beskikbare mengmodusse.", + "vol_ori": "Volume oorspronklike oudio", + "vol_tra": "Volume vertaalde oudio", + "voiceless_tk_label": "Stemlose Snit", + "voiceless_tk_info": "Stemlose Snit: Verwyder die oorspronklike oudio stemme voordat dit met die vertaalde oudio gekombineer word.", + "sub_type": "Onderskrif tipe", + "soft_subs_label": "Sagte Onderskrifte", + "soft_subs_info": "Sagte Onderskrifte: Opsionele onderskrifte wat kykers kan aanskakel of afskakel terwyl hulle die video kyk.", + "burn_subs_label": "Brand Onderskrifte", + "burn_subs_info": "Brand Onderskrifte: Inbed onderskrifte in die video, maak hulle 'n permanente deel van die visuele inhoud.", + "whisper_title": "Konfigureer transkripsie.", + "lnum_label": "Literaliseer Nommer", + "lnum_info": "Literaliseer Nommer: Vervang numeriese verteenwoordigings met hul geskrewe ekwivalente in die transkripsie.", + "scle_label": "Klank Opruiming", + "scle_info": "Klank Opruiming: Versterk vokale, verwyder agtergrondgeraas voor transkripsie vir uiterste tydstempel presisie. Hierdie operasie kan tyd neem, veral met lang oudio lêers.", + "sd_limit_label": "Segmentduur Beperking", + "sd_limit_info": "Spesifiseer die maksimum duur (in sekondes) vir elke segment. Die oudio sal verwerk word met VAD, wat die duur vir elke segment stuk beperk.", + "asr_model_info": "Dit omskakel gesproke taal na teks met die 'Whisper model' by verstek. Gebruik 'n aangepaste model, byvoorbeeld, deur die repository naam 'BELLE-2/Belle-whisper-large-v3-zh' in die dropdown in te voer om 'n Chinees taal fyn-afgestelde model te gebruik. Vind fyn-afgestelde modelle op Hugging Face.", + "ctype_label": "Reken tipe", + "ctype_info": "Kies kleiner tipes soos int8 of float16 kan prestasie verbeter deur geheuegebruik te verminder en berekeningstempo te verhoog, maar kan presisie opoffer in vergelyking met groter datatipes soos float32.", + "batchz_label": "Batch grootte", + "batchz_info": "Verkleining van die batch grootte bespaar geheue as jou GPU minder VRAM het en help om Uit-van-Geheue probleme te bestuur.", + "tsscale_label": "Teks Segmentasie Skale", + "tsscale_info": "Verdeel teks in segmente deur sinne, woorde, of karakters. Woord en karakter segmentasie bied fyner granulariteit, nuttig vir onderskrifte; deaktiveer vertaling behou oorspronklike struktuur.", + "srt_file_label": "Laai 'n SRT onderskriflêer op (sal gebruik word in plaas van die transkripsie van Whisper)", + "divide_text_label": "Her-verdeel teks segmente deur:", + "divide_text_info": "(Eksperimenteel) Voer 'n skeier in om bestaande teks segmente in die brontaal te verdeel. Die hulpmiddel sal voorkomste identifiseer en nuwe segmente dienooreenkomstig skep. 
Spesifiseer verskeie skeiers met behulp van |, bv.: !|?|...|。", + "diarization_label": "Diarisering model", + "tr_process_label": "Vertaal proses", + "out_type_label": "Uitvoer tipe", + "out_name_label": "Lêer naam", + "out_name_info": "Die naam van die uitvoer lêer", + "task_sound_label": "Taak Status Klank", + "task_sound_info": "Taak Status Klank: Speel 'n klank waarskuwing wat taak voltooiing of foute tydens uitvoering aandui.", + "cache_label": "Herstel Vordering", + "cache_info": "Herstel Vordering: Gaan voort met die proses vanaf die laaste kontrolepunt.", + "preview_info": "Voorskou sny die video tot slegs 10 sekondes vir toetsdoeleindes. Skakel dit asseblief af om die volle video duur te kry.", + "edit_sub_label": "Wysig gegenereerde onderskrifte", + "edit_sub_info": "Wysig gegenereerde onderskrifte: Laat jou toe om die vertaling in 2 stappe uit te voer. Eerstens met die 'KRY ONDERSKRIFTE EN WYSIG' knoppie, kry jy die onderskrifte om dit te wysig, en dan met die 'VERTAAL' knoppie, kan jy die video genereer.", + "button_subs": "KRY ONDERSKRIFTE EN WYSIG", + "editor_sub_label": "Gegenereerde onderskrifte", + "editor_sub_info": "Voel vry om die teks in die gegenereerde onderskrifte hier te wysig. Jy kan veranderinge aan die koppelvlak opsies maak voordat jy die 'VERTAAL' knoppie druk, behalwe vir 'Bron taal', 'Vertaal klank na', en 'Max sprekers', om foute te vermy. Sodra jy klaar is, klik die 'VERTAAL' knoppie.", + "editor_sub_ph": "Druk eers 'KRY ONDERSKRIFTE EN WYSIG' om die onderskrifte te kry", + "button_translate": "VERTAAL", + "output_result_label": "LAAI VERTAALDE VIDEO AF", + "sub_ori": "Onderskrifte", + "sub_tra": "Vertaalde onderskrifte", + "ht_token_info": "Een belangrike stap is om die lisensie-ooreenkoms te aanvaar vir die gebruik van Pyannote. Jy moet 'n rekening hê op Hugging Face en die lisensie aanvaar om die modelle te gebruik: https://huggingface.co/pyannote/speaker-diarization en https://huggingface.co/pyannote/segmentation. Kry jou SLEUTEL TOKEN hier: https://hf.co/settings/tokens", + "ht_token_ph": "Token gaan hier...", + "tab_docs": "Dokument vertaling", + "docs_input_label": "Kies Dokument Bron", + "docs_input_info": "Dit kan 'n PDF, DOCX, TXT, of teks wees", + "docs_source_info": "Dit is die oorspronklike taal van die teks", + "chunk_size_label": "Max aantal karakters wat die TTS per segment sal verwerk", + "chunk_size_info": "'n Waarde van 0 ken 'n dinamiese en meer versoenbare waarde toe vir die TTS.", + "docs_button": "Begin Taal Omskakelingsbrug", + "cv_url_info": "Laai outomaties die R.V.C. modelle af van die URL. Jy kan skakels van HuggingFace of Drive gebruik, en jy kan verskeie skakels insluit, elkeen geskei deur 'n komma. Voorbeeld: https://huggingface.co/sail-rvc/yoimiya-jp/blob/main/model.pth, https://huggingface.co/sail-rvc/yoimiya-jp/blob/main/model.index", + "replace_title": "Vervang stem: TTS na R.V.C.", + "sec1_title": "### 1. Om die gebruik te aktiveer, merk dit as aktief.", + "enable_replace": "Merk dit om die gebruik van die modelle te aktiveer.", + "sec2_title": "### 2. Kies 'n stem wat toegepas sal word op elke TTS van elke ooreenstemmende spreker en pas die konfigurasies toe.", + "sec2_subtitle": "Afhangende van hoeveel jy sal gebruik, benodig elkeen sy onderskeie model. 
Daar is ook 'n hulp een indien 'n spreker nie korrek opgespoor word nie.", + "cv_tts1": "Kies die stem om toe te pas vir Spreker 1.", + "cv_tts2": "Kies die stem om toe te pas vir Spreker 2.", + "cv_tts3": "Kies die stem om toe te pas vir Spreker 3.", + "cv_tts4": "Kies die stem om toe te pas vir Spreker 4.", + "cv_tts5": "Kies die stem om toe te pas vir Spreker 5.", + "cv_tts6": "Kies die stem om toe te pas vir Spreker 6.", + "cv_tts7": "Kies die stem om toe te pas vir Spreker 7.", + "cv_tts8": "Kies die stem om toe te pas vir Spreker 8.", + "cv_tts9": "Kies die stem om toe te pas vir Spreker 9.", + "cv_tts10": "Kies die stem om toe te pas vir Spreker 10.", + "cv_tts11": "Kies die stem om toe te pas vir Spreker 11.", + "cv_tts12": "Kies die stem om toe te pas vir Spreker 12.", + "cv_aux": "- Stem om toe te pas in geval 'n Spreker nie suksesvol opgespoor word nie.", + "cv_button_apply": "PAS KONFIGURASIE TOE", + "tab_help": "Hulp", + }, } From 054c09e515227bf2ba8e40040a9fffc8ab22987e Mon Sep 17 00:00:00 2001 From: Roger Condori <114810545+R3gm@users.noreply.github.com> Date: Thu, 16 May 2024 22:18:27 +0000 Subject: [PATCH 35/36] fix(videobook): temp fix in images to video --- soni_translate/text_multiformat_processor.py | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/soni_translate/text_multiformat_processor.py b/soni_translate/text_multiformat_processor.py index 4d7f522..fd58c60 100644 --- a/soni_translate/text_multiformat_processor.py +++ b/soni_translate/text_multiformat_processor.py @@ -661,7 +661,13 @@ def create_video_from_images( remove_files(out_video) cm = f"ffmpeg -y -f concat -i list.txt -c:v libx264 -preset veryfast -crf 18 -pix_fmt yuv420p {out_video}" - run_command(cm) + cm_alt = f"ffmpeg -f concat -i list.txt -c:v libx264 -r 30 -pix_fmt yuv420p -y {out_video}" + try: + run_command(cm) + except Exception as error: + logger.error(str(error)) + remove_files(out_video) + run_command(cm_alt) return out_video From 7f53d73edf15510397a88d3e1a20f92423cd7bc5 Mon Sep 17 00:00:00 2001 From: Roger Condori <114810545+R3gm@users.noreply.github.com> Date: Sat, 18 May 2024 03:30:11 +0000 Subject: [PATCH 36/36] update 24/05/18 details --- soni_translate/languages_gui.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/soni_translate/languages_gui.py b/soni_translate/languages_gui.py index 970cda6..ec7338a 100644 --- a/soni_translate/languages_gui.py +++ b/soni_translate/languages_gui.py @@ -2,6 +2,8 @@ news = """ ## 📖 News + 🔥 2024/05/18: Overlap reduction. OpenAI API key integration for transcription, translation, and TTS. Output type: subtitles by speaker, separate audio sound, and video only with subtitles. Now you have access to a better-performing version of Whisper for transcribing speech. For example, you can use `kotoba-tech/kotoba-whisper-v1.1` for Japanese transcription, available [here](https://huggingface.co/kotoba-tech/kotoba-whisper-v1.1). You can find these improved models on the [Hugging Face Whisper page](https://huggingface.co/models?pipeline_tag=automatic-speech-recognition&sort=trending&search=whisper). Simply copy the repository ID and paste it into the 'Whisper ASR model' in 'Advanced Settings'. Support for ass subtitles and batch processing with subtitles. Vocal enhancement before transcription. Added CPU mode with `app_rvc.py --cpu_mode`. TTS now supports up to 12 speakers. OpenVoiceV2 has been integrated for voice imitation. PDF to videobook (displays images from the PDF). + 🔥 2024/03/02: Preserve file names in output. 
Multiple archives can now be submitted simultaneously by specifying their paths, directories, or URLs separated by commas. Added an option to disable diarization. Implemented soft subtitles. Output format selection (MP3, MP4, MKV, WAV, and OGG), and resolved issues related to file reading and diarization. 🔥 2024/02/22: Added FreeVC for voice imitation, fixed the voiceless track, and added segment division. Support for new languages. New translations of the GUI. When a subtitle file is provided, no alignment is performed and the media file is not needed to process the SRT file. Burn subtitles to video. The queue can accept multiple tasks simultaneously. Sound alert notification. Continue the process from the last checkpoint. Acceleration rate regulation.