From 40bb833ac469152073f90348b891e3aa889e2a85 Mon Sep 17 00:00:00 2001
From: Abhiroop Talasila
Date: Thu, 3 Feb 2022 00:01:57 +0530
Subject: [PATCH] Add support for Coqui STT and .tflite models

- By default, Coqui will be used for inference, with an option to switch to DeepSpeech
- Coqui supports .tflite models out-of-the-box, whereas DeepSpeech needs a different package. Refer #41
- English models will be automatically downloaded if run without the model argument
- Updated README and requirements.txt to reflect changes
---
 README.md                    | 15 ++++---
 autosub/audioProcessing.py   | 23 ++++++++-----
 autosub/featureExtraction.py |  8 -----
 autosub/main.py              | 13 ++++---
 autosub/segmentAudio.py      |  4 ---
 autosub/trainAudio.py        |  2 --
 autosub/utils.py             | 67 ++++++++++++++++++++++++++++--------
 requirements-gpu.txt         |  2 ++
 requirements.txt             |  1 +
 9 files changed, 86 insertions(+), 49 deletions(-)

diff --git a/README.md b/README.md
index 43e5be6..75135df 100644
--- a/README.md
+++ b/README.md
@@ -12,7 +12,7 @@
 
 ## About
 
-AutoSub is a CLI application to generate subtitle files (.srt, .vtt, and .txt transcript) for any video file using [Mozilla DeepSpeech](https://github.com/mozilla/DeepSpeech). I use the DeepSpeech Python API to run inference on audio segments and [pyAudioAnalysis](https://github.com/tyiannak/pyAudioAnalysis) to split the initial audio on silent segments, producing multiple small files.
+AutoSub is a CLI application to generate subtitle files (.srt, .vtt, and .txt transcript) for any video file using either [Mozilla DeepSpeech](https://github.com/mozilla/DeepSpeech) or [Coqui STT](https://github.com/coqui-ai/STT). I use their open-source models to run inference on audio segments and [pyAudioAnalysis](https://github.com/tyiannak/pyAudioAnalysis) to split the initial audio on silent segments, producing multiple smaller files (which makes inference easier).
 
 ⭐ Featured in [DeepSpeech Examples](https://github.com/mozilla/DeepSpeech-examples) by Mozilla
 
@@ -35,15 +35,16 @@ AutoSub is a CLI application to generate subtitle files (.srt, .vtt, and .txt tr
   OR
   $ pip3 install -r requirements-gpu.txt
   ```
-* Use `getmodels.sh` to download the model and scorer files with the version number as argument
-  ```bash
-  $ ./getmodels.sh 0.9.3
-  ```
 * Install FFMPEG. If you're on Ubuntu, this should work fine
   ```bash
   $ sudo apt-get install ffmpeg
   $ ffmpeg -version    # I'm running 4.1.4
   ```
+* By default, if no model files are found in the root directory, the script will download the v0.9.3 models for DeepSpeech, or the .tflite model and huge-vocabulary scorer for Coqui. Use `getmodels.sh` to download the DeepSpeech model and scorer files, passing the version number as argument. For Coqui, download from [here](https://coqui.ai/models)
+  ```bash
+  $ ./getmodels.sh 0.9.3
+  ```
+* For .tflite models with DeepSpeech, follow [this](https://github.com/abhirooptalasila/AutoSub/issues/41#issuecomment-968847604)
 
 ## Docker
 
@@ -72,7 +73,9 @@ AutoSub is a CLI application to generate subtitle files (.srt, .vtt, and .txt tr
 
 ## How-to example
 
-* The model files should be in the repo root directory and will be loaded automatically. But incase you have multiple versions, use the `--model` and `--scorer` args while executing
+* The model files should be in the repo root directory and will be loaded/downloaded automatically. In case you have multiple versions, use the `--model` and `--scorer` args while executing
+* By default, Coqui is used for inference. You can change this by using the `--engine` argument with value `"ds"` for DeepSpeech
+* For languages other than English, you'll need to manually download the model and scorer files. Check [here](https://discourse.mozilla.org/t/links-to-pretrained-models/62688) for DeepSpeech and [here](https://coqui.ai/models) for Coqui
 * After following the installation instructions, you can run `autosub/main.py` as given below. The `--file` argument is the video file for which subtitles are to be generated
   ```bash
   $ python3 autosub/main.py --file ~/movie.mp4
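For illustration, here is how the new defaults described in the README changes play out on the command line. The paths are placeholders; the DeepSpeech file names are the v0.9.3 release names referenced elsewhere in this patch.

```bash
# Coqui STT is the default engine; English model and scorer are downloaded automatically if missing
$ python3 autosub/main.py --file ~/movie.mp4

# Switch to DeepSpeech and use locally downloaded v0.9.3 files
$ ./getmodels.sh 0.9.3
$ python3 autosub/main.py --file ~/movie.mp4 --engine ds \
    --model deepspeech-0.9.3-models.pbmm --scorer deepspeech-0.9.3-models.scorer
```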
diff --git a/autosub/audioProcessing.py b/autosub/audioProcessing.py
index 6e6ae54..92c726b 100644
--- a/autosub/audioProcessing.py
+++ b/autosub/audioProcessing.py
@@ -2,19 +2,26 @@
 # -*- coding: utf-8 -*-
 
 import sys
+import shlex
 import logger
 import subprocess
 import numpy as np
 from os.path import basename
 
+try:
+    from shlex import quote
+except ImportError:
+    from pipes import quote
+
 _logger = logger.setup_applevel_logger(__name__)
 
+
 def extract_audio(input_file, audio_file_name):
     """Extract audio from input video file and save to audio/ in root dir
 
     Args:
-        input_file: input video file
-        audio_file_name: save audio WAV file with same filename as video file
+        input_file : input video file
+        audio_file_name : save audio WAV file with same filename as video file
     """
 
     try:
@@ -32,11 +39,11 @@ def convert_samplerate(audio_path, desired_sample_rate):
     ***WONT be called as extract_audio() converts the audio to 16kHz while saving***
 
     Args:
-        audio_path: audio file path
-        desired_sample_rate: DeepSpeech expects 16kHz
+        audio_path : audio file path
+        desired_sample_rate : DeepSpeech expects 16kHz
 
     Returns:
-        numpy buffer: audio signal stored in numpy array
+        numpy buffer : audio signal stored in numpy array
     """
 
     sox_cmd = "sox {} --type raw --bits 16 --channels 1 --rate {} --encoding signed-integer \
@@ -46,9 +53,7 @@ def convert_samplerate(audio_path, desired_sample_rate):
         output = subprocess.check_output(
             shlex.split(sox_cmd), stderr=subprocess.PIPE)
     except subprocess.CalledProcessError as e:
-        raise RuntimeError("SoX returned non-zero status: {}".format(e.stderr))
+        raise RuntimeError(f"SoX returned non-zero status: {e.stderr}")
     except OSError as e:
-        raise OSError(e.errno, "SoX not found, use {}hz files or install it: {}".format(
-            desired_sample_rate, e.strerror))
-
+        raise OSError(e.errno, f"SoX not found, use {desired_sample_rate}hz files or install it: {e.strerror}")
     return np.frombuffer(output, np.int16)
diff --git a/autosub/featureExtraction.py b/autosub/featureExtraction.py
index da15855..10b1d92 100644
--- a/autosub/featureExtraction.py
+++ b/autosub/featureExtraction.py
@@ -43,7 +43,6 @@ def energy_entropy(frame, n_short_blocks=10):
 
     # Compute entropy of the normalized sub-frame energies:
     entropy = -np.sum(s * np.log2(s + eps))
-
     return entropy
 
 
@@ -71,7 +70,6 @@ def spectral_centroid_spread(fft_magnitude, sampling_rate):
     # Normalize:
     centroid = centroid / (sampling_rate / 2.0)
     spread = spread / (sampling_rate / 2.0)
-
     return centroid, spread
 
 
@@ -98,7 +96,6 @@ def spectral_entropy(signal, n_short_blocks=10):
 
     # compute spectral entropy
    entropy = -np.sum(s * np.log2(s + eps))
-
     return entropy
 
 
@@ -116,7 +113,6 @@ def spectral_flux(fft_magnitude, previous_fft_magnitude):
 
     sp_flux = np.sum(
         (fft_magnitude / fft_sum - previous_fft_magnitude / previous_fft_sum) ** 2)
-
     return sp_flux
 
 
@@ -135,7 +131,6 @@ def spectral_rolloff(signal, c):
         sp_rolloff = np.float64(a[0]) / (float(fft_length))
     else:
         sp_rolloff = 0.0
-
     return sp_rolloff
 
 
@@ -220,7 +215,6 @@ def chroma_features_init(num_fft, sampling_rate):
     for u in unique_chroma:
         idx = np.nonzero(num_chroma == u)
         num_freqs_per_chroma[idx] = idx[0].shape
-
     return num_chroma, num_freqs_per_chroma
 
 
@@ -262,7 +256,6 @@ def chroma_features(signal, sampling_rate, num_fft):
     #     ax.set_yticklabels(xaxis)
     #     plt.show(block=False)
     #     plt.draw()
-
     return chroma_names, final_matrix
 
 
@@ -411,5 +404,4 @@ def feature_extraction(signal, sampling_rate, window, step, deltas=True):
         fft_magnitude_previous = fft_magnitude.copy()
 
     features = np.concatenate(features, 1)
-
     return features, feature_names
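The audioProcessing.py hunk above adds `import shlex` and a guarded import of `quote` (falling back to `pipes.quote` on older interpreters), but the command that actually uses `quote()` inside `extract_audio()` falls outside the context shown. Below is a minimal sketch of the kind of quoting this enables when building the ffmpeg call; the helper name and the exact ffmpeg flags are assumptions, not the patch's code.

```python
import subprocess

try:
    from shlex import quote      # Python 3.3+
except ImportError:
    from pipes import quote      # legacy fallback, mirroring the patch

def build_ffmpeg_cmd(input_file, audio_file_name):
    # Illustrative only: extract_audio()'s real flags are not visible in the hunk above.
    # quote() protects paths containing spaces or shell metacharacters.
    return ("ffmpeg -hide_banner -loglevel warning -i {} "
            "-ac 1 -ar 16000 -vn {}").format(quote(input_file), quote(audio_file_name))

cmd = build_ffmpeg_cmd("my movie.mp4", "audio/my movie.wav")
subprocess.run(cmd, shell=True, check=True)   # raises CalledProcessError on failure
```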
diff --git a/autosub/main.py b/autosub/main.py
index dbd2a90..5f908e3 100644
--- a/autosub/main.py
+++ b/autosub/main.py
@@ -21,6 +21,7 @@
 # Line count for SRT file
 line_count = 1
 
+
 def ds_process_audio(ds, audio_file, output_file_handle_dict, split_duration):
     """sttWithMetadata() will run DeepSpeech inference on each audio file generated after
     remove_silent_segments. These files contain start and end
@@ -80,17 +81,19 @@ def main():
     global line_count
     supported_output_formats = ["srt", "vtt", "txt"]
+    supported_engines = ["ds", "stt"]
 
     parser = argparse.ArgumentParser(description="AutoSub")
 
     parser.add_argument("--format", choices=supported_output_formats, nargs="+",
                         help="Create only certain output formats rather than all formats",
                         default=supported_output_formats)
-    parser.add_argument("--split-duration", dest="split_duration", type=float,
-                        help="Split run-on sentences exceededing this duration (in seconds) into multiple subtitles",
-                        default=5)
+    parser.add_argument("--split-duration", dest="split_duration", type=float, default=5,
+                        help="Split run-on sentences exceeding this duration (in seconds) into multiple subtitles")
     parser.add_argument("--dry-run", dest="dry_run", action="store_true",
                         help="Perform dry-run to verify options prior to running. Also useful to instantiate \
                         cuda/tensorflow cache prior to running multiple times")
+    parser.add_argument("--engine", choices=supported_engines, nargs="?", default="stt",
+                        help="Select either DeepSpeech or Coqui STT for inference. The latter is the default")
     parser.add_argument("--file", required=False, help="Input video file")
     parser.add_argument("--model", required=False, help="Input *.pbmm model file")
     parser.add_argument("--scorer", required=False, help="Input *.scorer file")
@@ -104,7 +107,7 @@ def main():
     ds_scorer = get_model(args, "scorer")
 
     if args.dry_run:
-        create_model(ds_model, ds_scorer)
+        create_model(args.engine, ds_model, ds_scorer)
         if args.file is not None:
             if not os.path.isfile(args.file):
                 _logger.warn(f"Invalid file: {args.file}")
@@ -151,7 +154,7 @@
         audiofiles.remove(os.path.basename(audio_file_name))
 
     _logger.info("Running inference...")
-    ds = create_model(ds_model, ds_scorer)
+    ds = create_model(args.engine, ds_model, ds_scorer)
 
     for filename in tqdm(audiofiles):
         audio_segment_path = os.path.join(audio_directory, filename)
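For reference, here is a minimal sketch of the per-segment inference that `ds_process_audio()` wraps, assuming a 16 kHz mono WAV produced by the silence splitting. The file names are illustrative, and both engines expose the same `Model.stt()` / `enableExternalScorer()` calls used below.

```python
import wave
import numpy as np
from stt import Model            # Coqui STT; for DeepSpeech use: from deepspeech import Model

# Illustrative paths -- main.py resolves these via utils.get_model()
model = Model("model.tflite")
model.enableExternalScorer("huge-vocabulary.scorer")

# Hypothetical segment written by the silence splitter (start/end times encoded in the name)
with wave.open("audio/movie/23.45-27.10.wav", "rb") as f:
    audio = np.frombuffer(f.readframes(f.getnframes()), np.int16)

print(model.stt(audio))          # inferred text for this one segment
```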
diff --git a/autosub/segmentAudio.py b/autosub/segmentAudio.py
index 556fe12..1fcf6b2 100644
--- a/autosub/segmentAudio.py
+++ b/autosub/segmentAudio.py
@@ -42,7 +42,6 @@ def read_audio_file(input_file):
 
     if signal.ndim == 2 and signal.shape[1] == 1:
         signal = signal.flatten()
-
     return sampling_rate, signal
 
 
@@ -58,7 +57,6 @@ def smooth_moving_avg(signal, window=11):
                   signal, 2 * signal[-1] - signal[-1:-window:-1]]
     w = np.ones(window, 'd')
     y = np.convolve(w / w.sum(), s, mode='same')
-
     return y[window:-window + 1]
 
 
@@ -75,7 +73,6 @@ def stereo_to_mono(signal):
     else:
         if signal.shape[1] == 2:
             signal = (signal[:, 1] / 2) + (signal[:, 0] / 2)
-
     return signal
 
 
@@ -185,7 +182,6 @@ def silence_removal(signal, sampling_rate, st_win, st_step, smooth_window=0.5,
         if s_lim[1] - s_lim[0] > min_duration:
             seg_limits_2.append(s_lim)
     seg_limits = seg_limits_2
-
     return seg_limits
 
 
diff --git a/autosub/trainAudio.py b/autosub/trainAudio.py
index 108c497..3e8a9c7 100644
--- a/autosub/trainAudio.py
+++ b/autosub/trainAudio.py
@@ -35,7 +35,6 @@ def train_svm(features, c_param, kernel='linear'):
     svm = sklearn.svm.SVC(C=c_param, kernel=kernel,
                           probability=True, gamma='auto')
     svm.fit(feature_matrix, labels)
-
     return svm
 
 
@@ -95,5 +94,4 @@ def features_to_matrix(features):
         else:
             feature_matrix = np.vstack((feature_matrix, f))
             labels = np.append(labels, i * np.ones((len(f), 1)))
-
     return feature_matrix, labels
diff --git a/autosub/utils.py b/autosub/utils.py
index 2b86801..da1cf7f 100644
--- a/autosub/utils.py
+++ b/autosub/utils.py
@@ -6,9 +6,22 @@
 import sys
 import shutil
 import logger
-from deepspeech import Model
+import subprocess
+from stt import Model as SModel
+from deepspeech import Model as DModel
 
 _logger = logger.setup_applevel_logger(__name__)
 
+_models = {
+    "ds": {
+        "model": "https://github.com/mozilla/DeepSpeech/releases/download/v0.9.3/deepspeech-0.9.3-models.pbmm",
+        "scorer": "https://github.com/mozilla/DeepSpeech/releases/download/v0.9.3/deepspeech-0.9.3-models.scorer"
+    },
+    "stt": {
+        "model": "https://github.com/coqui-ai/STT-models/releases/download/english/coqui/v0.9.3/model.tflite",
+        "scorer": "https://github.com/coqui-ai/STT-models/releases/download/english%2Fcoqui%2Fv1.0.0-huge-vocab/huge-vocabulary.scorer"
+    }
+}
+
 
 def sort_alphanumeric(data):
     """Sort function to sort os.listdir() alphanumerically
@@ -20,9 +33,9 @@
 
     convert = lambda text: int(text) if text.isdigit() else text.lower()
     alphanum_key = lambda key: [convert(c) for c in re.split('([0-9]+)', key)]
-
     return sorted(data, key=alphanum_key)
 
+
 def clean_folder(folder):
     """Delete everything inside a folder
@@ -40,6 +53,26 @@
         except Exception as e:
             _logger.warn(f"Failed to delete {file_path}. Reason: {e}")
 
+
+def download_model(engine, fname):
+    """Download model files, if not available locally
+
+    Args:
+        engine : "ds" for DeepSpeech and "stt" for Coqui STT
+        fname : either of "model" or "scorer"
+    """
+
+    _logger.info(f"{fname.capitalize()} not found locally. Downloading")
+    try:
+        _file = _models[engine][fname]
+        command = ["wget", _file, "-q", "--show-progress"]
+        ret = subprocess.run(command).returncode
+    except Exception as e:
+        _logger.error(str(e))
+        sys.exit(1)
+    return _file.split("/")[-1]
+
+
 def get_model(args, arg_name):
     """Will prioritze supplied arguments but if not, try to find files
@@ -48,10 +81,13 @@
         arg_name : either model or scorer file
     """
 
-    if arg_name == 'model':
-        arg_extension = '.pbmm'
-    elif arg_name == 'scorer':
-        arg_extension = '.scorer'
+    if arg_name == "model":
+        if args.engine == "ds":
+            arg_extension = ".pbmm"
+        else:
+            arg_extension = ".tflite"
+    elif arg_name == "scorer":
+        arg_extension = ".scorer"
 
     arg = args.__getattribute__(arg_name)
@@ -65,12 +101,8 @@
         num_models = len(models)
 
         if num_models == 0:
-            _logger.warn(f"No {arg_name}s specified via --{arg_name} and none found in local directory. Please run getmodel.sh to get some")
-            if arg_name == 'model':
-                _logger.error("Must specify pbmm model")
-                sys.exit(1)
-            else:
-                model = ''
+            model = download_model(args.engine, arg_name)
+
         elif num_models != 1:
             _logger.warn(f"Detected {num_models} {arg_name} files in local dir")
             if arg_name == 'model':
@@ -85,16 +117,21 @@
     _logger.info(f"{arg_name.capitalize()}: {model}")
     return(model)
 
-def create_model(model, scorer):
+
+def create_model(engine, model, scorer):
     """Instantiate model and scorer
 
     Args:
+        engine : "ds" for DeepSpeech and "stt" for Coqui STT
         model : .pbmm model file
         scorer : .scorer file
     """
 
     try:
-        ds = Model(model)
+        if engine == "ds":
+            ds = DModel(model)
+        else:
+            ds = SModel(model)
     except:
         _logger.error("Invalid model file")
         sys.exit(1)
@@ -103,4 +140,4 @@
         ds.enableExternalScorer(scorer)
     except:
         _logger.warn("Invalid scorer file. Running inference using only model file")
-    return(ds)
\ No newline at end of file
+    return(ds)
diff --git a/requirements-gpu.txt b/requirements-gpu.txt
index f3de28b..25fb60c 100644
--- a/requirements-gpu.txt
+++ b/requirements-gpu.txt
@@ -1,5 +1,7 @@
 cycler==0.10.0
 numpy
+stt==1.0.0
+tensorflow-gpu==1.15
 deepspeech-gpu==0.9.3
 joblib==0.16.0
 kiwisolver==1.2.0
diff --git a/requirements.txt b/requirements.txt
index 5fd05e1..7144684 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -1,5 +1,6 @@
 cycler==0.10.0
 numpy
+stt==1.0.0
 deepspeech==0.9.3
 joblib==0.16.0
 kiwisolver==1.2.0
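Finally, a hedged sketch of how the new `utils` helpers could be exercised on their own, outside `main.py`. `SimpleNamespace` stands in for the argparse namespace that `main.py` normally passes in, and the attribute names mirror the `--engine`, `--model`, and `--scorer` arguments added above; run from the repo root so any locally present model files are found, as in normal usage.

```python
import sys
from types import SimpleNamespace

sys.path.append("autosub")   # utils.py uses flat imports (e.g. "import logger"), like main.py
import utils

# Stand-in for the argparse.Namespace built in main.py; values here are illustrative.
args = SimpleNamespace(engine="stt", model=None, scorer=None)

model_path = utils.get_model(args, "model")    # downloads model.tflite via wget if none is found locally
scorer_path = utils.get_model(args, "scorer")  # likewise for huge-vocabulary.scorer
ds = utils.create_model(args.engine, model_path, scorer_path)
print(type(ds))                                # stt.Model for Coqui, deepspeech.Model for DeepSpeech
```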