From 40bb833ac469152073f90348b891e3aa889e2a85 Mon Sep 17 00:00:00 2001
From: Abhiroop Talasila
Date: Thu, 3 Feb 2022 00:01:57 +0530
Subject: [PATCH] Add support for Coqui STT and .tflite models

- By default, Coqui will be used for inference, with an option to switch to DeepSpeech
- Coqui supports .tflite models out-of-the-box, whereas DeepSpeech needs a different package. Refer #41
- English models will be automatically downloaded if run without the model argument
- Updated README and requirements.txt to reflect changes
---
 README.md                    | 15 ++++---
 autosub/audioProcessing.py   | 23 ++++++++-----
 autosub/featureExtraction.py |  8 -----
 autosub/main.py              | 13 ++++---
 autosub/segmentAudio.py      |  4 ---
 autosub/trainAudio.py        |  2 --
 autosub/utils.py             | 67 ++++++++++++++++++++++++++++--------
 requirements-gpu.txt         |  2 ++
 requirements.txt             |  1 +
 9 files changed, 86 insertions(+), 49 deletions(-)

diff --git a/README.md b/README.md
index 43e5be6..75135df 100644
--- a/README.md
+++ b/README.md
@@ -12,7 +12,7 @@
 
 ## About
 
-AutoSub is a CLI application to generate subtitle files (.srt, .vtt, and .txt transcript) for any video file using [Mozilla DeepSpeech](https://github.com/mozilla/DeepSpeech). I use the DeepSpeech Python API to run inference on audio segments and [pyAudioAnalysis](https://github.com/tyiannak/pyAudioAnalysis) to split the initial audio on silent segments, producing multiple small files.
+AutoSub is a CLI application to generate subtitle files (.srt, .vtt, and .txt transcript) for any video file using either [Mozilla DeepSpeech](https://github.com/mozilla/DeepSpeech) or [Coqui STT](https://github.com/coqui-ai/STT). I use their open-source models to run inference on audio segments and [pyAudioAnalysis](https://github.com/tyiannak/pyAudioAnalysis) to split the initial audio on silent segments, producing multiple smaller files (which makes inference easier).
 
 ⭐ Featured in [DeepSpeech Examples](https://github.com/mozilla/DeepSpeech-examples) by Mozilla
 
@@ -35,15 +35,16 @@ AutoSub is a CLI application to generate subtitle files (.srt, .vtt, and .txt tr
   OR
   $ pip3 install -r requirements-gpu.txt
   ```
-* Use `getmodels.sh` to download the model and scorer files with the version number as argument
-  ```bash
-  $ ./getmodels.sh 0.9.3
-  ```
 * Install FFMPEG. If you're on Ubuntu, this should work fine
   ```bash
   $ sudo apt-get install ffmpeg
   $ ffmpeg -version    # I'm running 4.1.4
   ```
+* By default, if no model files are found in the root directory, the script will download the v0.9.3 models for DeepSpeech, or the .tflite model and huge-vocabulary scorer for Coqui. Use `getmodels.sh` to download the DeepSpeech model and scorer files, passing the version number as argument. For Coqui, download from [here](https://coqui.ai/models)
+  ```bash
+  $ ./getmodels.sh 0.9.3
+  ```
+* For .tflite models with DeepSpeech, follow [this](https://github.com/abhirooptalasila/AutoSub/issues/41#issuecomment-968847604)
 
 ## Docker
 
@@ -72,7 +73,9 @@ AutoSub is a CLI application to generate subtitle files (.srt, .vtt, and .txt tr
 
 ## How-to example
 
-* The model files should be in the repo root directory and will be loaded automatically. But incase you have multiple versions, use the `--model` and `--scorer` args while executing
+* The model files should be in the repo root directory and will be loaded/downloaded automatically. In case you have multiple versions, use the `--model` and `--scorer` args while executing
+* By default, Coqui is used for inference. You can change this by using the `--engine` argument with value `"ds"` for DeepSpeech
+* For languages other than English, you'll need to manually download the model and scorer files. Check [here](https://discourse.mozilla.org/t/links-to-pretrained-models/62688) for DeepSpeech and [here](https://coqui.ai/models) for Coqui
 * After following the installation instructions, you can run `autosub/main.py` as given below. The `--file` argument is the video file for which subtitles are to be generated
   ```bash
   $ python3 autosub/main.py --file ~/movie.mp4
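For illustration, here is how the new defaults described in the README changes play out on the command line. The paths are placeholders; the DeepSpeech file names are the v0.9.3 release names referenced elsewhere in this patch.

```bash
# Coqui STT is the default engine; English model and scorer are downloaded automatically if missing
$ python3 autosub/main.py --file ~/movie.mp4

# Switch to DeepSpeech and use locally downloaded v0.9.3 files
$ ./getmodels.sh 0.9.3
$ python3 autosub/main.py --file ~/movie.mp4 --engine ds \
    --model deepspeech-0.9.3-models.pbmm --scorer deepspeech-0.9.3-models.scorer
```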
diff --git a/autosub/audioProcessing.py b/autosub/audioProcessing.py
index 6e6ae54..92c726b 100644
--- a/autosub/audioProcessing.py
+++ b/autosub/audioProcessing.py
@@ -2,19 +2,26 @@
 # -*- coding: utf-8 -*-
 
 import sys
+import shlex
 import logger
 import subprocess
 import numpy as np
 from os.path import basename
 
+try:
+    from shlex import quote
+except ImportError:
+    from pipes import quote
+
 _logger = logger.setup_applevel_logger(__name__)
 
+
 def extract_audio(input_file, audio_file_name):
     """Extract audio from input video file and save to audio/ in root dir
 
     Args:
-        input_file: input video file
-        audio_file_name: save audio WAV file with same filename as video file
+        input_file : input video file
+        audio_file_name : save audio WAV file with same filename as video file
     """
 
     try:
@@ -32,11 +39,11 @@ def convert_samplerate(audio_path, desired_sample_rate):
     ***WONT be called as extract_audio() converts the audio to 16kHz while saving***
 
     Args:
-        audio_path: audio file path
-        desired_sample_rate: DeepSpeech expects 16kHz
+        audio_path : audio file path
+        desired_sample_rate : DeepSpeech expects 16kHz
 
     Returns:
-        numpy buffer: audio signal stored in numpy array
+        numpy buffer : audio signal stored in numpy array
     """
 
     sox_cmd = "sox {} --type raw --bits 16 --channels 1 --rate {} --encoding signed-integer \
@@ -46,9 +53,7 @@ def convert_samplerate(audio_path, desired_sample_rate):
         output = subprocess.check_output(
             shlex.split(sox_cmd), stderr=subprocess.PIPE)
     except subprocess.CalledProcessError as e:
-        raise RuntimeError("SoX returned non-zero status: {}".format(e.stderr))
+        raise RuntimeError(f"SoX returned non-zero status: {e.stderr}")
     except OSError as e:
-        raise OSError(e.errno, "SoX not found, use {}hz files or install it: {}".format(
-            desired_sample_rate, e.strerror))
-
+        raise OSError(e.errno, f"SoX not found, use {desired_sample_rate}hz files or install it: {e.strerror}")
     return np.frombuffer(output, np.int16)
diff --git a/autosub/featureExtraction.py b/autosub/featureExtraction.py
index da15855..10b1d92 100644
--- a/autosub/featureExtraction.py
+++ b/autosub/featureExtraction.py
@@ -43,7 +43,6 @@ def energy_entropy(frame, n_short_blocks=10):
 
     # Compute entropy of the normalized sub-frame energies:
     entropy = -np.sum(s * np.log2(s + eps))
-
     return entropy
 
 
@@ -71,7 +70,6 @@ def spectral_centroid_spread(fft_magnitude, sampling_rate):
     # Normalize:
     centroid = centroid / (sampling_rate / 2.0)
     spread = spread / (sampling_rate / 2.0)
-
     return centroid, spread
 
 
@@ -98,7 +96,6 @@ def spectral_entropy(signal, n_short_blocks=10):
 
     # compute spectral entropy
    entropy = -np.sum(s * np.log2(s + eps))
-
     return entropy
 
 
@@ -116,7 +113,6 @@ def spectral_flux(fft_magnitude, previous_fft_magnitude):
 
     sp_flux = np.sum(
         (fft_magnitude / fft_sum - previous_fft_magnitude / previous_fft_sum) ** 2)
-
     return sp_flux
 
 
@@ -135,7 +131,6 @@ def spectral_rolloff(signal, c):
         sp_rolloff = np.float64(a[0]) / (float(fft_length))
     else:
         sp_rolloff = 0.0
-
     return sp_rolloff
 
 
@@ -220,7 +215,6 @@ def chroma_features_init(num_fft, sampling_rate):
     for u in unique_chroma:
         idx = np.nonzero(num_chroma == u)
         num_freqs_per_chroma[idx] = idx[0].shape
-
     return num_chroma, num_freqs_per_chroma
 
 
@@ -262,7 +256,6 @@ def chroma_features(signal, sampling_rate, num_fft):
     #     ax.set_yticklabels(xaxis)
     #     plt.show(block=False)
     #     plt.draw()
-
     return chroma_names, final_matrix
 
 
@@ -411,5 +404,4 @@ def feature_extraction(signal, sampling_rate, window, step, deltas=True):
         fft_magnitude_previous = fft_magnitude.copy()
 
     features = np.concatenate(features, 1)
-
     return features, feature_names
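The audioProcessing.py hunk above adds `import shlex` and a guarded import of `quote` (falling back to `pipes.quote` on older interpreters), but the command that actually uses `quote()` inside `extract_audio()` falls outside the context shown. Below is a minimal sketch of the kind of quoting this enables when building the ffmpeg call; the helper name and the exact ffmpeg flags are assumptions, not the patch's code.

```python
import subprocess

try:
    from shlex import quote      # Python 3.3+
except ImportError:
    from pipes import quote      # legacy fallback, mirroring the patch

def build_ffmpeg_cmd(input_file, audio_file_name):
    # Illustrative only: extract_audio()'s real flags are not visible in the hunk above.
    # quote() protects paths containing spaces or shell metacharacters.
    return ("ffmpeg -hide_banner -loglevel warning -i {} "
            "-ac 1 -ar 16000 -vn {}").format(quote(input_file), quote(audio_file_name))

cmd = build_ffmpeg_cmd("my movie.mp4", "audio/my movie.wav")
subprocess.run(cmd, shell=True, check=True)   # raises CalledProcessError on failure
```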
diff --git a/autosub/main.py b/autosub/main.py
index dbd2a90..5f908e3 100644
--- a/autosub/main.py
+++ b/autosub/main.py
@@ -21,6 +21,7 @@
 # Line count for SRT file
 line_count = 1
 
+
 def ds_process_audio(ds, audio_file, output_file_handle_dict, split_duration):
     """sttWithMetadata() will run DeepSpeech inference on each audio file generated after
     remove_silent_segments. These files contain start and end
@@ -80,17 +81,19 @@ def main():
     global line_count
     supported_output_formats = ["srt", "vtt", "txt"]
+    supported_engines = ["ds", "stt"]
 
     parser = argparse.ArgumentParser(description="AutoSub")
 
     parser.add_argument("--format", choices=supported_output_formats, nargs="+",
                         help="Create only certain output formats rather than all formats",
                         default=supported_output_formats)
-    parser.add_argument("--split-duration", dest="split_duration", type=float,
-                        help="Split run-on sentences exceededing this duration (in seconds) into multiple subtitles",
-                        default=5)
+    parser.add_argument("--split-duration", dest="split_duration", type=float, default=5,
+                        help="Split run-on sentences exceeding this duration (in seconds) into multiple subtitles")
     parser.add_argument("--dry-run", dest="dry_run", action="store_true",
                         help="Perform dry-run to verify options prior to running. Also useful to instantiate \
                         cuda/tensorflow cache prior to running multiple times")
+    parser.add_argument("--engine", choices=supported_engines, nargs="?", default="stt",
+                        help="Select either DeepSpeech or Coqui STT for inference. The latter is the default")
     parser.add_argument("--file", required=False, help="Input video file")
     parser.add_argument("--model", required=False, help="Input *.pbmm model file")
     parser.add_argument("--scorer", required=False, help="Input *.scorer file")
@@ -104,7 +107,7 @@ def main():
     ds_scorer = get_model(args, "scorer")
 
     if args.dry_run:
-        create_model(ds_model, ds_scorer)
+        create_model(args.engine, ds_model, ds_scorer)
         if args.file is not None:
             if not os.path.isfile(args.file):
                 _logger.warn(f"Invalid file: {args.file}")
@@ -151,7 +154,7 @@
         audiofiles.remove(os.path.basename(audio_file_name))
 
     _logger.info("Running inference...")
-    ds = create_model(ds_model, ds_scorer)
+    ds = create_model(args.engine, ds_model, ds_scorer)
 
     for filename in tqdm(audiofiles):
         audio_segment_path = os.path.join(audio_directory, filename)
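For reference, here is a minimal sketch of the per-segment inference that `ds_process_audio()` wraps, assuming a 16 kHz mono WAV produced by the silence splitting. The file names are illustrative, and both engines expose the same `Model.stt()` / `enableExternalScorer()` calls used below.

```python
import wave
import numpy as np
from stt import Model            # Coqui STT; for DeepSpeech use: from deepspeech import Model

# Illustrative paths -- main.py resolves these via utils.get_model()
model = Model("model.tflite")
model.enableExternalScorer("huge-vocabulary.scorer")

# Hypothetical segment written by the silence splitter (start/end times encoded in the name)
with wave.open("audio/movie/23.45-27.10.wav", "rb") as f:
    audio = np.frombuffer(f.readframes(f.getnframes()), np.int16)

print(model.stt(audio))          # inferred text for this one segment
```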
diff --git a/autosub/segmentAudio.py b/autosub/segmentAudio.py
index 556fe12..1fcf6b2 100644
--- a/autosub/segmentAudio.py
+++ b/autosub/segmentAudio.py
@@ -42,7 +42,6 @@ def read_audio_file(input_file):
 
     if signal.ndim == 2 and signal.shape[1] == 1:
         signal = signal.flatten()
-
     return sampling_rate, signal
 
 
@@ -58,7 +57,6 @@ def smooth_moving_avg(signal, window=11):
                   signal, 2 * signal[-1] - signal[-1:-window:-1]]
     w = np.ones(window, 'd')
     y = np.convolve(w / w.sum(), s, mode='same')
-
     return y[window:-window + 1]
 
 
@@ -75,7 +73,6 @@ def stereo_to_mono(signal):
     else:
         if signal.shape[1] == 2:
             signal = (signal[:, 1] / 2) + (signal[:, 0] / 2)
-
     return signal
 
 
@@ -185,7 +182,6 @@ def silence_removal(signal, sampling_rate, st_win, st_step, smooth_window=0.5,
         if s_lim[1] - s_lim[0] > min_duration:
             seg_limits_2.append(s_lim)
     seg_limits = seg_limits_2
-
     return seg_limits
 
 
diff --git a/autosub/trainAudio.py b/autosub/trainAudio.py
index 108c497..3e8a9c7 100644
--- a/autosub/trainAudio.py
+++ b/autosub/trainAudio.py
@@ -35,7 +35,6 @@ def train_svm(features, c_param, kernel='linear'):
     svm = sklearn.svm.SVC(C=c_param, kernel=kernel,
                           probability=True, gamma='auto')
     svm.fit(feature_matrix, labels)
-
     return svm
 
 
@@ -95,5 +94,4 @@ def features_to_matrix(features):
         else:
             feature_matrix = np.vstack((feature_matrix, f))
             labels = np.append(labels, i * np.ones((len(f), 1)))
-
     return feature_matrix, labels
diff --git a/autosub/utils.py b/autosub/utils.py
index 2b86801..da1cf7f 100644
--- a/autosub/utils.py
+++ b/autosub/utils.py
@@ -6,9 +6,22 @@
 import sys
 import shutil
 import logger
-from deepspeech import Model
+import subprocess
+from stt import Model as SModel
+from deepspeech import Model as DModel
 
 _logger = logger.setup_applevel_logger(__name__)
 
+_models = {
+    "ds": {
+        "model": "https://github.com/mozilla/DeepSpeech/releases/download/v0.9.3/deepspeech-0.9.3-models.pbmm",
+        "scorer": "https://github.com/mozilla/DeepSpeech/releases/download/v0.9.3/deepspeech-0.9.3-models.scorer"
+    },
+    "stt": {
+        "model": "https://github.com/coqui-ai/STT-models/releases/download/english/coqui/v0.9.3/model.tflite",
+        "scorer": "https://github.com/coqui-ai/STT-models/releases/download/english%2Fcoqui%2Fv1.0.0-huge-vocab/huge-vocabulary.scorer"
+    }
+}
+
 
 def sort_alphanumeric(data):
     """Sort function to sort os.listdir() alphanumerically
@@ -20,9 +33,9 @@
 
     convert = lambda text: int(text) if text.isdigit() else text.lower()
     alphanum_key = lambda key: [convert(c) for c in re.split('([0-9]+)', key)]
-
     return sorted(data, key=alphanum_key)
 
+
 def clean_folder(folder):
     """Delete everything inside a folder
@@ -40,6 +53,26 @@
         except Exception as e:
             _logger.warn(f"Failed to delete {file_path}. Reason: {e}")
 
+
+def download_model(engine, fname):
+    """Download model files, if not available locally
+
+    Args:
+        engine : "ds" for DeepSpeech and "stt" for Coqui STT
+        fname : either of "model" or "scorer"
+    """
+
+    _logger.info(f"{fname.capitalize()} not found locally. Downloading")
+    try:
+        _file = _models[engine][fname]
+        command = ["wget", _file, "-q", "--show-progress"]
+        ret = subprocess.run(command).returncode
+    except Exception as e:
+        _logger.error(str(e))
+        sys.exit(1)
+    return _file.split("/")[-1]
+
+
 def get_model(args, arg_name):
     """Will prioritze supplied arguments but if not, try to find files
@@ -48,10 +81,13 @@
         arg_name : either model or scorer file
     """
 
-    if arg_name == 'model':
-        arg_extension = '.pbmm'
-    elif arg_name == 'scorer':
-        arg_extension = '.scorer'
+    if arg_name == "model":
+        if args.engine == "ds":
+            arg_extension = ".pbmm"
+        else:
+            arg_extension = ".tflite"
+    elif arg_name == "scorer":
+        arg_extension = ".scorer"
 
     arg = args.__getattribute__(arg_name)
@@ -65,12 +101,8 @@
         num_models = len(models)
 
         if num_models == 0:
-            _logger.warn(f"No {arg_name}s specified via --{arg_name} and none found in local directory. Please run getmodel.sh to get some")
-            if arg_name == 'model':
-                _logger.error("Must specify pbmm model")
-                sys.exit(1)
-            else:
-                model = ''
+            model = download_model(args.engine, arg_name)
+
         elif num_models != 1:
             _logger.warn(f"Detected {num_models} {arg_name} files in local dir")
             if arg_name == 'model':
@@ -85,16 +117,21 @@
     _logger.info(f"{arg_name.capitalize()}: {model}")
     return(model)
 
-def create_model(model, scorer):
+
+def create_model(engine, model, scorer):
     """Instantiate model and scorer
 
     Args:
+        engine : "ds" for DeepSpeech and "stt" for Coqui STT
         model : .pbmm model file
         scorer : .scorer file
     """
 
     try:
-        ds = Model(model)
+        if engine == "ds":
+            ds = DModel(model)
+        else:
+            ds = SModel(model)
     except:
         _logger.error("Invalid model file")
         sys.exit(1)
@@ -103,4 +140,4 @@
         ds.enableExternalScorer(scorer)
     except:
         _logger.warn("Invalid scorer file. Running inference using only model file")
-    return(ds)
\ No newline at end of file
+    return(ds)
diff --git a/requirements-gpu.txt b/requirements-gpu.txt
index f3de28b..25fb60c 100644
--- a/requirements-gpu.txt
+++ b/requirements-gpu.txt
@@ -1,5 +1,7 @@
 cycler==0.10.0
 numpy
+stt==1.0.0
+tensorflow-gpu==1.15
 deepspeech-gpu==0.9.3
 joblib==0.16.0
 kiwisolver==1.2.0
diff --git a/requirements.txt b/requirements.txt
index 5fd05e1..7144684 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -1,5 +1,6 @@
 cycler==0.10.0
 numpy
+stt==1.0.0
 deepspeech==0.9.3
 joblib==0.16.0
 kiwisolver==1.2.0
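Finally, a hedged sketch of how the new `utils` helpers could be exercised on their own, outside `main.py`. `SimpleNamespace` stands in for the argparse namespace that `main.py` normally passes in, and the attribute names mirror the `--engine`, `--model`, and `--scorer` arguments added above; run from the repo root so any locally present model files are found, as in normal usage.

```python
import sys
from types import SimpleNamespace

sys.path.append("autosub")   # utils.py uses flat imports (e.g. "import logger"), like main.py
import utils

# Stand-in for the argparse.Namespace built in main.py; values here are illustrative.
args = SimpleNamespace(engine="stt", model=None, scorer=None)

model_path = utils.get_model(args, "model")    # downloads model.tflite via wget if none is found locally
scorer_path = utils.get_model(args, "scorer")  # likewise for huge-vocabulary.scorer
ds = utils.create_model(args.engine, model_path, scorer_path)
print(type(ds))                                # stt.Model for Coqui, deepspeech.Model for DeepSpeech
```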