m1guelpf · zaltinsoy · Dec 4, 2023 · Mar 19, 2024 · May 20, 2024 · Jun 7, 2024
diff --git a/.gitignore b/.gitignore
@@ -2,4 +2,3 @@ dist
 .DS_Store
 *.egg-info
 auto_subtitle/__pycache__
-build
diff --git a/README.md b/README.md
@@ -1,12 +1,16 @@
 # Automatic subtitles in your videos
 
+This is a fork of [auto_subtitle](https://github.com/m1guelpf/auto-subtitle), originally developed by [m1guelpf](https://github.com/m1guelpf). `AutoSubZ` adds several features, such as `.vtt` and `.txt` subtitle output, and fixes various bugs.
+
+
+
 This repository uses `ffmpeg` and [OpenAI's Whisper](https://openai.com/blog/whisper) to automatically generate and overlay subtitles on any video.
 
 ## Installation
 
-To get started, you'll need Python 3.7 or newer. Install the binary by running the following command:
+To get started, you'll need Python 3.8 or newer. Install the binary by running the following command:
 
-    pip install git+https://github.com/m1guelpf/auto-subtitle.git
+    pip install git+https://github.com/zaltinsoy/AutoSubZ.git
 
 You'll also need to install [`ffmpeg`](https://ffmpeg.org/), which is available from most package managers:
 
@@ -17,8 +21,9 @@ sudo apt update && sudo apt install ffmpeg
 # on MacOS using Homebrew (https://brew.sh/)
 brew install ffmpeg
 
-# on Windows using Chocolatey (https://chocolatey.org/)
-choco install ffmpeg
+# on Windows using Winget
+winget install -e --id Gyan.FFmpeg
+
 ```
 
 ## Usage

diff --git a/auto_subtitle/cli.py b/auto_subtitle/cli.py
@@ -4,7 +4,7 @@
 import argparse
 import warnings
 import tempfile
-from .utils import filename, str2bool, write_srt
+from .utils import filename, write_srt
 
 
 def main():
@@ -16,24 +16,40 @@ def main():
                         choices=whisper.available_models(), help="name of the Whisper model to use")
     parser.add_argument("--output_dir", "-o", type=str,
                         default=".", help="directory to save the outputs")
-    parser.add_argument("--output_srt", type=str2bool, default=False,
-                        help="whether to output the .srt file along with the video files")
-    parser.add_argument("--srt_only", type=str2bool, default=False,
+    parser.add_argument("--subtitle_format", type=str, default="srt", choices=["srt","vtt"],
+                        help="subtitle file format type")
+    parser.add_argument("--output_mkv", action="store_true",
+                        help="whether to output the new subtitled video as an .mkv container rather than .mp4 container")
+    parser.add_argument("--output_srt", action="store_true",
+                        help="output the .srt file along with the video files")
+    parser.add_argument("--output_txt", action="store_true",
+                        help="whether to also save the subtitles as a .txt file")
+    parser.add_argument("--srt_only", action="store_true",
                         help="only generate the .srt file and not create overlayed video")
-    parser.add_argument("--verbose", type=str2bool, default=False,
-                        help="whether to print out the progress and debug messages")
-
+    parser.add_argument("--verbose", action="store_true",
+                        help="print out the progress and debug messages")
+    
     parser.add_argument("--task", type=str, default="transcribe", choices=[
                         "transcribe", "translate"], help="whether to perform X->X speech recognition ('transcribe') or X->English translation ('translate')")
-    parser.add_argument("--language", type=str, default="auto", choices=["auto","af","am","ar","as","az","ba","be","bg","bn","bo","br","bs","ca","cs","cy","da","de","el","en","es","et","eu","fa","fi","fo","fr","gl","gu","ha","haw","he","hi","hr","ht","hu","hy","id","is","it","ja","jw","ka","kk","km","kn","ko","la","lb","ln","lo","lt","lv","mg","mi","mk","ml","mn","mr","ms","mt","my","ne","nl","nn","no","oc","pa","pl","ps","pt","ro","ru","sa","sd","si","sk","sl","sn","so","sq","sr","su","sv","sw","ta","te","tg","th","tk","tl","tr","tt","uk","ur","uz","vi","yi","yo","zh"], 
-    help="What is the origin language of the video? If unset, it is detected automatically.")
+    parser.add_argument("--language", type=str, default="auto", choices=["auto","af","am","ar","as","az","ba","be","bg","bn","bo","br","bs","ca","cs","cy","da","de","el","en",
+                        "es","et","eu","fa","fi","fo","fr","gl","gu","ha","haw","he","hi","hr","ht","hu","hy","id","is","it","ja","jw","ka","kk","km","kn","ko","la","lb","ln",
+                        "lo","lt","lv","mg","mi","mk","ml","mn","mr","ms","mt","my","ne","nl","nn","no","oc","pa","pl","ps","pt","ro","ru","sa","sd","si","sk","sl","sn","so",
+                        "sq","sr","su","sv","sw","ta","te","tg","th","tk","tl","tr","tt","uk","ur","uz","vi","yi","yo","zh"], 
+                        help="What is the origin language of the video? If unset, it is detected automatically.")
+    parser.add_argument("--word_timestamps", action="store_true", default=False, 
+                        help="(experimental) extract word-level timestamps and refine the results based on them")
 
     args = parser.parse_args().__dict__
     model_name: str = args.pop("model")
     output_dir: str = args.pop("output_dir")
     output_srt: bool = args.pop("output_srt")
+    subtitle_format: str = args.pop("subtitle_format")
+    output_txt: bool = args.pop("output_txt")
     srt_only: bool = args.pop("srt_only")
+    verbose: bool = args["verbose"]
     language: str = args.pop("language")
+    output_mkv: bool = args.pop("output_mkv")
+
 
     os.makedirs(output_dir, exist_ok=True)
 
@@ -48,24 +64,29 @@ def main():
     model = whisper.load_model(model_name)
     audios = get_audio(args.pop("video"))
     subtitles = get_subtitles(
-        audios, output_srt or srt_only, output_dir, lambda audio_path: model.transcribe(audio_path, **args)
+        audios, output_srt or srt_only, subtitle_format,output_txt, output_dir, lambda audio_path: model.transcribe(audio_path, **args)        
     )
 
     if srt_only:
         return
 
-    for path, srt_path in subtitles.items():
-        out_path = os.path.join(output_dir, f"{filename(path)}.mp4")
+    ext = "mkv" if output_mkv else "mp4"
 
-        print(f"Adding subtitles to {filename(path)}...")
+    for path, srt_path in subtitles.items():
 
-        video = ffmpeg.input(path)
-        audio = video.audio
+        out_path = os.path.join(output_dir, f"{os.path.splitext(os.path.basename(path))[0]}.{ext}")
+        print(f"Adding subtitles to {os.path.basename(path)}...")
+
+        stream = ffmpeg.input(path)
+        audio = stream.audio
+        videoWithSub = stream.video.filter('subtitles', filename=srt_path,force_style='OutlineColour=&H40000000,BorderStyle=3') 
 
-        ffmpeg.concat(
-            video.filter('subtitles', srt_path, force_style="OutlineColour=&H40000000,BorderStyle=3"), audio, v=1, a=1
-        ).output(out_path).run(quiet=True, overwrite_output=True)
+        if output_mkv:
+            stream = ffmpeg.output(ffmpeg.input(srt_path), stream, out_path, vcodec='copy', scodec='copy')
+        else:
+            stream = ffmpeg.output(audio, videoWithSub, out_path, vcodec='libx264', acodec='copy')
 
+        ffmpeg.run(stream, quiet= not verbose, overwrite_output=True)
         print(f"Saved subtitled video to {os.path.abspath(out_path)}.")
 
 
@@ -88,12 +109,17 @@ def get_audio(paths):
     return audio_paths
 
 
-def get_subtitles(audio_paths: list, output_srt: bool, output_dir: str, transcribe: callable):
+
+def get_subtitles(audio_paths: list, output_srt: bool,subtitle_format: str,output_txt: bool, output_dir: str, transcribe: callable):
     subtitles_path = {}
 
     for path, audio_path in audio_paths.items():
         srt_path = output_dir if output_srt else tempfile.gettempdir()
-        srt_path = os.path.join(srt_path, f"{filename(path)}.srt")
+
+        if(subtitle_format=="srt"):
+            srt_path = os.path.join(srt_path, f"{filename(path)}.srt")
+        else: # vtt 
+            srt_path = os.path.join(srt_path, f"{filename(path)}.vtt")
 
         print(
             f"Generating subtitles for {filename(path)}... This might take a while."
@@ -104,12 +130,19 @@ def get_subtitles(audio_paths: list, output_srt: bool, output_dir: str, transcri
         warnings.filterwarnings("default")
 
         with open(srt_path, "w", encoding="utf-8") as srt:
-            write_srt(result["segments"], file=srt)
+            write_srt(result["segments"], file=srt,subtitle_format=subtitle_format)
 
         subtitles_path[path] = srt_path
 
+        if output_txt:
+            text_path = os.path.join(output_dir, f"{filename(path)}.txt")
+            with open(text_path, "w", encoding="utf-8") as text_file:
+                for segment in result["segments"]:
+                    print(segment["text"], file=text_file)
+            print(f"Saving subtitles to text file: {text_path}")
+
     return subtitles_path
 
 
 if __name__ == '__main__':
-    main()
+    main()
diff --git a/auto_subtitle/utils.py b/auto_subtitle/utils.py
@@ -2,18 +2,7 @@
 from typing import Iterator, TextIO
 
 
-def str2bool(string):
-    string = string.lower()
-    str2val = {"true": True, "false": False}
-
-    if string in str2val:
-        return str2val[string]
-    else:
-        raise ValueError(
-            f"Expected one of {set(str2val.keys())}, got {string}")
-
-
-def format_timestamp(seconds: float, always_include_hours: bool = False):
+def format_timestamp(seconds: float, always_include_hours: bool = False,subtitle_format: str = "srt"):
     assert seconds >= 0, "non-negative timestamp expected"
     milliseconds = round(seconds * 1000.0)
 
@@ -27,20 +16,40 @@ def format_timestamp(seconds: float, always_include_hours: bool = False):
     milliseconds -= seconds * 1_000
 
     hours_marker = f"{hours:02d}:" if always_include_hours or hours > 0 else ""
-    return f"{hours_marker}{minutes:02d}:{seconds:02d},{milliseconds:03d}"
-
 
-def write_srt(transcript: Iterator[dict], file: TextIO):
-    for i, segment in enumerate(transcript, start=1):
-        print(
-            f"{i}\n"
-            f"{format_timestamp(segment['start'], always_include_hours=True)} --> "
-            f"{format_timestamp(segment['end'], always_include_hours=True)}\n"
-            f"{segment['text'].strip().replace('-->', '->')}\n",
-            file=file,
-            flush=True,
-        )
+    #return f"{hours_marker}{minutes:02d}:{seconds:02d},{milliseconds:03d}"
+    if subtitle_format == "srt":
+        output=f"{hours_marker}{minutes:02d}:{seconds:02d},{milliseconds:03d}"
+    else:
+        output= f"{hours:02d}:{minutes:02d}:{seconds:02d}.{milliseconds:03d}"
+
+    return output
+
+
+def write_srt(transcript: Iterator[dict], file: TextIO,subtitle_format: str = "srt"):
+
+    if subtitle_format == "vtt":         
+        print("WEBVTT\n", file=file)
+        for i, segment in enumerate(transcript, start=1):
+            print(
+                f"{format_timestamp(segment['start'], always_include_hours=True,subtitle_format=subtitle_format)} --> "
+                f"{format_timestamp(segment['end'], always_include_hours=True,subtitle_format=subtitle_format)}\n"
+                f"{segment['text'].strip().replace('-->', '->')}\n",
+                file=file,
+                flush=True,
+            )
+
+    else: #srt
+        for i, segment in enumerate(transcript, start=1):        
+            print(
+                f"{i}\n"
+                f"{format_timestamp(segment['start'], always_include_hours=True,subtitle_format=subtitle_format)} --> "
+                f"{format_timestamp(segment['end'], always_include_hours=True,subtitle_format=subtitle_format)}\n"
+                f"{segment['text'].strip().replace('-->', '->')}\n",
+                file=file,
+                flush=True,
+            )
 
 
 def filename(path):
-    return os.path.splitext(os.path.basename(path))[0]
+    return os.path.splitext(os.path.basename(path))[0]
diff --git a/requirements.txt b/requirements.txt
@@ -1 +1,3 @@
 openai-whisper
+ffmpeg-python
+numpy<2.0.0
diff --git a/setup.py b/setup.py
@@ -8,6 +8,8 @@
     author="Miguel Piedrafita",
     install_requires=[
         'openai-whisper',
+        'ffmpeg-python',
+        'numpy<2.0.0'
     ],
     description="Automatically generate and embed subtitles into your videos",
     entry_points={