Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

VTT Output support added #89

Open
wants to merge 20 commits into
base: main
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 0 additions & 1 deletion .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -2,4 +2,3 @@ dist
.DS_Store
*.egg-info
auto_subtitle/__pycache__
build
13 changes: 9 additions & 4 deletions README.md
Original file line number Diff line number Diff line change
@@ -1,12 +1,16 @@
# Automatic subtitles in your videos

`AutoSubZ` is a fork of [auto-subtitle](https://github.com/m1guelpf/auto-subtitle) by [m1guelpf](https://github.com/m1guelpf). It adds several features, such as `.vtt` and `.txt` subtitle output, and fixes various bugs.



This repository uses `ffmpeg` and [OpenAI's Whisper](https://openai.com/blog/whisper) to automatically generate and overlay subtitles on any video.

## Installation

To get started, you'll need Python 3.7 or newer. Install the binary by running the following command:
To get started, you'll need Python 3.8 or newer. Install the binary by running the following command:

pip install git+https://github.com/m1guelpf/auto-subtitle.git
pip install git+https://github.com/zaltinsoy/AutoSubZ.git

You'll also need to install [`ffmpeg`](https://ffmpeg.org/), which is available from most package managers:

Expand All @@ -17,8 +21,9 @@ sudo apt update && sudo apt install ffmpeg
# on MacOS using Homebrew (https://brew.sh/)
brew install ffmpeg

# on Windows using Chocolatey (https://chocolatey.org/)
choco install ffmpeg
# on Windows using Winget
winget install -e --id Gyan.FFmpeg

```

## Usage
Expand Down
77 changes: 55 additions & 22 deletions auto_subtitle/cli.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,7 @@
import argparse
import warnings
import tempfile
from .utils import filename, str2bool, write_srt
from .utils import filename, write_srt


def main():
Expand All @@ -16,24 +16,40 @@ def main():
choices=whisper.available_models(), help="name of the Whisper model to use")
parser.add_argument("--output_dir", "-o", type=str,
default=".", help="directory to save the outputs")
parser.add_argument("--output_srt", type=str2bool, default=False,
help="whether to output the .srt file along with the video files")
parser.add_argument("--srt_only", type=str2bool, default=False,
parser.add_argument("--subtitle_format", type=str, default="srt", choices=["srt","vtt"],
help="subtitle file format type")
parser.add_argument("--output_mkv", action="store_true",
help="whether to output the new subtitled video as an .mkv container rather than .mp4 container")
parser.add_argument("--output_srt", action="store_true",
help="output the .srt file along with the video files")
parser.add_argument("--output_txt", action="store_true",
help="whether to also save the subtitles as a .txt file")
parser.add_argument("--srt_only", action="store_true",
help="only generate the .srt file and not create overlayed video")
parser.add_argument("--verbose", type=str2bool, default=False,
help="whether to print out the progress and debug messages")

parser.add_argument("--verbose", action="store_true",
help="print out the progress and debug messages")
parser.add_argument("--task", type=str, default="transcribe", choices=[
"transcribe", "translate"], help="whether to perform X->X speech recognition ('transcribe') or X->English translation ('translate')")
parser.add_argument("--language", type=str, default="auto", choices=["auto","af","am","ar","as","az","ba","be","bg","bn","bo","br","bs","ca","cs","cy","da","de","el","en","es","et","eu","fa","fi","fo","fr","gl","gu","ha","haw","he","hi","hr","ht","hu","hy","id","is","it","ja","jw","ka","kk","km","kn","ko","la","lb","ln","lo","lt","lv","mg","mi","mk","ml","mn","mr","ms","mt","my","ne","nl","nn","no","oc","pa","pl","ps","pt","ro","ru","sa","sd","si","sk","sl","sn","so","sq","sr","su","sv","sw","ta","te","tg","th","tk","tl","tr","tt","uk","ur","uz","vi","yi","yo","zh"],
help="What is the origin language of the video? If unset, it is detected automatically.")
parser.add_argument("--language", type=str, default="auto", choices=["auto","af","am","ar","as","az","ba","be","bg","bn","bo","br","bs","ca","cs","cy","da","de","el","en",
"es","et","eu","fa","fi","fo","fr","gl","gu","ha","haw","he","hi","hr","ht","hu","hy","id","is","it","ja","jw","ka","kk","km","kn","ko","la","lb","ln",
"lo","lt","lv","mg","mi","mk","ml","mn","mr","ms","mt","my","ne","nl","nn","no","oc","pa","pl","ps","pt","ro","ru","sa","sd","si","sk","sl","sn","so",
"sq","sr","su","sv","sw","ta","te","tg","th","tk","tl","tr","tt","uk","ur","uz","vi","yi","yo","zh"],
help="What is the origin language of the video? If unset, it is detected automatically.")
parser.add_argument("--word_timestamps", action="store_true", default=False,
help="(experimental) extract word-level timestamps and refine the results based on them")

args = parser.parse_args().__dict__
model_name: str = args.pop("model")
output_dir: str = args.pop("output_dir")
output_srt: bool = args.pop("output_srt")
subtitle_format: str = args.pop("subtitle_format")
output_txt: bool = args.pop("output_txt")
srt_only: bool = args.pop("srt_only")
verbose: bool = args["verbose"]
language: str = args.pop("language")
output_mkv: bool = args.pop("output_mkv")


os.makedirs(output_dir, exist_ok=True)

Expand All @@ -48,24 +64,29 @@ def main():
model = whisper.load_model(model_name)
audios = get_audio(args.pop("video"))
subtitles = get_subtitles(
audios, output_srt or srt_only, output_dir, lambda audio_path: model.transcribe(audio_path, **args)
audios, output_srt or srt_only, subtitle_format,output_txt, output_dir, lambda audio_path: model.transcribe(audio_path, **args)
)

if srt_only:
return

for path, srt_path in subtitles.items():
out_path = os.path.join(output_dir, f"{filename(path)}.mp4")
ext = "mkv" if output_mkv else "mp4"

print(f"Adding subtitles to {filename(path)}...")
for path, srt_path in subtitles.items():

video = ffmpeg.input(path)
audio = video.audio
out_path = os.path.join(output_dir, f"{os.path.splitext(os.path.basename(path))[0]}.{ext}")
print(f"Adding subtitles to {os.path.basename(path)}...")

stream = ffmpeg.input(path)
audio = stream.audio
videoWithSub = stream.video.filter('subtitles', filename=srt_path,force_style='OutlineColour=&H40000000,BorderStyle=3')

ffmpeg.concat(
video.filter('subtitles', srt_path, force_style="OutlineColour=&H40000000,BorderStyle=3"), audio, v=1, a=1
).output(out_path).run(quiet=True, overwrite_output=True)
if output_mkv:
stream = ffmpeg.output(ffmpeg.input(srt_path), stream, out_path, vcodec='copy', scodec='copy')
else:
stream = ffmpeg.output(audio, videoWithSub, out_path, vcodec='libx264', acodec='copy')

ffmpeg.run(stream, quiet= not verbose, overwrite_output=True)
print(f"Saved subtitled video to {os.path.abspath(out_path)}.")


Expand All @@ -88,12 +109,17 @@ def get_audio(paths):
return audio_paths


def get_subtitles(audio_paths: list, output_srt: bool, output_dir: str, transcribe: callable):

def get_subtitles(audio_paths: list, output_srt: bool,subtitle_format: str,output_txt: bool, output_dir: str, transcribe: callable):
subtitles_path = {}

for path, audio_path in audio_paths.items():
srt_path = output_dir if output_srt else tempfile.gettempdir()
srt_path = os.path.join(srt_path, f"{filename(path)}.srt")

if(subtitle_format=="srt"):
srt_path = os.path.join(srt_path, f"{filename(path)}.srt")
else: # vtt
srt_path = os.path.join(srt_path, f"{filename(path)}.vtt")

print(
f"Generating subtitles for {filename(path)}... This might take a while."
Expand All @@ -104,12 +130,19 @@ def get_subtitles(audio_paths: list, output_srt: bool, output_dir: str, transcri
warnings.filterwarnings("default")

with open(srt_path, "w", encoding="utf-8") as srt:
write_srt(result["segments"], file=srt)
write_srt(result["segments"], file=srt,subtitle_format=subtitle_format)

subtitles_path[path] = srt_path

if output_txt:
text_path = os.path.join(output_dir, f"{filename(path)}.txt")
with open(text_path, "w", encoding="utf-8") as text_file:
for segment in result["segments"]:
print(segment["text"], file=text_file)
print(f"Saving subtitles to text file: {text_path}")

return subtitles_path


# Call main() exactly once, and only when this module is executed directly
# rather than imported.
if __name__ == '__main__':
    main()
59 changes: 34 additions & 25 deletions auto_subtitle/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,18 +2,7 @@
from typing import Iterator, TextIO


def str2bool(string):
    """Convert a case-insensitive ``"true"`` / ``"false"`` string to a bool.

    Args:
        string: text to interpret; matching ignores letter case.

    Returns:
        ``True`` for "true", ``False`` for "false".

    Raises:
        ValueError: if ``string`` is neither "true" nor "false".
    """
    str2val = {"true": True, "false": False}

    try:
        return str2val[string.lower()]
    except KeyError:
        # Report the caller's original spelling, and list the accepted values
        # in a fixed order so the message is the same on every run.
        raise ValueError(
            f"Expected one of {list(str2val)}, got {string}"
        ) from None


def format_timestamp(seconds: float, always_include_hours: bool = False):
def format_timestamp(seconds: float, always_include_hours: bool = False,subtitle_format: str = "srt"):
assert seconds >= 0, "non-negative timestamp expected"
milliseconds = round(seconds * 1000.0)

Expand All @@ -27,20 +16,40 @@ def format_timestamp(seconds: float, always_include_hours: bool = False):
milliseconds -= seconds * 1_000

hours_marker = f"{hours:02d}:" if always_include_hours or hours > 0 else ""
return f"{hours_marker}{minutes:02d}:{seconds:02d},{milliseconds:03d}"


def write_srt(transcript: Iterator[dict], file: TextIO):
    """Write each segment of ``transcript`` to ``file`` as a numbered SRT cue.

    Every segment is read through its ``'start'``, ``'end'`` and ``'text'``
    keys; cues are numbered from 1 and each is followed by a blank line.
    """
    cue_number = 0
    for segment in transcript:
        cue_number += 1
        begin = format_timestamp(segment['start'], always_include_hours=True)
        finish = format_timestamp(segment['end'], always_include_hours=True)
        # "-->" inside the caption is rewritten to "->", presumably so it
        # cannot be mistaken for the timing separator on the line above.
        caption = segment['text'].strip().replace('-->', '->')
        print(
            f"{cue_number}\n{begin} --> {finish}\n{caption}\n",
            file=file,
            flush=True,
        )
#return f"{hours_marker}{minutes:02d}:{seconds:02d},{milliseconds:03d}"
if subtitle_format == "srt":
output=f"{hours_marker}{minutes:02d}:{seconds:02d},{milliseconds:03d}"
else:
output= f"{hours:02d}:{minutes:02d}:{seconds:02d}.{milliseconds:03d}"

return output


def write_srt(transcript: Iterator[dict], file: TextIO, subtitle_format: str = "srt"):
    """Write ``transcript`` to ``file`` as subtitle cues.

    Args:
        transcript: segments, each read through its ``'start'``, ``'end'``
            and ``'text'`` keys.
        file: writable text stream that receives the cues.
        subtitle_format: ``"vtt"`` emits a ``WEBVTT`` header followed by
            unnumbered cues; any other value emits cues numbered from 1
            (the SRT layout).
    """
    is_vtt = subtitle_format == "vtt"

    if is_vtt:
        # print() appends its own newline, so the header line is followed
        # by one blank line before the first cue.
        print("WEBVTT\n", file=file)

    for i, segment in enumerate(transcript, start=1):
        start = format_timestamp(
            segment['start'], always_include_hours=True, subtitle_format=subtitle_format
        )
        end = format_timestamp(
            segment['end'], always_include_hours=True, subtitle_format=subtitle_format
        )
        # "-->" inside the caption is rewritten to "->", presumably so it
        # cannot be mistaken for the timing separator.
        text = segment['text'].strip().replace('-->', '->')
        # Only the numbered (non-VTT) layout carries a cue index line.
        index_line = "" if is_vtt else f"{i}\n"
        print(
            f"{index_line}{start} --> {end}\n{text}\n",
            file=file,
            flush=True,
        )


def filename(path):
    """Return the last component of ``path`` with its final extension removed.

    Only the last extension is stripped, so ``"a/b/archive.tar.gz"`` yields
    ``"archive.tar"``.
    """
    base_name = os.path.basename(path)
    stem, _extension = os.path.splitext(base_name)
    return stem
2 changes: 2 additions & 0 deletions requirements.txt
Original file line number Diff line number Diff line change
@@ -1 +1,3 @@
openai-whisper
ffmpeg-python
numpy<2.0.0
2 changes: 2 additions & 0 deletions setup.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,8 @@
author="Miguel Piedrafita",
install_requires=[
'openai-whisper',
'ffmpeg-python',
'numpy<2.0.0'
],
description="Automatically generate and embed subtitles into your videos",
entry_points={
Expand Down