Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

When multiple audio streams exist, pick the correct one for whisper to process #2688

Merged
merged 2 commits into from
Sep 30, 2024
Merged
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
18 changes: 17 additions & 1 deletion custom_libs/subliminal_patch/providers/whisperai.py
Original file line number Diff line number Diff line change
Expand Up @@ -16,6 +16,7 @@

import ffmpeg
import functools
from pycountry import languages

# These are all the languages Whisper supports.
# from whisper.tokenizer import LANGUAGES
Expand Down Expand Up @@ -132,6 +133,18 @@ def set_log_level(newLevel="INFO"):
# initialize to default above
set_log_level()

# ffmpeg uses the older ISO 639-2 code when extracting audio streams based on language
# if we give it the newer ISO 639-3 code it can't find that audio stream by name because it's different
# for example it wants 'ger' instead of 'deu' for the German language
# or 'fre' instead of 'fra' for the French language
def get_ISO_639_2_code(iso639_3_code):
# find the language using ISO 639-3 code
language = languages.get(alpha_3=iso639_3_code)
# get the ISO 639-2 code or use the original input if there isn't a match
iso639_2_code = language.bibliographic if language and hasattr(language, 'bibliographic') else iso639_3_code
logger.debug(f"ffmpeg using language code '{iso639_2_code}' (instead of '{iso639_3_code}')")
return iso639_2_code

@functools.lru_cache(2)
def encode_audio_stream(path, ffmpeg_path, audio_stream_language=None):
logger.debug("Encoding audio stream to WAV with ffmpeg")
Expand All @@ -140,7 +153,10 @@ def encode_audio_stream(path, ffmpeg_path, audio_stream_language=None):
# This launches a subprocess to decode audio while down-mixing and resampling as necessary.
inp = ffmpeg.input(path, threads=0)
if audio_stream_language:
logger.debug(f"Whisper will only use the {audio_stream_language} audio stream for {path}")
# There is more than one audio stream, so pick the requested one by name
# Use the ISO 639-2 code if available
audio_stream_language = get_ISO_639_2_code(audio_stream_language)
logger.debug(f"Whisper will use the '{audio_stream_language}' audio stream for {path}")
inp = inp[f'a:m:language:{audio_stream_language}']

out, _ = inp.output("-", format="s16le", acodec="pcm_s16le", ac=1, ar=16000, af="aresample=async=1") \
Expand Down