# tests/test_transcribe.py

import inspect
import os

import numpy as np

from faster_whisper import BatchedInferencePipeline, WhisperModel, decode_audio


def test_supported_languages():
    """The ``tiny.en`` model must list ``en`` as its only supported language."""
    english_only = WhisperModel("tiny.en")
    expected_languages = ["en"]
    assert english_only.supported_languages == expected_languages


def test_transcribe(jfk_path):
    """Transcribe the JFK clip with the plain model, then with the batched pipeline.

    Both runs request word timestamps and are expected to produce exactly one
    segment; the expected texts differ only in the comma after "Americans".
    """
    whisper = WhisperModel("tiny")

    # --- sequential model ---
    segment_iter, info = whisper.transcribe(jfk_path, word_timestamps=True)
    assert info.all_language_probs is not None

    assert info.language == "en"
    assert info.language_probability > 0.9
    assert info.duration == 11

    # The first entry of the per-language probabilities has to agree with
    # the summary fields already exposed on ``info``.
    best_language, best_probability = info.all_language_probs[0]
    assert info.language == best_language
    assert abs(info.language_probability - best_probability) < 1e-16

    sequential_segments = list(segment_iter)
    assert len(sequential_segments) == 1

    only_segment = sequential_segments[0]
    assert only_segment.text == (
        " And so my fellow Americans, ask not what your country can do for you, "
        "ask what you can do for your country."
    )

    # Word-level output must be consistent with the segment-level output.
    words = only_segment.words
    assert only_segment.text == "".join(entry.word for entry in words)
    assert only_segment.start == words[0].start
    assert only_segment.end == words[-1].end

    # --- batched pipeline ---
    pipeline = BatchedInferencePipeline(model=whisper)
    batched_iter, batched_info = pipeline.transcribe(
        jfk_path, word_timestamps=True, vad_filter=False
    )
    assert batched_info.language == "en"
    assert batched_info.language_probability > 0.7

    batched_segments = [
        {"start": item.start, "end": item.end, "text": item.text}
        for item in batched_iter
    ]
    assert len(batched_segments) == 1
    assert batched_segments[0]["text"] == (
        " And so my fellow Americans ask not what your country can do for you, "
        "ask what you can do for your country."
    )


def test_batched_transcribe(physcisworks_path):
    """Run the batched pipeline with and without word timestamps.

    Without word timestamps exactly 6 segments are expected; with them,
    more than 7 segments are expected and each must carry ``words``.
    """

    def summarize(item):
        # Reduce a segment to its timing and text.
        return {"start": item.start, "end": item.end, "text": item.text}

    pipeline = BatchedInferencePipeline(model=WhisperModel("tiny"))

    coarse_iter, info = pipeline.transcribe(physcisworks_path, batch_size=16)
    assert info.language == "en"
    assert info.language_probability > 0.7
    coarse = [summarize(item) for item in coarse_iter]
    # number of near 30 sec segments
    assert len(coarse) == 6

    fine_iter, info = pipeline.transcribe(
        physcisworks_path,
        batch_size=16,
        without_timestamps=False,
        word_timestamps=True,
    )
    fine = []
    for item in fine_iter:
        assert item.words is not None
        fine.append(summarize(item))
    assert len(fine) > 7


def test_empty_audio():
    """An empty waveform yields no segments from either transcriber."""
    empty_waveform = np.asarray([], dtype="float32")
    whisper = WhisperModel("tiny")
    batched = BatchedInferencePipeline(model=whisper)

    for transcriber in (whisper, batched):
        produced = transcriber.transcribe(empty_waveform)[0]
        assert list(produced) == []

    # No assertion on the result: this only has to complete without raising.
    whisper.detect_language(empty_waveform)


def test_prefix_with_timestamps(jfk_path):
    """Transcribing with a text ``prefix`` still yields one correctly timed segment."""
    expected_text = (
        " And so my fellow Americans, ask not what your country can do for you, "
        "ask what you can do for your country."
    )

    whisper = WhisperModel("tiny")
    segment_iter, _ = whisper.transcribe(
        jfk_path, prefix="And so my fellow Americans"
    )
    produced = list(segment_iter)
    assert len(produced) == 1

    only_segment = produced[0]
    assert only_segment.text == expected_text

    # The segment starts at zero and ends in the (10, 11] second window.
    assert only_segment.start == 0
    assert only_segment.end > 10
    assert only_segment.end <= 11


def test_vad(jfk_path):
    """Transcribe with the VAD filter on and check text, timing and echoed options."""
    vad_settings = {"min_silence_duration_ms": 500, "speech_pad_ms": 200}

    whisper = WhisperModel("tiny")
    segment_iter, info = whisper.transcribe(
        jfk_path,
        vad_filter=True,
        vad_parameters=vad_settings,
    )
    produced = list(segment_iter)
    assert len(produced) == 1

    only_segment = produced[0]
    assert only_segment.text == (
        " And so my fellow Americans ask not what your country can do for you, "
        "ask what you can do for your country."
    )

    # Start and end must fall strictly inside (0, 1) and (10, 11) respectively.
    assert 0 < only_segment.start < 1
    assert 10 < only_segment.end < 11

    # The VAD parameters passed in must be reported back on ``info``.
    reported = info.vad_options
    assert reported.min_silence_duration_ms == 500
    assert reported.speech_pad_ms == 200


def test_stereo_diarization(data_dir):
    """Each channel of the stereo fixture is transcribed on its own."""
    whisper = WhisperModel("tiny")

    stereo_file = os.path.join(data_dir, "stereo_diarization.wav")
    left_channel, right_channel = decode_audio(stereo_file, split_stereo=True)

    def transcribe_channel(channel):
        # Join all segment texts of one channel and trim surrounding whitespace.
        pieces, _ = whisper.transcribe(channel)
        return "".join(piece.text for piece in pieces).strip()

    left_text = transcribe_channel(left_channel)
    assert left_text == (
        "He began a confused complaint against the wizard, "
        "who had vanished behind the curtain on the left."
    )

    right_text = transcribe_channel(right_channel)
    assert right_text == "The horizon seems extremely distant."


def test_multilingual_transcription(data_dir):
    """Transcribe the multilingual fixture with ``multilingual=True``.

    The first segment is expected in English and the second in German, for
    both the plain model and the batched pipeline. For the pipeline only a
    fragment of the German text is checked.
    """
    english_text = (
        " Permission is hereby granted, free of charge, to any person obtaining a copy of the"
        " software and associated documentation files to deal in the software without restriction,"
        " including without limitation the rights to use, copy, modify, merge, publish, distribute"
        ", sublicence, and or cell copies of the software, and to permit persons to whom the "
        "software is furnished to do so, subject to the following conditions. The above copyright"
        " notice and this permission notice, shall be included in all copies or substantial "
        "portions of the software."
    )
    german_text = (
        " Jedem, der dieses Software und die dazu gehöregen Dokumentationsdatein erhält, wird "
        "hiermit unengeltlich die Genehmigung erteilt, wird der Software und eingeschränkt zu "
        "verfahren. Dies umfasst insbesondere das Recht, die Software zu verwenden, zu "
        "vervielfältigen, zu modifizieren, zu Samenzofügen, zu veröffentlichen, zu verteilen, "
        "unterzulizenzieren und oder kopieren der Software zu verkaufen und diese Rechte "
        "unterfolgen den Bedingungen anderen zu übertragen."
    )
    german_fragment = (
        "Dokumentationsdatein erhält, wird hiermit unengeltlich die Genehmigung erteilt,"
        " wird der Software und eingeschränkt zu verfahren. Dies umfasst insbesondere das Recht,"
        " die Software zu verwenden, zu vervielfältigen, zu modifizieren"
    )

    whisper = WhisperModel("tiny")
    batched = BatchedInferencePipeline(whisper)

    waveform = decode_audio(os.path.join(data_dir, "multilingual.mp3"))

    # Plain model: both segments must match exactly.
    segment_iter, _ = whisper.transcribe(
        waveform,
        multilingual=True,
        without_timestamps=True,
        condition_on_previous_text=False,
    )
    sequential = list(segment_iter)
    assert sequential[0].text == english_text
    assert sequential[1].text == german_text

    # Batched pipeline: exact English match, substring match for German.
    segment_iter, _ = batched.transcribe(waveform, multilingual=True)
    from_pipeline = list(segment_iter)
    assert from_pipeline[0].text == english_text
    assert german_fragment in from_pipeline[1].text


def test_hotwords(data_dir):
    """The ``hotwords`` hint must appear in the output and be echoed in the options."""
    hint = "ComfyUI"

    whisper = WhisperModel("tiny")
    batched = BatchedInferencePipeline(whisper)

    waveform = decode_audio(os.path.join(data_dir, "hotwords.mp3"))

    # Same expectations for the plain model and the batched pipeline.
    for transcriber in (whisper, batched):
        segment_iter, info = transcriber.transcribe(waveform, hotwords=hint)
        produced = list(segment_iter)

        assert hint in produced[0].text
        assert info.transcription_options.hotwords == hint


def test_transcribe_signature():
    """Both ``transcribe`` methods must take the same arguments, bar ``batch_size``."""

    def argument_names(func):
        # Names reported by ``inspect.getargs`` for the function's code object.
        return set(inspect.getargs(func.__code__).args)

    sequential_args = argument_names(WhisperModel.transcribe)
    batched_args = argument_names(BatchedInferencePipeline.transcribe)

    # ``batch_size`` is the one allowed difference; ``remove`` raises KeyError
    # if the pipeline does not have it.
    batched_args.remove("batch_size")

    assert sequential_args == batched_args


def _assert_monotonic_timestamps(segments, duration):
    """Assert that segment and word timestamps never run backwards.

    Args:
        segments: Materialized list of transcription segments, each exposing
            ``start``, ``end`` and an iterable ``words`` whose items expose
            ``start`` and ``end``.
        duration: Upper bound for the end of the last segment.
    """
    # Fail with a readable message instead of an IndexError on ``segments[-1]``.
    assert segments, "transcription produced no segments"

    # Check every segment, including the last one.
    for segment in segments:
        assert segment.start <= segment.end
        for word in segment.words:
            assert word.start <= word.end
            assert word.end <= segment.end

    # Adjacent segments must not overlap.
    for current, following in zip(segments, segments[1:]):
        assert current.end <= following.start

    assert segments[-1].end <= duration


def test_monotonic_timestamps(physcisworks_path):
    """Timestamps must be monotonic for both the model and the batched pipeline."""
    model = WhisperModel("base")
    pipeline = BatchedInferencePipeline(model=model)

    for transcriber in (model, pipeline):
        segments, info = transcriber.transcribe(
            physcisworks_path, word_timestamps=True
        )
        _assert_monotonic_timestamps(list(segments), info.duration)


def test_cliptimestamps_segments(jfk_path):
    """Two back-to-back clips of the doubled JFK audio yield two identical segments."""
    expected_text = (
        " And so my fellow Americans ask not what your country can do for you, "
        "ask what you can do for your country."
    )

    pipeline = BatchedInferencePipeline(model=WhisperModel("tiny"))

    # Play the clip twice in a row so that each requested clip covers one copy.
    single = decode_audio(jfk_path)
    doubled = np.concatenate([single, single])
    clips = [{"start": 0.0, "end": 11.0}, {"start": 11.0, "end": 22.0}]

    segment_iter, _ = pipeline.transcribe(doubled, clip_timestamps=clips)
    produced = list(segment_iter)

    assert len(produced) == 2
    for index, clip in enumerate(clips):
        current = produced[index]
        assert current.start == clip["start"]
        assert current.end == clip["end"]
        assert current.text == expected_text


def test_cliptimestamps_timings(physcisworks_path):
    """Segments from the batched pipeline must match the requested clip boundaries."""
    # Each case pairs a requested clip with the text expected for it.
    cases = [
        (
            {"start": 0.0, "end": 5.0},
            " Now I want to return to the conservation of mechanical energy.",
        ),
        (
            {"start": 6.0, "end": 15.0},
            " I have here a pendulum. I have an object that weighs 15 kilograms"
            " and I can lift it up one meter, which I have done now.",
        ),
    ]
    clips = [clip for clip, _ in cases]

    pipeline = BatchedInferencePipeline(model=WhisperModel("tiny"))
    waveform = decode_audio(physcisworks_path)

    segment_iter, _ = pipeline.transcribe(waveform, clip_timestamps=clips)
    produced = list(segment_iter)

    assert len(produced) == 2
    for current, (clip, expected_text) in zip(produced, cases):
        assert current.start == clip["start"]
        assert current.end == clip["end"]
        assert current.text == expected_text