diff --git a/faster_whisper/transcribe.py b/faster_whisper/transcribe.py
index 1efdd2b7..8ce542b4 100644
--- a/faster_whisper/transcribe.py
+++ b/faster_whisper/transcribe.py
@@ -16,9 +16,9 @@
 import numpy as np
 import tokenizers
 import torch
-from tqdm import tqdm
 from pyannote.audio import Model
+from tqdm import tqdm
 from transformers import Pipeline
 from transformers.pipelines.pt_utils import PipelineIterator
@@ -112,9 +112,11 @@ class TranscriptionInfo(NamedTuple):
     transcription_options: TranscriptionOptions
     vad_options: VadOptions
 
+
 # The code below is copied from whisper-x (https://github.com/m-bain/whisperX)
 # and adapted for faster_whisper
+
 class BatchedInferencePipeline(Pipeline):
     """
@@ -149,10 +151,10 @@ def __init__(
         self.use_vad_model = use_vad_model
         self.vad_onset = 0.500
         self.vad_offset = 0.363
-        self.vad_model_url = (
-            "https://whisperx.s3.eu-west-2.amazonaws.com/model_weights/segmentation"
-            "/0b5b3216d60a2d32fc086b47ea8c67589aaeb26b7e07fcbe620d6d0b83e209ea/pytorch_model.bin"
-        )
+        self.vad_model_url = (
+            "https://whisperx.s3.eu-west-2.amazonaws.com/model_weights/segmentation"
+            "/0b5b3216d60a2d32fc086b47ea8c67589aaeb26b7e07fcbe620d6d0b83e209ea/pytorch_model.bin"
+        )
         (
             self._preprocess_params,
             self._forward_params,
@@ -177,7 +179,6 @@ def __init__(
 
         super(Pipeline, self).__init__()
 
-
     def _sanitize_parameters(self, **kwargs):
         preprocess_kwargs = {}
         if "tokenizer" in kwargs:
@@ -2062,6 +2063,7 @@ def key_func(language):
             "log_prob_low_threshold": -2.0,
             "multilingual": False,
             "output_language": "en",
+            "hotwords": None,
         }
diff --git a/faster_whisper/vad.py b/faster_whisper/vad.py
index e3df1ea1..9ad4220b 100644
--- a/faster_whisper/vad.py
+++ b/faster_whisper/vad.py
@@ -4,7 +4,7 @@
 import warnings
 from collections.abc import Callable
-from typing import List, NamedTuple, Optional
+from typing import List, NamedTuple, Optional, Union
 
 import numpy as np
 import pandas as pd
@@ -314,9 +314,9 @@ class VoiceActivitySegmentation(VoiceActivityDetection):
     def __init__(
         self,
         segmentation: PipelineModel = "pyannote/segmentation",
-        device: torch.device | None = None,
+        device: Optional[Union[str, torch.device]] = None,
         fscore: bool = False,
-        use_auth_token: str | None = None,
+        use_auth_token: Optional[str] = None,
         **inference_kwargs,
     ):
         """Initialize the pipeline with the model name and the optional device.
@@ -324,9 +324,9 @@ def __init__(
         Args:
             dict parameters of VoiceActivityDetection class from pyannote:
                 segmentation (PipelineModel): Loaded model name.
-                device (torch.device | None): Device to perform the segmentation.
+                device (torch.device or None): Device to perform the segmentation.
                 fscore (bool): Flag indicating whether to compute F-score during inference.
-                use_auth_token (str | None): Optional authentication token for model access.
+                use_auth_token (str or None): Optional authentication token for model access.
                 inference_kwargs (dict): Additional arguments from VoiceActivityDetection pipeline.
         """
         super().__init__(
@@ -337,7 +337,7 @@ def __init__(
             **inference_kwargs,
         )
 
-    def apply(self, file: AudioFile, hook: Callable | None = None) -> Annotation:
+    def apply(self, file: AudioFile, hook: Optional[Callable] = None) -> Annotation:
         """Apply voice activity detection on the audio file.
 
         Args:
@@ -379,7 +379,7 @@ class BinarizeVadScores:
     def __init__(
         self,
         onset: float = 0.5,
-        offset: float | None = None,
+        offset: Optional[float] = None,
         min_duration_on: float = 0.0,
         min_duration_off: float = 0.0,
         pad_onset: float = 0.0,
@@ -442,7 +442,8 @@ def __get_active_regions(self, scores: SlidingWindowFeature) -> Annotation:
         curr_scores = [k_scores[0]]
         curr_timestamps = [start]
         t = start
-        for t, y in zip(timestamps[1:], k_scores[1:], strict=False):
+        # optionally add `strict=False` for python 3.10 or later
+        for t, y in zip(timestamps[1:], k_scores[1:]):
             # currently active
             if is_active:
                 curr_duration = t - start
diff --git a/requirements.txt b/requirements.txt
index d02dae83..ec741c89 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -5,7 +5,7 @@
 tokenizers>=0.13,<1
 onnxruntime>=1.14,<2
 transformers
 pyannote-audio>=3.1.1
-pandas>=2.1.4
+pandas
 torch>=2.1.1
 torchaudio>=2.1.2
 jsons>=1.6.3
\ No newline at end of file
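
Note on the `Optional`/`Union` rewrites in faster_whisper/vad.py: PEP 604 annotations such as `torch.device | None` are evaluated when the enclosing `def` statement executes, so they raise a `TypeError` on Python 3.9 and earlier; the `typing.Optional`/`typing.Union` spellings behave identically but work on all interpreters the project supports. A minimal sketch of the pattern; the helper name `resolve_device` is illustrative and not part of this patch:

    # Works on Python 3.8+. Writing `device: torch.device | None = None`
    # instead would raise TypeError at import time before Python 3.10.
    from typing import Optional, Union

    import torch


    def resolve_device(device: Optional[Union[str, torch.device]] = None) -> torch.device:
        # Illustrative helper, not from the patch: normalize the accepted
        # str / torch.device / None inputs to a concrete torch.device.
        if device is None:
            device = "cuda" if torch.cuda.is_available() else "cpu"
        return torch.device(device)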
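
Likewise, `zip()` only accepts the `strict` keyword on Python 3.10+; on older interpreters `zip(..., strict=False)` fails with `TypeError: zip() takes no keyword arguments`. Since `strict=False` is already the default behavior (stop silently at the shortest iterable), dropping it in `__get_active_regions` is behavior-preserving. A small sketch of the semantics, using made-up sample data:

    import sys

    timestamps = [0.0, 0.5, 1.0]
    scores = [0.1, 0.9]  # deliberately one element short

    # Default behavior on every Python version (identical to strict=False):
    # iteration stops at the shortest input.
    assert list(zip(timestamps, scores)) == [(0.0, 0.1), (0.5, 0.9)]

    # Only on Python 3.10+ can the length mismatch be turned into an error.
    if sys.version_info >= (3, 10):
        try:
            list(zip(timestamps, scores, strict=True))
        except ValueError as exc:
            print(exc)  # zip() argument 2 is shorter than argument 1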