diff --git a/README.md b/README.md
index 36320faf..a18bddf3 100644
--- a/README.md
+++ b/README.md
@@ -104,7 +104,7 @@ pip install --force-reinstall "faster-whisper @ https://github.com/guillaumekln/
 ```python
 from faster_whisper import WhisperModel
 
-model_size = "large-v2"
+model_size = "large-v3"
 
 # Run on GPU with FP16
 model = WhisperModel(model_size, device="cuda", compute_type="float16")
@@ -185,17 +185,17 @@ Here is a non exhaustive list of open-source projects using faster-whisper. Feel
 ## Model conversion
 
-When loading a model from its size such as `WhisperModel("large-v2")`, the correspondig CTranslate2 model is automatically downloaded from the [Hugging Face Hub](https://huggingface.co/guillaumekln).
+When loading a model from its size such as `WhisperModel("large-v3")`, the corresponding CTranslate2 model is automatically downloaded from the [Hugging Face Hub](https://huggingface.co/Systran).
 
 We also provide a script to convert any Whisper models compatible with the Transformers library. They could be the original OpenAI models or user fine-tuned models.
 
-For example the command below converts the [original "large-v2" Whisper model](https://huggingface.co/openai/whisper-large-v2) and saves the weights in FP16:
+For example the command below converts the [original "large-v3" Whisper model](https://huggingface.co/openai/whisper-large-v3) and saves the weights in FP16:
 
 ```bash
 pip install transformers[torch]>=4.23
 
-ct2-transformers-converter --model openai/whisper-large-v2 --output_dir whisper-large-v2-ct2 \
-    --copy_files tokenizer.json --quantization float16
+ct2-transformers-converter --model openai/whisper-large-v3 --output_dir whisper-large-v3-ct2 \
+--copy_files tokenizer.json preprocessor_config.json --quantization float16
 ```
 
 * The option `--model` accepts a model name on the Hub or a path to a model directory.
@@ -207,12 +207,12 @@ Models can also be converted from the code. See the [conversion API](https://ope
 1. Directly load the model from a local directory:
 
 ```python
-model = faster_whisper.WhisperModel("whisper-large-v2-ct2")
+model = faster_whisper.WhisperModel("whisper-large-v3-ct2")
 ```
 
 2. [Upload your model to the Hugging Face Hub](https://huggingface.co/docs/transformers/model_sharing#upload-with-the-web-interface) and load it from its name:
 
 ```python
-model = faster_whisper.WhisperModel("username/whisper-large-v2-ct2")
+model = faster_whisper.WhisperModel("username/whisper-large-v3-ct2")
 ```
 
 ## Comparing performance against other implementations
diff --git a/faster_whisper/tokenizer.py b/faster_whisper/tokenizer.py
index 1af70b93..c3b13b43 100644
--- a/faster_whisper/tokenizer.py
+++ b/faster_whisper/tokenizer.py
@@ -108,7 +108,7 @@ def decode_with_timestamps(self, tokens: List[int]) -> str:
     def split_to_word_tokens(
         self, tokens: List[int]
     ) -> Tuple[List[str], List[List[int]]]:
-        if self.language_code in {"zh", "ja", "th", "lo", "my"}:
+        if self.language_code in {"zh", "ja", "th", "lo", "my", "yue"}:
             # These languages don't typically use spaces, so it is difficult to split words
             # without morpheme analysis. Here, we instead split words at any
             # position where the tokens are decoded as valid unicode points
@@ -274,4 +274,5 @@ def split_tokens_on_spaces(
     "yi",
     "yo",
     "zh",
+    "yue",
 )
diff --git a/faster_whisper/transcribe.py b/faster_whisper/transcribe.py
index 86187fca..e0525b9e 100644
--- a/faster_whisper/transcribe.py
+++ b/faster_whisper/transcribe.py
@@ -1,8 +1,10 @@
 import itertools
+import json
 import logging
 import os
 import zlib
+from inspect import signature
 from typing import BinaryIO, Iterable, List, NamedTuple, Optional, Tuple, Union
 
 import ctranslate2
@@ -92,8 +94,8 @@ def __init__(
         Args:
           model_size_or_path: Size of the model to use (tiny, tiny.en, base, base.en,
-            small, small.en, medium, medium.en, large-v1, large-v2, or large), a path to a converted
-            model directory, or a CTranslate2-converted Whisper model ID from the Hugging Face Hub.
+            small, small.en, medium, medium.en, large-v1, large-v2, large-v3, or large), a path to a
+            converted model directory, or a CTranslate2-converted Whisper model ID from the HF Hub.
             When a size or a model ID is configured, the converted model is downloaded
             from the Hugging Face Hub.
           device: Device to use for computation ("cpu", "cuda", "auto").
@@ -142,7 +144,8 @@ def __init__(
                 "openai/whisper-tiny" + ("" if self.model.is_multilingual else ".en")
             )
 
-        self.feature_extractor = FeatureExtractor()
+        self.feat_kwargs = self._get_feature_kwargs(model_path)
+        self.feature_extractor = FeatureExtractor(**self.feat_kwargs)
         self.num_samples_per_token = self.feature_extractor.hop_length * 2
         self.frames_per_second = (
             self.feature_extractor.sampling_rate // self.feature_extractor.hop_length
@@ -159,6 +162,22 @@ def supported_languages(self) -> List[str]:
         """The languages supported by the model."""
         return list(_LANGUAGE_CODES) if self.model.is_multilingual else ["en"]
 
+    def _get_feature_kwargs(self, model_path) -> dict:
+        preprocessor_config_file = os.path.join(model_path, "preprocessor_config.json")
+        config = {}
+        if os.path.isfile(preprocessor_config_file):
+            try:
+                with open(preprocessor_config_file, "r", encoding="utf-8") as json_file:
+                    config = json.load(json_file)
+                valid_keys = signature(FeatureExtractor.__init__).parameters.keys()
+                config = {k: v for k, v in config.items() if k in valid_keys}
+            except json.JSONDecodeError as e:
+                self.logger.warning(
+                    "Could not load preprocessor_config.json: %s", str(e)
+                )
+
+        return config
+
     def transcribe(
         self,
         audio: Union[str, BinaryIO, np.ndarray],
diff --git a/faster_whisper/utils.py b/faster_whisper/utils.py
index f020bc27..343a6357 100644
--- a/faster_whisper/utils.py
+++ b/faster_whisper/utils.py
@@ -10,17 +10,18 @@ from tqdm.auto import tqdm
 
 _MODELS = {
-    "tiny.en": "guillaumekln/faster-whisper-tiny.en",
-    "tiny": "guillaumekln/faster-whisper-tiny",
-    "base.en": "guillaumekln/faster-whisper-base.en",
-    "base": "guillaumekln/faster-whisper-base",
-    "small.en": "guillaumekln/faster-whisper-small.en",
-    "small": "guillaumekln/faster-whisper-small",
-    "medium.en": "guillaumekln/faster-whisper-medium.en",
-    "medium": "guillaumekln/faster-whisper-medium",
-    "large-v1": "guillaumekln/faster-whisper-large-v1",
-    "large-v2": "guillaumekln/faster-whisper-large-v2",
-    "large": "guillaumekln/faster-whisper-large-v2",
+    "tiny.en": "Systran/faster-whisper-tiny.en",
+    "tiny": "Systran/faster-whisper-tiny",
+    "base.en": "Systran/faster-whisper-base.en",
+    "base": "Systran/faster-whisper-base",
+    "small.en": "Systran/faster-whisper-small.en",
+    "small": "Systran/faster-whisper-small",
"Systran/faster-whisper-medium.en", + "medium": "Systran/faster-whisper-medium", + "large-v1": "Systran/faster-whisper-large-v1", + "large-v2": "Systran/faster-whisper-large-v2", + "large-v3": "Systran/faster-whisper-large-v3", + "large": "Systran/faster-whisper-large-v3", } @@ -50,8 +51,8 @@ def download_model( Args: size_or_id: Size of the model to download from https://huggingface.co/guillaumekln (tiny, tiny.en, base, base.en, small, small.en medium, medium.en, large-v1, large-v2, - large), or a CTranslate2-converted model ID from the Hugging Face Hub - (e.g. guillaumekln/faster-whisper-large-v2). + large-v3, large), or a CTranslate2-converted model ID from the Hugging Face Hub + (e.g. Systran/faster-whisper-large-v3). output_dir: Directory where the model should be saved. If not set, the model is saved in the cache directory. local_files_only: If True, avoid downloading the file and return the path to the local @@ -76,6 +77,7 @@ def download_model( allow_patterns = [ "config.json", + "preprocessor_config.json", "model.bin", "tokenizer.json", "vocabulary.*", diff --git a/requirements.txt b/requirements.txt index fa037f71..ba0da206 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,5 +1,5 @@ av==10.* -ctranslate2>=3.17,<4 +ctranslate2>=3.22,<4 huggingface_hub>=0.13 -tokenizers>=0.13,<0.15 +tokenizers>=0.13,<0.16 onnxruntime>=1.14,<2