removing clip_timestamps and redundant info, minor typos #15

Merged: 1 commit, Jun 17, 2024

README.md (2 changes: 1 addition & 1 deletion)
@@ -1,6 +1,6 @@
[![CI](https://github.com/SYSTRAN/faster-whisper/workflows/CI/badge.svg)](https://github.com/SYSTRAN/faster-whisper/actions?query=workflow%3ACI) [![PyPI version](https://badge.fury.io/py/faster-whisper.svg)](https://badge.fury.io/py/faster-whisper)

-# Mobius Faster Whisper transcription with CTranslate2
+# Faster Whisper transcription with CTranslate2

**faster-whisper** is a reimplementation of OpenAI's Whisper model using [CTranslate2](https://github.com/OpenNMT/CTranslate2/), which is a fast inference engine for Transformer models.

faster_whisper/audio.py (21 changes: 0 additions & 21 deletions)
@@ -15,27 +15,6 @@
import av
import numpy as np

-# Audio Hyperparameters
-
-SAMPLE_RATE = 16000
-N_FFT = 400
-HOP_LENGTH = 160
-CHUNK_LENGTH = 30
-
-
-def exact_div(x, y):
-    assert x % y == 0
-    return x // y
-
-
-N_SAMPLES = CHUNK_LENGTH * SAMPLE_RATE  # 480000 samples in a 30-second chunk
-N_FRAMES = exact_div(N_SAMPLES, HOP_LENGTH)  # 3000 frames in a mel spectrogram input
-
-N_SAMPLES_PER_TOKEN = HOP_LENGTH * 2  # the initial convolutions has stride 2
-FRAMES_PER_SECOND = exact_div(SAMPLE_RATE, HOP_LENGTH)  # 10ms per audio frame
-TOKENS_PER_SECOND = exact_div(SAMPLE_RATE, N_SAMPLES_PER_TOKEN)  # 20ms per audio token
-TIME_PRECISION = 1 / TOKENS_PER_SECOND
-

def decode_audio(
input_file: Union[str, BinaryIO],
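The deleted constants are not lost: the transcribe.py hunks below re-derive the same quantities from the FeatureExtractor instance. A minimal sketch of the correspondence, assuming the standard Whisper configuration (sampling_rate=16000, hop_length=160, chunk_length=30):

```python
# Sketch only: maps the removed audio.py constants onto the values that
# transcribe.py now derives from the FeatureExtractor (standard Whisper config).
sampling_rate = 16000  # was SAMPLE_RATE
hop_length = 160       # was HOP_LENGTH
chunk_length = 30      # was CHUNK_LENGTH
input_stride = 2       # the initial convolutions have stride 2

n_samples = chunk_length * sampling_rate           # 480000 samples per 30 s chunk
n_frames = n_samples // hop_length                 # 3000 mel frames per chunk
num_samples_per_token = hop_length * input_stride  # 320, was N_SAMPLES_PER_TOKEN
frames_per_second = sampling_rate // hop_length    # 100, i.e. 10 ms per frame
tokens_per_second = sampling_rate // num_samples_per_token  # 50, i.e. 20 ms per token
time_precision = 1 / tokens_per_second             # 0.02, was TIME_PRECISION
```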
faster_whisper/transcribe.py (51 changes: 25 additions & 26 deletions)
Expand Up @@ -19,7 +19,7 @@
from transformers import Pipeline
from transformers.pipelines.pt_utils import PipelineIterator

-from faster_whisper.audio import TIME_PRECISION, decode_audio, pad_or_trim
+from faster_whisper.audio import decode_audio, pad_or_trim
from faster_whisper.feature_extractor import FeatureExtractor
from faster_whisper.tokenizer import _LANGUAGE_CODES, Tokenizer
from faster_whisper.utils import (
@@ -134,10 +134,6 @@ class BatchedInferencePipeline(Pipeline):
Modified by Mobius Labs GmbH
"""

-# TODO:
-# - add support for timestamp mode
-# - add support for custom inference kwargs
-
def __init__(
self,
model,
@@ -450,7 +446,6 @@ def align_words(self, features, text_tokens, sot_seqs, seg_metadata):
current_seg_idx += 1

word_timings.append(_word_timings)
-
return word_timings

def combine_words(self, metadata, response):
@@ -551,17 +546,19 @@ def transcribe(
prepend_punctuations: str = "\"'“¿([{-",
append_punctuations: str = "\"'.。,,!!??::”)]}、",
max_new_tokens: Optional[int] = None,
-clip_timestamps: Union[str, List[float]] = "0",
hotwords: Optional[str] = None,
word_timestamps: bool = False,
) -> Tuple[Iterable[BatchedSegment], TranscriptionInfo]:
"""transcribe audio in chunks in batched fashion and return with language info.

Arguments:
audio: audio file as numpy array/path for batched transcription.
-vad_segments: Optionally provide list of dictionaries each containing "start" and
-    "end" keys, specifying the start and end voiced regions of audio chunks.
-    If no vad_segments specified, it uses vad model automatically segment them.
+vad_segments: Optionally provide a list of dictionaries, each containing "start",
+    "end", and "segments" keys.
+    "start" and "end" specify the start and end of the voiced region within
+    a 30-second boundary, and "segments" holds the start and end of every
+    voiced region inside that boundary as a list of (start, end) tuples.
+    If no vad_segments are given, the internal VAD model segments the audio automatically.
batch_size: the maximum number of parallel requests to model for decoding.
num_workers: to enable true parallelism when running the model,
same as the transcribe function argument in WhisperModel class.
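To make the documented vad_segments structure concrete, a minimal illustrative example (keys follow the docstring above; the numeric values are invented):

```python
# Illustrative only: one 30-second window containing two voiced regions.
vad_segments = [
    {
        "start": 0.5,  # start of the voiced region, in seconds
        "end": 29.0,   # end of the voiced region, in seconds
        "segments": [(0.5, 12.3), (14.1, 29.0)],  # all voiced (start, end) pairs
    },
]
```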
@@ -601,9 +598,6 @@ def transcribe(
with the previous word
max_new_tokens: Maximum number of new tokens to generate per-chunk. If not set,
the maximum will be set by the default max_length.
-clip_timestamps:
-    Comma-separated list start,end,start,end,... timestamps (in seconds) of clips to
-    process. The last end timestamp defaults to the end of the file.
hotwords:
Hotwords/hint phrases to the model. Has no effect if prefix is not None.
word_timestamps: Extract word-level timestamps using the cross-attention pattern
@@ -623,9 +617,13 @@ def transcribe(
such as repetition looping or timestamps going out of sync. Set as False
prompt_reset_on_temperature: Resets prompt if temperature is above this value.
Arg has effect only if condition_on_previous_text is True. Set at 0.5
+#TODO: support "hallucination_silence_threshold" when "word_timestamps=True"
hallucination_silence_threshold: Optional[float]
When word_timestamps is True, skip silent periods longer than this threshold
(in seconds) when a possible hallucination is detected. set as None.
+clip_timestamps:
+    Comma-separated list start,end,start,end,... timestamps (in seconds) of clips to
+    process. The last end timestamp defaults to the end of the file. Set as "0".

unused:
language_detection_threshold: If the maximum probability of the language tokens is
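For reference, the clip_timestamps format described above, with hypothetical values:

```python
# Hypothetical values: process only the 0-10 s and 25-40 s clips.
clip_timestamps = "0,10,25,40"
# The batched pipeline now always passes "0": start at 0 s, and since the last
# end timestamp is omitted it defaults to the end of the file.
```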
@@ -645,7 +643,6 @@ def transcribe(

- a generator over transcribed batched segments.
- an instance of TranscriptionInfo.
-- a dictionary with detected language and its probability.
"""

sampling_rate = self.model.feature_extractor.sampling_rate
@@ -659,7 +656,7 @@ def transcribe(
if self.use_vad_model:
vad_segments = self.vad_model(
{
"waveform": torch.from_numpy(audio).unsqueeze(0),
"waveform": torch.from_numpy(audio).unsqueeze(0).float(),
"sample_rate": 16000,
}
)
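The added .float() matters because torch.from_numpy preserves the NumPy dtype, so float64 (or integer) audio would otherwise reach the VAD model unconverted. A minimal sketch of the effect:

```python
import numpy as np
import torch

audio = np.zeros(16000, dtype=np.float64)  # e.g. user-supplied float64 samples
print(torch.from_numpy(audio).unsqueeze(0).dtype)          # torch.float64
print(torch.from_numpy(audio).unsqueeze(0).float().dtype)  # torch.float32
```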
@@ -702,16 +699,16 @@ def transcribe(
prepend_punctuations=prepend_punctuations,
append_punctuations=append_punctuations,
max_new_tokens=max_new_tokens,
-clip_timestamps=clip_timestamps,
hotwords=hotwords,
word_timestamps=word_timestamps,
hallucination_silence_threshold=None,
condition_on_previous_text=False,
clip_timestamps="0",
prompt_reset_on_temperature=0.5,
multilingual=False,
output_language=None,
without_timestamps=True,
max_initial_timestamp=0.0,
)

for idx, out in enumerate(
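Putting the pinned options in context, a hypothetical end-to-end sketch of the batched pipeline. The transcribe call and its return shape follow the signature documented above, while the import path, model name, and the .text attribute on BatchedSegment are assumptions:

```python
# Hypothetical usage sketch, not the fork's documented API.
from faster_whisper import WhisperModel
from faster_whisper.transcribe import BatchedInferencePipeline  # assumed path

model = WhisperModel("large-v2", device="cuda", compute_type="float16")
pipeline = BatchedInferencePipeline(model=model)

# Returns (iterable of BatchedSegment, TranscriptionInfo); VAD segmentation
# runs automatically when vad_segments is not provided.
segments, info = pipeline.transcribe("audio.wav", batch_size=16)
for segment in segments:
    print(segment.text)  # assuming BatchedSegment exposes .text
```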
@@ -873,14 +870,16 @@ def __init__(
)
self.feat_kwargs = self._get_feature_kwargs(model_path, preprocessor_bytes)
self.feature_extractor = FeatureExtractor(**self.feat_kwargs)
-self.num_samples_per_token = self.feature_extractor.hop_length * 2
+self.input_stride = 2
+self.num_samples_per_token = (
+    self.feature_extractor.hop_length * self.input_stride
+)
self.frames_per_second = (
self.feature_extractor.sampling_rate // self.feature_extractor.hop_length
)
self.tokens_per_second = (
self.feature_extractor.sampling_rate // self.num_samples_per_token
)
-self.input_stride = 2
self.time_precision = 0.02
self.max_length = 448

@@ -1374,9 +1373,10 @@ def generate_segments(
options.log_prob_low_threshold,
)

-# fast-forward to the next segment boundary
-seek += segment_size
-continue
+if should_skip:
+    # fast-forward to the next segment boundary
+    seek += segment_size
+    continue

tokens = result.sequences_ids[0]
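The restored guard is the point of this hunk: without if should_skip:, every window would fast-forward unconditionally. A self-contained sketch of the fixed control flow, with illustrative values:

```python
# Illustrative control flow: fast-forward only when the window fails the
# quality check, otherwise fall through to the normal decoding path.
seek, segment_size, content_frames = 0, 3000, 9000
while seek < content_frames:
    avg_logprob = -0.4                # pretend decode quality for this window
    should_skip = avg_logprob < -1.0  # e.g. options.log_prob_low_threshold
    if should_skip:
        seek += segment_size          # fast-forward to the next segment boundary
        continue
    # ... normal decoding would run here and advance seek itself ...
    seek += segment_size
```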

@@ -1962,7 +1962,8 @@ def assign_word_timings(self, alignments, text_token_probs, words, word_tokens):
return []

jumps = np.pad(np.diff(text_indices), (1, 0), constant_values=1).astype(bool)
-jump_times = time_indices[jumps] * TIME_PRECISION
+
+jump_times = time_indices[jumps] / self.tokens_per_second
start_times = jump_times[word_boundaries[:-1]]
end_times = jump_times[word_boundaries[1:]]
word_probs = [
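The replacement is numerically identical under the default configuration, since the removed TIME_PRECISION was defined in audio.py as 1 / TOKENS_PER_SECOND. A quick check, assuming tokens_per_second == 50:

```python
import numpy as np

tokens_per_second = 50                  # 16000 // 320 under the defaults
TIME_PRECISION = 1 / tokens_per_second  # 0.02, the removed audio.py constant
time_indices = np.array([0, 3, 7, 12])
assert np.allclose(time_indices * TIME_PRECISION,
                   time_indices / tokens_per_second)
```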
@@ -2026,7 +2027,7 @@ def decode_batch(tokens: List[List[int]]) -> str:
output = []
for idx, res in enumerate(result):
output.append({"text": text[idx].strip()})
-
+# return scores
seq_len = len(res.sequences_ids[0])
cum_logprob = res.scores[0] * (seq_len ** options["length_penalty"])
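For context on the score handling: the line above implies that res.scores[0] is the sequence score normalized by seq_len ** length_penalty, so the multiplication recovers the cumulative log-probability. A worked example with hypothetical numbers:

```python
# Hypothetical numbers: undo length normalization to get the total logprob.
seq_len = 10
length_penalty = 1.0
normalized_score = -0.25  # res.scores[0], assumed normalized by seq_len ** length_penalty
cum_logprob = normalized_score * (seq_len ** length_penalty)
print(cum_logprob)  # -2.5, cumulative log-probability of the whole sequence
```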
@@ -2117,8 +2118,6 @@ def detect_language_multi_segment(
# number of feature frames in 30 seconds of audio is 3000
nb_max_frames = self.feature_extractor.nb_max_frames
-
-# TODO: need to check if it fails for long audios and if we need to split the audio

# extract features from audio with padding (default)
features = self.feature_extractor(audio, enable_ta=enable_ta_fe)

Expand Down