Add mel_spectrogram_device parameter

openai · Sep 22, 2024 · c1031a5 · c1031a5
1 parent 834662c
commit c1031a5
Showing 1 changed file with 5 additions and 1 deletion.
diff --git a/whisper/transcribe.py b/whisper/transcribe.py
@@ -51,6 +51,7 @@ def transcribe(
  append_punctuations: str = "\"'.。,，!！?？:：”)]}、",
  clip_timestamps: Union[str, List[float]] = "0",
  hallucination_silence_threshold: Optional[float] = None,
+ mel_spectrogram_device: Optional[Union[str, torch.device]] = None,
  **decode_options,
 ):
  """
@@ -113,6 +114,9 @@ def transcribe(
  When word_timestamps is True, skip silent periods longer than this threshold (in seconds)
  when a possible hallucination is detected
 
+ mel_spectrogram_device: Optional[Union[str, torch.device]]
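+ If given, the audio tensor is moved to this device before the STFT is computed; when None (the default), no device is specified in the call to log_mel_spectrogram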
+
  Returns
  -------
  A dictionary containing the resulting text ("text") and segment-level details ("segments"), and
@@ -131,7 +135,7 @@ def transcribe(
 
  # Pad 30-seconds of silence to the input audio, for slicing
  mel = log_mel_spectrogram(
- audio, model.dims.n_mels, padding=N_SAMPLES, device=model.device
+ audio, model.dims.n_mels, padding=N_SAMPLES, device=mel_spectrogram_device
  )
  content_frames = mel.shape[-1] - N_FRAMES
  content_duration = float(content_frames * HOP_LENGTH / SAMPLE_RATE)