Skip to content

Commit

Permalink
Allow a specific microphone to be used by the ffmpeg audio pipeline u…
Browse files Browse the repository at this point in the history
…tility functions. Default to using the currently active microphone on Mac (#31846)

* use currently active microphone on mac for ffmpeg_microphone

* Allow ffmpeg_microphone device to be specified

Co-authored-by: amyeroberts <22614925+amyeroberts@users.noreply.github.com>

---------

Co-authored-by: amyeroberts <22614925+amyeroberts@users.noreply.github.com>
  • Loading branch information
jrhe and amyeroberts authored Jul 25, 2024
1 parent 6ed0bf1 commit 3a83ec4
Showing 1 changed file with 41 additions and 14 deletions.
55 changes: 41 additions & 14 deletions src/transformers/pipelines/audio_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -50,9 +50,29 @@ def ffmpeg_microphone(
sampling_rate: int,
chunk_length_s: float,
format_for_conversion: str = "f32le",
ffmpeg_input_device: Optional[str] = None,
):
"""
Helper function to read raw microphone data.
Helper function to read audio from a microphone using ffmpeg. The default input device will be used unless another
input device is specified using the `ffmpeg_input_device` argument. Uses 'alsa' on Linux, 'avfoundation' on MacOS and
'dshow' on Windows.
Arguments:
sampling_rate (`int`):
The sampling_rate to use when reading the data from the microphone. Try using the model's sampling_rate to
avoid resampling later.
chunk_length_s (`float` or `int`):
The length of the maximum chunk of audio to be sent returned.
format_for_conversion (`str`, defaults to `f32le`):
The name of the format of the audio samples to be returned by ffmpeg. The standard is `f32le`, `s16le`
could also be used.
ffmpeg_input_device (`str`, *optional*):
The indentifier of the input device to be used by ffmpeg (i.e. ffmpeg's '-i' argument). If unset,
the default input device will be used. See `https://www.ffmpeg.org/ffmpeg-devices.html#Input-Devices`
for how to specify and list input devices.
Returns:
A generator yielding audio chunks of `chunk_length_s` seconds as `bytes` objects of length
`int(round(sampling_rate * chunk_length_s)) * size_of_sample`.
"""
ar = f"{sampling_rate}"
ac = "1"
Expand All @@ -64,15 +84,16 @@ def ffmpeg_microphone(
raise ValueError(f"Unhandled format `{format_for_conversion}`. Please use `s16le` or `f32le`")

system = platform.system()

if system == "Linux":
format_ = "alsa"
input_ = "default"
input_ = ffmpeg_input_device or "default"
elif system == "Darwin":
format_ = "avfoundation"
input_ = ":0"
input_ = ffmpeg_input_device or ":default"
elif system == "Windows":
format_ = "dshow"
input_ = _get_microphone_name()
input_ = ffmpeg_input_device or _get_microphone_name()

ffmpeg_command = [
"ffmpeg",
Expand Down Expand Up @@ -105,44 +126,50 @@ def ffmpeg_microphone_live(
stream_chunk_s: Optional[int] = None,
stride_length_s: Optional[Union[Tuple[float, float], float]] = None,
format_for_conversion: str = "f32le",
ffmpeg_input_device: Optional[str] = None,
):
"""
Helper function to read audio from the microphone file through ffmpeg. This will output `partial` overlapping
chunks starting from `stream_chunk_s` (if it is defined) until `chunk_length_s` is reached. It will make use of
striding to avoid errors on the "sides" of the various chunks.
Helper function to read audio from a microphone using ffmpeg. This will output `partial` overlapping chunks starting
from `stream_chunk_s` (if it is defined) until `chunk_length_s` is reached. It will make use of striding to avoid
errors on the "sides" of the various chunks. The default input device will be used unless another input device is
specified using the `ffmpeg_input_device` argument. Uses 'alsa' on Linux, 'avfoundation' on MacOS and 'dshow' on Windows.
Arguments:
sampling_rate (`int`):
The sampling_rate to use when reading the data from the microphone. Try using the model's sampling_rate to
avoid resampling later.
chunk_length_s (`float` or `int`):
The length of the maximum chunk of audio to be sent returned. This includes the eventual striding.
stream_chunk_s (`float` or `int`)
stream_chunk_s (`float` or `int`):
The length of the minimal temporary audio to be returned.
stride_length_s (`float` or `int` or `(float, float)`, *optional*, defaults to `None`)
stride_length_s (`float` or `int` or `(float, float)`, *optional*):
The length of the striding to be used. Stride is used to provide context to a model on the (left, right) of
an audio sample but without using that part to actually make the prediction. Setting this does not change
the length of the chunk.
format_for_conversion (`str`, defalts to `f32le`)
format_for_conversion (`str`, *optional*, defaults to `f32le`):
The name of the format of the audio samples to be returned by ffmpeg. The standard is `f32le`, `s16le`
could also be used.
ffmpeg_input_device (`str`, *optional*):
The identifier of the input device to be used by ffmpeg (i.e. ffmpeg's '-i' argument). If unset,
the default input device will be used. See `https://www.ffmpeg.org/ffmpeg-devices.html#Input-Devices`
for how to specify and list input devices.
Return:
A generator yielding dictionaries of the following form
`{"sampling_rate": int, "raw": np.array(), "partial" bool}` With optionnally a `"stride" (int, int)` key if
`{"sampling_rate": int, "raw": np.array(), "partial" bool}` With optionally a `"stride" (int, int)` key if
`stride_length_s` is defined.
`stride` and `raw` are all expressed in `samples`, and `partial` is a boolean saying if the current yield item
is a whole chunk, or a partial temporary result to be later replaced by another larger chunk.
"""
if stream_chunk_s is not None:
chunk_s = stream_chunk_s
else:
chunk_s = chunk_length_s

microphone = ffmpeg_microphone(sampling_rate, chunk_s, format_for_conversion=format_for_conversion)
microphone = ffmpeg_microphone(
sampling_rate, chunk_s, format_for_conversion=format_for_conversion, ffmpeg_input_device=ffmpeg_input_device
)
if format_for_conversion == "s16le":
dtype = np.int16
size_of_sample = 2
Expand Down

0 comments on commit 3a83ec4

Please sign in to comment.