From 12f2ee93730464af6aed20a05d11472713c818cd Mon Sep 17 00:00:00 2001
From: Jack Zhang <32371937+jackzhxng@users.noreply.github.com>
Date: Tue, 26 Aug 2025 13:29:41 -0700
Subject: [PATCH 1/4] Add audio to multimodal runner

ghstack-source-id: 7eb653c3276672fc80bb1cc400a9311fd9057d72
Pull Request resolved: https://github.com/pytorch/executorch/pull/13662
---
NOTE(review): this copy of the patch appears to have lost angle-bracketed
text: every `#include` in the audio.h hunk has no header name, and the
`std::vector` members have no element type. As written, the added header
will not compile. Restore the missing text from the upstream pull request
before applying. (Text in this section is ignored by `git am`.)

 extension/llm/runner/audio.h                  |  52 ++++++
 extension/llm/runner/constants.h              |   5 +-
 extension/llm/runner/multimodal_input.h       | 161 +++++++++++++++++-
 extension/llm/runner/multimodal_prefiller.cpp |  68 ++++++--
 4 files changed, 266 insertions(+), 20 deletions(-)
 create mode 100644 extension/llm/runner/audio.h

diff --git a/extension/llm/runner/audio.h b/extension/llm/runner/audio.h
new file mode 100644
index 00000000000..868765950af
--- /dev/null
+++ b/extension/llm/runner/audio.h
@@ -0,0 +1,52 @@
+/*
+ * Copyright (c) Meta Platforms, Inc. and affiliates.
+ * All rights reserved.
+ *
+ * This source code is licensed under the BSD-style license found in the
+ * LICENSE file in the root directory of this source tree.
+ */
+
+// A simple audio struct.
+
+#pragma once
+#include
+#include
+#include
+
+namespace executorch {
+namespace extension {
+namespace llm {
+
+/**
+ * Audio inputs as a raw audio tensor, for use when the audio processing
+ * into a mel spectrogram is baked into the audio encoder with torch.export.
+ */
+struct ET_EXPERIMENTAL RawAudio {
+  std::vector data;
+  int32_t batch_size;
+  int32_t n_channels; // For mono, use n_channels = 1.
+  int32_t n_samples;
+};
+
+/**
+ * Pre-processed audio inputs, ready to feed directly into an audio
+ * encoder.
+ */
+struct ET_EXPERIMENTAL Audio {
+  std::vector data;
+  int32_t batch_size;
+  int32_t n_bins;
+  int32_t n_frames;
+};
+
+} // namespace llm
+} // namespace extension
+} // namespace executorch
+
+namespace torch {
+namespace executor {
+// TODO(T197294990): Remove these deprecated aliases once all users have moved
+// to the new `::executorch` namespaces.
+using ::executorch::extension::llm::Audio;
+} // namespace executor
+} // namespace torch
diff --git a/extension/llm/runner/constants.h b/extension/llm/runner/constants.h
index fc6ddcb451c..b26f319b5ec 100644
--- a/extension/llm/runner/constants.h
+++ b/extension/llm/runner/constants.h
@@ -21,7 +21,8 @@ inline constexpr auto kUseSDPAWithKVCache = "use_sdpa_with_kv_cache";
 
 // Multimodal method name conventions
 inline constexpr auto kImageEncoderMethod = "image_encoder";
-inline constexpr auto kTokenEmbeddingMethod = "token_embedding";
-inline constexpr auto kTextModelMethod = "text_model";
+inline constexpr auto kAudioEncoderMethod = "audio_encoder";
+inline constexpr auto kTokenEmbeddingMethod = "token_embeddings";
+inline constexpr auto kTextModelMethod = "decoder";
 
 } // namespace executorch::extension::llm
diff --git a/extension/llm/runner/multimodal_input.h b/extension/llm/runner/multimodal_input.h
index ae243992fec..728d8aef08f 100644
--- a/extension/llm/runner/multimodal_input.h
+++ b/extension/llm/runner/multimodal_input.h
@@ -11,6 +11,7 @@
 
 #pragma once
 
+#include
 #include
 #include
 #include
@@ -19,19 +20,31 @@
 namespace executorch::extension::llm {
 
 /**
- * A generic class to hold either image or text data for multimodal inputs.
- * This allows the generate() API to take a std::vector of these objects
- * instead of separate image and text parameters.
+ * A generic class to hold either image, text, or audio data for multimodal
+ * inputs. This allows the generate() API to take a std::vector of these objects
+ * instead of separate image, text, and audio parameters.
*/ class ET_EXPERIMENTAL MultimodalInput { public: - enum class Type { TEXT, IMAGE }; + /// Type of multimodal input data + enum class Type { + TEXT, ///< Text string input + IMAGE, ///< Processed image input + AUDIO, ///< Processed audio input + RAW_AUDIO, ///< Raw unprocessed audio input (straight from audio file) + UNSUPPORTED ///< Unsupported input type + }; // Constructors explicit MultimodalInput(const std::string& text) : data_(text) {} explicit MultimodalInput(std::string&& text) : data_(std::move(text)) {} explicit MultimodalInput(const Image& image) : data_(image) {} explicit MultimodalInput(Image&& image) : data_(std::move(image)) {} + explicit MultimodalInput(const Audio& audio) : data_(audio) {} + explicit MultimodalInput(Audio&& audio) : data_(std::move(audio)) {} + explicit MultimodalInput(const RawAudio& raw_audio) : data_(raw_audio) {} + explicit MultimodalInput(RawAudio&& raw_audio) + : data_(std::move(raw_audio)) {} // Copy constructor and assignment MultimodalInput(const MultimodalInput& other) = default; @@ -60,12 +73,37 @@ class ET_EXPERIMENTAL MultimodalInput { return std::holds_alternative(data_); } + /** + * Check if this input contains audio data. + * @return true if this input contains audio, false otherwise. + */ + bool is_audio() const noexcept { + return std::holds_alternative