From 12f2ee93730464af6aed20a05d11472713c818cd Mon Sep 17 00:00:00 2001
From: Jack Zhang <32371937+jackzhxng@users.noreply.github.com>
Date: Tue, 26 Aug 2025 13:29:41 -0700
Subject: [PATCH 1/4] Add audio to multimodal runner

ghstack-source-id: 7eb653c3276672fc80bb1cc400a9311fd9057d72
Pull Request resolved: https://github.com/pytorch/executorch/pull/13662
---
 extension/llm/runner/audio.h                  |  52 ++++++
 extension/llm/runner/constants.h              |   5 +-
 extension/llm/runner/multimodal_input.h       | 161 +++++++++++++++++-
 extension/llm/runner/multimodal_prefiller.cpp |  68 ++++++--
 4 files changed, 266 insertions(+), 20 deletions(-)
 create mode 100644 extension/llm/runner/audio.h
diff --git a/extension/llm/runner/audio.h b/extension/llm/runner/audio.h
new file mode 100644
index 00000000000..868765950af
--- /dev/null
+++ b/extension/llm/runner/audio.h
@@ -0,0 +1,52 @@
+/*
+ * Copyright (c) Meta Platforms, Inc. and affiliates.
+ * All rights reserved.
+ *
+ * This source code is licensed under the BSD-style license found in the
+ * LICENSE file in the root directory of this source tree.
+ */
+
+// A simple audio struct.
+
+#pragma once
+#include <executorch/runtime/platform/compiler.h>
+#include <cstdint>
+#include <vector>
+
+namespace executorch {
+namespace extension {
+namespace llm {
+
+/**
+ * Audio inputs as a raw audio tensor, for use when the audio processing
+ * into a mel spectrogram is baked into the audio encoder with torch.export.
+ */
+struct ET_EXPERIMENTAL RawAudio {
+  std::vector<uint8_t> data; // Raw bytes of the audio tensor.
+  int32_t batch_size = 0;
+  int32_t n_channels = 0; // For mono, use n_channels = 1.
+  int32_t n_samples = 0;
+};
+
+/**
+ * Pre-processed audio inputs, ready to feed directly into an audio
+ * encoder.
+ */
+struct ET_EXPERIMENTAL Audio {
+  std::vector<uint8_t> data; // Raw bytes of the feature tensor.
+  int32_t batch_size = 0;
+  int32_t n_bins = 0;
+  int32_t n_frames = 0;
+};
+
+} // namespace llm
+} // namespace extension
+} // namespace executorch
+
+namespace torch {
+namespace executor {
+// TODO(T197294990): Remove these deprecated aliases once all users have moved
+// to the new `::executorch` namespaces.
+using ::executorch::extension::llm::Audio;
+} // namespace executor
+} // namespace torch
diff --git a/extension/llm/runner/constants.h b/extension/llm/runner/constants.h
index fc6ddcb451c..b26f319b5ec 100644
--- a/extension/llm/runner/constants.h
+++ b/extension/llm/runner/constants.h
@@ -21,7 +21,8 @@ inline constexpr auto kUseSDPAWithKVCache = "use_sdpa_with_kv_cache";
 
 // Multimodal method name conventions
 inline constexpr auto kImageEncoderMethod = "image_encoder";
-inline constexpr auto kTokenEmbeddingMethod = "token_embedding";
-inline constexpr auto kTextModelMethod = "text_model";
+inline constexpr auto kAudioEncoderMethod = "audio_encoder";
+inline constexpr auto kTokenEmbeddingMethod = "token_embeddings";
+inline constexpr auto kTextModelMethod = "decoder";
 
 } // namespace executorch::extension::llm
diff --git a/extension/llm/runner/multimodal_input.h b/extension/llm/runner/multimodal_input.h
index ae243992fec..728d8aef08f 100644
--- a/extension/llm/runner/multimodal_input.h
+++ b/extension/llm/runner/multimodal_input.h
@@ -11,6 +11,7 @@
 
 #pragma once
 
+#include <executorch/extension/llm/runner/audio.h>
 #include <executorch/extension/llm/runner/image.h>
 #include <executorch/runtime/platform/compiler.h>
 #include <string>
@@ -19,19 +20,31 @@
 namespace executorch::extension::llm {
 
 /**
- * A generic class to hold either image or text data for multimodal inputs.
- * This allows the generate() API to take a std::vector of these objects
- * instead of separate image and text parameters.
+ * A generic class to hold either image, text, or audio data for multimodal
+ * inputs. This allows the generate() API to take a std::vector of these objects
+ * instead of separate image, text, and audio parameters.
  */
 class ET_EXPERIMENTAL MultimodalInput {
  public:
-  enum class Type { TEXT, IMAGE };
+  /// Type of multimodal input data
+  enum class Type {
+    TEXT, ///< Text string input
+    IMAGE, ///< Processed image input
+    AUDIO, ///< Processed audio input
+    RAW_AUDIO, ///< Raw unprocessed audio input (straight from audio file)
+    UNSUPPORTED ///< Unsupported input type
+  };
 
   // Constructors
   explicit MultimodalInput(const std::string& text) : data_(text) {}
   explicit MultimodalInput(std::string&& text) : data_(std::move(text)) {}
   explicit MultimodalInput(const Image& image) : data_(image) {}
   explicit MultimodalInput(Image&& image) : data_(std::move(image)) {}
+  explicit MultimodalInput(const Audio& audio) : data_(audio) {}
+  explicit MultimodalInput(Audio&& audio) : data_(std::move(audio)) {}
+  explicit MultimodalInput(const RawAudio& raw_audio) : data_(raw_audio) {}
+  explicit MultimodalInput(RawAudio&& raw_audio)
+      : data_(std::move(raw_audio)) {}
 
   // Copy constructor and assignment
   MultimodalInput(const MultimodalInput& other) = default;
@@ -60,12 +73,37 @@ class ET_EXPERIMENTAL MultimodalInput {
     return std::holds_alternative<Image>(data_);
   }
 
+  /**
+   * Check if this input contains audio data.
+   * @return true if this input contains audio, false otherwise.
+   */
+  bool is_audio() const noexcept {
+    return std::holds_alternative<Audio>(data_);
+  }
+
+  /**
+   * Check if this input contains raw audio data.
+   * @return true if this input contains raw audio, false otherwise.
+   */
+  bool is_raw_audio() const noexcept {
+    return std::holds_alternative<RawAudio>(data_);
+  }
+
   /**
    * Get the type of data stored in this input.
-   * @return Type::TEXT if text data, Type::IMAGE if image data.
+   * @return Type::TEXT if text data, Type::IMAGE if image data, Type::AUDIO if
+   * audio data, Type::RAW_AUDIO if raw audio data.
    */
   Type get_type() const noexcept {
-    return is_text() ? Type::TEXT : Type::IMAGE;
+    if (is_text())
+      return Type::TEXT;
+    if (is_image())
+      return Type::IMAGE;
+    if (is_audio())
+      return Type::AUDIO;
+    if (is_raw_audio())
+      return Type::RAW_AUDIO;
+    return Type::UNSUPPORTED;
   }
 
   /**
@@ -122,6 +160,60 @@ class ET_EXPERIMENTAL MultimodalInput {
     return std::get<Image>(std::move(data_));
   }
 
+  /**
+   * Get the audio data from this input.
+   * @return Reference to the stored Audio object.
+   * @throws std::bad_variant_access if this input doesn't contain audio.
+   */
+  const Audio& get_audio() const& {
+    return std::get<Audio>(data_);
+  }
+
+  /**
+   * Get the audio data from this input (mutable version).
+   * @return Mutable reference to the stored Audio object.
+   * @throws std::bad_variant_access if this input doesn't contain audio.
+   */
+  Audio& get_audio() & {
+    return std::get<Audio>(data_);
+  }
+
+  /**
+   * Get the audio data from this input (rvalue version).
+   * @return Rvalue reference to the stored Audio object for efficient moves.
+   * @throws std::bad_variant_access if this input doesn't contain audio.
+   */
+  Audio&& get_audio() && {
+    return std::get<Audio>(std::move(data_));
+  }
+
+  /**
+   * Get the raw audio data from this input.
+   * @return Reference to the stored RawAudio object.
+   * @throws std::bad_variant_access if this input doesn't contain raw audio.
+   */
+  const RawAudio& get_raw_audio() const& {
+    return std::get<RawAudio>(data_);
+  }
+
+  /**
+   * Get the raw audio data from this input (mutable version).
+   * @return Mutable reference to the stored RawAudio object.
+   * @throws std::bad_variant_access if this input doesn't contain raw audio.
+   */
+  RawAudio& get_raw_audio() & {
+    return std::get<RawAudio>(data_);
+  }
+
+  /**
+   * Get the raw audio data from this input (rvalue version).
+   * @return Rvalue reference to the stored RawAudio object for efficient moves.
+   * @throws std::bad_variant_access if this input doesn't contain raw audio.
+   */
+  RawAudio&& get_raw_audio() && {
+    return std::get<RawAudio>(std::move(data_));
+  }
+
   /**
    * Try to get the text data from this input safely.
    * @return Pointer to the text string if this input contains text, nullptr
@@ -158,8 +250,44 @@ class ET_EXPERIMENTAL MultimodalInput {
     return std::get_if<Image>(&data_);
   }
 
+  /**
+   * Try to get the audio data from this input safely.
+   * @return Pointer to the Audio object if this input contains audio,
+   * nullptr otherwise.
+   */
+  const Audio* try_get_audio() const noexcept {
+    return std::get_if<Audio>(&data_);
+  }
+
+  /**
+   * Try to get the audio data from this input safely (mutable version).
+   * @return Pointer to the Audio object if this input contains audio,
+   * nullptr otherwise.
+   */
+  Audio* try_get_audio() noexcept {
+    return std::get_if<Audio>(&data_);
+  }
+
+  /**
+   * Try to get the raw audio data from this input safely.
+   * @return Pointer to the RawAudio object if this input contains raw audio,
+   * nullptr otherwise.
+   */
+  const RawAudio* try_get_raw_audio() const noexcept {
+    return std::get_if<RawAudio>(&data_);
+  }
+
+  /**
+   * Try to get the raw audio data from this input safely (mutable version).
+   * @return Pointer to the RawAudio object if this input contains raw audio,
+   * nullptr otherwise.
+   */
+  RawAudio* try_get_raw_audio() noexcept {
+    return std::get_if<RawAudio>(&data_);
+  }
+
  private:
-  std::variant<std::string, Image> data_;
+  std::variant<std::string, Image, Audio, RawAudio> data_;
 };
 
 // Convenience factory functions
@@ -179,4 +307,21 @@ inline MultimodalInput make_image_input(Image&& image) noexcept {
   return MultimodalInput(std::move(image));
 }
 
-} // namespace executorch::extension::llm
\ No newline at end of file
+inline MultimodalInput make_audio_input(const Audio& audio) noexcept {
+  return MultimodalInput(audio);
+}
+
+inline MultimodalInput make_audio_input(Audio&& audio) noexcept {
+  return MultimodalInput(std::move(audio));
+}
+
+inline MultimodalInput make_raw_audio_input(
+    const RawAudio& raw_audio) noexcept {
+  return MultimodalInput(raw_audio);
+}
+
+inline MultimodalInput make_raw_audio_input(RawAudio&& raw_audio) noexcept {
+  return MultimodalInput(std::move(raw_audio));
+}
+
+} // namespace executorch::extension::llm
diff --git a/extension/llm/runner/multimodal_prefiller.cpp b/extension/llm/runner/multimodal_prefiller.cpp
index 7f69041551f..029d6105875 100644
--- a/extension/llm/runner/multimodal_prefiller.cpp
+++ b/extension/llm/runner/multimodal_prefiller.cpp
@@ -37,7 +37,7 @@ MultimodalPrefiller::MultimodalPrefiller(
 Result<uint64_t> MultimodalPrefiller::prefill(
     const MultimodalInput& input,
     int64_t& start_pos) {
-  // Check if input is image
+  // 1. Run encoder model.
   ::executorch::runtime::EValue encoder_output;
   if (input.is_image()) {
     Image image = input.get_image();
@@ -51,34 +51,77 @@ Result<uint64_t> MultimodalPrefiller::prefill(
         ET_UNWRAP(module_->execute(kImageEncoderMethod, image_tensor));
 
     encoder_output = image_encoder_outputs[0];
+  } else if (input.is_audio()) {
+    Audio audio = input.get_audio();
+
+    // View the byte buffer as a float tensor [batch_size, n_bins, n_frames].
+    auto audio_tensor = executorch::extension::from_blob(
+        audio.data.data(),
+        {audio.batch_size, audio.n_bins, audio.n_frames},
+        ::executorch::aten::ScalarType::Float);
+
+    // Run audio encoder, propagating its error code unchanged on failure.
+    auto audio_encoder_result =
+        module_->execute(kAudioEncoderMethod, audio_tensor);
+    if (audio_encoder_result.error() != ::executorch::runtime::Error::Ok) {
+      return audio_encoder_result.error();
+    }
+    auto audio_encoder_outputs = audio_encoder_result.get();
+
+    encoder_output = audio_encoder_outputs[0];
   } else if (input.is_text()) {
-    // For text input, we don't need to run the image encoder.
-    // Instead, we run the text encoder to get the encoder output.
     auto& text = input.get_text();
     std::vector<uint64_t> tokens =
         ET_UNWRAP_TOKENIZER(tokenizer_->encode(text));
+
     auto text_tensor = executorch::extension::from_blob(
         tokens.data(),
         {1, static_cast<aten::SizesType>(tokens.size())},
         ::executorch::aten::ScalarType::Long);
 
-    // Run token embedding
+    // Run text encoder (token embeddings)
     auto token_embedding_outputs =
         ET_UNWRAP(module_->execute(kTokenEmbeddingMethod, text_tensor));
 
     encoder_output = token_embedding_outputs[0];
   } else {
     ET_LOG(Error, "Unsupported input type");
-    // For all other input types (e.g., audio), return error
+    // For any other input types, return error
     return ::executorch::runtime::Error::NotSupported;
   }
 
-  auto outputs_res =
-      ET_UNWRAP(text_decoder_runner_->decode(encoder_output, start_pos));
+  // 2. Run decoder model for prefill. `cache_position` holds the seq_len
+  // positions starting at start_pos, e.g. start_pos = 2, seq_len = 5 gives
+  // [2, 3, 4, 5, 6].
+  int64_t seq_len = encoder_output.toTensor().size(1);
+  if (seq_len == 0) {
+    ET_LOG(Error, "The encoder returned an empty output.");
+    return ::executorch::runtime::Error::InvalidState;
+  }
+  std::vector<int64_t> cache_positions(seq_len);
+  for (int64_t i = 0; i < seq_len; ++i) {
+    cache_positions[i] = start_pos + i;
+  }
+  // SizesType may be narrower than int64_t; braces forbid implicit narrowing.
+  auto cache_position_tensor = ::executorch::extension::from_blob(
+      cache_positions.data(),
+      {static_cast<aten::SizesType>(seq_len)},
+      executorch::aten::ScalarType::Long);
+  auto prefill_result = module_->execute(
+      kTextModelMethod, {cache_position_tensor, encoder_output});
+  if (prefill_result.error() != ::executorch::runtime::Error::Ok) {
+    return prefill_result.error();
+  }
+  auto prefill_outputs = prefill_result.get();
+  if (prefill_outputs.empty()) { // No output tensor to read logits from.
+    ET_LOG(
+        Error, "Encoder returned empty results when used to prefill decoder");
+    return ::executorch::runtime::Error::InvalidState;
+  }
+  auto outputs_res = prefill_outputs[0].toTensor();
 
-  // Update the start_pos, which is only available inside this function.
-  // outputs_res can have only one logits.
-  start_pos += encoder_output.toTensor().size(1);
+  // Update start_pos, tracking the current cache position.
+  start_pos += seq_len;
 
   return static_cast<uint64_t>(
       text_decoder_runner_->logits_to_token(outputs_res));
@@ -103,6 +146,11 @@ ::executorch::runtime::Error MultimodalPrefiller::load() {
   if (methods.find(kImageEncoderMethod) != methods.end()) {
     ET_CHECK_OK_OR_RETURN_ERROR(module_->load_method(kImageEncoderMethod));
   }
+
+  if (methods.find(kAudioEncoderMethod) != methods.end()) {
+    ET_CHECK_OK_OR_RETURN_ERROR(module_->load_method(kAudioEncoderMethod));
+  }
+
   return ::executorch::runtime::Error::Ok;
 }
 

From 049515f5b22c219290869cab39ea18d5ca1dcc89 Mon Sep 17 00:00:00 2001
From: Jack Zhang <32371937+jackzhxng@users.noreply.github.com>
Date: Tue, 26 Aug 2025 13:29:42 -0700
Subject: [PATCH 2/4] Add Voxtral runner

ghstack-source-id: 81fadce7ec33d63ecc3deecc74fd176845883425
Pull Request resolved: https://github.com/pytorch/executorch/pull/13663
---
 examples/models/voxtral/CMakeLists.txt     |  99 ++++++++++
 examples/models/voxtral/multimodal.cpp     | 217 +++++++++++++++++++++
 extension/llm/runner/llm_runner_helper.cpp |  57 +++++-
 extension/llm/runner/llm_runner_helper.h   |  11 +-
 4 files changed, 374 insertions(+), 10 deletions(-)
 create mode 100644 examples/models/voxtral/CMakeLists.txt
 create mode 100644 examples/models/voxtral/multimodal.cpp

diff --git a/examples/models/voxtral/CMakeLists.txt b/examples/models/voxtral/CMakeLists.txt
new file mode 100644
index 00000000000..1a5faf3d350
--- /dev/null
+++ b/examples/models/voxtral/CMakeLists.txt
@@ -0,0 +1,99 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+# All rights reserved.
+#
+# This source code is licensed under the BSD-style license found in the
+# LICENSE file in the root directory of this source tree.
+
+#
+# Simple CMake build system for voxtral runner.
+#
+cmake_minimum_required(VERSION 3.24)
+project(voxtral)
+
+set(EXECUTORCH_ROOT ${CMAKE_CURRENT_SOURCE_DIR}/../../..)
+
+include(${EXECUTORCH_ROOT}/tools/cmake/Utils.cmake)
+
+if(CMAKE_TOOLCHAIN_FILE MATCHES ".*(iOS|ios\\.toolchain)\\.cmake$")
+  set(CMAKE_TOOLCHAIN_IOS ON)
+else()
+  set(CMAKE_TOOLCHAIN_IOS OFF)
+endif()
+
+# Let files say "include <executorch/path/to/header.h>"
+set(_common_include_directories ${EXECUTORCH_ROOT}/..)
+
+# Need this for gflags for some reason
+set(gflags_DIR ${CMAKE_CURRENT_BINARY_DIR}/../../../third-party/gflags)
+find_package(gflags REQUIRED)
+
+# Find `executorch` libraries, same as for gflags
+list(APPEND CMAKE_FIND_ROOT_PATH ${CMAKE_CURRENT_BINARY_DIR}/../../..)
+find_package(executorch CONFIG REQUIRED FIND_ROOT_PATH_BOTH)
+executorch_target_link_options_shared_lib(executorch)
+
+set(LINK_LIBS executorch gflags)
+set(link_libraries ${LINK_LIBS})
+set(_srcs multimodal.cpp)
+
+list(
+  APPEND
+  link_libraries
+  optimized_native_cpu_ops_lib
+  quantized_ops_lib
+  custom_ops
+  cpublas
+  eigen_blas
+)
+executorch_target_link_options_shared_lib(optimized_native_cpu_ops_lib)
+executorch_target_link_options_shared_lib(quantized_ops_lib)
+executorch_target_link_options_shared_lib(custom_ops)
+
+# XNNPACK
+if(TARGET xnnpack_backend)
+  set(xnnpack_backend_libs xnnpack_backend XNNPACK xnnpack-microkernels-prod)
+  if(TARGET kleidiai)
+    list(APPEND xnnpack_backend_libs kleidiai)
+  endif()
+  list(APPEND link_libraries ${xnnpack_backend_libs})
+  executorch_target_link_options_shared_lib(xnnpack_backend)
+endif()
+
+# Add LLM runner and extension module
+if(NOT TARGET extension_llm_runner)
+  message(
+    FATAL_ERROR
+      "ExecuTorch must be installed with EXECUTORCH_BUILD_EXTENSION_LLM_RUNNER enabled."
+  )
+endif()
+
+# Needed for cpuinfo where it uses android specific log lib
+if(ANDROID)
+  list(APPEND link_libraries log)
+endif()
+
+# Add the required ExecuTorch extensions for multimodal LLM runner
+list(
+  APPEND
+  link_libraries
+  extension_llm_runner
+  extension_module
+  extension_data_loader
+  extension_tensor
+  extension_flat_tensor
+)
+
+# Add tokenizers
+list(APPEND link_libraries tokenizers::tokenizers)
+
+add_executable(voxtral_runner ${_srcs})
+if(NOT CMAKE_BUILD_TYPE STREQUAL "Debug")
+  target_link_options_gc_sections(voxtral_runner)
+  if(NOT APPLE)
+    target_link_options(voxtral_runner PRIVATE "LINKER:-s")
+  endif()
+endif()
+
+target_include_directories(voxtral_runner PUBLIC ${_common_include_directories})
+target_link_libraries(voxtral_runner PUBLIC ${link_libraries})
+target_compile_options(voxtral_runner PUBLIC ${_common_compile_options})
diff --git a/examples/models/voxtral/multimodal.cpp b/examples/models/voxtral/multimodal.cpp
new file mode 100644
index 00000000000..d7183f3c662
--- /dev/null
+++ b/examples/models/voxtral/multimodal.cpp
@@ -0,0 +1,217 @@
+/*
+ * Copyright (c) Meta Platforms, Inc. and affiliates.
+ * All rights reserved.
+ *
+ * This source code is licensed under the BSD-style license found in the
+ * LICENSE file in the root directory of this source tree.
+ */
+
+#include <cmath>
+#include <cstring>
+#include <fstream>
+
+#include <gflags/gflags.h>
+
+#include <executorch/extension/llm/runner/audio.h>
+#include <executorch/extension/llm/runner/image.h>
+#include <executorch/extension/llm/runner/llm_runner_helper.h>
+#include <executorch/extension/llm/runner/multimodal_input.h>
+#include <executorch/extension/llm/runner/multimodal_runner.h>
+#include <executorch/runtime/core/error.h>
+#include <executorch/runtime/platform/log.h>
+
+#if defined(ET_USE_THREADPOOL)
+#include <executorch/extension/threadpool/cpuinfo_utils.h>
+#include <executorch/extension/threadpool/threadpool.h>
+#endif
+
+DEFINE_string(
+    model_path,
+    "multimodal.pte",
+    "Model serialized in flatbuffer format.");
+
+DEFINE_string(tokenizer_path, "tekken.json", "Tokenizer stuff.");
+
+DEFINE_string(prompt, "What is happening in this audio?", "Text prompt.");
+
+DEFINE_string(audio_path, "", "Path to input audio file.");
+
+DEFINE_double(
+    temperature,
+    0.8f,
+    "Temperature; Default is 0.8f. 0 = greedy argmax sampling (deterministic). Lower temperature = more deterministic");
+
+DEFINE_int32(
+    cpu_threads,
+    -1,
+    "Number of CPU threads for inference. Defaults to -1, which implies we'll use a heuristic to derive the # of performant cores for a specific device.");
+
+DEFINE_bool(warmup, false, "Whether to run a warmup run.");
+
+namespace {
+
+using ::executorch::extension::llm::Image;
+using ::executorch::extension::llm::make_image_input;
+using ::executorch::extension::llm::make_text_input;
+using ::executorch::extension::llm::MultimodalInput;
+
+bool ends_with(const std::string& str, const std::string& suffix) {
+  const bool fits = suffix.size() <= str.size();
+  return fits && str.rfind(suffix) == str.size() - suffix.size();
+}
+
+/**
+ * @brief Loads preprocessed audio data from a binary file
+ *
+ * Reads pre-computed mel spectrogram features saved as raw floats, e.g. via:
+ *   with open("tensor.bin", "wb") as f:
+ *       f.write(t.numpy().tobytes())
+ * The floats are batched into chunks of 128 bins x 3000 frames, rounding up;
+ * a partial final chunk is zero-padded.
+ *
+ * @param audio_path Path to the binary audio file (.bin)
+ * @return MultimodalInput containing the loaded audio data
+ * @throws std::runtime_error if the file cannot be opened or is empty
+ */
+MultimodalInput loadPreprocessedAudio(const std::string& audio_path) {
+  constexpr int32_t n_bins = 128;
+  constexpr int32_t n_frames = 3000;
+  constexpr std::size_t chunk_bytes = sizeof(float) * n_bins * n_frames;
+  // Open at the end so tellg() reports the file size in bytes.
+  std::ifstream f(audio_path, std::ios::binary | std::ios::ate);
+  const std::streamoff file_size = f.tellg(); // -1 if the open failed.
+  if (file_size <= 0) {
+    ET_LOG(Error, "Failed to read audio file: %s", audio_path.c_str());
+    throw std::runtime_error("Failed to read audio file");
+  }
+  f.seekg(0, std::ios::beg);
+
+  // Integer ceiling division: std::ceil on an integer quotient is a no-op.
+  const std::size_t n_bytes = static_cast<std::size_t>(file_size);
+  const std::size_t n_chunks = (n_bytes + chunk_bytes - 1) / chunk_bytes;
+  ::executorch::extension::llm::Audio audio;
+  audio.batch_size = static_cast<int32_t>(n_chunks);
+  audio.n_bins = n_bins;
+  audio.n_frames = n_frames;
+  audio.data.resize(n_chunks * chunk_bytes); // Zero-fills any padding.
+  f.read(reinterpret_cast<char*>(audio.data.data()), file_size);
+  ET_LOG(Info, "audio_data len = %zu", audio.data.size() / sizeof(float));
+  return ::executorch::extension::llm::make_audio_input(std::move(audio));
+}
+
+/**
+ * @brief Processes audio files for multimodal input
+ *
+ * Dispatches audio file processing based on file extension:
+ * - .bin files: Loads preprocessed mel spectrogram features directly
+ * - .wav/.mp3 files: Currently unsupported, throws runtime_error
+ *
+ * This function provides an interface for different audio input formats
+ * and can be extended to support raw audio processing in the future.
+ *
+ * @param audio_path Path to the audio file
+ * @return MultimodalInput containing the processed audio data
+ * @throws std::runtime_error if file format is unsupported or processing fails
+ */
+MultimodalInput processAudioFile(const std::string& audio_path) {
+  // Preprocessed mel spectrogram features stored as a binary file.
+  if (ends_with(audio_path, ".bin")) {
+    return loadPreprocessedAudio(audio_path);
+  }
+  // Raw audio containers are recognized but cannot be decoded yet.
+  if (ends_with(audio_path, ".wav") || ends_with(audio_path, ".mp3")) {
+    ET_LOG(Error, "Raw audio file processing (.wav/.mp3) is not yet supported");
+    throw std::runtime_error("Raw audio file processing not supported");
+  }
+  ET_LOG(Error, "Unsupported audio file format: %s", audio_path.c_str());
+  throw std::runtime_error("Unsupported audio file format");
+}
+
+} // namespace
+
+int32_t main(int32_t argc, char** argv) {
+  gflags::ParseCommandLineFlags(&argc, &argv, true);
+
+  const char* model_path = FLAGS_model_path.c_str();
+
+  const char* tokenizer_path = FLAGS_tokenizer_path.c_str();
+  const char* prompt = FLAGS_prompt.c_str();
+  const char* audio_path = FLAGS_audio_path.c_str();
+  float temperature = FLAGS_temperature;
+  int32_t cpu_threads = FLAGS_cpu_threads;
+  bool warmup = FLAGS_warmup;
+
+#if defined(ET_USE_THREADPOOL)
+  uint32_t num_performant_cores = cpu_threads == -1
+      ? ::executorch::extension::cpuinfo::get_num_performant_cores()
+      : static_cast<uint32_t>(cpu_threads);
+  ET_LOG(
+      Info, "Resetting threadpool with num threads = %d", num_performant_cores);
+  if (num_performant_cores > 0) {
+    ::executorch::extension::threadpool::get_threadpool()
+        ->_unsafe_reset_threadpool(num_performant_cores);
+  }
+#endif
+
+  // Load tokenizer
+  std::unique_ptr<::tokenizers::Tokenizer> tokenizer =
+      ::executorch::extension::llm::load_tokenizer(tokenizer_path);
+  if (tokenizer == nullptr) {
+    ET_LOG(Error, "Failed to load tokenizer from: %s", tokenizer_path);
+    return 1;
+  }
+
+  // Create multimodal runner
+  std::unique_ptr<::executorch::extension::llm::MultimodalRunner> runner =
+      ::executorch::extension::llm::create_multimodal_runner(
+          model_path, std::move(tokenizer));
+  if (runner == nullptr) {
+    ET_LOG(Error, "Failed to create multimodal runner");
+    return 1;
+  }
+
+  // Load runner
+  auto load_error = runner->load();
+  if (load_error != ::executorch::runtime::Error::Ok) {
+    ET_LOG(Error, "Failed to load multimodal runner");
+    return 1;
+  }
+
+  // Prepare inputs
+  std::vector<MultimodalInput> inputs;
+
+  // 1. Add start bos-related text inputs and modality start token.
+  inputs.emplace_back(make_text_input("<s>[INST][BEGIN_AUDIO]"));
+
+  // 2. Add audio input
+  inputs.emplace_back(processAudioFile(audio_path));
+
+  // 3. Add text input (the actual user-submitted prompt)
+  inputs.emplace_back(make_text_input(std::string(prompt) + "[/INST]"));
+
+  ::executorch::extension::llm::GenerationConfig config;
+  config.max_new_tokens = 100;
+  config.temperature = temperature;
+
+  // Run warmup if requested
+  if (warmup) {
+    ET_LOG(Info, "Running warmup...");
+    auto warmup_error = runner->generate(inputs, config);
+    if (warmup_error != ::executorch::runtime::Error::Ok) {
+      ET_LOG(Error, "Failed to run warmup");
+      return 1;
+    }
+    runner->reset();
+  }
+
+  // Generate
+  ET_LOG(Info, "Starting generation...");
+  auto error = runner->generate(inputs, config);
+  if (error != ::executorch::runtime::Error::Ok) {
+    ET_LOG(Error, "Failed to generate with multimodal runner");
+    return 1;
+  }
+
+  printf("\n");
+  return 0;
+}
diff --git a/extension/llm/runner/llm_runner_helper.cpp b/extension/llm/runner/llm_runner_helper.cpp
index 2e17e518c4a..919e92cf39a 100644
--- a/extension/llm/runner/llm_runner_helper.cpp
+++ b/extension/llm/runner/llm_runner_helper.cpp
@@ -17,10 +17,12 @@
 #include <executorch/extension/llm/runner/text_llm_runner.h>
 #include <executorch/extension/llm/runner/text_prefiller.h>
 #include <executorch/extension/llm/runner/text_token_generator.h>
+#include <executorch/runtime/core/result.h>
 #include <executorch/runtime/platform/runtime.h>
 #include <pytorch/tokenizers/hf_tokenizer.h>
 #include <pytorch/tokenizers/llama2c_tokenizer.h>
 #include <pytorch/tokenizers/sentencepiece.h>
+#include <pytorch/tokenizers/tekken.h>
 #include <pytorch/tokenizers/tiktoken.h>
 
 namespace executorch::extension::llm {
@@ -35,6 +37,18 @@ std::unique_ptr<tokenizers::Tokenizer> load_tokenizer(
     size_t bos_token_index,
     size_t eos_token_index) {
   runtime::runtime_init();
+  // Only try tekken for paths ending in "tekken.json", so it cannot
+  // accidentally successfully load a HuggingFace tokenizer (also .json).
+  const std::string tekken_name = "tekken.json";
+  if (tokenizer_path.size() >= tekken_name.size() &&
+      tokenizer_path.rfind(tekken_name) ==
+          tokenizer_path.size() - tekken_name.size()) {
+    auto tekken_tokenizer = std::make_unique<tokenizers::Tekken>();
+    if (tekken_tokenizer->load(tokenizer_path) == ::tokenizers::Error::Ok) {
+      ET_LOG(Info, "Loaded tekken tokenizer");
+      return tekken_tokenizer;
+    }
+  }
   auto json_tokenizer = std::make_unique<tokenizers::HFTokenizer>();
   if (json_tokenizer->load(tokenizer_path) == ::tokenizers::Error::Ok) {
     ET_LOG(Info, "Loaded json tokenizer");
@@ -73,9 +87,8 @@ std::unique_ptr<tokenizers::Tokenizer> load_tokenizer(
   return nullptr;
 }
 
-std::unordered_map<std::string, int64_t> get_llm_metadata(
-    tokenizers::Tokenizer* tokenizer,
-    Module* module) {
+::executorch::runtime::Result<std::unordered_map<std::string, int64_t>>
+get_llm_metadata(tokenizers::Tokenizer* tokenizer, Module* module) {
   // Initialize metadata with default values
   std::unordered_map<std::string, int64_t> metadata({
       {llm::kEnableDynamicShape, false},
@@ -89,10 +102,20 @@ std::unordered_map<std::string, int64_t> get_llm_metadata(
   auto method_names_result = module->method_names();
   if (method_names_result.error() != Error::Ok) {
     ET_LOG(Error, "Failed reading method names");
-    return metadata;
+    return ::executorch::runtime::Error::InvalidArgument;
   }
   const auto& method_names = method_names_result.get();
 
+  // Error out if the max seq len metadata method is not present, since
+  // it is hard to figure out from just the .pte itself.
+  if (!method_names.count(llm::kMaxSeqLen)) {
+    ET_LOG(
+        Error,
+        "Required metadata method %s not found in model",
+        llm::kMaxSeqLen);
+    return ::executorch::runtime::Error::InvalidArgument;
+  }
+
   for (auto& pair : metadata) {
     const auto& method_name = pair.first;
     auto& value = pair.second;
@@ -109,6 +132,18 @@ std::unordered_map<std::string, int64_t> get_llm_metadata(
     }
     ET_LOG(Info, "Metadata: %s = %" PRId64, method_name.c_str(), value);
   }
+
+  // If kMaxContextLen method not found but kMaxSeqLen is
+  // available, set kMaxContextLen to the value of kMaxSeqLen.
+  if (!method_names.count(llm::kMaxContextLen) &&
+      method_names.count(llm::kMaxSeqLen)) {
+    metadata[llm::kMaxContextLen] = metadata[llm::kMaxSeqLen];
+    ET_LOG(
+        Info,
+        "Setting kMaxContextLen to kMaxSeqLen value: %" PRId64,
+        metadata[llm::kMaxContextLen]);
+  }
+
   // Set tokenizer-related metadata
   metadata[llm::kBosId] = tokenizer->bos_tok();
   metadata[llm::kVocabSize] = tokenizer->vocab_size();
@@ -165,7 +200,12 @@ std::unique_ptr<TextLLMRunner> create_text_llm_runner(
 
   // Get metadata from Module
   ET_LOG(Info, "Reading metadata from model");
-  auto metadata = llm::get_llm_metadata(tokenizer.get(), module.get());
+  auto metadata_result = llm::get_llm_metadata(tokenizer.get(), module.get());
+  if (metadata_result.error() != Error::Ok) {
+    ET_LOG(Error, "Failed to get metadata from model");
+    return nullptr;
+  }
+  auto metadata = metadata_result.get();
 
   auto eos_ids = std::make_unique<std::unordered_set<uint64_t>>(
       llm::get_eos_ids(tokenizer.get(), module.get()));
@@ -228,7 +268,12 @@ std::unique_ptr<MultimodalRunner> create_multimodal_runner(
 
   // Get metadata from Module
   ET_LOG(Info, "Reading metadata from model");
-  auto metadata = get_llm_metadata(tokenizer.get(), module.get());
+  auto metadata_result = get_llm_metadata(tokenizer.get(), module.get());
+  if (metadata_result.error() != Error::Ok) {
+    ET_LOG(Error, "Failed to get metadata from model");
+    return nullptr;
+  }
+  auto metadata = metadata_result.get();
 
   auto eos_ids = std::make_unique<std::unordered_set<uint64_t>>(
       get_eos_ids(tokenizer.get(), module.get()));
diff --git a/extension/llm/runner/llm_runner_helper.h b/extension/llm/runner/llm_runner_helper.h
index 5ca96b3bb96..191ea3ab090 100644
--- a/extension/llm/runner/llm_runner_helper.h
+++ b/extension/llm/runner/llm_runner_helper.h
@@ -19,6 +19,7 @@
 
 #include <executorch/extension/llm/runner/constants.h>
 #include <executorch/extension/module/module.h>
+#include <executorch/runtime/core/result.h>
 #include <executorch/runtime/platform/compiler.h>
 #include <pytorch/tokenizers/tokenizer.h>
 
@@ -59,11 +60,13 @@ ET_EXPERIMENTAL std::unique_ptr<tokenizers::Tokenizer> load_tokenizer(
  *
  * @param tokenizer Initialized tokenizer instance
  * @param module The model module
- * @return std::unordered_map<std::string, int64_t> Metadata key-value pairs
+ * @return Result<std::unordered_map<std::string, int64_t>> Metadata key-value
+ * pairs on success, or Error::InvalidArgument if required metadata (e.g.,
+ * kMaxSeqLen) is missing from the model
  */
-ET_EXPERIMENTAL std::unordered_map<std::string, int64_t> get_llm_metadata(
-    tokenizers::Tokenizer* tokenizer,
-    Module* module);
+ET_EXPERIMENTAL ::executorch::runtime::Result<
+    std::unordered_map<std::string, int64_t>>
+get_llm_metadata(tokenizers::Tokenizer* tokenizer, Module* module);
 
 /**
  * @brief Gets EOS token IDs from the model and tokenizer

From 70560f5a003fda01ee0e509a2d5e8447b20727a2 Mon Sep 17 00:00:00 2001
From: Jack Zhang <32371937+jackzhxng@users.noreply.github.com>
Date: Tue, 26 Aug 2025 13:29:42 -0700
Subject: [PATCH 3/4] Make token and stat callback optional in multimodal
 runner

ghstack-source-id: f794b8844244c9ed5c8019827d1bfdc761b8d096
Pull Request resolved: https://github.com/pytorch/executorch/pull/13664
---
 extension/llm/runner/multimodal_runner.cpp  | 4 ++--
 extension/llm/runner/multimodal_runner.h    | 4 ++--
 extension/llm/runner/text_token_generator.h | 6 +++---
 3 files changed, 7 insertions(+), 7 deletions(-)

diff --git a/extension/llm/runner/multimodal_runner.cpp b/extension/llm/runner/multimodal_runner.cpp
index 2bc658692da..f6b29d42c09 100644
--- a/extension/llm/runner/multimodal_runner.cpp
+++ b/extension/llm/runner/multimodal_runner.cpp
@@ -65,8 +65,8 @@ Error MultimodalRunner::load() {
 Error MultimodalRunner::generate(
     const std::vector<MultimodalInput>& inputs,
     const GenerationConfig& config,
-    std::function<void(const std::string&)>& token_callback,
-    std::function<void(const Stats&)>& stats_callback) {
+    std::function<void(const std::string&)> token_callback,
+    std::function<void(const Stats&)> stats_callback) {
   if (inputs.empty()) {
     ET_LOG(Error, "MultimodalInput vector cannot be empty");
     return Error::InvalidArgument;
diff --git a/extension/llm/runner/multimodal_runner.h b/extension/llm/runner/multimodal_runner.h
index 186a5bf70e4..fe5d1d7f1d7 100644
--- a/extension/llm/runner/multimodal_runner.h
+++ b/extension/llm/runner/multimodal_runner.h
@@ -116,8 +116,8 @@ class ET_EXPERIMENTAL MultimodalRunner {
   virtual ::executorch::runtime::Error generate(
       const std::vector<MultimodalInput>& inputs,
       const GenerationConfig& config,
-      std::function<void(const std::string&)>& token_callback,
-      std::function<void(const Stats&)>& stats_callback);
+      std::function<void(const std::string&)> token_callback = {},
+      std::function<void(const Stats&)> stats_callback = {});
 
   inline void stop() {
     text_token_generator_->stop();
diff --git a/extension/llm/runner/text_token_generator.h b/extension/llm/runner/text_token_generator.h
index 1a05921ed3a..a57961ee1d2 100644
--- a/extension/llm/runner/text_token_generator.h
+++ b/extension/llm/runner/text_token_generator.h
@@ -36,9 +36,9 @@ class ET_EXPERIMENTAL TextTokenGenerator {
 
   /**
    * Token generation loop.
-   * @param tokens prompt tokens as well as the first token generated by
-   * prefill.
-   * @param start_pos the start position of the new tokens, based on how many
+   * @param tokens The first token generated by prefill, if using kv cache. Else
+   * the prompt tokens + the first token generated by prefill.
+   * @param start_pos The start position of the new tokens, based on how many
    * prompt tokens is prefilled.
    * @param max_new_tokens Maximum number of new tokens to generate.
    * @param temperature controls the randomness of predictions by scaling the

From b9c1fe240f680e4f13407d8b2884196a748b978d Mon Sep 17 00:00:00 2001
From: Jack Zhang <32371937+jackzhxng@users.noreply.github.com>
Date: Tue, 26 Aug 2025 14:45:45 -0700
Subject: [PATCH 4/4] Add multimodal runner tests

---
 extension/llm/runner/test/CMakeLists.txt      |   2 +-
 extension/llm/runner/test/targets.bzl         |  10 +
 .../runner/test/test_multimodal_prefiller.cpp | 402 ++++++++++++++++++
 3 files changed, 413 insertions(+), 1 deletion(-)
 create mode 100644 extension/llm/runner/test/test_multimodal_prefiller.cpp

diff --git a/extension/llm/runner/test/CMakeLists.txt b/extension/llm/runner/test/CMakeLists.txt
index 2aa18000831..dd7d6b86640 100644
--- a/extension/llm/runner/test/CMakeLists.txt
+++ b/extension/llm/runner/test/CMakeLists.txt
@@ -19,7 +19,7 @@ include(${EXECUTORCH_ROOT}/tools/cmake/Test.cmake)
 
 set(_test_srcs
     test_generation_config.cpp test_text_llm_runner.cpp test_text_prefiller.cpp
-    test_text_decoder_runner.cpp test_multimodal_input.cpp
+    test_text_decoder_runner.cpp test_multimodal_input.cpp test_multimodal_prefiller.cpp
 )
 
 # Add LSan stub for Apple platforms
diff --git a/extension/llm/runner/test/targets.bzl b/extension/llm/runner/test/targets.bzl
index 3339b3b8584..e4e7f114aae 100644
--- a/extension/llm/runner/test/targets.bzl
+++ b/extension/llm/runner/test/targets.bzl
@@ -44,3 +44,13 @@ def define_common_targets():
             "//executorch/extension/llm/runner:multimodal_runner_lib",
         ],
     )
+
+    runtime.cxx_test(
+        name = "test_multimodal_prefiller",
+        srcs = ["test_multimodal_prefiller.cpp"],
+        deps = [
+            "//executorch/extension/llm/runner:multimodal_runner_lib",
+            "//executorch/extension/llm/runner/io_manager:io_manager",
+            "//executorch/runtime/core/exec_aten/testing_util:tensor_util",
+        ],
+    )
diff --git a/extension/llm/runner/test/test_multimodal_prefiller.cpp b/extension/llm/runner/test/test_multimodal_prefiller.cpp
new file mode 100644
index 00000000000..28fd13fd2a1
--- /dev/null
+++ b/extension/llm/runner/test/test_multimodal_prefiller.cpp
@@ -0,0 +1,402 @@
+/*
+ * Copyright (c) Meta Platforms, Inc. and affiliates.
+ * All rights reserved.
+ *
+ * This source code is licensed under the BSD-style license found in the
+ * LICENSE file in the root directory of this source tree.
+ * @lint-ignore-every CLANGTIDY facebook-hte-Deprecated
+ */
+
+#include <executorch/extension/llm/runner/audio.h>
+#include <executorch/extension/llm/runner/constants.h>
+#include <executorch/extension/llm/runner/image.h>
+#include <executorch/extension/llm/runner/io_manager/io_manager.h>
+#include <executorch/extension/llm/runner/multimodal_decoder_runner.h>
+#include <executorch/extension/llm/runner/multimodal_input.h>
+#include <executorch/extension/llm/runner/multimodal_prefiller.h>
+#include <executorch/extension/module/module.h>
+#include <executorch/runtime/core/exec_aten/testing_util/tensor_factory.h>
+#include <executorch/runtime/platform/runtime.h>
+#include <gmock/gmock.h>
+#include <gtest/gtest.h>
+#include <pytorch/tokenizers/tokenizer.h>
+
+using namespace ::testing;
+using executorch::extension::Module;
+using executorch::extension::llm::Audio;
+using executorch::extension::llm::Image;
+using executorch::extension::llm::IOManager;
+using executorch::extension::llm::MultimodalDecoderRunner;
+using executorch::extension::llm::MultimodalInput;
+using executorch::extension::llm::MultimodalPrefiller;
+using executorch::extension::llm::kAudioEncoderMethod;
+using executorch::extension::llm::kImageEncoderMethod;
+using executorch::extension::llm::kTextModelMethod;
+using executorch::extension::llm::kTokenEmbeddingMethod;
+using executorch::runtime::Error;
+using executorch::runtime::EValue;
+using executorch::runtime::Result;
+using executorch::runtime::testing::TensorFactory;
+
+namespace {
+
+// Mock classes for dependencies
+class MockModule : public Module {
+ public:
+  MockModule() : Module("") {}
+  // gMock hooks for the Module calls the tests below set expectations on.
+  // NOTE(review): each `override` needs a matching virtual in Module — confirm against module.h.
+  MOCK_METHOD(
+      Result<std::vector<EValue>>,
+      execute,
+      (const std::string&, const std::vector<EValue>&),
+      (override));
+  MOCK_METHOD(
+      Error,
+      load_method,
+      (const std::string&),
+      (override));
+  MOCK_METHOD(bool, is_method_loaded, (const std::string&), (const, override));
+  MOCK_METHOD(
+      Result<std::unordered_set<std::string>>,
+      method_names,
+      (),
+      (override));
+};
+
+class MockTokenizer : public ::tokenizers::Tokenizer {
+ public:
+  // gMock hook for encode(); PrefillTextInput stubs it to return fixed token ids.
+  MOCK_METHOD(
+      ::tokenizers::Result<std::vector<uint64_t>>,
+      encode,
+      (const std::string&, int8_t, int8_t),
+      (const)); // NOTE(review): no `override` here — confirm this matches Tokenizer::encode.
+};
+
+class MockMultimodalDecoderRunner : public MultimodalDecoderRunner {
+ public:
+  MockMultimodalDecoderRunner() : MultimodalDecoderRunner(nullptr, nullptr) {}
+  // gMock hook for logits_to_token(); the prefill tests stub its return value.
+  // NOTE(review): no `override` — this only intercepts calls if the base method is virtual; confirm.
+  MOCK_METHOD(int32_t, logits_to_token, (executorch::aten::Tensor&, float), ());
+};
+
+// Test fixture: SetUp() initializes the runtime and builds the mocks, an IOManager and the prefiller under test.
+class MultimodalPrefillerTest : public Test {
+ protected:
+  void SetUp() override {
+    executorch::runtime::runtime_init();
+    
+    mock_module_ = std::make_unique<MockModule>();
+    mock_tokenizer_ = std::make_unique<MockTokenizer>();
+    mock_decoder_runner_ = std::make_unique<MockMultimodalDecoderRunner>();
+    io_manager_ = std::make_unique<IOManager>();
+    // The prefiller is given raw pointers; the unique_ptr members below keep ownership.
+    prefiller_ = std::make_unique<MultimodalPrefiller>(
+        mock_module_.get(),
+        mock_decoder_runner_.get(),
+        mock_tokenizer_.get(),
+        io_manager_.get());
+        
+    // Float tensor factory the tests use to build fake encoder and prefill outputs
+    tf_float_ = std::make_unique<TensorFactory<executorch::aten::ScalarType::Float>>();
+  }
+
+  std::unique_ptr<MockModule> mock_module_;
+  std::unique_ptr<MockTokenizer> mock_tokenizer_;
+  std::unique_ptr<MockMultimodalDecoderRunner> mock_decoder_runner_;
+  std::unique_ptr<IOManager> io_manager_;
+  std::unique_ptr<MultimodalPrefiller> prefiller_;
+  
+  std::unique_ptr<TensorFactory<executorch::aten::ScalarType::Float>> tf_float_;
+};
+
+// Load Tests
+TEST_F(MultimodalPrefillerTest, LoadAllRequiredMethodsExist) {
+  // Happy path: the module advertises all four methods and every load succeeds, so load() should return Ok.
+  std::unordered_set<std::string> method_names = {
+      kTokenEmbeddingMethod, kTextModelMethod, kImageEncoderMethod, kAudioEncoderMethod
+  };
+  
+  EXPECT_CALL(*mock_module_, method_names())
+      .WillOnce(Return(Result<std::unordered_set<std::string>>(method_names)));
+  // Each of the four methods is expected to be loaded exactly once.
+  EXPECT_CALL(*mock_module_, load_method(kTokenEmbeddingMethod))
+      .WillOnce(Return(Error::Ok));
+  EXPECT_CALL(*mock_module_, load_method(kTextModelMethod))
+      .WillOnce(Return(Error::Ok));
+  EXPECT_CALL(*mock_module_, load_method(kImageEncoderMethod))
+      .WillOnce(Return(Error::Ok));
+  EXPECT_CALL(*mock_module_, load_method(kAudioEncoderMethod))
+      .WillOnce(Return(Error::Ok));
+  // The embedding method is reported as not loaded yet.
+  EXPECT_CALL(*mock_module_, is_method_loaded(kTokenEmbeddingMethod))
+      .WillOnce(Return(false));
+  
+  Error result = prefiller_->load();
+  EXPECT_EQ(result, Error::Ok);
+}
+
+TEST_F(MultimodalPrefillerTest, LoadTokenEmbeddingMethodDoesntExist) {
+  EXPECT_CALL(*mock_module_, load_method(kTokenEmbeddingMethod))
+      .WillOnce(Return(Error::InvalidProgram));
+  // The embedding method is reported as not loaded yet.
+  EXPECT_CALL(*mock_module_, is_method_loaded(kTokenEmbeddingMethod))
+      .WillOnce(Return(false));
+  // load() is expected to hand back the stubbed InvalidProgram error.
+  Error result = prefiller_->load();
+  EXPECT_EQ(result, Error::InvalidProgram);
+}
+
+TEST_F(MultimodalPrefillerTest, LoadTextModelMethodDoesntExist) {
+  EXPECT_CALL(*mock_module_, load_method(kTokenEmbeddingMethod))
+      .WillOnce(Return(Error::Ok));
+  EXPECT_CALL(*mock_module_, load_method(kTextModelMethod))
+      .WillOnce(Return(Error::InvalidProgram));
+  // The embedding method loads fine; only the text-model load is stubbed to fail.
+  EXPECT_CALL(*mock_module_, is_method_loaded(kTokenEmbeddingMethod))
+      .WillOnce(Return(false));
+  // load() is expected to hand back the stubbed InvalidProgram error.
+  Error result = prefiller_->load();
+  EXPECT_EQ(result, Error::InvalidProgram);
+}
+
+// Prefill Tests
+TEST_F(MultimodalPrefillerTest, PrefillImageInput) {
+  // Image path: expects one image-encoder call and one text-model call, then
+  // checks the returned token and that start_pos advanced by 5.
+  std::vector<uint8_t> image_data(3 * 224 * 224, 128); // 224x224 RGB image
+  Image test_image{std::move(image_data), 224, 224};
+  MultimodalInput input(std::move(test_image));
+
+  // Create mock encoder output tensor
+  auto encoder_output_tensor = tf_float_->make({1, 5}, {1.0f, 2.0f, 3.0f, 4.0f, 5.0f});
+  std::vector<EValue> encoder_outputs = {EValue(encoder_output_tensor)};
+
+  // Create mock prefill output tensor
+  auto prefill_output_tensor = tf_float_->make({1, 1, 4096}, std::vector<float>(4096, 0.5f));
+  std::vector<EValue> prefill_outputs = {EValue(prefill_output_tensor)};
+
+  EXPECT_CALL(*mock_module_, execute(kImageEncoderMethod, _))
+      .WillOnce(Return(Result<std::vector<EValue>>(encoder_outputs)));
+  EXPECT_CALL(*mock_module_, execute(kTextModelMethod, _))
+      .WillOnce(Return(Result<std::vector<EValue>>(prefill_outputs)));
+  EXPECT_CALL(*mock_decoder_runner_, logits_to_token(_, _))
+      .WillOnce(Return(123));
+
+  int64_t start_pos = 0;
+  Result<uint64_t> result = prefiller_->prefill(input, start_pos);
+
+  // ASSERT (not EXPECT): Result::get() must not be reached on an error result.
+  ASSERT_TRUE(result.ok());
+  EXPECT_EQ(result.get(), 123u);
+  EXPECT_EQ(start_pos, 5); // Should be incremented by seq_len
+}
+
+TEST_F(MultimodalPrefillerTest, PrefillAudioInput) {
+  // Audio path: expects one audio-encoder call and one text-model call, then
+  // checks the returned token and that start_pos advanced by 3.
+  // Audio::data is std::vector<uint8_t>; sized for float32 samples (batch=2, n_bins=80, n_frames=100) — TODO confirm dtype.
+  std::vector<uint8_t> audio_data(2 * 80 * 100 * sizeof(float), 0);
+  Audio test_audio{std::move(audio_data), 2, 80, 100};
+  MultimodalInput input(std::move(test_audio));
+
+  // Create mock encoder output tensor
+  auto encoder_output_tensor = tf_float_->make({1, 3}, {1.0f, 2.0f, 3.0f});
+  std::vector<EValue> encoder_outputs = {EValue(encoder_output_tensor)};
+
+  // Create mock prefill output tensor
+  auto prefill_output_tensor = tf_float_->make({1, 1, 4096}, std::vector<float>(4096, 0.7f));
+  std::vector<EValue> prefill_outputs = {EValue(prefill_output_tensor)};
+
+  EXPECT_CALL(*mock_module_, execute(kAudioEncoderMethod, _))
+      .WillOnce(Return(Result<std::vector<EValue>>(encoder_outputs)));
+  EXPECT_CALL(*mock_module_, execute(kTextModelMethod, _))
+      .WillOnce(Return(Result<std::vector<EValue>>(prefill_outputs)));
+  EXPECT_CALL(*mock_decoder_runner_, logits_to_token(_, _))
+      .WillOnce(Return(456));
+
+  int64_t start_pos = 0;
+  Result<uint64_t> result = prefiller_->prefill(input, start_pos);
+
+  ASSERT_TRUE(result.ok()); // get() below must not run on an error result
+  EXPECT_EQ(result.get(), 456u);
+  EXPECT_EQ(start_pos, 3); // Should be incremented by seq_len
+}
+
+TEST_F(MultimodalPrefillerTest, PrefillTextInput) {
+  // Text path: stubs the tokenizer, token-embedding and text-model calls, then
+  // checks the returned token and that start_pos advanced by 4.
+  std::string test_text = "Hello world";
+  MultimodalInput input(test_text);
+
+  // Mock tokenizer encoding
+  std::vector<uint64_t> tokens = {1, 2, 3, 4};
+  ::tokenizers::Result<std::vector<uint64_t>> tokenize_result(tokens);
+  EXPECT_CALL(*mock_tokenizer_, encode(test_text, _, _))
+      .WillOnce(Return(tokenize_result));
+
+  // Create mock encoder output tensor
+  auto encoder_output_tensor = tf_float_->make({1, 4}, {1.0f, 2.0f, 3.0f, 4.0f});
+  std::vector<EValue> encoder_outputs = {EValue(encoder_output_tensor)};
+
+  // Create mock prefill output tensor
+  auto prefill_output_tensor = tf_float_->make({1, 1, 4096}, std::vector<float>(4096, 0.3f));
+  std::vector<EValue> prefill_outputs = {EValue(prefill_output_tensor)};
+
+  EXPECT_CALL(*mock_module_, execute(kTokenEmbeddingMethod, _))
+      .WillOnce(Return(Result<std::vector<EValue>>(encoder_outputs)));
+  EXPECT_CALL(*mock_module_, execute(kTextModelMethod, _))
+      .WillOnce(Return(Result<std::vector<EValue>>(prefill_outputs)));
+  EXPECT_CALL(*mock_decoder_runner_, logits_to_token(_, _))
+      .WillOnce(Return(789));
+
+  int64_t start_pos = 0;
+  Result<uint64_t> result = prefiller_->prefill(input, start_pos);
+
+  // ASSERT (not EXPECT): Result::get() must not be reached on an error result.
+  ASSERT_TRUE(result.ok());
+  EXPECT_EQ(result.get(), 789u);
+  EXPECT_EQ(start_pos, 4); // Should be incremented by seq_len
+}
+
+TEST_F(MultimodalPrefillerTest, PrefillUnsupportedInputType) {
+  // Premise (TODO confirm): prefill() does not handle RawAudio and reports NotSupported without touching start_pos.
+  // RawAudio is {data bytes, batch_size, n_channels, n_samples}; data is a byte vector, not a file path.
+  executorch::extension::llm::RawAudio raw_audio{std::vector<uint8_t>(2 * 1000, 0), 1, 2, 1000};
+  MultimodalInput input(std::move(raw_audio));
+  int64_t start_pos = 0;
+  Result<uint64_t> result = prefiller_->prefill(input, start_pos);
+
+  EXPECT_FALSE(result.ok());
+  EXPECT_EQ(result.error(), Error::NotSupported);
+  EXPECT_EQ(start_pos, 0); // start_pos should not be modified
+}
+
+TEST_F(MultimodalPrefillerTest, PrefillCorrectCachePositionTensor) {
+  // Checks the position tensor handed to the text model when prefill starts at
+  // a non-zero start_pos: with start_pos=5 and seq_len=3 it should be [5, 6, 7].
+  std::vector<uint8_t> image_data(3 * 224 * 224, 128);
+  Image test_image{std::move(image_data), 224, 224};
+  MultimodalInput input(std::move(test_image));
+
+  // Create mock encoder output tensor with specific seq_len
+  auto encoder_output_tensor = tf_float_->make({1, 3}, {1.0f, 2.0f, 3.0f});
+  std::vector<EValue> encoder_outputs = {EValue(encoder_output_tensor)};
+
+  // Create mock prefill output tensor
+  auto prefill_output_tensor = tf_float_->make({1, 1, 4096}, std::vector<float>(4096, 0.5f));
+  std::vector<EValue> prefill_outputs = {EValue(prefill_output_tensor)};
+
+  EXPECT_CALL(*mock_module_, execute(kImageEncoderMethod, _))
+      .WillOnce(Return(Result<std::vector<EValue>>(encoder_outputs)));
+
+  // Verify that the cache_position_tensor is correctly constructed
+  EXPECT_CALL(*mock_module_, execute(kTextModelMethod, _))
+      .WillOnce([&](const std::string& method, const std::vector<EValue>& args) {
+        EXPECT_EQ(method, kTextModelMethod);
+        // Fail and bail out before indexing, so a wrong arity or length
+        // cannot turn into an out-of-bounds read inside the test itself.
+        if (args.size() != 2 || args[0].toTensor().size(0) != 3) {
+          ADD_FAILURE() << "expected 2 args, the first a 3-element tensor";
+          return Result<std::vector<EValue>>(Error::InvalidArgument);
+        }
+        auto cache_pos_tensor = args[0].toTensor();
+        int64_t* cache_pos_data = cache_pos_tensor.data_ptr<int64_t>();
+        EXPECT_EQ(cache_pos_data[0], 5);
+        EXPECT_EQ(cache_pos_data[1], 6);
+        EXPECT_EQ(cache_pos_data[2], 7);
+        return Result<std::vector<EValue>>(prefill_outputs);
+      });
+
+  EXPECT_CALL(*mock_decoder_runner_, logits_to_token(_, _))
+      .WillOnce(Return(123));
+
+  int64_t start_pos = 5; // Start from position 5
+  Result<uint64_t> result = prefiller_->prefill(input, start_pos);
+  EXPECT_TRUE(result.ok());
+  EXPECT_EQ(start_pos, 8); // Should be 5 + 3 = 8
+}
+
+TEST_F(MultimodalPrefillerTest, PrefillEmptyPrefillResults) {
+  // Error path: the text model returns no outputs, so prefill() is expected to fail with InvalidState.
+  std::vector<uint8_t> image_data(3 * 224 * 224, 128);
+  Image test_image{std::move(image_data), 224, 224};
+  MultimodalInput input(std::move(test_image));
+  
+  // Create mock encoder output tensor
+  auto encoder_output_tensor = tf_float_->make({1, 3}, {1.0f, 2.0f, 3.0f});
+  std::vector<EValue> encoder_outputs = {EValue(encoder_output_tensor)};
+  
+  // Return empty prefill outputs
+  std::vector<EValue> empty_prefill_outputs = {};
+  
+  EXPECT_CALL(*mock_module_, execute(kImageEncoderMethod, _))
+      .WillOnce(Return(Result<std::vector<EValue>>(encoder_outputs)));
+  
+  EXPECT_CALL(*mock_module_, execute(kTextModelMethod, _))
+      .WillOnce(Return(Result<std::vector<EValue>>(empty_prefill_outputs)));
+  // No logits_to_token expectation is set in this test.
+  int64_t start_pos = 0;
+  Result<uint64_t> result = prefiller_->prefill(input, start_pos);
+  
+  EXPECT_FALSE(result.ok());
+  EXPECT_EQ(result.error(), Error::InvalidState);
+  EXPECT_EQ(start_pos, 0); // start_pos should not be modified when error occurs
+}
+
+TEST_F(MultimodalPrefillerTest, PrefillStartPosIncrementAcrossMultipleCalls) {
+  // Two back-to-back image prefills share one start_pos; it must accumulate
+  // each call's sequence length (2, then 3).
+  //
+  // All expectations are registered before the first prefill() call: gMock
+  // documents interleaving EXPECT_CALL with mock invocations as undefined.
+
+  std::vector<uint8_t> image_data1(3 * 224 * 224, 128);
+  Image test_image1{std::move(image_data1), 224, 224};
+  MultimodalInput input1(std::move(test_image1));
+
+  std::vector<uint8_t> image_data2(3 * 224 * 224, 128);
+  Image test_image2{std::move(image_data2), 224, 224};
+  MultimodalInput input2(std::move(test_image2));
+
+  // Encoder outputs: seq_len 2 for the first call, 3 for the second.
+  auto encoder_output_tensor1 = tf_float_->make({1, 2}, {1.0f, 2.0f});
+  std::vector<EValue> encoder_outputs1 = {EValue(encoder_output_tensor1)};
+  auto encoder_output_tensor2 = tf_float_->make({1, 3}, {1.0f, 2.0f, 3.0f});
+  std::vector<EValue> encoder_outputs2 = {EValue(encoder_output_tensor2)};
+
+  // Text-model (prefill) outputs for each call.
+  auto prefill_output_tensor1 = tf_float_->make({1, 1, 4096}, std::vector<float>(4096, 0.5f));
+  std::vector<EValue> prefill_outputs1 = {EValue(prefill_output_tensor1)};
+  auto prefill_output_tensor2 = tf_float_->make({1, 1, 4096}, std::vector<float>(4096, 0.7f));
+  std::vector<EValue> prefill_outputs2 = {EValue(prefill_output_tensor2)};
+
+  // Chained WillOnce actions are consumed in call order.
+  EXPECT_CALL(*mock_module_, execute(kImageEncoderMethod, _))
+      .WillOnce(Return(Result<std::vector<EValue>>(encoder_outputs1)))
+      .WillOnce(Return(Result<std::vector<EValue>>(encoder_outputs2)));
+
+  EXPECT_CALL(*mock_module_, execute(kTextModelMethod, _))
+      .WillOnce(Return(Result<std::vector<EValue>>(prefill_outputs1)))
+      .WillOnce(Return(Result<std::vector<EValue>>(prefill_outputs2)));
+
+  EXPECT_CALL(*mock_decoder_runner_, logits_to_token(_, _))
+      .WillOnce(Return(100))
+      .WillOnce(Return(200));
+
+  int64_t start_pos = 0;
+
+  // First call with image input
+  Result<uint64_t> result1 = prefiller_->prefill(input1, start_pos);
+  EXPECT_TRUE(result1.ok());
+  EXPECT_EQ(start_pos, 2); // Should be incremented by 2
+
+  // Second call with another image input
+  Result<uint64_t> result2 = prefiller_->prefill(input2, start_pos);
+  EXPECT_TRUE(result2.ok());
+  EXPECT_EQ(start_pos, 5); // Should be 2 + 3 = 5
+}
+
+} // namespace