From 645fb63ba298d93727a61cefc838e2bf9a2ee1fd Mon Sep 17 00:00:00 2001 From: winskuo-quic Date: Wed, 4 Dec 2024 09:14:23 +0800 Subject: [PATCH] Qualcomm AI Engine Direct - Support Hybrid Mode for Llama3.2 --- backends/qualcomm/aot/ir/qcir.fbs | 9 +- backends/qualcomm/aot/ir/qcir_utils.cpp | 14 +- backends/qualcomm/aot/ir/qcir_utils.h | 3 +- .../qualcomm/aot/python/PyQnnManagerAdaptor.h | 189 ++++++++--- .../aot/python/PyQnnWrapperAdaptor.cpp | 14 +- .../qualcomm/runtime/QnnExecuTorchBackend.cpp | 16 + backends/qualcomm/runtime/QnnManager.cpp | 49 ++- backends/qualcomm/runtime/QnnManager.h | 1 + .../runtime/backends/QnnBackendCache.cpp | 13 +- .../runtime/backends/QnnBackendCache.h | 5 +- .../runtime/backends/QnnContextCommon.cpp | 19 +- .../runtime/backends/QnnContextCommon.h | 3 +- .../qualcomm/runtime/backends/QnnLogger.cpp | 1 - .../qualcomm/serialization/qc_binary_info.fbs | 4 +- backends/qualcomm/tests/test_qnn_delegate.py | 11 +- backends/qualcomm/utils/utils.py | 121 ++++++- .../qualcomm/oss_scripts/llama3_2/README.md | 39 +++ .../qualcomm/oss_scripts/llama3_2/llama.py | 258 ++++++++++----- .../llama3_2/qnn_llama3_2_runner.cpp | 22 +- .../oss_scripts/llama3_2/runner/io_memory.cpp | 305 +++++++++++++----- .../oss_scripts/llama3_2/runner/io_memory.h | 81 ++++- .../oss_scripts/llama3_2/runner/runner.cpp | 220 +++++++++---- .../oss_scripts/llama3_2/runner/runner.h | 19 +- 23 files changed, 1022 insertions(+), 394 deletions(-) create mode 100644 examples/qualcomm/oss_scripts/llama3_2/README.md diff --git a/backends/qualcomm/aot/ir/qcir.fbs b/backends/qualcomm/aot/ir/qcir.fbs index 6c16a54e0db..dfd9bbc91e1 100755 --- a/backends/qualcomm/aot/ir/qcir.fbs +++ b/backends/qualcomm/aot/ir/qcir.fbs @@ -80,7 +80,8 @@ table Tensor { type: TensorType; dtype: DataType; qparam: QuantizeParam; - data: [ubyte]; + size: uint; + offset: ulong; } table Operator { @@ -88,9 +89,9 @@ table Operator { package_name: string; type_name: string; // keep only tensor indexes - inputs: [int]; - outputs: [int]; - params: [int]; + inputs: [uint]; + outputs: [uint]; + params: [uint]; } table Graph { diff --git a/backends/qualcomm/aot/ir/qcir_utils.cpp b/backends/qualcomm/aot/ir/qcir_utils.cpp index 8cf024ba006..48f069767bf 100755 --- a/backends/qualcomm/aot/ir/qcir_utils.cpp +++ b/backends/qualcomm/aot/ir/qcir_utils.cpp @@ -235,11 +235,8 @@ Qnn_QuantizeParams_t ToQuantizeParam(const tensor_type& tensor) { flatbuffers::Offset ToTensor( const Qnn_Tensor_t& tensor, + const uint64_t data_offset, flatbuffers::FlatBufferBuilder* builder) { - std::vector buffer( - static_cast(QNN_VER_PTR(tensor)->clientBuf.data), - static_cast(QNN_VER_PTR(tensor)->clientBuf.data) + - QNN_VER_PTR(tensor)->clientBuf.dataSize); std::vector shape( QNN_VER_PTR(tensor)->dimensions, QNN_VER_PTR(tensor)->dimensions + QNN_VER_PTR(tensor)->rank); @@ -251,10 +248,11 @@ flatbuffers::Offset ToTensor( ToTensorType(QNN_VER_PTR(tensor)->type), ToDataType(QNN_VER_PTR(tensor)->dataType), ToQuantizeParam(tensor, builder), - &buffer); + QNN_VER_PTR(tensor)->clientBuf.dataSize, + data_offset); } -Qnn_Tensor_t ToTensor(const tensor_type& tensor) { +Qnn_Tensor_t ToTensor(const tensor_type& tensor, const uint8_t* data_ptr) { auto is_io_tensor = [](Qnn_TensorType_t type) { return type < QNN_TENSOR_TYPE_STATIC; }; @@ -266,10 +264,10 @@ Qnn_Tensor_t ToTensor(const tensor_type& tensor) { QNN_VER_PTR(t)->quantizeParams = ToQuantizeParam(tensor); QNN_VER_PTR(t)->rank = tensor->shape()->size(); QNN_VER_PTR(t)->dimensions = const_cast(tensor->shape()->data()); - 
QNN_VER_PTR(t)->clientBuf.dataSize = tensor->data()->size(); + QNN_VER_PTR(t)->clientBuf.dataSize = tensor->size(); QNN_VER_PTR(t)->clientBuf.data = is_io_tensor(QNN_VER_PTR(t)->type) ? nullptr - : static_cast(const_cast(tensor->data()->Data())); + : static_cast(const_cast(data_ptr)); return t; } diff --git a/backends/qualcomm/aot/ir/qcir_utils.h b/backends/qualcomm/aot/ir/qcir_utils.h index 5d54eb30a69..085f09bf145 100755 --- a/backends/qualcomm/aot/ir/qcir_utils.h +++ b/backends/qualcomm/aot/ir/qcir_utils.h @@ -32,8 +32,9 @@ Qnn_QuantizeParams_t ToQuantizeParam(const tensor_type& tensor); flatbuffers::Offset ToTensor( const Qnn_Tensor_t& tensor, + const uint64_t data_offset, flatbuffers::FlatBufferBuilder* builder); -Qnn_Tensor_t ToTensor(const tensor_type& tensor); +Qnn_Tensor_t ToTensor(const tensor_type& tensor, const uint8_t* data_ptr); } // namespace qnn } // namespace backends diff --git a/backends/qualcomm/aot/python/PyQnnManagerAdaptor.h b/backends/qualcomm/aot/python/PyQnnManagerAdaptor.h index 55429f2b430..bbe52bf74bf 100644 --- a/backends/qualcomm/aot/python/PyQnnManagerAdaptor.h +++ b/backends/qualcomm/aot/python/PyQnnManagerAdaptor.h @@ -57,47 +57,67 @@ class PyQnnManager { qnn_executorch_option_ptr_.cast().data()); // merge multiple qcirs into one context with multiple graphs - std::vector> graphs; + + // this makes it easier to do subtraction for offsets + std::vector offsets(1, 0); + std::vector*> tensor_data; + fb_opt_.max_size = FLATBUFFERS_MAX_64_BUFFER_SIZE; for (size_t i = 0; i < qcirs.size(); ++i) { py::buffer_info info(py::buffer(qcirs[i].cast()).request()); flatbuffers::Verifier verifier_binary_info( static_cast(info.ptr), - info.size * info.itemsize); + info.size * info.itemsize, + fb_opt_); if (!qnn_delegate::VerifyBinaryInfoBuffer(verifier_binary_info)) { QNN_EXECUTORCH_LOG_ERROR("Fail to verify binary info"); return; } auto binary_info = qnn_delegate::GetBinaryInfo(info.ptr); + tensor_data.push_back(binary_info->tensor_data()); flatbuffers::Verifier verifier_qcir( - binary_info->data()->data(), binary_info->data()->size()); + binary_info->context_data()->Data(), + binary_info->context_data()->size()); if (!qcir::VerifyContextBuffer(verifier_qcir)) { QNN_EXECUTORCH_LOG_ERROR("Fail to verify qcir format"); return; } - auto context = qcir::GetContext(binary_info->data()->data()); + offsets.push_back(offsets.back() + binary_info->tensor_data()->size()); + } + + std::vector> graphs; + for (size_t i = 0; i < qcirs.size(); ++i) { + py::buffer_info info(py::buffer(qcirs[i].cast()).request()); + auto binary_info = qnn_delegate::GetBinaryInfo(info.ptr); + auto context = qcir::GetContext(binary_info->context_data()->Data()); for (const auto& graph : *context->graphs()) { std::vector> tensors; for (const auto tensor : *graph->tensors()) { // here we need to take a detour to merge multiple qcir flatbuffers // outer ToTensor // return: flatbuffers::Offset - // consume: QnnTensor, flatbuffers::FlatBufferBuilder* + // consume: QnnTensor, data_offset, flatbuffers::FlatBufferBuilder* // inner ToTensor // return: QnnTensor - // consume: flatbuffers::Vector<::flatbuffers::Offset> - tensors.emplace_back(ToTensor(ToTensor(tensor), &builder_)); + // consume: + // flatbuffers::Vector<::flatbuffers::Offset>, + // data_ptr + tensors.emplace_back(ToTensor( + ToTensor(tensor, nullptr), + offsets[i] + tensor->offset(), + &builder_)); } std::vector> nodes; for (const auto& node : *graph->nodes()) { - int32_t* inputs_ptr = const_cast(node->inputs()->data()); - int32_t* outputs_ptr = 
const_cast(node->outputs()->data()); - int32_t* params_ptr = const_cast(node->params()->data()); - std::vector inputs( + uint32_t* inputs_ptr = const_cast(node->inputs()->data()); + uint32_t* outputs_ptr = + const_cast(node->outputs()->data()); + uint32_t* params_ptr = const_cast(node->params()->data()); + std::vector inputs( inputs_ptr, inputs_ptr + node->inputs()->size()); - std::vector outputs( + std::vector outputs( outputs_ptr, outputs_ptr + node->outputs()->size()); - std::vector params( + std::vector params( params_ptr, params_ptr + node->params()->size()); nodes.emplace_back(qcir::CreateOperatorDirect( builder_, @@ -118,7 +138,7 @@ class PyQnnManager { QnnExecuTorchContextBinary qcir_bin( {builder_.GetBufferPointer(), builder_.GetSize()}); - qnn_executorch_context_binary_ = MakeBinaryInfo(qcir_bin); + qnn_executorch_context_binary_ = MakeBinaryInfo(qcir_bin, tensor_data); qnn_manager_ = std::make_shared( qnn_executorch_options, qnn_executorch_context_binary_); } @@ -157,26 +177,37 @@ class PyQnnManager { if (qnn_manager_->IsOnlinePrepare() || qnn_manager_->IsMultipleGraphs()) { builder_.Reset(); - std::vector> tensors; + std::vector tensor_data; + std::vector offsets; std::unordered_map tensor_map; + std::vector> fb_tensors; + std::vector> fb_ops; auto set_tensor = [&](const std::shared_ptr& wrapper, - std::vector& index) { + std::vector& index) { auto it = tensor_map.find(wrapper.get()); if (it != tensor_map.end()) { index.push_back(it->second); } else { - int i = tensors.size(); - tensor_map[wrapper.get()] = i; - index.push_back(i); - tensors.emplace_back( - ToTensor(wrapper->CloneTensorStruct(), &builder_)); + tensor_map[wrapper.get()] = fb_tensors.size(); + index.push_back(fb_tensors.size()); + offsets.push_back(tensor_data.size()); + Qnn_Tensor_t qnn_tensor = wrapper->CloneTensorStruct(); + fb_tensors.emplace_back( + ToTensor(qnn_tensor, offsets.back(), &builder_)); + uint8_t* data_ptr = + static_cast(QNN_VER_PTR(qnn_tensor)->clientBuf.data); + if (data_ptr != nullptr) { + tensor_data.insert( + tensor_data.end(), + data_ptr, + data_ptr + QNN_VER_PTR(qnn_tensor)->clientBuf.dataSize); + } } }; - std::vector> operators; for (std::shared_ptr& op_wrapper : op_wrappers) { - std::vector inputs, outputs, params; + std::vector inputs, outputs, params; for (const auto& tensor_wrapper : op_wrapper->GetInputTensors()) { set_tensor(tensor_wrapper, inputs); @@ -207,13 +238,22 @@ class PyQnnManager { static_cast(&p.scalarParam.uint8Value); QNN_VER_PTR(t)->clientBuf.dataSize = GetDataTypeSize(QNN_VER_PTR(t)->dataType); - params.push_back(tensors.size()); - tensors.emplace_back(ToTensor(t, &builder_)); + + // collect tensor data + offsets.push_back(tensor_data.size()); + const uint8_t* data_ptr = + static_cast(QNN_VER_PTR(t)->clientBuf.data); + tensor_data.insert( + tensor_data.end(), + data_ptr, + data_ptr + QNN_VER_PTR(t)->clientBuf.dataSize); + params.push_back(fb_tensors.size()); + fb_tensors.emplace_back(ToTensor(t, offsets.back(), &builder_)); } } Qnn_OpConfig_t op_config = op_wrapper->GetOpConfig(); - operators.emplace_back(qcir::CreateOperatorDirect( + fb_ops.emplace_back(qcir::CreateOperatorDirect( builder_, QNN_VER_PTR(op_config)->name, QNN_VER_PTR(op_config)->packageName, @@ -222,14 +262,16 @@ class PyQnnManager { &outputs, ¶ms)); } - auto graph = qcir::CreateGraphDirect( - builder_, graph_name.c_str(), &operators, &tensors); - std::vector> graphs({graph}); - auto context = qcir::CreateContextDirect(builder_, &graphs); + + std::vector> fb_graphs( + {qcir::CreateGraphDirect( + 
builder_, graph_name.c_str(), &fb_ops, &fb_tensors)}); + auto context = qcir::CreateContextDirect(builder_, &fb_graphs); builder_.Finish(context); + QnnExecuTorchContextBinary qcir_binary( {builder_.GetBufferPointer(), builder_.GetSize()}); - binary_info = MakeBinaryInfo(qcir_binary); + binary_info = MakeBinaryInfo(qcir_binary, tensor_data); } else { if (qnn_manager_->Compile(graph_name, op_wrappers) != executorch::runtime::Error::Ok) { @@ -300,7 +342,8 @@ class PyQnnManager { py::buffer_info info(py::buffer(ctx_bin).request()); QnnExecuTorchContextBinary binary( {info.ptr, static_cast(info.size * info.itemsize)}); - auto binary_info = MakeBinaryInfo(binary); + std::vector tensor_data; + auto binary_info = MakeBinaryInfo(binary, tensor_data); auto result = py::array_t(binary_info.nbytes); auto result_buffer = result.request(); std::memcpy(result_buffer.ptr, binary_info.buffer, binary_info.nbytes); @@ -308,22 +351,78 @@ class PyQnnManager { } private: + std::string signature() { + return std::to_string( + std::chrono::high_resolution_clock::now().time_since_epoch().count()); + }; + QnnExecuTorchContextBinary MakeBinaryInfo( - const QnnExecuTorchContextBinary& ctx_bin) { - auto signature = []() { - return std::to_string( - std::chrono::high_resolution_clock::now().time_since_epoch().count()); - }; - const uint8_t* base = static_cast(ctx_bin.buffer); - std::vector data(base, base + ctx_bin.nbytes); + const QnnExecuTorchContextBinary& ctx_bin, + const std::vector*>& tensor_data) { + // the build order matters, 64 bit data is required to be shipped first + // add context data + builder64_.Reset(); + auto offset_context = builder64_.CreateVector< + uint8_t, + flatbuffers::Offset64, + flatbuffers::Vector64>( + static_cast(ctx_bin.buffer), ctx_bin.nbytes); + // add tensor data + // this is a little bit tricky but have smallest memory footprint in AoT + size_t buffer_size = 0; + for (auto& td : tensor_data) { + buffer_size += td->size(); + } + builder64_.StartVector< + uint8_t, + flatbuffers::Offset64, + flatbuffers::Vector64::size_type>(buffer_size); + for (int i = tensor_data.size() - 1; i >= 0; --i) { + builder64_.PushBytes(tensor_data[i]->Data(), tensor_data[i]->size()); + } + auto offset_tensor = flatbuffers::Offset64>( + builder64_.EndVector< + flatbuffers::Vector64::size_type, + flatbuffers::Offset64>::offset_type>( + buffer_size)); // add signature to binary for cache reuse in runtime - builder_.Reset(); - auto binary_info = qnn_delegate::CreateBinaryInfoDirect( - builder_, signature().c_str(), &data); - builder_.Finish(binary_info); + auto offset_signature = builder64_.CreateString(signature().c_str()); + // build binary info + auto binary_info = qnn_delegate::CreateBinaryInfo( + builder64_, offset_signature, offset_context, offset_tensor); + builder64_.Finish(binary_info); return QnnExecuTorchContextBinary( - {builder_.GetBufferPointer(), builder_.GetSize()}); + {builder64_.GetBufferPointer(), builder64_.GetSize()}); + } + + QnnExecuTorchContextBinary MakeBinaryInfo( + const QnnExecuTorchContextBinary& ctx_bin, + const std::vector& tensor_data) { + // the build order matters, 64 bit data is required to be shipped first + // add context data + builder64_.Reset(); + + auto offset_context = builder64_.CreateVector< + uint8_t, + flatbuffers::Offset64, + flatbuffers::Vector64>( + static_cast(ctx_bin.buffer), ctx_bin.nbytes); + // add tensor data + auto offset_tensor = builder64_.CreateVector< + uint8_t, + flatbuffers::Offset64, + flatbuffers::Vector64>( + static_cast(tensor_data.data()), 
tensor_data.size()); + // add signature to binary for cache reuse in runtime + auto offset_signature = builder64_.CreateString(signature().c_str()); + // build binary info + auto binary_info = qnn_delegate::CreateBinaryInfo( + builder64_, offset_signature, offset_context, offset_tensor); + builder64_.Finish(binary_info); + + return QnnExecuTorchContextBinary( + {builder64_.GetBufferPointer(), builder64_.GetSize()}); } // Store the bytes object instead of a raw pointer so that this module will @@ -331,7 +430,9 @@ class PyQnnManager { const py::bytes qnn_executorch_option_ptr_; QnnExecuTorchContextBinary qnn_executorch_context_binary_; std::shared_ptr qnn_manager_; + flatbuffers::FlatBufferBuilder64 builder64_; flatbuffers::FlatBufferBuilder builder_; + flatbuffers::Verifier::Options fb_opt_; }; } // namespace qnn } // namespace backends diff --git a/backends/qualcomm/aot/python/PyQnnWrapperAdaptor.cpp b/backends/qualcomm/aot/python/PyQnnWrapperAdaptor.cpp index 97d4491bc6a..2b4b88967e5 100644 --- a/backends/qualcomm/aot/python/PyQnnWrapperAdaptor.cpp +++ b/backends/qualcomm/aot/python/PyQnnWrapperAdaptor.cpp @@ -79,18 +79,6 @@ std::shared_ptr CreateTensorWrapper( std::unique_ptr quantize_param_wrapper = CreateQuantizationParamWrapper(encoding, quant_info); - if (data.size() == 0) { - return CreateTensorWrapper( - tensor_name, - tensor_type, - data_type, - std::move(quantize_param_wrapper), - rank, - dims.data(), - 0, - nullptr, - copy_data); - } return CreateTensorWrapper( tensor_name, tensor_type, @@ -99,7 +87,7 @@ std::shared_ptr CreateTensorWrapper( rank, dims.data(), 0, - data.data(), + data.size() == 0 ? nullptr : data.data(), copy_data); } diff --git a/backends/qualcomm/runtime/QnnExecuTorchBackend.cpp b/backends/qualcomm/runtime/QnnExecuTorchBackend.cpp index 60208afeec5..eb8f78a883a 100644 --- a/backends/qualcomm/runtime/QnnExecuTorchBackend.cpp +++ b/backends/qualcomm/runtime/QnnExecuTorchBackend.cpp @@ -115,6 +115,13 @@ Error QnnExecuTorchBackend::execute( input_tensor_structs.reserve(input_tensors.size()); for (int i = 0; i < input_tensors.size(); ++i) { + // TODO: Enable this in future to avoid unmatch tensor size, e.g., QuantIO + // pass causing mismatch + // ET_CHECK_MSG( + // input_tensors[i]->GetBytes() == args[i]->toTensor().nbytes(), + // "Input index %d, number of bytes does not match between args and + // input_tensor, %d != %zu", i, input_tensors[i]->GetBytes(), + // args[i]->toTensor().nbytes()); if (qnn_manager->RegisterMem( args[i]->toTensor().mutable_data_ptr(), input_tensors[i]) != Error::Ok) { @@ -129,6 +136,15 @@ Error QnnExecuTorchBackend::execute( for (const auto& output_tensor : output_tensors) { // pos=0 limits the search to the prefix if (output_tensor->GetName().rfind("output_", 0) == 0) { + // TODO: Enable this in future to avoid unmatch tensor size, e.g., QuantIO + // pass causing mismatch + // ET_CHECK_MSG( + // output_tensor->GetBytes() == + // args[output_index]->toTensor().nbytes(), "Output index %d, number + // of bytes does not match between args and output_tensor, %d != %zu", + // output_index, + // output_tensor->GetBytes(), + // args[output_index]->toTensor().nbytes()); void* mutable_data_ptr = args[output_index]->toTensor().mutable_data_ptr(); if (qnn_manager->RegisterMem(mutable_data_ptr, output_tensor) != diff --git a/backends/qualcomm/runtime/QnnManager.cpp b/backends/qualcomm/runtime/QnnManager.cpp index a4d83585f28..fe7050e7b13 100644 --- a/backends/qualcomm/runtime/QnnManager.cpp +++ b/backends/qualcomm/runtime/QnnManager.cpp @@ -54,6 
+54,7 @@ QnnManager::QnnManager( QnnExecuTorchBackendType backend_type = options->backend_options()->backend_type(); std::string library_path = options->library_path()->str(); + fb_opt_.max_size = FLATBUFFERS_MAX_64_BUFFER_SIZE; if (options->log_level() >= QnnExecuTorchLogLevel::kLogLevelInfo) { QNN_EXECUTORCH_LOG_INFO( @@ -490,7 +491,8 @@ Error QnnManager::GetContextBinary( Error QnnManager::CompileQcir() { flatbuffers::Verifier verifier_binary_info( static_cast(qnn_context_blob_.buffer), - qnn_context_blob_.nbytes); + qnn_context_blob_.nbytes, + fb_opt_); if (!qnn_delegate::VerifyBinaryInfoBuffer(verifier_binary_info)) { QNN_EXECUTORCH_LOG_ERROR("Fail to verify binary info"); return Error::Internal; @@ -498,19 +500,22 @@ Error QnnManager::CompileQcir() { auto binary_info = qnn_delegate::GetBinaryInfo(qnn_context_blob_.buffer); flatbuffers::Verifier verifier_qcir( - binary_info->data()->data(), binary_info->data()->size()); + binary_info->context_data()->Data(), + binary_info->context_data()->size(), + fb_opt_); if (!qcir::VerifyContextBuffer(verifier_qcir)) { QNN_EXECUTORCH_LOG_ERROR("Fail to verify qcir format"); return Error::Internal; } - auto context = qcir::GetContext(binary_info->data()->data()); + auto context = qcir::GetContext(binary_info->context_data()->Data()); for (const auto& graph : *context->graphs()) { // qcir tensors to TensorWrapper std::vector> graph_inputs, graph_outputs, tensors; for (const auto& tensor : *graph->tensors()) { - tensors.emplace_back(CreateTensorWrapper(ToTensor(tensor))); + tensors.emplace_back(CreateTensorWrapper(ToTensor( + tensor, binary_info->tensor_data()->Data() + tensor->offset()))); if (tensor->type() == qcir::TensorType::WRITE) { graph_inputs.push_back(tensors.back()); } else if (tensor->type() == qcir::TensorType::READ) { @@ -544,6 +549,8 @@ Error QnnManager::CompileQcir() { const auto& tensor = graph->tensors()->Get(index); std::string name = tensor->name()->str(); Qnn_DataType_t dtype = ToDataType(tensor->dtype()); + const uint8_t* data_ptr = + binary_info->tensor_data()->Data() + tensor->offset(); if (tensor->shape()->size() != 0) { // add tensor param op->AddTensorParam( @@ -551,50 +558,39 @@ Error QnnManager::CompileQcir() { dtype, tensor->shape()->size(), tensor->shape()->data(), - tensor->data()->data()); + data_ptr); } else { // add scalar param switch (dtype) { case Qnn_DataType_t::QNN_DATATYPE_INT_32: op->AddScalarParam( - name, - dtype, - *reinterpret_cast(tensor->data()->Data())); + name, dtype, *reinterpret_cast(data_ptr)); break; case Qnn_DataType_t::QNN_DATATYPE_INT_16: op->AddScalarParam( - name, - dtype, - *reinterpret_cast(tensor->data()->Data())); + name, dtype, *reinterpret_cast(data_ptr)); break; case Qnn_DataType_t::QNN_DATATYPE_INT_8: - op->AddScalarParam( - name, dtype, static_cast(*tensor->data()->Data())); + op->AddScalarParam(name, dtype, static_cast(*data_ptr)); break; case Qnn_DataType_t::QNN_DATATYPE_UINT_32: op->AddScalarParam( - name, - dtype, - *reinterpret_cast(tensor->data()->Data())); + name, dtype, *reinterpret_cast(data_ptr)); break; case Qnn_DataType_t::QNN_DATATYPE_UINT_16: op->AddScalarParam( - name, - dtype, - *reinterpret_cast(tensor->data()->Data())); + name, dtype, *reinterpret_cast(data_ptr)); break; case Qnn_DataType_t::QNN_DATATYPE_UINT_8: - op->AddScalarParam(name, dtype, *tensor->data()->Data()); + op->AddScalarParam(name, dtype, *data_ptr); break; case Qnn_DataType_t::QNN_DATATYPE_FLOAT_32: case Qnn_DataType_t::QNN_DATATYPE_FLOAT_16: op->AddScalarParam( - name, - dtype, - 
*reinterpret_cast(tensor->data()->Data())); + name, dtype, *reinterpret_cast(data_ptr)); break; case Qnn_DataType_t::QNN_DATATYPE_BOOL_8: - op->AddScalarParam(name, dtype, *tensor->data()->Data()); + op->AddScalarParam(name, dtype, *data_ptr); break; default: QNN_EXECUTORCH_LOG_ERROR( @@ -603,7 +599,7 @@ Error QnnManager::CompileQcir() { } } } - op_wrappers.push_back(std::move(op)); + op_wrappers.emplace_back(std::move(op)); } ET_CHECK_OR_RETURN_ERROR( @@ -687,7 +683,8 @@ Error QnnManager::Compile( std::string QnnManager::GetBinarySignature() { flatbuffers::Verifier verifier( static_cast(qnn_context_blob_.buffer), - qnn_context_blob_.nbytes); + qnn_context_blob_.nbytes, + fb_opt_); return VerifyBinaryInfoBuffer(verifier) ? GetBinaryInfo(qnn_context_blob_.buffer)->signature()->str() : ""; diff --git a/backends/qualcomm/runtime/QnnManager.h b/backends/qualcomm/runtime/QnnManager.h index 0157ee58378..7c78418ffa7 100644 --- a/backends/qualcomm/runtime/QnnManager.h +++ b/backends/qualcomm/runtime/QnnManager.h @@ -147,6 +147,7 @@ class QnnManager { {Qnn_DataType_t::QNN_DATATYPE_UFIXED_POINT_16, executorch::aten::ScalarType::Bits16}, }; + flatbuffers::Verifier::Options fb_opt_; }; } // namespace qnn } // namespace backends diff --git a/backends/qualcomm/runtime/backends/QnnBackendCache.cpp b/backends/qualcomm/runtime/backends/QnnBackendCache.cpp index 43cb835cfff..244af7cd84e 100644 --- a/backends/qualcomm/runtime/backends/QnnBackendCache.cpp +++ b/backends/qualcomm/runtime/backends/QnnBackendCache.cpp @@ -109,7 +109,8 @@ Error QnnBackendCache::Configure() { QNN_EXECUTORCH_LOG_INFO("Caching: Caching is in RESTORE MODE."); flatbuffers::Verifier verifier_binary_info( static_cast(qnn_context_blob_.buffer), - qnn_context_blob_.nbytes); + qnn_context_blob_.nbytes, + fb_opt_); if (!qnn_delegate::VerifyBinaryInfoBuffer(verifier_binary_info)) { QNN_EXECUTORCH_LOG_ERROR("Fail to verify binary info"); return Error::Internal; @@ -117,17 +118,19 @@ Error QnnBackendCache::Configure() { auto binary_info = GetBinaryInfo(qnn_context_blob_.buffer); Error status = GetQnnGraphInfoFromBinary( - const_cast(binary_info->data()->data()), - binary_info->data()->size()); + const_cast(binary_info->context_data()->Data()), + binary_info->context_data()->size()); if (status == Error::Internal) { // check if context binary came from flatbuffer flatbuffers::Verifier verifier( - binary_info->data()->data(), binary_info->data()->size()); + binary_info->context_data()->Data(), + binary_info->context_data()->size(), + fb_opt_); if (qcir::VerifyContextBuffer(verifier)) { state_ = ONLINE_PREPARE; - auto context = qcir::GetContext(binary_info->data()->data()); + auto context = qcir::GetContext(binary_info->context_data()->Data()); for (const auto& graph : *context->graphs()) { graph_names_.emplace_back(graph->name()->str()); } diff --git a/backends/qualcomm/runtime/backends/QnnBackendCache.h b/backends/qualcomm/runtime/backends/QnnBackendCache.h index b9e00f0a662..e8ce9af88e7 100644 --- a/backends/qualcomm/runtime/backends/QnnBackendCache.h +++ b/backends/qualcomm/runtime/backends/QnnBackendCache.h @@ -28,7 +28,9 @@ class QnnBackendCache { explicit QnnBackendCache( const QnnExecuTorchContextBinary& qnn_context_blob, const std::string& aot_graph_name) - : qnn_context_blob_(qnn_context_blob), aot_graph_name_(aot_graph_name) {} + : qnn_context_blob_(qnn_context_blob), aot_graph_name_(aot_graph_name) { + fb_opt_.max_size = FLATBUFFERS_MAX_64_BUFFER_SIZE; + } virtual ~QnnBackendCache(); QnnBackendCache(const QnnBackendCache&) = delete; 
QnnBackendCache(QnnBackendCache&&) = delete; @@ -82,6 +84,7 @@ class QnnBackendCache { input_tensor_structs_; std::unordered_map> output_tensor_structs_; + flatbuffers::Verifier::Options fb_opt_; }; } // namespace qnn } // namespace backends diff --git a/backends/qualcomm/runtime/backends/QnnContextCommon.cpp b/backends/qualcomm/runtime/backends/QnnContextCommon.cpp index 7db5164a1d5..4c3fe53eece 100644 --- a/backends/qualcomm/runtime/backends/QnnContextCommon.cpp +++ b/backends/qualcomm/runtime/backends/QnnContextCommon.cpp @@ -51,8 +51,8 @@ Error QnnContext::Configure() { backend_->GetHandle(), device_->GetHandle(), temp_context_config.empty() ? nullptr : temp_context_config.data(), - const_cast(binary_info->data()->data()), - binary_info->data()->size(), + const_cast(binary_info->context_data()->Data()), + binary_info->context_data()->size(), &handle_, /*profile=*/nullptr); if (error != QNN_SUCCESS) { @@ -93,10 +93,11 @@ Error QnnContext::GetContextBinary( Qnn_ContextBinarySize_t bytes_written = 0; Qnn_ErrorHandle_t error = qnn_interface.qnn_context_get_binary_size(handle_, &binary_size); + std::vector binary_buffer; if (error == QNN_SUCCESS) { - binary_buffer_.resize(binary_size); + binary_buffer.resize(binary_size); error = qnn_interface.qnn_context_get_binary( - handle_, binary_buffer_.data(), binary_size, &bytes_written); + handle_, binary_buffer.data(), binary_size, &bytes_written); if (error != QNN_SUCCESS) { QNN_EXECUTORCH_LOG_ERROR( "Can't get graph binary to be saved to " @@ -118,12 +119,12 @@ Error QnnContext::GetContextBinary( .time_since_epoch() .count()); }; - builder_.Reset(); + builder64_.Reset(); auto binary_info = qnn_delegate::CreateBinaryInfoDirect( - builder_, signature().c_str(), &binary_buffer_); - builder_.Finish(binary_info); - qnn_executorch_context_binary.buffer = builder_.GetBufferPointer(); - qnn_executorch_context_binary.nbytes = builder_.GetSize(); + builder64_, signature().c_str(), &binary_buffer); + builder64_.Finish(binary_info); + qnn_executorch_context_binary.buffer = builder64_.GetBufferPointer(); + qnn_executorch_context_binary.nbytes = builder64_.GetSize(); } } else { QNN_EXECUTORCH_LOG_ERROR( diff --git a/backends/qualcomm/runtime/backends/QnnContextCommon.h b/backends/qualcomm/runtime/backends/QnnContextCommon.h index d93390a5379..d6823a5d4a5 100644 --- a/backends/qualcomm/runtime/backends/QnnContextCommon.h +++ b/backends/qualcomm/runtime/backends/QnnContextCommon.h @@ -71,8 +71,7 @@ class QnnContext { QnnBackend* backend_; QnnDevice* device_; QnnBackendCache* cache_; - std::vector binary_buffer_; - flatbuffers::FlatBufferBuilder builder_; + flatbuffers::FlatBufferBuilder64 builder64_; }; } // namespace qnn } // namespace backends diff --git a/backends/qualcomm/runtime/backends/QnnLogger.cpp b/backends/qualcomm/runtime/backends/QnnLogger.cpp index 412b1a2db2c..5b86894d874 100644 --- a/backends/qualcomm/runtime/backends/QnnLogger.cpp +++ b/backends/qualcomm/runtime/backends/QnnLogger.cpp @@ -10,7 +10,6 @@ #include #include -#include #include #include "QnnLog.h" diff --git a/backends/qualcomm/serialization/qc_binary_info.fbs b/backends/qualcomm/serialization/qc_binary_info.fbs index 3f301055269..e924fa76871 100644 --- a/backends/qualcomm/serialization/qc_binary_info.fbs +++ b/backends/qualcomm/serialization/qc_binary_info.fbs @@ -14,7 +14,9 @@ table BinaryInfo { // Signature of binary signature: string; // Data of processed binary - data: [ubyte]; + context_data: [ubyte] (vector64); + // Data of tensor + tensor_data: [ubyte] (vector64); } root_type 
BinaryInfo; diff --git a/backends/qualcomm/tests/test_qnn_delegate.py b/backends/qualcomm/tests/test_qnn_delegate.py index 10917cdd6bf..875d34760cc 100644 --- a/backends/qualcomm/tests/test_qnn_delegate.py +++ b/backends/qualcomm/tests/test_qnn_delegate.py @@ -1595,11 +1595,7 @@ def test_qnn_backend_multi_graphs(self): for i, edge_prog in enumerate(edge_progs) ] prog_mgr = generate_multi_graph_program( - compiler_specs=compiler_specs[0], - processed_bytes=[ - prog.graph_module.lowered_module_0.processed_bytes - for prog in exported_programs - ], + compiler_specs=compiler_specs[0], exported_programs=exported_programs ) for index, module in enumerate(modules): self.verify_output( @@ -1915,10 +1911,7 @@ def test_qnn_backend_multi_graphs(self): ] prog_mgr = generate_multi_graph_program( compiler_specs=compiler_specs[0], - processed_bytes=[ - prog.graph_module.lowered_module_0.processed_bytes - for prog in exported_programs - ], + exported_programs=exported_programs, ) for index, module in enumerate(modules): self.verify_output( diff --git a/backends/qualcomm/utils/utils.py b/backends/qualcomm/utils/utils.py index 590ede74319..791ee802177 100644 --- a/backends/qualcomm/utils/utils.py +++ b/backends/qualcomm/utils/utils.py @@ -5,9 +5,10 @@ # LICENSE file in the root directory of this source tree. import operator +import re import warnings from collections import OrderedDict -from typing import Callable, Dict, FrozenSet, List, Tuple +from typing import Any, Callable, Dict, FrozenSet, List, Optional, Tuple import executorch.backends.qualcomm.python.PyQnnManagerAdaptor as PyQnnManagerAdaptor @@ -648,7 +649,13 @@ def op_impl(inputs: List[torch.Tensor]): for v in outputs.values() ) - def build_graph(inputs, outputs): + def build_graph( + inputs, + outputs, + qnn_in_order: Optional[List[int]] = None, + executorch_in_order: Optional[List[int]] = None, + executorch_out_order: Optional[List[int]] = None, + ): # custom op declaration inputs_str = "Tensor[] inputs" func_proto = f"{op_name}({inputs_str}) -> Any" @@ -659,13 +666,39 @@ def build_graph(inputs, outputs): # model architecture mimicking context binary class Model(torch.nn.Module): - def forward(self, *inputs): - return getattr( + """ + The args of forward() can be thought of as what executorch is accepting as input. + The getattr inside the forward() can be thought of as qnn context binary. + When we first pass in the input, we need to use the executorch's(nn.module) input order. + After we get into forward(), we then need to convert input order to qnn's input order. + Same as return, when qnn returns the value, we need to reorder them back to executorh's output order. 
+ """ + + def __init__(self, qnn_in_order, executorch_out_order): + super().__init__() + self.qnn_in_order = qnn_in_order + self.executorch_out_order = executorch_out_order + + def forward(self, *inputs): # executorch + if self.qnn_in_order: + inputs = tuple(inputs[i] for i in self.qnn_in_order) + ret = getattr( getattr(torch.ops, OpContextLoader.namespace), op_name ).default(inputs) + return ( + [ret[idx] for idx in self.executorch_out_order] + if self.executorch_out_order + else ret + ) + + inputs = ( + tuple(tuple(inputs.values())[i] for i in executorch_in_order) + if executorch_in_order + else tuple(inputs.values()) + ) - model = Model() - prog = torch.export.export(model, tuple(inputs.values())) + model = Model(qnn_in_order, executorch_out_order) + prog = torch.export.export(model, inputs) # bookkeeping for variables' life cycle return { "custom_op": custom_op, @@ -708,6 +741,7 @@ def preprocess_binary(ctx_bin, compiler_specs): for k, v in type_map.items(): dtype_map.setdefault(v, k) + qnn_in_order, executorch_in_order, executorch_out_order = [], [], [] if custom_info is not None: # since some context binaries might fail to open on host # if they are compiled with special flags: @@ -715,6 +749,9 @@ def preprocess_binary(ctx_bin, compiler_specs): # use custom information here instead inputs = build_tensor(custom_info["graph_inputs"], dtype_map) outputs = build_tensor(custom_info["graph_outputs"], dtype_map) + qnn_in_order = custom_info["qnn_in_order"] + executorch_in_order = custom_info["executorch_in_order"] + executorch_out_order = custom_info["executorch_out_order"] graph_name = custom_info["graph_name"] else: # get context-binary io tensor info through qnn manager @@ -729,15 +766,21 @@ def preprocess_binary(ctx_bin, compiler_specs): inputs = build_tensor(qnn_mgr.GetGraphInputs(graph_name), dtype_map) outputs = build_tensor(qnn_mgr.GetGraphOutputs(graph_name), dtype_map) qnn_mgr.Destroy() - # generate graph specific for loading context - bundle_prog = build_graph(inputs, outputs) + bundle_prog = build_graph( + inputs, outputs, qnn_in_order, executorch_in_order, executorch_out_order + ) bundle_prog.update({"inputs": inputs, "outputs": outputs}) + + # TODO: to_edge() decorator alters the function call behavior, which + # requires "self" when calling. To work around this issue, + # temporarily remove the first parameter name. 
edge_prog_mgr = to_edge( - programs={graph_name: bundle_prog["exported_program"]}, + {graph_name: bundle_prog["exported_program"]}, # do not alter name for custom op compile_config=EdgeCompileConfig(_use_edge_ops=False), ) + # update meta with context binary for n in edge_prog_mgr._edge_programs[graph_name].graph.nodes: if n.op == "call_function" and OpContextLoader.namespace in str(n.target): @@ -758,11 +801,23 @@ def draw_graph(title, path, graph_module: torch.fx.GraphModule): def generate_multi_graph_program( compiler_specs: List[CompileSpec], - processed_bytes: List[bytes], + exported_programs: List[ExportedProgram] = None, backend_config: ExecutorchBackendConfig = None, + constant_methods: Optional[Dict[str, Any]] = None, ) -> ExecutorchProgramManager: + # compile multiple graphs in qcir into single context binary - graph_inputs, graph_outputs = {}, {} + ( + graph_inputs, + graph_outputs, + qnn_in_order, + executorch_in_order, + executorch_out_order, + ) = ({}, {}, {}, {}, {}) + + processed_bytes = [ + prog.graph_module.lowered_module_0.processed_bytes for prog in exported_programs + ] qnn_mgr = PyQnnManagerAdaptor.QnnManager( generate_qnn_executorch_option(compiler_specs), processed_bytes ) @@ -773,6 +828,41 @@ def generate_multi_graph_program( for graph_name in graph_names: graph_inputs[graph_name] = qnn_mgr.GetGraphInputs(graph_name) graph_outputs[graph_name] = qnn_mgr.GetGraphOutputs(graph_name) + + # We need to obtain the order of the IOs to correctly map QNN with nn.module + for i, graph_name in enumerate(graph_names): + # input + input_names = [ + node.name + for node in exported_programs[i].graph_module.graph.nodes + if node.op == "placeholder" + ] + qnn_input_names = [wrapper.GetName() for wrapper in graph_inputs[graph_name]] + input_order_list = [] + for input_name in input_names: + # e.g., input_0_tokens_0 + pattern = rf"^input_(\d+)_({input_name})_(\d+)$" + for j in range(len(qnn_input_names)): + if re.match(pattern, qnn_input_names[j]): + input_order_list.append(j) + break + assert ( + len(input_order_list) == len(input_names) == len(qnn_input_names) + ), "Order list length is different from names" + executorch_in_order[graph_name] = input_order_list + qnn_in_order[graph_name] = sorted( + range(len(input_order_list)), key=lambda k: input_order_list[k] + ) + + # output + get_item_list = [ + node + for node in exported_programs[i].graph_module.graph.nodes + if node.op == "output" + ][0].args[0] + output_order_list = [item.args[1] for item in get_item_list] + executorch_out_order[graph_name] = output_order_list + qnn_mgr.Destroy() # build custom ops with different graph signatures @@ -786,16 +876,20 @@ def generate_multi_graph_program( "graph_inputs": graph_inputs[graph_name], "graph_outputs": graph_outputs[graph_name], "graph_name": graph_name, + "qnn_in_order": qnn_in_order[graph_name], + "executorch_in_order": executorch_in_order[graph_name], + "executorch_out_order": executorch_out_order[graph_name], }, ) for graph_name in graph_names ] # leverage ExecutorchProgramManager for generating pte with multi-methods edge_prog_mgr = to_edge( - programs={ + { graph_name: bundle_prog["exported_program"] for graph_name, bundle_prog in zip(graph_names, bundle_progs) }, + constant_methods=constant_methods, # do not alter name for custom op compile_config=EdgeCompileConfig(_use_edge_ops=False), ) @@ -806,7 +900,8 @@ def generate_multi_graph_program( n.meta[OpContextLoader.meta_ctx_bin] = binary_info break - return edge_prog_mgr.to_backend(QnnPartitioner(compiler_specs)).to_executorch( 
+ edge_prog_mgr = edge_prog_mgr.to_backend(QnnPartitioner(compiler_specs)) + return edge_prog_mgr.to_executorch( config=backend_config or ExecutorchBackendConfig() ) diff --git a/examples/qualcomm/oss_scripts/llama3_2/README.md b/examples/qualcomm/oss_scripts/llama3_2/README.md new file mode 100644 index 00000000000..51de982b1b1 --- /dev/null +++ b/examples/qualcomm/oss_scripts/llama3_2/README.md @@ -0,0 +1,39 @@ +# Summary + +## Overview +This file provides instructions to run LLAMA3.2 1B and 3B (WIP) with different parameters via the Qualcomm HTP backend. In LLAMA3.2, we offer the following modes to execute the model: + +Prefill Mode: This is also known as batch prefill mode, where the model takes in a list of tokens as input and generates the next token along with the key-value (KV) cache for all tokens. This mode is efficient for generating the initial sequence of tokens (usually the user's prompt). + +KV Cache Mode: In KV Cache mode, the model takes in a single previous token and generates the next predicted token along with its KV cache. It is efficient for generating subsequent tokens after the initial prompt. + +Hybrid Mode: Hybrid mode leverages the strengths of both batch prefill and KV cache modes to optimize token generation speed. Initially, it uses prefill mode to efficiently generate the prompt's key-value (KV) cache. Then, the mode switches to KV cache mode, which excels at generating subsequent tokens. + +## Instructions +### Note +1. For hybrid mode, the export time will be longer and can take up to 2-4 hours to complete. +2. When exporting a hybrid mode model, please ensure the device has at least 80 GB of memory and swap space. + +### Step 1: Setup +1. Follow the [tutorial](https://pytorch.org/executorch/main/getting-started-setup) to set up ExecuTorch. +2. Follow the [tutorial](https://pytorch.org/executorch/stable/build-run-qualcomm-ai-engine-direct-backend.html) to build Qualcomm AI Engine Direct Backend. + +### Step 2: Prepare Model +1. Follow the [instructions](https://www.llama.com/) to download models. +At the end of this step, users should have the following files ready: consolidated.00.pth, params.json, and tokenizer.model. + +### Step3: Run default examples using hybrid mode. +Default example using hybrid mode. +```bash +python examples/qualcomm/oss_scripts/llama3_2/llama.py -b build-android -s ${SERIAL_NUM} -m ${SOC_MODEL} --ptq 16a4w --checkpoint consolidated.00.pth --params params.json --tokenizer_model tokenizer.model --prompt "what is 1+1" --temperature 0 --model_size 1B --model_mode hybrid --prefill_seq_len 32 --kv_seq_len 128 +``` + +If you would like to compile the model only, we have provided the flag `--compile_only`. +```bash +python examples/qualcomm/oss_scripts/llama3_2/llama.py -b build-android -m ${SOC_MODEL} --ptq 16a4w --checkpoint consolidated.00.pth --params params.json --tokenizer_model tokenizer.model --prompt "what is 1+1" --temperature 0 --model_size 1B --model_mode hybrid --prefill_seq_len 32 --kv_seq_len 128 --compile_only +``` + +On the other hand, if you already have a pre-compiled .pte model, you can perform inference by providing the flag `--pre_gen_pte` and specifying the folder that contains the .pte model. 
+```bash +python examples/qualcomm/oss_scripts/llama3_2/llama.py -b build-android -s ${SERIAL_NUM} -m ${SOC_MODEL} --ptq 16a4w --checkpoint consolidated.00.pth --params params.json --tokenizer_model tokenizer.model --prompt "what is 1+1" --temperature 0 --model_size 1B --model_mode hybrid --prefill_seq_len 32 --kv_seq_len 128 --pre_gen_pte ${FOLDER_TO_PRE_GEN_PTE} +``` \ No newline at end of file diff --git a/examples/qualcomm/oss_scripts/llama3_2/llama.py b/examples/qualcomm/oss_scripts/llama3_2/llama.py index 75c0bb0ff0f..77347cc3616 100755 --- a/examples/qualcomm/oss_scripts/llama3_2/llama.py +++ b/examples/qualcomm/oss_scripts/llama3_2/llama.py @@ -4,6 +4,7 @@ # This source code is licensed under the BSD-style license found in the # LICENSE file in the root directory of this source tree. +import copy import getpass import json import logging @@ -20,7 +21,6 @@ from executorch.backends.qualcomm.quantizer.custom_annotation import ( annotate_matmul_16a8w, - custom_annotate_llama_last_conv_16a8w, ) from executorch.backends.qualcomm.quantizer.quantizer import QuantDtype @@ -30,6 +30,7 @@ capture_program, convert_linear_to_conv2d, generate_htp_compiler_spec, + generate_multi_graph_program, generate_qnn_executorch_compiler_spec, get_soc_to_chipset_map, ) @@ -44,6 +45,7 @@ SimpleADB, ) from executorch.exir import EdgeCompileConfig, EdgeProgramManager +from executorch.exir.backend.backend_api import to_backend from executorch.exir.capture._config import ExecutorchBackendConfig from executorch.exir.dialects._ops import ops as exir_ops from executorch.exir.passes.memory_planning_pass import MemoryPlanningPass @@ -59,8 +61,6 @@ logging.basicConfig(level=logging.INFO, format=FORMAT) logging.getLogger().setLevel(logging.INFO) -pte_filename = "llama3_2_qnn" - def _kv_calibrate( example_inputs, @@ -103,7 +103,7 @@ def _kv_calibrate( print(f"calibration data:\n{sp_model.decode(token_list)}") -def _batch_prefill_calibrate( +def _prefill_calibrate( example_inputs, user_prompts, module: torch.fx.GraphModule, @@ -147,7 +147,7 @@ def calibrate( max_seq_len=512, ): if len(example_inputs) == 2: - _batch_prefill_calibrate( + _prefill_calibrate( example_inputs, user_prompts, module, @@ -167,12 +167,13 @@ def calibrate( class SingleLlama: - def __init__(self, llama_model) -> None: + def __init__(self, llama_model, pte_filename) -> None: super().__init__() self.llama_model = llama_model self.quant_dtype = None self.llama_meta = self.llama_model.get_metadata() self.has_quant_io = False + self.pte_filename = pte_filename if self.llama_meta["get_use_kv_cache"]: tokens, atten_mask, pos_ids, k_caches, v_caches = self.get_example_inputs( use_kv_cache=True @@ -206,7 +207,7 @@ def _tag_kv_ios(self, gm: torch.fx.GraphModule, kv_type, sharding_type): == self.llama_meta["get_head_dim"] ): a.meta[QCOM_QUANTIZED_IO] = kv_type - # single head, batch_prefill mode + # single head, prefill mode elif a.meta["val"].flatten().size()[0] == self.llama_meta[ "get_head_dim" ] * (self.llama_meta["get_max_seq_len"] - 1): @@ -237,13 +238,12 @@ def quantize(self, quant_dtype, custom_annotations=()): ).module() fx_graph_module = prepare_pt2e(fx_graph_module, quantizer) logging.info("Quantizing the model...") - calibrate( self.get_example_inputs(self.llama_meta["get_use_kv_cache"]), args.prompt, fx_graph_module, tokenizer_model_path=args.tokenizer_model, - max_seq_len=args.seq_len, + max_seq_len=self.llama_meta["get_max_seq_len"], ) self.llama_model = convert_pt2e(fx_graph_module) @@ -277,7 +277,7 @@ def lowering_modules( compiler_specs = 
generate_qnn_executorch_compiler_spec( soc_model=soc_model, backend_options=backend_options, - shared_buffer=True, + shared_buffer=False, ) skip_node_op_set = {"llama.fallback.default"} partitioner = QnnPartitioner( @@ -313,49 +313,61 @@ def get_example_inputs(self, use_kv_cache=True): return self.llama_model.get_example_inputs(use_kv_cache) -def compile(args): +def compile(args, pte_filename): os.makedirs(args.artifact, exist_ok=True) start_ts = time.time() - if args.model_mode == "kv": - use_kv_cache = output_new_cache_only = True - matmul_annotate_func = partial(annotate_matmul_16a8w, traverse_input1=True) - elif args.model_mode == "batch_prefill": - use_kv_cache = output_new_cache_only = False - matmul_annotate_func = partial(annotate_matmul_16a8w, traverse_input1=False) - elif args.model_mode == "hybrid": - raise NotImplementedError( - f"model_mode {args.model_mode} is not implemented yet." - ) - else: - raise RuntimeError(f"No such model_mode {args.model_mode}.") - with open(args.params) as f: - config = ModelArgs(**json.load(f)) + kv_config = ModelArgs(**json.load(f)) # TODO: support batch inputs if necessary - config.max_batch_size = 1 - config.max_seq_len = args.seq_len - config.use_kv_cache = use_kv_cache + kv_config.max_batch_size = 1 + kv_config.max_seq_len = args.kv_seq_len + kv_config.use_kv_cache = True + + prefill_config = copy.copy(kv_config) + prefill_config.max_seq_len = args.prefill_seq_len + prefill_config.use_kv_cache = False + state_dict = torch.load( args.checkpoint, weights_only=True, map_location="cpu", mmap=True ) - llama_instance = None + llama_instance_list = [] with torch.device("meta"): - llama_instance = LlamaModel(config, output_new_cache_only=output_new_cache_only) + if args.model_mode == "kv": + llama_instance_list.append( + LlamaModel(kv_config, output_new_cache_only=True) + ) + elif args.model_mode == "prefill": + llama_instance_list.append( + LlamaModel(prefill_config, output_new_cache_only=False) + ) + elif args.model_mode == "hybrid": + llama_instance_list.append( + LlamaModel(prefill_config, output_new_cache_only=False) + ) + llama_instance_list.append( + LlamaModel(kv_config, output_new_cache_only=True) + ) + else: + raise RuntimeError(f"No such model_mode {args.model_mode}.") + if "model" in state_dict: state_dict = state_dict["model"] - llama_instance.load_state_dict( - state_dict, - strict=False, - assign=True, - ) + + for llama_instance in llama_instance_list: + llama_instance.load_state_dict( + state_dict, + strict=False, + assign=True, + ) end_load_ts = time.time() logging.info(f"Time for loading checkpoint: {end_load_ts - start_ts}") - for layer in llama_instance.layers: - if getattr(layer.attention, "prepare_sha", None): - layer.attention.prepare_sha() + for llama_instance in llama_instance_list: + for layer in llama_instance.layers: + if getattr(layer.attention, "prepare_sha", None): + layer.attention.prepare_sha() use_fp16 = False if args.ptq != None: @@ -378,60 +390,136 @@ def compile(args): if args.dtype_override is not None: dtype_override = DType[args.dtype_override] - llama_instance = llama_instance.to(dtype_override.to_torch_dtype()) + for i in range(len(llama_instance_list)): + llama_instance_list[i] = llama_instance_list[i].to( + dtype_override.to_torch_dtype() + ) - llama_instance = convert_linear_to_conv2d(llama_instance) - single_llama = SingleLlama(llama_instance.eval()) + for i in range(len(llama_instance_list)): + llama_instance_list[i] = convert_linear_to_conv2d(llama_instance_list[i]) + llama_instance_list[i] = 
SingleLlama( + llama_instance_list[i].eval(), pte_filename + ) if args.ptq != None: start_quantize_ts = time.time() - single_llama.quantize( - quant_dtype, - custom_annotations=( - custom_annotate_llama_last_conv_16a8w, - matmul_annotate_func, - ), - ) + for llama_instance in llama_instance_list: + llama_instance.quantize( + quant_dtype, + custom_annotations=( + partial( + annotate_matmul_16a8w, + traverse_input1=llama_instance.llama_meta["get_use_kv_cache"], + ), + ), + ) end_quantize_ts = time.time() logging.info(f"Time for quantizing: {end_quantize_ts - start_quantize_ts}") start_lowering_ts = time.time() - single_llama.lowering_modules( - args.artifact, - kv_type=kv_type, - sharding_type=sharding_type, - use_fp16=use_fp16, - soc_model=get_soc_to_chipset_map()[args.model], - num_sharding=args.num_sharding, - ) + + if len(llama_instance_list) == 1: + llama_instance_list[0].lowering_modules( + args.artifact, + kv_type=kv_type, + sharding_type=sharding_type, + use_fp16=use_fp16, + soc_model=get_soc_to_chipset_map()[args.model], + num_sharding=args.num_sharding, + ) + else: + sample_inputs_list = [ + llama_instace.inputs for llama_instace in llama_instance_list + ] + edge_progs = [ + capture_program(llama_instance.llama_model, sample_input) + for llama_instance, sample_input in zip( + llama_instance_list, sample_inputs_list + ) + ] + + if args.num_sharding > 0: + for i in range(len(llama_instance_list)): + model_sharding.split_graph( + edge_progs[i].exported_program, + llama_instance_list[i].llama_meta["get_n_layers"], + shares=args.num_sharding, + ) + + for i in range(len(llama_instance_list)): + llama_instance_list[i]._tag_kv_ios( + edge_progs[i].exported_program.graph_module, + kv_type=kv_type, + sharding_type=sharding_type, + ) + backend_options = generate_htp_compiler_spec(use_fp16=use_fp16) + graph_names = ["prefill_forward", "kv_forward"] + compiler_specs = [ + generate_qnn_executorch_compiler_spec( + soc_model=get_soc_to_chipset_map()[args.model], + backend_options=backend_options, + shared_buffer=True, + multiple_graphs=True, + graph_name=graph_name, + ) + for graph_name in graph_names + ] + exported_programs = [ + to_backend(edge_prog.exported_program, QnnPartitioner(compiler_specs[i])) + for i, edge_prog in enumerate(edge_progs) + ] + + executorch_config = ExecutorchBackendConfig( + passes=[ + BuildQuantIo(), + ], + # For shared buffer, user must pass the memory address + # which is allocated by RPC memory to executor runner. + # Therefore, won't want to pre-allocate + # by memory manager in runtime. 
+ memory_planning_pass=MemoryPlanningPass( + alloc_graph_input=False, + alloc_graph_output=False, + ), + extract_delegate_segments=True, + ) + + prog_mgr = generate_multi_graph_program( + compiler_specs=compiler_specs[0], + exported_programs=exported_programs, + backend_config=executorch_config, + constant_methods=llama_instance_list[1].llama_meta, # kv method meta + ) + with open(f"{args.artifact}/{pte_filename}.pte", "wb") as file: + prog_mgr.write_to_file(file) + end_lowering_ts = time.time() logging.info(f"Time for compiling: {end_lowering_ts - start_lowering_ts}") -def inference(args, pre_gen_pte=""): +def inference(args, pte_filename, pre_gen_pte=""): workspace = f"/data/local/tmp/{getpass.getuser()}/executorch/single_llama" - if args.model_mode == "batch_prefill": + if args.model_mode == "prefill": eval_mode = 0 elif args.model_mode == "kv": eval_mode = 1 elif args.model_mode == "hybrid": eval_mode = 2 - raise NotImplementedError( - f"model_mode {args.model_mode} is not implemented yet." - ) else: raise RuntimeError(f"No such model_mode {args.model_mode}.") + seq_len = args.prefill_seq_len if args.model_mode == "prefill" else args.kv_seq_len runner_args = " ".join( [ f"--model_path {pte_filename}.pte", "--output_path outputs/outputs.txt", f"--tokenizer_path {os.path.basename(args.tokenizer_model)}", f'--prompt "{args.prompt}"', - f"--seq_len {args.seq_len}", + f"--seq_len {seq_len}", f"--eval_mode {eval_mode}", f"--temperature {args.temperature}", + f"--system_prompt '{args.system_prompt}'", ] ) runner_cmd = " ".join( @@ -541,10 +629,10 @@ def post_process(): ) parser.add_argument( - "--seq_len", - help="Ouput sequence length for llama.", - default=128, - type=int, + "--system_prompt", + help="Tells the model what kind of assistant it should be. For example, You are a helpful AI assistant for travel tips and recommendations. Default is None", + default="", + type=str, ) parser.add_argument( @@ -578,27 +666,53 @@ def post_process(): parser.add_argument( "--model_mode", - help="Export and inference batch_prefill mode, kv mode or hybrid(TBD) mode", + help="Export and inference prefill mode, kv mode or hybrid mode", default="kv", - choices=["batch_prefill", "kv", "hybrid"], + choices=["prefill", "kv", "hybrid"], type=str, ) + parser.add_argument( + "--prefill_seq_len", + help="Ouput sequence length for llama. Use this option for prefill or hybrid mode", + default=32, + type=int, + ) + + parser.add_argument( + "--kv_seq_len", + help="Ouput sequence length for llama. 
Use this option for kv or hybrid mode", + default=512, + type=int, + ) + args = parser.parse_args() if args.compile_only and args.pre_gen_pte: exit("Cannot set both compile_only and pre_gen_pte as true") + if args.model_mode == "kv": + pte_filename = "kv_llama3_2_qnn" + elif args.model_mode == "prefill": + pte_filename = "prefill_llama3_2_qnn" + elif args.model_mode == "hybrid": + assert ( + args.kv_seq_len >= args.prefill_seq_len + ), "Please ensure kv_seq_len is >= prefill_seq_len" + pte_filename = "hybrid_llama3_2_qnn" + else: + raise RuntimeError(f"No such model_mode {args.model_mode}.") + if args.pre_gen_pte: - inference(args, args.pre_gen_pte) + inference(args, pte_filename, args.pre_gen_pte) exit(f"Finish the running pre_gen_pte from {args.pre_gen_pte}") if args.compile_only: - compile(args) + compile(args, pte_filename) exit(f"Finish compile_only and save to {args.artifact}") try: - compile(args) - inference(args) + compile(args, pte_filename) + inference(args, pte_filename) except Exception as e: if args.ip and args.port != -1: with Client((args.ip, args.port)) as conn: diff --git a/examples/qualcomm/oss_scripts/llama3_2/qnn_llama3_2_runner.cpp b/examples/qualcomm/oss_scripts/llama3_2/qnn_llama3_2_runner.cpp index 554e3ba9329..8c7ac6dd363 100644 --- a/examples/qualcomm/oss_scripts/llama3_2/qnn_llama3_2_runner.cpp +++ b/examples/qualcomm/oss_scripts/llama3_2/qnn_llama3_2_runner.cpp @@ -18,6 +18,7 @@ #include #include #include +#include DEFINE_string( model_path, @@ -46,7 +47,7 @@ DEFINE_int32( DEFINE_int32( eval_mode, 0, - "0: PromptProcessor(batch_prefill) / 1: TokenGenerator(kv) / 2: HybridMode (TBD)"); + "0: PromptProcessor(prefill) / 1: TokenGenerator(kv) / 2: HybridMode (prefill+kv)"); int main(int argc, char** argv) { gflags::ParseCommandLineFlags(&argc, &argv, true); @@ -55,16 +56,21 @@ int main(int argc, char** argv) { example::Runner runner( {FLAGS_model_path}, FLAGS_tokenizer_path.c_str(), + FLAGS_prompt.c_str(), + FLAGS_system_prompt.c_str(), FLAGS_temperature, FLAGS_eval_mode); - - // generate tokens & store inference output + std::vector buf; + buf.reserve(5 * FLAGS_seq_len); // assume each token is around 5 char std::ofstream fout(FLAGS_output_path.c_str()); - runner.generate( - FLAGS_prompt, - FLAGS_system_prompt, - FLAGS_seq_len, - [&](const std::string& piece) { fout << piece; }); + auto callback = [&](const std::string& piece) { + for (const char c : piece) { + buf.push_back(c); + } + }; + // generate tokens & store inference output + runner.generate(FLAGS_seq_len, callback); + fout.write(buf.data(), buf.size()); fout.close(); return 0; } diff --git a/examples/qualcomm/oss_scripts/llama3_2/runner/io_memory.cpp b/examples/qualcomm/oss_scripts/llama3_2/runner/io_memory.cpp index 9b37d056cf5..aabad659f48 100644 --- a/examples/qualcomm/oss_scripts/llama3_2/runner/io_memory.cpp +++ b/examples/qualcomm/oss_scripts/llama3_2/runner/io_memory.cpp @@ -23,10 +23,7 @@ using executorch::runtime::TensorInfo; namespace example { Memory::Memory(std::vector>& modules) - : data_ptr_(nullptr, [](void*) {}), - input_tensors_(modules.size()), - output_tensors_(modules.size()), - modules_(modules) {} + : data_ptr_(nullptr, [](void*) {}), modules_(modules) {} Memory::~Memory() {} @@ -34,19 +31,23 @@ void* Memory::get_mutable_ptr() { return data_ptr_.get(); } -std::vector Memory::get_input_tensors(int shard_index) { +std::vector Memory::get_input_tensors( + int shard_index, + const std::string& method_name) { std::vector ret; ret.reserve(input_tensors_.size()); - for (TensorImpl* impl : 
input_tensors_[shard_index]) { + for (TensorImpl* impl : input_tensors_[method_name][shard_index]) { ret.emplace_back(Tensor(impl)); } return ret; } -std::vector Memory::get_output_tensors(int shard_index) { +std::vector Memory::get_output_tensors( + int shard_index, + const std::string& method_name) { std::vector ret; - ret.reserve(output_tensors_.size()); - for (TensorImpl* impl : output_tensors_[shard_index]) { + ret.reserve(output_tensors_[method_name][shard_index].size()); + for (TensorImpl* impl : output_tensors_[method_name][shard_index]) { ret.emplace_back(Tensor(impl)); } return ret; @@ -58,22 +59,110 @@ HybridMemory::HybridMemory( int32_t vocab_size, int32_t num_layers, int32_t head_dim, - int32_t num_heads) + int32_t num_heads, + EvalMode eval_mode, + const std::string& prefill_forward_name, + const std::string& kv_forward_name) : Memory(modules), shard_layers_({num_layers}), max_seq_len_(max_seq_len), vocab_size_(vocab_size), num_layers_(num_layers), head_dim_(head_dim), - num_heads_(num_heads) { + num_heads_(num_heads), + eval_mode_(eval_mode), + prefill_forward_name_(prefill_forward_name), + kv_forward_name_(kv_forward_name) { + if (!prefill_forward_name_.empty()) { + input_tensors_[prefill_forward_name_] = + std::vector>(modules.size()); + output_tensors_[prefill_forward_name_] = + std::vector>(modules.size()); + k_cache_in_[prefill_forward_name_] = + std::vector>(); + v_cache_in_[prefill_forward_name_] = + std::vector>(); + k_cache_out_[prefill_forward_name_] = + std::vector>(); + v_cache_out_[prefill_forward_name_] = + std::vector>(); + } + if (!kv_forward_name_.empty()) { + input_tensors_[kv_forward_name_] = + std::vector>(modules.size()); + output_tensors_[kv_forward_name_] = + std::vector>(modules.size()); + k_cache_in_[kv_forward_name_] = + std::vector>(); + v_cache_in_[kv_forward_name_] = + std::vector>(); + k_cache_out_[kv_forward_name_] = + std::vector>(); + v_cache_out_[kv_forward_name_] = + std::vector>(); + } + data_ptr_ = std::unique_ptr( new IO, [](void* ptr) { delete static_cast(ptr); }); } -void HybridMemory::prepare_kv_io( - const std::vector>& methods_meta) { +void HybridMemory::init_io( + const std::vector>& methods_meta, + EvalMode eval_mode) { IO* ptr = static_cast(data_ptr_.get()); std::memset(ptr, 0, sizeof(IO)); + + int32_t cache_len = max_seq_len_ - 1; + int32_t k_in_size = (head_dim_ + 1) * (max_seq_len_ - 1); + int32_t k_cache_out_size = num_heads_ * head_dim_ * cache_len; + int32_t v_cache_size = (num_heads_ + 1) * (max_seq_len_ - 1) * head_dim_; + + // Init kv vector shape, general enough to be shared across all 3 modes. 
+ ptr->k_cache_out.reserve(num_layers_); + ptr->v_cache.reserve(num_layers_); + for (int layer = 0; layer < num_layers_; layer++) { + ptr->k_cache_out.emplace_back(std::vector(k_cache_out_size)); + ptr->v_cache.emplace_back(std::vector(v_cache_size)); + } + + auto init_prefill = [&]() { + ptr->prefill_input_toks.resize(cache_len); + ptr->prefill_atten_mask.resize(cache_len * cache_len); + ptr->prefill_logits.resize(cache_len * vocab_size_); + }; + + auto init_kv = [&]() { + ptr->kv_logits.resize(vocab_size_); + ptr->kv_attention_mask.resize(max_seq_len_, -255); + ptr->k_cache.reserve(num_layers_); + for (int layer = 0; layer < num_layers_; layer++) { + ptr->k_cache.emplace_back(); + ptr->k_cache[layer].reserve(num_heads_); + for (int head = 0; head < num_heads_; head++) { + ptr->k_cache[layer].emplace_back(std::vector(k_in_size)); + } + } + }; + + switch (eval_mode) { + case EvalMode::kPrefill: + init_prefill(); + break; + case EvalMode::kKVCached: + init_kv(); + break; + case EvalMode::kHybrid: + init_prefill(); + init_kv(); + break; + default: + break; + } +} + +void HybridMemory::prepare_kv_io( + const std::vector>& methods_meta) { for (int i = 0; i < modules_.size(); ++i) { ET_CHECK_MSG( methods_meta[i].ok(), @@ -81,23 +170,8 @@ void HybridMemory::prepare_kv_io( static_cast(methods_meta[i].error())); } - // Init IO vector shape - // atten_mask - ptr->logits.resize(vocab_size_); - ptr->attention_mask.resize( - max_seq_len_, -255); // attention mask shape should be [1, ctx_length] - // kv - int32_t k_in_size = (head_dim_ + 1) * (max_seq_len_ - 1); - int32_t k_out_size = num_heads_ * head_dim_; - int32_t v_cache_size = (num_heads_ + 1) * (max_seq_len_ - 1) * head_dim_; - for (int layer = 0; layer < num_layers_; layer++) { - ptr->k_cache.emplace_back(); - for (int head = 0; head < num_heads_; head++) { - ptr->k_cache[layer].emplace_back(std::vector(k_in_size)); - } - ptr->k_cache_out.emplace_back(std::vector(k_out_size)); - ptr->v_cache.emplace_back(std::vector(v_cache_size)); - } + ET_CHECK_MSG(!(kv_forward_name_.empty()), "kv forward name is empty"); + IO* ptr = static_cast(data_ptr_.get()); // [I]: input_tokens Result input_tok = methods_meta[0]->input_tensor_meta(0); @@ -107,7 +181,7 @@ void HybridMemory::prepare_kv_io( const_cast(input_tok->sizes().data()), &ptr->input_tok, const_cast(input_tok->dim_order().data())); - input_tensors_[0].push_back(input_tok_.get()); + input_tensors_[kv_forward_name_][0].push_back(input_tok_.get()); // [I]: atten_mask Result atten_mask = methods_meta[0]->input_tensor_meta(1); @@ -115,9 +189,9 @@ void HybridMemory::prepare_kv_io( atten_mask->scalar_type(), atten_mask->sizes().size(), const_cast(atten_mask->sizes().data()), - ptr->attention_mask.data(), + ptr->kv_attention_mask.data(), const_cast(atten_mask->dim_order().data())); - input_tensors_[0].push_back(attention_mask_.get()); + input_tensors_[kv_forward_name_][0].push_back(attention_mask_.get()); // [I]: input_pos Result input_pos = methods_meta[0]->input_tensor_meta(2); @@ -127,7 +201,7 @@ void HybridMemory::prepare_kv_io( const_cast(input_pos->sizes().data()), &ptr->input_pos, const_cast(input_pos->dim_order().data())); - input_tensors_[0].push_back(input_pos_.get()); + input_tensors_[kv_forward_name_][0].push_back(input_pos_.get()); // [I] kv_cache int index = 3; // bypass input_tokens, input_pos, atten_mask @@ -142,7 +216,8 @@ void HybridMemory::prepare_kv_io( Result kv_cache = methods_meta[shard_index]->input_tensor_meta(index); std::vector>& cache = - (cache_group == 0 ? 
k_cache_in_ : v_cache_in_); + (cache_group == 0 ? k_cache_in_[kv_forward_name_] + : v_cache_in_[kv_forward_name_]); void* cache_ptr = (cache_group == 0) ? static_cast(ptr->k_cache[layer + offset][head].data()) : static_cast( @@ -155,7 +230,8 @@ void HybridMemory::prepare_kv_io( cache_ptr, const_cast( kv_cache->dim_order().data()))); - input_tensors_[shard_index].push_back(cache.back().get()); + input_tensors_[kv_forward_name_][shard_index].push_back( + cache.back().get()); } } } @@ -165,13 +241,14 @@ void HybridMemory::prepare_kv_io( int logit_index = 0; Result logits = methods_meta[modules_.size() - 1]->output_tensor_meta(logit_index); - logits_ = std::make_unique( + kv_logits_ = std::make_unique( logits->scalar_type(), logits->sizes().size(), const_cast(logits->sizes().data()), - ptr->logits.data(), + ptr->kv_logits.data(), const_cast(logits->dim_order().data())); - output_tensors_[modules_.size() - 1].push_back(logits_.get()); + output_tensors_[kv_forward_name_][modules_.size() - 1].push_back( + kv_logits_.get()); // [O] kv_cache index = 1; @@ -190,7 +267,8 @@ void HybridMemory::prepare_kv_io( Result kv_cache = methods_meta[shard_index]->output_tensor_meta(index); std::vector>& cache = - (cache_group == 0 ? k_cache_out_ : v_cache_out_); + (cache_group == 0 ? k_cache_out_[kv_forward_name_] + : v_cache_out_[kv_forward_name_]); void* cache_ptr = (cache_group == 0) ? static_cast( ptr->k_cache_out[layer + offset].data() + @@ -205,7 +283,8 @@ void HybridMemory::prepare_kv_io( cache_ptr, const_cast( kv_cache->dim_order().data()))); - output_tensors_[shard_index].push_back(cache.back().get()); + output_tensors_[kv_forward_name_][shard_index].push_back( + cache.back().get()); } } } @@ -214,8 +293,6 @@ void HybridMemory::prepare_kv_io( void HybridMemory::prepare_prefill_io( const std::vector>& methods_meta) { - IO* ptr = static_cast(data_ptr_.get()); - std::memset(ptr, 0, sizeof(IO)); for (int i = 0; i < modules_.size(); ++i) { ET_CHECK_MSG( methods_meta[i].ok(), @@ -223,24 +300,13 @@ void HybridMemory::prepare_prefill_io( static_cast(methods_meta[i].error())); } - // Parse some IO info from method meta - // cache_len should be max_seq_len - 1 - int cache_len = methods_meta[0]->input_tensor_meta(0)->sizes()[1]; - - // TODO: Combine vector init with KV mode once Hybrid mode is enabled - // as it shares some common data structure. 
- // Init IO vector shape - ptr->prefill_input_toks.resize(cache_len); - ptr->prefill_atten_mask.resize(cache_len * cache_len); - ptr->prefill_logits.resize(cache_len * vocab_size_); - // Init kv vector shape - int32_t k_cache_out_size = num_heads_ * head_dim_ * cache_len; - int32_t v_cache_size = (num_heads_ + 1) * cache_len * head_dim_; - for (int layer = 0; layer < num_layers_; layer++) { - ptr->k_cache_out.emplace_back(std::vector(k_cache_out_size)); - ptr->v_cache.emplace_back(std::vector(v_cache_size)); - } + ET_CHECK_MSG( + !(prefill_forward_name_.empty()), "prefill forward name is empty"); + IO* ptr = static_cast(data_ptr_.get()); + + // cache_len should be max_seq_len - 1 + int32_t cache_len = methods_meta[0]->input_tensor_meta(0)->sizes()[1]; // [I]: pre_input_tokens Result prefill_input_toks = methods_meta[0]->input_tensor_meta(0); prefill_input_toks_ = std::make_unique( @@ -250,7 +316,7 @@ void HybridMemory::prepare_prefill_io( ptr->prefill_input_toks.data(), const_cast( prefill_input_toks->dim_order().data())); - input_tensors_[0].push_back(prefill_input_toks_.get()); + input_tensors_[prefill_forward_name_][0].push_back(prefill_input_toks_.get()); // [I]: prefill_attn_mask for (int i = 0; i < cache_len; ++i) { for (int j = 0; j < cache_len; ++j) { @@ -261,28 +327,26 @@ void HybridMemory::prepare_prefill_io( } } } - - Result prefill_attn_mask = methods_meta[0]->input_tensor_meta(1); + Result prefill_atten_mask = methods_meta[0]->input_tensor_meta(1); prefill_attn_mask_ = std::make_unique( - prefill_attn_mask->scalar_type(), - prefill_attn_mask->sizes().size(), - const_cast(prefill_attn_mask->sizes().data()), + prefill_atten_mask->scalar_type(), + prefill_atten_mask->sizes().size(), + const_cast(prefill_atten_mask->sizes().data()), ptr->prefill_atten_mask.data(), const_cast( - prefill_attn_mask->dim_order().data())); - input_tensors_[0].push_back(prefill_attn_mask_.get()); - + prefill_atten_mask->dim_order().data())); + input_tensors_[prefill_forward_name_][0].push_back(prefill_attn_mask_.get()); // [O]: logits int logit_index = 0; - Result logits = - methods_meta[modules_.size() - 1]->output_tensor_meta(logit_index); - logits_ = std::make_unique( + Result logits = methods_meta[0]->output_tensor_meta(0); + prefill_logits_ = std::make_unique( logits->scalar_type(), logits->sizes().size(), const_cast(logits->sizes().data()), ptr->prefill_logits.data(), const_cast(logits->dim_order().data())); - output_tensors_[modules_.size() - 1].push_back(logits_.get()); + output_tensors_[prefill_forward_name_][modules_.size() - 1].push_back( + prefill_logits_.get()); // [O] kv_cache int index = 1; for (int offset = 0, shard_index = 0, cache_stride = cache_len * head_dim_; @@ -294,13 +358,15 @@ void HybridMemory::prepare_prefill_io( Result kv_cache = methods_meta[shard_index]->output_tensor_meta(index); std::vector>& cache = - (cache_group == 0 ? k_cache_out_ : v_cache_out_); + (cache_group == 0 ? k_cache_out_[prefill_forward_name_] + : v_cache_out_[prefill_forward_name_]); void* cache_ptr = (cache_group == 0) ? 
static_cast( ptr->k_cache_out[layer + offset].data() + head * cache_stride) : static_cast( - ptr->v_cache[layer + offset].data() + head * cache_stride); + ptr->v_cache[layer + offset].data() + + (head + 1) * cache_stride); cache.emplace_back(std::make_unique( kv_cache->scalar_type(), kv_cache->sizes().size(), @@ -308,14 +374,72 @@ void HybridMemory::prepare_prefill_io( cache_ptr, const_cast( kv_cache->dim_order().data()))); - output_tensors_[shard_index].push_back(cache.back().get()); + output_tensors_[prefill_forward_name_][shard_index].push_back( + cache.back().get()); } } } } } -void HybridMemory::update_io( +void HybridMemory::update_prefill_to_kv_io( + int64_t cur_token, + int64_t pos, + std::vector>& output_tensors) { + int cache_len = (max_seq_len_ - 1); + IO* ptr = static_cast(data_ptr_.get()); + + ptr->input_tok = static_cast(cur_token); + ptr->input_pos = static_cast(pos); + // If prompt len is 30, prefill will handle to pos = 30. + // At this point, pos should be 31. + for (int i = 0; i < pos + 1; i++) { + ptr->kv_attention_mask[cache_len - i] = 0; + } + + // update v_cache + std::vector>& v_cache_in = + v_cache_in_[kv_forward_name_]; + std::vector>& v_cache_out = + v_cache_out_[kv_forward_name_]; + for (int i = 0, v_cache_stride = head_dim_ * pos; i < v_cache_in.size(); + i++) { + v_cache_in[i]->set_data( + v_cache_in[i]->mutable_data() + v_cache_stride); + v_cache_out[i]->set_data( + v_cache_out[i]->mutable_data() + v_cache_stride); + } + for (int shard = 0; shard < output_tensors.size(); shard++) { + for (int index = 0; index < output_tensors[shard].size(); index++) { + ET_CHECK_MSG( + modules_[shard]->set_output( + kv_forward_name_, output_tensors[shard][index], index) == + Error::Ok, + "Failed to set output tensor for module %d's %d'th output " + "while updating kv_cache output tensors", + shard, + index); + } + } + + std::vector>& k_cache_in = + k_cache_in_[kv_forward_name_]; + std::vector>& k_cache_out = + k_cache_out_[prefill_forward_name_]; + for (int i = 0; i < k_cache_in.size(); ++i) { + uint8_t* ptr_in = k_cache_in[i]->mutable_data(); + const uint8_t* ptr_out = k_cache_out[i]->data(); + for (size_t j = 0, offset = cache_len; j < head_dim_; + ++j, offset += cache_len) { + for (int k = 0, k_stride = j * cache_len; k < pos; k++) { + ptr_in[offset + k] = ptr_out[k_stride + k]; + } + } + k_cache_in[i]->set_data(ptr_in + pos); + } +} + +void HybridMemory::update_kv_io( int64_t cur_token, int64_t pos, std::vector>& output_tensors) { @@ -326,19 +450,22 @@ void HybridMemory::update_io( // update position_ids ptr->input_pos = static_cast(pos); // update causal mask for next token - ptr->attention_mask[seq_len - pos] = 0; + ptr->kv_attention_mask[seq_len - pos] = 0; // update v_cache - for (int i = 0; i < v_cache_in_.size(); i++) { - v_cache_in_[i]->set_data( - v_cache_in_[i]->mutable_data() + head_dim_); - v_cache_out_[i]->set_data( - v_cache_out_[i]->mutable_data() + head_dim_); + auto& v_cache_in = v_cache_in_[kv_forward_name_]; + auto& v_cache_out = v_cache_out_[kv_forward_name_]; + for (int i = 0; i < v_cache_in.size(); i++) { + v_cache_in[i]->set_data(v_cache_in[i]->mutable_data() + head_dim_); + v_cache_out[i]->set_data( + v_cache_out[i]->mutable_data() + head_dim_); } + for (int shard = 0; shard < output_tensors.size(); shard++) { for (int index = 0; index < output_tensors[shard].size(); index++) { ET_CHECK_MSG( - modules_[shard]->set_output(output_tensors[shard][index], index) == + modules_[shard]->set_output( + kv_forward_name_, output_tensors[shard][index], index) == 
Error::Ok, "failed to set output tensor for module %d's %d'th output " "while updating kv_cache output tensors", @@ -347,15 +474,17 @@ void HybridMemory::update_io( } } + auto& k_cache_in = k_cache_in_[kv_forward_name_]; + auto& k_cache_out = k_cache_out_[kv_forward_name_]; // update k_cache by single thread, this part is cpu cache sensitive - for (int i = 0; i < k_cache_in_.size(); ++i) { - uint8_t* ptr_in = k_cache_in_[i]->mutable_data(); - const uint8_t* ptr_out = k_cache_out_[i]->data(); + for (int i = 0; i < k_cache_in.size(); ++i) { + uint8_t* ptr_in = k_cache_in[i]->mutable_data(); + const uint8_t* ptr_out = k_cache_out[i]->data(); for (size_t j = 0, offset = seq_len; j < head_dim_; ++j, offset += seq_len) { ptr_in[offset] = ptr_out[j]; } - k_cache_in_[i]->set_data(ptr_in + 1); + k_cache_in[i]->set_data(ptr_in + 1); } } diff --git a/examples/qualcomm/oss_scripts/llama3_2/runner/io_memory.h b/examples/qualcomm/oss_scripts/llama3_2/runner/io_memory.h index 31ed351ef4b..956d58caf23 100644 --- a/examples/qualcomm/oss_scripts/llama3_2/runner/io_memory.h +++ b/examples/qualcomm/oss_scripts/llama3_2/runner/io_memory.h @@ -21,10 +21,20 @@ namespace example { +enum EvalMode { + kPrefill = 0, + kKVCached, + kHybrid, + kUnsupported, +}; class Memory { public: Memory(std::vector>& modules); virtual ~Memory(); + virtual void init_io( + const std::vector>& methods_meta, + EvalMode eval_mode) = 0; virtual void prepare_prefill_io( const std::vector< executorch::runtime::Result>& @@ -33,18 +43,32 @@ class Memory { const std::vector< executorch::runtime::Result>& methods_meta) = 0; - virtual void update_io( + virtual void update_prefill_to_kv_io( + int64_t cur_token, + int64_t pos, + std::vector>& output_tensors) = 0; + virtual void update_kv_io( int64_t cur_token, int64_t pos, std::vector>& output_tensors) = 0; void* get_mutable_ptr(); - std::vector get_input_tensors(int shard_index); - std::vector get_output_tensors(int shard_index); + std::vector get_input_tensors( + int shard_index, + const std::string& method_name); + std::vector get_output_tensors( + int shard_index, + const std::string& method_name); protected: std::unique_ptr data_ptr_; - std::vector> input_tensors_; - std::vector> output_tensors_; + std::unordered_map< + std::string, + std::vector>> + input_tensors_; + std::unordered_map< + std::string, + std::vector>> + output_tensors_; std::vector> modules_; }; @@ -56,7 +80,15 @@ class HybridMemory : public Memory { int32_t vocab_size, int32_t num_layers, int32_t head_dim, - int32_t num_heads); + int32_t num_heads, + EvalMode eval_mode, + const std::string& prefill_forward_name, + const std::string& kv_forward_name); + + void init_io( + const std::vector>& methods_meta, + EvalMode eval_mode) override; void prepare_prefill_io( const std::vector< executorch::runtime::Result>& @@ -65,7 +97,12 @@ class HybridMemory : public Memory { const std::vector< executorch::runtime::Result>& methods_meta) override; - void update_io( + void update_prefill_to_kv_io( + int64_t cur_token, + int64_t pos, + std::vector>& output_tensors) + override; + void update_kv_io( int64_t cur_token, int64_t pos, std::vector>& output_tensors) @@ -73,11 +110,11 @@ class HybridMemory : public Memory { struct IO { int32_t input_tok; int32_t input_pos; - std::vector attention_mask; std::vector>> k_cache; std::vector> v_cache; std::vector> k_cache_out; - std::vector logits; + std::vector kv_attention_mask; + std::vector kv_logits; std::vector prefill_input_toks; std::vector prefill_atten_mask; std::vector prefill_logits; @@ 
-90,17 +127,33 @@ class HybridMemory : public Memory { std::unique_ptr attention_mask_; std::unique_ptr prefill_input_toks_; std::unique_ptr prefill_attn_mask_; - std::vector> k_cache_in_; - std::vector> v_cache_in_; - std::vector> k_cache_out_; - std::vector> v_cache_out_; - std::unique_ptr logits_; + std::unique_ptr prefill_logits_; + std::unordered_map< + std::string, + std::vector>> + k_cache_in_; + std::unordered_map< + std::string, + std::vector>> + v_cache_in_; + std::unordered_map< + std::string, + std::vector>> + k_cache_out_; + std::unordered_map< + std::string, + std::vector>> + v_cache_out_; + std::unique_ptr kv_logits_; std::vector shard_layers_; int32_t max_seq_len_; int32_t vocab_size_; int32_t num_layers_; int32_t head_dim_; int32_t num_heads_; + EvalMode eval_mode_; + std::string prefill_forward_name_; + std::string kv_forward_name_; }; } // namespace example diff --git a/examples/qualcomm/oss_scripts/llama3_2/runner/runner.cpp b/examples/qualcomm/oss_scripts/llama3_2/runner/runner.cpp index 80da5b98873..ce784fed500 100644 --- a/examples/qualcomm/oss_scripts/llama3_2/runner/runner.cpp +++ b/examples/qualcomm/oss_scripts/llama3_2/runner/runner.cpp @@ -41,19 +41,22 @@ std::string statsToJsonString(const Runner::Stats& stats); Runner::Runner( const std::vector& models_path, const std::string& tokenizer_path, + const std::string& prompt, + const std::string& system_prompt, const float temperature, const int eval_mode) : n_bos_(1), n_eos_(1), tokenizer_path_(tokenizer_path), temperature_(temperature), - eval_mode_(eval_mode) { + eval_mode_(static_cast(eval_mode)) { for (size_t i = 0; i < models_path.size(); ++i) { modules_.push_back(std::make_shared( models_path[i], Module::LoadMode::MmapUseMlockIgnoreErrors)); ET_LOG(Info, "creating module: model_path=%s", models_path[i].c_str()); } ET_LOG(Info, "creating runner: tokenizer_path=%s", tokenizer_path_.c_str()); + ET_LOG(Info, "eval mode=%d", eval_mode); int64_t max_seq_len = getMetadataHelper("get_max_seq_len", -1); int64_t vocab_size = getMetadataHelper("get_vocab_size", -1); @@ -76,8 +79,47 @@ Runner::Runner( bos_id_ = tokenizer_->bos_tok(); eos_id_.insert(tokenizer_->eos_tok()); + ET_CHECK_MSG(!prompt.empty(), "prompt cannot be null"); + + if (!system_prompt.empty()) { + prompt_.append("<|start_header_id|>system<|end_header_id|>\n\n"); + prompt_.append(system_prompt); + prompt_.append("<|eot_id|>\n"); + } + prompt_.append("<|start_header_id|>user<|end_header_id|>\n\n"); + prompt_.append(prompt); + prompt_.append("<|eot_id|><|start_header_id|>assistant<|end_header_id|>"); + + switch (eval_mode_) { + case EvalMode::kPrefill: + prefill_forward_name_ = "forward"; + method_names_.emplace_back(prefill_forward_name_); + break; + case EvalMode::kKVCached: + kv_forward_name_ = "forward"; + method_names_.emplace_back(kv_forward_name_); + break; + case EvalMode::kHybrid: + prefill_forward_name_ = "prefill_forward"; + kv_forward_name_ = "kv_forward"; + method_names_.emplace_back(prefill_forward_name_); + method_names_.emplace_back(kv_forward_name_); + break; + case EvalMode::kUnsupported: + ET_CHECK_MSG(false, "Unsupported llama version"); + break; + } + io_mem_ = std::make_unique( - modules_, max_seq_len_, vocab_size_, num_layers, head_dim, num_heads); + modules_, + max_seq_len_, + vocab_size_, + num_layers, + head_dim, + num_heads, + eval_mode_, + prefill_forward_name_, + kv_forward_name_); ET_LOG(Info, "creating io_memory"); } @@ -94,7 +136,12 @@ Error Runner::load() { return Error::Ok; } for (std::shared_ptr& module : modules_) { - 
ET_CHECK_OK_OR_RETURN_ERROR(module->load_method("forward")); + if (!prefill_forward_name_.empty()) { + ET_CHECK_OK_OR_RETURN_ERROR(module->load_method(prefill_forward_name_)); + } + if (!kv_forward_name_.empty()) { + ET_CHECK_OK_OR_RETURN_ERROR(module->load_method(kv_forward_name_)); + } } // create sampler @@ -105,12 +152,25 @@ Error Runner::load() { static_cast(std::time(nullptr))); // prepare io - auto methods_meta = get_methods_meta(); - if (eval_mode_ == EvalMode::kBatchPrefill) { - io_mem_->prepare_prefill_io(methods_meta); - } else { - io_mem_->prepare_kv_io(methods_meta); + switch (eval_mode_) { + case EvalMode::kPrefill: + io_mem_->init_io(get_methods_meta(prefill_forward_name_), eval_mode_); + io_mem_->prepare_prefill_io(get_methods_meta(prefill_forward_name_)); + break; + case EvalMode::kKVCached: + io_mem_->init_io(get_methods_meta(kv_forward_name_), eval_mode_); + io_mem_->prepare_kv_io(get_methods_meta(kv_forward_name_)); + break; + case EvalMode::kHybrid: + io_mem_->init_io(get_methods_meta(kv_forward_name_), eval_mode_); + io_mem_->prepare_prefill_io(get_methods_meta(prefill_forward_name_)); + io_mem_->prepare_kv_io(get_methods_meta(kv_forward_name_)); + break; + case EvalMode::kUnsupported: + ET_CHECK_MSG(false, "unsupported mode"); + break; } + return Error::Ok; } @@ -145,65 +205,59 @@ int32_t Runner::logitsToToken(const Tensor& logits_tensor) { return sampler_->sample(logits_last); } -void Runner::run_model_step(std::vector>& inputs) { +void Runner::run_model_step( + const std::string& method_name, + std::vector>& inputs) { for (size_t i = 0, num_modules = modules_.size(); i < num_modules; ++i) { - Result> outputs_res = modules_[i]->forward(inputs[i]); + Result> outputs_res = + modules_[i]->execute(method_name, inputs[i]); ET_CHECK_MSG( outputs_res.error() == Error::Ok, "shard %zu inference failed", i); } } Error Runner::generate( - const std::string& prompt, - const std::string& system_prompt, int32_t seq_len, std::function token_callback, std::function stats_callback) { - ET_CHECK_MSG(!prompt.empty(), "prompt cannot be null"); - - std::vector> input_tensors, output_tensors; - std::vector> inputs; + std::unordered_map>> + input_tensors, output_tensors; + std::unordered_map>> inputs; if (!is_loaded()) { stats_.model_load_start_ms = time_in_ms(); ET_CHECK_OK_OR_RETURN_ERROR(load()); - for (int i = 0; i < modules_.size(); ++i) { - input_tensors.emplace_back(io_mem_->get_input_tensors(i)); - output_tensors.emplace_back(io_mem_->get_output_tensors(i)); - for (size_t j = 0; j < output_tensors[i].size(); ++j) { - ET_CHECK_MSG( - modules_[i]->set_output(output_tensors[i][j], j) == Error::Ok, - "failed to set output tensor for module %d's %zu'th output", - i, - j); + for (auto method_name : method_names_) { + for (int i = 0; i < modules_.size(); ++i) { + input_tensors[method_name].emplace_back( + io_mem_->get_input_tensors(i, method_name)); + output_tensors[method_name].emplace_back( + io_mem_->get_output_tensors(i, method_name)); + for (size_t j = 0; j < output_tensors[method_name][i].size(); ++j) { + ET_CHECK_MSG( + modules_[i]->set_output( + method_name, output_tensors[method_name][i][j], j) == + Error::Ok, + "failed to set output tensor for module %d's %zu'th output", + i, + j); + } + inputs[method_name].emplace_back(std::vector( + begin(input_tensors[method_name][i]), + end(input_tensors[method_name][i]))); } - inputs.emplace_back( - std::vector(begin(input_tensors[i]), end(input_tensors[i]))); } - stats_.model_load_end_ms = time_in_ms(); } - std::string 
post_process_prompt; - - if (!system_prompt.empty()) { - post_process_prompt.append( - "<|start_header_id|>system<|end_header_id|>\n\n"); - post_process_prompt.append(system_prompt); - post_process_prompt.append("<|eot_id|>\n"); - } - post_process_prompt.append("<|start_header_id|>user<|end_header_id|>\n\n"); - post_process_prompt.append(prompt); - post_process_prompt.append( - "<|eot_id|><|start_header_id|>assistant<|end_header_id|>"); - token_callback("<|begin_of_text|>"); - + stats_.model_load_end_ms = time_in_ms(); stats_.inference_start_ms = time_in_ms(); + if (token_callback) { + token_callback("<|begin_of_text|>"); + } seq_len = (seq_len > 0 && seq_len <= max_seq_len_) ? seq_len : max_seq_len_; Result> encode_res = - tokenizer_->encode(post_process_prompt, n_bos_, 0); + tokenizer_->encode(prompt_, n_bos_, 0); ET_CHECK_OK_OR_RETURN_ERROR( - encode_res.error(), - "failed to encode prompt %s", - post_process_prompt.c_str()); + encode_res.error(), "failed to encode prompt %s", prompt_.c_str()); std::vector prompt_tokens = encode_res.get(); int num_prompt_tokens = prompt_tokens.size(); @@ -211,58 +265,74 @@ Error Runner::generate( ET_CHECK_MSG( num_prompt_tokens < seq_len, "sequence length exceeded - please increase the seq_len value"); + if (eval_mode_ == EvalMode::kHybrid) { + int prefill_seq_len = get_methods_meta(prefill_forward_name_)[0] + ->input_tensor_meta(0) + ->sizes()[1] + + 1; + ET_CHECK_MSG( + num_prompt_tokens < prefill_seq_len, + "For hybrid mode, please ensure prompt length(%d) is less than prefill's seq_len(%d)", + num_prompt_tokens, + prefill_seq_len); + } int64_t pos = 0, prev_token, cur_token = prompt_tokens[0]; HybridMemory::IO* ptr = static_cast(io_mem_->get_mutable_ptr()); - if (eval_mode_ == EvalMode::kBatchPrefill) { + auto prefill_execute = [&](const std::string& method_name) { for (int i = 0; i < num_prompt_tokens; i++) { ptr->prefill_input_toks[i] = static_cast(prompt_tokens[i]); auto piece_res = tokenizer_->decode(prompt_tokens[i], prompt_tokens[i]); token_callback(piece_res.get()); } // inference - run_model_step(inputs); - Tensor& logits_tensor = output_tensors.back()[0]; + run_model_step(method_name, inputs[method_name]); + Tensor& logits_tensor = output_tensors[method_name].back()[0]; // offset to the meaningful logit we want. 
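+    // The prefill logits buffer holds cache_len * vocab_size_ entries (one
+    // row per position), so the row for the last prompt token starts at
+    // (num_prompt_tokens - 1) * vocab_size_.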
float* logits = logits_tensor.mutable_data_ptr() + (num_prompt_tokens - 1) * vocab_size_; prev_token = prompt_tokens[num_prompt_tokens - 1]; + long sample_start_time_ms = time_in_ms(); cur_token = sampler_->sample(logits); + stats_.aggregate_sampling_time_ms += time_in_ms() - sample_start_time_ms; stats_.first_token_ms = time_in_ms(); stats_.prompt_eval_end_ms = time_in_ms(); - long sample_start_time_ms = time_in_ms(); - stats_.aggregate_sampling_time_ms += time_in_ms() - sample_start_time_ms; auto piece_res = tokenizer_->decode(prev_token, cur_token); ET_CHECK(piece_res.ok()); if (token_callback) { token_callback(piece_res.get().c_str()); } pos += num_prompt_tokens; - } else { + }; + + auto kv_execute = [&](const std::string& method_name) { ptr->input_tok = static_cast(cur_token); - ptr->attention_mask[max_seq_len_ - 1] = 0; + ptr->kv_attention_mask[max_seq_len_ - 1] = 0; while (pos < seq_len - 1) { // inference - run_model_step(inputs); - Tensor& logits_tensor = output_tensors.back()[0]; - - if (pos == num_prompt_tokens) { - stats_.first_token_ms = time_in_ms(); - } else if (pos == num_prompt_tokens - 1) { - stats_.prompt_eval_end_ms = time_in_ms(); + run_model_step(method_name, inputs[method_name]); + Tensor& logits_tensor = output_tensors[method_name].back()[0]; + + // hybrid mode will check these stats_ at prefill(prefill) + if (eval_mode_ == EvalMode::kKVCached) { + if (pos == num_prompt_tokens) { + stats_.first_token_ms = time_in_ms(); + } else if (pos == num_prompt_tokens - 1) { + stats_.prompt_eval_end_ms = time_in_ms(); + } } - long sample_start_time_ms = time_in_ms(); prev_token = cur_token; + long sample_start_time_ms = time_in_ms(); cur_token = logitsToToken(logits_tensor); stats_.aggregate_sampling_time_ms += time_in_ms() - sample_start_time_ms; if (pos < num_prompt_tokens - 1) { cur_token = prompt_tokens[pos + 1]; } - io_mem_->update_io(cur_token, ++pos, output_tensors); + io_mem_->update_kv_io(cur_token, ++pos, output_tensors[method_name]); auto piece_res = tokenizer_->decode(prev_token, cur_token); ET_CHECK(piece_res.ok()); @@ -275,8 +345,25 @@ Error Runner::generate( break; } } + }; + + switch (eval_mode_) { + case EvalMode::kPrefill: + prefill_execute(prefill_forward_name_); + break; + case EvalMode::kKVCached: + kv_execute(kv_forward_name_); + break; + case EvalMode::kHybrid: + prefill_execute(prefill_forward_name_); + io_mem_->update_prefill_to_kv_io( + cur_token, pos, output_tensors[kv_forward_name_]); + kv_execute(kv_forward_name_); + break; + default: + ET_CHECK_MSG(false, "Unsupported eval mode"); + break; } - stats_.inference_end_ms = time_in_ms(); if (pos == seq_len) { ET_LOG(Info, "\nSequence length (%i tokens) reached!", seq_len); @@ -348,7 +435,7 @@ void printReport(const Runner::Stats& stats) { ET_LOG( Info, "\tSampling time over %" PRIu64 " tokens:\t%f (seconds)", - stats.num_prompt_tokens + stats.num_generated_tokens, + stats.num_generated_tokens, (double)stats.aggregate_sampling_time_ms / stats.SCALING_FACTOR_UNITS_PER_SECOND); } @@ -370,11 +457,12 @@ std::string statsToJsonString(const Runner::Stats& stats) { } } // namespace -std::vector> Runner::get_methods_meta() { +std::vector> Runner::get_methods_meta( + std::string& method_name) { std::vector> methods_meta; methods_meta.reserve(modules_.size()); for (std::shared_ptr& module : modules_) { - methods_meta.emplace_back(module->method_meta("forward")); + methods_meta.emplace_back(module->method_meta(method_name)); } return methods_meta; } diff --git 
a/examples/qualcomm/oss_scripts/llama3_2/runner/runner.h b/examples/qualcomm/oss_scripts/llama3_2/runner/runner.h index b720697be5f..3f0248872d5 100644 --- a/examples/qualcomm/oss_scripts/llama3_2/runner/runner.h +++ b/examples/qualcomm/oss_scripts/llama3_2/runner/runner.h @@ -29,6 +29,8 @@ class Runner { explicit Runner( const std::vector& models_path, const std::string& tokenizer_path, + const std::string& prompt, + const std::string& system_prompt, const float temperature, const int eval_mode); @@ -61,27 +63,23 @@ class Runner { bool is_loaded() const; executorch::runtime::Error load(); executorch::runtime::Error generate( - const std::string& prompt, - const std::string& system_prompt, int32_t seq_len, std::function token_callback = {}, std::function stats_callback = {}); void stop(); std::vector> - get_methods_meta(); + get_methods_meta(std::string& method_name); private: - enum EvalMode { - kBatchPrefill = 0, - kKVCached, - kUnsupported, - }; template T getMetadataHelper(std::string method_name, T default_val); template int32_t logitsToToken(const executorch::aten::Tensor& logits_tensor); void run_model_step( + const std::string& method_name, std::vector>& inputs); + std::string prompt_; + // metadata int32_t max_seq_len_; int32_t vocab_size_; @@ -96,7 +94,10 @@ class Runner { std::unique_ptr sampler_; Stats stats_; std::unique_ptr io_mem_; - int32_t eval_mode_; + EvalMode eval_mode_; + std::string prefill_forward_name_; + std::string kv_forward_name_; + std::vector method_names_; }; } // namespace example
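For reference, a minimal sketch of how the reworked Runner interface above is
driven, mirroring qnn_llama3_2_runner.cpp. The include path, file paths, prompt
text, temperature, and seq_len below are illustrative placeholders and are not
part of this patch:

#include "runner.h" // runner.h from examples/qualcomm/oss_scripts/llama3_2/runner

#include <fstream>
#include <string>
#include <vector>

int main() {
  // eval_mode: 0 = prefill, 1 = kv, 2 = hybrid (prefill + kv), matching the
  // --eval_mode flag documented above.
  example::Runner runner(
      {"/path/to/hybrid_llama3_2_qnn.pte"}, // placeholder model path
      "/path/to/tokenizer.model",           // placeholder tokenizer path
      "What is the capital of France?",     // prompt (now passed at construction)
      "",                                   // optional system prompt
      /*temperature=*/0.8f,
      /*eval_mode=*/2);

  // generate() no longer takes the prompt; it only needs seq_len and callbacks.
  std::string out;
  runner.generate(
      /*seq_len=*/128, [&](const std::string& piece) { out += piece; });

  std::ofstream fout("outputs.txt");
  fout << out;
  return 0;
}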