From 645fb63ba298d93727a61cefc838e2bf9a2ee1fd Mon Sep 17 00:00:00 2001 From: winskuo-quic Date: Wed, 4 Dec 2024 09:14:23 +0800 Subject: [PATCH] Qualcomm AI Engine Direct - Support Hybrid Mode for Llama3.2 --- backends/qualcomm/aot/ir/qcir.fbs | 9 +- backends/qualcomm/aot/ir/qcir_utils.cpp | 14 +- backends/qualcomm/aot/ir/qcir_utils.h | 3 +- .../qualcomm/aot/python/PyQnnManagerAdaptor.h | 189 ++++++++--- .../aot/python/PyQnnWrapperAdaptor.cpp | 14 +- .../qualcomm/runtime/QnnExecuTorchBackend.cpp | 16 + backends/qualcomm/runtime/QnnManager.cpp | 49 ++- backends/qualcomm/runtime/QnnManager.h | 1 + .../runtime/backends/QnnBackendCache.cpp | 13 +- .../runtime/backends/QnnBackendCache.h | 5 +- .../runtime/backends/QnnContextCommon.cpp | 19 +- .../runtime/backends/QnnContextCommon.h | 3 +- .../qualcomm/runtime/backends/QnnLogger.cpp | 1 - .../qualcomm/serialization/qc_binary_info.fbs | 4 +- backends/qualcomm/tests/test_qnn_delegate.py | 11 +- backends/qualcomm/utils/utils.py | 121 ++++++- .../qualcomm/oss_scripts/llama3_2/README.md | 39 +++ .../qualcomm/oss_scripts/llama3_2/llama.py | 258 ++++++++++----- .../llama3_2/qnn_llama3_2_runner.cpp | 22 +- .../oss_scripts/llama3_2/runner/io_memory.cpp | 305 +++++++++++++----- .../oss_scripts/llama3_2/runner/io_memory.h | 81 ++++- .../oss_scripts/llama3_2/runner/runner.cpp | 220 +++++++++---- .../oss_scripts/llama3_2/runner/runner.h | 19 +- 23 files changed, 1022 insertions(+), 394 deletions(-) create mode 100644 examples/qualcomm/oss_scripts/llama3_2/README.md diff --git a/backends/qualcomm/aot/ir/qcir.fbs b/backends/qualcomm/aot/ir/qcir.fbs index 6c16a54e0db..dfd9bbc91e1 100755 --- a/backends/qualcomm/aot/ir/qcir.fbs +++ b/backends/qualcomm/aot/ir/qcir.fbs @@ -80,7 +80,8 @@ table Tensor { type: TensorType; dtype: DataType; qparam: QuantizeParam; - data: [ubyte]; + size: uint; + offset: ulong; } table Operator { @@ -88,9 +89,9 @@ table Operator { package_name: string; type_name: string; // keep only tensor indexes - inputs: [int]; - outputs: [int]; - params: [int]; + inputs: [uint]; + outputs: [uint]; + params: [uint]; } table Graph { diff --git a/backends/qualcomm/aot/ir/qcir_utils.cpp b/backends/qualcomm/aot/ir/qcir_utils.cpp index 8cf024ba006..48f069767bf 100755 --- a/backends/qualcomm/aot/ir/qcir_utils.cpp +++ b/backends/qualcomm/aot/ir/qcir_utils.cpp @@ -235,11 +235,8 @@ Qnn_QuantizeParams_t ToQuantizeParam(const tensor_type& tensor) { flatbuffers::Offset ToTensor( const Qnn_Tensor_t& tensor, + const uint64_t data_offset, flatbuffers::FlatBufferBuilder* builder) { - std::vector buffer( - static_cast(QNN_VER_PTR(tensor)->clientBuf.data), - static_cast(QNN_VER_PTR(tensor)->clientBuf.data) + - QNN_VER_PTR(tensor)->clientBuf.dataSize); std::vector shape( QNN_VER_PTR(tensor)->dimensions, QNN_VER_PTR(tensor)->dimensions + QNN_VER_PTR(tensor)->rank); @@ -251,10 +248,11 @@ flatbuffers::Offset ToTensor( ToTensorType(QNN_VER_PTR(tensor)->type), ToDataType(QNN_VER_PTR(tensor)->dataType), ToQuantizeParam(tensor, builder), - &buffer); + QNN_VER_PTR(tensor)->clientBuf.dataSize, + data_offset); } -Qnn_Tensor_t ToTensor(const tensor_type& tensor) { +Qnn_Tensor_t ToTensor(const tensor_type& tensor, const uint8_t* data_ptr) { auto is_io_tensor = [](Qnn_TensorType_t type) { return type < QNN_TENSOR_TYPE_STATIC; }; @@ -266,10 +264,10 @@ Qnn_Tensor_t ToTensor(const tensor_type& tensor) { QNN_VER_PTR(t)->quantizeParams = ToQuantizeParam(tensor); QNN_VER_PTR(t)->rank = tensor->shape()->size(); QNN_VER_PTR(t)->dimensions = const_cast(tensor->shape()->data()); - 
QNN_VER_PTR(t)->clientBuf.dataSize = tensor->data()->size(); + QNN_VER_PTR(t)->clientBuf.dataSize = tensor->size(); QNN_VER_PTR(t)->clientBuf.data = is_io_tensor(QNN_VER_PTR(t)->type) ? nullptr - : static_cast(const_cast(tensor->data()->Data())); + : static_cast(const_cast(data_ptr)); return t; } diff --git a/backends/qualcomm/aot/ir/qcir_utils.h b/backends/qualcomm/aot/ir/qcir_utils.h index 5d54eb30a69..085f09bf145 100755 --- a/backends/qualcomm/aot/ir/qcir_utils.h +++ b/backends/qualcomm/aot/ir/qcir_utils.h @@ -32,8 +32,9 @@ Qnn_QuantizeParams_t ToQuantizeParam(const tensor_type& tensor); flatbuffers::Offset ToTensor( const Qnn_Tensor_t& tensor, + const uint64_t data_offset, flatbuffers::FlatBufferBuilder* builder); -Qnn_Tensor_t ToTensor(const tensor_type& tensor); +Qnn_Tensor_t ToTensor(const tensor_type& tensor, const uint8_t* data_ptr); } // namespace qnn } // namespace backends diff --git a/backends/qualcomm/aot/python/PyQnnManagerAdaptor.h b/backends/qualcomm/aot/python/PyQnnManagerAdaptor.h index 55429f2b430..bbe52bf74bf 100644 --- a/backends/qualcomm/aot/python/PyQnnManagerAdaptor.h +++ b/backends/qualcomm/aot/python/PyQnnManagerAdaptor.h @@ -57,47 +57,67 @@ class PyQnnManager { qnn_executorch_option_ptr_.cast().data()); // merge multiple qcirs into one context with multiple graphs - std::vector> graphs; + + // this makes it easier to do subtraction for offsets + std::vector offsets(1, 0); + std::vector*> tensor_data; + fb_opt_.max_size = FLATBUFFERS_MAX_64_BUFFER_SIZE; for (size_t i = 0; i < qcirs.size(); ++i) { py::buffer_info info(py::buffer(qcirs[i].cast()).request()); flatbuffers::Verifier verifier_binary_info( static_cast(info.ptr), - info.size * info.itemsize); + info.size * info.itemsize, + fb_opt_); if (!qnn_delegate::VerifyBinaryInfoBuffer(verifier_binary_info)) { QNN_EXECUTORCH_LOG_ERROR("Fail to verify binary info"); return; } auto binary_info = qnn_delegate::GetBinaryInfo(info.ptr); + tensor_data.push_back(binary_info->tensor_data()); flatbuffers::Verifier verifier_qcir( - binary_info->data()->data(), binary_info->data()->size()); + binary_info->context_data()->Data(), + binary_info->context_data()->size()); if (!qcir::VerifyContextBuffer(verifier_qcir)) { QNN_EXECUTORCH_LOG_ERROR("Fail to verify qcir format"); return; } - auto context = qcir::GetContext(binary_info->data()->data()); + offsets.push_back(offsets.back() + binary_info->tensor_data()->size()); + } + + std::vector> graphs; + for (size_t i = 0; i < qcirs.size(); ++i) { + py::buffer_info info(py::buffer(qcirs[i].cast()).request()); + auto binary_info = qnn_delegate::GetBinaryInfo(info.ptr); + auto context = qcir::GetContext(binary_info->context_data()->Data()); for (const auto& graph : *context->graphs()) { std::vector> tensors; for (const auto tensor : *graph->tensors()) { // here we need to take a detour to merge multiple qcir flatbuffers // outer ToTensor // return: flatbuffers::Offset - // consume: QnnTensor, flatbuffers::FlatBufferBuilder* + // consume: QnnTensor, data_offset, flatbuffers::FlatBufferBuilder* // inner ToTensor // return: QnnTensor - // consume: flatbuffers::Vector<::flatbuffers::Offset> - tensors.emplace_back(ToTensor(ToTensor(tensor), &builder_)); + // consume: + // flatbuffers::Vector<::flatbuffers::Offset>, + // data_ptr + tensors.emplace_back(ToTensor( + ToTensor(tensor, nullptr), + offsets[i] + tensor->offset(), + &builder_)); } std::vector> nodes; for (const auto& node : *graph->nodes()) { - int32_t* inputs_ptr = const_cast(node->inputs()->data()); - int32_t* outputs_ptr = 
const_cast(node->outputs()->data()); - int32_t* params_ptr = const_cast(node->params()->data()); - std::vector inputs( + uint32_t* inputs_ptr = const_cast(node->inputs()->data()); + uint32_t* outputs_ptr = + const_cast(node->outputs()->data()); + uint32_t* params_ptr = const_cast(node->params()->data()); + std::vector inputs( inputs_ptr, inputs_ptr + node->inputs()->size()); - std::vector outputs( + std::vector outputs( outputs_ptr, outputs_ptr + node->outputs()->size()); - std::vector params( + std::vector params( params_ptr, params_ptr + node->params()->size()); nodes.emplace_back(qcir::CreateOperatorDirect( builder_, @@ -118,7 +138,7 @@ class PyQnnManager { QnnExecuTorchContextBinary qcir_bin( {builder_.GetBufferPointer(), builder_.GetSize()}); - qnn_executorch_context_binary_ = MakeBinaryInfo(qcir_bin); + qnn_executorch_context_binary_ = MakeBinaryInfo(qcir_bin, tensor_data); qnn_manager_ = std::make_shared( qnn_executorch_options, qnn_executorch_context_binary_); } @@ -157,26 +177,37 @@ class PyQnnManager { if (qnn_manager_->IsOnlinePrepare() || qnn_manager_->IsMultipleGraphs()) { builder_.Reset(); - std::vector> tensors; + std::vector tensor_data; + std::vector offsets; std::unordered_map tensor_map; + std::vector> fb_tensors; + std::vector> fb_ops; auto set_tensor = [&](const std::shared_ptr& wrapper, - std::vector& index) { + std::vector& index) { auto it = tensor_map.find(wrapper.get()); if (it != tensor_map.end()) { index.push_back(it->second); } else { - int i = tensors.size(); - tensor_map[wrapper.get()] = i; - index.push_back(i); - tensors.emplace_back( - ToTensor(wrapper->CloneTensorStruct(), &builder_)); + tensor_map[wrapper.get()] = fb_tensors.size(); + index.push_back(fb_tensors.size()); + offsets.push_back(tensor_data.size()); + Qnn_Tensor_t qnn_tensor = wrapper->CloneTensorStruct(); + fb_tensors.emplace_back( + ToTensor(qnn_tensor, offsets.back(), &builder_)); + uint8_t* data_ptr = + static_cast(QNN_VER_PTR(qnn_tensor)->clientBuf.data); + if (data_ptr != nullptr) { + tensor_data.insert( + tensor_data.end(), + data_ptr, + data_ptr + QNN_VER_PTR(qnn_tensor)->clientBuf.dataSize); + } } }; - std::vector> operators; for (std::shared_ptr& op_wrapper : op_wrappers) { - std::vector inputs, outputs, params; + std::vector inputs, outputs, params; for (const auto& tensor_wrapper : op_wrapper->GetInputTensors()) { set_tensor(tensor_wrapper, inputs); @@ -207,13 +238,22 @@ class PyQnnManager { static_cast(&p.scalarParam.uint8Value); QNN_VER_PTR(t)->clientBuf.dataSize = GetDataTypeSize(QNN_VER_PTR(t)->dataType); - params.push_back(tensors.size()); - tensors.emplace_back(ToTensor(t, &builder_)); + + // collect tensor data + offsets.push_back(tensor_data.size()); + const uint8_t* data_ptr = + static_cast(QNN_VER_PTR(t)->clientBuf.data); + tensor_data.insert( + tensor_data.end(), + data_ptr, + data_ptr + QNN_VER_PTR(t)->clientBuf.dataSize); + params.push_back(fb_tensors.size()); + fb_tensors.emplace_back(ToTensor(t, offsets.back(), &builder_)); } } Qnn_OpConfig_t op_config = op_wrapper->GetOpConfig(); - operators.emplace_back(qcir::CreateOperatorDirect( + fb_ops.emplace_back(qcir::CreateOperatorDirect( builder_, QNN_VER_PTR(op_config)->name, QNN_VER_PTR(op_config)->packageName, @@ -222,14 +262,16 @@ class PyQnnManager { &outputs, ¶ms)); } - auto graph = qcir::CreateGraphDirect( - builder_, graph_name.c_str(), &operators, &tensors); - std::vector> graphs({graph}); - auto context = qcir::CreateContextDirect(builder_, &graphs); + + std::vector> fb_graphs( + {qcir::CreateGraphDirect( + 
builder_, graph_name.c_str(), &fb_ops, &fb_tensors)}); + auto context = qcir::CreateContextDirect(builder_, &fb_graphs); builder_.Finish(context); + QnnExecuTorchContextBinary qcir_binary( {builder_.GetBufferPointer(), builder_.GetSize()}); - binary_info = MakeBinaryInfo(qcir_binary); + binary_info = MakeBinaryInfo(qcir_binary, tensor_data); } else { if (qnn_manager_->Compile(graph_name, op_wrappers) != executorch::runtime::Error::Ok) { @@ -300,7 +342,8 @@ class PyQnnManager { py::buffer_info info(py::buffer(ctx_bin).request()); QnnExecuTorchContextBinary binary( {info.ptr, static_cast(info.size * info.itemsize)}); - auto binary_info = MakeBinaryInfo(binary); + std::vector tensor_data; + auto binary_info = MakeBinaryInfo(binary, tensor_data); auto result = py::array_t(binary_info.nbytes); auto result_buffer = result.request(); std::memcpy(result_buffer.ptr, binary_info.buffer, binary_info.nbytes); @@ -308,22 +351,78 @@ class PyQnnManager { } private: + std::string signature() { + return std::to_string( + std::chrono::high_resolution_clock::now().time_since_epoch().count()); + }; + QnnExecuTorchContextBinary MakeBinaryInfo( - const QnnExecuTorchContextBinary& ctx_bin) { - auto signature = []() { - return std::to_string( - std::chrono::high_resolution_clock::now().time_since_epoch().count()); - }; - const uint8_t* base = static_cast(ctx_bin.buffer); - std::vector data(base, base + ctx_bin.nbytes); + const QnnExecuTorchContextBinary& ctx_bin, + const std::vector*>& tensor_data) { + // the build order matters, 64 bit data is required to be shipped first + // add context data + builder64_.Reset(); + auto offset_context = builder64_.CreateVector< + uint8_t, + flatbuffers::Offset64, + flatbuffers::Vector64>( + static_cast(ctx_bin.buffer), ctx_bin.nbytes); + // add tensor data + // this is a little bit tricky but have smallest memory footprint in AoT + size_t buffer_size = 0; + for (auto& td : tensor_data) { + buffer_size += td->size(); + } + builder64_.StartVector< + uint8_t, + flatbuffers::Offset64, + flatbuffers::Vector64::size_type>(buffer_size); + for (int i = tensor_data.size() - 1; i >= 0; --i) { + builder64_.PushBytes(tensor_data[i]->Data(), tensor_data[i]->size()); + } + auto offset_tensor = flatbuffers::Offset64>( + builder64_.EndVector< + flatbuffers::Vector64::size_type, + flatbuffers::Offset64>::offset_type>( + buffer_size)); // add signature to binary for cache reuse in runtime - builder_.Reset(); - auto binary_info = qnn_delegate::CreateBinaryInfoDirect( - builder_, signature().c_str(), &data); - builder_.Finish(binary_info); + auto offset_signature = builder64_.CreateString(signature().c_str()); + // build binary info + auto binary_info = qnn_delegate::CreateBinaryInfo( + builder64_, offset_signature, offset_context, offset_tensor); + builder64_.Finish(binary_info); return QnnExecuTorchContextBinary( - {builder_.GetBufferPointer(), builder_.GetSize()}); + {builder64_.GetBufferPointer(), builder64_.GetSize()}); + } + + QnnExecuTorchContextBinary MakeBinaryInfo( + const QnnExecuTorchContextBinary& ctx_bin, + const std::vector& tensor_data) { + // the build order matters, 64 bit data is required to be shipped first + // add context data + builder64_.Reset(); + + auto offset_context = builder64_.CreateVector< + uint8_t, + flatbuffers::Offset64, + flatbuffers::Vector64>( + static_cast(ctx_bin.buffer), ctx_bin.nbytes); + // add tensor data + auto offset_tensor = builder64_.CreateVector< + uint8_t, + flatbuffers::Offset64, + flatbuffers::Vector64>( + static_cast(tensor_data.data()), 
tensor_data.size()); + // add signature to binary for cache reuse in runtime + auto offset_signature = builder64_.CreateString(signature().c_str()); + // build binary info + auto binary_info = qnn_delegate::CreateBinaryInfo( + builder64_, offset_signature, offset_context, offset_tensor); + builder64_.Finish(binary_info); + + return QnnExecuTorchContextBinary( + {builder64_.GetBufferPointer(), builder64_.GetSize()}); } // Store the bytes object instead of a raw pointer so that this module will @@ -331,7 +430,9 @@ class PyQnnManager { const py::bytes qnn_executorch_option_ptr_; QnnExecuTorchContextBinary qnn_executorch_context_binary_; std::shared_ptr qnn_manager_; + flatbuffers::FlatBufferBuilder64 builder64_; flatbuffers::FlatBufferBuilder builder_; + flatbuffers::Verifier::Options fb_opt_; }; } // namespace qnn } // namespace backends diff --git a/backends/qualcomm/aot/python/PyQnnWrapperAdaptor.cpp b/backends/qualcomm/aot/python/PyQnnWrapperAdaptor.cpp index 97d4491bc6a..2b4b88967e5 100644 --- a/backends/qualcomm/aot/python/PyQnnWrapperAdaptor.cpp +++ b/backends/qualcomm/aot/python/PyQnnWrapperAdaptor.cpp @@ -79,18 +79,6 @@ std::shared_ptr CreateTensorWrapper( std::unique_ptr quantize_param_wrapper = CreateQuantizationParamWrapper(encoding, quant_info); - if (data.size() == 0) { - return CreateTensorWrapper( - tensor_name, - tensor_type, - data_type, - std::move(quantize_param_wrapper), - rank, - dims.data(), - 0, - nullptr, - copy_data); - } return CreateTensorWrapper( tensor_name, tensor_type, @@ -99,7 +87,7 @@ std::shared_ptr CreateTensorWrapper( rank, dims.data(), 0, - data.data(), + data.size() == 0 ? nullptr : data.data(), copy_data); } diff --git a/backends/qualcomm/runtime/QnnExecuTorchBackend.cpp b/backends/qualcomm/runtime/QnnExecuTorchBackend.cpp index 60208afeec5..eb8f78a883a 100644 --- a/backends/qualcomm/runtime/QnnExecuTorchBackend.cpp +++ b/backends/qualcomm/runtime/QnnExecuTorchBackend.cpp @@ -115,6 +115,13 @@ Error QnnExecuTorchBackend::execute( input_tensor_structs.reserve(input_tensors.size()); for (int i = 0; i < input_tensors.size(); ++i) { + // TODO: Enable this in future to avoid unmatch tensor size, e.g., QuantIO + // pass causing mismatch + // ET_CHECK_MSG( + // input_tensors[i]->GetBytes() == args[i]->toTensor().nbytes(), + // "Input index %d, number of bytes does not match between args and + // input_tensor, %d != %zu", i, input_tensors[i]->GetBytes(), + // args[i]->toTensor().nbytes()); if (qnn_manager->RegisterMem( args[i]->toTensor().mutable_data_ptr(), input_tensors[i]) != Error::Ok) { @@ -129,6 +136,15 @@ Error QnnExecuTorchBackend::execute( for (const auto& output_tensor : output_tensors) { // pos=0 limits the search to the prefix if (output_tensor->GetName().rfind("output_", 0) == 0) { + // TODO: Enable this in future to avoid unmatch tensor size, e.g., QuantIO + // pass causing mismatch + // ET_CHECK_MSG( + // output_tensor->GetBytes() == + // args[output_index]->toTensor().nbytes(), "Output index %d, number + // of bytes does not match between args and output_tensor, %d != %zu", + // output_index, + // output_tensor->GetBytes(), + // args[output_index]->toTensor().nbytes()); void* mutable_data_ptr = args[output_index]->toTensor().mutable_data_ptr(); if (qnn_manager->RegisterMem(mutable_data_ptr, output_tensor) != diff --git a/backends/qualcomm/runtime/QnnManager.cpp b/backends/qualcomm/runtime/QnnManager.cpp index a4d83585f28..fe7050e7b13 100644 --- a/backends/qualcomm/runtime/QnnManager.cpp +++ b/backends/qualcomm/runtime/QnnManager.cpp @@ -54,6 
+54,7 @@ QnnManager::QnnManager( QnnExecuTorchBackendType backend_type = options->backend_options()->backend_type(); std::string library_path = options->library_path()->str(); + fb_opt_.max_size = FLATBUFFERS_MAX_64_BUFFER_SIZE; if (options->log_level() >= QnnExecuTorchLogLevel::kLogLevelInfo) { QNN_EXECUTORCH_LOG_INFO( @@ -490,7 +491,8 @@ Error QnnManager::GetContextBinary( Error QnnManager::CompileQcir() { flatbuffers::Verifier verifier_binary_info( static_cast(qnn_context_blob_.buffer), - qnn_context_blob_.nbytes); + qnn_context_blob_.nbytes, + fb_opt_); if (!qnn_delegate::VerifyBinaryInfoBuffer(verifier_binary_info)) { QNN_EXECUTORCH_LOG_ERROR("Fail to verify binary info"); return Error::Internal; @@ -498,19 +500,22 @@ Error QnnManager::CompileQcir() { auto binary_info = qnn_delegate::GetBinaryInfo(qnn_context_blob_.buffer); flatbuffers::Verifier verifier_qcir( - binary_info->data()->data(), binary_info->data()->size()); + binary_info->context_data()->Data(), + binary_info->context_data()->size(), + fb_opt_); if (!qcir::VerifyContextBuffer(verifier_qcir)) { QNN_EXECUTORCH_LOG_ERROR("Fail to verify qcir format"); return Error::Internal; } - auto context = qcir::GetContext(binary_info->data()->data()); + auto context = qcir::GetContext(binary_info->context_data()->Data()); for (const auto& graph : *context->graphs()) { // qcir tensors to TensorWrapper std::vector> graph_inputs, graph_outputs, tensors; for (const auto& tensor : *graph->tensors()) { - tensors.emplace_back(CreateTensorWrapper(ToTensor(tensor))); + tensors.emplace_back(CreateTensorWrapper(ToTensor( + tensor, binary_info->tensor_data()->Data() + tensor->offset()))); if (tensor->type() == qcir::TensorType::WRITE) { graph_inputs.push_back(tensors.back()); } else if (tensor->type() == qcir::TensorType::READ) { @@ -544,6 +549,8 @@ Error QnnManager::CompileQcir() { const auto& tensor = graph->tensors()->Get(index); std::string name = tensor->name()->str(); Qnn_DataType_t dtype = ToDataType(tensor->dtype()); + const uint8_t* data_ptr = + binary_info->tensor_data()->Data() + tensor->offset(); if (tensor->shape()->size() != 0) { // add tensor param op->AddTensorParam( @@ -551,50 +558,39 @@ Error QnnManager::CompileQcir() { dtype, tensor->shape()->size(), tensor->shape()->data(), - tensor->data()->data()); + data_ptr); } else { // add scalar param switch (dtype) { case Qnn_DataType_t::QNN_DATATYPE_INT_32: op->AddScalarParam( - name, - dtype, - *reinterpret_cast(tensor->data()->Data())); + name, dtype, *reinterpret_cast(data_ptr)); break; case Qnn_DataType_t::QNN_DATATYPE_INT_16: op->AddScalarParam( - name, - dtype, - *reinterpret_cast(tensor->data()->Data())); + name, dtype, *reinterpret_cast(data_ptr)); break; case Qnn_DataType_t::QNN_DATATYPE_INT_8: - op->AddScalarParam( - name, dtype, static_cast(*tensor->data()->Data())); + op->AddScalarParam(name, dtype, static_cast(*data_ptr)); break; case Qnn_DataType_t::QNN_DATATYPE_UINT_32: op->AddScalarParam( - name, - dtype, - *reinterpret_cast(tensor->data()->Data())); + name, dtype, *reinterpret_cast(data_ptr)); break; case Qnn_DataType_t::QNN_DATATYPE_UINT_16: op->AddScalarParam( - name, - dtype, - *reinterpret_cast(tensor->data()->Data())); + name, dtype, *reinterpret_cast(data_ptr)); break; case Qnn_DataType_t::QNN_DATATYPE_UINT_8: - op->AddScalarParam(name, dtype, *tensor->data()->Data()); + op->AddScalarParam(name, dtype, *data_ptr); break; case Qnn_DataType_t::QNN_DATATYPE_FLOAT_32: case Qnn_DataType_t::QNN_DATATYPE_FLOAT_16: op->AddScalarParam( - name, - dtype, - 
*reinterpret_cast(tensor->data()->Data())); + name, dtype, *reinterpret_cast(data_ptr)); break; case Qnn_DataType_t::QNN_DATATYPE_BOOL_8: - op->AddScalarParam(name, dtype, *tensor->data()->Data()); + op->AddScalarParam(name, dtype, *data_ptr); break; default: QNN_EXECUTORCH_LOG_ERROR( @@ -603,7 +599,7 @@ Error QnnManager::CompileQcir() { } } } - op_wrappers.push_back(std::move(op)); + op_wrappers.emplace_back(std::move(op)); } ET_CHECK_OR_RETURN_ERROR( @@ -687,7 +683,8 @@ Error QnnManager::Compile( std::string QnnManager::GetBinarySignature() { flatbuffers::Verifier verifier( static_cast(qnn_context_blob_.buffer), - qnn_context_blob_.nbytes); + qnn_context_blob_.nbytes, + fb_opt_); return VerifyBinaryInfoBuffer(verifier) ? GetBinaryInfo(qnn_context_blob_.buffer)->signature()->str() : ""; diff --git a/backends/qualcomm/runtime/QnnManager.h b/backends/qualcomm/runtime/QnnManager.h index 0157ee58378..7c78418ffa7 100644 --- a/backends/qualcomm/runtime/QnnManager.h +++ b/backends/qualcomm/runtime/QnnManager.h @@ -147,6 +147,7 @@ class QnnManager { {Qnn_DataType_t::QNN_DATATYPE_UFIXED_POINT_16, executorch::aten::ScalarType::Bits16}, }; + flatbuffers::Verifier::Options fb_opt_; }; } // namespace qnn } // namespace backends diff --git a/backends/qualcomm/runtime/backends/QnnBackendCache.cpp b/backends/qualcomm/runtime/backends/QnnBackendCache.cpp index 43cb835cfff..244af7cd84e 100644 --- a/backends/qualcomm/runtime/backends/QnnBackendCache.cpp +++ b/backends/qualcomm/runtime/backends/QnnBackendCache.cpp @@ -109,7 +109,8 @@ Error QnnBackendCache::Configure() { QNN_EXECUTORCH_LOG_INFO("Caching: Caching is in RESTORE MODE."); flatbuffers::Verifier verifier_binary_info( static_cast(qnn_context_blob_.buffer), - qnn_context_blob_.nbytes); + qnn_context_blob_.nbytes, + fb_opt_); if (!qnn_delegate::VerifyBinaryInfoBuffer(verifier_binary_info)) { QNN_EXECUTORCH_LOG_ERROR("Fail to verify binary info"); return Error::Internal; @@ -117,17 +118,19 @@ Error QnnBackendCache::Configure() { auto binary_info = GetBinaryInfo(qnn_context_blob_.buffer); Error status = GetQnnGraphInfoFromBinary( - const_cast(binary_info->data()->data()), - binary_info->data()->size()); + const_cast(binary_info->context_data()->Data()), + binary_info->context_data()->size()); if (status == Error::Internal) { // check if context binary came from flatbuffer flatbuffers::Verifier verifier( - binary_info->data()->data(), binary_info->data()->size()); + binary_info->context_data()->Data(), + binary_info->context_data()->size(), + fb_opt_); if (qcir::VerifyContextBuffer(verifier)) { state_ = ONLINE_PREPARE; - auto context = qcir::GetContext(binary_info->data()->data()); + auto context = qcir::GetContext(binary_info->context_data()->Data()); for (const auto& graph : *context->graphs()) { graph_names_.emplace_back(graph->name()->str()); } diff --git a/backends/qualcomm/runtime/backends/QnnBackendCache.h b/backends/qualcomm/runtime/backends/QnnBackendCache.h index b9e00f0a662..e8ce9af88e7 100644 --- a/backends/qualcomm/runtime/backends/QnnBackendCache.h +++ b/backends/qualcomm/runtime/backends/QnnBackendCache.h @@ -28,7 +28,9 @@ class QnnBackendCache { explicit QnnBackendCache( const QnnExecuTorchContextBinary& qnn_context_blob, const std::string& aot_graph_name) - : qnn_context_blob_(qnn_context_blob), aot_graph_name_(aot_graph_name) {} + : qnn_context_blob_(qnn_context_blob), aot_graph_name_(aot_graph_name) { + fb_opt_.max_size = FLATBUFFERS_MAX_64_BUFFER_SIZE; + } virtual ~QnnBackendCache(); QnnBackendCache(const QnnBackendCache&) = delete; 
QnnBackendCache(QnnBackendCache&&) = delete; @@ -82,6 +84,7 @@ class QnnBackendCache { input_tensor_structs_; std::unordered_map> output_tensor_structs_; + flatbuffers::Verifier::Options fb_opt_; }; } // namespace qnn } // namespace backends diff --git a/backends/qualcomm/runtime/backends/QnnContextCommon.cpp b/backends/qualcomm/runtime/backends/QnnContextCommon.cpp index 7db5164a1d5..4c3fe53eece 100644 --- a/backends/qualcomm/runtime/backends/QnnContextCommon.cpp +++ b/backends/qualcomm/runtime/backends/QnnContextCommon.cpp @@ -51,8 +51,8 @@ Error QnnContext::Configure() { backend_->GetHandle(), device_->GetHandle(), temp_context_config.empty() ? nullptr : temp_context_config.data(), - const_cast(binary_info->data()->data()), - binary_info->data()->size(), + const_cast(binary_info->context_data()->Data()), + binary_info->context_data()->size(), &handle_, /*profile=*/nullptr); if (error != QNN_SUCCESS) { @@ -93,10 +93,11 @@ Error QnnContext::GetContextBinary( Qnn_ContextBinarySize_t bytes_written = 0; Qnn_ErrorHandle_t error = qnn_interface.qnn_context_get_binary_size(handle_, &binary_size); + std::vector binary_buffer; if (error == QNN_SUCCESS) { - binary_buffer_.resize(binary_size); + binary_buffer.resize(binary_size); error = qnn_interface.qnn_context_get_binary( - handle_, binary_buffer_.data(), binary_size, &bytes_written); + handle_, binary_buffer.data(), binary_size, &bytes_written); if (error != QNN_SUCCESS) { QNN_EXECUTORCH_LOG_ERROR( "Can't get graph binary to be saved to " @@ -118,12 +119,12 @@ Error QnnContext::GetContextBinary( .time_since_epoch() .count()); }; - builder_.Reset(); + builder64_.Reset(); auto binary_info = qnn_delegate::CreateBinaryInfoDirect( - builder_, signature().c_str(), &binary_buffer_); - builder_.Finish(binary_info); - qnn_executorch_context_binary.buffer = builder_.GetBufferPointer(); - qnn_executorch_context_binary.nbytes = builder_.GetSize(); + builder64_, signature().c_str(), &binary_buffer); + builder64_.Finish(binary_info); + qnn_executorch_context_binary.buffer = builder64_.GetBufferPointer(); + qnn_executorch_context_binary.nbytes = builder64_.GetSize(); } } else { QNN_EXECUTORCH_LOG_ERROR( diff --git a/backends/qualcomm/runtime/backends/QnnContextCommon.h b/backends/qualcomm/runtime/backends/QnnContextCommon.h index d93390a5379..d6823a5d4a5 100644 --- a/backends/qualcomm/runtime/backends/QnnContextCommon.h +++ b/backends/qualcomm/runtime/backends/QnnContextCommon.h @@ -71,8 +71,7 @@ class QnnContext { QnnBackend* backend_; QnnDevice* device_; QnnBackendCache* cache_; - std::vector binary_buffer_; - flatbuffers::FlatBufferBuilder builder_; + flatbuffers::FlatBufferBuilder64 builder64_; }; } // namespace qnn } // namespace backends diff --git a/backends/qualcomm/runtime/backends/QnnLogger.cpp b/backends/qualcomm/runtime/backends/QnnLogger.cpp index 412b1a2db2c..5b86894d874 100644 --- a/backends/qualcomm/runtime/backends/QnnLogger.cpp +++ b/backends/qualcomm/runtime/backends/QnnLogger.cpp @@ -10,7 +10,6 @@ #include #include -#include #include #include "QnnLog.h" diff --git a/backends/qualcomm/serialization/qc_binary_info.fbs b/backends/qualcomm/serialization/qc_binary_info.fbs index 3f301055269..e924fa76871 100644 --- a/backends/qualcomm/serialization/qc_binary_info.fbs +++ b/backends/qualcomm/serialization/qc_binary_info.fbs @@ -14,7 +14,9 @@ table BinaryInfo { // Signature of binary signature: string; // Data of processed binary - data: [ubyte]; + context_data: [ubyte] (vector64); + // Data of tensor + tensor_data: [ubyte] (vector64); } root_type 
BinaryInfo; diff --git a/backends/qualcomm/tests/test_qnn_delegate.py b/backends/qualcomm/tests/test_qnn_delegate.py index 10917cdd6bf..875d34760cc 100644 --- a/backends/qualcomm/tests/test_qnn_delegate.py +++ b/backends/qualcomm/tests/test_qnn_delegate.py @@ -1595,11 +1595,7 @@ def test_qnn_backend_multi_graphs(self): for i, edge_prog in enumerate(edge_progs) ] prog_mgr = generate_multi_graph_program( - compiler_specs=compiler_specs[0], - processed_bytes=[ - prog.graph_module.lowered_module_0.processed_bytes - for prog in exported_programs - ], + compiler_specs=compiler_specs[0], exported_programs=exported_programs ) for index, module in enumerate(modules): self.verify_output( @@ -1915,10 +1911,7 @@ def test_qnn_backend_multi_graphs(self): ] prog_mgr = generate_multi_graph_program( compiler_specs=compiler_specs[0], - processed_bytes=[ - prog.graph_module.lowered_module_0.processed_bytes - for prog in exported_programs - ], + exported_programs=exported_programs, ) for index, module in enumerate(modules): self.verify_output( diff --git a/backends/qualcomm/utils/utils.py b/backends/qualcomm/utils/utils.py index 590ede74319..791ee802177 100644 --- a/backends/qualcomm/utils/utils.py +++ b/backends/qualcomm/utils/utils.py @@ -5,9 +5,10 @@ # LICENSE file in the root directory of this source tree. import operator +import re import warnings from collections import OrderedDict -from typing import Callable, Dict, FrozenSet, List, Tuple +from typing import Any, Callable, Dict, FrozenSet, List, Optional, Tuple import executorch.backends.qualcomm.python.PyQnnManagerAdaptor as PyQnnManagerAdaptor @@ -648,7 +649,13 @@ def op_impl(inputs: List[torch.Tensor]): for v in outputs.values() ) - def build_graph(inputs, outputs): + def build_graph( + inputs, + outputs, + qnn_in_order: Optional[List[int]] = None, + executorch_in_order: Optional[List[int]] = None, + executorch_out_order: Optional[List[int]] = None, + ): # custom op declaration inputs_str = "Tensor[] inputs" func_proto = f"{op_name}({inputs_str}) -> Any" @@ -659,13 +666,39 @@ def build_graph(inputs, outputs): # model architecture mimicking context binary class Model(torch.nn.Module): - def forward(self, *inputs): - return getattr( + """ + The args of forward() can be thought of as what executorch is accepting as input. + The getattr inside the forward() can be thought of as qnn context binary. + When we first pass in the input, we need to use the executorch's(nn.module) input order. + After we get into forward(), we then need to convert input order to qnn's input order. + Same as return, when qnn returns the value, we need to reorder them back to executorh's output order. 
+ """ + + def __init__(self, qnn_in_order, executorch_out_order): + super().__init__() + self.qnn_in_order = qnn_in_order + self.executorch_out_order = executorch_out_order + + def forward(self, *inputs): # executorch + if self.qnn_in_order: + inputs = tuple(inputs[i] for i in self.qnn_in_order) + ret = getattr( getattr(torch.ops, OpContextLoader.namespace), op_name ).default(inputs) + return ( + [ret[idx] for idx in self.executorch_out_order] + if self.executorch_out_order + else ret + ) + + inputs = ( + tuple(tuple(inputs.values())[i] for i in executorch_in_order) + if executorch_in_order + else tuple(inputs.values()) + ) - model = Model() - prog = torch.export.export(model, tuple(inputs.values())) + model = Model(qnn_in_order, executorch_out_order) + prog = torch.export.export(model, inputs) # bookkeeping for variables' life cycle return { "custom_op": custom_op, @@ -708,6 +741,7 @@ def preprocess_binary(ctx_bin, compiler_specs): for k, v in type_map.items(): dtype_map.setdefault(v, k) + qnn_in_order, executorch_in_order, executorch_out_order = [], [], [] if custom_info is not None: # since some context binaries might fail to open on host # if they are compiled with special flags: @@ -715,6 +749,9 @@ def preprocess_binary(ctx_bin, compiler_specs): # use custom information here instead inputs = build_tensor(custom_info["graph_inputs"], dtype_map) outputs = build_tensor(custom_info["graph_outputs"], dtype_map) + qnn_in_order = custom_info["qnn_in_order"] + executorch_in_order = custom_info["executorch_in_order"] + executorch_out_order = custom_info["executorch_out_order"] graph_name = custom_info["graph_name"] else: # get context-binary io tensor info through qnn manager @@ -729,15 +766,21 @@ def preprocess_binary(ctx_bin, compiler_specs): inputs = build_tensor(qnn_mgr.GetGraphInputs(graph_name), dtype_map) outputs = build_tensor(qnn_mgr.GetGraphOutputs(graph_name), dtype_map) qnn_mgr.Destroy() - # generate graph specific for loading context - bundle_prog = build_graph(inputs, outputs) + bundle_prog = build_graph( + inputs, outputs, qnn_in_order, executorch_in_order, executorch_out_order + ) bundle_prog.update({"inputs": inputs, "outputs": outputs}) + + # TODO: to_edge() decorator alters the function call behavior, which + # requires "self" when calling. To work around this issue, + # temporarily remove the first parameter name. 
edge_prog_mgr = to_edge( - programs={graph_name: bundle_prog["exported_program"]}, + {graph_name: bundle_prog["exported_program"]}, # do not alter name for custom op compile_config=EdgeCompileConfig(_use_edge_ops=False), ) + # update meta with context binary for n in edge_prog_mgr._edge_programs[graph_name].graph.nodes: if n.op == "call_function" and OpContextLoader.namespace in str(n.target): @@ -758,11 +801,23 @@ def draw_graph(title, path, graph_module: torch.fx.GraphModule): def generate_multi_graph_program( compiler_specs: List[CompileSpec], - processed_bytes: List[bytes], + exported_programs: List[ExportedProgram] = None, backend_config: ExecutorchBackendConfig = None, + constant_methods: Optional[Dict[str, Any]] = None, ) -> ExecutorchProgramManager: + # compile multiple graphs in qcir into single context binary - graph_inputs, graph_outputs = {}, {} + ( + graph_inputs, + graph_outputs, + qnn_in_order, + executorch_in_order, + executorch_out_order, + ) = ({}, {}, {}, {}, {}) + + processed_bytes = [ + prog.graph_module.lowered_module_0.processed_bytes for prog in exported_programs + ] qnn_mgr = PyQnnManagerAdaptor.QnnManager( generate_qnn_executorch_option(compiler_specs), processed_bytes ) @@ -773,6 +828,41 @@ def generate_multi_graph_program( for graph_name in graph_names: graph_inputs[graph_name] = qnn_mgr.GetGraphInputs(graph_name) graph_outputs[graph_name] = qnn_mgr.GetGraphOutputs(graph_name) + + # We need to obtain the order of the IOs to correctly map QNN with nn.module + for i, graph_name in enumerate(graph_names): + # input + input_names = [ + node.name + for node in exported_programs[i].graph_module.graph.nodes + if node.op == "placeholder" + ] + qnn_input_names = [wrapper.GetName() for wrapper in graph_inputs[graph_name]] + input_order_list = [] + for input_name in input_names: + # e.g., input_0_tokens_0 + pattern = rf"^input_(\d+)_({input_name})_(\d+)$" + for j in range(len(qnn_input_names)): + if re.match(pattern, qnn_input_names[j]): + input_order_list.append(j) + break + assert ( + len(input_order_list) == len(input_names) == len(qnn_input_names) + ), "Order list length is different from names" + executorch_in_order[graph_name] = input_order_list + qnn_in_order[graph_name] = sorted( + range(len(input_order_list)), key=lambda k: input_order_list[k] + ) + + # output + get_item_list = [ + node + for node in exported_programs[i].graph_module.graph.nodes + if node.op == "output" + ][0].args[0] + output_order_list = [item.args[1] for item in get_item_list] + executorch_out_order[graph_name] = output_order_list + qnn_mgr.Destroy() # build custom ops with different graph signatures @@ -786,16 +876,20 @@ def generate_multi_graph_program( "graph_inputs": graph_inputs[graph_name], "graph_outputs": graph_outputs[graph_name], "graph_name": graph_name, + "qnn_in_order": qnn_in_order[graph_name], + "executorch_in_order": executorch_in_order[graph_name], + "executorch_out_order": executorch_out_order[graph_name], }, ) for graph_name in graph_names ] # leverage ExecutorchProgramManager for generating pte with multi-methods edge_prog_mgr = to_edge( - programs={ + { graph_name: bundle_prog["exported_program"] for graph_name, bundle_prog in zip(graph_names, bundle_progs) }, + constant_methods=constant_methods, # do not alter name for custom op compile_config=EdgeCompileConfig(_use_edge_ops=False), ) @@ -806,7 +900,8 @@ def generate_multi_graph_program( n.meta[OpContextLoader.meta_ctx_bin] = binary_info break - return edge_prog_mgr.to_backend(QnnPartitioner(compiler_specs)).to_executorch( 
+ edge_prog_mgr = edge_prog_mgr.to_backend(QnnPartitioner(compiler_specs)) + return edge_prog_mgr.to_executorch( config=backend_config or ExecutorchBackendConfig() ) diff --git a/examples/qualcomm/oss_scripts/llama3_2/README.md b/examples/qualcomm/oss_scripts/llama3_2/README.md new file mode 100644 index 00000000000..51de982b1b1 --- /dev/null +++ b/examples/qualcomm/oss_scripts/llama3_2/README.md @@ -0,0 +1,39 @@ +# Summary + +## Overview +This file provides instructions to run LLAMA3.2 1B and 3B (WIP) with different parameters via the Qualcomm HTP backend. In LLAMA3.2, we offer the following modes to execute the model: + +Prefill Mode: This is also known as batch prefill mode, where the model takes in a list of tokens as input and generates the next token along with the key-value (KV) cache for all tokens. This mode is efficient for generating the initial sequence of tokens (usually the user's prompt). + +KV Cache Mode: In KV Cache mode, the model takes in a single previous token and generates the next predicted token along with its KV cache. It is efficient for generating subsequent tokens after the initial prompt. + +Hybrid Mode: Hybrid mode leverages the strengths of both batch prefill and KV cache modes to optimize token generation speed. Initially, it uses prefill mode to efficiently generate the prompt's key-value (KV) cache. Then, the mode switches to KV cache mode, which excels at generating subsequent tokens. + +## Instructions +### Note +1. For hybrid mode, the export time will be longer and can take up to 2-4 hours to complete. +2. When exporting a hybrid mode model, please ensure the device has at least 80 GB of memory and swap space. + +### Step 1: Setup +1. Follow the [tutorial](https://pytorch.org/executorch/main/getting-started-setup) to set up ExecuTorch. +2. Follow the [tutorial](https://pytorch.org/executorch/stable/build-run-qualcomm-ai-engine-direct-backend.html) to build Qualcomm AI Engine Direct Backend. + +### Step 2: Prepare Model +1. Follow the [instructions](https://www.llama.com/) to download models. +At the end of this step, users should have the following files ready: consolidated.00.pth, params.json, and tokenizer.model. + +### Step3: Run default examples using hybrid mode. +Default example using hybrid mode. +```bash +python examples/qualcomm/oss_scripts/llama3_2/llama.py -b build-android -s ${SERIAL_NUM} -m ${SOC_MODEL} --ptq 16a4w --checkpoint consolidated.00.pth --params params.json --tokenizer_model tokenizer.model --prompt "what is 1+1" --temperature 0 --model_size 1B --model_mode hybrid --prefill_seq_len 32 --kv_seq_len 128 +``` + +If you would like to compile the model only, we have provided the flag `--compile_only`. +```bash +python examples/qualcomm/oss_scripts/llama3_2/llama.py -b build-android -m ${SOC_MODEL} --ptq 16a4w --checkpoint consolidated.00.pth --params params.json --tokenizer_model tokenizer.model --prompt "what is 1+1" --temperature 0 --model_size 1B --model_mode hybrid --prefill_seq_len 32 --kv_seq_len 128 --compile_only +``` + +On the other hand, if you already have a pre-compiled .pte model, you can perform inference by providing the flag `--pre_gen_pte` and specifying the folder that contains the .pte model. 
+```bash +python examples/qualcomm/oss_scripts/llama3_2/llama.py -b build-android -s ${SERIAL_NUM} -m ${SOC_MODEL} --ptq 16a4w --checkpoint consolidated.00.pth --params params.json --tokenizer_model tokenizer.model --prompt "what is 1+1" --temperature 0 --model_size 1B --model_mode hybrid --prefill_seq_len 32 --kv_seq_len 128 --pre_gen_pte ${FOLDER_TO_PRE_GEN_PTE} +``` \ No newline at end of file diff --git a/examples/qualcomm/oss_scripts/llama3_2/llama.py b/examples/qualcomm/oss_scripts/llama3_2/llama.py index 75c0bb0ff0f..77347cc3616 100755 --- a/examples/qualcomm/oss_scripts/llama3_2/llama.py +++ b/examples/qualcomm/oss_scripts/llama3_2/llama.py @@ -4,6 +4,7 @@ # This source code is licensed under the BSD-style license found in the # LICENSE file in the root directory of this source tree. +import copy import getpass import json import logging @@ -20,7 +21,6 @@ from executorch.backends.qualcomm.quantizer.custom_annotation import ( annotate_matmul_16a8w, - custom_annotate_llama_last_conv_16a8w, ) from executorch.backends.qualcomm.quantizer.quantizer import QuantDtype @@ -30,6 +30,7 @@ capture_program, convert_linear_to_conv2d, generate_htp_compiler_spec, + generate_multi_graph_program, generate_qnn_executorch_compiler_spec, get_soc_to_chipset_map, ) @@ -44,6 +45,7 @@ SimpleADB, ) from executorch.exir import EdgeCompileConfig, EdgeProgramManager +from executorch.exir.backend.backend_api import to_backend from executorch.exir.capture._config import ExecutorchBackendConfig from executorch.exir.dialects._ops import ops as exir_ops from executorch.exir.passes.memory_planning_pass import MemoryPlanningPass @@ -59,8 +61,6 @@ logging.basicConfig(level=logging.INFO, format=FORMAT) logging.getLogger().setLevel(logging.INFO) -pte_filename = "llama3_2_qnn" - def _kv_calibrate( example_inputs, @@ -103,7 +103,7 @@ def _kv_calibrate( print(f"calibration data:\n{sp_model.decode(token_list)}") -def _batch_prefill_calibrate( +def _prefill_calibrate( example_inputs, user_prompts, module: torch.fx.GraphModule, @@ -147,7 +147,7 @@ def calibrate( max_seq_len=512, ): if len(example_inputs) == 2: - _batch_prefill_calibrate( + _prefill_calibrate( example_inputs, user_prompts, module, @@ -167,12 +167,13 @@ def calibrate( class SingleLlama: - def __init__(self, llama_model) -> None: + def __init__(self, llama_model, pte_filename) -> None: super().__init__() self.llama_model = llama_model self.quant_dtype = None self.llama_meta = self.llama_model.get_metadata() self.has_quant_io = False + self.pte_filename = pte_filename if self.llama_meta["get_use_kv_cache"]: tokens, atten_mask, pos_ids, k_caches, v_caches = self.get_example_inputs( use_kv_cache=True @@ -206,7 +207,7 @@ def _tag_kv_ios(self, gm: torch.fx.GraphModule, kv_type, sharding_type): == self.llama_meta["get_head_dim"] ): a.meta[QCOM_QUANTIZED_IO] = kv_type - # single head, batch_prefill mode + # single head, prefill mode elif a.meta["val"].flatten().size()[0] == self.llama_meta[ "get_head_dim" ] * (self.llama_meta["get_max_seq_len"] - 1): @@ -237,13 +238,12 @@ def quantize(self, quant_dtype, custom_annotations=()): ).module() fx_graph_module = prepare_pt2e(fx_graph_module, quantizer) logging.info("Quantizing the model...") - calibrate( self.get_example_inputs(self.llama_meta["get_use_kv_cache"]), args.prompt, fx_graph_module, tokenizer_model_path=args.tokenizer_model, - max_seq_len=args.seq_len, + max_seq_len=self.llama_meta["get_max_seq_len"], ) self.llama_model = convert_pt2e(fx_graph_module) @@ -277,7 +277,7 @@ def lowering_modules( compiler_specs = 
generate_qnn_executorch_compiler_spec( soc_model=soc_model, backend_options=backend_options, - shared_buffer=True, + shared_buffer=False, ) skip_node_op_set = {"llama.fallback.default"} partitioner = QnnPartitioner( @@ -313,49 +313,61 @@ def get_example_inputs(self, use_kv_cache=True): return self.llama_model.get_example_inputs(use_kv_cache) -def compile(args): +def compile(args, pte_filename): os.makedirs(args.artifact, exist_ok=True) start_ts = time.time() - if args.model_mode == "kv": - use_kv_cache = output_new_cache_only = True - matmul_annotate_func = partial(annotate_matmul_16a8w, traverse_input1=True) - elif args.model_mode == "batch_prefill": - use_kv_cache = output_new_cache_only = False - matmul_annotate_func = partial(annotate_matmul_16a8w, traverse_input1=False) - elif args.model_mode == "hybrid": - raise NotImplementedError( - f"model_mode {args.model_mode} is not implemented yet." - ) - else: - raise RuntimeError(f"No such model_mode {args.model_mode}.") - with open(args.params) as f: - config = ModelArgs(**json.load(f)) + kv_config = ModelArgs(**json.load(f)) # TODO: support batch inputs if necessary - config.max_batch_size = 1 - config.max_seq_len = args.seq_len - config.use_kv_cache = use_kv_cache + kv_config.max_batch_size = 1 + kv_config.max_seq_len = args.kv_seq_len + kv_config.use_kv_cache = True + + prefill_config = copy.copy(kv_config) + prefill_config.max_seq_len = args.prefill_seq_len + prefill_config.use_kv_cache = False + state_dict = torch.load( args.checkpoint, weights_only=True, map_location="cpu", mmap=True ) - llama_instance = None + llama_instance_list = [] with torch.device("meta"): - llama_instance = LlamaModel(config, output_new_cache_only=output_new_cache_only) + if args.model_mode == "kv": + llama_instance_list.append( + LlamaModel(kv_config, output_new_cache_only=True) + ) + elif args.model_mode == "prefill": + llama_instance_list.append( + LlamaModel(prefill_config, output_new_cache_only=False) + ) + elif args.model_mode == "hybrid": + llama_instance_list.append( + LlamaModel(prefill_config, output_new_cache_only=False) + ) + llama_instance_list.append( + LlamaModel(kv_config, output_new_cache_only=True) + ) + else: + raise RuntimeError(f"No such model_mode {args.model_mode}.") + if "model" in state_dict: state_dict = state_dict["model"] - llama_instance.load_state_dict( - state_dict, - strict=False, - assign=True, - ) + + for llama_instance in llama_instance_list: + llama_instance.load_state_dict( + state_dict, + strict=False, + assign=True, + ) end_load_ts = time.time() logging.info(f"Time for loading checkpoint: {end_load_ts - start_ts}") - for layer in llama_instance.layers: - if getattr(layer.attention, "prepare_sha", None): - layer.attention.prepare_sha() + for llama_instance in llama_instance_list: + for layer in llama_instance.layers: + if getattr(layer.attention, "prepare_sha", None): + layer.attention.prepare_sha() use_fp16 = False if args.ptq != None: @@ -378,60 +390,136 @@ def compile(args): if args.dtype_override is not None: dtype_override = DType[args.dtype_override] - llama_instance = llama_instance.to(dtype_override.to_torch_dtype()) + for i in range(len(llama_instance_list)): + llama_instance_list[i] = llama_instance_list[i].to( + dtype_override.to_torch_dtype() + ) - llama_instance = convert_linear_to_conv2d(llama_instance) - single_llama = SingleLlama(llama_instance.eval()) + for i in range(len(llama_instance_list)): + llama_instance_list[i] = convert_linear_to_conv2d(llama_instance_list[i]) + llama_instance_list[i] = 
SingleLlama( + llama_instance_list[i].eval(), pte_filename + ) if args.ptq != None: start_quantize_ts = time.time() - single_llama.quantize( - quant_dtype, - custom_annotations=( - custom_annotate_llama_last_conv_16a8w, - matmul_annotate_func, - ), - ) + for llama_instance in llama_instance_list: + llama_instance.quantize( + quant_dtype, + custom_annotations=( + partial( + annotate_matmul_16a8w, + traverse_input1=llama_instance.llama_meta["get_use_kv_cache"], + ), + ), + ) end_quantize_ts = time.time() logging.info(f"Time for quantizing: {end_quantize_ts - start_quantize_ts}") start_lowering_ts = time.time() - single_llama.lowering_modules( - args.artifact, - kv_type=kv_type, - sharding_type=sharding_type, - use_fp16=use_fp16, - soc_model=get_soc_to_chipset_map()[args.model], - num_sharding=args.num_sharding, - ) + + if len(llama_instance_list) == 1: + llama_instance_list[0].lowering_modules( + args.artifact, + kv_type=kv_type, + sharding_type=sharding_type, + use_fp16=use_fp16, + soc_model=get_soc_to_chipset_map()[args.model], + num_sharding=args.num_sharding, + ) + else: + sample_inputs_list = [ + llama_instace.inputs for llama_instace in llama_instance_list + ] + edge_progs = [ + capture_program(llama_instance.llama_model, sample_input) + for llama_instance, sample_input in zip( + llama_instance_list, sample_inputs_list + ) + ] + + if args.num_sharding > 0: + for i in range(len(llama_instance_list)): + model_sharding.split_graph( + edge_progs[i].exported_program, + llama_instance_list[i].llama_meta["get_n_layers"], + shares=args.num_sharding, + ) + + for i in range(len(llama_instance_list)): + llama_instance_list[i]._tag_kv_ios( + edge_progs[i].exported_program.graph_module, + kv_type=kv_type, + sharding_type=sharding_type, + ) + backend_options = generate_htp_compiler_spec(use_fp16=use_fp16) + graph_names = ["prefill_forward", "kv_forward"] + compiler_specs = [ + generate_qnn_executorch_compiler_spec( + soc_model=get_soc_to_chipset_map()[args.model], + backend_options=backend_options, + shared_buffer=True, + multiple_graphs=True, + graph_name=graph_name, + ) + for graph_name in graph_names + ] + exported_programs = [ + to_backend(edge_prog.exported_program, QnnPartitioner(compiler_specs[i])) + for i, edge_prog in enumerate(edge_progs) + ] + + executorch_config = ExecutorchBackendConfig( + passes=[ + BuildQuantIo(), + ], + # For shared buffer, user must pass the memory address + # which is allocated by RPC memory to executor runner. + # Therefore, won't want to pre-allocate + # by memory manager in runtime. 
+ memory_planning_pass=MemoryPlanningPass( + alloc_graph_input=False, + alloc_graph_output=False, + ), + extract_delegate_segments=True, + ) + + prog_mgr = generate_multi_graph_program( + compiler_specs=compiler_specs[0], + exported_programs=exported_programs, + backend_config=executorch_config, + constant_methods=llama_instance_list[1].llama_meta, # kv method meta + ) + with open(f"{args.artifact}/{pte_filename}.pte", "wb") as file: + prog_mgr.write_to_file(file) + end_lowering_ts = time.time() logging.info(f"Time for compiling: {end_lowering_ts - start_lowering_ts}") -def inference(args, pre_gen_pte=""): +def inference(args, pte_filename, pre_gen_pte=""): workspace = f"/data/local/tmp/{getpass.getuser()}/executorch/single_llama" - if args.model_mode == "batch_prefill": + if args.model_mode == "prefill": eval_mode = 0 elif args.model_mode == "kv": eval_mode = 1 elif args.model_mode == "hybrid": eval_mode = 2 - raise NotImplementedError( - f"model_mode {args.model_mode} is not implemented yet." - ) else: raise RuntimeError(f"No such model_mode {args.model_mode}.") + seq_len = args.prefill_seq_len if args.model_mode == "prefill" else args.kv_seq_len runner_args = " ".join( [ f"--model_path {pte_filename}.pte", "--output_path outputs/outputs.txt", f"--tokenizer_path {os.path.basename(args.tokenizer_model)}", f'--prompt "{args.prompt}"', - f"--seq_len {args.seq_len}", + f"--seq_len {seq_len}", f"--eval_mode {eval_mode}", f"--temperature {args.temperature}", + f"--system_prompt '{args.system_prompt}'", ] ) runner_cmd = " ".join( @@ -541,10 +629,10 @@ def post_process(): ) parser.add_argument( - "--seq_len", - help="Ouput sequence length for llama.", - default=128, - type=int, + "--system_prompt", + help="Tells the model what kind of assistant it should be. For example, You are a helpful AI assistant for travel tips and recommendations. Default is None", + default="", + type=str, ) parser.add_argument( @@ -578,27 +666,53 @@ def post_process(): parser.add_argument( "--model_mode", - help="Export and inference batch_prefill mode, kv mode or hybrid(TBD) mode", + help="Export and inference prefill mode, kv mode or hybrid mode", default="kv", - choices=["batch_prefill", "kv", "hybrid"], + choices=["prefill", "kv", "hybrid"], type=str, ) + parser.add_argument( + "--prefill_seq_len", + help="Ouput sequence length for llama. Use this option for prefill or hybrid mode", + default=32, + type=int, + ) + + parser.add_argument( + "--kv_seq_len", + help="Ouput sequence length for llama. 
Use this option for kv or hybrid mode", + default=512, + type=int, + ) + args = parser.parse_args() if args.compile_only and args.pre_gen_pte: exit("Cannot set both compile_only and pre_gen_pte as true") + if args.model_mode == "kv": + pte_filename = "kv_llama3_2_qnn" + elif args.model_mode == "prefill": + pte_filename = "prefill_llama3_2_qnn" + elif args.model_mode == "hybrid": + assert ( + args.kv_seq_len >= args.prefill_seq_len + ), "Please ensure kv_seq_len is >= prefill_seq_len" + pte_filename = "hybrid_llama3_2_qnn" + else: + raise RuntimeError(f"No such model_mode {args.model_mode}.") + if args.pre_gen_pte: - inference(args, args.pre_gen_pte) + inference(args, pte_filename, args.pre_gen_pte) exit(f"Finish the running pre_gen_pte from {args.pre_gen_pte}") if args.compile_only: - compile(args) + compile(args, pte_filename) exit(f"Finish compile_only and save to {args.artifact}") try: - compile(args) - inference(args) + compile(args, pte_filename) + inference(args, pte_filename) except Exception as e: if args.ip and args.port != -1: with Client((args.ip, args.port)) as conn: diff --git a/examples/qualcomm/oss_scripts/llama3_2/qnn_llama3_2_runner.cpp b/examples/qualcomm/oss_scripts/llama3_2/qnn_llama3_2_runner.cpp index 554e3ba9329..8c7ac6dd363 100644 --- a/examples/qualcomm/oss_scripts/llama3_2/qnn_llama3_2_runner.cpp +++ b/examples/qualcomm/oss_scripts/llama3_2/qnn_llama3_2_runner.cpp @@ -18,6 +18,7 @@ #include #include #include +#include DEFINE_string( model_path, @@ -46,7 +47,7 @@ DEFINE_int32( DEFINE_int32( eval_mode, 0, - "0: PromptProcessor(batch_prefill) / 1: TokenGenerator(kv) / 2: HybridMode (TBD)"); + "0: PromptProcessor(prefill) / 1: TokenGenerator(kv) / 2: HybridMode (prefill+kv)"); int main(int argc, char** argv) { gflags::ParseCommandLineFlags(&argc, &argv, true); @@ -55,16 +56,21 @@ int main(int argc, char** argv) { example::Runner runner( {FLAGS_model_path}, FLAGS_tokenizer_path.c_str(), + FLAGS_prompt.c_str(), + FLAGS_system_prompt.c_str(), FLAGS_temperature, FLAGS_eval_mode); - - // generate tokens & store inference output + std::vector buf; + buf.reserve(5 * FLAGS_seq_len); // assume each token is around 5 char std::ofstream fout(FLAGS_output_path.c_str()); - runner.generate( - FLAGS_prompt, - FLAGS_system_prompt, - FLAGS_seq_len, - [&](const std::string& piece) { fout << piece; }); + auto callback = [&](const std::string& piece) { + for (const char c : piece) { + buf.push_back(c); + } + }; + // generate tokens & store inference output + runner.generate(FLAGS_seq_len, callback); + fout.write(buf.data(), buf.size()); fout.close(); return 0; } diff --git a/examples/qualcomm/oss_scripts/llama3_2/runner/io_memory.cpp b/examples/qualcomm/oss_scripts/llama3_2/runner/io_memory.cpp index 9b37d056cf5..aabad659f48 100644 --- a/examples/qualcomm/oss_scripts/llama3_2/runner/io_memory.cpp +++ b/examples/qualcomm/oss_scripts/llama3_2/runner/io_memory.cpp @@ -23,10 +23,7 @@ using executorch::runtime::TensorInfo; namespace example { Memory::Memory(std::vector>& modules) - : data_ptr_(nullptr, [](void*) {}), - input_tensors_(modules.size()), - output_tensors_(modules.size()), - modules_(modules) {} + : data_ptr_(nullptr, [](void*) {}), modules_(modules) {} Memory::~Memory() {} @@ -34,19 +31,23 @@ void* Memory::get_mutable_ptr() { return data_ptr_.get(); } -std::vector Memory::get_input_tensors(int shard_index) { +std::vector Memory::get_input_tensors( + int shard_index, + const std::string& method_name) { std::vector ret; ret.reserve(input_tensors_.size()); - for (TensorImpl* impl : 
input_tensors_[shard_index]) { + for (TensorImpl* impl : input_tensors_[method_name][shard_index]) { ret.emplace_back(Tensor(impl)); } return ret; } -std::vector Memory::get_output_tensors(int shard_index) { +std::vector Memory::get_output_tensors( + int shard_index, + const std::string& method_name) { std::vector ret; - ret.reserve(output_tensors_.size()); - for (TensorImpl* impl : output_tensors_[shard_index]) { + ret.reserve(output_tensors_[method_name][shard_index].size()); + for (TensorImpl* impl : output_tensors_[method_name][shard_index]) { ret.emplace_back(Tensor(impl)); } return ret; @@ -58,22 +59,110 @@ HybridMemory::HybridMemory( int32_t vocab_size, int32_t num_layers, int32_t head_dim, - int32_t num_heads) + int32_t num_heads, + EvalMode eval_mode, + const std::string& prefill_forward_name, + const std::string& kv_forward_name) : Memory(modules), shard_layers_({num_layers}), max_seq_len_(max_seq_len), vocab_size_(vocab_size), num_layers_(num_layers), head_dim_(head_dim), - num_heads_(num_heads) { + num_heads_(num_heads), + eval_mode_(eval_mode), + prefill_forward_name_(prefill_forward_name), + kv_forward_name_(kv_forward_name) { + if (!prefill_forward_name_.empty()) { + input_tensors_[prefill_forward_name_] = + std::vector>(modules.size()); + output_tensors_[prefill_forward_name_] = + std::vector>(modules.size()); + k_cache_in_[prefill_forward_name_] = + std::vector>(); + v_cache_in_[prefill_forward_name_] = + std::vector>(); + k_cache_out_[prefill_forward_name_] = + std::vector>(); + v_cache_out_[prefill_forward_name_] = + std::vector>(); + } + if (!kv_forward_name_.empty()) { + input_tensors_[kv_forward_name_] = + std::vector>(modules.size()); + output_tensors_[kv_forward_name_] = + std::vector>(modules.size()); + k_cache_in_[kv_forward_name_] = + std::vector>(); + v_cache_in_[kv_forward_name_] = + std::vector>(); + k_cache_out_[kv_forward_name_] = + std::vector>(); + v_cache_out_[kv_forward_name_] = + std::vector>(); + } + data_ptr_ = std::unique_ptr( new IO, [](void* ptr) { delete static_cast(ptr); }); } -void HybridMemory::prepare_kv_io( - const std::vector>& methods_meta) { +void HybridMemory::init_io( + const std::vector>& methods_meta, + EvalMode eval_mode) { IO* ptr = static_cast(data_ptr_.get()); std::memset(ptr, 0, sizeof(IO)); + + int32_t cache_len = max_seq_len_ - 1; + int32_t k_in_size = (head_dim_ + 1) * (max_seq_len_ - 1); + int32_t k_cache_out_size = num_heads_ * head_dim_ * cache_len; + int32_t v_cache_size = (num_heads_ + 1) * (max_seq_len_ - 1) * head_dim_; + + // Init kv vector shape, general enough to be shared across all 3 modes. 
+ ptr->k_cache_out.reserve(num_layers_); + ptr->v_cache.reserve(num_layers_); + for (int layer = 0; layer < num_layers_; layer++) { + ptr->k_cache_out.emplace_back(std::vector(k_cache_out_size)); + ptr->v_cache.emplace_back(std::vector(v_cache_size)); + } + + auto init_prefill = [&]() { + ptr->prefill_input_toks.resize(cache_len); + ptr->prefill_atten_mask.resize(cache_len * cache_len); + ptr->prefill_logits.resize(cache_len * vocab_size_); + }; + + auto init_kv = [&]() { + ptr->kv_logits.resize(vocab_size_); + ptr->kv_attention_mask.resize(max_seq_len_, -255); + ptr->k_cache.reserve(num_layers_); + for (int layer = 0; layer < num_layers_; layer++) { + ptr->k_cache.emplace_back(); + ptr->k_cache[layer].reserve(num_heads_); + for (int head = 0; head < num_heads_; head++) { + ptr->k_cache[layer].emplace_back(std::vector(k_in_size)); + } + } + }; + + switch (eval_mode) { + case EvalMode::kPrefill: + init_prefill(); + break; + case EvalMode::kKVCached: + init_kv(); + break; + case EvalMode::kHybrid: + init_prefill(); + init_kv(); + break; + default: + break; + } +} + +void HybridMemory::prepare_kv_io( + const std::vector>& methods_meta) { for (int i = 0; i < modules_.size(); ++i) { ET_CHECK_MSG( methods_meta[i].ok(), @@ -81,23 +170,8 @@ void HybridMemory::prepare_kv_io( static_cast(methods_meta[i].error())); } - // Init IO vector shape - // atten_mask - ptr->logits.resize(vocab_size_); - ptr->attention_mask.resize( - max_seq_len_, -255); // attention mask shape should be [1, ctx_length] - // kv - int32_t k_in_size = (head_dim_ + 1) * (max_seq_len_ - 1); - int32_t k_out_size = num_heads_ * head_dim_; - int32_t v_cache_size = (num_heads_ + 1) * (max_seq_len_ - 1) * head_dim_; - for (int layer = 0; layer < num_layers_; layer++) { - ptr->k_cache.emplace_back(); - for (int head = 0; head < num_heads_; head++) { - ptr->k_cache[layer].emplace_back(std::vector(k_in_size)); - } - ptr->k_cache_out.emplace_back(std::vector(k_out_size)); - ptr->v_cache.emplace_back(std::vector(v_cache_size)); - } + ET_CHECK_MSG(!(kv_forward_name_.empty()), "kv forward name is empty"); + IO* ptr = static_cast(data_ptr_.get()); // [I]: input_tokens Result input_tok = methods_meta[0]->input_tensor_meta(0); @@ -107,7 +181,7 @@ void HybridMemory::prepare_kv_io( const_cast(input_tok->sizes().data()), &ptr->input_tok, const_cast(input_tok->dim_order().data())); - input_tensors_[0].push_back(input_tok_.get()); + input_tensors_[kv_forward_name_][0].push_back(input_tok_.get()); // [I]: atten_mask Result atten_mask = methods_meta[0]->input_tensor_meta(1); @@ -115,9 +189,9 @@ void HybridMemory::prepare_kv_io( atten_mask->scalar_type(), atten_mask->sizes().size(), const_cast(atten_mask->sizes().data()), - ptr->attention_mask.data(), + ptr->kv_attention_mask.data(), const_cast(atten_mask->dim_order().data())); - input_tensors_[0].push_back(attention_mask_.get()); + input_tensors_[kv_forward_name_][0].push_back(attention_mask_.get()); // [I]: input_pos Result input_pos = methods_meta[0]->input_tensor_meta(2); @@ -127,7 +201,7 @@ void HybridMemory::prepare_kv_io( const_cast(input_pos->sizes().data()), &ptr->input_pos, const_cast(input_pos->dim_order().data())); - input_tensors_[0].push_back(input_pos_.get()); + input_tensors_[kv_forward_name_][0].push_back(input_pos_.get()); // [I] kv_cache int index = 3; // bypass input_tokens, input_pos, atten_mask @@ -142,7 +216,8 @@ void HybridMemory::prepare_kv_io( Result kv_cache = methods_meta[shard_index]->input_tensor_meta(index); std::vector>& cache = - (cache_group == 0 ? 
k_cache_in_ : v_cache_in_); + (cache_group == 0 ? k_cache_in_[kv_forward_name_] + : v_cache_in_[kv_forward_name_]); void* cache_ptr = (cache_group == 0) ? static_cast(ptr->k_cache[layer + offset][head].data()) : static_cast( @@ -155,7 +230,8 @@ void HybridMemory::prepare_kv_io( cache_ptr, const_cast( kv_cache->dim_order().data()))); - input_tensors_[shard_index].push_back(cache.back().get()); + input_tensors_[kv_forward_name_][shard_index].push_back( + cache.back().get()); } } } @@ -165,13 +241,14 @@ void HybridMemory::prepare_kv_io( int logit_index = 0; Result logits = methods_meta[modules_.size() - 1]->output_tensor_meta(logit_index); - logits_ = std::make_unique( + kv_logits_ = std::make_unique( logits->scalar_type(), logits->sizes().size(), const_cast(logits->sizes().data()), - ptr->logits.data(), + ptr->kv_logits.data(), const_cast(logits->dim_order().data())); - output_tensors_[modules_.size() - 1].push_back(logits_.get()); + output_tensors_[kv_forward_name_][modules_.size() - 1].push_back( + kv_logits_.get()); // [O] kv_cache index = 1; @@ -190,7 +267,8 @@ void HybridMemory::prepare_kv_io( Result kv_cache = methods_meta[shard_index]->output_tensor_meta(index); std::vector>& cache = - (cache_group == 0 ? k_cache_out_ : v_cache_out_); + (cache_group == 0 ? k_cache_out_[kv_forward_name_] + : v_cache_out_[kv_forward_name_]); void* cache_ptr = (cache_group == 0) ? static_cast( ptr->k_cache_out[layer + offset].data() + @@ -205,7 +283,8 @@ void HybridMemory::prepare_kv_io( cache_ptr, const_cast( kv_cache->dim_order().data()))); - output_tensors_[shard_index].push_back(cache.back().get()); + output_tensors_[kv_forward_name_][shard_index].push_back( + cache.back().get()); } } } @@ -214,8 +293,6 @@ void HybridMemory::prepare_kv_io( void HybridMemory::prepare_prefill_io( const std::vector>& methods_meta) { - IO* ptr = static_cast(data_ptr_.get()); - std::memset(ptr, 0, sizeof(IO)); for (int i = 0; i < modules_.size(); ++i) { ET_CHECK_MSG( methods_meta[i].ok(), @@ -223,24 +300,13 @@ void HybridMemory::prepare_prefill_io( static_cast(methods_meta[i].error())); } - // Parse some IO info from method meta - // cache_len should be max_seq_len - 1 - int cache_len = methods_meta[0]->input_tensor_meta(0)->sizes()[1]; - - // TODO: Combine vector init with KV mode once Hybrid mode is enabled - // as it shares some common data structure. 
- // Init IO vector shape - ptr->prefill_input_toks.resize(cache_len); - ptr->prefill_atten_mask.resize(cache_len * cache_len); - ptr->prefill_logits.resize(cache_len * vocab_size_); - // Init kv vector shape - int32_t k_cache_out_size = num_heads_ * head_dim_ * cache_len; - int32_t v_cache_size = (num_heads_ + 1) * cache_len * head_dim_; - for (int layer = 0; layer < num_layers_; layer++) { - ptr->k_cache_out.emplace_back(std::vector(k_cache_out_size)); - ptr->v_cache.emplace_back(std::vector(v_cache_size)); - } + ET_CHECK_MSG( + !(prefill_forward_name_.empty()), "prefill forward name is empty"); + IO* ptr = static_cast(data_ptr_.get()); + + // cache_len should be max_seq_len - 1 + int32_t cache_len = methods_meta[0]->input_tensor_meta(0)->sizes()[1]; // [I]: pre_input_tokens Result prefill_input_toks = methods_meta[0]->input_tensor_meta(0); prefill_input_toks_ = std::make_unique( @@ -250,7 +316,7 @@ void HybridMemory::prepare_prefill_io( ptr->prefill_input_toks.data(), const_cast( prefill_input_toks->dim_order().data())); - input_tensors_[0].push_back(prefill_input_toks_.get()); + input_tensors_[prefill_forward_name_][0].push_back(prefill_input_toks_.get()); // [I]: prefill_attn_mask for (int i = 0; i < cache_len; ++i) { for (int j = 0; j < cache_len; ++j) { @@ -261,28 +327,26 @@ void HybridMemory::prepare_prefill_io( } } } - - Result prefill_attn_mask = methods_meta[0]->input_tensor_meta(1); + Result prefill_atten_mask = methods_meta[0]->input_tensor_meta(1); prefill_attn_mask_ = std::make_unique( - prefill_attn_mask->scalar_type(), - prefill_attn_mask->sizes().size(), - const_cast(prefill_attn_mask->sizes().data()), + prefill_atten_mask->scalar_type(), + prefill_atten_mask->sizes().size(), + const_cast(prefill_atten_mask->sizes().data()), ptr->prefill_atten_mask.data(), const_cast( - prefill_attn_mask->dim_order().data())); - input_tensors_[0].push_back(prefill_attn_mask_.get()); - + prefill_atten_mask->dim_order().data())); + input_tensors_[prefill_forward_name_][0].push_back(prefill_attn_mask_.get()); // [O]: logits int logit_index = 0; - Result logits = - methods_meta[modules_.size() - 1]->output_tensor_meta(logit_index); - logits_ = std::make_unique( + Result logits = methods_meta[0]->output_tensor_meta(0); + prefill_logits_ = std::make_unique( logits->scalar_type(), logits->sizes().size(), const_cast(logits->sizes().data()), ptr->prefill_logits.data(), const_cast(logits->dim_order().data())); - output_tensors_[modules_.size() - 1].push_back(logits_.get()); + output_tensors_[prefill_forward_name_][modules_.size() - 1].push_back( + prefill_logits_.get()); // [O] kv_cache int index = 1; for (int offset = 0, shard_index = 0, cache_stride = cache_len * head_dim_; @@ -294,13 +358,15 @@ void HybridMemory::prepare_prefill_io( Result kv_cache = methods_meta[shard_index]->output_tensor_meta(index); std::vector>& cache = - (cache_group == 0 ? k_cache_out_ : v_cache_out_); + (cache_group == 0 ? k_cache_out_[prefill_forward_name_] + : v_cache_out_[prefill_forward_name_]); void* cache_ptr = (cache_group == 0) ? 
static_cast( ptr->k_cache_out[layer + offset].data() + head * cache_stride) : static_cast( - ptr->v_cache[layer + offset].data() + head * cache_stride); + ptr->v_cache[layer + offset].data() + + (head + 1) * cache_stride); cache.emplace_back(std::make_unique( kv_cache->scalar_type(), kv_cache->sizes().size(), @@ -308,14 +374,72 @@ void HybridMemory::prepare_prefill_io( cache_ptr, const_cast( kv_cache->dim_order().data()))); - output_tensors_[shard_index].push_back(cache.back().get()); + output_tensors_[prefill_forward_name_][shard_index].push_back( + cache.back().get()); } } } } } -void HybridMemory::update_io( +void HybridMemory::update_prefill_to_kv_io( + int64_t cur_token, + int64_t pos, + std::vector>& output_tensors) { + int cache_len = (max_seq_len_ - 1); + IO* ptr = static_cast(data_ptr_.get()); + + ptr->input_tok = static_cast(cur_token); + ptr->input_pos = static_cast(pos); + // If prompt len is 30, prefill will handle to pos = 30. + // At this point, pos should be 31. + for (int i = 0; i < pos + 1; i++) { + ptr->kv_attention_mask[cache_len - i] = 0; + } + + // update v_cache + std::vector>& v_cache_in = + v_cache_in_[kv_forward_name_]; + std::vector>& v_cache_out = + v_cache_out_[kv_forward_name_]; + for (int i = 0, v_cache_stride = head_dim_ * pos; i < v_cache_in.size(); + i++) { + v_cache_in[i]->set_data( + v_cache_in[i]->mutable_data() + v_cache_stride); + v_cache_out[i]->set_data( + v_cache_out[i]->mutable_data() + v_cache_stride); + } + for (int shard = 0; shard < output_tensors.size(); shard++) { + for (int index = 0; index < output_tensors[shard].size(); index++) { + ET_CHECK_MSG( + modules_[shard]->set_output( + kv_forward_name_, output_tensors[shard][index], index) == + Error::Ok, + "Failed to set output tensor for module %d's %d'th output " + "while updating kv_cache output tensors", + shard, + index); + } + } + + std::vector>& k_cache_in = + k_cache_in_[kv_forward_name_]; + std::vector>& k_cache_out = + k_cache_out_[prefill_forward_name_]; + for (int i = 0; i < k_cache_in.size(); ++i) { + uint8_t* ptr_in = k_cache_in[i]->mutable_data(); + const uint8_t* ptr_out = k_cache_out[i]->data(); + for (size_t j = 0, offset = cache_len; j < head_dim_; + ++j, offset += cache_len) { + for (int k = 0, k_stride = j * cache_len; k < pos; k++) { + ptr_in[offset + k] = ptr_out[k_stride + k]; + } + } + k_cache_in[i]->set_data(ptr_in + pos); + } +} + +void HybridMemory::update_kv_io( int64_t cur_token, int64_t pos, std::vector>& output_tensors) { @@ -326,19 +450,22 @@ void HybridMemory::update_io( // update position_ids ptr->input_pos = static_cast(pos); // update causal mask for next token - ptr->attention_mask[seq_len - pos] = 0; + ptr->kv_attention_mask[seq_len - pos] = 0; // update v_cache - for (int i = 0; i < v_cache_in_.size(); i++) { - v_cache_in_[i]->set_data( - v_cache_in_[i]->mutable_data() + head_dim_); - v_cache_out_[i]->set_data( - v_cache_out_[i]->mutable_data() + head_dim_); + auto& v_cache_in = v_cache_in_[kv_forward_name_]; + auto& v_cache_out = v_cache_out_[kv_forward_name_]; + for (int i = 0; i < v_cache_in.size(); i++) { + v_cache_in[i]->set_data(v_cache_in[i]->mutable_data() + head_dim_); + v_cache_out[i]->set_data( + v_cache_out[i]->mutable_data() + head_dim_); } + for (int shard = 0; shard < output_tensors.size(); shard++) { for (int index = 0; index < output_tensors[shard].size(); index++) { ET_CHECK_MSG( - modules_[shard]->set_output(output_tensors[shard][index], index) == + modules_[shard]->set_output( + kv_forward_name_, output_tensors[shard][index], index) == 
Error::Ok, "failed to set output tensor for module %d's %d'th output " "while updating kv_cache output tensors", @@ -347,15 +474,17 @@ void HybridMemory::update_io( } } + auto& k_cache_in = k_cache_in_[kv_forward_name_]; + auto& k_cache_out = k_cache_out_[kv_forward_name_]; // update k_cache by single thread, this part is cpu cache sensitive - for (int i = 0; i < k_cache_in_.size(); ++i) { - uint8_t* ptr_in = k_cache_in_[i]->mutable_data(); - const uint8_t* ptr_out = k_cache_out_[i]->data(); + for (int i = 0; i < k_cache_in.size(); ++i) { + uint8_t* ptr_in = k_cache_in[i]->mutable_data(); + const uint8_t* ptr_out = k_cache_out[i]->data(); for (size_t j = 0, offset = seq_len; j < head_dim_; ++j, offset += seq_len) { ptr_in[offset] = ptr_out[j]; } - k_cache_in_[i]->set_data(ptr_in + 1); + k_cache_in[i]->set_data(ptr_in + 1); } } diff --git a/examples/qualcomm/oss_scripts/llama3_2/runner/io_memory.h b/examples/qualcomm/oss_scripts/llama3_2/runner/io_memory.h index 31ed351ef4b..956d58caf23 100644 --- a/examples/qualcomm/oss_scripts/llama3_2/runner/io_memory.h +++ b/examples/qualcomm/oss_scripts/llama3_2/runner/io_memory.h @@ -21,10 +21,20 @@ namespace example { +enum EvalMode { + kPrefill = 0, + kKVCached, + kHybrid, + kUnsupported, +}; class Memory { public: Memory(std::vector>& modules); virtual ~Memory(); + virtual void init_io( + const std::vector>& methods_meta, + EvalMode eval_mode) = 0; virtual void prepare_prefill_io( const std::vector< executorch::runtime::Result>& @@ -33,18 +43,32 @@ class Memory { const std::vector< executorch::runtime::Result>& methods_meta) = 0; - virtual void update_io( + virtual void update_prefill_to_kv_io( + int64_t cur_token, + int64_t pos, + std::vector>& output_tensors) = 0; + virtual void update_kv_io( int64_t cur_token, int64_t pos, std::vector>& output_tensors) = 0; void* get_mutable_ptr(); - std::vector get_input_tensors(int shard_index); - std::vector get_output_tensors(int shard_index); + std::vector get_input_tensors( + int shard_index, + const std::string& method_name); + std::vector get_output_tensors( + int shard_index, + const std::string& method_name); protected: std::unique_ptr data_ptr_; - std::vector> input_tensors_; - std::vector> output_tensors_; + std::unordered_map< + std::string, + std::vector>> + input_tensors_; + std::unordered_map< + std::string, + std::vector>> + output_tensors_; std::vector> modules_; }; @@ -56,7 +80,15 @@ class HybridMemory : public Memory { int32_t vocab_size, int32_t num_layers, int32_t head_dim, - int32_t num_heads); + int32_t num_heads, + EvalMode eval_mode, + const std::string& prefill_forward_name, + const std::string& kv_forward_name); + + void init_io( + const std::vector>& methods_meta, + EvalMode eval_mode) override; void prepare_prefill_io( const std::vector< executorch::runtime::Result>& @@ -65,7 +97,12 @@ class HybridMemory : public Memory { const std::vector< executorch::runtime::Result>& methods_meta) override; - void update_io( + void update_prefill_to_kv_io( + int64_t cur_token, + int64_t pos, + std::vector>& output_tensors) + override; + void update_kv_io( int64_t cur_token, int64_t pos, std::vector>& output_tensors) @@ -73,11 +110,11 @@ class HybridMemory : public Memory { struct IO { int32_t input_tok; int32_t input_pos; - std::vector attention_mask; std::vector>> k_cache; std::vector> v_cache; std::vector> k_cache_out; - std::vector logits; + std::vector kv_attention_mask; + std::vector kv_logits; std::vector prefill_input_toks; std::vector prefill_atten_mask; std::vector prefill_logits; @@ 
-90,17 +127,33 @@ class HybridMemory : public Memory { std::unique_ptr attention_mask_; std::unique_ptr prefill_input_toks_; std::unique_ptr prefill_attn_mask_; - std::vector> k_cache_in_; - std::vector> v_cache_in_; - std::vector> k_cache_out_; - std::vector> v_cache_out_; - std::unique_ptr logits_; + std::unique_ptr prefill_logits_; + std::unordered_map< + std::string, + std::vector>> + k_cache_in_; + std::unordered_map< + std::string, + std::vector>> + v_cache_in_; + std::unordered_map< + std::string, + std::vector>> + k_cache_out_; + std::unordered_map< + std::string, + std::vector>> + v_cache_out_; + std::unique_ptr kv_logits_; std::vector shard_layers_; int32_t max_seq_len_; int32_t vocab_size_; int32_t num_layers_; int32_t head_dim_; int32_t num_heads_; + EvalMode eval_mode_; + std::string prefill_forward_name_; + std::string kv_forward_name_; }; } // namespace example diff --git a/examples/qualcomm/oss_scripts/llama3_2/runner/runner.cpp b/examples/qualcomm/oss_scripts/llama3_2/runner/runner.cpp index 80da5b98873..ce784fed500 100644 --- a/examples/qualcomm/oss_scripts/llama3_2/runner/runner.cpp +++ b/examples/qualcomm/oss_scripts/llama3_2/runner/runner.cpp @@ -41,19 +41,22 @@ std::string statsToJsonString(const Runner::Stats& stats); Runner::Runner( const std::vector& models_path, const std::string& tokenizer_path, + const std::string& prompt, + const std::string& system_prompt, const float temperature, const int eval_mode) : n_bos_(1), n_eos_(1), tokenizer_path_(tokenizer_path), temperature_(temperature), - eval_mode_(eval_mode) { + eval_mode_(static_cast(eval_mode)) { for (size_t i = 0; i < models_path.size(); ++i) { modules_.push_back(std::make_shared( models_path[i], Module::LoadMode::MmapUseMlockIgnoreErrors)); ET_LOG(Info, "creating module: model_path=%s", models_path[i].c_str()); } ET_LOG(Info, "creating runner: tokenizer_path=%s", tokenizer_path_.c_str()); + ET_LOG(Info, "eval mode=%d", eval_mode); int64_t max_seq_len = getMetadataHelper("get_max_seq_len", -1); int64_t vocab_size = getMetadataHelper("get_vocab_size", -1); @@ -76,8 +79,47 @@ Runner::Runner( bos_id_ = tokenizer_->bos_tok(); eos_id_.insert(tokenizer_->eos_tok()); + ET_CHECK_MSG(!prompt.empty(), "prompt cannot be null"); + + if (!system_prompt.empty()) { + prompt_.append("<|start_header_id|>system<|end_header_id|>\n\n"); + prompt_.append(system_prompt); + prompt_.append("<|eot_id|>\n"); + } + prompt_.append("<|start_header_id|>user<|end_header_id|>\n\n"); + prompt_.append(prompt); + prompt_.append("<|eot_id|><|start_header_id|>assistant<|end_header_id|>"); + + switch (eval_mode_) { + case EvalMode::kPrefill: + prefill_forward_name_ = "forward"; + method_names_.emplace_back(prefill_forward_name_); + break; + case EvalMode::kKVCached: + kv_forward_name_ = "forward"; + method_names_.emplace_back(kv_forward_name_); + break; + case EvalMode::kHybrid: + prefill_forward_name_ = "prefill_forward"; + kv_forward_name_ = "kv_forward"; + method_names_.emplace_back(prefill_forward_name_); + method_names_.emplace_back(kv_forward_name_); + break; + case EvalMode::kUnsupported: + ET_CHECK_MSG(false, "Unsupported llama version"); + break; + } + io_mem_ = std::make_unique( - modules_, max_seq_len_, vocab_size_, num_layers, head_dim, num_heads); + modules_, + max_seq_len_, + vocab_size_, + num_layers, + head_dim, + num_heads, + eval_mode_, + prefill_forward_name_, + kv_forward_name_); ET_LOG(Info, "creating io_memory"); } @@ -94,7 +136,12 @@ Error Runner::load() { return Error::Ok; } for (std::shared_ptr& module : modules_) { - 
ET_CHECK_OK_OR_RETURN_ERROR(module->load_method("forward")); + if (!prefill_forward_name_.empty()) { + ET_CHECK_OK_OR_RETURN_ERROR(module->load_method(prefill_forward_name_)); + } + if (!kv_forward_name_.empty()) { + ET_CHECK_OK_OR_RETURN_ERROR(module->load_method(kv_forward_name_)); + } } // create sampler @@ -105,12 +152,25 @@ Error Runner::load() { static_cast(std::time(nullptr))); // prepare io - auto methods_meta = get_methods_meta(); - if (eval_mode_ == EvalMode::kBatchPrefill) { - io_mem_->prepare_prefill_io(methods_meta); - } else { - io_mem_->prepare_kv_io(methods_meta); + switch (eval_mode_) { + case EvalMode::kPrefill: + io_mem_->init_io(get_methods_meta(prefill_forward_name_), eval_mode_); + io_mem_->prepare_prefill_io(get_methods_meta(prefill_forward_name_)); + break; + case EvalMode::kKVCached: + io_mem_->init_io(get_methods_meta(kv_forward_name_), eval_mode_); + io_mem_->prepare_kv_io(get_methods_meta(kv_forward_name_)); + break; + case EvalMode::kHybrid: + io_mem_->init_io(get_methods_meta(kv_forward_name_), eval_mode_); + io_mem_->prepare_prefill_io(get_methods_meta(prefill_forward_name_)); + io_mem_->prepare_kv_io(get_methods_meta(kv_forward_name_)); + break; + case EvalMode::kUnsupported: + ET_CHECK_MSG(false, "unsupported mode"); + break; } + return Error::Ok; } @@ -145,65 +205,59 @@ int32_t Runner::logitsToToken(const Tensor& logits_tensor) { return sampler_->sample(logits_last); } -void Runner::run_model_step(std::vector>& inputs) { +void Runner::run_model_step( + const std::string& method_name, + std::vector>& inputs) { for (size_t i = 0, num_modules = modules_.size(); i < num_modules; ++i) { - Result> outputs_res = modules_[i]->forward(inputs[i]); + Result> outputs_res = + modules_[i]->execute(method_name, inputs[i]); ET_CHECK_MSG( outputs_res.error() == Error::Ok, "shard %zu inference failed", i); } } Error Runner::generate( - const std::string& prompt, - const std::string& system_prompt, int32_t seq_len, std::function token_callback, std::function stats_callback) { - ET_CHECK_MSG(!prompt.empty(), "prompt cannot be null"); - - std::vector> input_tensors, output_tensors; - std::vector> inputs; + std::unordered_map>> + input_tensors, output_tensors; + std::unordered_map>> inputs; if (!is_loaded()) { stats_.model_load_start_ms = time_in_ms(); ET_CHECK_OK_OR_RETURN_ERROR(load()); - for (int i = 0; i < modules_.size(); ++i) { - input_tensors.emplace_back(io_mem_->get_input_tensors(i)); - output_tensors.emplace_back(io_mem_->get_output_tensors(i)); - for (size_t j = 0; j < output_tensors[i].size(); ++j) { - ET_CHECK_MSG( - modules_[i]->set_output(output_tensors[i][j], j) == Error::Ok, - "failed to set output tensor for module %d's %zu'th output", - i, - j); + for (auto method_name : method_names_) { + for (int i = 0; i < modules_.size(); ++i) { + input_tensors[method_name].emplace_back( + io_mem_->get_input_tensors(i, method_name)); + output_tensors[method_name].emplace_back( + io_mem_->get_output_tensors(i, method_name)); + for (size_t j = 0; j < output_tensors[method_name][i].size(); ++j) { + ET_CHECK_MSG( + modules_[i]->set_output( + method_name, output_tensors[method_name][i][j], j) == + Error::Ok, + "failed to set output tensor for module %d's %zu'th output", + i, + j); + } + inputs[method_name].emplace_back(std::vector( + begin(input_tensors[method_name][i]), + end(input_tensors[method_name][i]))); } - inputs.emplace_back( - std::vector(begin(input_tensors[i]), end(input_tensors[i]))); } - stats_.model_load_end_ms = time_in_ms(); } - std::string 
post_process_prompt; - - if (!system_prompt.empty()) { - post_process_prompt.append( - "<|start_header_id|>system<|end_header_id|>\n\n"); - post_process_prompt.append(system_prompt); - post_process_prompt.append("<|eot_id|>\n"); - } - post_process_prompt.append("<|start_header_id|>user<|end_header_id|>\n\n"); - post_process_prompt.append(prompt); - post_process_prompt.append( - "<|eot_id|><|start_header_id|>assistant<|end_header_id|>"); - token_callback("<|begin_of_text|>"); - + stats_.model_load_end_ms = time_in_ms(); stats_.inference_start_ms = time_in_ms(); + if (token_callback) { + token_callback("<|begin_of_text|>"); + } seq_len = (seq_len > 0 && seq_len <= max_seq_len_) ? seq_len : max_seq_len_; Result> encode_res = - tokenizer_->encode(post_process_prompt, n_bos_, 0); + tokenizer_->encode(prompt_, n_bos_, 0); ET_CHECK_OK_OR_RETURN_ERROR( - encode_res.error(), - "failed to encode prompt %s", - post_process_prompt.c_str()); + encode_res.error(), "failed to encode prompt %s", prompt_.c_str()); std::vector prompt_tokens = encode_res.get(); int num_prompt_tokens = prompt_tokens.size(); @@ -211,58 +265,74 @@ Error Runner::generate( ET_CHECK_MSG( num_prompt_tokens < seq_len, "sequence length exceeded - please increase the seq_len value"); + if (eval_mode_ == EvalMode::kHybrid) { + int prefill_seq_len = get_methods_meta(prefill_forward_name_)[0] + ->input_tensor_meta(0) + ->sizes()[1] + + 1; + ET_CHECK_MSG( + num_prompt_tokens < prefill_seq_len, + "For hybrid mode, please ensure prompt length(%d) is less than prefill's seq_len(%d)", + num_prompt_tokens, + prefill_seq_len); + } int64_t pos = 0, prev_token, cur_token = prompt_tokens[0]; HybridMemory::IO* ptr = static_cast(io_mem_->get_mutable_ptr()); - if (eval_mode_ == EvalMode::kBatchPrefill) { + auto prefill_execute = [&](const std::string& method_name) { for (int i = 0; i < num_prompt_tokens; i++) { ptr->prefill_input_toks[i] = static_cast(prompt_tokens[i]); auto piece_res = tokenizer_->decode(prompt_tokens[i], prompt_tokens[i]); token_callback(piece_res.get()); } // inference - run_model_step(inputs); - Tensor& logits_tensor = output_tensors.back()[0]; + run_model_step(method_name, inputs[method_name]); + Tensor& logits_tensor = output_tensors[method_name].back()[0]; // offset to the meaningful logit we want. 
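+    // The prefill logits buffer holds cache_len * vocab_size_ entries (one
+    // row per position), so the row for the last prompt token starts at
+    // (num_prompt_tokens - 1) * vocab_size_.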
float* logits = logits_tensor.mutable_data_ptr() + (num_prompt_tokens - 1) * vocab_size_; prev_token = prompt_tokens[num_prompt_tokens - 1]; + long sample_start_time_ms = time_in_ms(); cur_token = sampler_->sample(logits); + stats_.aggregate_sampling_time_ms += time_in_ms() - sample_start_time_ms; stats_.first_token_ms = time_in_ms(); stats_.prompt_eval_end_ms = time_in_ms(); - long sample_start_time_ms = time_in_ms(); - stats_.aggregate_sampling_time_ms += time_in_ms() - sample_start_time_ms; auto piece_res = tokenizer_->decode(prev_token, cur_token); ET_CHECK(piece_res.ok()); if (token_callback) { token_callback(piece_res.get().c_str()); } pos += num_prompt_tokens; - } else { + }; + + auto kv_execute = [&](const std::string& method_name) { ptr->input_tok = static_cast(cur_token); - ptr->attention_mask[max_seq_len_ - 1] = 0; + ptr->kv_attention_mask[max_seq_len_ - 1] = 0; while (pos < seq_len - 1) { // inference - run_model_step(inputs); - Tensor& logits_tensor = output_tensors.back()[0]; - - if (pos == num_prompt_tokens) { - stats_.first_token_ms = time_in_ms(); - } else if (pos == num_prompt_tokens - 1) { - stats_.prompt_eval_end_ms = time_in_ms(); + run_model_step(method_name, inputs[method_name]); + Tensor& logits_tensor = output_tensors[method_name].back()[0]; + + // hybrid mode will check these stats_ at prefill(prefill) + if (eval_mode_ == EvalMode::kKVCached) { + if (pos == num_prompt_tokens) { + stats_.first_token_ms = time_in_ms(); + } else if (pos == num_prompt_tokens - 1) { + stats_.prompt_eval_end_ms = time_in_ms(); + } } - long sample_start_time_ms = time_in_ms(); prev_token = cur_token; + long sample_start_time_ms = time_in_ms(); cur_token = logitsToToken(logits_tensor); stats_.aggregate_sampling_time_ms += time_in_ms() - sample_start_time_ms; if (pos < num_prompt_tokens - 1) { cur_token = prompt_tokens[pos + 1]; } - io_mem_->update_io(cur_token, ++pos, output_tensors); + io_mem_->update_kv_io(cur_token, ++pos, output_tensors[method_name]); auto piece_res = tokenizer_->decode(prev_token, cur_token); ET_CHECK(piece_res.ok()); @@ -275,8 +345,25 @@ Error Runner::generate( break; } } + }; + + switch (eval_mode_) { + case EvalMode::kPrefill: + prefill_execute(prefill_forward_name_); + break; + case EvalMode::kKVCached: + kv_execute(kv_forward_name_); + break; + case EvalMode::kHybrid: + prefill_execute(prefill_forward_name_); + io_mem_->update_prefill_to_kv_io( + cur_token, pos, output_tensors[kv_forward_name_]); + kv_execute(kv_forward_name_); + break; + default: + ET_CHECK_MSG(false, "Unsupported eval mode"); + break; } - stats_.inference_end_ms = time_in_ms(); if (pos == seq_len) { ET_LOG(Info, "\nSequence length (%i tokens) reached!", seq_len); @@ -348,7 +435,7 @@ void printReport(const Runner::Stats& stats) { ET_LOG( Info, "\tSampling time over %" PRIu64 " tokens:\t%f (seconds)", - stats.num_prompt_tokens + stats.num_generated_tokens, + stats.num_generated_tokens, (double)stats.aggregate_sampling_time_ms / stats.SCALING_FACTOR_UNITS_PER_SECOND); } @@ -370,11 +457,12 @@ std::string statsToJsonString(const Runner::Stats& stats) { } } // namespace -std::vector> Runner::get_methods_meta() { +std::vector> Runner::get_methods_meta( + std::string& method_name) { std::vector> methods_meta; methods_meta.reserve(modules_.size()); for (std::shared_ptr& module : modules_) { - methods_meta.emplace_back(module->method_meta("forward")); + methods_meta.emplace_back(module->method_meta(method_name)); } return methods_meta; } diff --git 
a/examples/qualcomm/oss_scripts/llama3_2/runner/runner.h b/examples/qualcomm/oss_scripts/llama3_2/runner/runner.h index b720697be5f..3f0248872d5 100644 --- a/examples/qualcomm/oss_scripts/llama3_2/runner/runner.h +++ b/examples/qualcomm/oss_scripts/llama3_2/runner/runner.h @@ -29,6 +29,8 @@ class Runner { explicit Runner( const std::vector& models_path, const std::string& tokenizer_path, + const std::string& prompt, + const std::string& system_prompt, const float temperature, const int eval_mode); @@ -61,27 +63,23 @@ class Runner { bool is_loaded() const; executorch::runtime::Error load(); executorch::runtime::Error generate( - const std::string& prompt, - const std::string& system_prompt, int32_t seq_len, std::function token_callback = {}, std::function stats_callback = {}); void stop(); std::vector> - get_methods_meta(); + get_methods_meta(std::string& method_name); private: - enum EvalMode { - kBatchPrefill = 0, - kKVCached, - kUnsupported, - }; template T getMetadataHelper(std::string method_name, T default_val); template int32_t logitsToToken(const executorch::aten::Tensor& logits_tensor); void run_model_step( + const std::string& method_name, std::vector>& inputs); + std::string prompt_; + // metadata int32_t max_seq_len_; int32_t vocab_size_; @@ -96,7 +94,10 @@ class Runner { std::unique_ptr sampler_; Stats stats_; std::unique_ptr io_mem_; - int32_t eval_mode_; + EvalMode eval_mode_; + std::string prefill_forward_name_; + std::string kv_forward_name_; + std::vector method_names_; }; } // namespace example
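For reference, a minimal sketch of how the reworked Runner interface above is
driven, mirroring qnn_llama3_2_runner.cpp. The include path, file paths, prompt
text, temperature, and seq_len below are illustrative placeholders and are not
part of this patch:

#include "runner.h" // runner.h from examples/qualcomm/oss_scripts/llama3_2/runner

#include <fstream>
#include <string>
#include <vector>

int main() {
  // eval_mode: 0 = prefill, 1 = kv, 2 = hybrid (prefill + kv), matching the
  // --eval_mode flag documented above.
  example::Runner runner(
      {"/path/to/hybrid_llama3_2_qnn.pte"}, // placeholder model path
      "/path/to/tokenizer.model",           // placeholder tokenizer path
      "What is the capital of France?",     // prompt (now passed at construction)
      "",                                   // optional system prompt
      /*temperature=*/0.8f,
      /*eval_mode=*/2);

  // generate() no longer takes the prompt; it only needs seq_len and callbacks.
  std::string out;
  runner.generate(
      /*seq_len=*/128, [&](const std::string& piece) { out += piece; });

  std::ofstream fout("outputs.txt");
  fout << out;
  return 0;
}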