diff --git a/ggml/src/ggml-qnn/npu/CMakeLists.txt b/ggml/src/ggml-qnn/npu/CMakeLists.txt index 5f1009bb9bea6..5e1281c3d5cf4 100644 --- a/ggml/src/ggml-qnn/npu/CMakeLists.txt +++ b/ggml/src/ggml-qnn/npu/CMakeLists.txt @@ -231,6 +231,11 @@ else() build_idl(idl/hexagon_npu.idl hexagon_npu_skel_OBJS) + add_subdirectory(${HEXAGON_SDK_ROOT}/libs/qprintf qprintf_dir) + target_include_directories(hexagon_npu_skel_OBJS PUBLIC + ${HEXAGON_SDK_ROOT}/libs/qprintf/inc/ + ) + # disable warnings for the skel set_source_files_properties( ${skel_srcs} @@ -239,12 +244,12 @@ else() ) add_library(hexagon_npu_skel SHARED $) - target_link_libraries(hexagon_npu_skel ${HEXAGON_LIB_DIR}/${HEXAGON_ARCH}/G0/pic/libc++abi.a ${HEXAGON_LIB_DIR}/${HEXAGON_ARCH}/G0/pic/libc++.a ) set_target_properties(hexagon_npu_skel PROPERTIES OUTPUT_NAME "hexagon_npu_skel_${HEXAGON_ARCH}") + target_link_libraries(hexagon_npu_skel qprintf_static) copy_binaries(hexagon_npu_skel) endif() diff --git a/ggml/src/ggml-qnn/npu/device/device.cpp b/ggml/src/ggml-qnn/npu/device/device.cpp index fbed4b0a28fa6..8a10e9e7525b1 100644 --- a/ggml/src/ggml-qnn/npu/device/device.cpp +++ b/ggml/src/ggml-qnn/npu/device/device.cpp @@ -106,6 +106,7 @@ int npu_device_open(const char * uri, remote_handle64 * h) { } *h = reinterpret_cast(context); + DEVICE_LOG_INFO("NPU device context created: %p", (void *) *h); return AEE_SUCCESS; } @@ -117,6 +118,7 @@ int npu_device_close(remote_handle64 h) { } delete context; + DEVICE_LOG_INFO("NPU device context destroyed: %p", (void *) h); return AEE_SUCCESS; } @@ -130,6 +132,12 @@ AEEResult npu_device_device_support_op(remote_handle64 _h, const npu_device_tens const npu_device_tensor_spec * src1, const npu_device_tensor_spec * dst, npu_device_tensor_op op, boolean * is_supported) { NPU_UNUSED(_h); + + if (!src0 || !src1 || !dst || !is_supported) { + DEVICE_LOG_ERROR("npu_device_device_support_op: Invalid arguments"); + return AEE_EINVARGS; + } + *is_supported = hexagon::support_op(*src0, *src1, *dst, op); return AEE_SUCCESS; } @@ -147,28 +155,15 @@ AEEResult npu_device_tensor_init(remote_handle64 _h, const npu_device_tensor_con return AEE_SUCCESS; } -AEEResult npu_device_tensor_set_src(remote_handle64 _h, npu_device_tensor_handle_t tensor_handle, uint64_t index, - npu_device_tensor_handle_t src) { +AEEResult npu_device_tensor_update_params(remote_handle64 _h, npu_device_tensor_handle_t tensor_handle, + const npu_device_tensor_update_config * config) { NPU_UNUSED(_h); auto * tensor = tensor_from_handle(tensor_handle); - if (!tensor) { - return AEE_EINVHANDLE; - } - - auto * src_tensor = tensor_from_handle(src); - tensor->set_src(index, src_tensor); - return AEE_SUCCESS; -} - -AEEResult npu_device_tensor_set_op(remote_handle64 _h, npu_device_tensor_handle_t tensor_handle, - npu_device_tensor_op op) { - NPU_UNUSED(_h); - auto * tensor = tensor_from_handle(tensor_handle); - if (!tensor) { + if (!tensor || !config) { return AEE_EINVHANDLE; } - tensor->set_op(op); + tensor->update_config(*config); return AEE_SUCCESS; } @@ -206,6 +201,29 @@ AEEResult npu_device_graph_set_tensor(remote_handle64 _h, npu_device_graph_handl return AEE_SUCCESS; } +AEEResult npu_device_graph_set_tensor_with_param(remote_handle64 _h, npu_device_graph_handle_t graph_handle, + const npu_device_tensor_handle_t * tensor_handles, + int tensor_handlesLen, + const npu_device_tensor_update_config * tensor_params, + int tensor_paramsLen) { + NPU_UNUSED(_h); + auto * graph = graph_from_handle(graph_handle); + if (!graph || !tensor_handles || tensor_handlesLen 
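Side note on the consolidated update path above: the separate npu_device_tensor_set_src / npu_device_tensor_set_op RPCs are folded into a single call that ships the op, its params and all source handles in one npu_device_tensor_update_config. A minimal host-side sketch of that shape follows; the sizes and the plain_update_config / plain_tensor names are illustrative stand-ins, not the generated IDL types.

#include <array>
#include <cstdint>

// Stand-in for npu_device_tensor_update_config: everything the device needs to
// re-parameterize a tensor travels in one struct instead of several RPC calls.
struct plain_update_config {
    int32_t                 op          = 0;    // operation id (e.g. an NPU_OP_* value)
    std::array<int32_t, 4>  params      = {};   // op params copied verbatim from ggml
    std::array<uint64_t, 2> src_handles = {};   // device tensor handles, 0 == no source
};

struct plain_tensor {
    int32_t                 op = 0;
    std::array<int32_t, 4>  op_params = {};
    std::array<uint64_t, 2> srcs = {};

    // Mirrors tensor::update_config() later in this patch: one copy, no partial state.
    void update(const plain_update_config & cfg) {
        op        = cfg.op;
        op_params = cfg.params;
        srcs      = cfg.src_handles;
    }
};

int main() {
    plain_update_config cfg;
    cfg.op             = 5;        // pretend op id
    cfg.params[0]      = 42;       // opaque payload, interpreted by the op itself
    cfg.src_handles[0] = 0x1234;   // pretend src0 handle

    plain_tensor t;
    t.update(cfg);                 // replaces set_op() plus one set_src() per source
    return t.op == 5 ? 0 : 1;
}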
<= 0 || !tensor_params || + tensor_handlesLen != tensor_paramsLen) { + return AEE_EINVHANDLE; + } + + graph->set_tensor(tensor_handles, tensor_handlesLen); + for (int i = 0; i < tensor_handlesLen; ++i) { + auto * tensor = tensor_from_handle(tensor_handles[i]); + if (tensor) { + tensor->update_config(tensor_params[i]); + } + } + + return AEE_SUCCESS; +} + AEEResult npu_device_graph_compute(remote_handle64 _h, npu_device_graph_handle_t graph_handle) { auto dev_ctx = device_context_from_handle(_h); if (!dev_ctx) { diff --git a/ggml/src/ggml-qnn/npu/device/graph.cpp b/ggml/src/ggml-qnn/npu/device/graph.cpp index 5201edefea924..c9cad772320f1 100644 --- a/ggml/src/ggml-qnn/npu/device/graph.cpp +++ b/ggml/src/ggml-qnn/npu/device/graph.cpp @@ -10,7 +10,8 @@ namespace hexagon { graph::graph() noexcept { - DEVICE_LOG_DEBUG("graph(%p) created\n", (void *) this); + _vtcm_quota_size = hexagon::vtcm_mem::get_avail_block_size(); // TODO: move to device init? + DEVICE_LOG_DEBUG("graph(%p) created: vtcm quota size: %zu\n", (void *) this, _vtcm_quota_size); } graph::~graph() noexcept { @@ -45,6 +46,8 @@ bool graph::compute(default_thread_pool * thread_pool, const float * f16_to_f32_ } DEVICE_LOG_DEBUG("graph(%p) compute\n", (void *) this); + + DEVICE_SCOPED_PERFORMANCE_TRACKER("[%p]compute", (void *) this); _f16_to_f32_table = f16_to_f32_table; if (thread_pool) { thread_pool->sync_execute(reinterpret_cast(&graph::thread_pool_task), this); @@ -61,6 +64,8 @@ void graph::thread_pool_task(default_thread_pool * pool, size_t thread_idx, size } void graph::compute_impl(default_thread_pool * pool, size_t thread_idx, size_t thread_count) { + hexagon::compute_params params = { thread_idx, thread_count, _vtcm_quota_size / thread_count, _f16_to_f32_table }; + for (size_t i = 0; i < _tensor_count; ++i) { auto * dst = _tensors[i]; auto op = dst->get_op(); @@ -69,14 +74,14 @@ void graph::compute_impl(default_thread_pool * pool, size_t thread_idx, size_t t DEVICE_LOG_ERROR("graph(%p) tensor[%zu] op %d not supported\n", (void *) this, i, op); return; } - - hexagon::compute_params params = { thread_idx, thread_count, _f16_to_f32_table }; if (!func(dst, ¶ms)) { DEVICE_LOG_ERROR("graph(%p) tensor[%zu] op %d compute failed\n", (void *) this, i, op); } - // TODO: figure out which ops need to sync - if (pool) { + DEVICE_SCOPED_PERFORMANCE_TRACKER("[%p]sync_thread, tidx: %zu", (void *) this, thread_idx); + + const bool should_sync = requires_thread_barrier(op); + if (pool && should_sync && i < _tensor_count - 1) { pool->sync_thread(); } dst->invalidate(); diff --git a/ggml/src/ggml-qnn/npu/device/graph.hpp b/ggml/src/ggml-qnn/npu/device/graph.hpp index 126d2541786a0..c6b68c4eeadd9 100644 --- a/ggml/src/ggml-qnn/npu/device/graph.hpp +++ b/ggml/src/ggml-qnn/npu/device/graph.hpp @@ -25,6 +25,7 @@ class graph { std::unique_ptr _tensors; size_t _tensor_count = 0; + size_t _vtcm_quota_size = 0; const float * _f16_to_f32_table = nullptr; DISABLE_COPY_AND_MOVE(graph); diff --git a/ggml/src/ggml-qnn/npu/device/op_impl.cpp b/ggml/src/ggml-qnn/npu/device/op_impl.cpp index d68fd9a53b4d4..777072024a450 100644 --- a/ggml/src/ggml-qnn/npu/device/op_impl.cpp +++ b/ggml/src/ggml-qnn/npu/device/op_impl.cpp @@ -5,6 +5,8 @@ #include #include +#include + #include "op_mul_mat.hpp" #include "quants.hpp" @@ -17,7 +19,7 @@ inline void vec_op_impl(const _TyData * src0, const _TyData * src1, size_t count HVX_Vector * iptr0 = ((HVX_Vector *) src0); HVX_Vector * iptr0_end = ((HVX_Vector *) src0) + (count / kElementsPerVector); HVX_Vector * iptr1 = 
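To see the new scheduling in graph::compute_impl in isolation: each worker gets an equal share of the VTCM quota, and the inter-op barrier is only taken for ops flagged with requires_thread_barrier, and never after the last op. A toy model of just that control flow; C++20 std::barrier and the op list are stand-ins for the QuRT barrier and the real graph, and MUL_MAT is the one op this patch marks as needing the sync.

#include <barrier>
#include <cstdio>
#include <thread>
#include <vector>

// Toy model of graph::compute_impl(): every thread walks the same op list, but
// the barrier is only taken for ops whose partial results must be visible to
// all threads before the next op, and never after the final op.
struct toy_op {
    const char * name;
    bool needs_barrier;
};

int main() {
    const std::vector<toy_op> ops = {
        { "MUL_MAT",  true  },   // combined across threads -> sync
        { "ADD",      false },   // purely row-parallel -> no sync
        { "RMS_NORM", false },
    };

    constexpr size_t thread_count = 4;
    constexpr size_t vtcm_total   = 256 * 1024;             // pretend VTCM quota
    const size_t     vtcm_per_thr = vtcm_total / thread_count;

    std::barrier<> sync(thread_count);
    auto worker = [&](size_t tidx) {
        for (size_t i = 0; i < ops.size(); ++i) {
            std::printf("t%zu: %s (vtcm quota %zu)\n", tidx, ops[i].name, vtcm_per_thr);
            if (ops[i].needs_barrier && i + 1 < ops.size()) {
                sync.arrive_and_wait();                      // only where an op requires it
            }
        }
    };

    std::vector<std::thread> threads;
    for (size_t t = 0; t < thread_count; ++t) threads.emplace_back(worker, t);
    for (auto & th : threads) th.join();
    return 0;
}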
((HVX_Vector *) src1); - HVX_Vector * optr = ((HVX_Vector *) dst); + HVX_Vector * optr = ((HVX_Vector *) dst); // framework will ensure the dst is aligned HVX_Vector prev0 = *iptr0++; HVX_Vector prev1 = *iptr1++; @@ -108,6 +110,12 @@ template struct get_data_type +struct get_data_type { + using type = _TyData; + using param_type = typename std::remove_cv::type>::type; +}; + template bool element_wise_op(hexagon::tensor * out, hexagon::compute_params * params) { using data_type = typename get_data_type::type; @@ -166,6 +174,16 @@ template bool element_wise_op(hexagon::tensor * out, hexagon::co return true; } +bool is_same_shape(const npu_device_tensor_spec & src, const npu_device_tensor_spec & dst) { + for (size_t i = 0; i < DEVICE_TENSOR_MAX_DIMS; ++i) { + if (src.ne[i] != dst.ne[i]) { + return false; + } + } + + return true; +} + bool is_element_wise_op_supported(const npu_device_tensor_spec & src0, const npu_device_tensor_spec & src1, const npu_device_tensor_spec & dst, npu_device_tensor_op op) { if (op != NPU_OP_ADD && op != NPU_OP_SUB && op != NPU_OP_MUL) { @@ -196,12 +214,149 @@ bool is_element_wise_op_supported(const npu_device_tensor_spec & src0, const npu return false; } - for (size_t i = 0; i < DEVICE_TENSOR_MAX_DIMS; ++i) { - if (src0.ne[i] != dst.ne[i]) { - DEVICE_LOG_DEBUG("[%s]src0.ne[%zu] and dst.ne[%zu] not match: %lld vs %lld\n", hexagon::op_get_name(op), i, - i, (long long) src0.ne[i], (long long) dst.ne[i]); - return false; + if (!is_same_shape(src0, dst)) { + DEVICE_LOG_DEBUG("[%s]src0 and dst have different shape\n", hexagon::op_get_name(op)); + return false; + } + + return true; +} + +void rms_norm_vec_f32(const float * src, size_t count, float eps, float * dst) { + constexpr const size_t kElementsPerVector = hexagon::kBytesPerVector / sizeof(float); + + HVX_Vector * src_vec_ptr = ((HVX_Vector *) src); + HVX_Vector * src_vec_end = ((HVX_Vector *) src) + (count / kElementsPerVector); + HVX_Vector prev = *src_vec_ptr++; + HVX_Vector sum = Q6_V_vzero(); + while (src_vec_ptr < src_vec_end) { + HVX_Vector curr = *src_vec_ptr++; + HVX_Vector s0 = Q6_V_valign_VVR(curr, prev, (size_t) src); + sum = Q6_Vqf32_vadd_Vqf32Vqf32(sum, Q6_Vqf32_vmpy_VsfVsf(s0, s0)); + prev = curr; + } + + if ((src_vec_end - ((HVX_Vector *) src)) > 0) { + // handle the last vector + bool src_ptr_aligned = hexagon::is_addr_aligned(src_vec_ptr); + HVX_Vector curr = src_ptr_aligned ? prev : *src_vec_ptr; + src_vec_ptr = src_ptr_aligned ? src_vec_ptr : src_vec_ptr + 1; + HVX_Vector s0 = Q6_V_valign_VVR(curr, prev, (size_t) src); + sum = Q6_Vqf32_vadd_Vqf32Vqf32(sum, Q6_Vqf32_vmpy_VsfVsf(s0, s0)); + prev = curr; + } + + const size_t leftover = count % kElementsPerVector; + const size_t leftover_bytes = leftover * sizeof(float); + if (leftover > 0) { + // handle the leftover elements + HVX_Vector curr = + (leftover_bytes + hexagon::unaligned_bytes(src_vec_ptr) > hexagon::kBytesPerVector) ? *src_vec_ptr : prev; + curr = Q6_V_valign_VVR(curr, prev, (size_t) src); + sum = Q6_Vqf32_vadd_Vqf32Vqf32(sum, + Q6_V_valign_VVR(Q6_Vqf32_vmpy_VsfVsf(curr, curr), Q6_V_vzero(), leftover_bytes)); + } + + const float mean = hexagon::vec_reduction_f32(sum) / count; // TODO: figure out how to do division in vector + const float scale = 1.0f / sqrtf(mean + eps); // TODO: use buildin blas sqrtf? 
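A scalar reference for what rms_norm_vec_f32 computes may help when reading the HVX version: sum of squares, mean, scale = 1/sqrt(mean + eps), then one multiply per element. This is a sketch only; it uses std::sqrt directly instead of the lookup-table/vector tricks, and eps arrives the same way the op param does.

#include <cmath>
#include <cstddef>
#include <cstdio>

// Scalar reference for the HVX rms_norm_vec_f32 above:
// scale = 1 / sqrt(mean(x^2) + eps), dst[i] = src[i] * scale.
void rms_norm_ref(const float * src, size_t count, float eps, float * dst) {
    float sum_sq = 0.0f;
    for (size_t i = 0; i < count; ++i) {
        sum_sq += src[i] * src[i];
    }
    const float mean  = sum_sq / (float) count;
    const float scale = 1.0f / std::sqrt(mean + eps);
    for (size_t i = 0; i < count; ++i) {
        dst[i] = src[i] * scale;
    }
}

int main() {
    float x[8] = { 1, 2, 3, 4, 5, 6, 7, 8 };
    float y[8];
    rms_norm_ref(x, 8, 1e-6f, y);
    std::printf("%f %f\n", y[0], y[7]);   // handy as a cross-check against the vector path
    return 0;
}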
+ + HVX_Vector scale_vec = Q6_V_vsplat_R(reinterpret_cast(scale)); + src_vec_ptr = ((HVX_Vector *) src); + prev = *src_vec_ptr++; + HVX_Vector * dst_vec_ptr = ((HVX_Vector *) dst); // framework will ensure the dst is aligned + while (src_vec_ptr < src_vec_end) { + HVX_Vector curr = *src_vec_ptr++; + HVX_Vector s0 = Q6_V_valign_VVR(curr, prev, (size_t) src); + *dst_vec_ptr++ = Q6_Vsf_equals_Vqf32(Q6_Vqf32_vmpy_VsfVsf(s0, scale_vec)); + prev = curr; + } + + if ((src_vec_end - ((HVX_Vector *) src)) > 0) { + // handle the last vector + bool src_ptr_aligned = hexagon::is_addr_aligned(src_vec_ptr); + HVX_Vector curr = src_ptr_aligned ? prev : *src_vec_ptr; + src_vec_ptr = src_ptr_aligned ? src_vec_ptr : src_vec_ptr + 1; + HVX_Vector s0 = Q6_V_valign_VVR(curr, prev, (size_t) src); + *dst_vec_ptr++ = Q6_Vsf_equals_Vqf32(Q6_Vqf32_vmpy_VsfVsf(s0, scale_vec)); + prev = curr; + } + + if (leftover > 0) { + // handle the leftover elements + HVX_Vector curr = + (leftover_bytes + hexagon::unaligned_bytes(src_vec_ptr) > hexagon::kBytesPerVector) ? *src_vec_ptr : prev; + curr = Q6_V_valign_VVR(curr, prev, (size_t) src); + q6op_vstu_variable_ARV(dst_vec_ptr, leftover_bytes, Q6_Vsf_equals_Vqf32(Q6_Vqf32_vmpy_VsfVsf(curr, scale_vec))); + } +} + +// TODO: merge with element_wise_op? +template bool unary_op(hexagon::tensor * out, hexagon::compute_params * params) { + using data_type = typename get_data_type::type; + using param_type = typename get_data_type::param_type; + + if (!out) { + return false; + } + + static_assert(DEVICE_TENSOR_MAX_DIMS == 4, "element_wise_op requires max dims 4"); + auto * src0 = out->get_src(0); + if (!src0) { + return true; // skip if no src + } + + const auto * src0_ptr = reinterpret_cast(src0->get_read_buffer()); + auto * dst_ptr = reinterpret_cast(out->get_write_buffer()); + auto total_rows = out->get_ne(3) * out->get_ne(2) * out->get_ne(1); + const auto rows_per_cube = out->get_ne(2) * out->get_ne(1); + const auto start_end = hexagon::get_thread_work_slice(total_rows, params->tidx, params->tcnt); + if (start_end.first >= start_end.second) { + return true; + } + + DEVICE_SCOPED_OP_PERFORMANCE_TRACKER(out, params->tidx); + + const auto param = out->get_op_param(0); + const size_t valid_row_bytes = src0->get_ne(0) * sizeof(data_type); + for (int64_t ir = start_end.first; ir < start_end.second; ++ir) { + const auto i03 = ir / rows_per_cube; + const auto i02 = ir / out->get_ne(1) - i03 * out->get_ne(2); + const auto i01 = ir % out->get_ne(1); // TODO: should we use divide instead of mod? 
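The flat row index used by unary_op (and element_wise_op) decomposes back into (i03, i02, i01) with the divide/subtract/modulo arithmetic shown above, and the row address is then formed from the byte strides nb[1..3]. A small self-contained check of that mapping, with made-up shapes and strides:

#include <cassert>
#include <cstddef>
#include <cstdint>

// Sketch of the flat-row-index decomposition used in unary_op/element_wise_op:
// rows are enumerated as ir in [0, ne3*ne2*ne1) and mapped back to (i03, i02, i01),
// then turned into a byte offset with the nb[] strides. Values here are made up.
int main() {
    const int64_t ne1 = 5, ne2 = 3, ne3 = 2;                     // rows per dim (ne[0] is the row)
    const size_t  nb1 = 512, nb2 = nb1 * ne1, nb3 = nb2 * ne2;   // contiguous byte strides

    const int64_t rows_per_cube = ne2 * ne1;
    for (int64_t ir = 0; ir < ne3 * ne2 * ne1; ++ir) {
        const int64_t i03 = ir / rows_per_cube;
        const int64_t i02 = ir / ne1 - i03 * ne2;                // same arithmetic as the patch
        const int64_t i01 = ir % ne1;

        // round-trip check: the decomposition enumerates each row exactly once
        assert(i03 * rows_per_cube + i02 * ne1 + i01 == ir);

        const size_t row_offset = i03 * nb3 + i02 * nb2 + i01 * nb1;
        (void) row_offset;                                       // would be added to the base pointer
    }
    return 0;
}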
+ + auto * src0_row = src0_ptr + i03 * src0->get_nb(3) + i02 * src0->get_nb(2) + i01 * src0->get_nb(1); + auto * dst_row = dst_ptr + i03 * out->get_nb(3) + i02 * out->get_nb(2) + i01 * out->get_nb(1); + if (ir + 1 < start_end.second) { + hexagon::l2fetch_row(src0_row + src0->get_nb(1), valid_row_bytes); } + + _RowFunc(reinterpret_cast(src0_row), static_cast(out->get_ne(0)), param, + reinterpret_cast(dst_row)); + } + + return true; +} + +bool is_unary_op_supported(const npu_device_tensor_spec & src0, const npu_device_tensor_spec & src1, + const npu_device_tensor_spec & dst, npu_device_tensor_op op) { + if (op != NPU_OP_RMS_NORM) { + DEVICE_LOG_DEBUG("[%s]unsupported\n", hexagon::op_get_name(op)); + return false; + } + + if (dst.type != src0.type) { + DEVICE_LOG_DEBUG("[%s]src0.type and dst.type mismatch: %s vs %s\n", hexagon::op_get_name(op), + hexagon::get_type_name(src0.type), hexagon::get_type_name(dst.type)); + return false; + } + + if (dst.type != NPU_DATA_TYPE_F32) { + DEVICE_LOG_DEBUG("[%s]unsupported data type: %s\n", hexagon::op_get_name(op), hexagon::get_type_name(dst.type)); + return false; + } + + if (!is_same_shape(src0, dst)) { + DEVICE_LOG_DEBUG("[%s]src0 and dst have different shape\n", hexagon::op_get_name(op)); + return false; } return true; @@ -211,6 +366,7 @@ struct op_capabilities { npu_device_tensor_op op; hexagon::op_is_supported_func_type is_supported; hexagon::compute_func_type compute_funcs[NPU_DATA_TYPE_COUNT]; + bool requires_thread_barrier = false; }; constexpr const op_capabilities kOpCapabilities[] = { @@ -219,22 +375,36 @@ constexpr const op_capabilities kOpCapabilities[] = { { hexagon::mul_mat_f32, // NPU_DATA_TYPE_F32 nullptr, // NPU_DATA_TYPE_F16 - }, }, - { NPU_OP_ADD, - is_element_wise_op_supported, { - element_wise_op>, // NPU_DATA_TYPE_F32 - element_wise_op>, // NPU_DATA_TYPE_F16 - } }, - { NPU_OP_SUB, - is_element_wise_op_supported, { - element_wise_op>, // NPU_DATA_TYPE_F32 - element_wise_op>, // NPU_DATA_TYPE_F16 - } }, - { NPU_OP_MUL, - is_element_wise_op_supported, { - element_wise_op>, // NPU_DATA_TYPE_F32 - element_wise_op>, // NPU_DATA_TYPE_F16 - } }, + }, true, + }, + { + NPU_OP_ADD, is_element_wise_op_supported, + { + element_wise_op>, // NPU_DATA_TYPE_F32 + element_wise_op>, // NPU_DATA_TYPE_F16 + }, false, + }, + { + NPU_OP_SUB, is_element_wise_op_supported, + { + element_wise_op>, // NPU_DATA_TYPE_F32 + element_wise_op>, // NPU_DATA_TYPE_F16 + }, false, + }, + { + NPU_OP_MUL, is_element_wise_op_supported, + { + element_wise_op>, // NPU_DATA_TYPE_F32 + element_wise_op>, // NPU_DATA_TYPE_F16 + }, false, + }, + { + NPU_OP_RMS_NORM, is_unary_op_supported, + { + unary_op, // NPU_DATA_TYPE_F32 + nullptr, // NPU_DATA_TYPE_F16 + }, false, + }, }; static_assert(kOpCapabilities[NPU_OP_MUL_MAT].compute_funcs[NPU_DATA_TYPE_F32] == hexagon::mul_mat_f32, @@ -243,6 +413,8 @@ static_assert(kOpCapabilities[NPU_OP_MUL_MAT].compute_funcs[NPU_DATA_TYPE_F32] = static_assert(std::size(kOpCapabilities) == NPU_OP_COUNT); static_assert(kOpCapabilities[NPU_OP_MUL_MAT].op == NPU_OP_MUL_MAT, "kOpArray[NPU_OP_MUL_MAT].op != NPU_OP_MUL_MAT"); static_assert(kOpCapabilities[NPU_OP_MUL].op == NPU_OP_MUL, "kOpArray[NPU_OP_MUL].op != NPU_OP_MUL"); +static_assert(kOpCapabilities[NPU_OP_RMS_NORM].op == NPU_OP_RMS_NORM, + "kOpArray[NPU_OP_RMS_NORM].op != NPU_OP_RMS_NORM"); hexagon::compute_func_type get_compute_func_impl(npu_device_tensor_op op, npu_device_tensor_data_type type) { if (op >= NPU_OP_COUNT) { @@ -260,6 +432,14 @@ compute_func_type get_compute_func(tensor * dst) { 
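The reshaped kOpCapabilities table pairs each op with per-dtype compute functions plus the new requires_thread_barrier flag, and the static_asserts pin the row order to the enum. A toy version of the same pattern, with stand-in enum values and stub functions rather than the real NPU_OP_* entries:

#include <cstddef>

// Toy version of the kOpCapabilities pattern: a constexpr table indexed by the
// op enum, one compute function per data type, plus a per-op barrier flag.
enum toy_op   { TOY_OP_MUL_MAT, TOY_OP_ADD, TOY_OP_COUNT };
enum toy_type { TOY_TYPE_F32, TOY_TYPE_F16, TOY_TYPE_COUNT };

using compute_fn = bool (*)(float * dst, const float * src, size_t count);

static bool mul_mat_f32_stub(float *, const float *, size_t) { return true; }
static bool add_f32_stub(float *, const float *, size_t) { return true; }

struct op_caps {
    toy_op     op;
    compute_fn funcs[TOY_TYPE_COUNT];
    bool       requires_thread_barrier;
};

constexpr op_caps kCaps[] = {
    { TOY_OP_MUL_MAT, { mul_mat_f32_stub, nullptr }, true  },   // partial results need a sync
    { TOY_OP_ADD,     { add_f32_stub,     nullptr }, false },   // purely row-parallel
};

// Same compile-time guards as the patch: the table must stay in enum order.
static_assert(sizeof(kCaps) / sizeof(kCaps[0]) == TOY_OP_COUNT, "table size != op count");
static_assert(kCaps[TOY_OP_MUL_MAT].op == TOY_OP_MUL_MAT, "row order must match the enum");

compute_fn get_compute_fn(toy_op op, toy_type type) {
    if (op >= TOY_OP_COUNT || type >= TOY_TYPE_COUNT) {
        return nullptr;
    }
    return kCaps[op].funcs[type];
}

int main() { return get_compute_fn(TOY_OP_ADD, TOY_TYPE_F32) != nullptr ? 0 : 1; }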
return get_compute_func_impl(dst->get_op(), dst->get_type()); } +bool requires_thread_barrier(npu_device_tensor_op op) { + if (op >= NPU_OP_COUNT) { + return false; + } + + return kOpCapabilities[op].requires_thread_barrier; +} + bool support_op(const npu_device_tensor_spec & src0, const npu_device_tensor_spec & src1, const npu_device_tensor_spec & dst, npu_device_tensor_op op) { if (get_compute_func_impl(op, dst.type) == nullptr) { diff --git a/ggml/src/ggml-qnn/npu/device/op_impl.hpp b/ggml/src/ggml-qnn/npu/device/op_impl.hpp index f9a3d01187793..9b75ec6d47967 100644 --- a/ggml/src/ggml-qnn/npu/device/op_impl.hpp +++ b/ggml/src/ggml-qnn/npu/device/op_impl.hpp @@ -6,6 +6,8 @@ namespace hexagon { compute_func_type get_compute_func(tensor * dst); +bool requires_thread_barrier(npu_device_tensor_op op); + bool support_op(const npu_device_tensor_spec & src0, const npu_device_tensor_spec & src1, const npu_device_tensor_spec & dst, npu_device_tensor_op op); diff --git a/ggml/src/ggml-qnn/npu/device/op_mul_mat.cpp b/ggml/src/ggml-qnn/npu/device/op_mul_mat.cpp index 647a5ff925737..6087673ac65af 100644 --- a/ggml/src/ggml-qnn/npu/device/op_mul_mat.cpp +++ b/ggml/src/ggml-qnn/npu/device/op_mul_mat.cpp @@ -3,44 +3,43 @@ #include #include "quants.hpp" +#include "thread_pool.hpp" // TODO: remove this dependency #include "vtcm_mem.hpp" namespace { -inline float vec_reduction_f32(HVX_Vector sums) { - constexpr const size_t kFloatsPerVector = hexagon::kBytesPerVector / sizeof(float); - static_assert(kFloatsPerVector == 32 || kFloatsPerVector == 16, "kFloatsPerVector should be 16 or 32"); - - // TODO: do we have a better way to do the reduction? - switch (kFloatsPerVector) { - default: - case 32: - sums = Q6_Vqf32_vadd_Vqf32Vqf32(sums, Q6_V_vror_VR(sums, 16 * sizeof(float))); - // fallthrough - case 16: - sums = Q6_Vqf32_vadd_Vqf32Vqf32(sums, Q6_V_vror_VR(sums, 8 * sizeof(float))); - sums = Q6_Vqf32_vadd_Vqf32Vqf32(sums, Q6_V_vror_VR(sums, 4 * sizeof(float))); - sums = Q6_Vqf32_vadd_Vqf32Vqf32(sums, Q6_V_vror_VR(sums, 2 * sizeof(float))); - sums = Q6_Vqf32_vadd_Vqf32Vqf32(sums, Q6_V_vror_VR(sums, sizeof(float))); - break; - } - - return hexagon::get_flt0_from_fltv(Q6_Vsf_equals_Vqf32(sums)); -} - inline float vec_dot_product_f32_f32(const float * src0, const float * src1, size_t count) { constexpr const size_t kElementsPerVector = hexagon::kBytesPerVector / sizeof(float); - HVX_Vector * iptr0 = ((HVX_Vector *) src0); - HVX_Vector * iptr0_end = ((HVX_Vector *) src0) + (count / kElementsPerVector); - HVX_Vector * iptr1 = ((HVX_Vector *) src1); - HVX_Vector prev0 = *iptr0++; - HVX_Vector prev1 = *iptr1++; - HVX_Vector sum = Q6_V_vzero(); + HVX_Vector * src0_vec_ptr = ((HVX_Vector *) src0); + HVX_Vector * src0_vec_ptr_end = ((HVX_Vector *) src0) + count / kElementsPerVector; + HVX_Vector * src1_vec_ptr = ((HVX_Vector *) src1); + HVX_Vector prev0 = *src0_vec_ptr++; + HVX_Vector prev1 = *src1_vec_ptr++; + HVX_Vector sum = Q6_V_vzero(); + + while (src0_vec_ptr_end - src0_vec_ptr > 1) { + HVX_Vector curr0_lo = src0_vec_ptr[0]; + HVX_Vector curr0_hi = src0_vec_ptr[1]; + HVX_Vector curr1_lo = src1_vec_ptr[0]; + HVX_Vector curr1_hi = src1_vec_ptr[1]; + + HVX_Vector l0 = Q6_V_valign_VVR(curr0_lo, prev0, (size_t) src0); + HVX_Vector l1 = Q6_V_valign_VVR(curr1_lo, prev1, (size_t) src1); + HVX_Vector h0 = Q6_V_valign_VVR(curr0_hi, curr0_lo, (size_t) src0); + HVX_Vector h1 = Q6_V_valign_VVR(curr1_hi, curr1_lo, (size_t) src1); + sum = Q6_Vqf32_vadd_Vqf32Vqf32(Q6_Vqf32_vmpy_VsfVsf(l0, l1), sum); + sum = 
Q6_Vqf32_vadd_Vqf32Vqf32(Q6_Vqf32_vmpy_VsfVsf(h0, h1), sum); + + prev0 = curr0_hi; + prev1 = curr1_hi; + src0_vec_ptr += 2; + src1_vec_ptr += 2; + } - while (iptr0 < iptr0_end) { - HVX_Vector curr0 = *iptr0++; - HVX_Vector curr1 = *iptr1++; + if (src0_vec_ptr_end - src0_vec_ptr > 0) { + HVX_Vector curr0 = *src0_vec_ptr++; + HVX_Vector curr1 = *src1_vec_ptr++; HVX_Vector s0 = Q6_V_valign_VVR(curr0, prev0, (size_t) src0); HVX_Vector s1 = Q6_V_valign_VVR(curr1, prev1, (size_t) src1); sum = Q6_Vqf32_vadd_Vqf32Vqf32(Q6_Vqf32_vmpy_VsfVsf(s0, s1), sum); @@ -48,17 +47,17 @@ inline float vec_dot_product_f32_f32(const float * src0, const float * src1, siz prev1 = curr1; } - if ((iptr0_end - ((HVX_Vector *) src0)) > 0) { + if ((src0_vec_ptr_end - ((HVX_Vector *) src0)) > 0) { // handle the last vector // see also: // https://github.com/UbiquitousLearning/mllm/blob/babf4410352ce8730824c87699c025a0d4ce3a6f/src/backends/qnn/LLaMAOpPackageHtp/LLaMAPackage/src/ops/LLaMAMul.cpp#L147 // or qualcomm sdk libs\qhl_hvx\src\qhblas_hvx\qhblas_hvx_aw_vector_add_ah.c - bool iptr0_aligned = hexagon::is_addr_aligned(iptr0); - HVX_Vector curr0 = iptr0_aligned ? prev0 : *iptr0; - iptr0 = iptr0_aligned ? iptr0 : iptr0 + 1; - bool iptr1_aligned = hexagon::is_addr_aligned(iptr1); - HVX_Vector curr1 = iptr1_aligned ? prev1 : *iptr1; - iptr1 = iptr1_aligned ? iptr1 : iptr1 + 1; + bool iptr0_aligned = hexagon::is_addr_aligned(src0_vec_ptr); + HVX_Vector curr0 = iptr0_aligned ? prev0 : *src0_vec_ptr; + src0_vec_ptr = iptr0_aligned ? src0_vec_ptr : src0_vec_ptr + 1; + bool iptr1_aligned = hexagon::is_addr_aligned(src1_vec_ptr); + HVX_Vector curr1 = iptr1_aligned ? prev1 : *src1_vec_ptr; + src1_vec_ptr = iptr1_aligned ? src1_vec_ptr : src1_vec_ptr + 1; HVX_Vector s0 = Q6_V_valign_VVR(curr0, prev0, (size_t) src0); HVX_Vector s1 = Q6_V_valign_VVR(curr1, prev1, (size_t) src1); sum = Q6_Vqf32_vadd_Vqf32Vqf32(Q6_Vqf32_vmpy_VsfVsf(s0, s1), sum); @@ -70,19 +69,21 @@ inline float vec_dot_product_f32_f32(const float * src0, const float * src1, siz const size_t leftover_bytes = leftover * sizeof(float); if (leftover > 0) { // handle the leftover elements - HVX_Vector curr0 = - (leftover_bytes + hexagon::unaligned_bytes(iptr0) > hexagon::kBytesPerVector) ? *iptr0 : prev0; - curr0 = Q6_V_valign_VVR(curr0, prev0, (size_t) src0); + HVX_Vector curr0 = (leftover_bytes + hexagon::unaligned_bytes(src0_vec_ptr) > hexagon::kBytesPerVector) ? + *src0_vec_ptr : + prev0; + curr0 = Q6_V_valign_VVR(curr0, prev0, (size_t) src0); - HVX_Vector curr1 = - (leftover_bytes + hexagon::unaligned_bytes(iptr1) > hexagon::kBytesPerVector) ? *iptr1 : prev1; - curr1 = Q6_V_valign_VVR(curr1, prev1, (size_t) src1); + HVX_Vector curr1 = (leftover_bytes + hexagon::unaligned_bytes(src1_vec_ptr) > hexagon::kBytesPerVector) ? + *src1_vec_ptr : + prev1; + curr1 = Q6_V_valign_VVR(curr1, prev1, (size_t) src1); sum = Q6_Vqf32_vadd_Vqf32Vqf32( Q6_V_valign_VVR(Q6_Vqf32_vmpy_VsfVsf(curr0, curr1), Q6_V_vzero(), leftover_bytes), sum); } - return vec_reduction_f32(sum); + return hexagon::vec_reduction_f32(sum); } // TODO: merge with vec_dot_product_f32_f32? 
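The restructured vec_dot_product_f32_f32 consumes two HVX vectors per iteration, then at most one remaining vector, then the leftover elements. Below is a scalar analogue of that loop shape; a block of 32 floats stands in for one 128-byte vector, and the unrolling is presumably about pipelining, which this sketch does not try to model.

#include <cstddef>
#include <cstdio>

// Scalar analogue of the unrolled dot-product loop: bulk of the data two
// blocks at a time, then at most one full block, then the leftover elements.
float dot_ref(const float * a, const float * b, size_t count) {
    constexpr size_t kBlock = 32;                 // 128 bytes / sizeof(float)
    const size_t full_blocks = count / kBlock;

    float sum = 0.0f;
    size_t i = 0;
    for (size_t blk = 0; blk + 2 <= full_blocks; blk += 2) {      // two blocks per iteration
        for (size_t j = 0; j < 2 * kBlock; ++j, ++i) sum += a[i] * b[i];
    }
    if (full_blocks % 2) {                                        // at most one block left
        for (size_t j = 0; j < kBlock; ++j, ++i) sum += a[i] * b[i];
    }
    for (; i < count; ++i) sum += a[i] * b[i];                    // leftover elements
    return sum;
}

int main() {
    float a[70], b[70];
    for (int i = 0; i < 70; ++i) { a[i] = 1.0f; b[i] = 2.0f; }
    std::printf("%f\n", dot_ref(a, b, 70));       // expect 140
    return 0;
}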
@@ -90,17 +91,17 @@ inline float vec_dot_product_f16_f16(const npu_device_fp16_t * src0, const npu_d constexpr const size_t kElementsPerVector = hexagon::kBytesPerVector / sizeof(npu_device_fp16_t); constexpr const size_t kFloatsPerVector = hexagon::kBytesPerVector / sizeof(float); - HVX_Vector * iptr0 = ((HVX_Vector *) src0); - HVX_Vector * iptr0_end = ((HVX_Vector *) src0) + (count / kElementsPerVector); - HVX_Vector * iptr1 = ((HVX_Vector *) src1); - HVX_Vector prev0 = *iptr0++; - HVX_Vector prev1 = *iptr1++; - HVX_Vector sum_hi = Q6_V_vzero(); - HVX_Vector sum_lo = Q6_V_vzero(); - - while (iptr0 < iptr0_end) { - HVX_Vector curr0 = *iptr0++; - HVX_Vector curr1 = *iptr1++; + HVX_Vector * src0_vec_ptr = ((HVX_Vector *) src0); + HVX_Vector * src0_vec_ptr_end = ((HVX_Vector *) src0) + (count / kElementsPerVector); + HVX_Vector * src1_vec_ptr = ((HVX_Vector *) src1); + HVX_Vector prev0 = *src0_vec_ptr++; + HVX_Vector prev1 = *src1_vec_ptr++; + HVX_Vector sum_hi = Q6_V_vzero(); + HVX_Vector sum_lo = Q6_V_vzero(); + + while (src0_vec_ptr < src0_vec_ptr_end) { + HVX_Vector curr0 = *src0_vec_ptr++; + HVX_Vector curr1 = *src1_vec_ptr++; HVX_Vector s0 = Q6_V_valign_VVR(curr0, prev0, (size_t) src0); HVX_Vector s1 = Q6_V_valign_VVR(curr1, prev1, (size_t) src1); HVX_VectorPair result = Q6_Wqf32_vmpy_VhfVhf(s0, s1); @@ -110,17 +111,17 @@ inline float vec_dot_product_f16_f16(const npu_device_fp16_t * src0, const npu_d prev1 = curr1; } - if ((iptr0_end - ((HVX_Vector *) src0)) > 0) { + if ((src0_vec_ptr_end - ((HVX_Vector *) src0)) > 0) { // handle the last vector // see also: // https://github.com/UbiquitousLearning/mllm/blob/babf4410352ce8730824c87699c025a0d4ce3a6f/src/backends/qnn/LLaMAOpPackageHtp/LLaMAPackage/src/ops/LLaMAMul.cpp#L147 // or qualcomm sdk libs\qhl_hvx\src\qhblas_hvx\qhblas_hvx_aw_vector_add_ah.c - bool iptr0_aligned = hexagon::is_addr_aligned(iptr0); - HVX_Vector curr0 = iptr0_aligned ? prev0 : *iptr0; - iptr0 = iptr0_aligned ? iptr0 : iptr0 + 1; - bool iptr1_aligned = hexagon::is_addr_aligned(iptr1); - HVX_Vector curr1 = iptr1_aligned ? prev1 : *iptr1; - iptr1 = iptr1_aligned ? iptr1 : iptr1 + 1; + bool iptr0_aligned = hexagon::is_addr_aligned(src0_vec_ptr); + HVX_Vector curr0 = iptr0_aligned ? prev0 : *src0_vec_ptr; + src0_vec_ptr = iptr0_aligned ? src0_vec_ptr : src0_vec_ptr + 1; + bool iptr1_aligned = hexagon::is_addr_aligned(src1_vec_ptr); + HVX_Vector curr1 = iptr1_aligned ? prev1 : *src1_vec_ptr; + src1_vec_ptr = iptr1_aligned ? src1_vec_ptr : src1_vec_ptr + 1; HVX_Vector s0 = Q6_V_valign_VVR(curr0, prev0, (size_t) src0); HVX_Vector s1 = Q6_V_valign_VVR(curr1, prev1, (size_t) src1); HVX_VectorPair result = Q6_Wqf32_vmpy_VhfVhf(s0, s1); @@ -134,13 +135,15 @@ inline float vec_dot_product_f16_f16(const npu_device_fp16_t * src0, const npu_d const size_t leftover_bytes = leftover * sizeof(npu_device_fp16_t); if (leftover > 0) { // handle the leftover elements - HVX_Vector curr0 = - (leftover_bytes + hexagon::unaligned_bytes(iptr0) > hexagon::kBytesPerVector) ? *iptr0 : prev0; - curr0 = Q6_V_valign_VVR(curr0, prev0, (size_t) src0); + HVX_Vector curr0 = (leftover_bytes + hexagon::unaligned_bytes(src0_vec_ptr) > hexagon::kBytesPerVector) ? + *src0_vec_ptr : + prev0; + curr0 = Q6_V_valign_VVR(curr0, prev0, (size_t) src0); - HVX_Vector curr1 = - (leftover_bytes + hexagon::unaligned_bytes(iptr1) > hexagon::kBytesPerVector) ? 
*iptr1 : prev1; - curr1 = Q6_V_valign_VVR(curr1, prev1, (size_t) src1); + HVX_Vector curr1 = (leftover_bytes + hexagon::unaligned_bytes(src1_vec_ptr) > hexagon::kBytesPerVector) ? + *src1_vec_ptr : + prev1; + curr1 = Q6_V_valign_VVR(curr1, prev1, (size_t) src1); HVX_VectorPair result = Q6_Wqf32_vmpy_VhfVhf(curr0, curr1); @@ -156,7 +159,7 @@ inline float vec_dot_product_f16_f16(const npu_device_fp16_t * src0, const npu_d } } - return vec_reduction_f32(Q6_Vqf32_vadd_Vqf32Vqf32(sum_hi, sum_lo)); + return hexagon::vec_reduction_f32(Q6_Vqf32_vadd_Vqf32Vqf32(sum_hi, sum_lo)); } template struct get_data_type {}; @@ -208,70 +211,118 @@ void mul_mat_impl(hexagon::tensor * src0, hexagon::tensor * src1, hexagon::tenso } // cache the src0 plane in VTCM - const size_t src0_plane_row_count = start_end_element.second - start_end_element.first; - size_t src0_plane_cache_size = 0; - uint8_t * src0_plane_cache_ptr = nullptr; - const uint8_t * last_cached_plane_ptr = nullptr; + size_t src0_plane_slice_row_count = start_end_element.second - start_end_element.first; + size_t src0_plane_cache_size = 0; + uint8_t * src0_plane_cache_ptr = nullptr; + const uint8_t * last_cached_plane_ptr = nullptr; + bool is_mem_cache = false; if (is_quantized) { - src0_plane_cache_size = src0_actual_row_size * src0_plane_row_count; - src0_plane_cache_ptr = params->get_cache(src0_plane_cache_size, is_quantized); + src0_plane_slice_row_count = + std::min(params->vtcm_quota_size / src0_actual_row_size, src0_plane_slice_row_count); + src0_plane_cache_size = src0_actual_row_size * src0_plane_slice_row_count; + src0_plane_cache_ptr = params->get_vtcm_cache(src0_plane_cache_size); + if (src0_plane_cache_ptr == nullptr) { + DEVICE_LOG_DEBUG( + "mul_mat_impl: failed to get VTCM cache for src0, size: %zu, src0_plane_slice_row_count: %zu, " + "src0_actual_row_size: %zu, will fallback to mem cache\n", + src0_plane_cache_size, src0_plane_slice_row_count, src0_actual_row_size); + src0_plane_cache_ptr = params->get_mem_cache(src0_plane_cache_size); + is_mem_cache = true; + } } - DEVICE_LOG_DEBUG("mul_mat_impl src0_actual_row_size: %zu, is_quantized: %d, vtcm_mem: %p(%zu)\n", - src0_actual_row_size, is_quantized, (void *) src0_plane_cache_ptr, src0_plane_cache_size); + DEVICE_LOG_DEBUG( + "mul_mat_impl src0_actual_row_size: %zu, src0_plane_slice_row_count: %zu, is_quantized: %d, vtcm_mem: " + "%p(%zu)\n", + src0_actual_row_size, src0_plane_slice_row_count, is_quantized, (void *) src0_plane_cache_ptr, + src0_plane_cache_size); const size_t valid_row_bytes = src1->get_ne(0) * sizeof(data_type); DEVICE_SCOPED_OP_PERFORMANCE_TRACKER_WITH_SUB_PROC(dst, params->tidx, dequant); for (int64_t ip = start_end_plane.first; ip < start_end_plane.second; ip++) { const auto i3 = ip / dst->get_ne(2); const auto i2 = ip - i3 * dst->get_ne(2); - const auto * src0_plane = src0_ptr + i3 / r03 * src0->get_nb(3) + i2 / r02 * src0->get_nb(2) + - start_end_element.first * src0->get_nb(1); const auto * src1_plane = src1_ptr + i3 * src1->get_nb(3) + i2 * src1->get_nb(2); auto * dst_plane = dst_ptr + i3 * dst->get_nb(3) + i2 * dst->get_nb(2); - - if (src0_plane_cache_ptr) { - if (last_cached_plane_ptr != src0_plane) { - DEVICE_SCOPED_OP_PERFORMANCE_TRACKER_ADD_SUB_PROC(dequant); - - for (int64_t ir = 0; ir < (int64_t) src0_plane_row_count; ir++) { - auto * src0_row = src0_plane + ir * src0->get_nb(1); - if (ir + 1 < src0_plane_row_count) { - hexagon::l2fetch_row(src0_row + src0->get_nb(1), src0->get_nb(1)); + for (int64_t col_idx = start_end_element.first; col_idx < 
start_end_element.second; + col_idx += src0_plane_slice_row_count) { + const auto * src0_plane = + src0_ptr + i3 / r03 * src0->get_nb(3) + i2 / r02 * src0->get_nb(2) + col_idx * src0->get_nb(1); + if (src0_plane_cache_ptr) { + if (last_cached_plane_ptr != src0_plane) { + DEVICE_SCOPED_OP_PERFORMANCE_TRACKER_ADD_SUB_PROC(dequant); + + for (int64_t ir = 0; ir < (int64_t) src0_plane_slice_row_count; ir++) { + auto * src0_row = src0_plane + ir * src0->get_nb(1); + if (ir + 1 < src0_plane_slice_row_count) { + hexagon::l2fetch_row(src0_row + src0->get_nb(1), src0->get_nb(1)); + } + + auto * dst_row = reinterpret_cast(src0_plane_cache_ptr + ir * src0_actual_row_size); + dequantize_row_func(src0_row, reinterpret_cast(dst_row), src0->get_ne(0), + params->f16_to_f32_table); } - auto * dst_row = reinterpret_cast(src0_plane_cache_ptr + ir * src0_actual_row_size); - dequantize_row_func(src0_row, reinterpret_cast(dst_row), src0->get_ne(0), - params->f16_to_f32_table); + last_cached_plane_ptr = src0_plane; } - last_cached_plane_ptr = src0_plane; + src0_plane = src0_plane_cache_ptr; } - src0_plane = src0_plane_cache_ptr; - } - - for (int64_t i1 = start_end_row.first; i1 < start_end_row.second; i1++) { - auto * src1_row = src1_plane + i1 * src1->get_nb(1); - auto * dst_row = reinterpret_cast(dst_plane + i1 * dst->get_nb(1)) + start_end_element.first; - for (int64_t i0 = 0; i0 < (int64_t) src0_plane_row_count; i0++) { - auto * src0_row = src0_plane + i0 * src0_actual_row_size; - if (i0 + 1 < src0_plane_row_count) { - if (!src0_plane_cache_ptr) { - hexagon::l2fetch_row(src0_row + src0_actual_row_size, valid_row_bytes); + for (int64_t i1 = start_end_row.first; i1 < start_end_row.second; i1++) { + auto * src1_row = src1_plane + i1 * src1->get_nb(1); + auto * dst_row = reinterpret_cast(dst_plane + i1 * dst->get_nb(1)) + col_idx; + for (int64_t i0 = 0; i0 < (int64_t) src0_plane_slice_row_count; i0++) { + auto * src0_row = src0_plane + i0 * src0_actual_row_size; + if (i0 + 1 < src0_plane_slice_row_count) { + if (!src0_plane_cache_ptr || is_mem_cache) { + hexagon::l2fetch_row(src0_row + src0_actual_row_size, valid_row_bytes); + } + } else if (ip + 1 < start_end_plane.second) { + hexagon::l2fetch_row(src1_row + src1->get_nb(1), valid_row_bytes); } - } else if (ip + 1 < start_end_plane.second) { - hexagon::l2fetch_row(src1_row + src1->get_nb(1), valid_row_bytes); - } - // TODO: figure dst how to handle a entire row - dst_row[i0] = _DotFunc(reinterpret_cast(src0_row), - reinterpret_cast(src1_row), (size_t) src0->get_ne(0)); + // TODO: figure dst how to handle a entire row + dst_row[i0] = _DotFunc(reinterpret_cast(src0_row), + reinterpret_cast(src1_row), (size_t) src0->get_ne(0)); + } } } } } +bool is_quantized_mul_mat_supported(const npu_device_tensor_spec & src0, const npu_device_tensor_spec & src1) { + if (src1.type != NPU_DATA_TYPE_F32) { + DEVICE_LOG_DEBUG("[MUL_MAT]src0.type(%s) and src1.type(%s) mismatch and src1 is not F32\n", + hexagon::get_type_name(src0.type), hexagon::get_type_name(src1.type)); + return false; + } + + const auto type_traits = hexagon::get_type_traits(src0.type); + if (!type_traits.is_quantized || type_traits.dequantize_row == nullptr) { + DEVICE_LOG_DEBUG("[MUL_MAT]src0.type(%s) and src1.type(%s) mismatch and src0 is not quantized\n", + hexagon::get_type_name(src0.type), hexagon::get_type_name(src1.type)); + return false; + } + + if (src0.ne[0] % type_traits.blck_size) { + DEVICE_LOG_DEBUG("[MUL_MAT]src0.type(%s) ne[0] is not aligned: %ld\n", hexagon::get_type_name(src0.type), + (long) 
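The slicing math introduced in mul_mat_impl: the number of src0 rows dequantized into VTCM at a time is capped by the per-thread quota, and the column range is then walked slice by slice. A sketch of that arithmetic with made-up sizes; it clamps the last slice for illustration.

#include <algorithm>
#include <cstddef>
#include <cstdio>

// Sketch of the new slicing logic: instead of requiring the whole src0 plane to
// fit in VTCM, it is processed in slices of at most quota / row_size rows, each
// dequantized into the cache before the dot products run.
int main() {
    const size_t vtcm_quota_bytes = 64 * 1024;          // per-thread quota (illustrative)
    const size_t row_bytes        = 4096;               // dequantized row size (ne0 * sizeof(float))
    const size_t first_row        = 0, last_row = 96;   // this thread's element range

    const size_t rows_per_slice = std::min(vtcm_quota_bytes / row_bytes, last_row - first_row);
    std::printf("rows per slice: %zu\n", rows_per_slice);          // 16 here

    for (size_t col_idx = first_row; col_idx < last_row; col_idx += rows_per_slice) {
        const size_t rows = std::min(rows_per_slice, last_row - col_idx);
        std::printf("dequantize rows [%zu, %zu) into the cache, then run the dot products\n",
                    col_idx, col_idx + rows);
    }
    return 0;
}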
src0.ne[0]); + return false; + } + + const auto vtcm_thread_quota_size = hexagon::vtcm_mem::get_total_size() / hexagon::kMaxThreadCount; + if (src0.ne[0] * sizeof(hexagon::dequantized_element_type) > vtcm_thread_quota_size) { + DEVICE_LOG_DEBUG("[MUL_MAT]src0.type(%s) ne[0] is too large: %ld, vtcm_thread_quota_size: %zu\n", + hexagon::get_type_name(src0.type), (long) src0.ne[0], vtcm_thread_quota_size); + return false; + } + + DEVICE_LOG_DEBUG("[MUL_MAT]supported quantized src0.type(%s) and src1.type(%s)\n", + hexagon::get_type_name(src0.type), hexagon::get_type_name(src1.type)); + return true; +} + } // namespace namespace hexagon { @@ -319,27 +370,9 @@ bool is_mul_mat_supported(const npu_device_tensor_spec & src0, const npu_device_ if (src0.type != src1.type) { #ifdef GGML_HEXAGON_ENABLE_QUANTIZED_TENSORS - if (src1.type != NPU_DATA_TYPE_F32) { - DEVICE_LOG_DEBUG("[%s]src0.type(%s) and src1.type(%s) mismatch and src1 is not F32\n", op_get_name(op), - get_type_name(src0.type), get_type_name(src1.type)); - return false; - } - - const auto type_traits = get_type_traits(src0.type); - if (!type_traits.is_quantized || type_traits.dequantize_row == nullptr) { - DEVICE_LOG_DEBUG("[%s]src0.type(%s) and src1.type(%s) mismatch and src0 is not quantized\n", - op_get_name(op), get_type_name(src0.type), get_type_name(src1.type)); + if (!is_quantized_mul_mat_supported(src0, src1)) { return false; } - - if (src0.ne[0] % type_traits.blck_size) { - DEVICE_LOG_DEBUG("[%s]src0.type(%s) ne[0] is not aligned: %ld\n", op_get_name(op), get_type_name(src0.type), - (long) src0.ne[0]); - return false; - } - - DEVICE_LOG_DEBUG("[%s]supported quantized src0.type(%s) and src1.type(%s)\n", op_get_name(op), - get_type_name(src0.type), get_type_name(src1.type)); #else DEVICE_LOG_DEBUG("[%s]src0.type(%s) and src1.type(%s) mismatch and quantized tensors are not supported\n", op_get_name(op), get_type_name(src0.type), get_type_name(src1.type)); diff --git a/ggml/src/ggml-qnn/npu/device/op_mul_mat.hpp b/ggml/src/ggml-qnn/npu/device/op_mul_mat.hpp index 3a97858606cd4..8cf41e0a99d86 100644 --- a/ggml/src/ggml-qnn/npu/device/op_mul_mat.hpp +++ b/ggml/src/ggml-qnn/npu/device/op_mul_mat.hpp @@ -7,11 +7,6 @@ namespace hexagon { -constexpr const size_t kBytesPerVector = sizeof(HVX_Vector); // 128 for v73 -constexpr const size_t kAlignMask = kBytesPerVector - 1; -constexpr const size_t kL2CacheSize = 8 * 1024; // // 8KB L2 cache -constexpr const size_t kL2FetchAheadVectors = kL2CacheSize / kBytesPerVector; - inline size_t unaligned_bytes(const void * addr) { return ((size_t) addr) & kAlignMask; } @@ -43,6 +38,31 @@ inline float get_flt0_from_fltv(HVX_Vector vect) { return cvt.f; } +inline HVX_Vector vec_reduction_qf32(HVX_Vector sums) { + constexpr const size_t kFloatsPerVector = hexagon::kBytesPerVector / sizeof(float); + static_assert(kFloatsPerVector == 32 || kFloatsPerVector == 16, "kFloatsPerVector should be 16 or 32"); + + // TODO: do we have a better way to do the reduction? 
+ switch (kFloatsPerVector) { + default: + case 32: + sums = Q6_Vqf32_vadd_Vqf32Vqf32(sums, Q6_V_vror_VR(sums, 16 * sizeof(float))); + // fallthrough + case 16: + sums = Q6_Vqf32_vadd_Vqf32Vqf32(sums, Q6_V_vror_VR(sums, 8 * sizeof(float))); + sums = Q6_Vqf32_vadd_Vqf32Vqf32(sums, Q6_V_vror_VR(sums, 4 * sizeof(float))); + sums = Q6_Vqf32_vadd_Vqf32Vqf32(sums, Q6_V_vror_VR(sums, 2 * sizeof(float))); + sums = Q6_Vqf32_vadd_Vqf32Vqf32(sums, Q6_V_vror_VR(sums, sizeof(float))); + break; + } + + return sums; +} + +inline float vec_reduction_f32(HVX_Vector sums) { + return hexagon::get_flt0_from_fltv(Q6_Vsf_equals_Vqf32(vec_reduction_qf32(sums))); +} + bool mul_mat_f32(tensor * out, compute_params * params); bool is_mul_mat_supported(const npu_device_tensor_spec & src0, const npu_device_tensor_spec & src1, const npu_device_tensor_spec & dst, npu_device_tensor_op op); diff --git a/ggml/src/ggml-qnn/npu/device/op_types.hpp b/ggml/src/ggml-qnn/npu/device/op_types.hpp index 8bf10637db51c..153bbab058b89 100644 --- a/ggml/src/ggml-qnn/npu/device/op_types.hpp +++ b/ggml/src/ggml-qnn/npu/device/op_types.hpp @@ -1,5 +1,7 @@ #pragma once +#include + #include #include #include @@ -15,26 +17,25 @@ namespace hexagon { struct compute_params { const size_t tidx; const size_t tcnt; + const size_t vtcm_quota_size; const float * f16_to_f32_table; std::unique_ptr vtcm_cache; std::unique_ptr mem_cache; size_t mem_cache_size = 0; - uint8_t * get_cache(size_t size, bool fallback_to_mem) { + uint8_t * get_vtcm_cache(size_t size) { if (!vtcm_cache || vtcm_cache->get_size() < size) { vtcm_cache = std::make_unique(size, false); } - if (vtcm_cache->is_valid()) { - return vtcm_cache->get_mem(); - } - - if (!fallback_to_mem) { - DEVICE_LOG_DEBUG("vtcm_mem not valid, return nullptr\n"); + if (!vtcm_cache->is_valid()) { return nullptr; } - DEVICE_LOG_DEBUG("vtcm_mem not valid, allocate from mem_cache\n"); + return vtcm_cache->get_mem(); + } + + uint8_t * get_mem_cache(size_t size) { if (!mem_cache || mem_cache_size < size) { mem_cache = std::make_unique(size + 256); mem_cache_size = mem_cache ? 
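With get_cache() split into get_vtcm_cache()/get_mem_cache(), the fallback decision moves to the caller, as mul_mat_impl does above. A sketch of that caller-side pattern, using plain std::vector buffers as stand-ins for vtcm_mem and the heap cache:

#include <cstddef>
#include <cstdint>
#include <vector>

// Try the fast on-chip buffer first and only fall back to ordinary memory when
// the VTCM allocation fails or is too small.
struct toy_params {
    std::vector<uint8_t> vtcm;                   // stand-in for vtcm_mem
    std::vector<uint8_t> mem;                    // stand-in for the plain memory cache
    size_t vtcm_capacity = 32 * 1024;            // pretend VTCM quota

    uint8_t * get_vtcm_cache(size_t size) {
        if (size > vtcm_capacity) return nullptr;       // "allocation failed"
        if (vtcm.size() < size) vtcm.resize(size);
        return vtcm.data();
    }
    uint8_t * get_mem_cache(size_t size) {
        if (mem.size() < size) mem.resize(size);
        return mem.data();
    }
};

int main() {
    toy_params params;
    const size_t need = 48 * 1024;

    bool is_mem_cache = false;
    uint8_t * cache = params.get_vtcm_cache(need);
    if (!cache) {                                // same fallback mul_mat_impl performs
        cache = params.get_mem_cache(need);
        is_mem_cache = true;                     // plain memory still benefits from l2fetch
    }
    return (cache && is_mem_cache) ? 0 : 1;
}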
size : 0; @@ -49,10 +50,31 @@ typedef bool (*op_is_supported_func_type)(const npu_device_tensor_spec & src0, c const npu_device_tensor_spec & dst, npu_device_tensor_op op); inline constexpr std::pair get_thread_work_slice(int64_t total, size_t tidx, size_t tcnt) { - const auto elements_per_thread = (total + tcnt - 1) / tcnt; - const auto start = tidx * elements_per_thread; - const auto end = std::min(start + elements_per_thread, total); - return { start, end }; + if (total <= 0 || tidx >= tcnt) { + return { 0, 0 }; // No work for this thread + } + + const auto elements_per_thread = total / tcnt; + const auto remainder = total % tcnt; + + int64_t start = 0; + int64_t end = 0; + if (tidx < remainder) { + // First 'remainder' threads get one extra item + start = tidx * (elements_per_thread + 1); + end = start + elements_per_thread + 1; + } else { + // Remaining threads get the base number of elements + start = remainder * (elements_per_thread + 1) + (tidx - remainder) * elements_per_thread; + end = start + elements_per_thread; + } + + return { start, std::min(end, total) }; } +constexpr const size_t kBytesPerVector = sizeof(HVX_Vector); // 128 for v73 +constexpr const size_t kAlignMask = kBytesPerVector - 1; +constexpr const size_t kL2CacheSize = 8 * 1024; // // 8KB L2 cache +constexpr const size_t kL2FetchAheadVectors = kL2CacheSize / kBytesPerVector; + } // namespace hexagon diff --git a/ggml/src/ggml-qnn/npu/device/quants.cpp b/ggml/src/ggml-qnn/npu/device/quants.cpp index d873691b58e15..67e77c2fc2a2a 100644 --- a/ggml/src/ggml-qnn/npu/device/quants.cpp +++ b/ggml/src/ggml-qnn/npu/device/quants.cpp @@ -4,6 +4,8 @@ #include +#include "op_types.hpp" // TODO: remove this include + static_assert(sizeof(npu_device_block_q4_K) == 2 * sizeof(npu_device_fp16_t) + QUANT_K_SCALE_SIZE + QUANT_K_BLOCK_SIZE / 2, "wrong q4_K block size/padding"); @@ -16,14 +18,34 @@ static_assert(sizeof(npu_device_block_q8_0) == sizeof(npu_device_fp16_t) + QUANT namespace { +inline HVX_Vector vmemu(const void * unaligned_ptr) { + HVX_Vector ret = *reinterpret_cast(unaligned_ptr); + return ret; +} + inline float to_float(const npu_device_fp16_t src) { - union { - __fp16 f16; - npu_device_fp16_t u16; - } f16; + return reinterpret_cast(src); +} + +template inline HVX_Vector load_block_generic(const _TBlock & src) { + uint8_t buffer[hexagon::kBytesPerVector]; + + static_assert(sizeof(buffer) == sizeof(HVX_Vector), "wrong cvt size/padding"); + static_assert(sizeof(buffer) >= sizeof(src.qs), "wrong q4_0 block size/padding"); - f16.u16 = src; - return f16.f16; + memcpy(&buffer[0], src.qs, sizeof(src.qs)); + return *reinterpret_cast(buffer); +} + +template inline HVX_Vector load_dual_block_generic(const _TBlock & src1, const _TBlock & src2) { + uint8_t buffer[hexagon::kBytesPerVector]; + + static_assert(sizeof(buffer) == sizeof(HVX_Vector), "wrong cvt size/padding"); + static_assert(sizeof(buffer) >= sizeof(src1.qs) * 2, "wrong q4_0 block size/padding"); + + memcpy(&buffer[0], src1.qs, sizeof(src1.qs)); + memcpy(&buffer[sizeof(src1.qs)], src2.qs, sizeof(src2.qs)); + return *reinterpret_cast(buffer); } inline void get_scale_min_k4(int j, const uint8_t * q, uint8_t * d, uint8_t * m) { @@ -37,38 +59,78 @@ inline void get_scale_min_k4(int j, const uint8_t * q, uint8_t * d, uint8_t * m) } void dequantize_row_q8_0(const void * src, float * dst, size_t count, const float * f16_to_f32_table) { - constexpr const int qk = QUANT_BLOCK_SIZE; - const int nb = count / qk; - const auto * src_ptr = reinterpret_cast(src); + constexpr const int 
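A worked example of the new get_thread_work_slice(): the remainder rows are spread over the first threads instead of leaving the last thread nearly idle. The helper below mirrors the same arithmetic (it is not a copy of the header) and prints the resulting slices.

#include <algorithm>
#include <cstddef>
#include <cstdint>
#include <cstdio>
#include <utility>

// Balanced splitting: the first (total % tcnt) threads take one extra element,
// so 10 rows over 4 threads become [0,3) [3,6) [6,8) [8,10) instead of the old
// ceil-based [0,3) [3,6) [6,9) [9,10).
std::pair<int64_t, int64_t> work_slice(int64_t total, size_t tidx, size_t tcnt) {
    if (total <= 0 || tidx >= tcnt) return { 0, 0 };
    const int64_t base = total / (int64_t) tcnt;
    const int64_t rem  = total % (int64_t) tcnt;
    const int64_t start = (int64_t) tidx < rem
        ? (int64_t) tidx * (base + 1)
        : rem * (base + 1) + ((int64_t) tidx - rem) * base;
    const int64_t end = start + base + ((int64_t) tidx < rem ? 1 : 0);
    return { start, std::min(end, total) };
}

int main() {
    for (size_t t = 0; t < 4; ++t) {
        const auto s = work_slice(10, t, 4);
        std::printf("thread %zu: [%lld, %lld)\n", t, (long long) s.first, (long long) s.second);
    }
    return 0;
}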
qk = QUANT_BLOCK_SIZE; + static_assert(QUANT_BLOCK_SIZE == hexagon::kBytesPerVector / sizeof(float)); - // TODO: use intrinsics - for (int i = 0; i < nb; i++) { - const float d = f16_to_f32_table[src_ptr[i].d]; + const int nb = count / qk; + const auto * src_ptr = reinterpret_cast(src); + HVX_UVector * out = ((HVX_UVector *) dst); // TODO: opt for aligned access - for (int j = 0; j < qk; ++j) { - dst[i * qk + j] = src_ptr[i].qs[j] * d; - } + for (int i = 0; i < nb; i++) { + const auto & src = src_ptr[i]; + HVX_Vector d = Q6_Vh_vsplat_R(src.d); + + HVX_Vector q_lo = load_block_generic(src); + HVX_VectorPair q = Q6_Wh_vunpack_Vb(q_lo); + q = Q6_Wh_vunpack_Vb(Q6_V_lo_W(q)); + q_lo = Q6_Vhf_equals_Vh(Q6_V_lo_W(q)); + q = Q6_Wqf32_vmpy_VhfVhf(q_lo, d); + out[i] = Q6_Vsf_equals_Vqf32(Q6_V_lo_W(q)); } } void dequantize_row_q4_0(const void * src, float * dst, size_t count, const float * f16_to_f32_table) { constexpr const int qk = QUANT_BLOCK_SIZE; static_assert(qk % 2 == 0, "qk must be even"); + static_assert(QUANT_BLOCK_SIZE == hexagon::kBytesPerVector / sizeof(float)); + constexpr const uint32_t kSizeOfQs = sizeof(npu_device_block_q4_0::qs); + + const int nb = count / qk; + const auto * src_ptr = reinterpret_cast(src); + HVX_Vector mask = Q6_Vb_vsplat_R(0x0F); + HVX_Vector minus = Q6_Vb_vsplat_R(8); + HVX_UVector * out = ((HVX_UVector *) dst); // TODO: opt for aligned access + + const int loop_count = nb - (nb % 2); + for (int i = 0; i < loop_count; i += 2) { + const auto & src1 = src_ptr[i]; + const auto & src2 = src_ptr[i + 1]; + + HVX_Vector d1 = Q6_Vh_vsplat_R(src1.d); + HVX_Vector d2 = Q6_Vh_vsplat_R(src2.d); + d1 = Q6_V_valign_VVR(d1, Q6_V_vzero(), hexagon::kBytesPerVector / 2); + d1 = Q6_V_valign_VVR(d2, d1, hexagon::kBytesPerVector / 2); + HVX_Vector d = Q6_Vh_vshuff_Vh(d1); + + HVX_Vector q_lo = load_dual_block_generic(src1, src2); + HVX_Vector q_hi = Q6_Vub_vlsr_VubR(q_lo, 4); + HVX_VectorPair q = Q6_W_vshuff_VVR(q_hi, Q6_V_vand_VV(q_lo, mask), kSizeOfQs); + q_lo = Q6_V_valign_VVR(Q6_V_lo_W(q), Q6_V_vzero(), hexagon::kBytesPerVector / 2); + q_lo = Q6_V_valign_VVR(Q6_V_hi_W(q), q_lo, hexagon::kBytesPerVector / 2); + q_lo = Q6_Vb_vshuff_Vb(q_lo); + q_lo = Q6_Vb_vsub_VbVb(q_lo, minus); + q = Q6_Wh_vunpack_Vb(q_lo); + q_lo = Q6_Vhf_equals_Vh(Q6_V_lo_W(q)); + q = Q6_Wqf32_vmpy_VhfVhf(q_lo, d); + out[i] = Q6_Vsf_equals_Vqf32(Q6_V_lo_W(q)); + out[i + 1] = Q6_Vsf_equals_Vqf32(Q6_V_hi_W(q)); + } - const int nb = count / qk; - const auto * src_ptr = reinterpret_cast(src); - - // TODO: use intrinsics - for (int i = 0; i < nb; i++) { - const float d = f16_to_f32_table[src_ptr[i].d]; - - for (int j = 0; j < qk / 2; ++j) { - const int x0 = (src_ptr[i].qs[j] & 0x0F) - 8; - const int x1 = ((src_ptr[i].qs[j] >> 4) & 0xF) - 8; - - dst[i * qk + j + 0] = x0 * d; - dst[i * qk + j + qk / 2] = x1 * d; - } + if (loop_count < nb) { + const auto & curr_blk = src_ptr[nb - 1]; + HVX_Vector d = Q6_Vh_vsplat_R(curr_blk.d); + + HVX_Vector q_lo = load_block_generic(curr_blk); + HVX_Vector q_hi = Q6_Vub_vlsr_VubR(q_lo, 4); + q_lo = Q6_V_valign_VVR(Q6_V_vand_VV(q_lo, mask), Q6_V_vzero(), sizeof(curr_blk.qs)); + q_lo = Q6_V_valign_VVR(q_hi, q_lo, hexagon::kBytesPerVector - sizeof(curr_blk.qs)); + q_lo = Q6_Vb_vsub_VbVb(q_lo, minus); + + HVX_VectorPair q = Q6_Wh_vunpack_Vb(q_lo); + q = Q6_Wh_vunpack_Vb(Q6_V_lo_W(q)); + q_lo = Q6_Vhf_equals_Vh(Q6_V_lo_W(q)); + q = Q6_Wqf32_vmpy_VhfVhf(q_lo, d); + out[nb - 1] = Q6_Vsf_equals_Vqf32(Q6_V_lo_W(q)); } } diff --git a/ggml/src/ggml-qnn/npu/device/quants.hpp 
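For cross-checking the new HVX dequantize_row_q4_0 above, the scalar path this patch removes is worth keeping as a reference: low nibbles fill the first half of each block, high nibbles the second half, both offset by -8 and scaled by d. The sketch below uses an already-converted float scale instead of the fp16 lookup table.

#include <cstdint>
#include <cstdio>

// Scalar reference matching the removed dequantize_row_q4_0: one scale plus
// qk/2 bytes of packed 4-bit values per block.
constexpr int kQK = 32;                  // QUANT_BLOCK_SIZE in the patch

struct block_q4_0_ref {
    float   d;                           // stand-in: already-converted fp16 scale
    uint8_t qs[kQK / 2];                 // packed nibbles
};

void dequantize_row_q4_0_ref(const block_q4_0_ref * blocks, int nb, float * dst) {
    for (int i = 0; i < nb; ++i) {
        const float d = blocks[i].d;
        for (int j = 0; j < kQK / 2; ++j) {
            const int x0 = (blocks[i].qs[j] & 0x0F) - 8;
            const int x1 = ((blocks[i].qs[j] >> 4) & 0x0F) - 8;
            dst[i * kQK + j]           = x0 * d;
            dst[i * kQK + j + kQK / 2] = x1 * d;
        }
    }
}

int main() {
    block_q4_0_ref blk = { 0.5f, {} };
    for (int j = 0; j < kQK / 2; ++j) blk.qs[j] = 0x9F;   // low nibble 15, high nibble 9
    float out[kQK];
    dequantize_row_q4_0_ref(&blk, 1, out);
    std::printf("%f %f\n", out[0], out[kQK / 2]);         // expect 3.5 and 0.5
    return 0;
}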
b/ggml/src/ggml-qnn/npu/device/quants.hpp index 6ffbeb0031635..6006cd22e93a4 100644 --- a/ggml/src/ggml-qnn/npu/device/quants.hpp +++ b/ggml/src/ggml-qnn/npu/device/quants.hpp @@ -23,13 +23,15 @@ inline bool is_quantized_type(npu_device_tensor_data_type type) { return get_type_traits(type).is_quantized; } -inline size_t get_dequantized_row_size(tensor * tensor) { +using dequantized_element_type = float; + +inline size_t get_dequantized_row_size(const tensor * tensor) { if (!is_quantized_type(tensor->get_type())) { return tensor->get_nb(1); // for f32 and f16 } auto row_elems_count = tensor->get_ne(0); - return row_elems_count * sizeof(float); // currently only f32 is supported + return row_elems_count * sizeof(dequantized_element_type); // currently only f32 is supported } inline const char * get_type_name(npu_device_tensor_data_type type) { diff --git a/ggml/src/ggml-qnn/npu/device/tensor.hpp b/ggml/src/ggml-qnn/npu/device/tensor.hpp index 9c7f6bffefff6..7e980d8402fb2 100644 --- a/ggml/src/ggml-qnn/npu/device/tensor.hpp +++ b/ggml/src/ggml-qnn/npu/device/tensor.hpp @@ -8,7 +8,8 @@ namespace hexagon { -constexpr const size_t kMaxTensorSrc = DEVICE_TENSOR_MAX_SRC; +constexpr const size_t kMaxTensorSrc = DEVICE_TENSOR_MAX_SRC; +constexpr const size_t kMaxParamsCount = DEVICE_TENSOR_MAX_OP_PARAMS; class tensor { public: @@ -50,17 +51,17 @@ class tensor { } } - bool set_src(size_t index, tensor * src) { - if (index >= kMaxTensorSrc) { - return false; - } + void update_config(const npu_device_tensor_update_config & config) { + static_assert(sizeof(_op_params) == sizeof(config.params), "op params size mismatch"); - _src[index] = src; - return true; + _info.op = config.op; + memcpy(_op_params, config.params, sizeof(_op_params)); + for (size_t i = 0; i < DEVICE_TENSOR_MAX_SRC; ++i) { + auto src_handle = config.src_handles[i]; + _src[i] = (src_handle ? 
reinterpret_cast(src_handle) : nullptr); + } } - void set_op(npu_device_tensor_op op) { _info.op = op; } - tensor * get_src(size_t index) const { if (index >= kMaxTensorSrc) { return nullptr; @@ -77,6 +78,20 @@ class tensor { npu_device_tensor_op get_op() const { return _info.op; } + template const _TyParam get_op_param(size_t index) const { + static_assert(sizeof(_TyParam) <= sizeof(_op_params), "_op_param type size exceeds op params size"); + + if (sizeof(_TyParam) * (index + 1) >= sizeof(_op_params)) { + return 0; + } + + return reinterpret_cast(_op_params)[index]; + } + + const int32_t * get_op_params() const { return _op_params; } + + const size_t get_op_param_count() const { return kMaxParamsCount; } + npu_device_tensor_data_type get_type() const { return _info.type; } const uint8_t * get_read_buffer() const { @@ -89,9 +104,10 @@ class tensor { bool is_valid() const { return _data != nullptr; } private: - npu_device_tensor_config _info; - tensor * _src[kMaxTensorSrc] = {}; - uint8_t * _data = nullptr; + npu_device_tensor_config _info = {}; + int32_t _op_params[kMaxParamsCount] = {}; + tensor * _src[kMaxTensorSrc] = {}; + uint8_t * _data = nullptr; DISABLE_COPY_AND_MOVE(tensor); }; diff --git a/ggml/src/ggml-qnn/npu/device/thread_pool.hpp b/ggml/src/ggml-qnn/npu/device/thread_pool.hpp index bd7e83dd8a485..9a525213c9fad 100644 --- a/ggml/src/ggml-qnn/npu/device/thread_pool.hpp +++ b/ggml/src/ggml-qnn/npu/device/thread_pool.hpp @@ -12,7 +12,7 @@ namespace hexagon { constexpr const size_t kMaxThreadCount = 4; -constexpr const size_t kDefaultStackSize = 1024 * 16; // 16KB +constexpr const size_t kDefaultStackSize = 1024 * 32; // 32KB constexpr const unsigned long long kThreadTaskPendingBit = 1; template class qurt_thread { @@ -80,7 +80,7 @@ using qurt_thread_ptr = std::unique_ptr>; template class thread_pool { static_assert(_thread_count > 1, "Thread count must be greater than 1"); - constexpr const static size_t kMaxThreadCount = _thread_count - 1; + constexpr const static size_t kMaxSubThreadCount = _thread_count - 1; public: typedef qurt_thread thread_type; @@ -88,9 +88,10 @@ template class thread_pool { thread_pool() { std::string thread_name_base = "thread_pool_"; - qurt_barrier_init(&_pending, kMaxThreadCount + 1); - qurt_barrier_init(&_completed, kMaxThreadCount + 1); - for (size_t i = 0; i < kMaxThreadCount; ++i) { + qurt_barrier_init(&_pending, kMaxSubThreadCount + 1); + qurt_barrier_init(&_completed, kMaxSubThreadCount + 1); + const auto priority = qurt_thread_get_priority(qurt_thread_get_id()); + for (size_t i = 0; i < kMaxSubThreadCount; ++i) { auto & thread_arg = _thread_args[i]; thread_arg.pool = this; thread_arg.thread_idx = i + 1; @@ -98,7 +99,7 @@ template class thread_pool { auto thread = std::make_unique( thread_name_base + std::to_string(i), reinterpret_cast(&thread_pool::thread_func_impl), &thread_arg, - QURT_THREAD_ATTR_PRIORITY_DEFAULT); + priority); if (!thread->is_valid()) { DEVICE_LOG_ERROR("Failed to create thread: %zu", i); // destroy all barriers and threads at destructor @@ -107,7 +108,7 @@ template class thread_pool { _threads[i] = std::move(thread); } - DEVICE_LOG_DEBUG("thread_pool.created: %zu", kMaxThreadCount); + DEVICE_LOG_DEBUG("thread_pool.created: %zu", kMaxSubThreadCount); } ~thread_pool() { @@ -133,7 +134,7 @@ template class thread_pool { _arg = arg; qurt_barrier_wait(&_pending); - task(this, 0, kMaxThreadCount + 1, arg); + task(this, 0, kMaxSubThreadCount + 1, arg); DEVICE_LOG_DEBUG("main_thread.task_completed: 0"); 
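get_op_param() above reads typed values back out of the int32_t op-params block that update_config() copies verbatim from ggml's op_params. A sketch of the same idea using memcpy so the read stays well-defined for any trivially copyable type; the float-eps example assumes the ggml convention of RMS_NORM keeping eps in the first param slot.

#include <cstddef>
#include <cstdint>
#include <cstdio>
#include <cstring>

// Read a typed value out of a raw int32_t op-params block, similar in spirit
// to tensor::get_op_param().
template <typename T>
T read_op_param(const int32_t * params, size_t param_count, size_t index) {
    T value{};
    if (sizeof(T) * (index + 1) > sizeof(int32_t) * param_count) {
        return value;                                    // out of range -> default value
    }
    std::memcpy(&value, reinterpret_cast<const char *>(params) + sizeof(T) * index, sizeof(T));
    return value;
}

int main() {
    int32_t params[4] = {};
    const float eps = 1e-6f;
    std::memcpy(&params[0], &eps, sizeof(eps));          // host copies ggml op_params verbatim

    std::printf("eps = %g\n", read_op_param<float>(params, 4, 0));
    return 0;
}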
qurt_barrier_wait(&_completed); @@ -166,7 +167,7 @@ template class thread_pool { auto task = pool._task; if (task) { - task(arg->pool, arg->thread_idx, kMaxThreadCount + 1, pool._arg); + task(arg->pool, arg->thread_idx, kMaxSubThreadCount + 1, pool._arg); } DEVICE_LOG_DEBUG("thread_func_impl.task_completed: %zu", arg->thread_idx); @@ -176,13 +177,13 @@ template class thread_pool { DEVICE_LOG_DEBUG("thread_func_impl.end: %zu", arg->thread_idx); } - std::atomic_bool _thread_exit = false; - std::array _threads; - thread_pool_arg _thread_args[kMaxThreadCount] = {}; - qurt_barrier_t _pending = {}; - qurt_barrier_t _completed = {}; - task_type _task = nullptr; - void * _arg = nullptr; + std::atomic_bool _thread_exit = false; + std::array _threads; + thread_pool_arg _thread_args[kMaxSubThreadCount] = {}; + qurt_barrier_t _pending = {}; + qurt_barrier_t _completed = {}; + task_type _task = nullptr; + void * _arg = nullptr; DISABLE_COPY_AND_MOVE(thread_pool); }; diff --git a/ggml/src/ggml-qnn/npu/device/util.hpp b/ggml/src/ggml-qnn/npu/device/util.hpp index a5e1ae5201c3b..3ae7f100de507 100644 --- a/ggml/src/ggml-qnn/npu/device/util.hpp +++ b/ggml/src/ggml-qnn/npu/device/util.hpp @@ -1,7 +1,9 @@ #pragma once +#include #include #include +#include #include #include @@ -48,11 +50,114 @@ inline constexpr const char * op_get_name(npu_device_tensor_op op) { return "SUB"; case NPU_OP_MUL: return "MUL"; + case NPU_OP_RMS_NORM: + return "RMS_NORM"; default: return "UNKNOWN"; } } +class power_utils { + public: + power_utils() { + _context_ptr = HAP_utils_create_context(); + if (_context_ptr == nullptr) { + DEVICE_LOG_ERROR("Failed to create power context\n"); + } + } + + ~power_utils() { + if (_context_ptr != nullptr) { + HAP_utils_destroy_context(_context_ptr); + } + } + + unsigned int get_clock_speed_hz() const { + if (!is_valid()) { + DEVICE_LOG_ERROR("Power context is not initialized\n"); + return 0; + } + + HAP_power_response_t response = {}; + response.type = HAP_power_get_clk_Freq; + auto ret = HAP_power_get(_context_ptr, &response); + if (ret != AEE_SUCCESS) { + DEVICE_LOG_ERROR("Failed to get clock speed: %d\n", ret); + return 0; + } + + return response.clkFreqHz; + } + + bool get_dvcs_enabled() const { + if (!is_valid()) { + DEVICE_LOG_ERROR("Power context is not initialized\n"); + return false; + } + + HAP_power_response_t response = {}; + response.type = HAP_power_get_dcvsEnabled; + auto ret = HAP_power_get(_context_ptr, &response); + if (ret != AEE_SUCCESS) { + DEVICE_LOG_ERROR("Failed to get DVCS enabled: %d\n", ret); + return false; + } + + return response.dcvsEnabled; + } + + void set_dvcs_performance_mode(bool enable) { + if (!is_valid()) { + DEVICE_LOG_ERROR("Power context is not initialized\n"); + return; + } + + HAP_power_request_t request = {}; + request.type = HAP_power_set_DCVS_v3; + request.dcvs_v3.dcvs_enable = enable ? TRUE : FALSE; + if (enable) { + request.dcvs_v3.dcvs_option = HAP_DCVS_V2_PERFORMANCE_MODE; + /* + * sleep_latency : To request for sleep latency in micro-seconds. 
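On the kMaxThreadCount -> kMaxSubThreadCount rename in the thread pool: a pool of N workers only spawns N-1 threads, because the caller of sync_execute() runs slice 0 itself. A stripped-down illustration of that sizing convention; plain std::thread and a join stand in for the qurt threads and barriers.

#include <cstddef>
#include <cstdio>
#include <thread>
#include <vector>

int main() {
    constexpr size_t kThreadCount    = 4;                    // total workers
    constexpr size_t kSubThreadCount = kThreadCount - 1;     // threads actually spawned

    auto task = [](size_t tidx, size_t tcnt) {
        std::printf("worker %zu of %zu running its slice\n", tidx, tcnt);
    };

    std::vector<std::thread> subs;
    for (size_t i = 0; i < kSubThreadCount; ++i) {
        subs.emplace_back(task, i + 1, kThreadCount);        // sub-threads are workers 1..N-1
    }
    task(0, kThreadCount);                                   // the calling thread is worker 0
    for (auto & t : subs) t.join();                          // stands in for the completion barrier
    return 0;
}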
+ * Sleep latency is the minimum time before which the DSP sleeps + * Set latency to 65535 to reset it to the default value + */ + request.dcvs_v3.set_latency = TRUE; + request.dcvs_v3.latency = 1000; + + request.dcvs_v3.set_bus_params = TRUE; + request.dcvs_v3.bus_params.min_corner = HAP_DCVS_VCORNER_SVS; + request.dcvs_v3.bus_params.max_corner = HAP_DCVS_VCORNER_TURBO; + request.dcvs_v3.bus_params.target_corner = HAP_DCVS_VCORNER_NOM; + } + + auto ret = HAP_power_set(_context_ptr, &request); + if (ret != AEE_SUCCESS) { + DEVICE_LOG_ERROR("Failed to set DVCS performance mode: %d\n", ret); + } + } + + void set_sleep_mode(bool enable) { + if (!is_valid()) { + DEVICE_LOG_ERROR("Power context is not initialized\n"); + return; + } + + boolean sleep_disable = enable ? FALSE : TRUE; + auto ret = HAP_power_set_sleep_mode(_context_ptr, sleep_disable); + if (ret != AEE_SUCCESS) { + DEVICE_LOG_ERROR("Failed to set sleep mode: %d\n", ret); + } + } + + bool is_valid() const { return _context_ptr != nullptr; } + + private: + void * _context_ptr = nullptr; + + DISABLE_COPY_AND_MOVE(power_utils); +}; + #ifdef GGML_HEXAGON_ENABLE_PERFORMANCE_TRACKING template class npu_scoped_timer { diff --git a/ggml/src/ggml-qnn/npu/device/vtcm_mem.hpp b/ggml/src/ggml-qnn/npu/device/vtcm_mem.hpp index 4c2922ca87f15..ab1041f626205 100644 --- a/ggml/src/ggml-qnn/npu/device/vtcm_mem.hpp +++ b/ggml/src/ggml-qnn/npu/device/vtcm_mem.hpp @@ -47,7 +47,7 @@ class vtcm_mem { DEVICE_LOG_DEBUG("VTCM released: %zu bytes at %p\n", _vtcm_size, _vtcm_mem); } - bool is_valid() const { return _vtcm_mem != nullptr; } + bool is_valid() const { return _vtcm_mem != nullptr && _vtcm_size != 0; } uint8_t * get_mem() const { return reinterpret_cast(_vtcm_mem); } diff --git a/ggml/src/ggml-qnn/npu/host/buffer.cpp b/ggml/src/ggml-qnn/npu/host/buffer.cpp index ace3dbee8eeec..7d3c1fbd9f7ac 100644 --- a/ggml/src/ggml-qnn/npu/host/buffer.cpp +++ b/ggml/src/ggml-qnn/npu/host/buffer.cpp @@ -177,7 +177,7 @@ std::shared_ptr host_buffer::init_tensor(ggml_tensor * tensor, remo auto ret = _allocator->fastrpc_mmap((int) _domain_id, _buffer_fd, _data, 0, _size, FASTRPC_MAP_FD); if (ret != AEE_SUCCESS) { - LOG_ERROR("failed to mmap rpc memory, fd: %d, ret: %d\n", _buffer_fd, ret); + LOG_ERROR("failed to mmap rpc memory, fd: %d, size: %zu, ret: %d\n", _buffer_fd, _size, ret); return std::shared_ptr(); } diff --git a/ggml/src/ggml-qnn/npu/host/graph.cpp b/ggml/src/ggml-qnn/npu/host/graph.cpp index 72ef5cc7868eb..d891280e5694c 100644 --- a/ggml/src/ggml-qnn/npu/host/graph.cpp +++ b/ggml/src/ggml-qnn/npu/host/graph.cpp @@ -1,5 +1,6 @@ #include "graph.hpp" +#include "profiler.hpp" #include "tensor.hpp" namespace hexagon { @@ -28,8 +29,12 @@ bool host_graph::update(ggml_cgraph * cgraph) { return false; } + SCOPED_PERFORMANCE_TRACKER("[hexagon-npu][%p]update, handle(%p)", (void *) this, (void *) _graph_handle); + _tensor_handles.clear(); + _tensor_update_configs.clear(); _tensor_handles.reserve(cgraph->n_nodes); + _tensor_update_configs.reserve(cgraph->n_nodes); for (int i = 0; i < cgraph->n_nodes; ++i) { auto * node = cgraph->nodes[i]; if (node->op == GGML_OP_NONE || node->op == GGML_OP_VIEW || node->op == GGML_OP_PERMUTE || @@ -40,28 +45,38 @@ bool host_graph::update(ggml_cgraph * cgraph) { continue; } + // TODO: move to tensor? 
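The new power_utils wrapper exposes DCVS control, sleep control and clock queries; a possible call site could look like the sketch below. Nothing in this diff wires it up yet, so the placement (for example, doing this at device open), the hexagon namespace and the availability of the DEVICE_LOG_INFO macro via util.hpp are assumptions.

// Hypothetical usage of the power_utils helper added in util.hpp; where (and
// whether) to call it is not decided by this patch.
#include "util.hpp"

static void configure_npu_power() {
    hexagon::power_utils power;
    if (!power.is_valid()) {
        return;                                  // context creation failed, nothing to do
    }

    power.set_dvcs_performance_mode(true);       // request performance-mode DCVS votes
    power.set_sleep_mode(false);                 // enable == false disables DSP sleep here

    DEVICE_LOG_INFO("npu clock: %u Hz, dcvs enabled: %d", power.get_clock_speed_hz(),
                    (int) power.get_dvcs_enabled());
}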
auto * tensor_obj = host_tensor::from_ggml_tensor(node); if (!tensor_obj) { LOG_DEBUG("Unable to get host tensor from ggml tensor: %p\n", (void *) node); continue; } - tensor_obj->set_op(node->op); _tensor_handles.push_back(tensor_obj->get_device_tensor_handle()); - LOG_DEBUG("node[%d]%s(%s), addr: %p, type: %s, tensor_handle: %p\n", i, ggml_get_name(node), ggml_op_desc(node), - (void *) node, ggml_type_name(node->type), (void *) tensor_obj->get_device_tensor_handle()); - for (size_t j = 0; j < GGML_MAX_SRC && node->src[j]; ++j) { - auto * src = host_tensor::from_ggml_tensor(node->src[j]); - tensor_obj->set_src(j, src); - } + _tensor_update_configs.push_back(tensor_obj->update_hosts_params_only(node)); + LOG_DEBUG("[%p]node[%d]%s(%s), addr: %p, type: %s, tensor_handle: %p\n", (void *) this, i, ggml_get_name(node), + ggml_op_desc(node), (void *) node, ggml_type_name(node->type), + (void *) tensor_obj->get_device_tensor_handle()); } - LOG_DEBUG("host_graph::update, host_graph(%p), handle(%p), ggml_cgraph(%p), tensor count(%zu)\n", (void *) this, - (void *) _graph_handle, (void *) cgraph, _tensor_handles.size()); - if (!_tensor_handles.empty()) { - npu_device_graph_set_tensor(_device_handle, _graph_handle, _tensor_handles.data(), - (int) _tensor_handles.size()); + GGML_ASSERT(_tensor_handles.size() == _tensor_update_configs.size()); + + constexpr const npu_device_tensor_handle_t kEmptyTensorHandle = 0; + constexpr const npu_device_tensor_update_config kEmptyUpdateConfig = {}; + + auto ret = npu_device_graph_set_tensor_with_param( + _device_handle, _graph_handle, _tensor_handles.size() ? _tensor_handles.data() : &kEmptyTensorHandle, + (int) _tensor_handles.size(), + _tensor_update_configs.size() ? _tensor_update_configs.data() : &kEmptyUpdateConfig, + (int) _tensor_update_configs.size()); + + if (ret != AEE_SUCCESS) { + LOG_ERROR("Failed to set tensors in host_graph: 0x%x\n", (int) ret); + return false; } + + LOG_DEBUG("[%p]host_graph::update, handle(%p), ggml_cgraph(%p), tensor count(%zu)\n", (void *) this, + (void *) _graph_handle, (void *) cgraph, _tensor_handles.size()); return true; } @@ -71,6 +86,7 @@ bool host_graph::compute() { return false; } + SCOPED_PERFORMANCE_TRACKER("[hexagon-npu][%p]compute, handle(%p)", (void *) this, (void *) _graph_handle); auto status = npu_device_graph_compute(_device_handle, _graph_handle); if (status != AEE_SUCCESS) { LOG_ERROR("Failed to compute host_graph: 0x%x\n", (int) status); diff --git a/ggml/src/ggml-qnn/npu/host/graph.hpp b/ggml/src/ggml-qnn/npu/host/graph.hpp index 20c917e1203ca..b871c125563f2 100644 --- a/ggml/src/ggml-qnn/npu/host/graph.hpp +++ b/ggml/src/ggml-qnn/npu/host/graph.hpp @@ -21,9 +21,10 @@ class host_graph { bool compute(); private: - remote_handle64 _device_handle = 0; - npu_device_graph_handle_t _graph_handle = 0; - std::vector _tensor_handles; + remote_handle64 _device_handle = 0; + npu_device_graph_handle_t _graph_handle = 0; + std::vector _tensor_handles; + std::vector _tensor_update_configs; DISABLE_COPY(host_graph); DISABLE_MOVE(host_graph); diff --git a/ggml/src/ggml-qnn/npu/host/host_device.cpp b/ggml/src/ggml-qnn/npu/host/host_device.cpp index fb1ad4dfd677b..443abe5c9e6fe 100644 --- a/ggml/src/ggml-qnn/npu/host/host_device.cpp +++ b/ggml/src/ggml-qnn/npu/host/host_device.cpp @@ -151,7 +151,7 @@ bool npu_device::supports_op_impl(const ggml_tensor * op) { auto * src0 = op->src[0]; if (!src0) { - LOG_DEBUG("[%s]Unsupported inplace op: %s\n", get_name(), ggml_op_name(op->op)); + LOG_DEBUG("[%s]Unsupported inplace op: %s\n", 
get_name(), ggml_op_desc(op)); return false; } @@ -168,7 +168,7 @@ bool npu_device::supports_op_impl(const ggml_tensor * op) { auto npu_op = op_to_npu_op(op->op); if (npu_op == NPU_OP_COUNT) { - LOG_DEBUG("[%s]Unsupported op: %s\n", get_name(), ggml_op_name(op->op)); + LOG_DEBUG("[%s]Unsupported op: %s\n", get_name(), ggml_op_desc(op)); return false; } @@ -179,7 +179,7 @@ bool npu_device::supports_op_impl(const ggml_tensor * op) { constexpr const auto get_spec = [](const ggml_tensor * tensor) -> npu_device_tensor_spec { if (!tensor) { - return npu_device_tensor_spec{}; + return npu_device_tensor_spec{ {}, NPU_DATA_TYPE_COUNT }; } static_assert(DEVICE_TENSOR_MAX_DIMS == GGML_MAX_DIMS, "tensor dimensions mismatch"); diff --git a/ggml/src/ggml-qnn/npu/host/tensor.hpp b/ggml/src/ggml-qnn/npu/host/tensor.hpp index c5d2decbc5682..71205b39fb7a8 100644 --- a/ggml/src/ggml-qnn/npu/host/tensor.hpp +++ b/ggml/src/ggml-qnn/npu/host/tensor.hpp @@ -1,5 +1,7 @@ #pragma once +#include + #include "common.hpp" #include "ggml-impl.h" #include "hexagon_npu.h" @@ -19,11 +21,15 @@ class host_tensor { explicit host_tensor(ggml_tensor * tensor, int buffer_fd, uint64_t offset, remote_handle64 device_handle) : _device_handle(device_handle) { + + // TODO: figure out why the npu_device_tensor_config can't be larger than 100 bytes + static_assert(sizeof(npu_device_tensor_config) < 100, "npu_device_tensor_config size too large"); + _info.buffer_fd = buffer_fd; _info.offset = offset; _info.type = type_to_npu_type(tensor->type); - _info.op = op_to_npu_op(tensor->op); _info.size = ggml_nbytes(tensor); + // _info.op will be updated in update_params() static_assert(DEVICE_TENSOR_MAX_DIMS == GGML_MAX_DIMS, "tensor dimensions mismatch"); static_assert(sizeof(_info.ne) == sizeof(tensor->ne), "tensor ne size mismatch"); @@ -56,28 +62,96 @@ class host_tensor { npu_device_tensor_handle_t get_device_tensor_handle() const { return _device_tensor_handle; } - void set_src(size_t index, host_tensor * src) { - if (index >= DEVICE_TENSOR_MAX_SRC) { - LOG_ERROR("host_tensor(%p) set_src[%zu] out of range\n", (void *) this, index); + void update_params(ggml_tensor * ggml_tensor) { + static_assert(sizeof(_info_update.params) <= sizeof(_ggml_tensor->op_params), + "device tensor params size mismatch"); + static_assert(DEVICE_TENSOR_MAX_SRC <= GGML_MAX_SRC, "device tensor src size mismatch"); + + GGML_ASSERT(ggml_tensor == _ggml_tensor); + if (!_ggml_tensor) { + LOG_DEBUG("host_tensor(%p) _ggml_tensor is null\n", (void *) this); return; } - LOG_DEBUG("host_tensor(%p) set_src[%zu]: %p\n", (void *) this, index, (void *) src); - npu_device_tensor_set_src(_device_handle, _device_tensor_handle, index, src->get_device_tensor_handle()); + auto new_op = op_to_npu_op(_ggml_tensor->op); + bool params_changed = new_op != _info_update.op; + if (params_changed) { + LOG_DEBUG("host_tensor(%p) op changed: %s -> %s\n", (void *) this, get_npu_op_desc(_info.op), + get_npu_op_desc(new_op)); + } + + _info.op = new_op; + _info_update.op = new_op; + + if (memcmp(_info_update.params, _ggml_tensor->op_params, sizeof(_info_update.params)) != 0) { + params_changed = true; + memcpy(_info_update.params, _ggml_tensor->op_params, sizeof(_info_update.params)); + LOG_DEBUG("host_tensor(%p) op_params changed: [%x, %x, %x, %x]\n", (void *) this, + (int) _info_update.params[0], (int) _info_update.params[1], (int) _info_update.params[2], + (int) _info_update.params[3]); + } + + npu_device_tensor_handle_t src_tensor_handles[DEVICE_TENSOR_MAX_SRC] = {}; + for (size_t j = 0; j < 
DEVICE_TENSOR_MAX_SRC && _ggml_tensor->src[j]; ++j) { + auto * src = host_tensor::from_ggml_tensor(_ggml_tensor->src[j]); + src_tensor_handles[j] = src->get_device_tensor_handle(); + LOG_DEBUG("host_tensor(%p) set_src[%zu]: %p\n", (void *) this, j, (void *) src); + } + + static_assert(std::is_same::value, + "src tensor handles type mismatch"); + + if (memcmp(_info_update.src_handles, src_tensor_handles, sizeof(_info_update.src_handles)) != 0) { + params_changed = true; + memcpy(_info_update.src_handles, src_tensor_handles, sizeof(_info_update.src_handles)); + LOG_DEBUG("host_tensor(%p) src changed, handles: [%p, %p]\n", (void *) this, + (void *) _info_update.src_handles[0], (void *) _info_update.src_handles[1]); + } + + if (params_changed) { + npu_device_tensor_update_params(_device_handle, _device_tensor_handle, &_info_update); + LOG_DEBUG("host_tensor(%p) update_params, op: %s, params: [%x, %x, %x, %x]\n", (void *) this, + ggml_op_desc(_ggml_tensor), (int) _info_update.params[0], (int) _info_update.params[1], + (int) _info_update.params[2], (int) _info_update.params[3]); + } else { + LOG_DEBUG("host_tensor(%p) update_params, no changes, op: %s, params: [%x, %x, %x, %x]\n", (void *) this, + ggml_op_desc(_ggml_tensor), (int) _info_update.params[0], (int) _info_update.params[1], + (int) _info_update.params[2], (int) _info_update.params[3]); + } } - void set_op(ggml_op op) { - _info.op = op_to_npu_op(op); - npu_device_tensor_set_op(_device_handle, _device_tensor_handle, _info.op); + const npu_device_tensor_update_config & update_hosts_params_only(ggml_tensor * ggml_tensor) { + static_assert(sizeof(_info_update.params) <= sizeof(ggml_tensor->op_params), + "device tensor params size mismatch"); + static_assert(DEVICE_TENSOR_MAX_SRC <= GGML_MAX_SRC, "device tensor src size mismatch"); + + GGML_ASSERT(ggml_tensor == _ggml_tensor); + + auto new_op = op_to_npu_op(_ggml_tensor->op); + _info.op = new_op; + _info_update.op = new_op; + memcpy(_info_update.params, _ggml_tensor->op_params, sizeof(_info_update.params)); + + for (size_t j = 0; j < DEVICE_TENSOR_MAX_SRC && _ggml_tensor->src[j]; ++j) { + auto * src = host_tensor::from_ggml_tensor(_ggml_tensor->src[j]); + _info_update.src_handles[j] = src->get_device_tensor_handle(); + LOG_DEBUG("host_tensor(%p) set_src[%zu]: %p\n", (void *) this, j, (void *) src); + } + + LOG_DEBUG("host_tensor(%p) update_params, op: %s, params: [%x, %x, %x, %x]\n", (void *) this, + ggml_op_desc(_ggml_tensor), (int) _info_update.params[0], (int) _info_update.params[1], + (int) _info_update.params[2], (int) _info_update.params[3]); + return _info_update; } bool is_valid() const { return _device_tensor_handle != 0; } private: - remote_handle64 _device_handle = 0; - npu_device_tensor_handle_t _device_tensor_handle = 0; - npu_device_tensor_config _info = {}; - ggml_tensor * _ggml_tensor = nullptr; + remote_handle64 _device_handle = 0; + npu_device_tensor_handle_t _device_tensor_handle = 0; + npu_device_tensor_config _info = {}; + npu_device_tensor_update_config _info_update = {}; + ggml_tensor * _ggml_tensor = nullptr; DISABLE_COPY(host_tensor); DISABLE_MOVE(host_tensor); diff --git a/ggml/src/ggml-qnn/npu/host/util.cpp b/ggml/src/ggml-qnn/npu/host/util.cpp index 9ce9841004235..b62370d1ad845 100644 --- a/ggml/src/ggml-qnn/npu/host/util.cpp +++ b/ggml/src/ggml-qnn/npu/host/util.cpp @@ -25,11 +25,30 @@ enum npu_device_tensor_op op_to_npu_op(ggml_op op) { return NPU_OP_SUB; case GGML_OP_MUL: return NPU_OP_MUL; + case GGML_OP_RMS_NORM: + return NPU_OP_RMS_NORM; default: return 
NPU_OP_COUNT; } } +const char * get_npu_op_desc(enum npu_device_tensor_op op) { + switch (op) { + case NPU_OP_MUL_MAT: + return ggml_op_name(GGML_OP_MUL_MAT); + case NPU_OP_ADD: + return ggml_op_name(GGML_OP_ADD); + case NPU_OP_SUB: + return ggml_op_name(GGML_OP_SUB); + case NPU_OP_MUL: + return ggml_op_name(GGML_OP_MUL); + case NPU_OP_RMS_NORM: + return ggml_op_name(GGML_OP_RMS_NORM); + default: + return "UNKNOWN"; + } +} + enum npu_device_tensor_data_type type_to_npu_type(ggml_type type) { switch (type) { case GGML_TYPE_F32: diff --git a/ggml/src/ggml-qnn/npu/host/util.hpp b/ggml/src/ggml-qnn/npu/host/util.hpp index 469e5066602ed..f8ec5c3b9f537 100644 --- a/ggml/src/ggml-qnn/npu/host/util.hpp +++ b/ggml/src/ggml-qnn/npu/host/util.hpp @@ -5,6 +5,7 @@ namespace hexagon { enum npu_device_tensor_op op_to_npu_op(ggml_op op); +const char * get_npu_op_desc(enum npu_device_tensor_op op); enum npu_device_tensor_data_type type_to_npu_type(ggml_type type); // TODO: merge with qcom_htp_arch diff --git a/ggml/src/ggml-qnn/npu/idl/hexagon_npu.idl b/ggml/src/ggml-qnn/npu/idl/hexagon_npu.idl index df3cdf4957295..ed20c125b379c 100644 --- a/ggml/src/ggml-qnn/npu/idl/hexagon_npu.idl +++ b/ggml/src/ggml-qnn/npu/idl/hexagon_npu.idl @@ -4,6 +4,7 @@ const uint32_t DEVICE_TENSOR_MAX_DIMS = 4; const uint32_t DEVICE_TENSOR_MAX_SRC = 2; +const uint32_t DEVICE_TENSOR_MAX_OP_PARAMS = 4; const uint32_t QUANT_BLOCK_SIZE = 32; const uint32_t QUANT_K_BLOCK_SIZE = 256; const uint32_t QUANT_K_SCALE_SIZE = 12; @@ -38,6 +39,7 @@ interface npu_device : remote_handle64{ NPU_OP_ADD, NPU_OP_SUB, NPU_OP_MUL, + NPU_OP_RMS_NORM, NPU_OP_COUNT }; @@ -55,6 +57,12 @@ interface npu_device : remote_handle64{ tensor_data_type type; }; + struct tensor_update_config { + tensor_op op; + int32_t params[DEVICE_TENSOR_MAX_OP_PARAMS]; + tensor_handle_t src_handles[DEVICE_TENSOR_MAX_SRC]; + }; + struct tensor_config { ne_type ne; uint64_t nb[DEVICE_TENSOR_MAX_DIMS]; @@ -82,15 +90,9 @@ interface npu_device : remote_handle64{ rout tensor_handle_t tensor_handle ); - AEEResult tensor_set_src( + AEEResult tensor_update_params( in tensor_handle_t tensor_handle, - in uint64_t index, - in tensor_handle_t src - ); - - AEEResult tensor_set_op( - in tensor_handle_t tensor_handle, - in tensor_op op + in tensor_update_config config ); AEEResult tensor_free( @@ -106,6 +108,12 @@ interface npu_device : remote_handle64{ in sequence tensor_handles ); + AEEResult graph_set_tensor_with_param( + in graph_handle_t graph_handle, + in sequence tensor_handles, + in sequence tensor_params + ); + AEEResult graph_compute( in graph_handle_t graph_handle ); diff --git a/ggml/src/ggml-qnn/qnn/profiler.cpp b/ggml/src/ggml-qnn/qnn/event_tracer.cpp similarity index 99% rename from ggml/src/ggml-qnn/qnn/profiler.cpp rename to ggml/src/ggml-qnn/qnn/event_tracer.cpp index 5625c3acf7ebb..41bf0ab88eb30 100644 --- a/ggml/src/ggml-qnn/qnn/profiler.cpp +++ b/ggml/src/ggml-qnn/qnn/event_tracer.cpp @@ -1,5 +1,5 @@ -#include "profiler.hpp" +#include "event_tracer.hpp" #include #include diff --git a/ggml/src/ggml-qnn/qnn/event_tracer.hpp b/ggml/src/ggml-qnn/qnn/event_tracer.hpp new file mode 100644 index 0000000000000..3db137ebe167c --- /dev/null +++ b/ggml/src/ggml-qnn/qnn/event_tracer.hpp @@ -0,0 +1,45 @@ +#pragma once + +#include + +#include +#include +#include +#include + +#include "logger.hpp" +#include "profiler.hpp" +#include "qnn-types.hpp" + +namespace qnn { + +// forward declaration of qnn_interface +class qnn_interface; + +class qnn_event_tracer { + public: + // ref: + // 
https://github.com/pytorch/executorch/blob/ae3d558d5e6aa04fc52a3065399fe6a773702f52/backends/qualcomm/serialization/qc_schema.py#L53 + // https://docs.qualcomm.com/bundle/publicresource/topics/80-63442-50/overview.html#supported-snapdragon-devices + enum sdk_profile_level { PROFILE_OFF = 0, PROFILE_BASIC, PROFILE_DETAIL, PROFILE_OP_TRACE }; + + explicit qnn_event_tracer(const std::string & prefix, std::shared_ptr interface, + Qnn_BackendHandle_t backend_handle, sdk_profile_level level); + ~qnn_event_tracer(); + + Qnn_ProfileHandle_t get_handle() const { return _handle; } + + void print_profile_events(); + + private: + std::shared_ptr _interface; + Qnn_ProfileHandle_t _handle = nullptr; + std::string _prefix; + + DISABLE_COPY(qnn_event_tracer); + DISABLE_MOVE(qnn_event_tracer); +}; + +using qnn_event_tracer_ptr = std::shared_ptr; + +} // namespace qnn diff --git a/ggml/src/ggml-qnn/qnn/graph.cpp b/ggml/src/ggml-qnn/qnn/graph.cpp index 3094b5c3bee67..b4dcc7797dfd9 100644 --- a/ggml/src/ggml-qnn/qnn/graph.cpp +++ b/ggml/src/ggml-qnn/qnn/graph.cpp @@ -4,10 +4,10 @@ #include #include +#include "event_tracer.hpp" #include "ggml-impl.h" #include "logger.hpp" #include "op-config.hpp" -#include "profiler.hpp" #include "tensor.hpp" #ifdef GGML_HEXAGON_ENABLE_PERFORMANCE_TRACKING @@ -411,8 +411,8 @@ bool qnn_graph::build_graph_from_ggml_graph(const ggml_cgraph * cgraph) { GGML_TYPE_COUNT > GGML_TYPE_Q8_0 && GGML_TYPE_Q8_0 > GGML_TYPE_F16 && GGML_TYPE_F16 > GGML_TYPE_F32, "GGML_TYPE enum order is not correct"); - QNN_SCOPED_PERFORMANCE_TRACKER("[%s][%s]build_graph_from_ggml_graph", get_backend_name(_device), - _graph_name.c_str()); + SCOPED_PERFORMANCE_TRACKER("[%s][%s]build_graph_from_ggml_graph", get_backend_name(_device), + _graph_name.c_str()); auto override_data_type = get_override_data_type(inputs, outputs); if (override_data_type != GGML_TYPE_COUNT) { @@ -466,8 +466,7 @@ bool qnn_graph::execute(const ggml_cgraph * cgraph, std::shared_ptrqnn_graph_execute(_graph_handle, qnn_tensor_inputs.data(), @@ -529,7 +528,7 @@ bool qnn_graph::execute(const ggml_cgraph * cgraph, std::shared_ptr #include "convert.hpp" +#include "event_tracer.hpp" #include "ggml-qnn.h" #include "op-config.hpp" -#include "profiler.hpp" #include "qnn-lib.hpp" namespace qnn { diff --git a/ggml/src/ggml-qnn/qnn/profiler.hpp b/ggml/src/ggml-qnn/qnn/profiler.hpp deleted file mode 100644 index 0d4f839fda270..0000000000000 --- a/ggml/src/ggml-qnn/qnn/profiler.hpp +++ /dev/null @@ -1,100 +0,0 @@ -#pragma once - -#include - -#include -#include -#include -#include - -#include "logger.hpp" -#include "qnn-types.hpp" - -namespace qnn { - -#ifdef GGML_HEXAGON_ENABLE_PERFORMANCE_TRACKING - -class qnn_scoped_timer { - public: - qnn_scoped_timer(const std::string & log_prefix) : _log_prefix(std::move(log_prefix)) { - _begin_us = ggml_time_us(); - } - - qnn_scoped_timer(qnn_scoped_timer && other) { - _begin_us = other._begin_us; - _log_prefix = std::move(other._log_prefix); - } - - ~qnn_scoped_timer() { print(); } - - void operator=(qnn_scoped_timer && other) { - _begin_us = other._begin_us; - _log_prefix = std::move(other._log_prefix); - } - - void print() const { - auto duration = (ggml_time_us() - _begin_us) / 1000.0; - QNN_LOG_INFO("[profiler]%s, duration: %.4f ms\n", _log_prefix.c_str(), duration); - } - - - private: - int64_t _begin_us = 0LL; - std::string _log_prefix; - - qnn_scoped_timer(const qnn_scoped_timer &) = delete; - void operator=(const qnn_scoped_timer &) = delete; -}; - -inline qnn_scoped_timer make_scope_perf_timer(const char 
* format, ...) { - va_list args; - va_start(args, format); - char buffer[4096]; - vsnprintf(buffer, sizeof(buffer), format, args); - va_end(args); - return qnn_scoped_timer(buffer); -} - -#else - -inline void make_scope_perf_timer(const char *, ...) {} - -#endif - -// forward declaration of qnn_interface -class qnn_interface; - -class qnn_event_tracer { - public: - // ref: - // https://github.com/pytorch/executorch/blob/ae3d558d5e6aa04fc52a3065399fe6a773702f52/backends/qualcomm/serialization/qc_schema.py#L53 - // https://docs.qualcomm.com/bundle/publicresource/topics/80-63442-50/overview.html#supported-snapdragon-devices - enum sdk_profile_level { PROFILE_OFF = 0, PROFILE_BASIC, PROFILE_DETAIL, PROFILE_OP_TRACE }; - - explicit qnn_event_tracer(const std::string & prefix, std::shared_ptr interface, - Qnn_BackendHandle_t backend_handle, sdk_profile_level level); - ~qnn_event_tracer(); - - Qnn_ProfileHandle_t get_handle() const { return _handle; } - - void print_profile_events(); - - private: - std::shared_ptr _interface; - Qnn_ProfileHandle_t _handle = nullptr; - std::string _prefix; - - DISABLE_COPY(qnn_event_tracer); - DISABLE_MOVE(qnn_event_tracer); -}; - -using qnn_event_tracer_ptr = std::shared_ptr; - -} // namespace qnn - -#ifdef GGML_HEXAGON_ENABLE_PERFORMANCE_TRACKING -# define QNN_SCOPED_PERFORMANCE_TRACKER(fmt, ...) \ - auto __qnn_timer_##__LINE__ = qnn::make_scope_perf_timer(fmt, __VA_ARGS__) -#else -# define QNN_SCOPED_PERFORMANCE_TRACKER(fmt, ...) ((void) 0) -#endif diff --git a/ggml/src/ggml-qnn/qnn/qnn-lib.cpp b/ggml/src/ggml-qnn/qnn/qnn-lib.cpp index 474bf53434628..e32bab5f9247d 100644 --- a/ggml/src/ggml-qnn/qnn/qnn-lib.cpp +++ b/ggml/src/ggml-qnn/qnn/qnn-lib.cpp @@ -34,21 +34,36 @@ constexpr const qnn::device_caps kDeviceCaps[] = { { // https://docs.qualcomm.com/bundle/publicresource/topics/80-63442-50/CpuOpDefSupplement.html#matmul kQnnCpuLibName, GGML_BACKEND_DEVICE_TYPE_ACCEL, (1L << GGML_TYPE_I8) | (1L << GGML_TYPE_F32), - 0xFFFFFE, // all quantized types can be offload to CPU, at current implementation, those types will be dequantized into float32 on cpu - 0, // 0 for no limitation +#ifdef GGML_HEXAGON_ENABLE_QUANTIZED_TENSORS + // all quantized types can be offload to CPU, at current implementation, those types will be dequantized into float32 on cpu + 0xFFFFFE, +#else + 0, +#endif + + 0, // 0 for no limitation }, { // https://docs.qualcomm.com/bundle/publicresource/topics/80-63442-50/GpuOpDefSupplement.html#matmul kQnnGpuLibName, GGML_BACKEND_DEVICE_TYPE_GPU, (1L << GGML_TYPE_F32) | (1L << GGML_TYPE_F16), +#ifdef GGML_HEXAGON_ENABLE_QUANTIZED_TENSORS // all quantized types can be offload to GPU, at current implementation, those types will be dequantized into float32 on cpu - 0xFFFFFE, (128256L * 4096 * + 0xFFFFFE, +#else + 0, +#endif + (128256L * 4096 * sizeof(float)), // tested on 8 gen 2, failed to allocate tensor with size 128256x4096 and float32 }, { // https://docs.qualcomm.com/bundle/publicresource/topics/80-63442-50/HtpOpDefSupplement.html#matmul kQnnNpuLibName, GGML_BACKEND_DEVICE_TYPE_ACCEL, +#ifdef GGML_HEXAGON_ENABLE_QUANTIZED_TENSORS (1L << GGML_TYPE_F32) | (1L << GGML_TYPE_F16) | (1L << GGML_TYPE_I16), (1L << GGML_TYPE_Q2_K) | (1L << GGML_TYPE_Q3_K) | (1L << GGML_TYPE_Q4_K) | (1L << GGML_TYPE_Q8_K), +#else + 0, +#endif (8192L * 2048 + 8192 * 512 + 2048 * 512) * sizeof(float), // TODO: should have a better way to get this value }, }; diff --git a/ggml/src/ggml-qnn/shared/common.hpp b/ggml/src/ggml-qnn/shared/common.hpp index 4feb3365ce102..b5e1e5213e8c3 
100644 --- a/ggml/src/ggml-qnn/shared/common.hpp +++ b/ggml/src/ggml-qnn/shared/common.hpp @@ -45,6 +45,10 @@ size_t get_system_free_memory_in_bytes(); class_name(class_name &&) = delete; \ void operator=(class_name &&) = delete +#define DISABLE_COPY_AND_MOVE(class_name) \ + DISABLE_COPY(class_name); \ + DISABLE_MOVE(class_name) + #define LOG_ERROR(...) (GGML_LOG_ERROR(__VA_ARGS__)) #define LOG_WARN(...) (GGML_LOG_WARN(__VA_ARGS__)) #define LOG_INFO(...) (GGML_LOG_INFO(__VA_ARGS__)) diff --git a/ggml/src/ggml-qnn/shared/profiler.hpp b/ggml/src/ggml-qnn/shared/profiler.hpp new file mode 100644 index 0000000000000..7180dc02957bc --- /dev/null +++ b/ggml/src/ggml-qnn/shared/profiler.hpp @@ -0,0 +1,61 @@ +#pragma once + +#include +#include +#include + +#include "common.hpp" +#include "ggml-impl.h" + +namespace profiler { + +#ifdef GGML_HEXAGON_ENABLE_PERFORMANCE_TRACKING + +class scoped_timer { + public: + scoped_timer(const std::string & log_prefix) : _log_prefix(std::move(log_prefix)) { _begin_us = ggml_time_us(); } + + scoped_timer(scoped_timer && other) { + _begin_us = other._begin_us; + _log_prefix = std::move(other._log_prefix); + } + + ~scoped_timer() { print(); } + + void operator=(scoped_timer && other) { + _begin_us = other._begin_us; + _log_prefix = std::move(other._log_prefix); + } + + void print() const { + auto duration = ggml_time_us() - _begin_us; + GGML_LOG_INFO("[profiler]%s, dur: %lld us\n", _log_prefix.c_str(), (long long) duration); + } + + + private: + int64_t _begin_us = 0LL; + std::string _log_prefix; + + DISABLE_COPY(scoped_timer); +}; + +inline scoped_timer make_scope_perf_timer(const char * format, ...) { + va_list args; + va_start(args, format); + char buffer[4096]; + vsnprintf(buffer, sizeof(buffer), format, args); + va_end(args); + return scoped_timer(buffer); +} + +#endif + +} // namespace profiler + +#ifdef GGML_HEXAGON_ENABLE_PERFORMANCE_TRACKING +# define SCOPED_PERFORMANCE_TRACKER(fmt, ...) \ + auto __scoped_timer_##__LINE__ = profiler::make_scope_perf_timer(fmt, __VA_ARGS__) +#else +# define SCOPED_PERFORMANCE_TRACKER(fmt, ...) ((void) 0) +#endif diff --git a/ggml/src/ggml-qnn/shared/rpc-mem.hpp b/ggml/src/ggml-qnn/shared/rpc-mem.hpp index ba8449192b5dd..9552ca9555380 100644 --- a/ggml/src/ggml-qnn/shared/rpc-mem.hpp +++ b/ggml/src/ggml-qnn/shared/rpc-mem.hpp @@ -64,8 +64,10 @@ class rpc_mem { void * buf = nullptr; if (_rpc_interface->is_alloc2_available()) { + LOG_DEBUG("rpcmem_alloc2 available, using it\n"); buf = _rpc_interface->rpcmem_alloc2(heapid, flags, size); } else { + LOG_DEBUG("rpcmem_alloc2 not available, using rpcmem_alloc\n"); buf = _rpc_interface->rpcmem_alloc(heapid, flags, size); }
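+        // rpcmem_alloc2 is only exposed by newer FastRPC runtimes, hence the
+        // runtime capability check above; older devices keep using the legacy
+        // rpcmem_alloc path.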