diff --git a/CMakeLists.txt b/CMakeLists.txt
index de51c0a17b2f6..c5903c112b944 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -7,6 +7,16 @@ set(CMAKE_WARN_UNUSED_CLI YES)
 set(CMAKE_EXPORT_COMPILE_COMMANDS ON)
 
+if(CMAKE_SYSTEM_NAME STREQUAL "Android")
+    set(TARGET_SNAPDRAGON8GEN3 ON)
+    if(TARGET_SNAPDRAGON8GEN3)
+        #verified on Snapdragon 8 Gen 3: 1.5x (45+ tokens/second) to 3x (70+ tokens/second) performance gain with the default ggml backend
+        add_compile_options(-march=armv8.7-a)
+        add_compile_options(-mcpu=cortex-x1)
+        add_compile_options(-mtune=cortex-x1)
+    endif()
+endif()
+
 if (NOT XCODE AND NOT MSVC AND NOT CMAKE_BUILD_TYPE)
     set(CMAKE_BUILD_TYPE Release CACHE STRING "Build type" FORCE)
     set_property(CACHE CMAKE_BUILD_TYPE PROPERTY STRINGS "Debug" "Release" "MinSizeRel" "RelWithDebInfo")
@@ -119,6 +129,7 @@ llama_option_depr(WARNING LLAMA_RPC GGML_RPC)
 llama_option_depr(WARNING LLAMA_SYCL GGML_SYCL)
 llama_option_depr(WARNING LLAMA_SYCL_F16 GGML_SYCL_F16)
 llama_option_depr(WARNING LLAMA_CANN GGML_CANN)
+llama_option_depr(WARNING LLAMA_HEXAGON GGML_HEXAGON)
 
 if (NOT MSVC)
     if (LLAMA_SANITIZE_THREAD)
diff --git a/ggml/CMakeLists.txt b/ggml/CMakeLists.txt
index d33f843b417cf..b5c328911083e 100644
--- a/ggml/CMakeLists.txt
+++ b/ggml/CMakeLists.txt
@@ -204,6 +204,7 @@ option(GGML_OPENCL_EMBED_KERNELS "ggml: embed kernels"
 option(GGML_OPENCL_USE_ADRENO_KERNELS "ggml: use optimized kernels for Adreno" ON)
 set (GGML_OPENCL_TARGET_VERSION "300" CACHE STRING "ggml: OpenCL API version to target")
+option(GGML_HEXAGON "ggml: use HEXAGON" OFF)
 
 # toolchain for vulkan-shaders-gen
 set (GGML_VULKAN_SHADERS_GEN_TOOLCHAIN "" CACHE FILEPATH "ggml: toolchain file for vulkan-shaders-gen")
@@ -269,6 +270,7 @@ set(GGML_PUBLIC_HEADERS
     include/ggml-rpc.h
     include/ggml-sycl.h
     include/ggml-vulkan.h
+    include/ggml-hexagon.h
     include/gguf.h)
 
 set_target_properties(ggml PROPERTIES PUBLIC_HEADER "${GGML_PUBLIC_HEADERS}")
diff --git a/ggml/include/ggml-hexagon.h b/ggml/include/ggml-hexagon.h
new file mode 100644
index 0000000000000..8e37f7da73adf
--- /dev/null
+++ b/ggml/include/ggml-hexagon.h
@@ -0,0 +1,54 @@
+ /*
+ * Copyright (c) 2023-2025 The ggml authors
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+ * IN THE SOFTWARE.
+ */
+#pragma once
+
+#include "ggml.h"
+#include "ggml-backend.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#define GGML_HEXAGON_MAX_DEVICES 3
+#define GGML_HEXAGON_BACKEND_NAME "hexagon"
+
+enum HEXAGONBackend {
+    HEXAGON_BACKEND_QNNCPU = 0,
+    HEXAGON_BACKEND_QNNGPU = 1,
+    HEXAGON_BACKEND_QNNNPU = 2,
+    HEXAGON_BACKEND_CDSP   = 2,
+    HEXAGON_BACKEND_GGML   = 3, //"fake" QNN backend, used to compare performance between the HEXAGON backend and the default ggml backend
+};
+
+GGML_BACKEND_API ggml_backend_t ggml_backend_hexagon_init(size_t dev_num, const char * qnn_lib_path);
+
+GGML_BACKEND_API bool ggml_backend_is_hexagon(ggml_backend_t backend);
+
+GGML_BACKEND_API int ggml_backend_hexagon_get_device_count(void);
+
+GGML_BACKEND_API ggml_backend_reg_t ggml_backend_hexagon_reg(void);
+
+GGML_BACKEND_API const char * ggml_backend_hexagon_get_devname(size_t dev_num);
+
+#ifdef __cplusplus
+}
+#endif
diff --git a/ggml/src/CMakeLists.txt b/ggml/src/CMakeLists.txt
index f00700da71fcd..d3be21ae4b55f 100644
--- a/ggml/src/CMakeLists.txt
+++ b/ggml/src/CMakeLists.txt
@@ -310,6 +310,7 @@ ggml_add_backend(RPC)
 ggml_add_backend(SYCL)
 ggml_add_backend(Vulkan)
 ggml_add_backend(OpenCL)
+ggml_add_backend(HEXAGON)
 
 foreach (target ggml-base ggml)
     target_include_directories(${target} PUBLIC $<BUILD_INTERFACE:${CMAKE_CURRENT_SOURCE_DIR}/../include> $<INSTALL_INTERFACE:include>)
diff --git a/ggml/src/ggml-backend-reg.cpp b/ggml/src/ggml-backend-reg.cpp
index 405d8e31514b5..e2e334c1de002 100644
--- a/ggml/src/ggml-backend-reg.cpp
+++ b/ggml/src/ggml-backend-reg.cpp
@@ -65,6 +65,10 @@
 #include "ggml-kompute.h"
 #endif
 
+#ifdef GGML_USE_HEXAGON
+#include "ggml-hexagon.h"
+#endif
+
 // disable C++17 deprecation warning for std::codecvt_utf8
 #if defined(__clang__)
 # pragma clang diagnostic push
@@ -187,6 +191,9 @@ struct ggml_backend_registry {
 #ifdef GGML_USE_KOMPUTE
         register_backend(ggml_backend_kompute_reg());
 #endif
+#ifdef GGML_USE_HEXAGON
+        register_backend(ggml_backend_hexagon_reg());
+#endif
 #ifdef GGML_USE_CPU
         register_backend(ggml_backend_cpu_reg());
 #endif
@@ -577,6 +584,7 @@ void ggml_backend_load_all_from_path(const char * dir_path) {
     ggml_backend_load_best("vulkan", silent, dir_path);
     ggml_backend_load_best("opencl", silent, dir_path);
     ggml_backend_load_best("musa", silent, dir_path);
+    ggml_backend_load_best("hexagon", silent, dir_path);
     ggml_backend_load_best("cpu", silent, dir_path);
     // check the environment variable GGML_BACKEND_PATH to load an out-of-tree backend
     const char * backend_path = std::getenv("GGML_BACKEND_PATH");
diff --git a/ggml/src/ggml-hexagon/CMakeLists.txt b/ggml/src/ggml-hexagon/CMakeLists.txt
new file mode 100644
index 0000000000000..80186509f76b4
--- /dev/null
+++ b/ggml/src/ggml-hexagon/CMakeLists.txt
@@ -0,0 +1,115 @@
+project(ggml-hexagon)
+message(STATUS "Using HEXAGON backend")
+message("CMAKE_SYSTEM_NAME : ${CMAKE_SYSTEM_NAME}")
+
+set(CMAKE_CXX_STANDARD 20)
+set(CMAKE_CXX_STANDARD_REQUIRED ON)
+
+if(NOT DEFINED QNN_SDK_PATH)
+    message(FATAL_ERROR "QNN_SDK_PATH not defined")
+endif()
+
+if(NOT DEFINED HEXAGON_SDK_PATH)
+    message(FATAL_ERROR "HEXAGON_SDK_PATH not defined")
+endif()
+
+message("QNN_SDK_PATH    : ${QNN_SDK_PATH}")
+message("HEXAGON_SDK_PATH: ${HEXAGON_SDK_PATH}")
+message("HTP_ARCH_VERSION: ${HTP_ARCH_VERSION}")
+
+if (CMAKE_BUILD_TYPE STREQUAL "Debug")
+    set(DEBUG_FLAG "-Wall")
+    message("Debug mode:${DEBUG_FLAG}")
+else()
+    set(DEBUG_FLAG "-DNDEBUG -Wall")
+    message("Release mode:${DEBUG_FLAG}")
+endif()
+
+
+#v68 --- Snapdragon 888
+#v69 --- Snapdragon 8 Gen1
+#v73 --- Snapdragon 8 Gen2
+#v75 --- Snapdragon 8 Gen3
+#v79 --- Snapdragon 8 Elite(aka Gen4)
+if(NOT DEFINED
HTP_ARCH_VERSION)
+    message(FATAL_ERROR "HTP_ARCH_VERSION not defined, valid htp arch: v68,v69,v73,v75,v79")
+endif()
+
+#check whether the user-specified htp arch is valid
+set(CHECK_HTP_ARCH "WRONG")
+foreach (feat v68 v69 v73 v75 v79)
+    if (${feat} STREQUAL ${HTP_ARCH_VERSION})
+        set(CHECK_HTP_ARCH "GOOD")
+    endif()
+endforeach()
+if (${CHECK_HTP_ARCH} STREQUAL "WRONG")
+    message(FATAL_ERROR "the ggml-hexagon backend only supports htp arch v68,v69,v73,v75,v79")
+endif()
+
+#cross-compile the hexagon kernels for the cDSP side
+set(HEXAGON_CC "${HEXAGON_SDK_PATH}/tools/HEXAGON_Tools/8.8.06/Tools/bin/hexagon-clang")
+set(HEXAGON_CXX "${HEXAGON_SDK_PATH}/tools/HEXAGON_Tools/8.8.06/Tools/bin/hexagon-clang")
+set(HEXAGON_TARGET libggmlop_skel${HTP_ARCH_VERSION}.so)
+set(HEXAGON_KERNELS_PATH "${CMAKE_CURRENT_LIST_DIR}/kernels")
+set(HEXAGON_COMPUTE "compute${HTP_ARCH_VERSION}")
+
+if(CMAKE_SYSTEM_NAME STREQUAL "Android")
+    find_library(LOG_LIB log)
+
+    add_library(cdsprpc
+            SHARED
+            IMPORTED)
+    set_target_properties(cdsprpc
+            PROPERTIES
+            IMPORTED_LOCATION
+            ${HEXAGON_SDK_PATH}/ipc/fastrpc/remote/ship/android_aarch64/libcdsprpc.so)
+
+    set(QNN_LINK_LIBRARIES ${LOG_LIB} cdsprpc)
+    set(QNN_DEFAULT_LIB_SEARCH_PATH "/data/local/tmp/" CACHE STRING "customized library search path for QNN backend")
+
+    include_directories(${HEXAGON_SDK_PATH}/incs)
+    include_directories(${HEXAGON_SDK_PATH}/incs/stddef)
+    include_directories(${HEXAGON_SDK_PATH}/ipc/fastrpc/incs)
+    include_directories(${HEXAGON_SDK_PATH}/ipc/fastrpc/rpcmem/inc)
+    include_directories(${HEXAGON_SDK_PATH}/ipc/fastrpc/remote/ship/android_Debug_aarch64)
+    include_directories(${HEXAGON_SDK_PATH}/utils/examples)
+    include_directories(${HEXAGON_SDK_PATH}/ipc/fastrpc/rtld/ship/android_aarch64)
+    include_directories(${HEXAGON_SDK_PATH}/libs/atomic/inc)
+    include_directories(${HEXAGON_SDK_PATH}/libs/atomic/android_Debug_aarch64/ship)
+    include_directories(${CMAKE_SOURCE_DIR}/ggml/src/ggml-hexagon/)
+    include_directories(${CMAKE_SOURCE_DIR}/ggml/src/ggml-hexagon/kernels/)
+elseif(CMAKE_SYSTEM_NAME STREQUAL "Windows")
+    set(QNN_DEFAULT_LIB_SEARCH_PATH "C:\\" CACHE STRING "customized library search path for QNN backend")
+else()
+    message(FATAL_ERROR "ggml-hexagon is currently only available on Android and Windows on ARM(WoA)")
+endif()
+
+set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -DGGML_USE_HEXAGON ${DEBUG_FLAG}")
+set(CMAKE_CXX_FLAGS_RELEASE "${CMAKE_CXX_FLAGS_RELEASE} -O3")
+
+file(GLOB HEXAGON_SOURCES "${CMAKE_CURRENT_LIST_DIR}/*.cpp" "${CMAKE_CURRENT_LIST_DIR}/kernels/ggmlop_ap_skel.c")
+ggml_add_backend_library(ggml-hexagon ${HEXAGON_SOURCES})
+
+target_include_directories(ggml-hexagon PRIVATE ${QNN_SDK_PATH}/include/QNN ${HEXAGON_SDK_PATH} ${CMAKE_CURRENT_LIST_DIR})
+target_link_libraries(ggml-hexagon PRIVATE ${QNN_LINK_LIBRARIES})
+
+string(REGEX REPLACE "/$" "" QNN_DEFAULT_LIB_SEARCH_PATH "${QNN_DEFAULT_LIB_SEARCH_PATH}")
+target_compile_definitions(ggml-hexagon PRIVATE QNN_DEFAULT_LIB_SEARCH_PATH="${QNN_DEFAULT_LIB_SEARCH_PATH}/")
+
+function(ggml_hexagon_build_kernel KNAME)
+    message(STATUS "ggml_hexagon: build hexagon-kernel ${KNAME}")
+
+    add_custom_command(
+        TARGET ${PROJECT_NAME}
+        POST_BUILD
+        COMMAND echo "current working path:`pwd`\n"
+        COMMAND ${HEXAGON_CC} -o ${HEXAGON_KERNELS_PATH}/ggml-dsp.o -c ${HEXAGON_KERNELS_PATH}/ggml-dsp.c -m${HTP_ARCH_VERSION} -Ofast -Wall -Wstrict-prototypes -fno-zero-initialized-in-bss -fdata-sections -fpic ${DEBUG_FLAG} -D__V_DYNAMIC__ -mhvx -mhvx-length=128B -fno-finite-math-only -I${HEXAGON_SDK_PATH}/incs
-I${HEXAGON_SDK_PATH}/libs/qprintf/inc -I${HEXAGON_SDK_PATH}/incs/stddef -I${HEXAGON_SDK_PATH}/ipc/fastrpc/incs -I${HEXAGON_SDK_PATH}/ipc/fastrpc/rpcmem/inc -I${HEXAGON_SDK_PATH}/utils/examples -I${HEXAGON_SDK_PATH}/ipc/fastrpc/rtld/ship/inc -I${HEXAGON_SDK_PATH}/libs/atomic/inc -I${HEXAGON_SDK_PATH}/utils/sim_utils/inc -I${HEXAGON_SDK_PATH}/rtos/qurt/${HEXAGON_COMPUTE}/include/posix -I${HEXAGON_SDK_PATH}/rtos/qurt/${HEXAGON_COMPUTE}/include/qurt/
+        COMMAND ${HEXAGON_CC} -o ${HEXAGON_KERNELS_PATH}/ggmlop_cdsp_skel.o -c ${HEXAGON_KERNELS_PATH}/ggmlop_cdsp_skel.c -m${HTP_ARCH_VERSION} -Ofast -Wall -Wstrict-prototypes -fno-zero-initialized-in-bss -fdata-sections -fpic -D__V_DYNAMIC__ -mhvx -mhvx-length=128B -fno-finite-math-only -I${HEXAGON_SDK_PATH}/incs -I${HEXAGON_SDK_PATH}/libs/qprintf/inc -I${HEXAGON_SDK_PATH}/incs/stddef -I${HEXAGON_SDK_PATH}/ipc/fastrpc/incs -I${HEXAGON_SDK_PATH}/ipc/fastrpc/rpcmem/inc -I${HEXAGON_SDK_PATH}/utils/examples -I${HEXAGON_SDK_PATH}/ipc/fastrpc/rtld/ship/inc -I${HEXAGON_SDK_PATH}/libs/atomic/inc -I${HEXAGON_SDK_PATH}/utils/sim_utils/inc
+        COMMAND ${HEXAGON_CC} -m${HTP_ARCH_VERSION} -Wl,--defsym=ISDB_TRUSTED_FLAG=2 -Wl,--defsym=ISDB_SECURE_FLAG=2 -Wl,--no-threads -fpic -shared -Wl,-Bsymbolic -Wl,--wrap=malloc -Wl,--wrap=calloc -Wl,--wrap=free -Wl,--wrap=realloc -Wl,--wrap=memalign -lc -Wl,-soname=${HEXAGON_TARGET} -o ../../../bin/${HEXAGON_TARGET} -Wl,--start-group ${HEXAGON_KERNELS_PATH}/ggmlop_cdsp_skel.o ${HEXAGON_KERNELS_PATH}/ggml-dsp.o -Wl,--end-group
+        COMMAND ls -l ../../../bin/${HEXAGON_TARGET}
+        COMMAND /bin/cp -fv ../../../bin/${HEXAGON_TARGET} ../../../bin/libggmlop_skel.so
+        COMMENT "build hexagon-kernel"
+    )
+endfunction()
+
+ggml_hexagon_build_kernel("cdsp")
diff --git a/ggml/src/ggml-hexagon/ggml-hexagon.cpp b/ggml/src/ggml-hexagon/ggml-hexagon.cpp
new file mode 100644
index 0000000000000..7ddda008da192
--- /dev/null
+++ b/ggml/src/ggml-hexagon/ggml-hexagon.cpp
@@ -0,0 +1,6522 @@
+/*
+ * Copyright (c) 2023-2025 The ggml authors
+ *
+ * Qualcomm QNN SDK and reference tech guides can be found at:
+ * https://www.qualcomm.com/developer/software/qualcomm-ai-engine-direct-sdk
+ * Qualcomm Hexagon SDK and reference tech guides can be found at:
+ * https://developer.qualcomm.com/software/hexagon-dsp-sdk/tools
+ *
+ * this single-source-file, self-contained implementation of the ggml-hexagon backend has 8 sections:
+ * section-1 forward/prototype declarations, global vars, macros, data structures
+ * section-2 internal troubleshooting functions/classes
+ * section-3 helper functions for WoA(Windows on ARM)
+ * section-4 general helper functions
+ * section-5 QNN helper functions/classes
+ * section-6 implementation of the hwaccel approach through QNN: offload ggml ops to QNN
+ * section-7 cDSP helper functions
+ * section-8 implementation of the ggml-hexagon backend according to the specification of the ggml backend subsystem
+ *
+ * currently provides the following ggml op implementations through QNN:
+ * - GGML_OP_ADD/GGML_OP_SUB/GGML_OP_MUL/GGML_OP_DIV/GGML_OP_LOG/GGML_OP_SQRT:
+ *   a simple hwaccel skeleton that can be expanded to other ggml ops
+ * - GGML_OP_MUL_MAT:
+ *   a more complex hwaccel skeleton that can be expanded to other ggml ops accordingly
+ *
+ * currently provides the following ggml op implementations through cDSP in the hexagon-kernels:
+ * - GGML_OP_ADD & GGML_OP_MUL_MAT:
+ *   a hwaccel skeleton that can be expanded to other ggml ops accordingly
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and
associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS + * IN THE SOFTWARE. + */ +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#if defined(__ANDROID__) || defined(__linux__) +#include +#include +#include +#include +#include +#include +#include +#endif + +#if !defined(__ANDROID__) && !defined(__linux__) +#include +#include +#include +#endif + +#if defined(__ANDROID__) +#include "android/log.h" + +#include "rpcmem.h" +#include "remote.h" +#include "os_defines.h" +#include "domain.h" +#include "AEEStdErr.h" +#include "HAP_power.h" +#include "HAP_farf.h" +#endif + +#include "QnnTypes.h" +#include "QnnCommon.h" +#include "QnnContext.h" +#include "QnnBackend.h" +#include "QnnGraph.h" +#include "QnnProperty.h" +#include "QnnTensor.h" +#include "QnnInterface.h" +#include "Saver/QnnSaver.h" +#include "System/QnnSystemInterface.h" +#include "HTP/QnnHtpDevice.h" +#include "HTP/QnnHtpGraph.h" + +#include "ggml-hexagon.h" +#include "ggml-impl.h" +#include "ggml-backend-impl.h" + +#include "kernels/ggmlop_ap_skel.h" + +// ================================================================================================= +// section-1: forward/prototype declaration, global vars, macros, data structures +// ================================================================================================= +class qnn_instance; +class hexagon_profiler; +struct ggml_backend_hexagon_context; + +#ifdef NDEBUG +#define GGMLHEXAGON_DEBUG 0 +#else +#define GGMLHEXAGON_DEBUG 1 +#endif + +#define GGMLHEXAGON_LOGBUF_LEN 4096 +#define GGMLHEXAGON_TMPBUF_LEN 256 + +#define GGMLHEXAGON_LOG_ERROR(...) ggmlhexagon_log_internal(GGML_LOG_LEVEL_ERROR, __FILE__, __FUNCTION__, __LINE__, __VA_ARGS__) +#define GGMLHEXAGON_LOG_WARN(...) ggmlhexagon_log_internal(GGML_LOG_LEVEL_WARN , __FILE__, __FUNCTION__, __LINE__, __VA_ARGS__) +#define GGMLHEXAGON_LOG_INFO(...) ggmlhexagon_log_internal(GGML_LOG_LEVEL_INFO , __FILE__, __FUNCTION__, __LINE__, __VA_ARGS__) +#define GGMLHEXAGON_LOG_VERBOSE(...) ggmlhexagon_log_internal(GGML_LOG_LEVEL_CONT , __FILE__, __FUNCTION__, __LINE__, __VA_ARGS__) + +#if GGMLHEXAGON_DEBUG +#define GGMLHEXAGON_LOG_DEBUG(...) ggmlhexagon_log_internal(GGML_LOG_LEVEL_DEBUG, __FILE__, __FUNCTION__, __LINE__, __VA_ARGS__) +#else +#define GGMLHEXAGON_LOG_DEBUG(...) 
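+//usage sketch (illustrative): all of the GGMLHEXAGON_LOG_* macros capture
+//__FILE__/__FUNCTION__/__LINE__ automatically, e.g.
+//    GGMLHEXAGON_LOG_DEBUG("src0 type %s", ggml_type_name(src0->type));
+//and GGMLHEXAGON_LOG_DEBUG expands to nothing in NDEBUG builds, so debug
+//logging adds zero overhead to release builds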
+#endif + +#define QNN_VER_PTR(x) (&((x).v1)) +#define RPCMEM_DEFAULT_FLAGS 1 +#define RPCMEM_HEAP_ID_SYSTEM 25 +#define SIZE_IN_MB (1 << 20) +#define STATUS_CONTEXT 0x12345678 + +#if !defined (_WINDOWS) +#pragma weak remote_system_request +#endif + +#define CHECK_QNN_API(error, result) \ + do { \ + error = (result); \ + if (QNN_SUCCESS != error) { \ + if (error == QNN_COMMON_ERROR_NOT_SUPPORTED) { \ + GGMLHEXAGON_LOG_WARN("WARNING: QNN feature/API not supported\n"); \ + } else { \ + GGMLHEXAGON_LOG_INFO("QNN API error = %d(%s)\n", error, ggmlqnn_get_qnnerror_string(error)); \ + } \ + } \ + } while (0) + +#define GGMLQNN_CHECK_PARAMS(ctx, src0, src1, dst) \ + do { \ + if (g_hexagon_appcfg.hwaccel_approach != HWACCEL_CDSP) { \ + if (!ggmlqnn_is_valid_params((ctx), (src0), (src1), (dst))) { \ + return; \ + } \ + } \ + } while (0) \ + +// ================================================================================================= +// section-1: data type, data structure, global vars +// ================================================================================================= +using pfn_rpc_mem_init = void (*)(void); +using pfn_rpc_mem_deinit = void (*)(void); +using pfn_rpc_mem_alloc = void *(*)(int, uint32_t, int); +using pfn_rpc_mem_free = void (*)(void *); +using pfn_rpc_mem_to_fd = int (*)(void *); +using _pfn_QnnSaver_initialize = decltype(QnnSaver_initialize); +using _pfn_QnnInterface_getProviders = decltype(QnnInterface_getProviders); +using _pfn_QnnSystemInterface_getProviders = decltype(QnnSystemInterface_getProviders); + +//QNN resource management for the general approach through QNN +using qnn_tensors_t = std::vector< Qnn_Tensor_t >; +using qnn_ptensors_t = std::vector< Qnn_Tensor_t *>; +using qnn_singlenode_res_t = std::tuple; + +typedef void (* ggmlqnn_op_func_t)(ggml_backend_hexagon_context * ctx, ggml_tensor * op); +typedef int (* notify_callback_fn)(void * context, int domain, int session, remote_rpc_status_flags_t status); +typedef int (* ggmlhexagon_op_func_t)(remote_handle64 handle, const dsptensor * src0, const dsptensor * src1, dsptensor * dst); + +enum qnn_index_type { + QNN_TENSOR_INDEX = 0, + QNN_OPCFG_INDEX = 1, +}; + +enum qnn_profile_level { + PROFILE_OFF = 0, + PROFILE_BASIC = 1, + PROFILE_DETAIL = 2, +}; + +//0: general approach through QNN:offload ggmlop to QNN +//1: special approach through QNN-SINGLEGRAPH:mapping entire ggml cgraph to a single QNN graph +//2: general approach through Hexagon cDSP:offload ggmlop to Hexagon cDSP directly +enum hwaccel_approach_type { + HWACCEL_QNN = 0, + HWACCEL_QNN_SINGLEGRAPH = 1, + HWACCEL_CDSP = 2, +}; + +enum hexagon_dsp_type { + HEXAGON_ADSP = 0, + HEXAGON_MDSP = 1, + HEXAGON_SDSP = 2, + HEXAGON_CDSP = 3, + HEXAGON_CDSP1 = 4, +}; + +enum qcom_htp_arch { + NONE = 0, + V68 = 68, + V69 = 69, + V73 = 73, + V75 = 75, + V79 = 79, +}; + +enum qcom_chipset_soc_model { + UNKNOWN_SM = 0, + SM7450 = 41, // v69, 7 Gen1 + SM8350 = 30, // v68, 888 + SM8450 = 36, // v69, SD 8 Gen 1 + SM8475 = 42, // v69, SD 8+ Gen 1 + SM8550 = 43, // v73, SD 8 Gen 2 + SM8650 = 57, // v75, SD 8 Gen 3 + SM8750 = 69, // v79, SD 8 Elite(aka 8 Gen 4) +#if !defined(__ANDROID__) && !defined(__linux__) + SC7280X = 44, + SC8280X = 37, + SC8380XP = 60, +#endif +}; + +struct qcom_socinfo { + uint32_t soc_model; + size_t htp_arch; + size_t vtcm_size_in_mb; + char soc_desc[GGML_MAX_NAME]; +}; + +struct ggml_backend_hexagon_context { + int device; + char name[GGML_MAX_NAME]; + char desc[GGML_MAX_NAME]; + char lib[GGML_MAX_NAME]; + qnn_instance * instance; 
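+    //the ggml backend object for this device, registered with the ggml runtime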
+    struct ggml_backend * backend;
+    QNN_INTERFACE_VER_TYPE raw_interface;
+    QNN_SYSTEM_INTERFACE_VER_TYPE raw_system_interface;
+    struct qcom_socinfo socinfo;
+
+    //QNN resource management for the general approach through QNN
+    std::map<std::string, qnn_singlenode_res_t> qnn_singlenode_graph_map;
+
+    //quantize data -> fp32
+    std::unique_ptr<char[]> work_data;
+    std::vector<std::future<void>> tasks;
+    size_t work_size;
+    size_t desired_size;
+    int n_threads;
+
+    //Hexagon resource management for the general approach through Hexagon cDSP
+    size_t rpc_mempool_capacity;
+    size_t rpc_mempool_len;
+    size_t rpc_mempool_usage;
+    void * rpc_mempool;
+    int rpc_mempool_handle;
+    remote_handle64 ggmlop_handle;
+    int domain_id;
+};
+
+struct qnn_op_caps {
+    bool supported;
+    ggml_op op;
+    const size_t input_param_count;
+    const char * qnn_op_name;
+};
+
+struct hexagon_op_caps {
+    bool supported;
+    ggml_op op;
+    const size_t input_param_count;
+    const char * hexagon_op_name;
+    ggmlhexagon_op_func_t dsp_op_func;
+};
+
+struct hexagon_appcfg_t {
+    int print_qnn_internal_log;  // enable/disable QNN's internal log
+    int enable_perf;             // enable/disable perf of a specified ggml op
+    int enable_profiler;         // enable/disable the profiler feature used to visualize the comparison between HWACCEL_CDSP and HWACCEL_QNN
+    int print_tensors_info;      // enable/disable printing tensor info in op functions
+    int dump_op_info;            // enable/disable dumping op info in handle_op
+    int enable_q_mulmat;         // enable/disable offloading quantized mulmat
+    int precision_mode;          // 0: default 1:fp16
+    int hvx_threads;
+    int vtcm_size_in_mb;
+    int enable_dlbc;
+    int hwaccel_approach;        // 0: HWACCEL_QNN 1: HWACCEL_QNN_SINGLEGRAPH 2: HWACCEL_CDSP
+    int hexagon_backend;         // 0: HEXAGON_BACKEND_QNNCPU 1: HEXAGON_BACKEND_QNNGPU 2: HEXAGON_BACKEND_QNNNPU / HEXAGON_BACKEND_CDSP
+    int enable_rpc_ion_mempool;  // enable/disable the rpc ion memory pool
+    int enable_all_q_mulmat;     // enable/disable offloading mulmat for all quantized types to cDSP
+    int profiler_duration;       // profiler duration threshold, in seconds
+    int profiler_counts;         // profiler frame-count threshold
+    int thread_counts;           // thread count on the cDSP side
+    const char * cfgfilename;
+    const char * runtime_libpath;
+    char ggml_hexagon_version[GGMLHEXAGON_TMPBUF_LEN];
+    char ggml_dsp_version[GGMLHEXAGON_TMPBUF_LEN];
+};
+
+static struct hexagon_appcfg_t g_hexagon_appcfg = {
+        .print_qnn_internal_log = 0,
+        .enable_perf            = 1,
+        .enable_profiler        = 0,
+        .print_tensors_info     = 0,
+        .dump_op_info           = 0,
+        .enable_q_mulmat        = 0,
+        .precision_mode         = 0,
+        .hvx_threads            = 4,
+        .vtcm_size_in_mb        = 8,
+        .enable_dlbc            = 1,
+        .hwaccel_approach       = HWACCEL_CDSP,
+        .hexagon_backend        = HEXAGON_BACKEND_CDSP,
+        .enable_rpc_ion_mempool = 0,
+        .enable_all_q_mulmat    = 0,
+        .profiler_duration      = 5,
+        .profiler_counts        = 100,
+        .thread_counts          = 4,
+        .cfgfilename            = "ggml-hexagon.cfg",
+#if defined(__ANDROID__)
+//Android command line program
+        .runtime_libpath        = "/data/local/tmp/",
+#elif defined(__linux__)
+        .runtime_libpath        = "/tmp/",
+#elif defined(_WIN32)
+        .runtime_libpath        = "C:\\",
+#endif
+        .ggml_hexagon_version   = {"1.04"},
+        .ggml_dsp_version       = {"0.61"},
+};
+
+//file:///opt/qcom/aistack/qairt/2.31.0.250130/docs/QNN/general/overview.html#tbl-supported-snapdragon-devices
+static struct qcom_socinfo g_qnn_soc_info_table[] = {
+        /* Qualcomm SnapDragon 7 Gen 1 */
+        {
+                .soc_model       = SM7450,
+                .htp_arch        = V69,
+                .vtcm_size_in_mb = 8,
+                .soc_desc        = "Qualcomm SnapDragon 7 Gen 1"},
+
+        /* Qualcomm SnapDragon 888 */
+        {
+                .soc_model       = SM8350,
+                .htp_arch        = V68,
+                .vtcm_size_in_mb = 8,
+                .soc_desc        = "Qualcomm
SnapDragon 888 "}, + + /* Qualcomm SnapDragon 8 Gen 1 */ + { + .soc_model = SM8450, + .htp_arch = V69, + .vtcm_size_in_mb = 8, + .soc_desc = "Qualcomm SnapDragon 8 Gen 1"}, + + /* Qualcomm SnapDragon 8 Gen 1+ */ + { + .soc_model = SM8475, + .htp_arch = V69, + .vtcm_size_in_mb = 8, + .soc_desc = "Qualcomm SnapDragon 8 Gen 1+"}, + + /* Qualcomm SnapDragon 8 Gen 2 */ + { + .soc_model = SM8550, + .htp_arch = V73, + .vtcm_size_in_mb = 8, + .soc_desc = "Qualcomm SnapDragon 8 Gen 2"}, + + /* Qualcomm SnapDragon 8 Gen 3 */ + { + .soc_model = SM8650, + .htp_arch = V75, + .vtcm_size_in_mb = 8, + .soc_desc = "Qualcomm SnapDragon 8 Gen 3 "}, + + /* Qualcomm SnapDragon 8 Gen 4 */ + { + .soc_model = SM8750, + .htp_arch = V79, + .vtcm_size_in_mb = 8, + .soc_desc = "Qualcomm SnapDragon 8 Elite(aka 8 Gen 4)"}, + +#if !defined(__ANDROID__) && !defined(__linux__) + /* Qualcomm SnapDragon 7c Gen 2 */ + { + .soc_model = SC7280X, + .htp_arch = V68, + .vtcm_size_in_mb = 8, + .soc_desc = "Qualcomm SnapDragon 7c Gen 2"}, + + /* Qualcomm SnapDragon 8cx Gen 3 */ + { + .soc_model = SC8280X, + .htp_arch = V68, + .vtcm_size_in_mb = 8, + .soc_desc = "Qualcomm SnapDragon 8cx Gen 3"}, + + /* Qualcomm SnapDragon 8cx Gen 4 */ + { + .soc_model = SC8380XP, + .htp_arch = V73, + .vtcm_size_in_mb = 8, + .soc_desc = "Qualcomm SnapDragon 8cx Gen 4"}, +#endif + +}; + +// file:///opt/qcom/aistack/qairt/2.31.0.250130/docs/QNN/general/quantization.html +// CPU - Choose a non-quantized model.Quantized models are currently incompatible with the CPU backend +// GPU - Choose a non-quantized model.Quantized models are currently incompatible with the GPU backend +// HTP - Choose a quantized model. Quantized models are required when running on the HTP backend +// DSP - Choose a quantized model. Quantized models are required when running on the DSP backend +// HTA - Choose a quantized model. 
Quantized models are required when running on the HTA backend +static struct ggml_backend_hexagon_context g_hexagon_mgr[GGML_HEXAGON_MAX_DEVICES] = { + { .device = 0, + .name = "qnn-cpu", + .desc = "Qualcomm Kryo CPU", +#if !defined(__ANDROID__) && !defined(__linux__) + .lib = "QnnCpu.dll", +#else + .lib = "libQnnCpu.so", +#endif + .instance = nullptr, + .backend = nullptr, + .raw_interface = {}, + .raw_system_interface = {}, + .socinfo = {}}, + + { .device = 1, + .name = "qnn-gpu", + .desc = "Qualcomm Adreno GPU", +#if !defined(__ANDROID__) && !defined(__linux__) + .lib = "QnnGpu.dll", +#else + .lib = "libQnnGpu.so", +#endif + .instance = nullptr, + .backend = nullptr, + .raw_interface = {}, + .raw_system_interface = {}, + .socinfo = {}}, + + { .device = 2, + .name = "qnn-npu", + .desc = "Qualcomm NPU(Hexagon Tensor Processor)", +#if !defined(__ANDROID__) && !defined(__linux__) + .lib = "QnnHtp.dll", +#else + .lib = "libQnnHtp.so", +#endif + .instance = nullptr, + .backend = nullptr, + .raw_interface = {}, + .raw_system_interface = {}, + .socinfo = {}}, +}; + +static domain hexagon_supported_domains[] = { + {ADSP_DOMAIN_ID, ADSP_DOMAIN}, + {MDSP_DOMAIN_ID, MDSP_DOMAIN}, + {SDSP_DOMAIN_ID, SDSP_DOMAIN}, + {CDSP_DOMAIN_ID, CDSP_DOMAIN}, + {CDSP1_DOMAIN_ID, CDSP1_DOMAIN} +}; + +//supported ggml op by HWACCEL_QNN +static constexpr const qnn_op_caps ggmlqnn_k_op_caps[] = { + {true, GGML_OP_NONE, 0, nullptr}, + {false, GGML_OP_DUP, 0, nullptr}, + {true, GGML_OP_ADD, 2, QNN_OP_ELEMENT_WISE_ADD}, + {false, GGML_OP_ADD1, 0, nullptr}, + {false, GGML_OP_ACC, 0, nullptr}, + {true, GGML_OP_SUB, 2, QNN_OP_ELEMENT_WISE_SUBTRACT}, + {true, GGML_OP_MUL, 2, QNN_OP_ELEMENT_WISE_MULTIPLY}, + {true, GGML_OP_DIV, 2, QNN_OP_ELEMENT_WISE_DIVIDE}, + {false, GGML_OP_SQR, 0, nullptr}, + {true, GGML_OP_SQRT, 1, QNN_OP_ELEMENT_WISE_SQUARE_ROOT}, + {true, GGML_OP_LOG, 1, QNN_OP_ELEMENT_WISE_LOG}, + {false, GGML_OP_SIN, 0, nullptr}, + {false, GGML_OP_COS, 0, nullptr}, + {false, GGML_OP_SUM, 0, nullptr}, + {false, GGML_OP_SUM_ROWS, 0, nullptr}, + {false, GGML_OP_MEAN, 0, nullptr}, + {false, GGML_OP_ARGMAX, 0, nullptr}, + {false, GGML_OP_COUNT_EQUAL, 0, nullptr}, + {false, GGML_OP_REPEAT, 0, nullptr}, + {false, GGML_OP_REPEAT_BACK, 0, nullptr}, + {false, GGML_OP_CONCAT, 0, nullptr}, + {false, GGML_OP_SILU_BACK, 0, nullptr}, + {false, GGML_OP_NORM, 0, nullptr}, + {false, GGML_OP_RMS_NORM, 0, nullptr}, + {false, GGML_OP_RMS_NORM_BACK, 0, nullptr}, + {false, GGML_OP_GROUP_NORM, 0, nullptr}, + {false, GGML_OP_L2_NORM, 0, nullptr}, + {true, GGML_OP_MUL_MAT, 2, QNN_OP_MAT_MUL}, + {false, GGML_OP_MUL_MAT_ID, 0, nullptr}, + {false, GGML_OP_OUT_PROD, 0, nullptr}, + {false, GGML_OP_SCALE, 0, nullptr}, + {false, GGML_OP_SET, 0, nullptr}, + {false, GGML_OP_CPY, 0, nullptr}, + {false, GGML_OP_CONT, 0, nullptr}, + {false, GGML_OP_RESHAPE, 0, nullptr}, + {false, GGML_OP_VIEW, 0, nullptr}, + {false, GGML_OP_PERMUTE, 0, nullptr}, + {false, GGML_OP_TRANSPOSE, 0, nullptr}, + {false, GGML_OP_GET_ROWS, 0, nullptr}, + {false, GGML_OP_GET_ROWS_BACK, 0, nullptr}, + {false, GGML_OP_DIAG, 0, nullptr}, + {false, GGML_OP_DIAG_MASK_INF, 0, nullptr}, + {false, GGML_OP_DIAG_MASK_ZERO, 0, nullptr}, + {false, GGML_OP_SOFT_MAX, 0, nullptr}, + {false, GGML_OP_SOFT_MAX_BACK, 0, nullptr}, + {false, GGML_OP_ROPE, 0, nullptr}, + {false, GGML_OP_ROPE_BACK, 0, nullptr}, + {false, GGML_OP_CLAMP, 0, nullptr}, + {false, GGML_OP_CONV_TRANSPOSE_1D, 0, nullptr}, + {false, GGML_OP_IM2COL, 0, nullptr}, + {false, GGML_OP_IM2COL_BACK, 0, nullptr}, + {false, 
GGML_OP_CONV_TRANSPOSE_2D, 0, nullptr}, + {false, GGML_OP_POOL_1D, 0, nullptr}, + {false, GGML_OP_POOL_2D, 0, nullptr}, + {false, GGML_OP_POOL_2D_BACK, 0, nullptr}, + {false, GGML_OP_UPSCALE, 0, nullptr}, + {false, GGML_OP_PAD, 0, nullptr}, + {false, GGML_OP_PAD_REFLECT_1D, 0, nullptr}, + {false, GGML_OP_ARANGE, 0, nullptr}, + {false, GGML_OP_TIMESTEP_EMBEDDING, 0, nullptr}, + {false, GGML_OP_ARGSORT, 0, nullptr}, + {false, GGML_OP_LEAKY_RELU, 0, nullptr}, + {false, GGML_OP_FLASH_ATTN_EXT, 0, nullptr}, + {false, GGML_OP_FLASH_ATTN_BACK, 0, nullptr}, + {false, GGML_OP_SSM_CONV, 0, nullptr}, + {false, GGML_OP_SSM_SCAN, 0, nullptr}, + {false, GGML_OP_WIN_PART, 0, nullptr}, + {false, GGML_OP_WIN_UNPART, 0, nullptr}, + {false, GGML_OP_GET_REL_POS, 0, nullptr}, + {false, GGML_OP_ADD_REL_POS, 0, nullptr}, + {false, GGML_OP_RWKV_WKV6, 0, nullptr}, + {false, GGML_OP_GATED_LINEAR_ATTN, 0, nullptr}, + {false, GGML_OP_RWKV_WKV7, 0, nullptr}, + {false, GGML_OP_UNARY, 0, nullptr}, + {false, GGML_OP_MAP_CUSTOM1, 0, nullptr}, + {false, GGML_OP_MAP_CUSTOM2, 0, nullptr}, + {false, GGML_OP_MAP_CUSTOM3, 0, nullptr}, + {false, GGML_OP_CUSTOM, 0, nullptr}, + {false, GGML_OP_CROSS_ENTROPY_LOSS, 0, nullptr}, + {false, GGML_OP_CROSS_ENTROPY_LOSS_BACK, 0, nullptr}, + {false, GGML_OP_OPT_STEP_ADAMW, 0, nullptr}, + {false, static_cast(GGML_UNARY_OP_ABS), 0, nullptr}, + {false, static_cast(GGML_UNARY_OP_SGN), 0, nullptr}, + {false, static_cast(GGML_UNARY_OP_NEG), 0, nullptr}, + {false, static_cast(GGML_UNARY_OP_STEP), 0, nullptr}, + {false, static_cast(GGML_UNARY_OP_TANH), 0, nullptr}, + {false, static_cast(GGML_UNARY_OP_ELU), 0, nullptr}, + {false, static_cast(GGML_UNARY_OP_RELU), 0, nullptr}, + {false, static_cast(GGML_UNARY_OP_SIGMOID), 0, nullptr}, + {false, static_cast(GGML_UNARY_OP_GELU), 0, nullptr}, + {false, static_cast(GGML_UNARY_OP_GELU_QUICK), 0, nullptr}, + {false, static_cast(GGML_UNARY_OP_SILU), 0, nullptr}, + {false, static_cast(GGML_UNARY_OP_HARDSWISH), 0, nullptr}, + {false, static_cast(GGML_UNARY_OP_HARDSIGMOID), 0, nullptr}, + {false, static_cast(GGML_UNARY_OP_EXP), 0, nullptr} +}; + +static_assert(ggmlqnn_k_op_caps[GGML_OP_NONE].supported, "GGML_OP_NONE is not true"); +static_assert(ggmlqnn_k_op_caps[GGML_OP_ADD].supported, "GGML_OP_ADD is not true"); +static_assert(ggmlqnn_k_op_caps[GGML_OP_MUL].supported, "GGML_OP_MUL is not true"); +static_assert(ggmlqnn_k_op_caps[GGML_OP_MUL_MAT].supported, "GGML_OP_MUL_MAT is not true"); +static_assert(std::size(ggmlqnn_k_op_caps) == (static_cast(GGML_OP_COUNT) + static_cast(GGML_UNARY_OP_COUNT)), + "pls check ggmlqnn_k_op_caps and ensure is corresponding to latest ggml.h"); + +//supported ggml op by HWACCEL_CDSP +static constexpr const hexagon_op_caps ggmlhexagon_k_op_caps[] = { + {true, GGML_OP_NONE, 0, nullptr, nullptr}, + {false, GGML_OP_DUP, 0, nullptr, nullptr}, + {true, GGML_OP_ADD, 2, "ggmlop_dsp_add", ggmlop_dsp_add}, + {false, GGML_OP_ADD1, 0, nullptr, nullptr}, + {false, GGML_OP_ACC, 0, nullptr, nullptr}, + {false, GGML_OP_SUB, 2, nullptr, nullptr}, + {false, GGML_OP_MUL, 2, nullptr, nullptr}, + {false, GGML_OP_DIV, 2, nullptr, nullptr}, + {false, GGML_OP_SQR, 0, nullptr, nullptr}, + {false, GGML_OP_SQRT, 0, nullptr, nullptr}, + {false, GGML_OP_LOG, 0, nullptr, nullptr}, + {false, GGML_OP_SIN, 0, nullptr, nullptr}, + {false, GGML_OP_COS, 0, nullptr, nullptr}, + {false, GGML_OP_SUM, 0, nullptr, nullptr}, + {false, GGML_OP_SUM_ROWS, 0, nullptr, nullptr}, + {false, GGML_OP_MEAN, 0, nullptr, nullptr}, + {false, GGML_OP_ARGMAX, 0, nullptr, nullptr}, + 
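+    //NOTE (illustrative annotation): rows must stay in the exact order of the
+    //ggml_op enum, with unary ops appended at the end, because the table is
+    //indexed directly by op value; the static_asserts after each table guard
+    //the total size and a few key entries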
{false, GGML_OP_COUNT_EQUAL, 0, nullptr, nullptr}, + {false, GGML_OP_REPEAT, 0, nullptr, nullptr}, + {false, GGML_OP_REPEAT_BACK, 0, nullptr, nullptr}, + {false, GGML_OP_CONCAT, 0, nullptr, nullptr}, + {false, GGML_OP_SILU_BACK, 0, nullptr, nullptr}, + {false, GGML_OP_NORM, 0, nullptr, nullptr}, + {true, GGML_OP_RMS_NORM, 1, "ggmlop_dsp_rmsnorm", ggmlop_dsp_rmsnorm}, + {false, GGML_OP_RMS_NORM_BACK, 0, nullptr, nullptr}, + {false, GGML_OP_GROUP_NORM, 0, nullptr, nullptr}, + {false, GGML_OP_L2_NORM, 0, nullptr, nullptr}, + {true, GGML_OP_MUL_MAT, 2, "ggmlop_dsp_mulmat", ggmlop_dsp_mulmat}, + {false, GGML_OP_MUL_MAT_ID, 0, nullptr, nullptr}, + {false, GGML_OP_OUT_PROD, 0, nullptr, nullptr}, + {false, GGML_OP_SCALE, 0, nullptr, nullptr}, + {false, GGML_OP_SET, 0, nullptr, nullptr}, + {false, GGML_OP_CPY, 0, nullptr, nullptr}, + {false, GGML_OP_CONT, 0, nullptr, nullptr}, + {false, GGML_OP_RESHAPE, 0, nullptr, nullptr}, + {false, GGML_OP_VIEW, 0, nullptr, nullptr}, + {false, GGML_OP_PERMUTE, 0, nullptr, nullptr}, + {false, GGML_OP_TRANSPOSE, 0, nullptr, nullptr}, + {false, GGML_OP_GET_ROWS, 0, nullptr, nullptr}, + {false, GGML_OP_GET_ROWS_BACK, 0, nullptr, nullptr}, + {false, GGML_OP_DIAG, 0, nullptr, nullptr}, + {false, GGML_OP_DIAG_MASK_INF, 0, nullptr, nullptr}, + {false, GGML_OP_DIAG_MASK_ZERO, 0, nullptr, nullptr}, + {true, GGML_OP_SOFT_MAX, 1, "ggmlop_dsp_softmax", ggmlop_dsp_softmax}, + {false, GGML_OP_SOFT_MAX_BACK, 0, nullptr, nullptr}, + {false, GGML_OP_ROPE, 0, nullptr, nullptr}, + {false, GGML_OP_ROPE_BACK, 0, nullptr, nullptr}, + {false, GGML_OP_CLAMP, 0, nullptr, nullptr}, + {false, GGML_OP_CONV_TRANSPOSE_1D, 0, nullptr, nullptr}, + {false, GGML_OP_IM2COL, 0, nullptr, nullptr}, + {false, GGML_OP_IM2COL_BACK, 0, nullptr, nullptr}, + {false, GGML_OP_CONV_TRANSPOSE_2D, 0, nullptr, nullptr}, + {false, GGML_OP_POOL_1D, 0, nullptr, nullptr}, + {true, GGML_OP_POOL_2D, 1, "ggmlop_dsp_pool2d", ggmlop_dsp_pool2d}, + {false, GGML_OP_POOL_2D_BACK, 0, nullptr, nullptr}, + {false, GGML_OP_UPSCALE, 0, nullptr, nullptr}, + {false, GGML_OP_PAD, 0, nullptr, nullptr}, + {false, GGML_OP_PAD_REFLECT_1D, 0, nullptr, nullptr}, + {false, GGML_OP_ARANGE, 0, nullptr, nullptr}, + {false, GGML_OP_TIMESTEP_EMBEDDING, 0, nullptr, nullptr}, + {false, GGML_OP_ARGSORT, 0, nullptr, nullptr}, + {false, GGML_OP_LEAKY_RELU, 0, nullptr, nullptr}, + {false, GGML_OP_FLASH_ATTN_EXT, 0, nullptr, nullptr}, + {false, GGML_OP_FLASH_ATTN_BACK, 0, nullptr, nullptr}, + {false, GGML_OP_SSM_CONV, 0, nullptr, nullptr}, + {false, GGML_OP_SSM_SCAN, 0, nullptr, nullptr}, + {false, GGML_OP_WIN_PART, 0, nullptr, nullptr}, + {false, GGML_OP_WIN_UNPART, 0, nullptr, nullptr}, + {false, GGML_OP_GET_REL_POS, 0, nullptr, nullptr}, + {false, GGML_OP_ADD_REL_POS, 0, nullptr, nullptr}, + {false, GGML_OP_RWKV_WKV6, 0, nullptr, nullptr}, + {false, GGML_OP_GATED_LINEAR_ATTN, 0, nullptr, nullptr}, + {false, GGML_OP_RWKV_WKV7, 0, nullptr, nullptr}, + {false, GGML_OP_UNARY, 0, nullptr, nullptr}, + {false, GGML_OP_MAP_CUSTOM1, 0, nullptr, nullptr}, + {false, GGML_OP_MAP_CUSTOM2, 0, nullptr, nullptr}, + {false, GGML_OP_MAP_CUSTOM3, 0, nullptr, nullptr}, + {false, GGML_OP_CUSTOM, 0, nullptr, nullptr}, + {false, GGML_OP_CROSS_ENTROPY_LOSS, 0, nullptr, nullptr}, + {false, GGML_OP_CROSS_ENTROPY_LOSS_BACK, 0, nullptr, nullptr}, + {false, GGML_OP_OPT_STEP_ADAMW, 0, nullptr, nullptr}, + {false, static_cast(GGML_UNARY_OP_ABS), 0, nullptr, nullptr}, + {false, static_cast(GGML_UNARY_OP_SGN), 0, nullptr, nullptr}, + {false, static_cast(GGML_UNARY_OP_NEG), 0, 
nullptr, nullptr}, + {false, static_cast(GGML_UNARY_OP_STEP), 0, nullptr, nullptr}, + {false, static_cast(GGML_UNARY_OP_TANH), 0, nullptr, nullptr}, + {false, static_cast(GGML_UNARY_OP_ELU), 0, nullptr, nullptr}, + {false, static_cast(GGML_UNARY_OP_RELU), 0, nullptr, nullptr}, + {false, static_cast(GGML_UNARY_OP_SIGMOID), 0, nullptr, nullptr}, + {false, static_cast(GGML_UNARY_OP_GELU), 0, nullptr, nullptr}, + {false, static_cast(GGML_UNARY_OP_GELU_QUICK), 0, nullptr, nullptr}, + {false, static_cast(GGML_UNARY_OP_SILU), 0, nullptr, nullptr}, + {false, static_cast(GGML_UNARY_OP_HARDSWISH), 0, nullptr, nullptr}, + {false, static_cast(GGML_UNARY_OP_HARDSIGMOID), 0, nullptr, nullptr}, + {false, static_cast(GGML_UNARY_OP_EXP), 0, nullptr, nullptr} +}; + +static_assert(ggmlhexagon_k_op_caps[GGML_OP_NONE].supported, "GGML_OP_NONE is not true"); +static_assert(ggmlhexagon_k_op_caps[GGML_OP_ADD].supported, "GGML_OP_ADD is not true"); +static_assert(ggmlhexagon_k_op_caps[GGML_OP_MUL_MAT].supported, "GGML_OP_MUL_MAT is not true"); +static_assert(ggmlhexagon_k_op_caps[GGML_OP_SOFT_MAX].supported, "GGML_OP_SOFT_MAX is not true"); +static_assert(std::size(ggmlhexagon_k_op_caps) == (static_cast(GGML_OP_COUNT) + static_cast(GGML_UNARY_OP_COUNT)), + "pls check ggmlhexagon_k_op_caps and ensure is corresponding to latest ggml.h"); + +static int32_t g_qnntensor_idx = 0; //ensure every QNN tensor name is unique +static int32_t g_qnnopcfg_idx = 0; //ensure every QNN opconfig name is unique + +// ================================================================================================= +// section-2: ggml-hexagon internal troubleshooting and profiler function/class +// ================================================================================================= +static const char * ggmlhexagon_get_hwaccel_approach_name(int hwaccle_approach) { + switch (hwaccle_approach) { + case HWACCEL_QNN: + return "HWACCEL_QNN"; + case HWACCEL_QNN_SINGLEGRAPH: + return "HWACCEL_QNN_SINGLEGRAPH"; + case HWACCEL_CDSP: + return "HWACCEL_CDSP"; + default: + return "unknown hwaccel approach"; + } +} + +static void ggmlhexagon_get_timestring(char * p_currenttime) { +#if defined(__ANDROID__) || defined(__linux__) + time_t n_seconds = 0; + struct tm now_time; + + if (nullptr == p_currenttime) + return; + + time(&n_seconds); + localtime_r(&n_seconds, &now_time); + snprintf(p_currenttime, GGMLHEXAGON_TMPBUF_LEN, "%04d-%02d-%02d,%02d:%02d:%02d", + now_time.tm_year + 1900, now_time.tm_mon + 1, now_time.tm_mday, + now_time.tm_hour, now_time.tm_min, now_time.tm_sec); +#else + //TODO: WoA +#endif +} + +static void ggmlhexagon_log_internal(ggml_log_level level, const char * file, const char * func, int line, const char * format, ...) 
{
+    static std::mutex ggmlhexagon_log_internal_mutex;
+    static char s_ggmlhexagon_log_internal_buf[GGMLHEXAGON_LOGBUF_LEN];
+
+    GGML_UNUSED(file);
+#if !(defined __ANDROID__) && !(defined ANDROID)
+    GGML_UNUSED(level);
+#endif
+    {
+        std::lock_guard lock(ggmlhexagon_log_internal_mutex);
+        va_list args;
+        va_start(args, format);
+        int len_prefix = snprintf(s_ggmlhexagon_log_internal_buf, GGMLHEXAGON_LOGBUF_LEN, "[%s, %d]: ", func, line);
+        int len = vsnprintf(s_ggmlhexagon_log_internal_buf + len_prefix, GGMLHEXAGON_LOGBUF_LEN - len_prefix, format, args);
+        if (len < (GGMLHEXAGON_LOGBUF_LEN - len_prefix)) {
+#if (defined __ANDROID__) || (defined ANDROID)
+            __android_log_print(ANDROID_LOG_INFO, "ggml-hexagon", "%s\n", s_ggmlhexagon_log_internal_buf);
+            if (GGML_LOG_LEVEL_INFO == level) {
+                printf("%s\n", s_ggmlhexagon_log_internal_buf);
+            }
+#else
+            //for Snapdragon based WoA(Windows on ARM) device or Linux
+            printf("%s\n", s_ggmlhexagon_log_internal_buf);
+#endif
+        }
+        va_end(args);
+    }
+}
+
+static void ggmlhexagon_print_tensors_info(const char * func_name, const ggml_backend_hexagon_context * ctx,
+                                           const ggml_tensor * src0, const ggml_tensor * src1, const ggml_tensor * dst) {
+    //skip sanity check of params for performance reasons
+    if (0 == g_hexagon_appcfg.dump_op_info) {
+        if (0 == g_hexagon_appcfg.print_tensors_info)
+            return;
+    }
+
+    if (nullptr != func_name && nullptr != ctx) {
+        GGMLHEXAGON_LOG_DEBUG("call %s in dev %s\n", func_name, ctx->name);
+    }
+    if (nullptr != src0) {
+        GGMLHEXAGON_LOG_DEBUG(
+                "%-6s: type = %i (%s) ne = %5" PRIi64 " x %5" PRIi64 " x %5" PRIi64 " x %5" PRIi64 ", nb = (%5zi, %5zi, %5zi, %5zi)",
+                src0->name,
+                src0->type, ggml_type_name(src0->type), src0->ne[0], src0->ne[1], src0->ne[2], src0->ne[3],
+                src0->nb[0], src0->nb[1], src0->nb[2], src0->nb[3]);
+    }
+    if (nullptr != src1) {
+        GGMLHEXAGON_LOG_DEBUG(
+                "%-6s: type = %i (%s) ne = %5" PRIi64 " x %5" PRIi64 " x %5" PRIi64 " x %5" PRIi64 ", nb = (%5zi, %5zi, %5zi, %5zi)",
+                src1->name,
+                src1->type, ggml_type_name(src1->type), src1->ne[0], src1->ne[1], src1->ne[2], src1->ne[3],
+                src1->nb[0], src1->nb[1], src1->nb[2], src1->nb[3]);
+    }
+    GGMLHEXAGON_LOG_DEBUG("%-6s: type = %i (%s) ne = %5" PRIi64 " x %5" PRIi64 " x %5" PRIi64 " x %5" PRIi64 ", nb = (%5zi, %5zi, %5zi, %5zi)",
+                          dst->name,
+                          dst->type, ggml_type_name(dst->type), dst->ne[0], dst->ne[1], dst->ne[2], dst->ne[3],
+                          dst->nb[0], dst->nb[1], dst->nb[2], dst->nb[3]);
+    GGMLHEXAGON_LOG_DEBUG("\n");
+}
+
+static void ggmlhexagon_dump_op_info(const struct ggml_tensor * tensor) {
+    //skip sanity check of params for performance reasons
+    if (0 == g_hexagon_appcfg.dump_op_info)
+        return;
+
+    const struct ggml_tensor * src0 = tensor->src[0];
+    struct ggml_tensor * src1 = tensor->src[1];
+    struct ggml_tensor * dst = const_cast<struct ggml_tensor *>(tensor);
+    GGMLHEXAGON_LOG_DEBUG("op name:%s, tensor type:%s", ggml_op_name(tensor->op), ggml_type_name(tensor->type));
+    ggmlhexagon_print_tensors_info(nullptr, nullptr, src0, src1, dst);
+}
+
+static void ggmlhexagon_dump_tensor_elements(const ggml_tensor * tensor) {
+    float value = 0;
+    std::ostringstream tmposs;
+    if (tensor->type == GGML_TYPE_F32) {
+        for (int h = 0; h < tensor->ne[3]; h++) {
+            for (int i = 0; i < tensor->ne[2]; i++) {
+                for (int j = 0; j < tensor->ne[1]; j++) {
+                    for (int k = 0; k < tensor->ne[0]; k++) {
+                        value = ((float *) tensor->data)[((h * tensor->ne[2] + i) * tensor->ne[1] + j) * tensor->ne[0] + k];
+                        tmposs << std::setw(8) << std::fixed << std::setprecision(2) << value << " ";
+                    }
+                    if (strlen(tmposs.str().c_str()) <= (GGMLHEXAGON_LOGBUF_LEN - 96)) {
+                        GGMLHEXAGON_LOG_DEBUG("%s\n", tmposs.str().c_str());
+                    }
+                    tmposs.clear();
+                    tmposs.str("");
+                }
+            }
+        }
+    }
+
+    GGMLHEXAGON_LOG_DEBUG("\n");
+}
+
+static void ggmlhexagon_dump_tensor(const ggml_tensor * tensor, const char * name) {
+    GGMLHEXAGON_LOG_DEBUG("dump ggml tensor %s(%s)\n", name, tensor->name);
+    GGMLHEXAGON_LOG_DEBUG("%15s: type = %i (%5s) ne = %5" PRIi64 " x %5" PRIi64 " x %5" PRIi64 " x %5" PRIi64 ", nb = (%5zi, %5zi, %5zi, %5zi)\n",
+                          name,
+                          tensor->type, ggml_type_name(tensor->type),
+                          tensor->ne[0], tensor->ne[1], tensor->ne[2], tensor->ne[3],
+                          tensor->nb[0], tensor->nb[1], tensor->nb[2], tensor->nb[3]);
+    ggmlhexagon_dump_tensor_elements(tensor);
+
+    GGMLHEXAGON_LOG_DEBUG("\n");
+}
+
+//a simple high-cohesion, low-coupling class that collects the necessary profiler data and helps visualize NPU performance
+class hexagon_profiler {
+public:
+    static hexagon_profiler & get_instance() {
+        //thread-safe Meyers singleton, no complex dynamic resource management required
+        static hexagon_profiler instance;
+        return instance;
+    }
+
+public:
+    void profiler_init(int profiler_threshold_duration, int profiler_threshold_counts) {
+        reset();
+        //not the accurate profiler start time yet, because inference hasn't been launched at this moment;
+        //it is corrected when the first frame arrives
+        _profiler_starttime = ggml_time_us();
+
+        _profiler_threshold_duration = profiler_threshold_duration;
+        _profiler_threshold_counts   = profiler_threshold_counts;
+
+        //FIXME:hardcode filename of profiler data
+        std::string filename = std::string(g_hexagon_appcfg.runtime_libpath) + "/";
+        if (HWACCEL_CDSP == g_hexagon_appcfg.hwaccel_approach) {
+            if (g_hexagon_appcfg.thread_counts > 1) {
+                //multi-threading feature enabled on the cDSP side
+                if (0 == g_hexagon_appcfg.enable_rpc_ion_mempool) {
+                    filename = filename + "hexagon_perf_cdsp_mt.dat";
+                } else {
+                    filename = filename + "hexagon_perf_cdsp_ion_mt.dat";
+                }
+            } else {
+                if (0 == g_hexagon_appcfg.enable_rpc_ion_mempool) {
+                    filename = filename + "hexagon_perf_cdsp.dat";
+                } else {
+                    filename = filename + "hexagon_perf_cdsp_ion.dat";
+                }
+            }
+        } else {
+            filename = filename + "hexagon_perf_qnn.dat";
+        }
+        GGMLHEXAGON_LOG_DEBUG("profiler name:%s", filename.c_str());
+        const char * profiler_filename = filename.c_str();
+        _fp_profile_file = fopen(profiler_filename, "w");
+        if (nullptr == _fp_profile_file) {
+            GGMLHEXAGON_LOG_WARN("can't open profiler file %s, reason:%s", profiler_filename, strerror(errno));
+            reset();
+            return;
+        } else {
+            size_t written_size = 0;
+            char profiler_info[GGMLHEXAGON_TMPBUF_LEN];
+            const char * prefix = "### starting hexagon profiler at ";
+
+            written_size = fwrite(prefix, 1, strlen(prefix), _fp_profile_file);
+            if (written_size != strlen(prefix)) {
+                GGMLHEXAGON_LOG_WARN("write data to file %s failed, reason: %s", profiler_filename, strerror(errno));
+                profiler_deinit();
+                return;
+            }
+
+            memset(profiler_info, 0, GGMLHEXAGON_TMPBUF_LEN);
+            ggmlhexagon_get_timestring(profiler_info);
+            written_size = fwrite(profiler_info, 1, strlen(profiler_info), _fp_profile_file);
+            if (written_size != strlen(profiler_info)) {
+                GGMLHEXAGON_LOG_WARN("write data to file %s failed, reason: %s", profiler_filename, strerror(errno));
+                profiler_deinit();
+                return;
+            }
+            fprintf(_fp_profile_file, "\n\n");
+            fprintf(_fp_profile_file,
+                    "#frame input max total avg elapse frame max total avg\n");
+            fprintf(_fp_profile_file,
+                    "# inference inference inference inference\n");
+            fprintf(_fp_profile_file,
+                    "#index len i-len i-len i-speed time time
time time time\n"); + fprintf(_fp_profile_file, "\n\n"); + } + _enable_profiler = true; + } + + void profiler_deinit() { + if (nullptr != _fp_profile_file) { + fclose(_fp_profile_file); + _fp_profile_file = nullptr; + } + reset(); + } + +/** + * \param inference_time microseconds, inference time for a single GGML op + * \param inference_input_size bytes, total input data size for a single GGML op + * \param inference_output_size bytes, total output data size for a single GGML op + */ + void profiler_update_profilerdata(const char * ggml_opname, int inference_time, int inference_input_size, int inference_output_size) { + if (!_enable_profiler) + return; + + //1.get the accurate profiler starting time in this function when frame index is 0 + //2.update frame index in this function accordingly + profiler_update_frameindex(); + + int64_t elapse_time = ggml_time_us() - profiler_get_starttime(); + profiler_update_elapsetime(elapse_time); + if (elapse_time > (_profiler_threshold_duration * SIZE_IN_MB)) { + //do nothing when elapsed profiler time > profiler_duration in ggml-hexagon.cfg + return; + } + if (profiler_get_frame_index() >= _profiler_threshold_counts) { + //do nothing when frame_index >= profiler_counts in ggml-hexagon.cfg + return; + } + + if (inference_input_size > profiler_get_max_inputsize()) { + profiler_set_max_inputsize(inference_input_size); + } + + if (inference_output_size > profiler_get_max_outputsize()) { + profiler_set_max_outputsize(inference_output_size); + } + + if (inference_time > profiler_get_max_inferencetime()) { + profiler_set_max_inferencetime(inference_time); + } + + profiler_update_total_inputsize(inference_input_size); + profiler_update_total_outputsize(inference_output_size); + profiler_update_total_inferencetime(inference_time); + profiler_update_elapsetime(elapse_time); + + if (HWACCEL_CDSP == g_hexagon_appcfg.hwaccel_approach) { + if (10 > _frame_index) { + //FIXME:why some initial profiler data in llama-cli looks unusual + //return; + } + } + + if (0 == elapse_time) { + //filter invalid profiler data + return; + } + + if (NULL != _fp_profile_file) { + fprintf(_fp_profile_file, "%-8d %-6d %-6d %-10ld %-11ld %-10ld %-12d %-9d %-11ld %-3ld\n", + profiler_get_frame_index(), + inference_input_size, + profiler_get_max_inputsize(), + profiler_get_total_inputputsize(), + profiler_get_total_inputputsize() / profiler_get_frame_index(), + + elapse_time, + inference_time, + profiler_get_max_inferencetime(), + profiler_get_total_inferencetime(), + profiler_get_total_inferencetime() / profiler_get_frame_index() + ); + } + + //print/compare NPU's I/O performance between 8Gen3 and 8Elite(aka 8Gen4) , removed in the future + char bps_string[GGMLHEXAGON_TMPBUF_LEN]; + memset(bps_string, 0, GGMLHEXAGON_TMPBUF_LEN); + profiler_get_bpsstring(_total_inputsize + _total_outputsize, elapse_time, bps_string); + GGMLHEXAGON_LOG_VERBOSE("I/O performance:%s", bps_string); + } + + int profiler_get_frame_index() { + return _frame_index; + } + + int profiler_get_threshold_count() { + return _profiler_threshold_counts; + } + +private: + void profiler_set_max_inputsize(int input_size) { + _max_inputsize = input_size; + } + + void profiler_set_max_outputsize(int output_size) { + _max_outputsize = output_size; + } + + void profiler_set_max_inferencetime(int inference_time) { + _max_inferencetime = inference_time; + } + + void profiler_update_frameindex() { + if (0 == _frame_index) { + _profiler_starttime = ggml_time_us(); + } + _frame_index += 1; + } + + void 
profiler_update_elapsetime(int64_t elapse_time_microseconds) {
+        _profiler_elapsetime = elapse_time_microseconds;
+    }
+
+    void profiler_update_total_inferencetime(int inference_time) {
+        _total_inferencetime += inference_time;
+    }
+
+    void profiler_update_total_inputsize(int input_size) {
+        _total_inputsize += input_size;
+    }
+
+    void profiler_update_total_outputsize(int output_size) {
+        _total_outputsize += output_size;
+    }
+
+    int profiler_get_max_inputsize() {
+        return _max_inputsize;
+    }
+
+    int profiler_get_max_outputsize() {
+        return _max_outputsize;
+    }
+
+    int profiler_get_max_inferencetime() {
+        return _max_inferencetime;
+    }
+
+    int64_t profiler_get_total_inferencetime() {
+        return _total_inferencetime;
+    }
+
+    int64_t profiler_get_total_inputputsize() {
+        return _total_inputsize;
+    }
+
+    //might be used to calculate total I/O performance in the future
+    int64_t profiler_get_total_outputsize() {
+        return _total_outputsize;
+    }
+
+    int64_t profiler_get_starttime() {
+        return _profiler_starttime;
+    }
+
+    int64_t profiler_get_elapsedtime() {
+        return _profiler_elapsetime;
+    }
+
+    void profiler_get_bpsstring(int64_t data_size, int64_t elapse_time_microseconds, char * bps_string) {
+        if (nullptr == bps_string) {
+            return;
+        }
+
+        float bps = 0.0f;
+        bps = (data_size * SIZE_IN_MB * 1.0f) / (elapse_time_microseconds * 1.0f);
+        if (bps >= SIZE_IN_MB) {
+            snprintf(bps_string, GGMLHEXAGON_TMPBUF_LEN, "%.2f MiB/s", ((float) bps) / SIZE_IN_MB);
+        } else if (bps >= 1000) {
+            snprintf(bps_string, GGMLHEXAGON_TMPBUF_LEN, "%.1f KiB/s", ((float) bps) / 1000);
+        } else {
+            snprintf(bps_string, GGMLHEXAGON_TMPBUF_LEN, "%.2f B/s", bps);
+        }
+    }
+
+    void reset() {
+        _frame_index = 0;
+
+        _max_inputsize = 0;
+        _max_outputsize = 0;
+        _max_inferencetime = 0;
+
+        _total_inputsize = 0;
+        _total_outputsize = 0;
+        _total_inferencetime = 0;
+
+        _profiler_starttime = 0;
+        _profiler_elapsetime = 0;
+        _fp_profile_file = nullptr;
+        _enable_profiler = false;
+        _profiler_threshold_counts = 100;
+        _profiler_threshold_duration = 5;
+    }
+
+private:
+    hexagon_profiler() {
+        reset();
+    }
+
+    hexagon_profiler(const hexagon_profiler &) = delete;
+
+    hexagon_profiler(const hexagon_profiler &&) = delete;
+
+    hexagon_profiler & operator= (const hexagon_profiler &) = delete;
+
+private:
+    int _frame_index;
+
+    int _max_inputsize;           //bytes
+    int _max_outputsize;          //bytes
+    int _max_inferencetime;       //microseconds
+
+    int64_t _total_inputsize;     //bytes
+    int64_t _total_outputsize;    //bytes
+    int64_t _total_inferencetime; //microseconds
+
+    int64_t _profiler_starttime;  //microseconds
+    int64_t _profiler_elapsetime; //microseconds
+    FILE * _fp_profile_file;
+
+    bool _enable_profiler;
+    int _profiler_threshold_duration; //seconds
+    int _profiler_threshold_counts;
+};
+static hexagon_profiler & g_hexagon_profiler = hexagon_profiler::get_instance();
+
+//a simple perf class to probe NPU performance
+class hexagon_perf {
+public:
+    hexagon_perf(std::string perf_name) : _perf_name(std::move(perf_name)) {}
+    hexagon_perf(std::string perf_name, const char * op_name, int input_size, int output_size)
+            : _perf_name(std::move(perf_name)), _op_name(op_name),
+              _input_size(input_size),
+              _output_size(output_size) {
+    }
+
+    void start() {
+        if (0 == g_hexagon_appcfg.enable_perf)
+            return;
+        _begin_time = ggml_time_us();
+    }
+
+    void info() {
+        if (0 == g_hexagon_appcfg.enable_perf) {
+            return;
+        }
+
+        _end_time = ggml_time_us();
+        _duration = (_end_time - _begin_time);
+        //the following judgement is useful for other developers and AI
experts although: + // it breaks the original logic + // it's not mandatory + // had to expose two public function in hexagon_profiler class + if (g_hexagon_profiler.profiler_get_frame_index() <= g_hexagon_profiler.profiler_get_threshold_count()) { + GGMLHEXAGON_LOG_VERBOSE("inference duration of %s through %s: %lld microseconds", + _perf_name.c_str(), ggmlhexagon_get_hwaccel_approach_name(g_hexagon_appcfg.hwaccel_approach), _duration); + } + + //update profiler data + g_hexagon_profiler.profiler_update_profilerdata(_op_name, _duration, _input_size, _output_size); + } + +private: + hexagon_perf() = delete; + hexagon_perf(const hexagon_perf & ) = delete; + hexagon_perf(const hexagon_perf && ) = delete; + hexagon_perf & operator= (const hexagon_perf & ) = delete; + +private: + int64_t _begin_time = 0LL; + int64_t _end_time = 0LL; + int64_t _duration = 0LL; + std::string _perf_name; + const char * _op_name; + int _input_size = 0; + int _output_size = 0; +}; + +//a simple class to load configurations from ggml-hexagon.cfg +class hexagon_appcfg { +public: + hexagon_appcfg() {} + + void dump(std::function worker) { + if (!_load_success) { + GGMLHEXAGON_LOG_INFO("qnn cfg file %s not loaded", _cfg_filename.c_str()); + return; + } + auto iter = _hexagon_appcfg.begin(); + while (iter != _hexagon_appcfg.end()) { + auto kv_iter = iter->second.begin(); + while (kv_iter != iter->second.end()) { + worker(iter->first, kv_iter->first, kv_iter->second); + ++kv_iter; + } + ++iter; + } + } + + bool load(const std::string & file_name) { + if (file_name == "") { + return false; + } + _cfg_filename = file_name; + std::ifstream in; + std::string line; + in.open(file_name.c_str()); + if (not in.is_open()) { + GGMLHEXAGON_LOG_WARN("can't open file %s", file_name.c_str()); + return false; + } + while (getline(in, line)) { + std::string section, key, value; + if (not parse_line(line, section, key, value)) { + continue; + } + set_section_keyvalue(section, key, value); + } + _load_success = true; + return true; + } + + void get_stringvalue(const std::string & section, const std::string & key, std::string & value, std::string default_value) { + value = default_value; + if (_hexagon_appcfg.find(section) == _hexagon_appcfg.end()) { + return; + } + if (_hexagon_appcfg[section].find(key) == _hexagon_appcfg[section].end()) { + return; + } + value = _hexagon_appcfg[section][key]; + } + + void get_intvalue(const std::string & section, const std::string & key, int & value, int default_value) { + value = default_value; + if (_hexagon_appcfg.find(section) == _hexagon_appcfg.end()) { + return; + } + if (_hexagon_appcfg[section].find(key) == _hexagon_appcfg[section].end()) { + return; + } + value = atol(_hexagon_appcfg[section][key].c_str()); + } + +private: + void ltrim(std::string & str) { + if (str.empty()) return; + size_t len = 0; + const char * temp = str.c_str(); + while (*temp && isblank(*temp)) { + ++len; + ++temp; + } + if (len > 0) str.erase(0, len); + } + + void rtrim(std::string & str) { + if (str.empty()) return; + size_t len = str.length(); + size_t pos = len; + while (pos > 0) { + if (not isblank(str[pos - 1])) { + break; + } + --pos; + } + if (pos != len) str.erase(pos); + } + + void trim(std::string & str) { + ltrim(str); + rtrim(str); + } + + void set_section_keyvalue(std::string & section, std::string & key, std::string & value) { + if (_hexagon_appcfg.find(section) == _hexagon_appcfg.end()) { + std::unordered_map kv_map; + _hexagon_appcfg[section] = kv_map; + } + if (key != "" && value != "") 
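+            //entries with an empty key or an empty value are silently dropped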
+
+    bool parse_line(std::string & line, std::string & section, std::string & key, std::string & value) {
+        static std::string cur_section = "";
+        //strip everything after a '#' or ';' comment marker
+        std::string nodes[2] = {"#", ";"};
+        for (int i = 0; i < 2; ++i) {
+            std::string::size_type pos = line.find(nodes[i]);
+            if (pos != std::string::npos) line.erase(pos);
+        }
+        trim(line);
+        if (line == "") return false;
+        if (line[0] == '[' && line[line.size() - 1] == ']') {
+            section = line.substr(1, line.size() - 2);
+            trim(section);
+            cur_section = section;
+            return false;
+        }
+        if (cur_section == "") return false;
+        bool is_key = true;
+        for (size_t i = 0; i < line.size(); ++i) {
+            if (line[i] == '=') {
+                is_key = false;
+                continue;
+            }
+            if (is_key) {
+                key += line[i];
+            } else {
+                value += line[i];
+            }
+        }
+        section = cur_section;
+        trim(key);
+        trim(value);
+        return true;
+    }
+
+private:
+    hexagon_appcfg(const hexagon_appcfg & ) = delete;
+    hexagon_appcfg(const hexagon_appcfg && ) = delete;
+    hexagon_appcfg & operator= (const hexagon_appcfg & ) = delete;
+
+private:
+    std::unordered_map<std::string, std::unordered_map<std::string, std::string>> _hexagon_appcfg;
+    bool _load_success = false;
+    std::string _cfg_filename;
+};
+
+// =================================================================================================
+//  section-3: helper function for WoA(Windows on ARM)
+// =================================================================================================
+#if !defined(__ANDROID__) && !defined(__linux__)
+#define RTLD_GLOBAL 0x100
+#define RTLD_LOCAL  0x000
+#define RTLD_LAZY   0x000
+#define RTLD_NOW    0x001
+static void * dlopen(const char * filename, int flag);
+static int dlclose(void * handle);
+static void * dlsym(void* handle, const char* name);
+static const char * dlerror(void);
+
+static const char * last_func = nullptr;
+static long last_err;
+static void * dlopen(const char * dll, int flags) {
+    HINSTANCE h = LoadLibraryA(dll);
+    GGML_UNUSED(flags);
+    if (h == NULL) {
+        last_err = GetLastError();
+        last_func = "dlopen";
+    }
+    return h;
+}
+
+static int dlclose(void * h) {
+    if (!FreeLibrary((HINSTANCE)h)) {
+        last_err = GetLastError();
+        last_func = "dlclose";
+        return -1;
+    }
+    return 0;
+}
+
+static void * dlsym(void * h, const char * name) {
+    FARPROC p = GetProcAddress((HINSTANCE)h, name);
+    if (!p) {
+        last_err = GetLastError();
+        last_func = "dlsym";
+    }
+    return (void*)(intptr_t)p;
+}
+
+static const char * dlerror(void) {
+    static char str[512];
+    if (!last_err) return nullptr;
+
+    snprintf(str, 512, "%s error #%ld", last_func, last_err);
+    last_err = 0;
+    last_func = NULL;
+
+    return str;
+}
+#endif
+
+// =================================================================================================
+//  section-4: general helper function
+// =================================================================================================
+static const char * ggmlhexagon_get_socmodel_desc(uint32_t soc_model) {
+    switch (soc_model) {
+        case SM7450:
+            return "SM7450";
+        case SM8350:
+            return "SM8350";
+        case SM8450:
+            return "SM8450";
+        case SM8475:
+            return "SM8475";
+        case SM8550:
+            return "SM8550";
+        case SM8650:
+            return "SM8650";
+        case SM8750:
+            return "SM8750";
+        default:
+            return "unknown";
+    }
+}
+
+//0x68 -> 68, 0x69 -> 69, 0x73 -> 73, 0x75 -> 75, 0x79 -> 79
+static size_t ggmlhexagon_htparch_hex_to_decimal(size_t htp_arch) {
+    //naive algorithm
+    int a = htp_arch / 16;
+    int b = htp_arch % 16;
+    return a * 10 + b;
+}
+
+static const char * ggmlhexagon_get_htparch_desc(size_t htp_arch) {
+    switch (htp_arch) {
+ case V68: + return "QCOM_HTP_V68"; + case V69: + return "QCOM_HTP_V69"; + case V73: + return "QCOM_HTP_V73"; + case V75: + return "QCOM_HTP_V75"; + case V79: + return "QCOM_HTP_V79"; + default: + return "unknown"; + } +} + +static struct qcom_socinfo * ggmlhexagon_get_socinfo_from_socmodel(uint32_t soc_model) { + size_t items = sizeof(g_qnn_soc_info_table) / sizeof(g_qnn_soc_info_table[0]); + for (size_t idx = 0; idx < items; idx++) { + if (soc_model == g_qnn_soc_info_table[idx].soc_model) { + return &g_qnn_soc_info_table[idx]; + } + } + return nullptr; +} + +static struct qcom_socinfo * ggmlhexagon_get_socinfo_from_socmodel(size_t htp_arch) { + size_t items = sizeof(g_qnn_soc_info_table) / sizeof(g_qnn_soc_info_table[0]); + for (size_t idx = 0; idx < items; idx++) { + if (htp_arch == g_qnn_soc_info_table[idx].htp_arch) { + return &g_qnn_soc_info_table[idx]; + } + } + return nullptr; +} + +static inline uint32_t ggmlqnn_get_tensor_data_size(const ggml_tensor * tensor) { + /* + size_t data_size = ggml_row_size(tensor->type, tensor->ne[0]); + size_t n_dims = ggml_get_tensor_rank(tensor); + for (int i = 1; i < n_dims; i++) { + data_size *= tensor->ne[i]; + } + + return data_size; + */ + return ggml_nbytes(tensor); +} + +static inline bool ggmlqnn_is_valid_params(ggml_backend_hexagon_context * ctx, const ggml_tensor * src0, + const ggml_tensor * src1, ggml_tensor * dst) { + if ((nullptr == ctx) || (nullptr == src0) || (nullptr == dst)) { + GGMLHEXAGON_LOG_WARN("invalid params\n"); + return false; + } + + qnn_instance * instance = ctx->instance; + if (nullptr == instance) { + GGMLHEXAGON_LOG_WARN("invalid params\n"); + return false; + } + + return true; +} + +static size_t ggmlhexagon_get_system_total_memory_in_bytes() { +#if defined(__ANDROID__) || defined(__linux__) + struct sysinfo info = {}; + if (0 == sysinfo(&info)) { + return (info.totalram + info.totalswap) * info.mem_unit; + } + size_t pages = (size_t)sysconf(_SC_PHYS_PAGES); + size_t page_size = (size_t)sysconf(_SC_PAGE_SIZE); + + return pages * page_size; +#else + //TODO: Snapdragon based WoA(Windows on ARM) + MEMORYSTATUSEX statex; + statex.dwLength = sizeof(statex); + if (GlobalMemoryStatusEx(&statex)) { + GGMLHEXAGON_LOG_INFO("total physical mem:%llu Mb", statex.ullTotalPhys >> 20); + GGMLHEXAGON_LOG_INFO("avail physical mem:%llu Mb", statex.ullAvailPhys >> 20); + return statex.ullTotalPhys; + } + return 0; +#endif +} + +static size_t ggmlhexagon_get_system_free_memory_in_bytes() { +#if defined(__ANDROID__) || defined(__linux__) + struct sysinfo info = {}; + if (0 == sysinfo(&info)) { + return (info.freeram + info.freeswap) * info.mem_unit; + } + size_t avail_pages = (size_t)sysconf(_SC_AVPHYS_PAGES); + size_t page_size = (size_t)sysconf(_SC_PAGE_SIZE); + + return avail_pages * page_size; +#else + //TODO: Snapdragon based WoA(Windows on ARM) + MEMORYSTATUSEX statex; + statex.dwLength = sizeof(statex); + if (GlobalMemoryStatusEx(&statex)) { + GGMLHEXAGON_LOG_INFO("total physical mem:%llu Mb", statex.ullTotalPhys >> 20); + GGMLHEXAGON_LOG_INFO("avail physical mem:%llu Mb", statex.ullAvailPhys >> 20); + return statex.ullAvailPhys; + } + return 0; +#endif +} + +static bool ggmlhexagon_same_types(const ggml_backend_hexagon_context * ctx, const ggml_tensor * op_tensor) { + GGML_UNUSED(ctx); + ggml_tensor * src0 = op_tensor->src[0]; + ggml_tensor * src1 = op_tensor->src[1]; + if (nullptr != src1) { + if (src0->type != op_tensor->type || src1->type != op_tensor->type) { + return false; + } + } else { + if (src0->type != op_tensor->type) 
{
+            return false;
+        }
+    }
+
+    if (src0->type != GGML_TYPE_F32)
+        return false;
+
+    return true;
+}
+
+static const char * ggmlhexagon_get_ggml_type_name(ggml_type type) {
+    const auto * traits = ggml_get_type_traits(type);
+    return traits->type_name;
+}
+
+static void ggmlhexagon_append_tensor_dimensions(const ggml_tensor * tensor, std::string & output) {
+    char buffer[GGMLHEXAGON_TMPBUF_LEN] = {};
+    const char * type_name = ggmlhexagon_get_ggml_type_name(tensor->type);
+    int len = 0;
+    switch (ggml_n_dims(tensor)) {
+        case 1:
+            len = snprintf(buffer, sizeof(buffer), "%ldx1%s", (long)tensor->ne[0], type_name);
+            break;
+        case 2:
+            len = snprintf(buffer, sizeof(buffer), "%ldx%ld%s", (long)tensor->ne[0], (long)tensor->ne[1], type_name);
+            break;
+        case 3:
+            len = snprintf(buffer, sizeof(buffer), "%ldx%ldx%ld%s", (long)tensor->ne[0], (long)tensor->ne[1],
+                           (long)tensor->ne[2], type_name);
+            break;
+        case 4:
+        default:
+            len = snprintf(buffer, sizeof(buffer), "%ldx%ldx%ldx%ld%s", (long)tensor->ne[0], (long)tensor->ne[1],
+                           (long)tensor->ne[2], (long)tensor->ne[3], type_name);
+            break;
+    }
+    GGML_ASSERT(len > 0 && len < (int)sizeof(buffer));
+    output.append(buffer, len);
+}
+
+static size_t ggmlhexagon_get_op_index(const ggml_tensor * tensor) {
+    if (tensor->op == GGML_OP_UNARY) {
+        return static_cast<size_t>(GGML_OP_COUNT) + static_cast<size_t>(ggml_get_unary_op(tensor));
+    }
+
+    return tensor->op;
+}
+
+static size_t ggmlhexagon_get_op_input_param_count(const ggml_tensor * op) {
+    auto op_index = ggmlhexagon_get_op_index(op);
+    GGML_ASSERT(op_index < std::size(ggmlhexagon_k_op_caps));
+    return ggmlhexagon_k_op_caps[op_index].input_param_count;
+}
+
+static void ggmlhexagon_get_opkey_from_op(const ggml_tensor * op, std::string & output) {
+    GGML_ASSERT(op->op != GGML_OP_NONE);
+    output += ggml_op_desc(op);
+    output += ggmlhexagon_get_ggml_type_name(op->type);
+    size_t param_count = ggmlhexagon_get_op_input_param_count(op);
+    for (size_t i = 0; i < param_count; ++i) {
+        auto * input = op->src[i];
+        if (!input) {
+            break;
+        }
+        output += '_';
+        ggmlhexagon_append_tensor_dimensions(input, output);
+    }
+}
+
+static void * ggmlhexagon_type_trait(ggml_backend_hexagon_context * ctx, ggml_tensor * op) {
+    const ggml_tensor * src0 = op->src[0];
+    const ggml_tensor * src1 = op->src[1];
+    ggml_tensor * dst = op;
+    const enum ggml_type src0_type = src0->type;
+
+    GGML_TENSOR_BINARY_OP_LOCALS
+    GGML_ASSERT(ne0 == ne01);
+    GGML_ASSERT(ne1 == ne11);
+    GGML_ASSERT(ne2 == ne12);
+    GGML_ASSERT(ne3 == ne13);
+    GGML_ASSERT(nb00 == ggml_type_size(src0_type));
+    GGML_ASSERT(nb10 == ggml_type_size(src1->type));
+
+    const int64_t ne_plane = ne01 * ne00;
+    const size_t desired_size = ((GGML_TYPE_F32 == src0_type) ?
0 : ne03 * ne02 * ne_plane * sizeof(float)); + ctx->desired_size = desired_size; + if (ctx->work_size < desired_size) { + ctx->work_data.reset(new char[desired_size]); + ctx->work_size = desired_size; + } + ctx->n_threads = std::thread::hardware_concurrency(); + void * wdata = ctx->work_data.get(); + // convert src0 to float + if (src0_type != GGML_TYPE_F32) { + const auto * type_traits = ggml_get_type_traits(src0_type); + ggml_to_float_t const to_float = type_traits->to_float; + + for (int64_t i03 = 0; i03 < ne03; i03++) { + for (int64_t i02 = 0; i02 < ne02; i02++) { + const void * x = (char *)src0->data + i02 * nb02 + i03 * nb03; + float * const wplane = (float *)wdata + i02 * ne_plane + i03 * ne02 * ne_plane; + + const int min_cols_per_thread = 4096; + const int min_rows_per_thread = std::max((int)(min_cols_per_thread / ne00), 1); + const int n_threads = std::max( + std::min(ctx->n_threads, (int)(ne01 / min_rows_per_thread)), 1); + for (int i = 1; i < n_threads; i++) { + const int64_t start = i * ne01 / n_threads; + const int64_t end = (i + 1) * ne01 / n_threads; + if (start < end) { + ctx->tasks.push_back(std::async(std::launch::async, [=]() { + for (int64_t i01 = start; i01 < end; i01++) { + to_float((const char *)x + i01 * nb01, wplane + i01 * ne00, ne00); + } + })); + } + } + { + // reuse the current thread for the first task + const int64_t start = 0; + const int64_t end = ne01 / n_threads; + for (int64_t i01 = start; i01 < end; i01++) { + to_float((const char *) x + i01 * nb01, wplane + i01 * ne00, ne00); + } + } + } + } + + // wait for all tasks to finish + for (auto &task: ctx->tasks) { + task.get(); + } + ctx->tasks.clear(); + } + return wdata; +} + +static void ggmlhexagon_set_runtime_path(size_t device, const std::string & path) { +#if defined(__ANDROID__) + if ((HEXAGON_BACKEND_QNNNPU == device) || (HWACCEL_CDSP == g_hexagon_appcfg.hwaccel_approach)) { + std::string lib_runtime_path = path + ":/vendor/dsp/cdsp:/vendor/lib64:/vendor/dsp/dsp:/vendor/dsp/images"; + if (0 == setenv("LD_LIBRARY_PATH", lib_runtime_path.c_str(), 1)) { + GGMLHEXAGON_LOG_DEBUG("setenv LD_LIBRARY_PATH %s successfully", lib_runtime_path.c_str()); + } else { + GGMLHEXAGON_LOG_ERROR("setenv LD_LIBRARY_PATH %s failure", lib_runtime_path.c_str()); + } + + std::string adsp_runtime_path = path + ";/vendor/dsp/cdsp;/vendor/lib/rfsa/adsp;/system/lib/rfsa/adsp;/vendor/dsp/dsp;/vendor/dsp/images;/dsp"; + if (0 == setenv("ADSP_LIBRARY_PATH", adsp_runtime_path.c_str(), 1)) { + GGMLHEXAGON_LOG_DEBUG("setenv ADSP_LIBRARY_PATH %s successfully", adsp_runtime_path.c_str()); + } else { + GGMLHEXAGON_LOG_ERROR("setenv ADSP_LIBRARY_PATH %s failure", adsp_runtime_path.c_str()); + } + } else { + if (0 == setenv("LD_LIBRARY_PATH", + (path + + ":/vendor/dsp/cdsp:/vendor/lib64:/vendor/dsp/dsp:/vendor/dsp/images").c_str(), + 1)) { + GGMLHEXAGON_LOG_DEBUG("%s backend setenv successfully\n", + ggml_backend_hexagon_get_devname(device)); + } else { + GGMLHEXAGON_LOG_ERROR("%s backend setenv failure\n", + ggml_backend_hexagon_get_devname(device)); + } + } +#endif +} + +static void ggmlhexagon_load_cfg() { + //this function can be called in various scenarios + static bool initialized = false; + if (initialized) { + GGMLHEXAGON_LOG_DEBUG("hexagon appcfg file already loaded\n"); + return; + } + char time_string[GGMLHEXAGON_TMPBUF_LEN]; + memset(time_string, 0, GGMLHEXAGON_TMPBUF_LEN); + ggmlhexagon_get_timestring(time_string); + GGMLHEXAGON_LOG_DEBUG("program running start time:%s", time_string); + std::string cfg_filename = 
std::string(g_hexagon_appcfg.runtime_libpath) + std::string(g_hexagon_appcfg.cfgfilename); + GGMLHEXAGON_LOG_INFO("load hexagon appcfg from %s", cfg_filename.c_str()); + hexagon_appcfg hexagoncfg_instance; + hexagoncfg_instance.load(cfg_filename); + hexagoncfg_instance.dump([](const std::string & section, const std::string & key, const std::string value) { + std::ostringstream tmposs; + tmposs << "section[" << std::setw(10) << std::left << section << "],[" << std::setw(25) << std::left << key << "] = [" << value << "]"; + GGMLHEXAGON_LOG_INFO("%s", tmposs.str().c_str()); + }); + std::string precision_mode; + std::string ggml_hexagon_version; + hexagoncfg_instance.get_stringvalue("general", "ggml_hexagon_version", ggml_hexagon_version, "1.03"); + hexagoncfg_instance.get_intvalue("general", "enable_perf", g_hexagon_appcfg.enable_perf, 1); + hexagoncfg_instance.get_intvalue("general", "print_tensors_info", g_hexagon_appcfg.print_tensors_info, 0); + hexagoncfg_instance.get_intvalue("general", "dump_op_info", g_hexagon_appcfg.dump_op_info, 0); + hexagoncfg_instance.get_intvalue("general", "hwaccel_approach", g_hexagon_appcfg.hwaccel_approach, HWACCEL_CDSP); + hexagoncfg_instance.get_intvalue("general", "hexagon_backend", g_hexagon_appcfg.hexagon_backend, HEXAGON_BACKEND_CDSP); + hexagoncfg_instance.get_intvalue("general", "enable_q_mulmat", g_hexagon_appcfg.enable_q_mulmat, 0); + hexagoncfg_instance.get_intvalue("general", "enable_profiler", g_hexagon_appcfg.enable_profiler, 0); + hexagoncfg_instance.get_intvalue("general", "profiler_duration", g_hexagon_appcfg.profiler_duration, 5); + hexagoncfg_instance.get_intvalue("general", "profiler_counts", g_hexagon_appcfg.profiler_counts, 100); + + hexagoncfg_instance.get_intvalue("qnn", "hvx_threads", g_hexagon_appcfg.hvx_threads, 4); + hexagoncfg_instance.get_intvalue("qnn", "vtcm_size_in_mb", g_hexagon_appcfg.vtcm_size_in_mb, 8); + hexagoncfg_instance.get_intvalue("qnn", "enable_dlbc", g_hexagon_appcfg.enable_dlbc, 1); + hexagoncfg_instance.get_stringvalue("qnn", "precision_mode", precision_mode, "fp32"); + hexagoncfg_instance.get_intvalue("qnn", "print_qnn_internal_log", g_hexagon_appcfg.print_qnn_internal_log, 0); + + hexagoncfg_instance.get_intvalue("cdsp", "enable_rpc_ion_mempool", g_hexagon_appcfg.enable_rpc_ion_mempool, 0); + hexagoncfg_instance.get_intvalue("cdsp", "enable_all_q_mulmat", g_hexagon_appcfg.enable_all_q_mulmat, 0); + hexagoncfg_instance.get_intvalue("cdsp", "thread_counts", g_hexagon_appcfg.thread_counts, 4); + + GGMLHEXAGON_LOG_INFO("internal ggml_hexagon_version=%s", g_hexagon_appcfg.ggml_hexagon_version); + GGMLHEXAGON_LOG_INFO("internal ggml_dsp_version=%s", g_hexagon_appcfg.ggml_dsp_version); + GGMLHEXAGON_LOG_INFO("external ggml_hexagon_version=%s", ggml_hexagon_version.c_str()); + GGMLHEXAGON_LOG_INFO("hwaccel_approach=%d(%s)", g_hexagon_appcfg.hwaccel_approach, + ggmlhexagon_get_hwaccel_approach_name(g_hexagon_appcfg.hwaccel_approach)); + GGMLHEXAGON_LOG_INFO("hexagon_backend=%d(%s)", g_hexagon_appcfg.hexagon_backend, + ggml_backend_hexagon_get_devname(g_hexagon_appcfg.hexagon_backend)); + GGMLHEXAGON_LOG_INFO("runtime libpath=%s", g_hexagon_appcfg.runtime_libpath); + GGMLHEXAGON_LOG_INFO("enable_perf=%d", g_hexagon_appcfg.enable_perf); + GGMLHEXAGON_LOG_INFO("enable_profiler=%d", g_hexagon_appcfg.enable_profiler); + + if (precision_mode.find("fp16") != std::string::npos) { + g_hexagon_appcfg.precision_mode = 1; + } else { + g_hexagon_appcfg.precision_mode = 0; + } + + ggmlhexagon_set_runtime_path(HEXAGON_BACKEND_CDSP, 
g_hexagon_appcfg.runtime_libpath);
+
+    if (1 == g_hexagon_appcfg.enable_profiler) {
+        //make sure this function is called only once
+        g_hexagon_profiler.profiler_init(g_hexagon_appcfg.profiler_duration, g_hexagon_appcfg.profiler_counts);
+    }
+
+    initialized = true;
+}
+
+static bool ggmlhexagon_check_valid_appcfg() {
+    bool is_valid_appcfg = true;
+
+    GGMLHEXAGON_LOG_DEBUG("user-specified hwaccel approach=%d(%s)", g_hexagon_appcfg.hwaccel_approach,
+                          ggmlhexagon_get_hwaccel_approach_name(g_hexagon_appcfg.hwaccel_approach));
+    GGMLHEXAGON_LOG_DEBUG("user-specified hexagon_backend=%d", g_hexagon_appcfg.hexagon_backend);
+    if (g_hexagon_appcfg.hexagon_backend >= GGML_HEXAGON_MAX_DEVICES) {
+        GGMLHEXAGON_LOG_INFO("using default ggml backend");
+        is_valid_appcfg = false;
+    }
+
+    if (HWACCEL_QNN_SINGLEGRAPH == g_hexagon_appcfg.hwaccel_approach) {
+        GGMLHEXAGON_LOG_INFO("HWACCEL_QNN_SINGLEGRAPH not supported");
+        is_valid_appcfg = false;
+    }
+
+    if (HWACCEL_CDSP == g_hexagon_appcfg.hwaccel_approach) {
+        if (HEXAGON_BACKEND_CDSP != g_hexagon_appcfg.hexagon_backend) {
+            GGMLHEXAGON_LOG_INFO("hwaccel_approach HWACCEL_CDSP must be paired with hexagon_backend HEXAGON_BACKEND_CDSP");
+            is_valid_appcfg = false;
+        }
+
+        if (1 == g_hexagon_appcfg.enable_all_q_mulmat) {
+            if (0 == g_hexagon_appcfg.enable_q_mulmat) {
+                GGMLHEXAGON_LOG_INFO("enable_q_mulmat must be set to 1 before enable_all_q_mulmat can be set to 1");
+                is_valid_appcfg = false;
+            }
+        }
+    }
+
+    if (!is_valid_appcfg) {
+        GGMLHEXAGON_LOG_INFO("invalid configuration found in ggml-hexagon.cfg, falling back to the default ggml backend");
+    }
+    return is_valid_appcfg;
+}
+
+static void ggmlhexagon_probe_dspinfo(ggml_backend_hexagon_context * ctx);
+static void ggmlhexagon_print_running_timestamp(ggml_backend_hexagon_context * ctx) {
+    char timestamp[GGMLHEXAGON_TMPBUF_LEN];
+    memset(timestamp, 0, GGMLHEXAGON_TMPBUF_LEN);
+
+    GGMLHEXAGON_LOG_INFO("ggml_hexagon_version: %s", g_hexagon_appcfg.ggml_hexagon_version);
+    GGMLHEXAGON_LOG_INFO("ggml_dsp_version: %s", g_hexagon_appcfg.ggml_dsp_version);
+    GGMLHEXAGON_LOG_INFO("hwaccel approach: %d(%s)", g_hexagon_appcfg.hwaccel_approach,
+                         ggmlhexagon_get_hwaccel_approach_name(g_hexagon_appcfg.hwaccel_approach));
+    GGMLHEXAGON_LOG_INFO("hexagon_backend: %d(%s)", g_hexagon_appcfg.hexagon_backend,
+                         ggml_backend_hexagon_get_devname(g_hexagon_appcfg.hexagon_backend));
+    ggmlhexagon_get_timestring(timestamp);
+    if (HWACCEL_CDSP == g_hexagon_appcfg.hwaccel_approach) {
+        GGMLHEXAGON_LOG_INFO("offload quantized GGML_OP_MUL_MAT: %s", g_hexagon_appcfg.enable_q_mulmat ? "YES" : "NO");
+        GGMLHEXAGON_LOG_INFO("using rpc ion memory pool: %s", g_hexagon_appcfg.enable_rpc_ion_mempool ? "YES" : "NO");
+        GGMLHEXAGON_LOG_INFO("thread_counts with HWACCEL_CDSP: %d", g_hexagon_appcfg.thread_counts);
+        ggmlhexagon_probe_dspinfo(ctx);
+    } else {
+        GGMLHEXAGON_LOG_INFO("thread_counts with HWACCEL_QNN: %d", g_hexagon_appcfg.hvx_threads);
+        GGMLHEXAGON_LOG_INFO("offload quantized GGML_OP_MUL_MAT: %s", g_hexagon_appcfg.enable_q_mulmat ?
"YES" : "NO"); + } + GGMLHEXAGON_LOG_INFO("running timestamp:%s", timestamp); + + if (1 == g_hexagon_appcfg.enable_profiler) { + //make sure this function is called only once + g_hexagon_profiler.profiler_deinit(); + } +} + +// ================================================================================================= +// section-5: QNN helper function/class +// ================================================================================================= +//make sure every QNN tensor/opcfg name is unique, threadsafe is not required at the moment +static void ggmlqnn_reset_idx() { + g_qnntensor_idx = 0; + g_qnnopcfg_idx = 0; +} + +static void ggmlqnn_inc_idx(int idx_type) { + switch (idx_type) { + case QNN_TENSOR_INDEX: + g_qnntensor_idx++; + break; + case QNN_OPCFG_INDEX: + g_qnnopcfg_idx++; + break; + default: + break; + } +} + +static int32_t ggmlqnn_get_idx(int idx_type) { + switch (idx_type) { + case QNN_TENSOR_INDEX: + return g_qnntensor_idx; + case QNN_OPCFG_INDEX: + return g_qnnopcfg_idx; + default: + break; + } + + //it's not make sense, just for fix compiler warning + return g_qnntensor_idx; +} + +static intptr_t ggmlqnn_align_to(size_t alignment, intptr_t offset) { + return offset % alignment == 0 ? offset + : offset + + (static_cast(alignment) - + offset % static_cast(alignment)); +} + +static size_t ggmlqnn_memscpy(void * dst, size_t dst_size, const void * src, size_t copy_size) { + if (!dst || !src || !dst_size || !copy_size) + return 0; + + size_t min_size = dst_size < copy_size ? dst_size : copy_size; + + memcpy(dst, src, min_size); + + return min_size; +} + +static char * ggmlqnn_strndup(const char * source, size_t maxlen) { +#if defined(__ANDROID__) || defined(__linux__) + return strndup(source, maxlen); +#else + //TODO:behaviour is not exactly same to Android&Linux + GGML_UNUSED(maxlen); + return strdup(source); +#endif +} + +static inline uint32_t ggmlqnn_get_tensorid(const Qnn_Tensor_t & tensor) { + if (tensor.version == QNN_TENSOR_VERSION_1) { + return tensor.v1.id; + } + return 0u; +} + +static inline const char * ggmlqnn_get_tensorname(const Qnn_Tensor_t & tensor) { + if (tensor.version == QNN_TENSOR_VERSION_1) { + return tensor.v1.name; + } + return nullptr; +} + +static inline Qnn_TensorType_t ggmlqnn_get_tensortype(const Qnn_Tensor_t & tensor) { + if (tensor.version == QNN_TENSOR_VERSION_1) { + return tensor.v1.type; + } + return QNN_TENSOR_TYPE_UNDEFINED; +} + +static inline Qnn_TensorDataFormat_t ggmlqnn_get_tensor_dataformat(const Qnn_Tensor_t & tensor) { + if (tensor.version == QNN_TENSOR_VERSION_1) { + return tensor.v1.dataFormat; + } + return QNN_TENSOR_DATA_FORMAT_FLAT_BUFFER; +} + +static inline Qnn_DataType_t ggmlqnn_get_tensor_datatype(const Qnn_Tensor_t & tensor) { + if (tensor.version == QNN_TENSOR_VERSION_1) { + return tensor.v1.dataType; + } + return QNN_DATATYPE_UNDEFINED; +} + +static inline Qnn_QuantizeParams_t ggmlqnn_get_tensor_quantparams(const Qnn_Tensor_t & tensor) { + if (tensor.version == QNN_TENSOR_VERSION_1) { + return tensor.v1.quantizeParams; + } + return QNN_QUANTIZE_PARAMS_INIT; +} + +static inline uint32_t ggmlqnn_get_tensor_rank(const Qnn_Tensor_t & tensor) { + if (tensor.version == QNN_TENSOR_VERSION_1) { + return tensor.v1.rank; + } + return 0u; +} + +static inline uint32_t * ggmlqnn_get_tensor_dimensions(const Qnn_Tensor_t & tensor) { + if (tensor.version == QNN_TENSOR_VERSION_1) { + return tensor.v1.dimensions; + } + return nullptr; +} + +static inline Qnn_TensorMemType_t ggmlqnn_get_tensor_memtype(const Qnn_Tensor_t & 
tensor) { + if (tensor.version == QNN_TENSOR_VERSION_1) { + return tensor.v1.memType; + } + return QNN_TENSORMEMTYPE_UNDEFINED; +} + +static inline void ggmlqnn_set_tensor_id(Qnn_Tensor_t & tensor, uint32_t id) { + if (tensor.version == QNN_TENSOR_VERSION_1) { + tensor.v1.id = id; + } +} + +static inline void ggmlqnn_set_tensor_name(Qnn_Tensor_t & tensor, const char * name) { + if (tensor.version == QNN_TENSOR_VERSION_1) { + tensor.v1.name = name; + } +} + +static inline void ggmlqnn_set_tensor_type(Qnn_Tensor_t & tensor, Qnn_TensorType_t type) { + if (tensor.version == QNN_TENSOR_VERSION_1) { + tensor.v1.type = type; + } +} + +static inline void ggmlqnn_set_tensor_dataformat(Qnn_Tensor_t & tensor, Qnn_TensorDataFormat_t format) { + if (tensor.version == QNN_TENSOR_VERSION_1) { + tensor.v1.dataFormat = format; + } +} + +static inline void ggmlqnn_set_tensor_datatype(Qnn_Tensor_t & tensor, Qnn_DataType_t dataType) { + if (tensor.version == QNN_TENSOR_VERSION_1) { + tensor.v1.dataType = dataType; + } +} + +static inline void ggmlqnn_set_tensor_quantparams(Qnn_Tensor_t & tensor, Qnn_QuantizeParams_t params) { + if (tensor.version == QNN_TENSOR_VERSION_1) { + tensor.v1.quantizeParams = params; + } +} + +static inline void ggmlqnn_set_tensor_rank(Qnn_Tensor_t & tensor, uint32_t rank) { + if (tensor.version == QNN_TENSOR_VERSION_1) { + tensor.v1.rank = rank; + } +} + +static inline void ggmlqnn_set_tensor_dimensions(Qnn_Tensor_t & tensor, uint32_t * dims) { + if (tensor.version == QNN_TENSOR_VERSION_1) { + tensor.v1.dimensions = dims; + } +} + +static inline void ggmlqnn_set_tensor_memtype(Qnn_Tensor_t & tensor, Qnn_TensorMemType_t memType) { + if (tensor.version == QNN_TENSOR_VERSION_1) { + tensor.v1.memType = memType; + } +} + +static inline void ggmlqnn_set_tensor_clientbuf(Qnn_Tensor_t & tensor, Qnn_ClientBuffer_t clientBuf) { + if (tensor.version == QNN_TENSOR_VERSION_1) { + tensor.v1.clientBuf = clientBuf; + } +} + +static inline void ggmlqnn_set_tensor_memhandle(Qnn_Tensor_t & tensor, Qnn_MemHandle_t handle) { + if (tensor.version == QNN_TENSOR_VERSION_1) { + tensor.v1.memHandle = handle; + } +} + +static int ggmlqnn_deep_copy_qnntensor(Qnn_Tensor_t & src, Qnn_Tensor_t & dst) { + int err = 0; + + dst.version = src.version; + ggmlqnn_set_tensor_name(dst, ggmlqnn_strndup(ggmlqnn_get_tensorname(src), std::string(ggmlqnn_get_tensorname(src)).size())); + if (nullptr == ggmlqnn_get_tensorname(dst)) { + return 1; + } + ggmlqnn_set_tensor_id(dst, ggmlqnn_get_tensorid(src)); + ggmlqnn_set_tensor_type(dst, ggmlqnn_get_tensortype(src)); + ggmlqnn_set_tensor_dataformat(dst, ggmlqnn_get_tensor_dataformat(src)); + ggmlqnn_set_tensor_datatype(dst, ggmlqnn_get_tensor_datatype(src)); + ggmlqnn_set_tensor_memtype(dst, ggmlqnn_get_tensor_memtype(src)); + + if (ggmlqnn_get_tensor_memtype(src) == QNN_TENSORMEMTYPE_RAW) { + Qnn_ClientBuffer_t client_buf = {nullptr, 0}; + ggmlqnn_set_tensor_clientbuf(dst, client_buf); + } else if (ggmlqnn_get_tensor_memtype(src) == QNN_TENSORMEMTYPE_MEMHANDLE) { + ggmlqnn_set_tensor_memhandle(dst, nullptr); + } else { + return 1; + } + + Qnn_QuantizeParams_t src_qparam = ggmlqnn_get_tensor_quantparams(src); + Qnn_QuantizationEncoding_t encoding = src_qparam.quantizationEncoding; + if (encoding == QNN_QUANTIZATION_ENCODING_AXIS_SCALE_OFFSET) { + Qnn_QuantizeParams_t src_qparam_cpy = src_qparam; + Qnn_AxisScaleOffset_t & axis_scale_offset = src_qparam_cpy.axisScaleOffsetEncoding; + Qnn_ScaleOffset_t ** scale_offset = &axis_scale_offset.scaleOffset; + size_t scale_offset_size = 
axis_scale_offset.numScaleOffsets * sizeof(Qnn_ScaleOffset_t); + *scale_offset = (Qnn_ScaleOffset_t *)malloc(scale_offset_size); + ggmlqnn_memscpy(*scale_offset, + scale_offset_size, + src_qparam.axisScaleOffsetEncoding.scaleOffset, + scale_offset_size); + ggmlqnn_set_tensor_quantparams(dst, src_qparam_cpy); + } else if (encoding == QNN_QUANTIZATION_ENCODING_BW_AXIS_SCALE_OFFSET) { + Qnn_QuantizeParams_t src_qparam_cpy = src_qparam; + Qnn_BwAxisScaleOffset_t & bwaxis_scale_offset = src_qparam_cpy.bwAxisScaleOffsetEncoding; + size_t scale_size = bwaxis_scale_offset.numElements * sizeof(float); + float ** scales = &bwaxis_scale_offset.scales; + int32_t ** offsets = &bwaxis_scale_offset.offsets; + *scales = (float *)malloc(scale_size); + ggmlqnn_memscpy(*scales, scale_size, src_qparam.bwAxisScaleOffsetEncoding.scales, scale_size); + + if (bwaxis_scale_offset.offsets != nullptr) { + size_t offset_size = bwaxis_scale_offset.numElements * sizeof(int32_t); + *offsets = (int32_t *)malloc(offset_size); + ggmlqnn_memscpy(*offsets, offset_size, src_qparam.bwAxisScaleOffsetEncoding.offsets, offset_size); + } + ggmlqnn_set_tensor_quantparams(dst, src_qparam_cpy); + } else { + ggmlqnn_set_tensor_quantparams(dst, src_qparam); + } + + uint32_t rank = ggmlqnn_get_tensor_rank(src); + ggmlqnn_set_tensor_rank(dst, rank); + size_t dim_size = GGML_MAX_DIMS * sizeof(uint32_t); + uint32_t * dimensions = (uint32_t *)malloc(dim_size); + if (nullptr == dimensions) { + GGMLHEXAGON_LOG_WARN("deep_copy_qnn_tensors() allocation error while copying tensor %s\n", ggmlqnn_get_tensorname(src)); + return 1; + } + ggmlqnn_memscpy(dimensions, dim_size, ggmlqnn_get_tensor_dimensions(src), dim_size); + ggmlqnn_set_tensor_dimensions(dst, dimensions); + + return err; +} + +static int ggmlqnn_free_qnntensor(Qnn_Tensor_t * tensor) { + int err = 0; + free((void *) ggmlqnn_get_tensorname(*tensor)); + Qnn_QuantizeParams_t src_qparam = ggmlqnn_get_tensor_quantparams(*tensor); + Qnn_QuantizationEncoding_t encoding = src_qparam.quantizationEncoding; + if (encoding == QNN_QUANTIZATION_ENCODING_AXIS_SCALE_OFFSET) { + free(src_qparam.axisScaleOffsetEncoding.scaleOffset); + } else if (encoding == QNN_QUANTIZATION_ENCODING_BW_AXIS_SCALE_OFFSET) { + free(src_qparam.bwAxisScaleOffsetEncoding.scales); + if (src_qparam.bwAxisScaleOffsetEncoding.offsets != nullptr) { + free(src_qparam.bwAxisScaleOffsetEncoding.offsets); + } + } + free(ggmlqnn_get_tensor_dimensions(*tensor)); + free(tensor); + + return err; +} + +static const char * ggmlqnn_get_qnnerror_string(Qnn_ErrorHandle_t qnn_error_code) { + // file:///opt/qcom/aistack/qairt/2.31.0.250130/docs/QNN/general/api_error_codes.html + switch (qnn_error_code) { + case QNN_SUCCESS: + return "QNN_SUCCESS"; + case QNN_COMMON_ERROR_GENERAL: + return "QNN_COMMON_ERROR_GENERAL"; + + // QnnGraph_Error_t + case QNN_GRAPH_ERROR_UNSUPPORTED_FEATURE: + return "QNN_GRAPH_ERROR_UNSUPPORTED_FEATURE"; + case QNN_GRAPH_ERROR_MEM_ALLOC: + return "QNN_GRAPH_ERROR_MEM_ALLOC"; + case QNN_GRAPH_ERROR_INVALID_ARGUMENT: + return "QNN_GRAPH_ERROR_INVALID_ARGUMENT"; + case QNN_GRAPH_ERROR_INVALID_HANDLE: + return "QNN_GRAPH_ERROR_INVALID_HANDLE"; + case QNN_GRAPH_ERROR_GRAPH_DOES_NOT_EXIST: + return "QNN_GRAPH_ERROR_GRAPH_DOES_NOT_EXIST"; + case QNN_GRAPH_ERROR_INVALID_NAME: + return "QNN_GRAPH_ERROR_INVALID_NAME"; + case QNN_GRAPH_ERROR_INVALID_TENSOR: + return "QNN_GRAPH_ERROR_INVALID_TENSOR"; + case QNN_GRAPH_ERROR_INVALID_OP_CONFIG: + return "QNN_GRAPH_ERROR_INVALID_OP_CONFIG"; + case QNN_GRAPH_ERROR_SET_PROFILE: + return 
"QNN_GRAPH_ERROR_SET_PROFILE"; + case QNN_GRAPH_ERROR_UNCONNECTED_NODE: + return "QNN_GRAPH_ERROR_UNCONNECTED_NODE"; + case QNN_GRAPH_ERROR_CREATE_FAILED: + return "QNN_GRAPH_ERROR_CREATE_FAILED"; + case QNN_GRAPH_ERROR_OPTIMIZATION_FAILED: + return "QNN_GRAPH_ERROR_OPTIMIZATION_FAILED"; + case QNN_GRAPH_ERROR_FINALIZE_FAILED: + return "QNN_GRAPH_ERROR_FINALIZE_FAILED"; + case QNN_GRAPH_ERROR_GRAPH_NOT_FINALIZED: + return "QNN_GRAPH_ERROR_GRAPH_NOT_FINALIZED"; + case QNN_GRAPH_ERROR_GRAPH_FINALIZED: + return "QNN_GRAPH_ERROR_GRAPH_FINALIZED"; + case QNN_GRAPH_ERROR_EXECUTION_ASYNC_FIFO_FULL: + return "QNN_GRAPH_ERROR_EXECUTION_ASYNC_FIFO_FULL"; + case QNN_GRAPH_ERROR_SIGNAL_IN_USE: + return "QNN_GRAPH_ERROR_SIGNAL_IN_USE"; + case QNN_GRAPH_ERROR_ABORTED: + return "QNN_GRAPH_ERROR_ABORTED"; + case QNN_GRAPH_ERROR_PROFILE_IN_USE: + return "QNN_GRAPH_ERROR_PROFILE_IN_USE"; + case QNN_GRAPH_ERROR_TIMED_OUT: + return "QNN_GRAPH_ERROR_TIMED_OUT"; + case QNN_GRAPH_ERROR_SUBGRAPH: + return "QNN_GRAPH_ERROR_SUBGRAPH"; + case QNN_GRAPH_ERROR_DISABLED: + return "QNN_GRAPH_ERROR_DISABLED"; + case QNN_GRAPH_ERROR_DYNAMIC_TENSOR_SHAPE: + return "QNN_GRAPH_ERROR_DYNAMIC_TENSOR_SHAPE"; + case QNN_GRAPH_ERROR_TENSOR_SPARSITY: + return "QNN_GRAPH_ERROR_TENSOR_SPARSITY"; + case QNN_GRAPH_ERROR_EARLY_TERMINATION: + return "QNN_GRAPH_ERROR_EARLY_TERMINATION"; + case QNN_GRAPH_ERROR_INVALID_CONTEXT: + return "QNN_GRAPH_ERROR_INVALID_CONTEXT"; + + //QQnnTensor_Error_t + //Invalid context/graph handle in creating tensor + case QNN_TENSOR_ERROR_INVALID_HANDLE: + return "QNN_TENSOR_ERROR_INVALID_HANDLE"; + //Tensor with specified credentials not registered with a context/graph + case QNN_TENSOR_ERROR_DOES_NOT_EXIST: + return "QNN_TENSOR_ERROR_DOES_NOT_EXIST"; + // (deprecated) Tensor has already been registered with backend + case QNN_TENSOR_ERROR_ALREADY_EXISTS: + return "QNN_TENSOR_ERROR_ALREADY_EXISTS"; + // Invalid tensor param. 
+ case QNN_TENSOR_ERROR_INVALID_TENSOR_PARAM: + return "QNN_TENSOR_ERROR_INVALID_TENSOR_PARAM"; + // This tensor param is currently unsupported + case QNN_TENSOR_ERROR_UNSUPPORTED_TENSOR_PARAM: + return "QNN_TENSOR_ERROR_UNSUPPORTED_TENSOR_PARAM"; + // Tensor provided for update is invalid + case QNN_TENSOR_ERROR_INCOMPATIBLE_TENSOR_UPDATE: + return "QNN_TENSOR_ERROR_INCOMPATIBLE_TENSOR_UPDATE"; + + // QnnOpPackage_Error_t + case QNN_OP_PACKAGE_ERROR_LIBRARY_ALREADY_INITIALIZED: + return "QNN_OP_PACKAGE_ERROR_LIBRARY_ALREADY_INITIALIZED"; + case QNN_OP_PACKAGE_ERROR_LIBRARY_NOT_INITIALIZED: + return "QNN_OP_PACKAGE_ERROR_LIBRARY_NOT_INITIALIZED"; + case QNN_OP_PACKAGE_ERROR_INVALID_HANDLE: + return "QNN_OP_PACKAGE_ERROR_INVALID_HANDLE"; + case QNN_OP_PACKAGE_ERROR_INVALID_INFRASTRUCTURE: + return "QNN_OP_PACKAGE_ERROR_INVALID_INFRASTRUCTURE"; + case QNN_OP_PACKAGE_ERROR_INVALID_INFO: + return "QNN_OP_PACKAGE_ERROR_INVALID_INFO"; + case QNN_OP_PACKAGE_ERROR_VALIDATION_FAILURE: + return "QNN_OP_PACKAGE_ERROR_VALIDATION_FAILURE"; + case QNN_OP_PACKAGE_ERROR_INVALID_ARGUMENT: + return "QNN_OP_PACKAGE_ERROR_INVALID_ARGUMENT"; + + default: + return "unknown QNN error"; + } +} + +// ref:explanation of k-quants, https://github.com/ggerganov/llama.cpp/pull/1684 +static Qnn_DataType_t ggmlqnn_datatype_from_ggml_datatype(enum ggml_type ggmltype) { + switch (ggmltype) { + case GGML_TYPE_F16: + return QNN_DATATYPE_FLOAT_16; + case GGML_TYPE_F32: + return QNN_DATATYPE_FLOAT_32; + case GGML_TYPE_I8: + return QNN_DATATYPE_INT_8; + case GGML_TYPE_Q8_0: + return QNN_DATATYPE_SFIXED_POINT_8; + case GGML_TYPE_Q4_0: + return QNN_DATATYPE_SFIXED_POINT_4; + default: + break; + } + return QNN_DATATYPE_UNDEFINED; +} + +static void ggmlqnn_get_qnn_dimensions_from_ggml_dimensions(uint32_t * qnn_dimensions, const uint32_t * ggml_dimensions, uint32_t rank) { + if (rank > GGML_MAX_DIMS) { + GGMLHEXAGON_LOG_WARN("invalid params"); + return; + } + if (nullptr == qnn_dimensions || nullptr == ggml_dimensions) { + GGMLHEXAGON_LOG_WARN("invalid params"); + return; + } + for (size_t idx = 0; idx < GGML_MAX_DIMS; idx++) + qnn_dimensions[idx] = ggml_dimensions[idx]; + + if (rank >= 2) { + qnn_dimensions[rank - 1] = ggml_dimensions[rank - 2]; + qnn_dimensions[rank - 2] = ggml_dimensions[rank - 1]; + } +} + +template +Fn ggmlqnn_load_qnn_functionpointers(void * handle, const char * function_name) { + return reinterpret_cast(dlsym(handle, function_name)); +} + +class qnn_interface { +#define DEFINE_SHIM_FUNCTION_INTERFACE(F, pointer_name) \ + template \ + inline auto qnn_##F(Args... args) const { \ + return (_qnn_interface->QNN_INTERFACE_VER_NAME.pointer_name)( \ + std::forward(args)...); \ + } + + +#define DEFINE_SHIM_FUNCTION_SYS_INTERFACE(F, pointer_name) \ + template \ + inline auto qnn_##F(Args... 
args) const { \ + return (_qnn_sys_interface->QNN_SYSTEM_INTERFACE_VER_NAME.pointer_name)( \ + std::forward(args)...); \ + } + + friend class qnn_instance; + +public: + qnn_interface() = default; + + // QnnBackend + DEFINE_SHIM_FUNCTION_INTERFACE(backend_create, backendCreate) + + DEFINE_SHIM_FUNCTION_INTERFACE(backend_free, backendFree) + + DEFINE_SHIM_FUNCTION_INTERFACE(backend_register_op_package, backendRegisterOpPackage) + + DEFINE_SHIM_FUNCTION_INTERFACE(backend_validate_op_config, backendValidateOpConfig) + + DEFINE_SHIM_FUNCTION_INTERFACE(backend_get_api_version, backendGetApiVersion) + + // QnnDevice + DEFINE_SHIM_FUNCTION_INTERFACE(device_create, deviceCreate) + + DEFINE_SHIM_FUNCTION_INTERFACE(device_free, deviceFree) + + DEFINE_SHIM_FUNCTION_INTERFACE(device_get_infrastructure, deviceGetInfrastructure) + + DEFINE_SHIM_FUNCTION_INTERFACE(device_get_platform_info, deviceGetPlatformInfo) + + DEFINE_SHIM_FUNCTION_INTERFACE(device_get_info, deviceGetInfo) + + // QnnContext + DEFINE_SHIM_FUNCTION_INTERFACE(context_create, contextCreate) + + DEFINE_SHIM_FUNCTION_INTERFACE(context_get_binary_size, contextGetBinarySize) + + DEFINE_SHIM_FUNCTION_INTERFACE(context_get_binary, contextGetBinary) + + DEFINE_SHIM_FUNCTION_INTERFACE(context_create_from_binary, contextCreateFromBinary) + + DEFINE_SHIM_FUNCTION_INTERFACE(context_free, contextFree) + + // QnnGraph + DEFINE_SHIM_FUNCTION_INTERFACE(graph_create, graphCreate) + + DEFINE_SHIM_FUNCTION_INTERFACE(graph_add_node, graphAddNode) + + DEFINE_SHIM_FUNCTION_INTERFACE(graph_finalize, graphFinalize) + + DEFINE_SHIM_FUNCTION_INTERFACE(graph_execute, graphExecute) + + DEFINE_SHIM_FUNCTION_INTERFACE(graph_retrieve, graphRetrieve) + + // QnnLog + DEFINE_SHIM_FUNCTION_INTERFACE(log_create, logCreate) + + DEFINE_SHIM_FUNCTION_INTERFACE(log_free, logFree) + + DEFINE_SHIM_FUNCTION_INTERFACE(log_set_log_level, logSetLogLevel) + + // QnnProfile + DEFINE_SHIM_FUNCTION_INTERFACE(profile_create, profileCreate) + + DEFINE_SHIM_FUNCTION_INTERFACE(profile_get_events, profileGetEvents) + + DEFINE_SHIM_FUNCTION_INTERFACE(profile_get_sub_events, profileGetSubEvents) + + DEFINE_SHIM_FUNCTION_INTERFACE(profile_get_event_data, profileGetEventData) + + DEFINE_SHIM_FUNCTION_INTERFACE(profile_free, profileFree) + + // QnnMem + DEFINE_SHIM_FUNCTION_INTERFACE(mem_register, memRegister) + + DEFINE_SHIM_FUNCTION_INTERFACE(mem_de_register, memDeRegister) + + // QnnProperty + DEFINE_SHIM_FUNCTION_INTERFACE(property_has_capability, propertyHasCapability) + + // QnnTensor + DEFINE_SHIM_FUNCTION_INTERFACE(tensor_create_context_tensor, tensorCreateContextTensor) + + DEFINE_SHIM_FUNCTION_INTERFACE(tensor_create_graph_tensor, tensorCreateGraphTensor) + + // QnnSystem + DEFINE_SHIM_FUNCTION_SYS_INTERFACE(system_context_create, systemContextCreate) + + DEFINE_SHIM_FUNCTION_SYS_INTERFACE(system_context_get_binary_info, systemContextGetBinaryInfo) + + DEFINE_SHIM_FUNCTION_SYS_INTERFACE(system_context_free, systemContextFree) + + void set_qnn_interface(const QnnInterface_t * qnn_interface) { + _qnn_interface = qnn_interface; + } + + void set_qnn_system_interface(const QnnSystemInterface_t * qnn_sys_interface) { + _qnn_sys_interface = qnn_sys_interface; + } + + uint32_t get_backend_id() const { + return _qnn_interface->backendId; + } + + bool is_loaded() const { + return ((_qnn_sys_interface != nullptr) && (_qnn_interface != nullptr)); + } + +private: + const QnnInterface_t * _qnn_interface = nullptr; + + const QnnSystemInterface_t * _qnn_sys_interface = nullptr; +}; + +class 
qnn_instance { +public: + using BackendIdType = decltype(QnnInterface_t{}.backendId); + + explicit qnn_instance(const std::string & lib_path, const std::string & backend_name, + const std::string & model_name) : + _lib_path(std::move(lib_path)), + _backend_name(std::move(backend_name)), + _model_name(std::move(model_name)) {} + + ~qnn_instance() { + } + + int qnn_init(const QnnSaver_Config_t ** saver_config); + + int qnn_finalize(); + + const qnn_interface & get_qnn_interface() { + if (!_qnn_interface.is_loaded()) { + GGMLHEXAGON_LOG_WARN("pls check why _qnn_interface is not loaded\n"); + } + return _qnn_interface; + } + + const QNN_INTERFACE_VER_TYPE & get_qnn_raw_interface() { + if (!_qnn_interface.is_loaded()) { + GGMLHEXAGON_LOG_WARN("pls check why _qnn_interface is not loaded\n"); + } + return _qnn_raw_interface; + } + + const QNN_SYSTEM_INTERFACE_VER_TYPE & get_qnn_raw_system_interface() { + if (!_qnn_interface.is_loaded()) { + GGMLHEXAGON_LOG_WARN("pls check why _qnn_interface is not loaded\n"); + } + return _qnn_raw_system_interface; + } + + Qnn_LogHandle_t get_qnn_log_handle() { return _qnn_log_handle; } + + Qnn_ProfileHandle_t get_qnn_profile_handle() { return _qnn_profile_handle; } + + Qnn_DeviceHandle_t get_qnn_device_handle() { return _qnn_device_handle; } + + Qnn_BackendHandle_t get_qnn_backend_handle() { return _qnn_backend_handle; } + + Qnn_ContextHandle_t get_qnn_context_handle() { return _qnn_context_handle; } + + QnnSystemContext_Handle_t get_qnn_system_handle() { return _qnn_system_handle; } + + Qnn_GraphHandle_t get_qnn_graph_handle() { return _qnn_graph_handle; } + + int init_qnn_graph(const char * graph_name, + bool debug, + uint8_t do_node_validation = 1, + const QnnGraph_Config_t ** graph_configs = nullptr + ); + int init_qnn_graph(const std::string & graph_name, HEXAGONBackend device, size_t vtcm_size_in_mb = 8, size_t hvx_threads = 8); + + int finalize_qnn_graph(); + + bool is_valid_graph() const { return _qnn_graph_handle != nullptr; } + + int htp_init_perfinfra(); + + int htp_set_rpc_polling(); + + int htp_set_high_performance_mode(); + + std::string & get_qnn_graph_name() { return _graph_name; } + + bool is_rpcmem_initialized() { + return _rpcmem_initialized; + } + + void set_rpcmem_initialized(bool initialized) { + _rpcmem_initialized = initialized; + } + + size_t get_rpcmem_capacity() { return _rpcmem_capacity; } + size_t get_rpcmem_usage() { return _rpcmem_usage; } + + int32_t rpcmem_to_fd(void * buf); + + int register_rpcmem(void * p_data, Qnn_Tensor_t * p_tensor); + Qnn_MemHandle_t register_rpcmem(void * p_data, const uint32_t rank, uint32_t * dimensions, Qnn_DataType_t data_type); + + void unregister_rpcmem(); + void unregister_rpcmem(Qnn_MemHandle_t mem_handle); + + void * alloc_rpcmem(size_t bytes, size_t alignment); + void * get_rpcmem_from_memhandle(Qnn_MemHandle_t mem_handle); + + void free_rpcmem(void * buf); + void free_rpcmem(); + + bool is_rpcmem_allocated(void * buf); + + bool is_rpcmem_registered(Qnn_MemHandle_t handle) { + return _qnn_mem_set.count(handle) != 0U; + } + + bool enable_qnn_rpc() { + return _enable_qnn_rpc; + } + + HEXAGONBackend get_device_id() { + return _device_id; + } + +private: + int load_system(); + + int unload_system(); + + int load_backend(std::string & lib_path, const QnnSaver_Config_t ** saver_config); + + int unload_backend(); + + void set_qnn_raw_interface(QNN_INTERFACE_VER_TYPE & raw_interface) { + _qnn_raw_interface = raw_interface; + } + + void set_qnn_raw_system_interface(QNN_SYSTEM_INTERFACE_VER_TYPE & 
raw_interface) { + _qnn_raw_system_interface = raw_interface; + } + + void * alloc_rpcmem_internal(size_t bytes, size_t alignment); + + void htp_probe_rpc_meminfo(); + + void htp_print_info(); + + void print_backend_info(); + + void htp_set_memory_grow_size(size_t size = 1ul * 1024 * 1024); + + void htp_enter_performance_mode(); + + void htp_set_n_hvx_threads(size_t n_threads); + +private: + static constexpr const int _required_num_providers = 1; + +private: + std::string _lib_path; + std::string _backend_name; + std::string _model_name; // name of prebuilt QNN model, might be used in the future + BackendIdType _backend_id; + + bool _debug_tensor = false; // flag to indicate if requested graph is to be run in debug mode + bool _do_node_validations = true; // flag to indicate whether all add_node calls need to be validated + QnnLog_Level_t _qnn_log_level = QNN_LOG_LEVEL_DEBUG; + + qnn_profile_level _profile_level = PROFILE_OFF; + + void * _system_lib_handle = nullptr; + void * _loaded_lib_handle = nullptr; + const QnnInterface_t * _loaded_backend = nullptr; + + Qnn_GraphHandle_t _qnn_graph_handle = nullptr; + + Qnn_LogHandle_t _qnn_log_handle = nullptr; + + Qnn_ProfileHandle_t _qnn_profile_handle = nullptr; + + Qnn_DeviceHandle_t _qnn_device_handle = nullptr; + + Qnn_BackendHandle_t _qnn_backend_handle = nullptr; + + Qnn_ContextHandle_t _qnn_context_handle = nullptr; + + QnnSystemContext_Handle_t _qnn_system_handle = nullptr; + + QnnHtpDevice_PerfInfrastructure_t * _qnn_htp_perfinfra = nullptr; + uint32_t _qnn_htp_powerconfig_id = 1; + uint32_t _qnn_htp_device_id = 0; + uint32_t _qnn_htp_core_id = 0; + + uint32_t _qnn_rpc_pollingtime = 9999; // 0-10000 us for high performing + + qnn_interface _qnn_interface; + QNN_INTERFACE_VER_TYPE _qnn_raw_interface; + QNN_SYSTEM_INTERFACE_VER_TYPE _qnn_raw_system_interface; + + std::unordered_map _qnn_mem_set; + std::unordered_map _qnn_rpc_buffer_to_handles; + + std::atomic_bool _rpcmem_initialized{false}; + pfn_rpc_mem_alloc _pfn_rpc_mem_alloc; + pfn_rpc_mem_free _pfn_rpc_mem_free; + pfn_rpc_mem_to_fd _pfn_rpc_mem_to_fd; + pfn_rpc_mem_init _pfn_rpc_mem_init; + pfn_rpc_mem_deinit _pfn_rpc_mem_deinit; + std::unordered_map _rpcmem_store_map; + std::unordered_map _rpcmem_usage_map; + size_t _rpcmem_usage = 0; // mempool usage in bytes + size_t _rpcmem_capacity = 0; // mempool size in bytes + + std::string _graph_name; + HEXAGONBackend _device_id; + void * _rpc_lib_handle = nullptr; + bool _enable_qnn_rpc = false; //TODO:unknown issue with QNN RPC feature + + qnn_instance(const qnn_instance &) = delete; + void operator=(const qnn_instance &) = delete; + + qnn_instance(qnn_instance &&) = delete; + void operator=(qnn_instance &&) = delete; +}; + +void * qnn_instance::alloc_rpcmem_internal(size_t bytes, size_t alignment) { + if (!_rpcmem_initialized) { + GGMLHEXAGON_LOG_WARN("rpc memory not initialized\n"); + return nullptr; + } + + auto allocate_bytes = static_cast(bytes + alignment); + void * buf = _pfn_rpc_mem_alloc(RPCMEM_HEAP_ID_SYSTEM, RPCMEM_DEFAULT_FLAGS, allocate_bytes); + if (nullptr == buf) { + GGMLHEXAGON_LOG_WARN("failed to allocate rpc memory\n"); + return nullptr; + } + + auto aligned_buf = reinterpret_cast(ggmlqnn_align_to(alignment, + reinterpret_cast(buf))); + bool status = _rpcmem_store_map.insert(std::pair(aligned_buf, buf)).second; + if (!status) { + GGMLHEXAGON_LOG_WARN("failed to allocate rpc memory\n"); + _pfn_rpc_mem_free(buf); + } + return aligned_buf; +} + +void * qnn_instance::alloc_rpcmem(size_t bytes, size_t alignment) { + if 
(_rpcmem_usage > (_rpcmem_capacity - (8 * SIZE_IN_MB))) { //reserve 8 MiB in the rpc mempool
+        GGMLHEXAGON_LOG_WARN("rpc mempool capacity: %zu MiB, usage: %zu MiB",
+                             _rpcmem_capacity / SIZE_IN_MB, _rpcmem_usage / SIZE_IN_MB);
+        return nullptr;
+    }
+
+    auto aligned_buf = alloc_rpcmem_internal(bytes, alignment);
+    if (nullptr == aligned_buf)
+        return nullptr;
+    _rpcmem_usage_map.insert(std::pair(aligned_buf, bytes));
+
+    _rpcmem_usage += bytes;
+    return aligned_buf;
+}
+
+void qnn_instance::free_rpcmem(void * buf) {
+    size_t rpcbuffer_size = 0;
+    if (!_rpcmem_initialized) {
+        GGMLHEXAGON_LOG_WARN("rpc memory not initialized\n");
+    } else if (0 == _rpcmem_store_map.count(buf)) {
+        GGMLHEXAGON_LOG_WARN("buffer was not allocated from the rpc mempool\n");
+    } else {
+        GGMLHEXAGON_LOG_DEBUG("free rpc mem %p", _rpcmem_store_map[buf]);
+        for (auto it = _rpcmem_usage_map.begin(); it != _rpcmem_usage_map.end(); it++) {
+            void * rpcbuffer = it->first;
+            if (buf == rpcbuffer) {
+                rpcbuffer_size = it->second;
+                _rpcmem_usage -= rpcbuffer_size;
+            }
+        }
+        if (rpcbuffer_size != 0) {
+            _rpcmem_usage_map.erase(buf);
+        }
+        _pfn_rpc_mem_free(_rpcmem_store_map[buf]);
+        _rpcmem_store_map.erase(buf);
+    }
+}
+
+void qnn_instance::free_rpcmem() {
+    if (_rpcmem_store_map.empty()) {
+        GGMLHEXAGON_LOG_WARN("no rpcmem allocated\n");
+        return;
+    }
+
+    for (auto it = _rpcmem_store_map.begin(); it != _rpcmem_store_map.end(); it++) {
+        void * rpcbuffer = it->second;
+        GGMLHEXAGON_LOG_DEBUG("free rpc buffer %p", rpcbuffer);
+        _pfn_rpc_mem_free(rpcbuffer);
+    }
+    _rpcmem_store_map.clear();
+    _rpcmem_usage_map.clear();
+    _rpcmem_usage = 0;
+}
+
+int32_t qnn_instance::rpcmem_to_fd(void * buf) {
+    int32_t mem_fd = -1;
+    if (!is_rpcmem_initialized()) {
+        GGMLHEXAGON_LOG_WARN("rpc memory not initialized\n");
+    } else {
+        mem_fd = _pfn_rpc_mem_to_fd(buf);
+    }
+
+    return mem_fd;
+}
+
+int qnn_instance::register_rpcmem(void * p_data, Qnn_Tensor_t * p_tensor) {
+    if (nullptr == p_data || (nullptr == p_tensor)) {
+        GGMLHEXAGON_LOG_WARN("invalid param\n");
+        return 1;
+    }
+
+    if (!is_rpcmem_initialized()) {
+        GGMLHEXAGON_LOG_WARN("rpc memory not initialized\n");
+        return 2;
+    }
+
+    if (is_rpcmem_registered((QNN_VER_PTR(*p_tensor)->memHandle))) {
+        GGMLHEXAGON_LOG_WARN("tensor %s is already registered to shared memory\n", (QNN_VER_PTR(*p_tensor)->name));
+        return 3;
+    }
+
+    int32_t mem_fd = rpcmem_to_fd(p_data);
+    if (-1 == mem_fd) {
+        GGMLHEXAGON_LOG_WARN("failed to get file descriptor\n");
+        return 4;
+    }
+    GGMLHEXAGON_LOG_DEBUG("mem_fd %d\n", mem_fd);
+    Qnn_MemDescriptor_t descriptor = {
+            {QNN_VER_PTR(*p_tensor)->rank, QNN_VER_PTR(*p_tensor)->dimensions, nullptr},
+            QNN_VER_PTR(*p_tensor)->dataType,
+            QNN_MEM_TYPE_ION,
+            {{mem_fd}}};
+    Qnn_MemHandle_t handle = nullptr;
+    Qnn_ErrorHandle_t error = _qnn_interface.qnn_mem_register(
+            _qnn_context_handle,
+            &descriptor,
+            /*numDescriptors=*/1,
+            &handle);
+    if (error != QNN_SUCCESS) {
+        GGMLHEXAGON_LOG_WARN("failed to register shared memory, error %d, %s\n", QNN_GET_ERROR_CODE(error), strerror(error));
+        return 5;
+    } else {
+        GGMLHEXAGON_LOG_INFO("tensor %s successfully registered to shared memory\n", (QNN_VER_PTR(*p_tensor)->name));
+    }
+    QNN_VER_PTR(*p_tensor)->memHandle = handle;
+    _qnn_mem_set.insert((std::pair(p_data, handle)));
+
+    return 0;
+}
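+
+//rpcmem flow (sketch): alloc_rpcmem() hands out a host pointer backed by an ION/rpcmem
+//buffer, rpcmem_to_fd() recovers the underlying file descriptor, and qnn_mem_register()
+//maps that fd so tensor data can be shared with the NPU zero-copy via QNN_MEM_TYPE_ION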
+Qnn_MemHandle_t qnn_instance::register_rpcmem(void * p_data, const uint32_t rank, uint32_t * dimensions, Qnn_DataType_t data_type) {
+    if (!p_data) {
+        GGMLHEXAGON_LOG_WARN("invalid param");
+        return nullptr;
+    }
+
+    if (!is_rpcmem_initialized()) {
+        GGMLHEXAGON_LOG_WARN("rpc memory not initialized");
+        return nullptr;
+    }
+
+    if (is_rpcmem_registered(p_data)) {
+        GGMLHEXAGON_LOG_WARN("rpc memory already registered");
+        return _qnn_rpc_buffer_to_handles[p_data];
+    }
+
+    int32_t mem_fd = rpcmem_to_fd(p_data);
+    if (mem_fd == -1) {
+        GGMLHEXAGON_LOG_WARN("failed to get file descriptor");
+        return nullptr;
+    }
+
+    GGMLHEXAGON_LOG_DEBUG("mem_fd %d", mem_fd);
+    Qnn_MemDescriptor_t descriptor = {
+            {rank, dimensions, nullptr},
+            data_type, QNN_MEM_TYPE_ION,
+            {{mem_fd}}
+    };
+    Qnn_MemHandle_t handle = nullptr;
+    Qnn_ErrorHandle_t error = _qnn_interface.qnn_mem_register(_qnn_context_handle, &descriptor, /*numDescriptors=*/1, &handle);
+    if (error != QNN_SUCCESS) {
+        GGMLHEXAGON_LOG_WARN("failed to register shared memory, error %d, %s", QNN_GET_ERROR_CODE(error), strerror(error));
+        return nullptr;
+    }
+
+    _qnn_rpc_buffer_to_handles.insert({p_data, handle});
+    GGMLHEXAGON_LOG_DEBUG("successfully registered shared memory handle: %p", handle);
+    return handle;
+}
+
+void * qnn_instance::get_rpcmem_from_memhandle(Qnn_MemHandle_t mem_handle) {
+    for (auto it = _qnn_mem_set.begin(); it != _qnn_mem_set.end(); it++) {
+        if (it->second == mem_handle) {
+            return it->first;
+        }
+    }
+    GGMLHEXAGON_LOG_WARN("can't find rpcmem from qnn mem handle %p", mem_handle);
+    return nullptr;
+}
+
+void qnn_instance::unregister_rpcmem() {
+    Qnn_ErrorHandle_t error = QNN_SUCCESS;
+
+    if (_qnn_mem_set.empty()) {
+        GGMLHEXAGON_LOG_WARN("no rpcmem registered\n");
+    }
+
+    for (auto it = _qnn_mem_set.begin(); it != _qnn_mem_set.end(); it++) {
+        Qnn_MemHandle_t mem_handle = it->second;
+        error = _qnn_interface.qnn_mem_de_register(&mem_handle, 1);
+        if (error != QNN_SUCCESS) {
+            GGMLHEXAGON_LOG_WARN("failed to unregister shared memory, error %d\n", QNN_GET_ERROR_CODE(error));
+        } else {
+            GGMLHEXAGON_LOG_DEBUG("unregister shared memory ok");
+        }
+    }
+    _qnn_mem_set.clear();
+}
+
+void qnn_instance::unregister_rpcmem(Qnn_MemHandle_t mem_handle) {
+    Qnn_ErrorHandle_t error = _qnn_interface.qnn_mem_de_register(&mem_handle, 1);
+    if (error != QNN_SUCCESS) {
+        GGMLHEXAGON_LOG_WARN("failed to unregister shared memory, error %d", QNN_GET_ERROR_CODE(error));
+    }
+
+    auto it = std::find_if(_qnn_mem_set.begin(), _qnn_mem_set.end(),
+                           [mem_handle](const auto &kv) { return kv.second == mem_handle; });
+    if (it == _qnn_mem_set.end()) {
+        GGMLHEXAGON_LOG_WARN("failed to find shared memory handle: %p", mem_handle);
+        return;
+    }
+
+    _qnn_mem_set.erase(it);
+}
+
+bool qnn_instance::is_rpcmem_allocated(void * buf) {
+    return _rpcmem_store_map.count(buf) != 0U;
+}
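+
+//usage sketch (illustrative, hypothetical caller): register a float32 buffer so a QNN
+//tensor can reference it through a mem handle instead of a client buffer:
+//    uint32_t dims[] = {rows, cols, 1, 1};
+//    void * buf = instance->alloc_rpcmem(nbytes, 32);
+//    Qnn_MemHandle_t h = instance->register_rpcmem(buf, 4, dims, QNN_DATATYPE_FLOAT_32);
+//    ggmlqnn_set_tensor_memhandle(qnn_tensor, h); //qnn_tensor is a Qnn_Tensor_t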
+
+int qnn_instance::load_backend(std::string & lib_path, const QnnSaver_Config_t ** saver_config) {
+    Qnn_ErrorHandle_t error = QNN_SUCCESS;
+    GGMLHEXAGON_LOG_DEBUG("lib_path:%s\n", lib_path.c_str());
+
+    void * lib_handle = dlopen(lib_path.c_str(), RTLD_NOW | RTLD_GLOBAL);
+    if (nullptr == lib_handle) {
+        GGMLHEXAGON_LOG_WARN("can not open QNN library %s, with error: %s", lib_path.c_str(), dlerror());
+        return 1;
+    }
+
+    auto get_providers = ggmlqnn_load_qnn_functionpointers<_pfn_QnnInterface_getProviders *>(
+            lib_handle,
+            "QnnInterface_getProviders");
+    if (nullptr == get_providers) {
+        GGMLHEXAGON_LOG_WARN("can not load symbol QnnInterface_getProviders : %s", dlerror());
+        return 2;
+    }
+
+    std::uint32_t num_providers = 0;
+    const QnnInterface_t ** provider_list = nullptr;
+    error = get_providers(&provider_list, &num_providers);
+    if (error != QNN_SUCCESS) {
+        GGMLHEXAGON_LOG_WARN("failed to get providers, error %d", QNN_GET_ERROR_CODE(error));
+        return 3;
+    }
+    GGMLHEXAGON_LOG_DEBUG("num_providers=%d\n", num_providers);
+    if (num_providers != _required_num_providers) {
+        GGMLHEXAGON_LOG_WARN("num_providers is %d instead of required %d", num_providers, _required_num_providers);
+        return 4;
+    }
+
+    if (nullptr == provider_list) {
+        GGMLHEXAGON_LOG_WARN("failed to get qnn interface providers\n");
+        return 5;
+    }
+    bool found_valid_interface = false;
+    QNN_INTERFACE_VER_TYPE qnn_interface;
+    for (size_t idx = 0; idx < num_providers; idx++) {
+        if (QNN_API_VERSION_MAJOR == provider_list[idx]->apiVersion.coreApiVersion.major &&
+            QNN_API_VERSION_MINOR <= provider_list[idx]->apiVersion.coreApiVersion.minor) {
+            found_valid_interface = true;
+            qnn_interface = provider_list[idx]->QNN_INTERFACE_VER_NAME;
+            break;
+        }
+    }
+
+    if (!found_valid_interface) {
+        GGMLHEXAGON_LOG_WARN("unable to find a valid qnn interface\n");
+        return 6;
+    } else {
+        GGMLHEXAGON_LOG_INFO("found a valid qnn interface\n");
+    }
+    set_qnn_raw_interface(qnn_interface);
+
+    BackendIdType backend_id = provider_list[0]->backendId;
+    _loaded_backend = provider_list[0];
+    _loaded_lib_handle = lib_handle;
+    _backend_id = backend_id;
+
+    auto saver_initialize =
+            ggmlqnn_load_qnn_functionpointers<_pfn_QnnSaver_initialize *>(_loaded_lib_handle, "QnnSaver_initialize");
+    if (nullptr != saver_initialize) {
+        error = saver_initialize(saver_config);
+        if (error != QNN_SUCCESS) {
+            GGMLHEXAGON_LOG_WARN("saver_initialize failed, error %d", QNN_GET_ERROR_CODE(error));
+            return 7;
+        }
+    } else {
+        GGMLHEXAGON_LOG_WARN("saver_initialize is null\n");
+    }
+
+    return 0;
+}
+
+int qnn_instance::unload_backend() {
+    int dlclose_error = 0;
+    dlclose_error = dlclose(_loaded_lib_handle);
+    if (dlclose_error != 0) {
+        GGMLHEXAGON_LOG_WARN("failed to close QNN backend %d, error %s\n", _backend_id, dlerror());
+    }
+
+    return 0;
+}
+
+int qnn_instance::load_system() {
+    Qnn_ErrorHandle_t error = QNN_SUCCESS;
+
+#if !defined(__ANDROID__) && !defined(__linux__)
+    std::string system_lib_path = _lib_path + "QnnSystem.dll";
+#else
+    std::string system_lib_path = _lib_path + "libQnnSystem.so";
+#endif
+    GGMLHEXAGON_LOG_DEBUG("system_lib_path:%s\n", system_lib_path.c_str());
+
+    _system_lib_handle = dlopen(system_lib_path.c_str(), RTLD_NOW | RTLD_LOCAL);
+    if (nullptr == _system_lib_handle) {
+        GGMLHEXAGON_LOG_WARN("can not open QNN library %s, error: %s\n", system_lib_path.c_str(), dlerror());
+        //re-try with the default path of the QNN binary runtime lib
+        _lib_path = std::string(g_hexagon_appcfg.runtime_libpath);
+#if !defined(__ANDROID__) && !defined(__linux__)
+        system_lib_path = _lib_path + "QnnSystem.dll";
+#else
+        system_lib_path = _lib_path + "libQnnSystem.so";
+#endif
+        _system_lib_handle = dlopen(system_lib_path.c_str(), RTLD_NOW | RTLD_LOCAL);
+        if (nullptr == _system_lib_handle) {
+            GGMLHEXAGON_LOG_WARN("can not open QNN library %s, error: %s\n", system_lib_path.c_str(), dlerror());
+            return 1;
+        }
+    }
+
+    auto * get_providers = reinterpret_cast<_pfn_QnnSystemInterface_getProviders *>(dlsym(
+            _system_lib_handle, "QnnSystemInterface_getProviders"));
+    if (nullptr == get_providers) {
+        GGMLHEXAGON_LOG_WARN("can not load QNN symbol QnnSystemInterface_getProviders: %s\n", dlerror());
+        return 2;
+    }
+
+    uint32_t num_providers = 0;
+    const QnnSystemInterface_t ** provider_list = nullptr;
+    error = get_providers(&provider_list,
&num_providers); + if (error != QNN_SUCCESS) { + GGMLHEXAGON_LOG_WARN("failed to get providers, error %d\n", QNN_GET_ERROR_CODE(error)); + return 3; + } + + if (num_providers != _required_num_providers) { + GGMLHEXAGON_LOG_WARN("providers is %d instead of required %d\n", num_providers, _required_num_providers); + return 4; + } + + if (nullptr == provider_list) { + GGMLHEXAGON_LOG_WARN("can not get providers\n"); + return 5; + } + + QNN_SYSTEM_INTERFACE_VER_TYPE qnn_system_interface; + bool found_valid_system_interface = false; + for (size_t idx = 0; idx < num_providers; idx++) { + if (QNN_SYSTEM_API_VERSION_MAJOR == + provider_list[idx]->systemApiVersion.major && + QNN_SYSTEM_API_VERSION_MINOR <= + provider_list[idx]->systemApiVersion.minor) { + found_valid_system_interface = true; + qnn_system_interface = provider_list[idx]->QNN_SYSTEM_INTERFACE_VER_NAME; + break; + } + } + if (!found_valid_system_interface) { + GGMLHEXAGON_LOG_WARN("unable to find a valid qnn system interface\n"); + return 6; + } else { + GGMLHEXAGON_LOG_INFO("find a valid qnn system interface\n"); + } + set_qnn_raw_system_interface(qnn_system_interface); + + _qnn_interface.set_qnn_system_interface(provider_list[0]); + + _qnn_interface.qnn_system_context_create(&_qnn_system_handle); + if (nullptr == _qnn_system_handle) { + GGMLHEXAGON_LOG_WARN("can not create QNN system contenxt\n"); + } else { + GGMLHEXAGON_LOG_INFO("initialize qnn system successfully\n"); + } + + return 0; +} + +int qnn_instance::unload_system() { + int result = 0; + + if (nullptr == _system_lib_handle) { + GGMLHEXAGON_LOG_DEBUG("system lib handle is null\n"); + return 1; + } + + if (nullptr != _qnn_system_handle) { + result = _qnn_interface.qnn_system_context_free(_qnn_system_handle); + if (result != QNN_SUCCESS) { + GGMLHEXAGON_LOG_WARN("failed to free QNN system context\n"); + } + _qnn_system_handle = nullptr; + } + + int dlclose_error = dlclose(_system_lib_handle); + if (dlclose_error != 0) { + GGMLHEXAGON_LOG_WARN("failed to close QnnSystem library, error %s\n", dlerror()); + return 2; + } + + _system_lib_handle = nullptr; + + return result; +} + +static void ggmlqnn_sdk_logcallback(const char * fmt, + QnnLog_Level_t level, + uint64_t timestamp, + va_list argp) { + + if (0 == g_hexagon_appcfg.print_qnn_internal_log) + return; + + static std::mutex log_mutex; + static unsigned char s_ggmlqnn_sdk_logbuf[GGMLHEXAGON_LOGBUF_LEN]; + + const char * log_level_desc = ""; + switch (level) { + case QNN_LOG_LEVEL_ERROR: + log_level_desc = " ERROR "; + break; + case QNN_LOG_LEVEL_WARN: + log_level_desc = "WARNING"; + break; + case QNN_LOG_LEVEL_INFO: + log_level_desc = " INFO "; + break; + case QNN_LOG_LEVEL_DEBUG: + log_level_desc = " DEBUG "; + break; + case QNN_LOG_LEVEL_VERBOSE: + log_level_desc = "VERBOSE"; + break; + case QNN_LOG_LEVEL_MAX: + log_level_desc = "UNKNOWN"; + break; + } + + double ms = (double) timestamp / 1000000.0; + { + std::lock_guard lock(log_mutex); + memset(s_ggmlqnn_sdk_logbuf, 0, GGMLHEXAGON_LOGBUF_LEN); + vsnprintf(reinterpret_cast(s_ggmlqnn_sdk_logbuf), GGMLHEXAGON_LOGBUF_LEN, fmt, argp); + GGMLHEXAGON_LOG_DEBUG("%8.1fms [%-7s] %s\n", ms, log_level_desc, s_ggmlqnn_sdk_logbuf); + } +#if !GGMLHEXAGON_DEBUG + GGML_UNUSED(log_level_desc); + GGML_UNUSED(ms); +#endif +} + +int qnn_instance::qnn_init(const QnnSaver_Config_t ** saver_config) { + GGMLHEXAGON_LOG_DEBUG("enter qni_init\n"); + + _device_id = HEXAGON_BACKEND_GGML; + if (_backend_name.find("QnnCpu") != std::string::npos) { + _device_id = HEXAGON_BACKEND_QNNCPU; + } + if 
(_backend_name.find("QnnGpu") != std::string::npos) { + _device_id = HEXAGON_BACKEND_QNNGPU; + } + if (_backend_name.find("QnnHtp") != std::string::npos) { + _device_id = HEXAGON_BACKEND_QNNNPU; + } + if (HEXAGON_BACKEND_GGML == _device_id) { + GGMLHEXAGON_LOG_INFO("user specified qnn backend is ggml, skip QNN initialize"); + return 0; + } + + if (0 != load_system()) { + GGMLHEXAGON_LOG_WARN("can not load QNN system lib, pls check why?\n"); + return 1; + } else { + GGMLHEXAGON_LOG_DEBUG("load QNN system lib successfully\n"); + } + + std::string backend_lib_path = _lib_path + _backend_name; + + int is_load_ok = load_backend(backend_lib_path, saver_config); + if (0 != is_load_ok) { + GGMLHEXAGON_LOG_WARN("failed to load QNN backend\n"); + return 2; + } + + _qnn_interface.set_qnn_interface(_loaded_backend); +#if 1 + _qnn_interface.qnn_log_create(ggmlqnn_sdk_logcallback, _qnn_log_level, &_qnn_log_handle); +#else + _qnn_raw_interface.logCreate(ggmlqnn_sdk_logcallback, _qnn_log_level, &_qnn_log_handle); +#endif + if (nullptr == _qnn_log_handle) { + GGMLHEXAGON_LOG_WARN("why failed to initialize qnn log\n"); //NPU backend not work on Qualcomm SoC based low-end phone + return 3; + } else { + GGMLHEXAGON_LOG_DEBUG("initialize qnn log successfully\n"); + } + + std::vector temp_backend_config; + _qnn_interface.qnn_backend_create(_qnn_log_handle, + temp_backend_config.empty() ? nullptr : temp_backend_config.data(), + &_qnn_backend_handle); + if (nullptr == _qnn_backend_handle) { + GGMLHEXAGON_LOG_WARN("why failed to initialize qnn backend\n"); + return 4; + } else { + GGMLHEXAGON_LOG_DEBUG("initialize qnn backend successfully\n"); + } + + if (nullptr != _qnn_raw_interface.propertyHasCapability) { + auto qnnstatus = _qnn_raw_interface.propertyHasCapability(QNN_PROPERTY_GROUP_DEVICE); + if (QNN_PROPERTY_NOT_SUPPORTED == qnnstatus) { + GGMLHEXAGON_LOG_WARN("device property is not supported\n"); + } + if (QNN_PROPERTY_ERROR_UNKNOWN_KEY == qnnstatus) { + GGMLHEXAGON_LOG_WARN("device property is not known to backend\n"); + } + } + + Qnn_ErrorHandle_t qnnstatus = QNN_SUCCESS; + if (_device_id == HEXAGON_BACKEND_QNNNPU) { + const QnnDevice_PlatformInfo_t * p_info = nullptr; + qcom_socinfo soc_info = {}; + qnnstatus = _qnn_raw_interface.deviceGetPlatformInfo(nullptr, &p_info); + if (QNN_SUCCESS == qnnstatus) { + GGMLHEXAGON_LOG_INFO("device counts %d\n", p_info->v1.numHwDevices); + QnnDevice_HardwareDeviceInfo_t * infos = p_info->v1.hwDevices; + QnnHtpDevice_OnChipDeviceInfoExtension_t chipinfo = {}; + for (uint32_t i = 0; i < p_info->v1.numHwDevices; i++) { + GGMLHEXAGON_LOG_INFO("deviceID:%d, deviceType:%d, numCores %d\n", (int) infos[i].v1.deviceId, + (int) infos[i].v1.deviceType, (int) infos[i].v1.numCores); + QnnDevice_DeviceInfoExtension_t devinfo = infos[i].v1.deviceInfoExtension; + chipinfo = devinfo->onChipDevice; + size_t htp_arch = (size_t) chipinfo.arch; + GGMLHEXAGON_LOG_INFO("htp_type:%d(%s)\n", devinfo->devType, + (devinfo->devType == QNN_HTP_DEVICE_TYPE_ON_CHIP) ? 
"ON_CHIP" : ""); + soc_info = { chipinfo.socModel, htp_arch, chipinfo.vtcmSize, {} }; + } + _qnn_raw_interface.deviceFreePlatformInfo(nullptr, p_info); + } else { + GGMLHEXAGON_LOG_WARN("failed to get platform info, are we in emulator?\n"); + soc_info = { NONE, UNKNOWN_SM, 0, {} }; + } + + QnnHtpDevice_CustomConfig_t soc_customconfig; + soc_customconfig.option = QNN_HTP_DEVICE_CONFIG_OPTION_SOC; + soc_customconfig.socModel = soc_info.soc_model; + QnnDevice_Config_t soc_devconfig; + soc_devconfig.option = QNN_DEVICE_CONFIG_OPTION_CUSTOM; + soc_devconfig.customConfig = &soc_customconfig; + + /* + QnnHtpDevice_CustomConfig_t arch_customconfig; + arch_customconfig.option = QNN_HTP_DEVICE_CONFIG_OPTION_ARCH; + arch_customconfig.arch.arch = (QnnHtpDevice_Arch_t)soc_info.htp_arch; + arch_customconfig.arch.deviceId = 0; + QnnDevice_Config_t arch_devconfig; + arch_devconfig.option = QNN_DEVICE_CONFIG_OPTION_CUSTOM; + arch_devconfig.customConfig = &arch_customconfig; + */ + const QnnDevice_Config_t * p_deviceconfig[] = { &soc_devconfig, nullptr }; + qnnstatus = _qnn_raw_interface.deviceCreate(_qnn_log_handle, p_deviceconfig, &_qnn_device_handle); + } else { + qnnstatus = _qnn_interface.qnn_device_create(_qnn_log_handle, nullptr, &_qnn_device_handle); + } + if (QNN_SUCCESS != qnnstatus && QNN_DEVICE_ERROR_UNSUPPORTED_FEATURE != qnnstatus) { + GGMLHEXAGON_LOG_WARN("failed to create QNN device\n"); + } else { + GGMLHEXAGON_LOG_INFO("create device successfully\n"); + } + + if (PROFILE_OFF != _profile_level) { + GGMLHEXAGON_LOG_INFO("profiling turned on; level = %d", _profile_level); + if (PROFILE_BASIC == _profile_level) { + GGMLHEXAGON_LOG_INFO("basic profiling requested. creating Qnn Profile object\n"); + if (QNN_PROFILE_NO_ERROR != _qnn_raw_interface.profileCreate( + _qnn_backend_handle, QNN_PROFILE_LEVEL_BASIC, &_qnn_profile_handle)) { + GGMLHEXAGON_LOG_WARN("unable to create profile handle in the backend\n"); + return 5; + } else { + GGMLHEXAGON_LOG_DEBUG("initialize qnn profile successfully\n"); + } + } else if (PROFILE_DETAIL == _profile_level) { + GGMLHEXAGON_LOG_INFO("detailed profiling requested. 
Creating Qnn Profile object\n");
+            if (QNN_PROFILE_NO_ERROR != _qnn_raw_interface.profileCreate(
+                    _qnn_backend_handle, QNN_PROFILE_LEVEL_DETAILED, &_qnn_profile_handle)) {
+                GGMLHEXAGON_LOG_WARN("unable to create profile handle in the backend\n");
+                return 6;
+            } else {
+                GGMLHEXAGON_LOG_DEBUG("initialize qnn profile successfully\n");
+            }
+        }
+    }
+
+#if defined(__ANDROID__) || defined(__linux__)
+    std::filesystem::path full_path(g_hexagon_appcfg.runtime_libpath);
+    full_path /= std::filesystem::path("libcdsprpc.so").filename();
+    _rpc_lib_handle = dlopen(full_path.string().c_str(), RTLD_NOW | RTLD_LOCAL);
+    if (nullptr == _rpc_lib_handle) {
+        GGMLHEXAGON_LOG_WARN("failed to load %s\n", full_path.c_str());
+        _rpc_lib_handle = dlopen("libcdsprpc.so", RTLD_NOW | RTLD_LOCAL);
+    }
+#else
+    _rpc_lib_handle = dlopen("libcdsprpc.dll", RTLD_NOW | RTLD_LOCAL);
+#endif
+    if (nullptr == _rpc_lib_handle) {
+        GGMLHEXAGON_LOG_WARN("failed to load qualcomm's rpc lib, error:%s\n", dlerror());
+        return 7;
+    } else {
+        GGMLHEXAGON_LOG_DEBUG("load rpcmem lib successfully\n");
+        set_rpcmem_initialized(true);
+    }
+    _pfn_rpc_mem_init   = reinterpret_cast<decltype(_pfn_rpc_mem_init)>(dlsym(_rpc_lib_handle, "rpcmem_init"));
+    _pfn_rpc_mem_deinit = reinterpret_cast<decltype(_pfn_rpc_mem_deinit)>(dlsym(_rpc_lib_handle, "rpcmem_deinit"));
+    _pfn_rpc_mem_alloc  = reinterpret_cast<decltype(_pfn_rpc_mem_alloc)>(dlsym(_rpc_lib_handle, "rpcmem_alloc"));
+    _pfn_rpc_mem_free   = reinterpret_cast<decltype(_pfn_rpc_mem_free)>(dlsym(_rpc_lib_handle, "rpcmem_free"));
+    _pfn_rpc_mem_to_fd  = reinterpret_cast<decltype(_pfn_rpc_mem_to_fd)>(dlsym(_rpc_lib_handle, "rpcmem_to_fd"));
+    if (nullptr == _pfn_rpc_mem_alloc || nullptr == _pfn_rpc_mem_free || nullptr == _pfn_rpc_mem_to_fd) {
+        GGMLHEXAGON_LOG_WARN("unable to access symbols in QNN RPC lib, dlerror(): %s", dlerror());
+        dlclose(_rpc_lib_handle);
+        return 8;
+    }
+
+    if (nullptr != _pfn_rpc_mem_init) // make low-end phones with Qualcomm SoCs happy
+        _pfn_rpc_mem_init();
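+
+    // a hedged illustration of how the rpcmem symbols resolved above are used
+    // together elsewhere in this backend: allocate ION memory, hand its file
+    // descriptor to the DSP, and free it afterwards (the heap id and flags are
+    // the stock values from the Hexagon SDK's rpcmem.h, not something this
+    // backend mandates):
+    //
+    //   void * buf = _pfn_rpc_mem_alloc(RPCMEM_HEAP_ID_SYSTEM, RPCMEM_DEFAULT_FLAGS, size);
+    //   int    fd  = _pfn_rpc_mem_to_fd(buf);  // shareable with the cDSP
+    //   ... use buf / fd ...
+    //   _pfn_rpc_mem_free(buf);
+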
+    std::vector<const QnnContext_Config_t *> temp_context_config;
+    _qnn_interface.qnn_context_create(_qnn_backend_handle, _qnn_device_handle,
+                                      temp_context_config.empty() ? nullptr : temp_context_config.data(),
+                                      &_qnn_context_handle);
+    if (nullptr == _qnn_context_handle) {
+        GGMLHEXAGON_LOG_WARN("failed to initialize qnn context, error:%s\n", strerror(errno));
+        return 9;
+    } else {
+        GGMLHEXAGON_LOG_DEBUG("initialize qnn context successfully\n");
+    }
+
+    if (_backend_name.find("Htp") != std::string::npos) {
+        htp_print_info();
+        htp_probe_rpc_meminfo();
+
+        if (0 != htp_init_perfinfra()) {
+            GGMLHEXAGON_LOG_WARN("failed to initialize HTP performance infrastructure");
+        }
+
+        htp_enter_performance_mode();
+        htp_set_memory_grow_size();
+
+        if (enable_qnn_rpc()) {
+            GGMLHEXAGON_LOG_INFO("NPU RPC feature enabled with QNN-NPU backend");
+        } else {
+            GGMLHEXAGON_LOG_INFO("NPU RPC feature disabled with QNN-NPU backend");
+        }
+    }
+
+    print_backend_info();
+
+    GGMLHEXAGON_LOG_DEBUG("leave qnn_init\n");
+
+    return 0;
+}
+
+int qnn_instance::qnn_finalize() {
+    int ret_status = 0;
+    Qnn_ErrorHandle_t error = QNN_SUCCESS;
+
+    GGMLHEXAGON_LOG_INFO("enter %s\n", __func__);
+    ggmlqnn_reset_idx();
+
+    free_rpcmem();
+    unregister_rpcmem();
+
+    if (nullptr != _pfn_rpc_mem_deinit)
+        _pfn_rpc_mem_deinit();
+
+    if (0 != dlclose(_rpc_lib_handle)) {
+        GGMLHEXAGON_LOG_WARN("failed to unload qualcomm's rpc lib, error:%s\n", dlerror());
+    } else {
+        GGMLHEXAGON_LOG_DEBUG("closed rpcmem lib successfully\n");
+    }
+
+    if (nullptr != _qnn_context_handle) {
+        error = _qnn_interface.qnn_context_free(_qnn_context_handle, _qnn_profile_handle);
+        if (error != QNN_SUCCESS) {
+            GGMLHEXAGON_LOG_WARN("failed to free QNN context_handle: ID %u, error %d\n",
+                                 _qnn_interface.get_backend_id(), QNN_GET_ERROR_CODE(error));
+        }
+        _qnn_context_handle = nullptr;
+    }
+
+    if (nullptr != _qnn_profile_handle) {
+        error = _qnn_interface.qnn_profile_free(_qnn_profile_handle);
+        if (error != QNN_SUCCESS) {
+            GGMLHEXAGON_LOG_WARN("failed to free QNN profile_handle: ID %u, error %d\n",
+                                 _qnn_interface.get_backend_id(), QNN_GET_ERROR_CODE(error));
+        }
+        _qnn_profile_handle = nullptr;
+    }
+
+    if (nullptr != _qnn_device_handle) {
+        error = _qnn_interface.qnn_device_free(_qnn_device_handle);
+        if (error != QNN_SUCCESS) {
+            GGMLHEXAGON_LOG_WARN("failed to free QNN device_handle: ID %u, error %d\n",
+                                 _qnn_interface.get_backend_id(), QNN_GET_ERROR_CODE(error));
+        }
+        _qnn_device_handle = nullptr;
+    }
+
+    if (nullptr != _qnn_backend_handle) {
+        error = _qnn_interface.qnn_backend_free(_qnn_backend_handle);
+        if (error != QNN_SUCCESS) {
+            GGMLHEXAGON_LOG_WARN("failed to free QNN backend_handle: ID %u, error %d\n",
+                                 _qnn_interface.get_backend_id(), QNN_GET_ERROR_CODE(error));
+        }
+        _qnn_backend_handle = nullptr;
+    }
+
+    if (nullptr != _qnn_log_handle) {
+        error = _qnn_interface.qnn_log_free(_qnn_log_handle);
+        if (error != QNN_SUCCESS) {
+            GGMLHEXAGON_LOG_WARN("failed to free QNN log_handle: ID %u, error %d\n",
+                                 _qnn_interface.get_backend_id(), QNN_GET_ERROR_CODE(error));
+        }
+        _qnn_log_handle = nullptr;
+    }
+
+    unload_backend();
+    unload_system();
+
+    GGMLHEXAGON_LOG_INFO("leave %s\n", __func__);
+    return ret_status;
+}
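+
+// note: qnn_finalize() above tears handles down in the reverse order of their
+// creation in qnn_init(): context -> profile -> device -> backend -> log,
+// followed by unloading the backend and system libraries.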
+int qnn_instance::init_qnn_graph(const std::string & graph_name, HEXAGONBackend device, size_t vtcm_size_in_mb, size_t hvx_threads) {
+    _graph_name = graph_name;
+    _device_id  = device;
+
+    //GGMLHEXAGON_LOG_DEBUG("[%s][%s]created", ggml_backend_hexagon_get_devname(device), graph_name.c_str());
+
+    Qnn_ErrorHandle_t error = QNN_SUCCESS;
+    if (HEXAGON_BACKEND_QNNNPU == device) {
+        QnnHtpGraph_CustomConfig_t hvx_config;
+        hvx_config.option = QNN_HTP_GRAPH_CONFIG_OPTION_NUM_HVX_THREADS;
+        hvx_config.numHvxThreads = hvx_threads;
+        QnnGraph_Config_t graph_hvx_config;
+        graph_hvx_config.option = QNN_GRAPH_CONFIG_OPTION_CUSTOM;
+        graph_hvx_config.customConfig = &hvx_config;
+
+        QnnHtpGraph_CustomConfig_t dlbc_config = QNN_HTP_GRAPH_CUSTOM_CONFIG_INIT;
+        dlbc_config.option = QNN_HTP_GRAPH_CONFIG_OPTION_OPTIMIZATION;
+        dlbc_config.optimizationOption.type = QNN_HTP_GRAPH_OPTIMIZATION_TYPE_ENABLE_DLBC;
+        if (0 == g_hexagon_appcfg.enable_dlbc)
+            dlbc_config.optimizationOption.floatValue = 0.0; // set to 0.0 to turn off DLBC
+        else
+            dlbc_config.optimizationOption.floatValue = 1.0; // set to 1.0 to turn on DLBC
+        QnnGraph_Config_t graph_dlbc_config;
+        graph_dlbc_config.option = QNN_GRAPH_CONFIG_OPTION_CUSTOM;
+        graph_dlbc_config.customConfig = &dlbc_config;
+
+        QnnHtpGraph_CustomConfig_t opt_config = QNN_HTP_GRAPH_CUSTOM_CONFIG_INIT;
+        opt_config.option = QNN_HTP_GRAPH_CONFIG_OPTION_OPTIMIZATION;
+        opt_config.optimizationOption.type = QNN_HTP_GRAPH_OPTIMIZATION_TYPE_FINALIZE_OPTIMIZATION_FLAG;
+        opt_config.optimizationOption.floatValue = 1; // valid values are 1 (faster finalize, less optimization) and 3 (slower finalize, more optimization)
+        QnnGraph_Config_t graph_opt_config;
+        graph_opt_config.option = QNN_GRAPH_CONFIG_OPTION_CUSTOM;
+        graph_opt_config.customConfig = &opt_config;
+
+        QnnHtpGraph_CustomConfig_t vtcm_config = QNN_HTP_GRAPH_CUSTOM_CONFIG_INIT;
+        vtcm_config.option = QNN_HTP_GRAPH_CONFIG_OPTION_VTCM_SIZE;
+        vtcm_config.vtcmSizeInMB = vtcm_size_in_mb;
+        QnnGraph_Config_t graph_vtcm_config;
+        graph_vtcm_config.option = QNN_GRAPH_CONFIG_OPTION_CUSTOM;
+        graph_vtcm_config.customConfig = &vtcm_config;
+
+        std::vector<const QnnGraph_Config_t *> graph_configs;
+        graph_configs.push_back(&graph_hvx_config);
+        graph_configs.push_back(&graph_dlbc_config);
+        graph_configs.push_back(&graph_vtcm_config);
+        graph_configs.push_back(&graph_opt_config);
+        //fp16_config and graph_fp16_config must outlive the qnn_graph_create call below
+        QnnHtpGraph_CustomConfig_t fp16_config = QNN_HTP_GRAPH_CUSTOM_CONFIG_INIT;
+        QnnGraph_Config_t graph_fp16_config;
+        if (1 == g_hexagon_appcfg.precision_mode) {
+            fp16_config.option = QNN_HTP_GRAPH_CONFIG_OPTION_PRECISION;
+            fp16_config.precision = QNN_PRECISION_FLOAT16;
+            graph_fp16_config.option = QNN_GRAPH_CONFIG_OPTION_CUSTOM;
+            graph_fp16_config.customConfig = &fp16_config;
+            graph_configs.push_back(&graph_fp16_config);
+        }
+        graph_configs.push_back(nullptr);
+        error = _qnn_interface.qnn_graph_create(_qnn_context_handle, graph_name.c_str(), graph_configs.data(), &_qnn_graph_handle);
+        //GGMLHEXAGON_LOG_DEBUG("[%s][%s]created graph %p", ggml_backend_hexagon_get_devname(device), graph_name.c_str(), _qnn_graph_handle);
+    } else {
+        error = _qnn_interface.qnn_graph_create(_qnn_context_handle, graph_name.c_str(), nullptr, &_qnn_graph_handle);
+    }
+    if (QNN_SUCCESS != error) {
+        GGMLHEXAGON_LOG_ERROR("[%s][%s]failed to create qnn graph, error: %s",
+                              ggml_backend_hexagon_get_devname(device), graph_name.c_str(),
+                              ggmlqnn_get_qnnerror_string(error));
+        return error;
+    }
+
+    GGMLHEXAGON_LOG_DEBUG("[%s]create graph %s succeed", ggml_backend_hexagon_get_devname(device), graph_name.c_str());
+    if (HEXAGON_BACKEND_QNNNPU == device) {
+        htp_set_n_hvx_threads(hvx_threads);
+    }
+    return QNN_SUCCESS;
+}
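+
+// hypothetical call site for the overload above (the literal values are
+// illustrative; real callers pass g_hexagon_appcfg.vtcm_size_in_mb and
+// g_hexagon_appcfg.hvx_threads):
+//
+//   instance->init_qnn_graph(graph_name, HEXAGON_BACKEND_QNNNPU, 8, 4);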
+int qnn_instance::init_qnn_graph(const char * graph_name, bool debug, uint8_t do_node_validation,
+                                 const QnnGraph_Config_t ** graph_configs) {
+    Qnn_ErrorHandle_t result = 0;
+
+    if (nullptr == graph_name) {
+        GGMLHEXAGON_LOG_WARN("graph name is null\n");
+        return 1;
+    }
+
+    if (!_graph_name.empty()) {
+        GGMLHEXAGON_LOG_WARN("qnn model for graph %s already initialized\n", graph_name);
+        return 2;
+    }
+
+    if (!do_node_validation) {
+        GGMLHEXAGON_LOG_WARN("node validation disabled, backend will not perform op validation prior to adding node\n");
+    }
+
+    _graph_name = graph_name;
+    _debug_tensor = debug;
+    _do_node_validations = do_node_validation;
+
+    result = _qnn_raw_interface.graphCreate(_qnn_context_handle,
+                                            graph_name,
+                                            graph_configs,
+                                            &_qnn_graph_handle);
+    if (QNN_GRAPH_NO_ERROR != result || nullptr == _qnn_graph_handle) {
+        GGMLHEXAGON_LOG_WARN("failed to create graph in qnn context\n");
+        return 3;
+    } else {
+        GGMLHEXAGON_LOG_DEBUG("created graph %s successfully, handle %p\n", graph_name, _qnn_graph_handle);
+    }
+
+    return 0;
+}
+
+int qnn_instance::finalize_qnn_graph() {
+    if (nullptr != _qnn_graph_handle) {
+        if (_qnn_raw_interface.graphFinalize(_qnn_graph_handle,
+                                             _qnn_profile_handle, nullptr)
+            != QNN_GRAPH_NO_ERROR) {
+            GGMLHEXAGON_LOG_WARN("failed to finalize graph\n");
+            return 1;
+        }
+    } else {
+        GGMLHEXAGON_LOG_DEBUG("qnn graph handle is null\n");
+    }
+
+    return 0;
+}
+
+int qnn_instance::htp_init_perfinfra() {
+    QnnDevice_Infrastructure_t device_infra = nullptr;
+    Qnn_ErrorHandle_t error = _qnn_raw_interface.deviceGetInfrastructure(&device_infra);
+    if (QNN_SUCCESS != error) {
+        GGMLHEXAGON_LOG_WARN("failed to get qnn device infra\n");
+        return 1;
+    }
+
+    QnnHtpDevice_Infrastructure_t * htp_infra = static_cast<QnnHtpDevice_Infrastructure_t *>(device_infra);
+    QnnHtpDevice_PerfInfrastructure_t * htp_perfinfra = &htp_infra->perfInfra;
+    uint32_t power_configid = 1;
+    uint32_t device_id = 0;
+    uint32_t core_id = 0;
+    htp_perfinfra->createPowerConfigId(device_id, core_id, &power_configid);
+    _qnn_htp_perfinfra = htp_perfinfra;
+    _qnn_htp_powerconfig_id = power_configid;
+    //TODO: device_id and core_id are hardcoded to 0, which is correct for current SoCs
+    _qnn_htp_device_id = device_id;
+    _qnn_htp_core_id = core_id;
+
+    return 0;
+}
+
+void qnn_instance::htp_probe_rpc_meminfo() {
+    size_t candidate_size = 0;
+    uint8_t * rpc_buffer = nullptr;
+    size_t probe_slots[] = {1024, 1536, 2048 - 48, 2048};
+    size_t probe_counts = sizeof(probe_slots) / sizeof(size_t);
+    for (size_t idx = 0; idx < probe_counts; idx++) {
+        rpc_buffer = static_cast<uint8_t *>(alloc_rpcmem_internal(probe_slots[idx] * SIZE_IN_MB, 4));
+        if (nullptr == rpc_buffer) {
+            GGMLHEXAGON_LOG_DEBUG("failed to alloc %d MiB of rpcmem while probing rpc memory info, reason: %s\n", probe_slots[idx], strerror(errno));
+            break;
+        } else {
+            candidate_size = probe_slots[idx];
+            free_rpcmem(rpc_buffer);
+            rpc_buffer = nullptr;
+        }
+    }
+    if (candidate_size > _rpcmem_capacity)
+        _rpcmem_capacity = candidate_size * SIZE_IN_MB;
+
+    free_rpcmem();
+    _rpcmem_usage = 0;
+    GGMLHEXAGON_LOG_INFO("capacity of rpc ion memory %d MiB\n", _rpcmem_capacity / SIZE_IN_MB);
+}
+
+void qnn_instance::htp_print_info() {
+    const QnnDevice_PlatformInfo_t * p_info = nullptr;
+    _qnn_raw_interface.deviceGetPlatformInfo(nullptr, &p_info);
+    GGMLHEXAGON_LOG_DEBUG("HTP device counts %d", p_info->v1.numHwDevices);
+    QnnDevice_HardwareDeviceInfo_t * infos = p_info->v1.hwDevices;
+    for (size_t i = 0; i < p_info->v1.numHwDevices; i++) {
+        GGMLHEXAGON_LOG_DEBUG("HTP deviceID:%d, deviceType:%d, numCores %d", infos[i].v1.deviceId,
+                              infos[i].v1.deviceType, infos[i].v1.numCores);
+        QnnDevice_DeviceInfoExtension_t devinfo = infos[i].v1.deviceInfoExtension;
+        QnnHtpDevice_OnChipDeviceInfoExtension_t chipinfo = devinfo->onChipDevice;
+        QnnHtpDevice_Arch_t htp_arch = chipinfo.arch;
+        GGMLHEXAGON_LOG_DEBUG("HTP_TYPE:%d(%s)", devinfo->devType,
+                              (devinfo->devType == QNN_HTP_DEVICE_TYPE_ON_CHIP) ? 
"QNN_HTP_DEVICE_TYPE_ON_CHIP" : "QNN_HTP_DEVICE_TYPE_UNKNOWN"); + GGMLHEXAGON_LOG_DEBUG("qualcomm soc_model:%d(%s), htp_arch:%d(%s), vtcm_size:%d MiB," \ + "dlbc_support:%d, signedpd_support:%d", \ + chipinfo.socModel, ggmlhexagon_get_socmodel_desc(chipinfo.socModel), \ + htp_arch, ggmlhexagon_get_htparch_desc(htp_arch), chipinfo.vtcmSize, \ + chipinfo.dlbcSupport, chipinfo.signedPdSupport); + struct qcom_socinfo * socinfo = ggmlhexagon_get_socinfo_from_socmodel(chipinfo.socModel); + g_hexagon_mgr[HEXAGON_BACKEND_QNNNPU].socinfo = { chipinfo.socModel, htp_arch, chipinfo.vtcmSize, {}}; + if (nullptr != socinfo) { + memcpy(g_hexagon_mgr[HEXAGON_BACKEND_QNNNPU].socinfo.soc_desc, socinfo->soc_desc, sizeof(socinfo->soc_desc)); + GGMLHEXAGON_LOG_DEBUG("soc info:%s", socinfo->soc_desc); + } else { + memcpy(g_hexagon_mgr[HEXAGON_BACKEND_QNNNPU].socinfo.soc_desc, "unknown", 7); + GGMLHEXAGON_LOG_DEBUG("soc info:unknown"); + } + } + _qnn_raw_interface.deviceFreePlatformInfo(nullptr, p_info); +} + +void qnn_instance::print_backend_info() { + auto print_property = [&](const char * name, QnnProperty_Key_t property) { + auto ret = _qnn_raw_interface.propertyHasCapability(property); + + const char * status = "Unknown"; + if (ret == QNN_PROPERTY_SUPPORTED) { + status = "Yes"; + } else if (ret == QNN_PROPERTY_NOT_SUPPORTED) { + status = "No"; + } + + GGMLHEXAGON_LOG_INFO("%s: %s", name, status); + }; + + GGMLHEXAGON_LOG_INFO("QNN backend properties:"); + print_property("Create context from binary list", QNN_PROPERTY_CONTEXT_SUPPORT_CREATE_FROM_BINARY_LIST_ASYNC); + print_property("Dynamic batch", QNN_PROPERTY_GRAPH_SUPPORT_BATCH_MULTIPLE); + print_property("Early termination", QNN_PROPERTY_GRAPH_SUPPORT_EARLY_TERMINATION); + print_property("Dynamic dimensions", QNN_PROPERTY_TENSOR_SUPPORT_DYNAMIC_DIMENSIONS); + print_property("Blockwise quantization", QNN_PROPERTY_TENSOR_SUPPORT_QUANTIZATION_ENCODING_BLOCK); + print_property("Blockwise quantization with expansion", QNN_PROPERTY_TENSOR_SUPPORT_QUANTIZATION_ENCODING_BLOCKWISE_EXPANSION); + print_property("Vector quantization", QNN_PROPERTY_TENSOR_SUPPORT_QUANTIZATION_ENCODING_VECTOR); + print_property("Tensor sparsity", QNN_PROPERTY_TENSOR_SUPPORT_SPARSITY); + print_property("Updateable application tensor", QNN_PROPERTY_TENSOR_SUPPORT_UPDATEABLE_APP_TENSORS); + print_property("Updateable native tensor", QNN_PROPERTY_TENSOR_SUPPORT_UPDATEABLE_NATIVE_TENSORS); + print_property("Updateable static tensor", QNN_PROPERTY_TENSOR_SUPPORT_UPDATEABLE_STATIC_TENSORS); + print_property("Qnn group device", QNN_PROPERTY_GROUP_DEVICE); +} + +void qnn_instance::htp_set_memory_grow_size(size_t size) { + QnnHtpPerfInfrastructure_MemoryConfig_t grow_size_config = { + .option = QNN_HTP_PERF_INFRASTRUCTURE_MEMORY_CONFIGOPTION_GROW_SIZE, + .memGrowSizeConfig = (uint32_t)size, + }; + + const QnnHtpPerfInfrastructure_MemoryConfig_t *memory_config[] = { + &grow_size_config, + nullptr, + }; + Qnn_ErrorHandle_t result = _qnn_htp_perfinfra->setMemoryConfig(_qnn_htp_device_id, _qnn_htp_core_id, memory_config); + if (QNN_SUCCESS != result) { + GGMLHEXAGON_LOG_WARN("failed to set HTP memory config"); + } else { + GGMLHEXAGON_LOG_INFO("succeed to set HTP memory config"); + } +} + +void qnn_instance::htp_set_n_hvx_threads(size_t n_threads) { + QnnHtpGraph_CustomConfig_t htp_hvx_thread_config = { + .option = QNN_HTP_GRAPH_CONFIG_OPTION_NUM_HVX_THREADS, + .numHvxThreads = n_threads, + }; + + QnnGraph_Config_t hvx_thread_config = { + .option = QNN_GRAPH_CONFIG_OPTION_CUSTOM, + .customConfig = 
&htp_hvx_thread_config, + }; + + const QnnGraph_Config_t * graph_configs[] = {&hvx_thread_config, nullptr}; + Qnn_ErrorHandle_t result = _qnn_raw_interface.graphSetConfig(_qnn_graph_handle, graph_configs); + if (QNN_SUCCESS != result) { + GGMLHEXAGON_LOG_WARN("failed to set QNN graph config: set hvx threads %d", n_threads); + } else { + //GGMLHEXAGON_LOG_DEBUG("succeed to set QNN graph config: set hvx threads %d", n_threads); + } +} + +void qnn_instance::htp_enter_performance_mode() { + QnnHtpPerfInfrastructure_PowerConfig_t dcvs_v3_config = { + .option = QNN_HTP_PERF_INFRASTRUCTURE_POWER_CONFIGOPTION_DCVS_V3, + .dcvsV3Config = + { + .contextId = _qnn_htp_powerconfig_id, + + .setDcvsEnable = 1, + .dcvsEnable = 0, + + .powerMode = QNN_HTP_PERF_INFRASTRUCTURE_POWERMODE_PERFORMANCE_MODE, + + .setSleepLatency = 1, + .sleepLatency = 40, + + .setSleepDisable = 1, + .sleepDisable = 1, + + .setBusParams = 1, + .busVoltageCornerMin = DCVS_VOLTAGE_VCORNER_MAX_VOLTAGE_CORNER, + .busVoltageCornerTarget = DCVS_VOLTAGE_VCORNER_MAX_VOLTAGE_CORNER, + .busVoltageCornerMax = DCVS_VOLTAGE_VCORNER_MAX_VOLTAGE_CORNER, + + .setCoreParams = 1, + .coreVoltageCornerMin = DCVS_VOLTAGE_VCORNER_MAX_VOLTAGE_CORNER, + .coreVoltageCornerTarget = DCVS_VOLTAGE_VCORNER_MAX_VOLTAGE_CORNER, + .coreVoltageCornerMax = DCVS_VOLTAGE_VCORNER_MAX_VOLTAGE_CORNER, + }, + }; + + QnnHtpPerfInfrastructure_PowerConfig_t hmx_config = { + .option = QNN_HTP_PERF_INFRASTRUCTURE_POWER_CONFIGOPTION_HMX_V2, + .hmxV2Config = + { + .hmxPickDefault = 0, + .hmxVoltageCornerMin = DCVS_EXP_VCORNER_MAX, + .hmxVoltageCornerTarget = DCVS_EXP_VCORNER_MAX, + .hmxVoltageCornerMax = DCVS_EXP_VCORNER_MAX, + .hmxPerfMode = QNN_HTP_PERF_INFRASTRUCTURE_CLK_PERF_HIGH, + }, + }; + + QnnHtpPerfInfrastructure_PowerConfig_t rpc_ctrl_config = { + .option = QNN_HTP_PERF_INFRASTRUCTURE_POWER_CONFIGOPTION_RPC_CONTROL_LATENCY, + .rpcControlLatencyConfig = 100, + }; + + QnnHtpPerfInfrastructure_PowerConfig_t rpc_poll_config = { + .option = QNN_HTP_PERF_INFRASTRUCTURE_POWER_CONFIGOPTION_RPC_POLLING_TIME, + .rpcPollingTimeConfig = 9999, + }; + + const QnnHtpPerfInfrastructure_PowerConfig_t * power_configs[] = { + &dcvs_v3_config, + &hmx_config, + &rpc_ctrl_config, + &rpc_poll_config, + nullptr, + }; + Qnn_ErrorHandle_t ret = _qnn_htp_perfinfra->setPowerConfig(_qnn_htp_powerconfig_id, power_configs); + if (ret != QNN_SUCCESS) { + GGMLHEXAGON_LOG_WARN("failed to set HTP power config"); + } else { + GGMLHEXAGON_LOG_INFO("succeed to set HTP power config"); + } +} + +static uint8_t * ggmlqnn_create_rpc_buffer(qnn_instance * instance, const ggml_tensor * ggml_tensor, Qnn_Tensor_t * qnn_tensor, bool b_copydata) { + if (nullptr == instance || nullptr == ggml_tensor || nullptr == qnn_tensor) { + GGMLHEXAGON_LOG_WARN("invalid params\n"); + return nullptr; + } + + uint8_t * qnn_rpcbuffer = static_cast(instance->alloc_rpcmem(ggml_nbytes(ggml_tensor), 4)); + if (nullptr == qnn_rpcbuffer) { + GGMLHEXAGON_LOG_WARN("alloc rpcmem failure, %s\n", strerror(errno)); + return nullptr; + } else { + GGMLHEXAGON_LOG_DEBUG("alloc rpcmem %p successfully\n", qnn_rpcbuffer); + } + if (b_copydata) + memcpy(qnn_rpcbuffer, ggml_tensor->data, ggml_nbytes(ggml_tensor)); + instance->register_rpcmem(qnn_rpcbuffer, qnn_tensor); + return qnn_rpcbuffer; +} + +static Qnn_OpConfig_t ggmlqnn_create_op_config(const char * name, const char * package, const char * type, + Qnn_Param_t * params, uint32_t num_params, + Qnn_Tensor_t * inputs, uint32_t num_inputs, + Qnn_Tensor_t * outputs, uint32_t num_outputs) { + + 
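+    // QNN is assumed here to reject duplicate node names within one graph, so a
+    // monotonically increasing index is folded into every op-config name below.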
char opcfg_name[GGML_MAX_NAME] = {}; + + //ensure the opcfg name is unique + if (nullptr == name) { + snprintf(opcfg_name, GGML_MAX_NAME, "opcfg_%-8d", ggmlqnn_get_idx(QNN_OPCFG_INDEX)); + } else { + snprintf(opcfg_name, GGML_MAX_NAME, "opcfg_%s_%-8d", name, ggmlqnn_get_idx(QNN_OPCFG_INDEX)); + } + //GGMLHEXAGON_LOG_DEBUG("create qnn opconfig %s", opcfg_name); + ggmlqnn_inc_idx(QNN_OPCFG_INDEX); + + Qnn_OpConfigV1_t v1 = {opcfg_name, package, type, + num_params, params, + num_inputs, inputs, + num_outputs, outputs + }; + Qnn_OpConfig_t opcfg = {QNN_OPCONFIG_VERSION_1, {v1}}; + + return opcfg; +} + +static Qnn_Tensor_t * ggmlqnn_create_general_tensor(qnn_instance * instance, Qnn_GraphHandle_t graph_handle, + const ggml_tensor * tensor, const char * name, + Qnn_TensorType_t qnn_tensor_type, + Qnn_DataType_t qnn_data_type, + uint32_t rank, uint32_t * dims, + void * data, uint32_t data_size, + bool b_transpose = false) { + Qnn_ErrorHandle_t error = QNN_SUCCESS; + char tensor_name[GGML_MAX_NAME] = {}; + + //ensure the tensor name is unique + if (nullptr == name) { + snprintf(tensor_name, GGML_MAX_NAME, "tensor_%-8d", ggmlqnn_get_idx(QNN_TENSOR_INDEX)); + } else { + snprintf(tensor_name, GGML_MAX_NAME, "tensor_%s%-8d", name, ggmlqnn_get_idx(QNN_TENSOR_INDEX)); + } + GGMLHEXAGON_LOG_DEBUG("init_tensor %s", tensor_name); + ggmlqnn_inc_idx(QNN_TENSOR_INDEX); + + uint32_t reverse_dims[GGML_MAX_DIMS] = {}; + uint32_t transpose_dims[GGML_MAX_DIMS] = {}; + uint32_t * tensor_dims = nullptr; + //case 1:use dims info from ggml tensor + if (nullptr != tensor) { + //there are different dimension order between ggml tensor and qnn tensor + for (size_t idx = 0; idx < rank; idx++) { + reverse_dims[idx] = (uint32_t)tensor->ne[rank - 1 - idx]; + } + tensor_dims = reverse_dims; + } + //case 2: use user's specified tensor_dims + if (nullptr != dims) { + tensor_dims = dims; + } + //case 3: transpose for dst tensor + if (b_transpose) { + GGML_ASSERT(tensor != nullptr); //ensure ggml_tensor is not nullptr for this special case + + ggmlqnn_get_qnn_dimensions_from_ggml_dimensions(transpose_dims, reverse_dims, ggml_n_dims(tensor)); + tensor_dims = transpose_dims; + } + + Qnn_Tensor_t qnn_tensor = { + .version = QNN_TENSOR_VERSION_1, + .v1 = { + .id = 0, + .name = tensor_name, + .type = qnn_tensor_type, + .dataFormat = QNN_TENSOR_DATA_FORMAT_FLAT_BUFFER, + .dataType = qnn_data_type, + .quantizeParams = {.encodingDefinition = QNN_DEFINITION_UNDEFINED, + .quantizationEncoding = QNN_QUANTIZATION_ENCODING_UNDEFINED}, + .rank = rank, + .dimensions = tensor_dims, + .memType = QNN_TENSORMEMTYPE_RAW, + .clientBuf = {.data = nullptr, .dataSize = 0} + } + }; + Qnn_Tensor_t * p_qnn_tensor = (Qnn_Tensor_t *)calloc(1, sizeof(Qnn_Tensor_t)); + if (nullptr == p_qnn_tensor) { + GGMLHEXAGON_LOG_WARN("calloc failed"); + return nullptr; + } + error = ggmlqnn_deep_copy_qnntensor(qnn_tensor, *p_qnn_tensor); + if (error != QNN_SUCCESS) { + free(p_qnn_tensor); + GGMLHEXAGON_LOG_WARN("init tensor failed"); + return nullptr; + } + + bool enable_npu_rpc = (instance->enable_qnn_rpc() && instance->get_device_id() == HEXAGON_BACKEND_QNNNPU); + if (enable_npu_rpc) { + QNN_VER_PTR(*p_qnn_tensor)->memType = QNN_TENSORMEMTYPE_MEMHANDLE; + QNN_VER_PTR(*p_qnn_tensor)->clientBuf = {.data=nullptr, .dataSize=0}; + } else { + QNN_VER_PTR(*p_qnn_tensor)->clientBuf = {data, data_size}; + } + QNN_INTERFACE_VER_TYPE qnn_raw_interface = instance->get_qnn_raw_interface(); + CHECK_QNN_API(error, qnn_raw_interface.tensorCreateGraphTensor(graph_handle, p_qnn_tensor)); 
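+
+    // note on the branch above: with NPU RPC enabled the tensor is backed by a
+    // registered ION buffer (QNN_TENSORMEMTYPE_MEMHANDLE) and data moves through
+    // rpc memory at execute time; otherwise clientBuf points QNN directly at
+    // host memory supplied by the caller.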
+ + return p_qnn_tensor; +} + +static Qnn_Tensor_t * ggmlqnn_create_compute_tensor(qnn_instance * instance, Qnn_GraphHandle_t graph_handle, + const ggml_tensor * tensor, Qnn_TensorType_t tensor_type) { + uint32_t dimensions[] = {(uint32_t) tensor->ne[0], (uint32_t) tensor->ne[1], + (uint32_t) tensor->ne[2], (uint32_t) tensor->ne[3]}; + Qnn_DataType_t qnn_data_type = QNN_DATATYPE_FLOAT_32; + Qnn_TensorType_t qnn_tensor_type = QNN_TENSOR_TYPE_APP_WRITE; + + if (0 == tensor->flags) { + qnn_tensor_type = tensor_type; + } else { + if (tensor->flags & GGML_TENSOR_FLAG_INPUT) { + qnn_tensor_type = QNN_TENSOR_TYPE_APP_WRITE; + } else if (tensor->flags & GGML_TENSOR_FLAG_OUTPUT) { + qnn_tensor_type = QNN_TENSOR_TYPE_APP_READ; + } + } + + qnn_data_type = ggmlqnn_datatype_from_ggml_datatype(tensor->type); + Qnn_Tensor_t * p_qnn_tensor = ggmlqnn_create_general_tensor(instance, graph_handle, tensor, nullptr, + qnn_tensor_type, qnn_data_type, + ggml_n_dims(tensor), dimensions, + nullptr, 0); + return p_qnn_tensor; +} + +// ================================================================================================= +// section-6: hwaccel approach through QNN: offload GGML op to QNN backend +// ================================================================================================= +/* + * provide a general skeleton to offload ggml op to QNN backend: perform element-wise + * operation on 1/2 input tensors and 1 output tensors +*/ +static void ggmlqnn_compute_elementwise(ggml_backend_hexagon_context * ctx, ggml_tensor * op) { + Qnn_ErrorHandle_t error = QNN_SUCCESS; + qnn_instance * instance = nullptr; + Qnn_GraphHandle_t graph_handle = nullptr; + Qnn_Tensor_t * p_tensor0 = nullptr; + Qnn_Tensor_t * p_tensor1 = nullptr; + Qnn_Tensor_t * p_tensor2 = nullptr; + const ggml_tensor * src0 = op->src[0]; + const ggml_tensor * src1 = op->src[1]; + ggml_tensor * dst = op; + + GGMLQNN_CHECK_PARAMS(ctx, src0, src1, dst); + instance = ctx->instance; + QNN_INTERFACE_VER_TYPE qnn_raw_interface = ctx->raw_interface; + size_t qnn_op_index = ggmlhexagon_get_op_index(op); + const char * qnn_op_name = ggmlqnn_k_op_caps[qnn_op_index].qnn_op_name; + size_t input_param_count = ggmlqnn_k_op_caps[qnn_op_index].input_param_count; + const char * ggml_original_opname = ggml_op_name(op->op); + std::string ggml_op_name_string = std::string("ggml_") + ggml_original_opname; + const char * ggml_op_name = ggml_op_name_string.c_str(); + + std::string graph_name; + ggmlhexagon_get_opkey_from_op(op, graph_name); + + int input_size = ggml_nbytes(src0); + if (nullptr != src1) + input_size += ggml_nbytes(src1); + hexagon_perf op_perf(graph_name, ggml_original_opname, input_size, ggml_nbytes(dst)); + op_perf.start(); + + bool enable_npu_rpc = instance->enable_qnn_rpc() && ctx->device == HEXAGON_BACKEND_QNNNPU; + if (ctx->qnn_singlenode_graph_map.find(graph_name) != ctx->qnn_singlenode_graph_map.end()) { + //retrieve computational resource from cached QNN graph + qnn_singlenode_res_t & graph_item = ctx->qnn_singlenode_graph_map[graph_name]; + graph_handle = std::get<0>(graph_item); + qnn_ptensors_t & ptensors = std::get<1>(graph_item); + p_tensor0 = ptensors[0]; + if (2 == input_param_count) { + p_tensor1 = ptensors[1]; + p_tensor2 = ptensors[2]; + } else { + //now p_tensor1 is nullptr + p_tensor2 = ptensors[1]; + } + } else { + GGML_ASSERT(instance->get_device_id() == ctx->device); + GGMLHEXAGON_LOG_INFO("graph name %s", graph_name.c_str()); + //create QNN graph + error = instance->init_qnn_graph(graph_name, 
static_cast<HEXAGONBackend>(ctx->device),
+                                       g_hexagon_appcfg.vtcm_size_in_mb,
+                                       g_hexagon_appcfg.hvx_threads);
+        if (QNN_SUCCESS != error) {
+            GGMLHEXAGON_LOG_WARN("can't create qnn graph handle with graph name %s, error = %d\n", graph_name.c_str(), error);
+            return;
+        }
+        graph_handle = instance->get_qnn_graph_handle();
+
+        //GGMLHEXAGON_LOG_DEBUG("graph_handle %p", graph_handle);
+        //create computational tensors
+        p_tensor0 = ggmlqnn_create_compute_tensor(instance, graph_handle, src0, QNN_TENSOR_TYPE_APP_WRITE);
+        if (2 == input_param_count) {
+            p_tensor1 = ggmlqnn_create_compute_tensor(instance, graph_handle, src1, QNN_TENSOR_TYPE_APP_WRITE);
+        }
+        p_tensor2 = ggmlqnn_create_compute_tensor(instance, graph_handle, dst, QNN_TENSOR_TYPE_APP_READ);
+
+        //compose QNN graph
+        qnn_tensors_t input_tensors;
+        input_tensors.reserve(input_param_count);
+        input_tensors.push_back(*p_tensor0);
+        if (2 == input_param_count) {
+            input_tensors.push_back(*p_tensor1);
+        }
+        Qnn_Tensor_t output_tensors[] = {
+            *p_tensor2
+        };
+        Qnn_OpConfig_t op_config = ggmlqnn_create_op_config(ggml_op_name,
+                                                            QNN_OP_PACKAGE_NAME_QTI_AISW,
+                                                            qnn_op_name, nullptr, 0,
+                                                            input_tensors.data(),
+                                                            input_param_count, output_tensors,
+                                                            1);
+        CHECK_QNN_API(error, qnn_raw_interface.graphAddNode(graph_handle, op_config));
+        //finalize QNN graph
+        CHECK_QNN_API(error, qnn_raw_interface.graphFinalize(graph_handle, nullptr, nullptr));
+
+        //cache QNN graph
+        qnn_ptensors_t qnn_elementwise_tensors;
+        qnn_elementwise_tensors.reserve(input_param_count + 1);
+
+        qnn_elementwise_tensors.push_back(p_tensor0);
+        if (2 == input_param_count) {
+            qnn_elementwise_tensors.push_back(p_tensor1);
+        }
+        qnn_elementwise_tensors.push_back(p_tensor2);
+        auto graph_item = std::make_tuple(graph_handle, qnn_elementwise_tensors);
+        ctx->qnn_singlenode_graph_map[graph_name] = graph_item;
+    }
+
+    if (enable_npu_rpc) {
+        uint8_t * qnn_buffer_0 = static_cast<uint8_t *>(instance->get_rpcmem_from_memhandle(
+                                                                QNN_VER_PTR(*p_tensor0)->memHandle));
+        GGMLHEXAGON_LOG_DEBUG("qnn_rpcbuffer_0 = %p\n", qnn_buffer_0);
+        if (nullptr != qnn_buffer_0) {
+            memcpy(qnn_buffer_0, src0->data, ggml_nbytes(src0));
+        }
+
+        if (2 == input_param_count) {
+            uint8_t * qnn_buffer_1 = static_cast<uint8_t *>(instance->get_rpcmem_from_memhandle(
+                                                                    QNN_VER_PTR(*p_tensor1)->memHandle));
+            GGMLHEXAGON_LOG_DEBUG("qnn_rpcbuffer_1 = %p\n", qnn_buffer_1);
+            if (nullptr != qnn_buffer_1) {
+                memcpy(qnn_buffer_1, src1->data, ggml_nbytes(src1));
+            }
+        }
+    } else {
+        QNN_VER_PTR(*p_tensor0)->clientBuf = {src0->data, ggmlqnn_get_tensor_data_size(src0)};
+        if (2 == input_param_count) {
+            QNN_VER_PTR(*p_tensor1)->clientBuf = {src1->data, ggmlqnn_get_tensor_data_size(src1)};
+        }
+        QNN_VER_PTR(*p_tensor2)->clientBuf = {dst->data, ggmlqnn_get_tensor_data_size(dst)};
+    }
+
+    qnn_tensors_t input_tensors;
+    input_tensors.reserve(input_param_count);
+    input_tensors.push_back(*p_tensor0);
+    if (2 == input_param_count) {
+        input_tensors.push_back(*p_tensor1);
+    }
+    Qnn_Tensor_t output_tensors[] = {
+        *p_tensor2
+    };
+    CHECK_QNN_API(error, qnn_raw_interface.graphExecute(graph_handle,
+                                                        input_tensors.data(), input_param_count,
+                                                        output_tensors, 1,
+                                                        nullptr, nullptr));
+    if (enable_npu_rpc) {
+        uint8_t * qnn_buffer_2 = static_cast<uint8_t *>(instance->get_rpcmem_from_memhandle(QNN_VER_PTR(*p_tensor2)->memHandle));
+        if (nullptr != qnn_buffer_2) {
+            memcpy(dst->data, qnn_buffer_2, ggml_nbytes(dst));
+        }
+    }
+
+    op_perf.info();
+}
+
+/*
+ * this function is AI-assisted code from Grok 3 for the purpose of offloading 4d matrix mulmat to the QNN backend
+ * various UT has verified 
and succeed but failed in CT of test-backend-ops + * + * the logic of ggmlqnn_compute_mul_mat_4d is similar to ggmlqnn_compute_mul_mat but much more complicated + * than ggmlqnn_compute_mul_mat, so it's a standalone function. + * it will be combined with ggmlqnn_compute_mul_mat in the future + */ +static void ggmlqnn_compute_mul_mat_4d(ggml_backend_hexagon_context * ctx, ggml_tensor * op) { + Qnn_ErrorHandle_t error = QNN_SUCCESS; + qnn_instance * instance = ctx->instance; + QNN_INTERFACE_VER_TYPE qnn_raw_interface = ctx->raw_interface; + + const ggml_tensor * src0 = op->src[0]; + const ggml_tensor * src1 = op->src[1]; + ggml_tensor * dst = op; + + GGMLQNN_CHECK_PARAMS(ctx, src0, src1, dst); + GGML_ASSERT(ggml_n_dims(src0) == 4 && ggml_n_dims(src1) == 4); + + hexagon_perf op_perf("ggmlqnn_compute_mul_mat_4d"); + op_perf.start(); + + std::string graph_name; + ggmlhexagon_get_opkey_from_op(op, graph_name); + GGMLHEXAGON_LOG_DEBUG("graph name %s\n", graph_name.c_str()); + + ggmlhexagon_print_tensors_info(__func__, ctx, src0, src1, dst); + + Qnn_GraphHandle_t graph_handle = nullptr; + Qnn_Tensor_t * p_tensor0 = nullptr; + Qnn_Tensor_t * p_reshape0_out = nullptr; + Qnn_Tensor_t * p_tile0_out = nullptr; + Qnn_Tensor_t * p_tensor1 = nullptr; + Qnn_Tensor_t * p_permute1_out = nullptr; + Qnn_Tensor_t * p_reshape1_out = nullptr; + Qnn_Tensor_t * p_matmul_out = nullptr; + Qnn_Tensor_t * p_reshape2_out = nullptr; + + if (ctx->qnn_singlenode_graph_map.find(graph_name) != ctx->qnn_singlenode_graph_map.end()) { + qnn_singlenode_res_t & graph_item = ctx->qnn_singlenode_graph_map[graph_name]; + graph_handle = std::get<0>(graph_item); + qnn_ptensors_t & tensors = std::get<1>(graph_item); + p_tensor0 = tensors[0]; + p_reshape0_out = tensors[1]; + p_tile0_out = tensors[2]; + p_tensor1 = tensors[3]; + p_permute1_out = tensors[4]; + p_reshape1_out = tensors[5]; + p_matmul_out = tensors[6]; + p_reshape2_out = tensors[7]; + } else { + CHECK_QNN_API(error, qnn_raw_interface.graphCreate(instance->get_qnn_context_handle(), graph_name.c_str(), NULL, &graph_handle)); + + // Define dimensions + uint32_t K = src0->ne[0]; // Inner dimension + uint32_t M = src0->ne[1]; // Rows of src0 + uint32_t N = src1->ne[1]; // Columns of src1 + uint32_t B0 = src0->ne[2] * src0->ne[3]; // src0 batch + uint32_t B1 = src1->ne[2] * src1->ne[3]; // src1 batch (drives output) + + // Validate K only + GGML_ASSERT(src0->ne[0] == src1->ne[0]); // K must match + + // src0: [K, M, H0, B0] -> QNN: [B0, H0, M, K] + uint32_t src0_dims[] = {static_cast(src0->ne[3]), static_cast(src0->ne[2]), + static_cast(src0->ne[1]), static_cast(src0->ne[0]) + }; + p_tensor0 = ggmlqnn_create_general_tensor(instance, graph_handle, src0, "input0", + QNN_TENSOR_TYPE_APP_WRITE, QNN_DATATYPE_FLOAT_32, 4, + src0_dims, nullptr, 0); + + // Reshape src0 to [B0, M, K] + uint32_t reshape0_out_dims[] = {B0, M, K}; + p_reshape0_out = ggmlqnn_create_general_tensor(instance, graph_handle, nullptr, "reshape0_out", + QNN_TENSOR_TYPE_NATIVE, QNN_DATATYPE_FLOAT_32, 3, + reshape0_out_dims, nullptr, 0); + + Qnn_Tensor_t reshape0_inputs[] = {*p_tensor0}; + Qnn_Tensor_t reshape0_outputs[] = {*p_reshape0_out}; + Qnn_OpConfig_t reshape0_op = ggmlqnn_create_op_config("reshape0", QNN_OP_PACKAGE_NAME_QTI_AISW, + QNN_OP_RESHAPE, nullptr, 0, + reshape0_inputs, 1, reshape0_outputs, 1); + CHECK_QNN_API(error, qnn_raw_interface.graphAddNode(graph_handle, reshape0_op)); + + // Tile src0 to match B1: [B0, M, K] -> [B1, M, K] + uint32_t tile0_out_dims[] = {B1, M, K}; + p_tile0_out = 
ggmlqnn_create_general_tensor(instance, graph_handle, nullptr, "tile0_out", + QNN_TENSOR_TYPE_NATIVE, QNN_DATATYPE_FLOAT_32, 3, + tile0_out_dims, nullptr, 0); + + uint32_t tile_multiples[] = {B1 / B0, 1, 1}; + uint32_t tile_dims[] = {3}; + Qnn_Tensor_t * p_tile_multiples = ggmlqnn_create_general_tensor(instance, graph_handle, nullptr, "tile_multiples", + QNN_TENSOR_TYPE_STATIC, QNN_DATATYPE_UINT_32, 1, + tile_dims, tile_multiples, sizeof(tile_multiples)); + + Qnn_Param_t tile_params[] = {{.paramType = QNN_PARAMTYPE_TENSOR, .name = "multiples", .tensorParam = *p_tile_multiples}}; + Qnn_Tensor_t tile0_inputs[] = {*p_reshape0_out}; + Qnn_Tensor_t tile0_outputs[] = {*p_tile0_out}; + Qnn_OpConfig_t tile0_op = ggmlqnn_create_op_config("tile0", QNN_OP_PACKAGE_NAME_QTI_AISW, + QNN_OP_TILE, tile_params, 1, + tile0_inputs, 1, tile0_outputs, 1); + CHECK_QNN_API(error, qnn_raw_interface.graphAddNode(graph_handle, tile0_op)); + + // src1: [N, K, H1, B1] -> QNN: [B1, H1, N, K] + uint32_t src1_dims[] = {static_cast(src1->ne[3]), static_cast(src1->ne[2]), + static_cast(src1->ne[1]), static_cast(src1->ne[0]) + }; + p_tensor1 = ggmlqnn_create_general_tensor(instance, graph_handle, src1, "input1", + QNN_TENSOR_TYPE_APP_WRITE, QNN_DATATYPE_FLOAT_32, 4, + src1_dims, nullptr, 0); + + + // Permute src1 to [B1, H1, K, N] + uint32_t perm_data[] = {0, 1, 3, 2}; + uint32_t perm_dims[] = {4}; + Qnn_Tensor_t * p_perm = ggmlqnn_create_general_tensor(instance, graph_handle, nullptr, "perm", + QNN_TENSOR_TYPE_STATIC, QNN_DATATYPE_UINT_32, 1, + perm_dims, perm_data, sizeof(perm_data)); + + uint32_t permute1_out_dims[] = {static_cast(src1->ne[3]), static_cast(src1->ne[2]), + static_cast(src1->ne[0]), static_cast(src1->ne[1]) + }; + p_permute1_out = ggmlqnn_create_general_tensor(instance, graph_handle, nullptr, "permute1_out", + QNN_TENSOR_TYPE_NATIVE, QNN_DATATYPE_FLOAT_32, 4, + permute1_out_dims, nullptr, 0); + + Qnn_Param_t permute1_params[] = {{.paramType = QNN_PARAMTYPE_TENSOR, .name = "perm", .tensorParam = *p_perm}}; + Qnn_Tensor_t permute1_inputs[] = {*p_tensor1}; + Qnn_Tensor_t permute1_outputs[] = {*p_permute1_out}; + Qnn_OpConfig_t permute1_op = ggmlqnn_create_op_config("permute1", QNN_OP_PACKAGE_NAME_QTI_AISW, + QNN_OP_TRANSPOSE, permute1_params, 1, + permute1_inputs, 1, permute1_outputs, 1); + CHECK_QNN_API(error, qnn_raw_interface.graphAddNode(graph_handle, permute1_op)); + + // Reshape src1 to [B1, K, N] + uint32_t reshape1_out_dims[] = {B1, K, N}; + p_reshape1_out = ggmlqnn_create_general_tensor(instance, graph_handle, nullptr, "reshape1_out", + QNN_TENSOR_TYPE_NATIVE, QNN_DATATYPE_FLOAT_32, 3, + reshape1_out_dims, nullptr, 0); + + Qnn_Tensor_t reshape1_inputs[] = {*p_permute1_out}; + Qnn_Tensor_t reshape1_outputs[] = {*p_reshape1_out}; + Qnn_OpConfig_t reshape1_op = ggmlqnn_create_op_config("reshape1", QNN_OP_PACKAGE_NAME_QTI_AISW, + QNN_OP_RESHAPE, nullptr, 0, + reshape1_inputs, 1, reshape1_outputs, 1); + CHECK_QNN_API(error, qnn_raw_interface.graphAddNode(graph_handle, reshape1_op)); + + // MatMul: [B1, M, K] x [B1, K, N] -> [B1, M, N] + uint32_t matmul_out_dims[] = {B1, M, N}; + p_matmul_out = ggmlqnn_create_general_tensor(instance, graph_handle, nullptr, "matmul_out", + QNN_TENSOR_TYPE_NATIVE, QNN_DATATYPE_FLOAT_32, 3, + matmul_out_dims, nullptr, 0); + + Qnn_Tensor_t matmul_inputs[] = {*p_tile0_out, *p_reshape1_out}; + Qnn_Tensor_t matmul_outputs[] = {*p_matmul_out}; + Qnn_OpConfig_t matmul_op = ggmlqnn_create_op_config("matmul", QNN_OP_PACKAGE_NAME_QTI_AISW, + QNN_OP_MAT_MUL, nullptr, 0, + 
matmul_inputs, 2, matmul_outputs, 1); + CHECK_QNN_API(error, qnn_raw_interface.graphAddNode(graph_handle, matmul_op)); + + // Output: [N, M, H1, B1] -> QNN: [B1, H1, M, N] + uint32_t reshape2_out_dims[] = {static_cast(dst->ne[3]), static_cast(dst->ne[2]), + static_cast(dst->ne[1]), static_cast(dst->ne[0]) + }; + p_reshape2_out = ggmlqnn_create_general_tensor(instance, graph_handle, dst, "output", + QNN_TENSOR_TYPE_APP_READ, QNN_DATATYPE_FLOAT_32, 4, + reshape2_out_dims, nullptr, 0); + + Qnn_Tensor_t reshape2_inputs[] = {*p_matmul_out}; + Qnn_Tensor_t reshape2_outputs[] = {*p_reshape2_out}; + Qnn_OpConfig_t reshape2_op = ggmlqnn_create_op_config("reshape2", QNN_OP_PACKAGE_NAME_QTI_AISW, + QNN_OP_RESHAPE, nullptr, 0, + reshape2_inputs, 1, reshape2_outputs, 1); + CHECK_QNN_API(error, qnn_raw_interface.graphAddNode(graph_handle, reshape2_op)); + + // Finalize + CHECK_QNN_API(error, qnn_raw_interface.graphFinalize(graph_handle, NULL, NULL)); + + // Cache + qnn_ptensors_t ggml_op_mulmat_tensors = {p_tensor0, p_reshape0_out, p_tile0_out, p_tensor1, + p_permute1_out, p_reshape1_out, p_matmul_out, p_reshape2_out + }; + ctx->qnn_singlenode_graph_map[graph_name] = std::make_tuple(graph_handle, ggml_op_mulmat_tensors); + } + + // Execute + QNN_VER_PTR(*p_tensor0)->clientBuf = {src0->data, static_cast(ggml_nbytes(src0))}; + QNN_VER_PTR(*p_tensor1)->clientBuf = {src1->data, static_cast(ggml_nbytes(src1))}; + QNN_VER_PTR(*p_reshape2_out)->clientBuf = {dst->data, static_cast(ggml_nbytes(dst))}; + + Qnn_Tensor_t input_tensors[] = {*p_tensor0, *p_tensor1}; + Qnn_Tensor_t output_tensors[] = {*p_reshape2_out}; + CHECK_QNN_API(error, qnn_raw_interface.graphExecute(graph_handle, input_tensors, 2, output_tensors, 1, NULL, NULL)); + + op_perf.info(); +} + +/* + * @brief performs matrix multiplication with FP32 & quantized weights and floating-point inputs + * using the QNN backend. this function performs matrix multiplication of the input tensor + * `src1` and the weight tensor `src0`, handling transposing, and quantization as needed, + * and stores the result in the destination tensor `dst`. + * + there are two key-points in properly handling how to offload mulmat to the QNN + 1. transpose + a 3x2 f32 matrix which means 3 rows and 2 columns. in ggml, it could be created from: + struct ggml_tensor* matrix = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, 2, 3); + which like this: + +---+---+ + | 0 | 1 | + +---+---+ + | 2 | 3 | + +---+---+ + | 4 | 5 | + +---+---+ + with + ne[0] = 2 + ne[1] = 3 + there are different dimension order between ggml tensor and qnn tensor + + 2. QNN's MatMul can only support input tensors with rank >= 2 + + in the all, there is gap between ggml mulmat and QNN mulmat,we need to perform a transpose + operation when offloading mulmat to QNN backend. this implementation will handle transpose + in func ggmlqnn_compute_create_general_tensor() + + * @param ctx the context of backend + * @param op the destination tensor where the result of the matrix multiplication will be stored. + * + * @note the logic of ggmlqnn_compute_mul_mat is similar to ggmlqnn_compute_op_two_tensors but much more complicated + * than ggmlqnn_compute_op_two_tensors. so it's a standalone function. accordingly, this is another + * typical skeleton for offload other ggml ops to QNN backend. MUL_MAT take most of the compute + * time (about 95%).so to speed up llama inference, should focus on this func. 
there are three kinds + * of MUL_MAT to compute: + * mul_mat_f32: both src0 and src1 are F32, this will be naturally handled in QNN backend + * mul_mat_f16_f32: src0 is F16 and src1 is F32, f16 in src0 -> f32 in src0', then src0' * src1 + * mul_mat_q_f32: src0 is quantized (Q4_0, Q4_1, Q6_K...) + * and src1 is F32, src0 -> f32 in src0', then src0' * src1 +*/ +static void ggmlqnn_compute_mul_mat(ggml_backend_hexagon_context * ctx, ggml_tensor * op) { + Qnn_ErrorHandle_t error = QNN_SUCCESS; + qnn_instance * instance = nullptr; + Qnn_GraphHandle_t graph_handle = nullptr; + Qnn_Tensor_t * p_tensor0 = nullptr; + Qnn_Tensor_t * p_tensor1 = nullptr; + Qnn_Tensor_t * p_tensor2 = nullptr; + Qnn_Tensor_t * p_param_tensor = nullptr; + Qnn_Tensor_t * p_tensor2_transpose = nullptr; + const ggml_tensor * src0 = op->src[0]; + const ggml_tensor * src1 = op->src[1]; + ggml_tensor * dst = op; + + GGMLQNN_CHECK_PARAMS(ctx, src0, src1, dst); + instance = ctx->instance; + QNN_INTERFACE_VER_TYPE qnn_raw_interface = ctx->raw_interface; + + const enum ggml_type src0_type = src0->type; + const uint32_t src0_rank = ggml_n_dims(src0); + const uint32_t src1_rank = ggml_n_dims(src1); + const char * ggml_original_opname = ggml_op_name(op->op); + ggmlhexagon_print_tensors_info(__func__, ctx, src0, src1, dst); + + std::string graph_name; + ggmlhexagon_get_opkey_from_op(op, graph_name); + + int input_size = ggml_nbytes(src0); + if (nullptr != src1) + input_size += ggml_nbytes(src1); + hexagon_perf op_perf(graph_name, ggml_original_opname, input_size, ggml_nbytes(dst)); + op_perf.start(); + + GGML_ASSERT(src0_rank == src1_rank); + GGML_ASSERT(src0_rank >= 2); //QNN SDK's limitation, make QNN SDK happy + if (4 == src0_rank) { + return ggmlqnn_compute_mul_mat_4d(ctx, op); + } + + void * wdata = ggmlhexagon_type_trait(ctx, op); + const size_t desired_size = ctx->desired_size; + + if (ctx->qnn_singlenode_graph_map.find(graph_name) != ctx->qnn_singlenode_graph_map.end()) { + //retrieve computational resource from cached QNN graph + qnn_singlenode_res_t & graph_item = ctx->qnn_singlenode_graph_map[graph_name]; + graph_handle = std::get<0>(graph_item); + qnn_ptensors_t &tensors = std::get<1>(graph_item); + p_tensor0 = tensors[0]; + p_tensor1 = tensors[1]; + p_tensor2 = tensors[2]; + p_param_tensor = tensors[3]; + p_tensor2_transpose = tensors[4]; + } else { + //create QNN graph + GGMLHEXAGON_LOG_INFO("graph name %s", graph_name.c_str()); + error = instance->init_qnn_graph(graph_name, static_cast(ctx->device), + g_hexagon_appcfg.vtcm_size_in_mb, + g_hexagon_appcfg.hvx_threads); + if (QNN_SUCCESS != error) { + GGMLHEXAGON_LOG_WARN("can't create qnn graph handle with graph name %s, error = %d\n", + graph_name.c_str(), error); + return; + } + graph_handle = instance->get_qnn_graph_handle(); + + //create computational tensor + p_tensor0 = ggmlqnn_create_general_tensor(instance, graph_handle, src0, nullptr, + QNN_TENSOR_TYPE_APP_WRITE, + QNN_DATATYPE_FLOAT_32, src0_rank, + nullptr, nullptr, 0); + p_tensor1 = ggmlqnn_create_general_tensor(instance, graph_handle, src1, nullptr, + QNN_TENSOR_TYPE_APP_WRITE, + QNN_DATATYPE_FLOAT_32, src0_rank, + nullptr, nullptr, 0); + p_tensor2 = ggmlqnn_create_general_tensor(instance, graph_handle, dst, nullptr, + QNN_TENSOR_TYPE_APP_READ, + QNN_DATATYPE_FLOAT_32, src0_rank, + nullptr, nullptr, 0); + + //create param tensor for offload 2d/3d/4d matrix multiplication + const uint32_t param_tensor_data[GGML_MAX_DIMS][GGML_MAX_DIMS] = { + {0}, + {1, 0}, + {0, 2, 1}, + {0, 1, 3, 2}, + }; + uint32_t 
param_tensor_dims[1] = {src0_rank}; + p_param_tensor = ggmlqnn_create_general_tensor(instance, graph_handle, nullptr, "param", + QNN_TENSOR_TYPE_STATIC, + QNN_DATATYPE_UINT_32, 1, + param_tensor_dims, + (void *) (param_tensor_data[src0_rank - 1]), + src0_rank * sizeof(uint32_t)); + + //create transpose tensor + p_tensor2_transpose = ggmlqnn_create_general_tensor(instance, graph_handle, dst, + "transpose", + QNN_TENSOR_TYPE_NATIVE, + QNN_DATATYPE_FLOAT_32, src0_rank, + nullptr, nullptr, 0, true); + + //compose QNN graph: add mulmat node + Qnn_Param_t out_0_params[] = { + {.paramType = QNN_PARAMTYPE_SCALAR, .name = QNN_OP_MAT_MUL_PARAM_TRANSPOSE_IN1, .scalarParam = { + .dataType = QNN_DATATYPE_BOOL_8, .bool8Value = 1}}}; + Qnn_Tensor_t out_0_inputs[] = {*p_tensor0, *p_tensor1}; + Qnn_Tensor_t out_0_outputs[] = {*p_tensor2_transpose}; + Qnn_OpConfig_t out_0 = ggmlqnn_create_op_config("mulmat_opconfig", + QNN_OP_PACKAGE_NAME_QTI_AISW, + QNN_OP_MAT_MUL, out_0_params, 1, + out_0_inputs, 2, out_0_outputs, 1); + CHECK_QNN_API(error, qnn_raw_interface.graphAddNode(graph_handle, out_0)); + + //compose QNN graph: add transpose node + Qnn_Param_t out_trans1_0_params[] = { + {.paramType = QNN_PARAMTYPE_TENSOR, .name = "perm", .tensorParam = *p_param_tensor}}; + Qnn_Tensor_t out_trans1_0_inputs[] = {*p_tensor2_transpose}; + Qnn_Tensor_t out_trans1_0_outputs[] = {*p_tensor2}; + Qnn_OpConfig_t out_trans1_0 = ggmlqnn_create_op_config("mulmat_transpose_opconfig", + QNN_OP_PACKAGE_NAME_QTI_AISW, + QNN_OP_TRANSPOSE, + out_trans1_0_params, 1, + out_trans1_0_inputs, 1, + out_trans1_0_outputs, 1); + CHECK_QNN_API(error, qnn_raw_interface.graphAddNode(graph_handle, out_trans1_0)); + + //finalize QNN graph + CHECK_QNN_API(error, qnn_raw_interface.graphFinalize(graph_handle, nullptr, nullptr)); + + //cache QNN graph + qnn_ptensors_t ggml_op_mulmat_tensors; + ggml_op_mulmat_tensors.reserve(5); + ggml_op_mulmat_tensors.push_back(p_tensor0); + ggml_op_mulmat_tensors.push_back(p_tensor1); + ggml_op_mulmat_tensors.push_back(p_tensor2); + ggml_op_mulmat_tensors.push_back(p_param_tensor); + ggml_op_mulmat_tensors.push_back(p_tensor2_transpose); + auto graph_item = std::make_tuple(graph_handle, ggml_op_mulmat_tensors); + ctx->qnn_singlenode_graph_map[graph_name] = graph_item; + } + + if (src0_type != GGML_TYPE_F32) { + QNN_VER_PTR(*p_tensor0)->clientBuf = {wdata, static_cast(desired_size)}; + } else { + QNN_VER_PTR(*p_tensor0)->clientBuf = {src0->data, ggmlqnn_get_tensor_data_size(src0)}; + } + QNN_VER_PTR(*p_tensor1)->clientBuf = {src1->data, ggmlqnn_get_tensor_data_size(src1)}; + QNN_VER_PTR(*p_tensor2)->clientBuf = {dst->data, ggmlqnn_get_tensor_data_size(dst)}; + + Qnn_Tensor_t tensor_inputs[] = { + *p_tensor0, + *p_tensor1 + }; + Qnn_Tensor_t tensor_outputs[] = { + *p_tensor2 + }; + CHECK_QNN_API(error, qnn_raw_interface.graphExecute(graph_handle, + tensor_inputs, 2, + tensor_outputs, 1, + nullptr, nullptr)); + op_perf.info(); +} + +static void ggmlqnn_compute_repeat(ggml_backend_hexagon_context * ctx, ggml_tensor * dst) { + GGML_UNUSED(ctx); + GGML_UNUSED(dst); +} + +static void ggmlqnn_compute_div(ggml_backend_hexagon_context * ctx, ggml_tensor * dst) { + GGML_UNUSED(ctx); + GGML_UNUSED(dst); +} + +static void ggmlqnn_compute_leaky_relu(ggml_backend_hexagon_context * ctx, ggml_tensor * dst) { + GGML_UNUSED(ctx); + GGML_UNUSED(dst); +} + +static void ggmlqnn_compute_concat(ggml_backend_hexagon_context * ctx, ggml_tensor * dst) { + GGML_UNUSED(ctx); + GGML_UNUSED(dst); +} + +static void 
ggmlqnn_compute_arange(ggml_backend_hexagon_context * ctx, ggml_tensor * dst) { + GGML_UNUSED(ctx); + GGML_UNUSED(dst); +} + +static void ggmlqnn_compute_sqr(ggml_backend_hexagon_context * ctx, ggml_tensor * dst) { + GGML_UNUSED(ctx); + GGML_UNUSED(dst); +} + +static void ggmlqnn_compute_clamp(ggml_backend_hexagon_context * ctx, ggml_tensor * dst) { + GGML_UNUSED(ctx); + GGML_UNUSED(dst); +} + +static void ggmlqnn_compute_scale(ggml_backend_hexagon_context * ctx, ggml_tensor * dst) { + GGML_UNUSED(ctx); + GGML_UNUSED(dst); +} + +static void ggmlqnn_compute_argsort(ggml_backend_hexagon_context * ctx, ggml_tensor * dst) { + GGML_UNUSED(ctx); + GGML_UNUSED(dst); +} + +static void ggmlqnn_compute_norm(ggml_backend_hexagon_context * ctx, ggml_tensor * dst) { + GGML_UNUSED(ctx); + GGML_UNUSED(dst); +} + +static void ggmlqnn_compute_group_norm(ggml_backend_hexagon_context * ctx, ggml_tensor * dst) { + GGML_UNUSED(ctx); + GGML_UNUSED(dst); +} + +static void ggmlqnn_compute_acc(ggml_backend_hexagon_context * ctx, ggml_tensor * dst) { + GGML_UNUSED(ctx); + GGML_UNUSED(dst); +} + +static void ggmlqnn_compute_sum_rows(ggml_backend_hexagon_context * ctx, ggml_tensor * dst) { + GGML_UNUSED(ctx); + GGML_UNUSED(dst); +} + +static void ggmlqnn_compute_upsample_nearest2d(ggml_backend_hexagon_context * ctx, ggml_tensor * dst) { + GGML_UNUSED(ctx); + GGML_UNUSED(dst); +} + +static void ggmlqnn_compute_pad(ggml_backend_hexagon_context * ctx, ggml_tensor * dst) { + GGML_UNUSED(ctx); + GGML_UNUSED(dst); +} + +static void ggmlqnn_compute_pool2d(ggml_backend_hexagon_context * ctx, ggml_tensor * dst) { + GGML_UNUSED(ctx); + GGML_UNUSED(dst); +} + +static void ggmlqnn_compute_dup(ggml_backend_hexagon_context * ctx, ggml_tensor * dst) { + GGML_UNUSED(ctx); + GGML_UNUSED(dst); +} + +static void ggmlqnn_compute_rms_norm(ggml_backend_hexagon_context * ctx, ggml_tensor * dst) { + GGML_UNUSED(ctx); + GGML_UNUSED(dst); +} + +static void ggmlqnn_compute_diag_mask(ggml_backend_hexagon_context * ctx, ggml_tensor * dst, float value) { + GGML_UNUSED(ctx); + GGML_UNUSED(dst); + GGML_UNUSED(value); +} + +static void ggmlqnn_compute_im2col(ggml_backend_hexagon_context * ctx, ggml_tensor * dst) { + GGML_UNUSED(ctx); + GGML_UNUSED(dst); +} + +static void ggmlqnn_compute_timestep_embedding(ggml_backend_hexagon_context * ctx, ggml_tensor * dst) { + GGML_UNUSED(ctx); + GGML_UNUSED(dst); +} + +static void ggmlqnn_compute_cpy(ggml_backend_hexagon_context * ctx, ggml_tensor * dst) { + ggmlqnn_compute_dup(ctx, dst); +} + +static void ggmlqnn_compute_softmax(ggml_backend_hexagon_context * ctx, ggml_tensor * dst) { + GGML_UNUSED(ctx); + GGML_UNUSED(dst); +} + +static void ggmlqnn_compute_get_rows(ggml_backend_hexagon_context * ctx, ggml_tensor * dst) { + GGML_UNUSED(ctx); + GGML_UNUSED(dst); +} + +static void ggmlqnn_compute_rope(ggml_backend_hexagon_context * ctx, ggml_tensor * dst) { + GGML_UNUSED(ctx); + GGML_UNUSED(dst); +} + +// ================================================================================================= +// section-7: cDSP helper function +// ================================================================================================= +static const char * ggmlhexagon_get_dsp_name(int domain_id) { + switch (domain_id) { + case HEXAGON_ADSP: + return "Hexagon-aDSP"; + case HEXAGON_MDSP: + return "Hexagon-mDSP"; + case HEXAGON_SDSP: + return "Hexagon-sDSP"; + case HEXAGON_CDSP: + return "Hexagon-cDSP"; + case HEXAGON_CDSP1: + return "Hexagon-cDSP1"; + default: + return "Hexagon-unknown"; + } +} + +static int 
ggmlhexagon_pd_status_notifier_callback(void * context, int domain, int session, remote_rpc_status_flags_t status){
+    int error = AEE_SUCCESS;
+    switch (status){
+        case FASTRPC_USER_PD_UP:
+            GGMLHEXAGON_LOG_DEBUG("PD is up\n");
+            break;
+        case FASTRPC_USER_PD_EXIT:
+            GGMLHEXAGON_LOG_DEBUG("PD closed\n");
+            break;
+        case FASTRPC_USER_PD_FORCE_KILL:
+            GGMLHEXAGON_LOG_DEBUG("PD force kill\n");
+            break;
+        case FASTRPC_USER_PD_EXCEPTION:
+            GGMLHEXAGON_LOG_DEBUG("PD exception\n");
+            break;
+        case FASTRPC_DSP_SSR:
+            GGMLHEXAGON_LOG_DEBUG("DSP SSR\n");
+            break;
+        default:
+            error = AEE_EBADITEM;
+            break;
+    }
+    return error;
+}
+
+static domain * ggmlhexagon_get_domain(int domain_id) {
+    int size = sizeof(hexagon_supported_domains) / sizeof(domain);
+
+    for (int i = 0; i < size; i++) {
+        if (hexagon_supported_domains[i].id == domain_id)
+            return &hexagon_supported_domains[i];
+    }
+
+    return nullptr;
+}
+
+static bool ggmlhexagon_is_cdsp(int domain_id) {
+    return (domain_id == HEXAGON_CDSP) || (domain_id == HEXAGON_CDSP1);
+}
+
+static bool ggmlhexagon_is_valid_domain_id(int domain_id, int compute_only) {
+    int size = sizeof(hexagon_supported_domains) / sizeof(domain);
+
+    if (0 != compute_only) {
+        return ggmlhexagon_is_cdsp(domain_id);
+    }
+
+    for (int i = 0; i < size; i++) {
+        if (hexagon_supported_domains[i].id == domain_id)
+            return true;
+    }
+
+    return false;
+}
+
+static int ggmlhexagon_get_domains_info(const char * domain_type, int * num_domains, fastrpc_domain ** domains_info) {
+    int hexagon_err = AEE_SUCCESS;
+    int ss_info = 0;
+    void * buffer = nullptr;
+    ss_info = strcmp(domain_type, "NSP") ? HPASS : NSP;
+    system_req_payload req;
+    memset(&req, 0, sizeof(system_req_payload));
+    req.id = FASTRPC_GET_DOMAINS;
+    req.sys.domains = nullptr;
+    fastrpc_domain * domain = nullptr;
+
+    if (ss_info != 0) {
+        req.sys.flags = DOMAINS_LIST_FLAGS_SET_TYPE(req.sys.flags, ss_info);
+    } else {
+        req.sys.flags = 0;
+    }
+
+#ifdef _WIN32
+    hexagon_err = AEE_EUNSUPPORTED;
+    goto bail;
+#endif
+
+    hexagon_err = remote_system_request(&req);
+    if (hexagon_err != AEE_SUCCESS) {
+        GGMLHEXAGON_LOG_DEBUG("failure in remote_system_request call: %d", hexagon_err);
+        goto bail;
+    }
+    //allocate memory for the domain-info array
+    req.sys.max_domains = req.sys.num_domains;
+    buffer = calloc(req.sys.num_domains, sizeof(fastrpc_domain));
+    if (nullptr == buffer) {
+        hexagon_err = AEE_ENOMEMORY;
+        GGMLHEXAGON_LOG_DEBUG("unable to allocate memory for req.sys.domains");
+        goto bail;
+    }
+    req.sys.domains = static_cast<fastrpc_domain *>(buffer);
+    hexagon_err = remote_system_request(&req);
+    if (hexagon_err != AEE_SUCCESS) {
+        GGMLHEXAGON_LOG_DEBUG("failure in remote_system_request call: %d.\n", hexagon_err);
+        goto bail;
+    }
+
+    for (int i = 0; i < req.sys.num_domains; i++) {
+        //verify that only requested type domains were returned
+        domain = &req.sys.domains[i];
+        if (domain->type != ss_info) {
+            hexagon_err = -1;
+            GGMLHEXAGON_LOG_DEBUG("incorrect data received from remote_system_request.\n");
+            goto bail;
+        }
+    }
+    *domains_info = req.sys.domains;
+    *num_domains = req.sys.num_domains;
+
+bail:
+    if (hexagon_err && req.sys.domains) {
+        //free the domain-info array on error instead of leaking it
+        free(req.sys.domains);
+    }
+    return hexagon_err;
+}
+
+static int ggmlhexagon_get_dsp_support(int * domain) {
+    int hexagon_error = AEE_SUCCESS;
+    *domain = HEXAGON_CDSP;
+
+    if (remote_handle_control) {
+        struct remote_dsp_capability dsp_capability_domain = {HEXAGON_CDSP, DOMAIN_SUPPORT, 0};
+        hexagon_error = remote_handle_control(DSPRPC_GET_DSP_INFO, &dsp_capability_domain, sizeof(struct 
remote_dsp_capability)); + if ((hexagon_error & 0xFF) == (AEE_EUNSUPPORTEDAPI & 0xFF)) { + GGMLHEXAGON_LOG_DEBUG("FastRPC Capability API is not supported on this device"); + goto bail; + } + + if (0 == dsp_capability_domain.capability) { + dsp_capability_domain.domain = HEXAGON_ADSP; + dsp_capability_domain.attribute_ID = DOMAIN_SUPPORT; + dsp_capability_domain.capability = 0; + hexagon_error = remote_handle_control(DSPRPC_GET_DSP_INFO, &dsp_capability_domain, sizeof(struct remote_dsp_capability)); + if(dsp_capability_domain.capability) { + *domain = HEXAGON_ADSP; + } + } + + if (hexagon_error != AEE_SUCCESS) { + GGMLHEXAGON_LOG_DEBUG("get_dsp_support failed with error 0x%x", hexagon_error); + goto bail; + } + } else { + hexagon_error = AEE_EUNSUPPORTEDAPI; + GGMLHEXAGON_LOG_DEBUG("remote_dsp_capability interface is not supported on this device"); + } + +bail: + return hexagon_error; +} + +static int ggmlhexagon_get_vtcm_info(int domain, uint32_t attr, uint32_t * capability) { + int hexagon_error = AEE_SUCCESS; + *capability = 0; + + if (attr == VTCM_PAGE || attr == VTCM_COUNT) { + } else { + hexagon_error = AEE_EBADPARM; + GGMLHEXAGON_LOG_DEBUG("unsupported attr, only VTCM_PAGE and VTCM_COUNT supported"); + goto bail; + } + + if (remote_handle_control) { + if (domain == HEXAGON_ADSP || domain == HEXAGON_CDSP) { + /* + * query the DSP for VTCM information + * since the ADSP does not have a dedicated VTCM, we expect the output to be 0 + */ + struct remote_dsp_capability dsp_capability_vtcm_dsp; + dsp_capability_vtcm_dsp.domain = (uint32_t)domain; + dsp_capability_vtcm_dsp.attribute_ID = attr; + dsp_capability_vtcm_dsp.capability = (uint32_t)0; + hexagon_error = remote_handle_control(DSPRPC_GET_DSP_INFO, &dsp_capability_vtcm_dsp, sizeof(struct remote_dsp_capability)); + if ((hexagon_error & 0xFF) == (AEE_EUNSUPPORTEDAPI & 0xFF)) { + GGMLHEXAGON_LOG_DEBUG("FastRPC Capability API is not supported on this device"); + GGMLHEXAGON_LOG_DEBUG("running the use case without checking the capability"); + hexagon_error = AEE_SUCCESS; + goto bail; + } else if (hexagon_error == AEE_SUCCESS) { + *capability = dsp_capability_vtcm_dsp.capability; + } else { + GGMLHEXAGON_LOG_DEBUG("get_vtcm_info failed with error 0x%x", hexagon_error); + goto bail; + } + } else { + hexagon_error = AEE_EUNSUPPORTED; + GGMLHEXAGON_LOG_DEBUG("unsupported domain %d", domain); + goto bail; + } + } else { + hexagon_error = AEE_EUNSUPPORTEDAPI; + GGMLHEXAGON_LOG_DEBUG("remote_dsp_capability interface is not supported on this device"); + } + +bail: + return hexagon_error; +} + +static bool ggmlhexagon_is_unsignedpd_supported(int domain_id) { + int hexagon_error = AEE_SUCCESS; + if (remote_handle_control) { + struct remote_dsp_capability dsp_capability_domain = {static_cast(domain_id), UNSIGNED_PD_SUPPORT, 0}; + hexagon_error = remote_handle_control(DSPRPC_GET_DSP_INFO, &dsp_capability_domain, sizeof(struct remote_dsp_capability)); + if ((hexagon_error & 0xFF) == (AEE_EUNSUPPORTEDAPI & 0xFF)) { + GGMLHEXAGON_LOG_WARN("FastRPC Capability API is not supported on this device. Falling back to signed pd"); + return false; + } + + if (hexagon_error) { + GGMLHEXAGON_LOG_WARN("error 0x%x: FastRPC Capability API failed. 
falling back to signed pd", hexagon_error); + return false; + } + + if (dsp_capability_domain.capability == 1) { + return true; + } + } else { + hexagon_error = AEE_EUNSUPPORTEDAPI; + GGMLHEXAGON_LOG_WARN("remote_dsp_capability interface is not supported on this device.falling back to signed pd"); + return false; + } + + return false; +} + +static bool ggmlhexagon_get_unsignedpd_support(void) { + return ggmlhexagon_is_unsignedpd_supported(HEXAGON_CDSP); +} + +static bool ggmlhexagon_is_async_fastrpc_supported(int domain) { + int hexagon_error = AEE_SUCCESS; + if (remote_handle_control) { + if (domain == HEXAGON_CDSP) { + /* + * Query the DSP for ASYNC_FASTRPC_SUPPORT information + * Async fastrpc is supported only on CDSP + */ + struct remote_dsp_capability dsp_capability_async_support; + dsp_capability_async_support.domain = (uint32_t)domain; + dsp_capability_async_support.attribute_ID = ASYNC_FASTRPC_SUPPORT; + dsp_capability_async_support.capability = (uint32_t)0; + hexagon_error = remote_handle_control(DSPRPC_GET_DSP_INFO, &dsp_capability_async_support, sizeof(struct remote_dsp_capability)); + if ((hexagon_error & 0xFF) == (AEE_EUNSUPPORTEDAPI & 0xFF)) { + GGMLHEXAGON_LOG_WARN("FastRPC Capability API is not supported on this device"); + hexagon_error = AEE_SUCCESS; + goto bail; + } else if (dsp_capability_async_support.capability == 1) { + return true; + } + + if (hexagon_error != AEE_SUCCESS){ + GGMLHEXAGON_LOG_WARN("failed with error 0x%x", hexagon_error); + goto bail; + } + } else { + hexagon_error = AEE_EUNSUPPORTED; + GGMLHEXAGON_LOG_WARN("async FastRPC is not supported on domain %d", domain); + goto bail; + } + } else { + hexagon_error = AEE_EUNSUPPORTEDAPI; + GGMLHEXAGON_LOG_WARN("remote_dsp_capability interface is not supported on this device"); + } + +bail: + return false; +} + +static void ggmlhexagon_set_rpc_latency(remote_handle64 handle, int qos, int latency) { + int hexagon_error = AEE_SUCCESS; + + if (remote_handle_control) { + struct remote_rpc_control_latency data; +/* + qos | latency + ----------------------- + RPC_PM_QOS | 100 + RPC_POLL_QOS | 1000 +*/ + data.enable = qos; + data.latency = latency; + hexagon_error = remote_handle64_control(handle, DSPRPC_CONTROL_LATENCY, (void*)&data, sizeof(data)); + if (hexagon_error != AEE_SUCCESS) { + GGMLHEXAGON_LOG_WARN("failed with error 0x%x", hexagon_error); + goto bail; + } else { + GGMLHEXAGON_LOG_INFO("set rpc qos %d, latency %d\n", qos, latency); + } + } else { + hexagon_error = AEE_EUNSUPPORTEDAPI; + GGMLHEXAGON_LOG_WARN("remote_dsp_capability interface is not supported on this device"); + } + +bail: + return; +} + +static bool ggmlhexagon_is_status_notification_supported(int domain) { + int hexagon_error = AEE_SUCCESS; + + if (remote_handle_control) { + /* + * Query the DSP for STATUS_NOTIFICATION_SUPPORT information + * DSP User PD status notification Support + */ + struct remote_dsp_capability dsp_capability_status_notification_support; + dsp_capability_status_notification_support.domain = (uint32_t)domain; + dsp_capability_status_notification_support.attribute_ID = STATUS_NOTIFICATION_SUPPORT; + dsp_capability_status_notification_support.capability = (uint32_t)0; + hexagon_error = remote_handle_control(DSPRPC_GET_DSP_INFO, &dsp_capability_status_notification_support, sizeof(struct remote_dsp_capability)); + if ((hexagon_error & 0xFF) == (AEE_EUNSUPPORTEDAPI & 0xFF)) { + GGMLHEXAGON_LOG_WARN("FastRPC Capability API is not supported on this device"); + hexagon_error = AEE_SUCCESS; + goto bail; + } else if (1 == 
dsp_capability_status_notification_support.capability) { + return true; + } + + if (hexagon_error != AEE_SUCCESS){ + GGMLHEXAGON_LOG_WARN("failed with error 0x%x", hexagon_error); + goto bail; + } + } else { + hexagon_error = AEE_EUNSUPPORTEDAPI; + GGMLHEXAGON_LOG_WARN("remote_dsp_capability interface is not supported on this device"); + } + +bail: + return false; +} + +static int ggmlhexagon_get_hmx_support_info(int domain, uint32_t attr, uint32_t * capability) { + int hexagon_error = AEE_SUCCESS; + *capability = 0; + + if (attr != HMX_SUPPORT_SPATIAL && attr != HMX_SUPPORT_DEPTH) { + hexagon_error = AEE_EBADPARM; + GGMLHEXAGON_LOG_WARN("unsupported attr, only HMX_SUPPORT_SPATIAL and HMX_SUPPORT_DEPTH supported"); + goto bail; + } + + if (remote_handle_control) { + if (domain == HEXAGON_CDSP) { + /* + * Query the DSP for HMX SUPPORT information + * HMX is supported on CDSP only + */ + struct remote_dsp_capability dsp_capability_hmx_dsp; + dsp_capability_hmx_dsp.domain = (uint32_t)domain; + dsp_capability_hmx_dsp.attribute_ID = attr; + dsp_capability_hmx_dsp.capability = (uint32_t)0; + hexagon_error = remote_handle_control(DSPRPC_GET_DSP_INFO, &dsp_capability_hmx_dsp, sizeof(struct remote_dsp_capability)); + if ((hexagon_error & 0xFF) == (AEE_EUNSUPPORTEDAPI & 0xFF)) { + GGMLHEXAGON_LOG_DEBUG("FastRPC Capability API is not supported on this device"); + hexagon_error = AEE_SUCCESS; + goto bail; + } + else if (hexagon_error == AEE_SUCCESS) { + *capability = dsp_capability_hmx_dsp.capability; + } else { + GGMLHEXAGON_LOG_DEBUG("get_hmx_support_info failed with Error 0x%x", hexagon_error); + goto bail; + } + } else { + hexagon_error = AEE_EUNSUPPORTED; + GGMLHEXAGON_LOG_DEBUG("HMX support is not there for domain %d", domain); + goto bail; + } + } else { + hexagon_error = AEE_EUNSUPPORTEDAPI; + GGMLHEXAGON_LOG_DEBUG("remote_dsp_capability interface is not supported on this device"); + } + +bail: + return hexagon_error; +} + +static int ggmlhexagon_get_hvx_arch_ver(int domain, uint32_t * capability) { + int hexagon_error = AEE_SUCCESS; + *capability = 0; + if(remote_handle_control) { + /* + * Query the Hexagon processor architecture version information + */ + struct remote_dsp_capability dsp_capability_arch_ver; + dsp_capability_arch_ver.domain = (uint32_t)domain; + dsp_capability_arch_ver.attribute_ID = ARCH_VER; + dsp_capability_arch_ver.capability = (uint32_t)0; + hexagon_error = remote_handle_control(DSPRPC_GET_DSP_INFO, &dsp_capability_arch_ver, sizeof(struct remote_dsp_capability)); + if ((hexagon_error & 0xFF) == (AEE_EUNSUPPORTEDAPI & 0xFF)) { + GGMLHEXAGON_LOG_DEBUG("FastRPC Capability API is not supported on this device"); + hexagon_error = AEE_SUCCESS; + goto bail; + } else if (hexagon_error == AEE_SUCCESS) { + *capability = dsp_capability_arch_ver.capability & 0xFF; + } else { + GGMLHEXAGON_LOG_DEBUG("get_hex_arch_ver failed with error 0x%x", hexagon_error); + goto bail; + } + } else { + hexagon_error = AEE_EUNSUPPORTEDAPI; + GGMLHEXAGON_LOG_DEBUG("remote_dsp_capability interface is not supported on this device"); + } + +bail: + return hexagon_error; +} + +static int ggmlhexagon_get_hvx_support_info(int domain, uint32_t attr, uint32_t * capability) +{ + int hexagon_error = AEE_SUCCESS; + *capability = 0; + if (attr == HVX_SUPPORT_64B) { + hexagon_error = AEE_EBADPARM; + GGMLHEXAGON_LOG_DEBUG("latest targets have 128 byte HVX register, use HVX_SUPPORT_128B instead of HVX_SUPPORT_64B"); + goto bail; + } + + if (attr != HVX_SUPPORT_128B) { + hexagon_error = AEE_EBADPARM; + 
GGMLHEXAGON_LOG_DEBUG("unsupported attr. only HVX_SUPPORT_128B supported"); + goto bail; + } + + if (remote_handle_control) { + if (domain == HEXAGON_CDSP) { + /* + * Query the DSP for HVX SUPPORT information + * HVX is supported on CDSP only + */ + struct remote_dsp_capability dsp_capability_hvx_dsp; + dsp_capability_hvx_dsp.domain = (uint32_t)domain; + dsp_capability_hvx_dsp.attribute_ID = attr; + dsp_capability_hvx_dsp.capability = (uint32_t)0; + hexagon_error = remote_handle_control(DSPRPC_GET_DSP_INFO, &dsp_capability_hvx_dsp, sizeof(struct remote_dsp_capability)); + if ((hexagon_error & 0xFF)==(AEE_EUNSUPPORTEDAPI & 0xFF)) { + GGMLHEXAGON_LOG_DEBUG("FastRPC Capability API is not supported on this device"); + hexagon_error = AEE_SUCCESS; + goto bail; + } else if (hexagon_error == AEE_SUCCESS) { + *capability = dsp_capability_hvx_dsp.capability; + } else { + GGMLHEXAGON_LOG_DEBUG("failed with error 0x%x", hexagon_error); + goto bail; + } + } else { + hexagon_error = AEE_EUNSUPPORTED; + GGMLHEXAGON_LOG_DEBUG("HVX support is not available on domain %d", domain); + goto bail; + } + } else { + hexagon_error = AEE_EUNSUPPORTEDAPI; + GGMLHEXAGON_LOG_DEBUG("remote_dsp_capability interface is not supported on this device"); + } + +bail: + return hexagon_error; +} + +static int ggmlhexagon_request_status_notifications(int domain_id, void * context, notify_callback_fn call_back_fn) { + int hexagon_error = AEE_SUCCESS; + struct remote_rpc_notif_register notif; + bool status_notification_support; + + notif.context = context; + notif.domain = domain_id; + notif.notifier_fn = call_back_fn; + + status_notification_support = ggmlhexagon_is_status_notification_supported(domain_id); + if (status_notification_support) { + hexagon_error = remote_session_control(FASTRPC_REGISTER_STATUS_NOTIFICATIONS, (void*)¬if, sizeof(notif)); + if (hexagon_error != AEE_SUCCESS) { + GGMLHEXAGON_LOG_DEBUG("error 0x%x: remote_session_control failed to enable status notifications", hexagon_error); + } + } else { + hexagon_error = AEE_EUNSUPPORTEDAPI; + } + + return hexagon_error; +} + +static int ggmlhexagon_init_rpcmempool(ggml_backend_hexagon_context * ctx) { + size_t candidate_size = 0; + uint8_t * rpc_buffer = nullptr; + size_t probe_slots[] = {1024, 1536, 2000, 2048}; + size_t probe_counts = sizeof(probe_slots) / sizeof(size_t); + + if (nullptr == ctx) + return 1; + + for (size_t idx = 0; idx < probe_counts; idx++) { + rpc_buffer = static_cast(rpcmem_alloc(RPCMEM_HEAP_ID_SYSTEM, RPCMEM_DEFAULT_FLAGS, (probe_slots[idx] * SIZE_IN_MB))); + if (nullptr == rpc_buffer) { + GGMLHEXAGON_LOG_DEBUG("alloc rpcmem %d (MiB) failure during probe rpc memory info, reason: %s\n", probe_slots[idx], strerror(errno)); + break; + } else { + candidate_size = probe_slots[idx]; + rpcmem_free(rpc_buffer); + rpc_buffer = nullptr; + } + } + ctx->rpc_mempool_capacity = candidate_size * SIZE_IN_MB; + GGMLHEXAGON_LOG_DEBUG("rpc memory capacity %ld(%d MiB) for device %d", + ctx->rpc_mempool_capacity, ctx->rpc_mempool_capacity / SIZE_IN_MB, ctx->device); + GGMLHEXAGON_LOG_INFO("capacity of rpc memory %d MiB", ctx->rpc_mempool_capacity / SIZE_IN_MB); + + if ((g_hexagon_appcfg.hwaccel_approach == HWACCEL_CDSP) && (1 == g_hexagon_appcfg.enable_rpc_ion_mempool)) { + GGML_ASSERT(ctx->rpc_mempool_capacity > (8 * SIZE_IN_MB)); + ctx->rpc_mempool_len = ctx->rpc_mempool_capacity - (8 * SIZE_IN_MB); + + //FIXME: it seems there is unknown issue with 2+ GiB memory pool + ctx->rpc_mempool = rpcmem_alloc(RPCMEM_HEAP_ID_SYSTEM, RPCMEM_DEFAULT_FLAGS | 
RPCMEM_TRY_MAP_STATIC, ctx->rpc_mempool_len); + if (nullptr == ctx->rpc_mempool) { + GGMLHEXAGON_LOG_WARN("alloc rpc memorypool %d failed", ctx->rpc_mempool_len); + return 2; + } else { + GGMLHEXAGON_LOG_DEBUG("alloc rpc memorypool %p successfully %ld(%d MiB)", + ctx->rpc_mempool, ctx->rpc_mempool_len, + ctx->rpc_mempool_len / SIZE_IN_MB); + } + ctx->rpc_mempool_handle = rpcmem_to_fd(ctx->rpc_mempool); + GGMLHEXAGON_LOG_DEBUG("rpc mempool handle %d", ctx->rpc_mempool_handle); + remote_register_buf(ctx->rpc_mempool, ctx->rpc_mempool_len, ctx->rpc_mempool_handle); + } + + return 0; +} + +static void ggmlhexagon_deinit_rpcmempool(ggml_backend_hexagon_context * ctx) { + if ((g_hexagon_appcfg.hwaccel_approach == HWACCEL_CDSP) && (1 == g_hexagon_appcfg.enable_rpc_ion_mempool)) { + if (ctx->rpc_mempool) { + //deregister rpc memory pool + remote_register_buf(ctx->rpc_mempool, ctx->rpc_mempool_len, -1); + GGMLHEXAGON_LOG_DEBUG("free rpc mempool %p", ctx->rpc_mempool); + rpcmem_free(ctx->rpc_mempool); + ctx->rpc_mempool = nullptr; + ctx->rpc_mempool_len = 0; + ctx->rpc_mempool_capacity = 0; + } + } +} + +static void ggmlhexagon_probe_dspinfo(ggml_backend_hexagon_context * ctx) { + uint32_t dsp_version = 0; + ggmlhexagon_get_hvx_arch_ver(ctx->domain_id, &dsp_version); + + if (dsp_version == 0x68 || dsp_version == 0x69 || dsp_version == 0x73 || dsp_version == 0x75 || dsp_version == 0x79) { + GGMLHEXAGON_LOG_INFO("dsp arch version 0x%x", dsp_version); + //0x68 -> 68, 0x69 -> 69, 0x73 -> 73, 0x75 -> 75, 0x79 -> 79 + size_t htp_arch = ggmlhexagon_htparch_hex_to_decimal(dsp_version); + GGMLHEXAGON_LOG_DEBUG("dsp arch version %d", htp_arch); + struct qcom_socinfo * socinfo = ggmlhexagon_get_socinfo_from_socmodel(htp_arch); + if (nullptr != socinfo) { + //got fully description of SoC when hwaccel approach is HWACCEL_CDSP + GGMLHEXAGON_LOG_INFO("device info: %s, %s", socinfo->soc_desc, ggmlhexagon_get_htparch_desc(htp_arch)); + } + } else { + GGMLHEXAGON_LOG_WARN("error: dsp arch version 0x%x is not supported", dsp_version); + } + + uint32_t vtcm_count = 0; + uint32_t vtcm_page = 0; + ggmlhexagon_get_vtcm_info(ctx->domain_id, VTCM_COUNT, &vtcm_count); + ggmlhexagon_get_vtcm_info(ctx->domain_id, VTCM_PAGE, &vtcm_page); + GGMLHEXAGON_LOG_INFO("vtcm_count %d", vtcm_count); + GGMLHEXAGON_LOG_INFO("vtcm_page %d", vtcm_page); + + uint32_t hmx_depth = 0; + uint32_t hmx_spatial = 0; + ggmlhexagon_get_hmx_support_info(ctx->domain_id, HMX_SUPPORT_DEPTH, &hmx_depth); + ggmlhexagon_get_hmx_support_info(ctx->domain_id, HMX_SUPPORT_SPATIAL, &hmx_spatial); + GGMLHEXAGON_LOG_INFO("hmx_depth %d", hmx_depth); + GGMLHEXAGON_LOG_INFO("hmx_spatial %d", hmx_spatial); + + uint32_t hvx_support_128b = 0; + ggmlhexagon_get_hvx_support_info(ctx->domain_id, HVX_SUPPORT_128B, &hvx_support_128b); + GGMLHEXAGON_LOG_INFO("hvx_support_128b %d", hvx_support_128b); + + GGMLHEXAGON_LOG_INFO("unsigned pd supported %d", ggmlhexagon_get_unsignedpd_support()); + GGMLHEXAGON_LOG_INFO("async fastrpc supported %d", ggmlhexagon_is_async_fastrpc_supported(ctx->domain_id)); +} + +static void ggmlhexagon_deinit_cdsp(ggml_backend_hexagon_context * ctx) { + int hexagon_error = AEE_SUCCESS; + GGMLHEXAGON_LOG_INFO("enter %s", __func__); + if (0 != ctx->ggmlop_handle) { + hexagon_error = ggmlop_dsp_close(ctx->ggmlop_handle); + if (AEE_SUCCESS != hexagon_error) { + GGMLHEXAGON_LOG_WARN("error 0x%x: failed to close ggmlop dsp handle", hexagon_error); + } else { + ctx->ggmlop_handle = 0; + } + } + + ggmlhexagon_deinit_rpcmempool(ctx); + + ctx->domain_id = -1; + 
GGMLHEXAGON_LOG_INFO("leave %s", __func__); +} + +static int ggmlhexagon_init_dsp(ggml_backend_hexagon_context * ctx) { + int hexagon_error = AEE_SUCCESS; + + int domain_id = HEXAGON_CDSP; + const char * domain_type = "NSP"; + + int unsignedpd_flag = 1; + bool is_unsignedpd_enabled = false; + int use_logical_id = 0; + int core_id = -1; + fastrpc_domain * domains_info = NULL; + int num_domains = -1; + + domain * my_domain = NULL; + char * uri = NULL; + + char * ggmlop_domain_uri = NULL; + int ggmlop_domain_uri_len = 0; + + if (nullptr == ctx) + return 1; + GGMLHEXAGON_LOG_INFO("init Hexagon cDSP with backend %d(%s)", ctx->device, ggml_backend_hexagon_get_devname(ctx->device)); + if (0 != ctx->ggmlop_handle) { + GGMLHEXAGON_LOG_DEBUG("already init Hexagon cDSP with backend %d(%s)", ctx->device, ggml_backend_hexagon_get_devname(ctx->device)); + return 0; + } + ctx->ggmlop_handle = 0; + + if (-1 == domain_id) { + if (nullptr != domain_type) { + if ((strcmp(domain_type, "NSP") != 0 && strcmp(domain_type, "HPASS") != 0)) { + GGMLHEXAGON_LOG_WARN("invalid domain_type %s. possible values are NSP or HPASS", domain_type); + goto bail; + } else { + hexagon_error = ggmlhexagon_get_domains_info(domain_type, &num_domains, &domains_info); + if (hexagon_error == AEE_EUNSUPPORTED) { + GGMLHEXAGON_LOG_DEBUG("API is not supported on this target so cannot get domains info from the device. falling back to legacy approach of using default domain id"); + hexagon_error = ggmlhexagon_get_dsp_support(&domain_id); + if (hexagon_error != AEE_SUCCESS) { + GGMLHEXAGON_LOG_DEBUG("error: 0x%x, defaulting to cDSP domain", hexagon_error); + } + } else if (hexagon_error != AEE_SUCCESS) { + GGMLHEXAGON_LOG_DEBUG("error in getting domains information"); + goto bail; + } else { + if (core_id != -1) { + if (core_id < 0 || core_id >= num_domains) { + GGMLHEXAGON_LOG_DEBUG("invalid core_id = %d for %s. 
core_id should be between 0 to %d", core_id, domain_type, num_domains - 1); + hexagon_error = AEE_EBADPARM; + goto bail; + } + } else { + core_id = 0; + } + use_logical_id = 1; + domain_id = domains_info[core_id].id; + } + } + } else { + GGMLHEXAGON_LOG_DEBUG("DSP domain is not provided, retrieving DSP information using Remote APIs"); + hexagon_error = ggmlhexagon_get_dsp_support(&domain_id); + if (hexagon_error != AEE_SUCCESS) { + GGMLHEXAGON_LOG_DEBUG("error: 0x%x, defaulting to cDSP domain", hexagon_error); + } + } + } + + if (0 == use_logical_id) { + if (!ggmlhexagon_is_valid_domain_id(domain_id, 0)) { + hexagon_error = AEE_EBADPARM; + GGMLHEXAGON_LOG_DEBUG("error 0x%x: invalid domain %d", hexagon_error, domain_id); + goto bail; + } + + my_domain = ggmlhexagon_get_domain(domain_id); + if (nullptr == my_domain) { + GGMLHEXAGON_LOG_DEBUG("unable to get domain struct %d", domain_id); + goto bail; + } + uri = my_domain->uri; + } + GGMLHEXAGON_LOG_DEBUG("temporary domain uri=%s\n", uri); + + if (1 == unsignedpd_flag) { + is_unsignedpd_enabled = ggmlhexagon_is_unsignedpd_supported(domain_id); + if (!is_unsignedpd_enabled) { + GGMLHEXAGON_LOG_DEBUG("overriding user request for unsigned PD, only signed offload is allowed on domain %d", domain_id); + unsignedpd_flag = 0; + } + } + + ctx->domain_id = domain_id; + GGMLHEXAGON_LOG_INFO("using Hexagon domain %d(%s)", domain_id, ggmlhexagon_get_dsp_name(domain_id)); + GGMLHEXAGON_LOG_INFO("unsignedpd_enabled %d", is_unsignedpd_enabled); + if (is_unsignedpd_enabled) { + if (remote_session_control) { + struct remote_rpc_control_unsigned_module data; + data.enable = 1; + data.domain = domain_id; + hexagon_error = remote_session_control(DSPRPC_CONTROL_UNSIGNED_MODULE, (void *)&data, sizeof(data)); + GGMLHEXAGON_LOG_DEBUG("remote_session_control returned %d for configuring unsigned PD success", hexagon_error); + if (AEE_SUCCESS != hexagon_error) { + GGMLHEXAGON_LOG_DEBUG("error 0x%x: remote_session_control failed", hexagon_error); + } + } else { + GGMLHEXAGON_LOG_DEBUG("unsigned PD not supported on this device"); + hexagon_error = AEE_EUNSUPPORTED; + GGMLHEXAGON_LOG_DEBUG("error 0x%x: remote_session_control interface is not supported on this device", hexagon_error); + } + } + + hexagon_error = ggmlhexagon_request_status_notifications(domain_id, (void *)STATUS_CONTEXT, ggmlhexagon_pd_status_notifier_callback); + if (AEE_SUCCESS != hexagon_error) { + if (AEE_EUNSUPPORTEDAPI != hexagon_error) { + GGMLHEXAGON_LOG_WARN("error 0x%x: hexagon_request_status_notifications failed", hexagon_error); + } + GGMLHEXAGON_LOG_WARN("error 0x%x: failed to compute on domain %d", hexagon_error, domain_id); + goto bail; + } + + ggmlop_domain_uri_len = strlen(ggmlop_URI) + MAX_DOMAIN_NAMELEN; + ggmlop_domain_uri = (char *)malloc(ggmlop_domain_uri_len); + snprintf(ggmlop_domain_uri, ggmlop_domain_uri_len, "%s%s", ggmlop_URI, uri); + GGMLHEXAGON_LOG_DEBUG("ggmlop domain uri:%s", ggmlop_domain_uri); + hexagon_error = ggmlop_dsp_open(ggmlop_domain_uri, &ctx->ggmlop_handle); + if (AEE_SUCCESS == hexagon_error) { + GGMLHEXAGON_LOG_INFO("succeed to open domain %d(%s)", domain_id, ggmlhexagon_get_dsp_name(domain_id)); + //FIXME: only support offload fp32 GGML_OP_MUL_MAT to cDSP + GGMLHEXAGON_LOG_INFO("only support offload fp32 GGML_OP_ADD and fp32 GGML_OP_MUL_MAT to cDSP currently"); + ggmlhexagon_probe_dspinfo(ctx); + //FIXME: re-use this function to pass thread_counts info to code on cDSP side before fully understand qidl mechanism + ggmlop_dsp_setclocks(ctx->ggmlop_handle, 
HAP_DCVS_VCORNER_TURBO_PLUS, 40, 1, g_hexagon_appcfg.thread_counts);
+        ggmlhexagon_set_rpc_latency(ctx->ggmlop_handle, RPC_POLL_QOS, 100);
+        int result = ggmlhexagon_init_rpcmempool(ctx);
+        if (0 != result) {
+            GGMLHEXAGON_LOG_WARN("failed to init rpc mempool");
+            goto bail;
+        }
+    } else {
+        GGMLHEXAGON_LOG_WARN("error 0x%x: failed to open domain %d(%s)", hexagon_error, domain_id,
+                             ggmlhexagon_get_dsp_name(domain_id));
+        goto bail;
+    }
+
+    //make sure test-backend-ops gets the correct backend name when hwaccel approach is 2(HWACCEL_CDSP)
+    //copy strlen + 1 bytes so the name is always NUL-terminated
+    memcpy(g_hexagon_mgr[ctx->device].name, "Hexagon-cDSP", strlen("Hexagon-cDSP") + 1);
+
+    //the domain uri is no longer needed once the handle is open
+    free(ggmlop_domain_uri);
+    return 0;
+
+bail:
+    if (ggmlop_domain_uri) {
+        free(ggmlop_domain_uri);
+    }
+
+    ggmlhexagon_deinit_cdsp(ctx);
+
+    return -1;
+}
+
+static void ggmlhexagon_compute(ggml_backend_hexagon_context * ctx, struct ggml_tensor * op) {
+    //skip the sanity check here: the op has already been validated by the caller
+    struct dsptensor dsptensor_0;
+    struct dsptensor dsptensor_1;
+    struct dsptensor dsptensor_2;
+    std::string op_name;
+    const char * ggml_opname = ggml_op_name(op->op);
+    ggmlhexagon_get_opkey_from_op(op, op_name);
+
+    int hexagon_error = AEE_SUCCESS;
+    ggmlhexagon_op_func_t op_func = nullptr;
+    size_t input_tensor_count = 2;
+
+    ggml_tensor * src0 = op->src[0];
+    ggml_tensor * src1 = op->src[1];
+    ggml_tensor * dst  = op;
+
+    //ggml_nbytes() returns size_t, avoid truncating it to int
+    size_t input_size = ggml_nbytes(src0);
+    if (nullptr != src1)
+        input_size += ggml_nbytes(src1);
+    hexagon_perf op_perf(op_name, ggml_opname, input_size, ggml_nbytes(dst));
+    op_perf.start();
+
+    input_tensor_count = ggmlhexagon_k_op_caps[ggmlhexagon_get_op_index(op)].input_param_count;
+    op_func = ggmlhexagon_k_op_caps[ggmlhexagon_get_op_index(op)].dsp_op_func;
+    if (nullptr == op_func) {
+        GGMLHEXAGON_LOG_DEBUG("op GGML_OP_%s and dsp func %s not supported on cDSP", ggml_op_name(op->op), ggmlhexagon_k_op_caps[ggmlhexagon_get_op_index(op)].hexagon_op_name);
+        return;
+    }
+
+    //FIXME: fully understand the technical details of qidl:
+    //       qidl is a binary tool that generates the (complicated and hard-to-customize) bridge-layer code
+    //       between the ARM AP and the cDSP; the mechanism in qidl/FastRPC is very similar to the one in a TEE.
+    //       find a better/more efficient approach to exchange the necessary data between the ARM-AP side and
+    //       the cDSP side: manually modifying the core data structure ggml_tensor in ggml.h does not make
+    //       sense and is not acceptable.
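+    // Reviewer note -- a sketch of the (assumed) flat descriptor that the qidl-generated stubs marshal
+    // across FastRPC; the field names/widths below are reconstructed from the assignments that follow,
+    // not taken from the actual ggmlop IDL:
+    //
+    //     struct dsptensor {                                            // hypothetical reconstruction
+    //         void *  data;                                             // rpcmem/ion-backed payload
+    //         long    data_len;                                         // ggml_nbytes(tensor), assumed width
+    //         int     type;                                             // ggml_type enum value
+    //         long    ne[4];                                            // elements per dimension
+    //         long    nb[4];                                            // strides in bytes
+    //         int32_t op_params[GGML_MAX_OP_PARAMS / sizeof(int32_t)];  // per-op parameters
+    //     };
+    //
+    // everything is passed by value except data, so the cDSP kernels never dereference host ggml_tensor state.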
+    dsptensor_0.data     = src0->data;
+    dsptensor_0.data_len = ggml_nbytes(src0);
+    dsptensor_0.type     = src0->type;
+
+    dsptensor_0.ne[0] = src0->ne[0];
+    dsptensor_0.ne[1] = src0->ne[1];
+    dsptensor_0.ne[2] = src0->ne[2];
+    dsptensor_0.ne[3] = src0->ne[3];
+
+    dsptensor_0.nb[0] = src0->nb[0];
+    dsptensor_0.nb[1] = src0->nb[1];
+    dsptensor_0.nb[2] = src0->nb[2];
+    dsptensor_0.nb[3] = src0->nb[3];
+
+    if (2 == input_tensor_count) {
+        GGML_ASSERT(nullptr != src1);
+        dsptensor_1.data     = src1->data;
+        dsptensor_1.type     = src1->type;
+        dsptensor_1.data_len = ggml_nbytes(src1);
+
+        dsptensor_1.ne[0] = src1->ne[0];
+        dsptensor_1.ne[1] = src1->ne[1];
+        dsptensor_1.ne[2] = src1->ne[2];
+        dsptensor_1.ne[3] = src1->ne[3];
+
+        dsptensor_1.nb[0] = src1->nb[0];
+        dsptensor_1.nb[1] = src1->nb[1];
+        dsptensor_1.nb[2] = src1->nb[2];
+        dsptensor_1.nb[3] = src1->nb[3];
+    }
+
+    dsptensor_2.data     = dst->data;
+    dsptensor_2.data_len = ggml_nbytes(dst);
+    dsptensor_2.type     = dst->type;
+
+    dsptensor_2.ne[0] = dst->ne[0];
+    dsptensor_2.ne[1] = dst->ne[1];
+    dsptensor_2.ne[2] = dst->ne[2];
+    dsptensor_2.ne[3] = dst->ne[3];
+
+    dsptensor_2.nb[0] = dst->nb[0];
+    dsptensor_2.nb[1] = dst->nb[1];
+    dsptensor_2.nb[2] = dst->nb[2];
+    dsptensor_2.nb[3] = dst->nb[3];
+
+    //GGML_MAX_OP_PARAMS is already a byte count; dividing it by sizeof(int32_t) copied only a quarter of the op params
+    memcpy(dsptensor_2.op_params, dst->op_params, GGML_MAX_OP_PARAMS);
+
+    hexagon_error = op_func(ctx->ggmlop_handle, &dsptensor_0, &dsptensor_1, &dsptensor_2);
+    if (AEE_SUCCESS != hexagon_error) {
+        GGMLHEXAGON_LOG_WARN("ggmlop %s computation failed on cDSP", ggml_op_name(op->op));
+    }
+
+    op_perf.info();
+    return;
+}
+
+// =================================================================================================
+//  section-8: implementation of the ggml-hexagon backend according to the specification of the ggml backend subsystem
+// =================================================================================================
+static bool ggmlhexagon_can_handle_op_through_cdsp(ggml_backend_dev_t dev, const struct ggml_tensor * op_tensor) {
+    ggml_backend_hexagon_context * ctx = (ggml_backend_hexagon_context *)dev->context;
+    GGML_UNUSED(ctx);
+    if (op_tensor->op == GGML_OP_NONE) {
+        return true;
+    }
+
+    if (!ggmlhexagon_k_op_caps[ggmlhexagon_get_op_index(op_tensor)].supported) {
+        return false;
+    }
+
+    const ggml_tensor * src0 = op_tensor->src[0];
+    const ggml_tensor * src1 = op_tensor->src[1];
+    const int src0_rank = ggml_n_dims(src0);
+    int src1_rank = 0;
+    if (nullptr != src1) {
+        src1_rank = ggml_n_dims(src1);
+    }
+    switch (op_tensor->op) {
+        case GGML_OP_ADD:
+        {
+            if (!ggml_are_same_shape(src0, src1)) {
+                return false;
+            }
+            return (src0->type == GGML_TYPE_F32) && (src1->type == GGML_TYPE_F32) && (op_tensor->type == GGML_TYPE_F32);
+        }
+        case GGML_OP_MUL_MAT:
+        {
+            ggmlhexagon_dump_op_info(op_tensor);
+            //FIXME: keep the same filter logic as the QNN solution so that NPU performance can be compared
+            //       between the cDSP approach and the QNN-NPU approach; remove these filters in the future
+            if (src0_rank != src1_rank)
+                return false;
+            if (src0_rank != 2)
+                return false;
+
+            if (1 == g_hexagon_appcfg.enable_q_mulmat) {
+                if (1 == g_hexagon_appcfg.enable_all_q_mulmat) {
+                    return (src0->type == GGML_TYPE_F32 || ggml_is_quantized(src0->type)) && (src1->type == GGML_TYPE_F32);
+                }
+
+                return (src0->type == GGML_TYPE_F32
+                        || src0->type == GGML_TYPE_Q4_0 || src0->type == GGML_TYPE_Q8_0
+                        || src0->type == GGML_TYPE_Q6_K || src0->type == GGML_TYPE_Q8_K
+                       ) && (src1->type == GGML_TYPE_F32) && (op_tensor->type == GGML_TYPE_F32);
+            } else {
+                return (src0->type == GGML_TYPE_F32) &&
(src1->type == GGML_TYPE_F32) && + (op_tensor->type == GGML_TYPE_F32); + } + } + case GGML_OP_SOFT_MAX:{ + if (!ggml_is_contiguous(op_tensor)) + return false; + if (!ggml_are_same_shape(src0, op_tensor)) + return false; + } + case GGML_OP_RMS_NORM: + case GGML_OP_POOL_2D: + { + + ggmlhexagon_dump_op_info(op_tensor); + } + default: + break; + } + return false; +} + +static bool ggmlhexagon_can_handle_op_through_qnn(ggml_backend_dev_t dev, const struct ggml_tensor * op_tensor) { + ggml_backend_hexagon_context * ctx = (ggml_backend_hexagon_context *)dev->context; + if (op_tensor->op == GGML_OP_NONE) { + return true; + } + + if (!ggmlqnn_k_op_caps[ggmlhexagon_get_op_index(op_tensor)].supported) { + return false; + } + + struct ggml_tensor * src0 = op_tensor->src[0]; + struct ggml_tensor * src1 = op_tensor->src[1]; + const int64_t ne00 = src0->ne[0];; + const int src0_rank = ggml_n_dims(src0); + int src1_rank = 0; + if (nullptr != src1) { + src1_rank = ggml_n_dims(src1); + } + + switch (op_tensor->op) { + case GGML_OP_ADD: + case GGML_OP_SUB: + { + if (!ggml_are_same_shape(src0, src1)) { + return false; + } + + if (ne00 < 32) + return false; + + return ggmlhexagon_same_types(ctx, op_tensor); + } + + case GGML_OP_DIV: + case GGML_OP_MUL: { + if (ctx->device == HEXAGON_BACKEND_QNNNPU) + return false; + + if (!ggml_are_same_shape(src0, src1)) { + return false; + } + + if ((src0_rank != 2) || (src1_rank != 2)) //TODO: 3D and 4D matrix mul + return false; + + return ggmlhexagon_same_types(ctx, op_tensor); + } + case GGML_OP_MUL_MAT: + { + ggmlhexagon_dump_op_info(op_tensor); + if (src0_rank != src1_rank) // make QNN SDK happy + return false; + + if (src0_rank != 2) { + // FIXME: there are some limitations for mulmat in QNN SDK: rank >= 2. + // keep same filter logic with QNN solution to compare NPU performance between + // cDSP approach and QNN-NPU approach, remove these filters in the future + return false; + } + + if (ctx->device == HEXAGON_BACKEND_QNNNPU) { + if (1 == g_hexagon_appcfg.enable_q_mulmat) + return (src0->type == GGML_TYPE_F32 + || src0->type == GGML_TYPE_Q4_0 || src0->type == GGML_TYPE_Q8_0 + || src0->type == GGML_TYPE_Q6_K || src0->type == GGML_TYPE_Q8_K + ) && (src1->type == GGML_TYPE_F32) && (op_tensor->type == GGML_TYPE_F32); + else + return (src0->type == GGML_TYPE_F32 && src1->type == GGML_TYPE_F32 && op_tensor->type == GGML_TYPE_F32); + } else { + return (src0->type == GGML_TYPE_F32 || ggml_is_quantized(src0->type)) + && (src1->type == GGML_TYPE_F32) && (op_tensor->type == GGML_TYPE_F32); + } + } + case GGML_OP_LOG: + { + if (ctx->device == HEXAGON_BACKEND_QNNNPU) + return false; + } + case GGML_OP_SQRT: + default: + return ggmlhexagon_same_types(ctx, op_tensor); + } +} + +static bool ggmlhexagon_compute_forward(ggml_backend_t backend, struct ggml_tensor * dst) { + ggmlqnn_op_func_t func = nullptr; + ggml_backend_hexagon_context * ctx = (ggml_backend_hexagon_context *)backend->context; + + if (HWACCEL_CDSP == g_hexagon_appcfg.hwaccel_approach) { + ggmlhexagon_compute(ctx, dst); + return true; + } + + switch (dst->op) { + case GGML_OP_REPEAT: + ggmlqnn_compute_repeat(ctx, dst); + break; + case GGML_OP_GET_ROWS: + ggmlqnn_compute_get_rows(ctx, dst); + break; + case GGML_OP_DUP: + ggmlqnn_compute_dup(ctx, dst); + break; + case GGML_OP_ADD: + case GGML_OP_SUB: + case GGML_OP_MUL: + case GGML_OP_DIV: + case GGML_OP_SQRT: + case GGML_OP_LOG: + func = ggmlqnn_compute_elementwise; + break; + case GGML_OP_ACC: + ggmlqnn_compute_acc(ctx, dst); + break; + case GGML_OP_UNARY: + switch 
(ggml_get_unary_op(dst)) { + case GGML_UNARY_OP_GELU: + break; + case GGML_UNARY_OP_SILU: + break; + case GGML_UNARY_OP_GELU_QUICK: + break; + case GGML_UNARY_OP_TANH: + break; + case GGML_UNARY_OP_RELU: + break; + case GGML_UNARY_OP_HARDSIGMOID: + break; + case GGML_UNARY_OP_HARDSWISH: + break; + default: + return false; + } + break; + case GGML_OP_NORM: + ggmlqnn_compute_norm(ctx, dst); + break; + case GGML_OP_GROUP_NORM: + ggmlqnn_compute_group_norm(ctx, dst); + break; + case GGML_OP_CONCAT: + ggmlqnn_compute_concat(ctx, dst); + break; + case GGML_OP_UPSCALE: + ggmlqnn_compute_upsample_nearest2d(ctx, dst); + break; + case GGML_OP_PAD: + ggmlqnn_compute_pad(ctx, dst); + break; + case GGML_OP_ARANGE: + ggmlqnn_compute_arange(ctx, dst); + break; + case GGML_OP_TIMESTEP_EMBEDDING: + ggmlqnn_compute_timestep_embedding(ctx, dst); + break; + case GGML_OP_LEAKY_RELU: + ggmlqnn_compute_leaky_relu(ctx, dst); + break; + case GGML_OP_RMS_NORM: + ggmlqnn_compute_rms_norm(ctx, dst); + break; + case GGML_OP_MUL_MAT: + ggmlqnn_compute_mul_mat(ctx, dst); + break; + case GGML_OP_MUL_MAT_ID: + return false; + case GGML_OP_SCALE: + ggmlqnn_compute_scale(ctx, dst); + break; + case GGML_OP_SQR: + ggmlqnn_compute_sqr(ctx, dst); + break; + case GGML_OP_CLAMP: + ggmlqnn_compute_clamp(ctx, dst); + break; + case GGML_OP_CPY: + ggmlqnn_compute_cpy(ctx, dst); + break; + case GGML_OP_CONT: + ggmlqnn_compute_dup(ctx, dst); + break; + case GGML_OP_NONE: + case GGML_OP_RESHAPE: + case GGML_OP_VIEW: + case GGML_OP_PERMUTE: + case GGML_OP_TRANSPOSE: + break; + case GGML_OP_DIAG_MASK_INF: + ggmlqnn_compute_diag_mask(ctx, dst, -INFINITY); + break; + case GGML_OP_SOFT_MAX: + ggmlqnn_compute_softmax(ctx, dst); + break; + case GGML_OP_ROPE: + ggmlqnn_compute_rope(ctx, dst); + break; + case GGML_OP_IM2COL: + ggmlqnn_compute_im2col(ctx, dst); + break; + case GGML_OP_POOL_2D: + ggmlqnn_compute_pool2d(ctx, dst); + break; + case GGML_OP_SUM_ROWS: + ggmlqnn_compute_sum_rows(ctx, dst); + break; + case GGML_OP_ARGSORT: + ggmlqnn_compute_argsort(ctx, dst); + break; + default: + return false; + } + + if (nullptr != func) + func(ctx, dst); + + return true; +} + +struct ggml_backend_hexagon_buffer_context { + ~ggml_backend_hexagon_buffer_context() { + if (buffer) { + if ((g_hexagon_appcfg.hwaccel_approach == HWACCEL_CDSP) && (1 == g_hexagon_appcfg.enable_rpc_ion_mempool)) { + //do nothing here because rpc mempool was used for HWACCEL_CDSP + } else { + ggml_aligned_free(buffer, 0); + } + } + + for (auto * sub_buffer : sub_buffers) { + free(sub_buffer); + } + + sub_buffers.clear(); + } + void * buffer = nullptr; + + struct ggml_backend_hexagon_context * backend_ctx = nullptr; + + size_t buffer_size = 0; + std::vector sub_buffers; +}; + +static void ggml_backend_hexagon_buffer_free_buffer(ggml_backend_buffer_t buffer) { + ggml_backend_hexagon_buffer_context * ctx = (ggml_backend_hexagon_buffer_context *)buffer->context; + delete ctx; +} + +static void * ggml_backend_hexagon_buffer_get_base(ggml_backend_buffer_t buffer) { + ggml_backend_hexagon_buffer_context * ctx = (ggml_backend_hexagon_buffer_context *)buffer->context; + return ctx->buffer; +} + +static enum ggml_status ggml_backend_hexagon_buffer_init_tensor(ggml_backend_buffer_t buffer, ggml_tensor * tensor) { + ggml_backend_hexagon_buffer_context * ctx = (ggml_backend_hexagon_buffer_context *)buffer->context; + GGML_UNUSED(tensor); + GGML_UNUSED(ctx); + return GGML_STATUS_SUCCESS; +} + +static void ggml_backend_hexagon_buffer_set_tensor(ggml_backend_buffer_t buffer, + ggml_tensor * 
tensor, const void * data, + size_t offset, size_t size) { + GGML_UNUSED(buffer); + + memcpy((char *)tensor->data + offset, data, size); +} + +static void ggml_backend_hexagon_buffer_memset_tensor(ggml_backend_buffer_t buffer, + struct ggml_tensor * tensor, + uint8_t value, size_t offset, size_t size) { + GGML_UNUSED(buffer); + memset((char *)tensor->data + offset, value, size); +} + +static void ggml_backend_hexagon_buffer_get_tensor(ggml_backend_buffer_t buffer, + const ggml_tensor * tensor, + void * data, size_t offset, size_t size) { + GGML_UNUSED(buffer); + memcpy(data, (const char *)tensor->data + offset, size); +} + +static bool ggml_backend_hexagon_buffer_cpy_tensor(ggml_backend_buffer_t buffer, + const struct ggml_tensor * src, + struct ggml_tensor * dst) { + GGML_UNUSED(buffer); + if (ggml_backend_buffer_is_host(src->buffer)) { + memcpy(dst->data, src->data, ggml_nbytes(src)); + return true; + } + + return false; +} + +static void ggml_backend_hexagon_buffer_clear(ggml_backend_buffer_t buffer, uint8_t value) { + ggml_backend_hexagon_buffer_context * ctx = (ggml_backend_hexagon_buffer_context *)buffer->context; + memset(ctx->buffer, value, ctx->buffer_size); +} + +static ggml_backend_buffer_i ggml_backend_hexagon_buffer_interface = { + /* .free_buffer = */ ggml_backend_hexagon_buffer_free_buffer, + /* .get_base = */ ggml_backend_hexagon_buffer_get_base, + /* .init_tensor = */ ggml_backend_hexagon_buffer_init_tensor, + /* .memset_tensor = */ ggml_backend_hexagon_buffer_memset_tensor, + /* .set_tensor = */ ggml_backend_hexagon_buffer_set_tensor, + /* .get_tensor = */ ggml_backend_hexagon_buffer_get_tensor, + /* .cpy_tensor = */ ggml_backend_hexagon_buffer_cpy_tensor, + /* .clear = */ ggml_backend_hexagon_buffer_clear, + /* .reset = */ nullptr, +}; + +static const char * ggml_backend_hexagon_buffer_type_name(ggml_backend_buffer_type_t buft) { + GGML_UNUSED(buft); + if ((g_hexagon_appcfg.hwaccel_approach == HWACCEL_CDSP) && (1 == g_hexagon_appcfg.enable_rpc_ion_mempool)) { + return "hexagon-ion-buffer"; + } + + return "hexagon-normal-buffer"; +} + +static ggml_backend_buffer_t ggml_backend_hexagon_buffer_type_alloc_buffer( + ggml_backend_buffer_type_t buft, size_t size) { + struct ggml_backend_hexagon_context * ctx = static_cast(buft->context); + GGML_ASSERT(nullptr != ctx); + GGMLHEXAGON_LOG_DEBUG("device %d(%s)", ctx->device, ggml_backend_hexagon_get_devname(ctx->device)); + + ggml_backend_hexagon_buffer_context * buffer_ctx = new ggml_backend_hexagon_buffer_context; + + size_t size_page = 0; +#if defined(__ANDROID__) || defined(__linux__) + size_page = sysconf(_SC_PAGESIZE); +#else + SYSTEM_INFO systeminfo; + GetSystemInfo(&systeminfo); + size_page = systeminfo.dwPageSize; +#endif + size_t size_aligned = size; + if (0 != (size_aligned % size_page)) { + size_aligned += (size_page - (size_aligned % size_page)); + } + if ((HWACCEL_CDSP == g_hexagon_appcfg.hwaccel_approach) && (1 == g_hexagon_appcfg.enable_rpc_ion_mempool)) { + GGML_ASSERT(size + ctx->rpc_mempool_usage <= ctx->rpc_mempool_len); + buffer_ctx->buffer = (static_cast(ctx->rpc_mempool)) + ctx->rpc_mempool_usage; + GGMLHEXAGON_LOG_DEBUG("size %d(%d MiB), buffer_ctx->buffer %p", size, size / SIZE_IN_MB, buffer_ctx->buffer); + GGML_ASSERT(nullptr != buffer_ctx->buffer); + ctx->rpc_mempool_usage += size_aligned; + } else { + buffer_ctx->buffer = ggml_aligned_malloc(size_aligned); + } + buffer_ctx->buffer_size = size_aligned; + if (nullptr == buffer_ctx->buffer) { + GGMLHEXAGON_LOG_WARN("%s: failed to allocate %d MiB\n", 
__func__, size / SIZE_IN_MB); + return nullptr; + } else { + //GGMLHEXAGON_LOG_DEBUG("%s: succeed to allocate %d MiB\n", __func__, size / SIZE_IN_MB); + } + + return ggml_backend_buffer_init(buft, ggml_backend_hexagon_buffer_interface, buffer_ctx, size); +} + +static size_t ggml_backend_hexagon_buffer_type_get_alignment(ggml_backend_buffer_type_t buft) { + GGML_UNUSED(buft); + return 32; +} + +static size_t ggml_backend_hexagon_buffer_type_get_max_size(ggml_backend_buffer_type_t buft) { + struct ggml_backend_hexagon_context * ctx = static_cast(buft->context); + GGML_ASSERT(nullptr != ctx); + if ((HWACCEL_CDSP == g_hexagon_appcfg.hwaccel_approach) && (1 == g_hexagon_appcfg.enable_rpc_ion_mempool)) { + GGML_ASSERT(ctx->rpc_mempool_len > (8 * SIZE_IN_MB)); + return ctx->rpc_mempool_len - (8 * SIZE_IN_MB); + } else { + //TODO:this is an experimental value for LLM models + return (1024 * SIZE_IN_MB); + } +} + +static bool ggml_backend_buft_is_hexagon(ggml_backend_buffer_type_t buft) { + return buft->iface.get_name == ggml_backend_hexagon_buffer_type_name; +} + +static bool ggml_backend_hexagon_buffer_is_host(ggml_backend_buffer_type_t buft) { + struct ggml_backend_hexagon_context * ctx = static_cast(buft->context); + GGML_ASSERT(nullptr != ctx); + if ((HWACCEL_CDSP == g_hexagon_appcfg.hwaccel_approach) && (1 == g_hexagon_appcfg.enable_rpc_ion_mempool)) { + //FIXME: return false here is make sense in this scenario although this is not key-point at the moment + // fix it after solving other urgent tasks + //return false; + } + return true; +} + +static const char * ggml_backend_hexagon_name(ggml_backend_t backend) { + ggml_backend_hexagon_context * ctx = (ggml_backend_hexagon_context *) backend->context; + return g_hexagon_mgr[ctx->device].name; +} + +static void ggml_backend_hexagon_free(ggml_backend_t backend) { + GGMLHEXAGON_LOG_DEBUG("enter %s", __func__ ); + ggml_backend_hexagon_context * ctx = (ggml_backend_hexagon_context *)backend->context; + + qnn_instance * instance = (qnn_instance*)g_hexagon_mgr[ctx->device].instance; + if (nullptr != instance) { + std::map::iterator singlenode_graph_it; + for (singlenode_graph_it = ctx->qnn_singlenode_graph_map.begin(); + singlenode_graph_it != ctx->qnn_singlenode_graph_map.end(); singlenode_graph_it++) { + auto & graph_res = singlenode_graph_it->second; + Qnn_GraphHandle_t & graph_handle = std::get<0>(graph_res); + qnn_ptensors_t & ptensors = std::get<1>(graph_res); + for (auto tensor_it = ptensors.begin(); tensor_it != ptensors.end(); ++tensor_it) { + ggmlqnn_free_qnntensor(*tensor_it); + } + GGML_UNUSED(graph_handle); + GGMLHEXAGON_LOG_DEBUG("clean up graph:%s", singlenode_graph_it->first.c_str()); + } + ctx->qnn_singlenode_graph_map.clear(); + + instance->qnn_finalize(); + delete instance; + g_hexagon_mgr[ctx->device].instance = nullptr; + } + + if (nullptr != g_hexagon_mgr[ctx->device].backend) { + //print timestamp and dsp information before deinit cdsp, useful for troubleshooting + ggmlhexagon_print_running_timestamp(ctx); + if (HWACCEL_CDSP == g_hexagon_appcfg.hwaccel_approach) { + ggmlhexagon_deinit_cdsp(ctx); + } + + delete backend; + g_hexagon_mgr[ctx->device].backend = nullptr; + } + GGMLHEXAGON_LOG_DEBUG("leave %s", __func__ ); +} + +static enum ggml_status ggmlhexagon_backend_graph_compute_general(ggml_backend_t backend, struct ggml_cgraph * cgraph) { + enum ggml_status result = GGML_STATUS_SUCCESS; + ggml_backend_hexagon_context * ctx = (ggml_backend_hexagon_context *)backend->context; + GGML_UNUSED(ctx); + + for (int i = 0; i < 
cgraph->n_nodes; i++) {
+        ggml_tensor * node = cgraph->nodes[i];
+        if (ggml_is_empty(node) || node->op == GGML_OP_RESHAPE
+            || node->op == GGML_OP_TRANSPOSE || node->op == GGML_OP_VIEW
+            || node->op == GGML_OP_PERMUTE || node->op == GGML_OP_NONE) {
+            continue;
+        }
+        bool ok = ggmlhexagon_compute_forward(backend, node);
+        if (!ok) {
+            GGMLHEXAGON_LOG_DEBUG("%s: error: op not supported %s (%s)\n", __func__, node->name, ggml_op_name(node->op));
+        }
+    }
+
+    return result;
+}
+
+static const char * ggml_backend_hexagon_device_get_name(ggml_backend_dev_t dev) {
+    struct ggml_backend_hexagon_context * ctx = static_cast<ggml_backend_hexagon_context *>(dev->context);
+    if (nullptr == ctx) {
+        GGMLHEXAGON_LOG_ERROR("pls check why ctx is null");
+        return "unknown";
+    }
+    return ctx->name;
+}
+
+static const char * ggml_backend_hexagon_device_get_description(ggml_backend_dev_t dev) {
+    GGMLHEXAGON_LOG_DEBUG("enter %s", __func__);
+    struct ggml_backend_hexagon_context * ctx = static_cast<ggml_backend_hexagon_context *>(dev->context);
+    static char hexagon_device_desc[GGMLHEXAGON_TMPBUF_LEN];
+    if (nullptr == ctx) {
+        GGMLHEXAGON_LOG_ERROR("pls check why ctx is null");
+        return "unknown";
+    }
+
+    if (0 == strncmp(ctx->name, "qnn-npu", 7)) {
+        const char * soc_info = ggmlhexagon_get_socmodel_desc(ctx->socinfo.soc_model);
+        const char * htp_arch = ggmlhexagon_get_htparch_desc(ctx->socinfo.htp_arch);
+        std::string dev_desc = std::string(ctx->desc)
+                               + std::string(soc_info) + "_" + std::string(htp_arch)
+                               + "," + std::string(ctx->socinfo.soc_desc);
+        memset(hexagon_device_desc, 0, GGMLHEXAGON_TMPBUF_LEN);
+        memcpy(hexagon_device_desc, dev_desc.c_str(), strlen(dev_desc.c_str()));
+        return hexagon_device_desc;
+    } else {
+        return ctx->desc;
+    }
+}
+
+static void ggml_backend_hexagon_device_get_memory(ggml_backend_dev_t dev, size_t * free, size_t * total) {
+    struct ggml_backend_hexagon_context * ctx = static_cast<ggml_backend_hexagon_context *>(dev->context);
+    if ((nullptr == ctx) || (ctx->device > HEXAGON_BACKEND_GGML)) {
+        GGMLHEXAGON_LOG_ERROR("pls check params");
+        *free  = 0;
+        *total = 0;
+        //return early: dereferencing a null ctx below would crash
+        return;
+    }
+
+    if (HEXAGON_BACKEND_QNNCPU == ctx->device || HEXAGON_BACKEND_GGML == ctx->device) {
+        *total = ggmlhexagon_get_system_total_memory_in_bytes();
+        *free  = ggmlhexagon_get_system_free_memory_in_bytes();
+    } else if (HEXAGON_BACKEND_QNNGPU == ctx->device) {
+        //TODO: probe GPU info on the Qualcomm Adreno GPU
+        *total = ggmlhexagon_get_system_total_memory_in_bytes();
+        *free  = ggmlhexagon_get_system_free_memory_in_bytes();
+    } else if (HEXAGON_BACKEND_QNNNPU == ctx->device) {
+        size_t rpc_ion_memsize = 0;
+        size_t rpc_ion_usage   = 0;
+        if (HWACCEL_CDSP != g_hexagon_appcfg.hwaccel_approach) {
+            rpc_ion_memsize = ctx->instance->get_rpcmem_capacity();
+            rpc_ion_usage   = ctx->instance->get_rpcmem_usage();
+        } else {
+            rpc_ion_memsize = ctx->rpc_mempool_capacity;
+            rpc_ion_usage   = ctx->rpc_mempool_usage;
+        }
+        *total = rpc_ion_memsize;
+        *free  = (rpc_ion_memsize - rpc_ion_usage);
+        GGMLHEXAGON_LOG_DEBUG("rpc memsize %zu MiB", rpc_ion_memsize / SIZE_IN_MB);
+        GGMLHEXAGON_LOG_DEBUG("rpc usage %zu MiB\n\n", rpc_ion_usage / SIZE_IN_MB);
+    }
+}
+
+static enum ggml_backend_dev_type ggml_backend_hexagon_device_get_type(ggml_backend_dev_t dev) {
+    struct ggml_backend_hexagon_context * ctx = static_cast<ggml_backend_hexagon_context *>(dev->context);
+    if (HWACCEL_CDSP == g_hexagon_appcfg.hwaccel_approach) {
+        return GGML_BACKEND_DEVICE_TYPE_GPU;
+    }
+
+    if (HEXAGON_BACKEND_QNNCPU == ctx->device)
+        return GGML_BACKEND_DEVICE_TYPE_ACCEL;
+    else if (HEXAGON_BACKEND_QNNGPU == ctx->device)
+        return GGML_BACKEND_DEVICE_TYPE_ACCEL;
+    else if (HEXAGON_BACKEND_QNNNPU ==
ctx->device) + return GGML_BACKEND_DEVICE_TYPE_ACCEL; + else + return GGML_BACKEND_DEVICE_TYPE_CPU; +} + +static void ggml_backend_hexagon_device_get_props(ggml_backend_dev_t dev, + struct ggml_backend_dev_props * props) { + props->name = ggml_backend_hexagon_device_get_name(dev); + props->description = ggml_backend_hexagon_device_get_description(dev); + props->type = ggml_backend_hexagon_device_get_type(dev); + ggml_backend_hexagon_device_get_memory(dev, &props->memory_free, &props->memory_total); + props->caps = { + /* .async = */ false, + /* .host_buffer = */ true, + /* .buffer_from_host_ptr = */ false, + /* .events = */ false, + }; + + if ((HWACCEL_CDSP == g_hexagon_appcfg.hwaccel_approach) && (1 == g_hexagon_appcfg.enable_rpc_ion_mempool)) { + //don't use system memory in this scenario + props->caps.host_buffer = false; + } +} + +static ggml_backend_t ggml_backend_hexagon_device_init_backend(ggml_backend_dev_t dev, const char * params) { + GGML_UNUSED(dev); + GGMLHEXAGON_LOG_DEBUG("enter %s\n", __func__); + size_t dev_index = 0; + + //case-1: test-backend-ops or other similar scenario: calling ggml_backend_dev_init(dev, reinterpret_cast(i)) directly in user's code + ggmlhexagon_load_cfg(); + if (!ggmlhexagon_check_valid_appcfg()) { + return nullptr; + } + + if (nullptr == params) { + GGMLHEXAGON_LOG_DEBUG("program specified param is nullptr"); + dev_index = (g_hexagon_appcfg.hexagon_backend > 0) ? g_hexagon_appcfg.hexagon_backend : 0; + if (dev_index >= GGML_HEXAGON_MAX_DEVICES) { + GGMLHEXAGON_LOG_INFO("assume the default ggml backend"); + return nullptr; + } + } else { + GGMLHEXAGON_LOG_INFO("program specified param is not nullptr"); + //user's program calling ggml_backend_hexagon_device_init_backend directly + dev_index = (int)(intptr_t)params; + g_hexagon_appcfg.hexagon_backend = dev_index; + GGMLHEXAGON_LOG_INFO("program specified dev_index %d\n", dev_index); + } + GGMLHEXAGON_LOG_DEBUG("hexagon_backend=%d", dev_index); + ggml_backend_t hexagon_backend = ggml_backend_hexagon_init(dev_index, g_hexagon_appcfg.runtime_libpath); + GGMLHEXAGON_LOG_DEBUG("leave %s\n", __func__); + + return hexagon_backend; + +} + +static ggml_backend_buffer_type_t ggml_backend_hexagon_buffer_type(size_t device_index) { + if (device_index >= GGML_HEXAGON_MAX_DEVICES) { + GGMLHEXAGON_LOG_DEBUG("ggml_backend_hexagon_buffer_type error: device_index:%d is out of range [0, %d]\n", + device_index, GGML_HEXAGON_MAX_DEVICES - 1); + return nullptr; + } + + static struct ggml_backend_buffer_type ggml_backend_buffer_type_hexagon = { + /* .iface = */ { + /* .get_name = */ ggml_backend_hexagon_buffer_type_name, + /* .alloc_buffer = */ ggml_backend_hexagon_buffer_type_alloc_buffer, + /* .get_alignment = */ ggml_backend_hexagon_buffer_type_get_alignment, + /* .get_max_size = */ ggml_backend_hexagon_buffer_type_get_max_size, + /* .get_alloc_size = */ nullptr,// defaults to ggml_nbytes + /* .is_host = */ ggml_backend_hexagon_buffer_is_host + }, + /* .device = */ nullptr, + /* .context = */ &g_hexagon_mgr[device_index], + }; + + if (HWACCEL_CDSP == g_hexagon_appcfg.hwaccel_approach) { + //here is the trick: + //there only 1 backend_device when g_hexagon_appcfg.hwaccel_approach == HWACCEL_CDSP + //and we need to re-use the g_hexagon_mgr + //so context is g_hexagon_mgr[HEXAGON_BACKEND_CDSP] rather than g_hexagon_mgr[0] + ggml_backend_buffer_type_hexagon.context = &g_hexagon_mgr[HEXAGON_BACKEND_CDSP]; + } + + return &ggml_backend_buffer_type_hexagon; +} + +static const char * 
ggml_backend_hexagon_host_buffer_type_name(ggml_backend_buffer_type_t buft) { + GGML_UNUSED(buft); + return "Hexagon_Host"; +} + +static const char * ggml_backend_hexagon_host_buffer_name(ggml_backend_buffer_t buffer) { + GGML_UNUSED(buffer); + return "Hexagon_Host"; +} + +static void ggml_backend_hexagon_host_buffer_free(ggml_backend_buffer_t buffer) { + ggml_aligned_free(buffer->context, 0); +} + +static void * ggml_hexagon_host_malloc(ggml_backend_buffer_type_t buft, size_t size) { + return ggml_aligned_malloc(size); +} + +static ggml_backend_buffer_t ggml_backend_hexagon_host_buffer_type_alloc_buffer(ggml_backend_buffer_type_t buft, size_t size) { + void * host_ptr = ggml_hexagon_host_malloc(buft, size); + + if (nullptr == host_ptr) { + return ggml_backend_buft_alloc_buffer(ggml_backend_cpu_buffer_type(), size); + } + + ggml_backend_buffer_t buffer = ggml_backend_cpu_buffer_from_ptr(host_ptr, size); + buffer->buft = buft; + buffer->iface.free_buffer = ggml_backend_hexagon_host_buffer_free; + + return buffer; +} + +static ggml_backend_buffer_type_t ggml_backend_hexagon_host_buffer_type() { + static struct ggml_backend_buffer_type ggml_backend_hexagon_buffer_type_host = { + /* .iface = */ { + /* .get_name = */ ggml_backend_hexagon_host_buffer_type_name, + /* .alloc_buffer = */ ggml_backend_hexagon_host_buffer_type_alloc_buffer, + /* .get_alignment = */ ggml_backend_cpu_buffer_type()->iface.get_alignment, + /* .get_max_size = */ nullptr, + /* .get_alloc_size = */ ggml_backend_cpu_buffer_type()->iface.get_alloc_size, + /* .is_host = */ ggml_backend_cpu_buffer_type()->iface.is_host, + }, + /* .device = */ ggml_backend_reg_dev_get(ggml_backend_hexagon_reg(), 0), + /* .context = */ nullptr, + }; + + return &ggml_backend_hexagon_buffer_type_host; +} + +static ggml_backend_buffer_type_t ggml_backend_hexagon_device_get_host_buffer_type(ggml_backend_dev_t dev) { + GGML_UNUSED(dev); + return ggml_backend_hexagon_host_buffer_type(); +} + +static ggml_backend_buffer_type_t ggml_backend_hexagon_device_get_buffer_type(ggml_backend_dev_t dev) { + ggml_backend_hexagon_context * ctx = (ggml_backend_hexagon_context *)dev->context; + return ggml_backend_hexagon_buffer_type(ctx->device); +} + +static ggml_backend_buffer_t ggml_backend_hexagon_device_buffer_from_host_ptr(ggml_backend_dev_t dev, + void * ptr, size_t size, size_t max_tensor_size) { + return ggml_backend_cpu_buffer_from_ptr(ptr, size); + + GGML_UNUSED(dev); + GGML_UNUSED(max_tensor_size); +} + +static bool ggml_backend_hexagon_device_supports_buft(ggml_backend_dev_t dev, ggml_backend_buffer_type_t buft) { + if ((HWACCEL_CDSP == g_hexagon_appcfg.hwaccel_approach) && (1 == g_hexagon_appcfg.enable_rpc_ion_mempool)) { + if (ggml_backend_buft_is_hexagon(buft)) { + ggml_backend_hexagon_context * dev_ctx = (ggml_backend_hexagon_context *)dev->context; + ggml_backend_hexagon_context * buft_ctx = (ggml_backend_hexagon_context *)buft->context; + return buft_ctx->device == dev_ctx->device; + } + } + + return ggml_backend_buft_is_host(buft); +} + +static struct ggml_backend_device_i ggml_backend_hexagon_device_interface = { + /* .get_name = */ ggml_backend_hexagon_device_get_name, + /* .get_description = */ ggml_backend_hexagon_device_get_description, + /* .get_memory = */ ggml_backend_hexagon_device_get_memory, + /* .get_type = */ ggml_backend_hexagon_device_get_type, + /* .get_props = */ ggml_backend_hexagon_device_get_props, + /* .init_backend = */ ggml_backend_hexagon_device_init_backend, + /* .get_buffer_type = */ 
ggml_backend_hexagon_device_get_buffer_type,
+    /* .get_host_buffer_type = */ ggml_backend_hexagon_device_get_host_buffer_type,
+    /* .buffer_from_host_ptr = */ ggml_backend_hexagon_device_buffer_from_host_ptr,
+    /* .supports_op          = */ nullptr,
+    /* .supports_buft        = */ ggml_backend_hexagon_device_supports_buft,
+    /* .offload_op           = */ nullptr,
+    /* .event_new            = */ nullptr,
+    /* .event_free           = */ nullptr,
+    /* .event_synchronize    = */ nullptr,
+};
+
+static ggml_backend_i ggml_backend_hexagon_interface = {
+    /* .get_name           = */ ggml_backend_hexagon_name,
+    /* .free               = */ ggml_backend_hexagon_free,
+    /* .set_tensor_async   = */ nullptr,
+    /* .get_tensor_async   = */ nullptr,
+    /* .cpy_tensor_async   = */ nullptr,
+    /* .synchronize        = */ nullptr,
+    /* .graph_plan_create  = */ nullptr,
+    /* .graph_plan_free    = */ nullptr,
+    /* .graph_plan_update  = */ nullptr,
+    /* .graph_plan_compute = */ nullptr,
+    /* .graph_compute      = */ nullptr,
+    /* .event_record       = */ nullptr,
+    /* .event_wait         = */ nullptr,
+};
+
+//FIXME: this guid is an arbitrary placeholder, it is not a registered identifier
+static ggml_guid_t ggml_backend_hexagon_guid() {
+    static ggml_guid guid = {
+        0x1a, 0x2b, 0x3c, 0x4d, 0x5e, 0x6f, 0x70, 0x81,
+        0x92, 0xa3, 0xb4, 0xc5, 0xd6, 0xe7, 0xf8, 0x09
+    };
+    return &guid;
+}
+
+bool ggml_backend_is_hexagon(ggml_backend_t backend) {
+    return backend != nullptr && ggml_guid_matches(backend->guid, ggml_backend_hexagon_guid());
+}
+
+static void ggml_backend_hexagon_set_n_threads(ggml_backend_t backend, int n_threads) {
+    GGML_ASSERT(ggml_backend_is_hexagon(backend));
+
+    struct ggml_backend_hexagon_context * ctx = (struct ggml_backend_hexagon_context *)backend->context;
+    ctx->n_threads = n_threads;
+}
+
+int ggml_backend_hexagon_get_device_count() {
+    if (g_hexagon_appcfg.hwaccel_approach == HWACCEL_CDSP) {
+        GGML_ASSERT(g_hexagon_appcfg.hexagon_backend == HEXAGON_BACKEND_CDSP);
+        //there is only one backend device when hwaccel_approach == HWACCEL_CDSP
+        return 1;
+    } else {
+        return GGML_HEXAGON_MAX_DEVICES;
+    }
+}
+
+struct ggml_backend_hexagon_reg_context {
+    std::vector<ggml_backend_dev_t> devices;
+};
+
+static const char * ggml_backend_hexagon_reg_get_name(ggml_backend_reg_t reg) {
+    GGML_UNUSED(reg);
+    return "ggml-hexagon";
+}
+
+static size_t ggml_backend_hexagon_reg_get_device_count(ggml_backend_reg_t reg) {
+    GGML_UNUSED(reg);
+    if (HWACCEL_CDSP == g_hexagon_appcfg.hwaccel_approach) {
+        GGML_ASSERT(g_hexagon_appcfg.hexagon_backend == HEXAGON_BACKEND_CDSP);
+        //there is only one backend device when hwaccel_approach == HWACCEL_CDSP
+        return 1;
+    } else {
+        return GGML_HEXAGON_MAX_DEVICES;
+    }
+}
+
+static ggml_backend_dev_t ggml_backend_hexagon_reg_get_device(ggml_backend_reg_t reg, size_t index) {
+    GGMLHEXAGON_LOG_DEBUG("index %zu", index);
+    ggml_backend_hexagon_reg_context * ctx = (ggml_backend_hexagon_reg_context *)reg->context;
+    if (HWACCEL_CDSP == g_hexagon_appcfg.hwaccel_approach) {
+        GGML_ASSERT(g_hexagon_appcfg.hexagon_backend == HEXAGON_BACKEND_CDSP);
+        //there is only one backend device when hwaccel_approach == HWACCEL_CDSP
+        return ctx->devices[0];
+    } else {
+        GGML_ASSERT(index < ctx->devices.size());
+        return ctx->devices[index];
+    }
+}
+
+static void * ggml_backend_hexagon_reg_get_proc_address(ggml_backend_reg_t reg, const char * name) {
+    GGML_UNUSED(reg);
+
+    if (nullptr == name)
+        return nullptr;
+
+    const char * slot_name = "ggml_backend_set_n_threads";
+    if (0 == strcmp(name, slot_name)) {
+        return (void *)ggml_backend_hexagon_set_n_threads;
+    }
+    return nullptr;
+}
+
+static const ggml_backend_reg_i ggml_backend_hexagon_reg_interface = {
+    /* .get_name         = */ ggml_backend_hexagon_reg_get_name,
+    /* .get_device_count = */ ggml_backend_hexagon_reg_get_device_count,
+    /* .get_device       = */ ggml_backend_hexagon_reg_get_device,
+    /* .get_proc_address = */ ggml_backend_hexagon_reg_get_proc_address,
+};
+
+ggml_backend_reg_t ggml_backend_hexagon_reg() {
+    static ggml_backend_reg reg;
+    static bool initialized = false;
+    GGMLHEXAGON_LOG_DEBUG("enter ggml_backend_hexagon_reg");
+
+    //case-2: normal scenario, such as llama-cli or a UI application
+    ggmlhexagon_load_cfg();
+    if (!ggmlhexagon_check_valid_appcfg()) {
+        return nullptr;
+    }
+
+    {
+        static std::mutex mutex;
+        std::lock_guard<std::mutex> lock(mutex);
+        if (!initialized) {
+            ggml_backend_hexagon_reg_context * ctx = new ggml_backend_hexagon_reg_context;
+
+            for (int i = 0; i < ggml_backend_hexagon_get_device_count(); i++) {
+                if (HWACCEL_CDSP == g_hexagon_appcfg.hwaccel_approach) {
+                    ggml_backend_hexagon_device_interface.supports_op = ggmlhexagon_can_handle_op_through_cdsp;
+                } else {
+                    ggml_backend_hexagon_device_interface.supports_op = ggmlhexagon_can_handle_op_through_qnn;
+                }
+                if ((HWACCEL_CDSP == g_hexagon_appcfg.hwaccel_approach) && (1 == g_hexagon_appcfg.enable_rpc_ion_mempool)) {
+                    //don't use system memory in this scenario
+                    ggml_backend_hexagon_device_interface.get_host_buffer_type = nullptr;
+                }
+
+                GGMLHEXAGON_LOG_DEBUG("create backend device for device %d", i);
+                ggml_backend_dev_t dev = new ggml_backend_device{
+                    /* .iface   = */ ggml_backend_hexagon_device_interface,
+                    /* .reg     = */ &reg,
+                    /* .context = */ &g_hexagon_mgr[i]
+                };
+                if (HWACCEL_CDSP == g_hexagon_appcfg.hwaccel_approach) {
+                    //there is only one backend device in the HWACCEL_CDSP approach, and it has to
+                    //re-use the existing manager entry, so the context is
+                    //g_hexagon_mgr[HEXAGON_BACKEND_CDSP] rather than g_hexagon_mgr[0]
+                    dev->context = &g_hexagon_mgr[HEXAGON_BACKEND_CDSP];
+                }
+                ctx->devices.push_back(dev);
+
+                //initialize the cDSP rpc memory pool here because ggml's backend subsystem needs it
+                if (HWACCEL_CDSP == g_hexagon_appcfg.hwaccel_approach) {
+                    GGML_ASSERT(HEXAGON_BACKEND_CDSP == g_hexagon_appcfg.hexagon_backend);
+                    int result = ggmlhexagon_init_dsp(&g_hexagon_mgr[HEXAGON_BACKEND_CDSP]);
+                    if (0 != result) {
+                        GGMLHEXAGON_LOG_INFO("failed to init hexagon dsp");
+                    }
+                    GGML_ASSERT(0 == result);
+                }
+            }
+
+            reg = ggml_backend_reg {
+                /* .api_version = */ GGML_BACKEND_API_VERSION,
+                /* .iface       = */ ggml_backend_hexagon_reg_interface,
+                /* .context     = */ ctx
+            };
+        }
+
+        initialized = true;
+    }
+    GGMLHEXAGON_LOG_DEBUG("leave ggml_backend_hexagon_reg");
+
+    return &reg;
+}
+
+const char * ggml_backend_hexagon_get_devname(size_t dev_num) {
+    if (HWACCEL_CDSP == g_hexagon_appcfg.hwaccel_approach) {
+        if (HEXAGON_BACKEND_CDSP == dev_num)
+            return "HEXAGON_BACKEND_CDSP";
+    }
+
+    //fall back for the other scenarios
+    switch (dev_num) {
+        case HEXAGON_BACKEND_QNNCPU:
+            return "HEXAGON_BACKEND_QNN_CPU";
+        case HEXAGON_BACKEND_QNNGPU:
+            return "HEXAGON_BACKEND_QNN_GPU";
+        case HEXAGON_BACKEND_QNNNPU:
+            return "HEXAGON_BACKEND_QNN_NPU";
+        case HEXAGON_BACKEND_GGML:
+            return "ggml"; //"fake" hexagon backend, used to compare performance between the hexagon backend and the default ggml backend
+        default:
+            return "unknown";
+    }
+}
+
+static qnn_instance * ggmlqnn_init_qnn_instance(size_t device, const char * qnn_lib_path) {
+    int result = 0;
+    GGMLHEXAGON_LOG_INFO("hwaccel approach=%d(%s)", g_hexagon_appcfg.hwaccel_approach,
+                         ggmlhexagon_get_hwaccel_approach_name(g_hexagon_appcfg.hwaccel_approach));
+
+    qnn_instance * instance = nullptr;
+    instance = new qnn_instance(qnn_lib_path, g_hexagon_mgr[device].lib, "");
+    result = instance->qnn_init(nullptr);
+    if (0 != result) {
+        GGMLHEXAGON_LOG_WARN("failed to init qnn subsystem with qnn backend %s\n",
+                             ggml_backend_hexagon_get_devname(device));
+        delete instance;
+        return nullptr;
+    }
+    qnn_interface qnn_interface = instance->get_qnn_interface();
+    if (!qnn_interface.is_loaded()) {
+        GGMLHEXAGON_LOG_WARN("qnn subsystem failure\n");
+        delete instance;
+        return nullptr;
+    }
+
+    std::string device_name = ggml_backend_hexagon_get_devname(device);
+    GGMLHEXAGON_LOG_INFO("qnn device name %s", device_name.c_str());
+    g_hexagon_mgr[device].instance = instance;
+    g_hexagon_mgr[device].raw_interface = instance->get_qnn_raw_interface();
+    g_hexagon_mgr[device].raw_system_interface = instance->get_qnn_raw_system_interface();
+
+    return instance;
+}
+
+/**
+ * @param device          0: HEXAGON_BACKEND_QNNCPU 1: HEXAGON_BACKEND_QNNGPU 2: HEXAGON_BACKEND_QNNNPU / HEXAGON_BACKEND_CDSP
+ * @param runtime_libpath binary runtime library path, such as "/data/local/tmp/" on Android or as specified in the user's code
+ * @return                backend instance on success, nullptr on failure
+ */
+ggml_backend_t ggml_backend_hexagon_init(size_t device, const char * runtime_libpath) {
+    GGMLHEXAGON_LOG_DEBUG("enter %s", __func__);
+    if (nullptr == runtime_libpath)
+        return nullptr;
+
+    //case-3: calling ggml_backend_hexagon_init() directly in user's code
+    ggmlhexagon_load_cfg();
+    if (!ggmlhexagon_check_valid_appcfg()) {
+        return nullptr;
+    }
+
+    GGMLHEXAGON_LOG_DEBUG("device %zu", device);
+    GGMLHEXAGON_LOG_DEBUG("runtime libpath %s", runtime_libpath);
+    if (device >= GGML_HEXAGON_MAX_DEVICES) {
+        GGMLHEXAGON_LOG_ERROR("invalid device %zu", device);
+        return nullptr;
+    }
+
+    if (0 != memcmp(runtime_libpath, g_hexagon_appcfg.runtime_libpath, strlen(g_hexagon_appcfg.runtime_libpath))) {
+        //re-set the runtime libpath
+        ggmlhexagon_set_runtime_path(device, runtime_libpath);
+    }
+
+    if (nullptr != g_hexagon_mgr[device].backend) {
+        GGMLHEXAGON_LOG_DEBUG("backend %zu(%s) already loaded", device,
+                              ggml_backend_hexagon_get_devname(device));
+        GGMLHEXAGON_LOG_DEBUG("leave %s", __func__);
+        return g_hexagon_mgr[device].backend;
+    }
+
+    //don't initialize QNN when the hwaccel approach offloads ggml ops directly to the Hexagon cDSP
+    if (HWACCEL_CDSP != g_hexagon_appcfg.hwaccel_approach) {
+        qnn_instance * instance = ggmlqnn_init_qnn_instance(device, runtime_libpath);
+        if (nullptr == instance)
+            return nullptr;
+    }
+    ggml_backend_hexagon_interface.graph_compute = ggmlhexagon_backend_graph_compute_general;
+    ggml_backend_t hexagon_backend = new ggml_backend{
+        /* .guid    = */ ggml_backend_hexagon_guid(),
+        /* .iface   = */ ggml_backend_hexagon_interface,
+        /* .device  = */ ggml_backend_reg_dev_get(ggml_backend_hexagon_reg(), device),
+        /* .context = */ &g_hexagon_mgr[device]
+    };
+
+    g_hexagon_mgr[device].backend = hexagon_backend;
+    if (HWACCEL_CDSP == g_hexagon_appcfg.hwaccel_approach) {
+        int result = ggmlhexagon_init_dsp(&g_hexagon_mgr[device]);
+        if (0 != result) {
+            GGMLHEXAGON_LOG_INFO("failed to init hexagon dsp");
+            ggml_backend_hexagon_free(hexagon_backend);
+            return nullptr;
+        }
+    } else {
+        //log the full description of the SoC when the hwaccel approach is HWACCEL_QNN and the backend is HEXAGON_BACKEND_QNNNPU
+        GGMLHEXAGON_LOG_INFO("device name %s", ggml_backend_hexagon_device_get_description(hexagon_backend->device));
+    }
+    GGMLHEXAGON_LOG_DEBUG("leave %s", __func__);
+
+    return hexagon_backend;
+}
+
+GGML_BACKEND_DL_IMPL(ggml_backend_hexagon_reg)
diff --git a/ggml/src/ggml-hexagon/kernels/ggml-dsp.c b/ggml/src/ggml-hexagon/kernels/ggml-dsp.c
new file mode 100644
index 0000000000000..e79341ed27569
--- /dev/null
+++ b/ggml/src/ggml-hexagon/kernels/ggml-dsp.c
@@ -0,0 +1,1407 @@
+/*
+* Copyright (c) 2023-2025 The ggml authors
+*
+* Permission is hereby granted, free of charge, to any person obtaining a copy
+* of this software and associated documentation files (the "Software"), to
+* deal in the Software without restriction, including without limitation the
+* rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+* sell copies of the Software, and to permit persons to whom the Software is
+* furnished to do so, subject to the following conditions:
+*
+* The above copyright notice and this permission notice shall be included in
+* all copies or substantial portions of the Software.
+*
+* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+* IN THE SOFTWARE.
+*/
+#include <assert.h>
+#include <math.h>
+#include <stdarg.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+
+#include "HAP_perf.h"
+#include "HAP_farf.h"
+#include "HAP_power.h"
+#include "HAP_vtcm_mgr.h"
+#include "HAP_compute_res.h"
+
+#include "qurt.h"
+
+#include "AEEStdErr.h"
+#include "hexagon_types.h"
+#include "hexagon_protos.h"
+
+#include "ggmlop_ap_skel.h"
+#include "ggml-dsp.h"
+
+// =================================================================================================
+// section-1: forward/prototype declarations, global vars, macros, data structures
+// =================================================================================================
+#define ggml_tensor dsptensor
+
+static size_t ggml_nbytes(const struct ggml_tensor * tensor);
+static void ggmlhexagon_log_internal(int level, const char * file, const char * func, int line, const char * format, ...);
+static void ggml_vec_dot_f32(int n, float * GGML_RESTRICT s, size_t bs, const float * GGML_RESTRICT x, size_t bx, const float * GGML_RESTRICT y, size_t by, int nrc);
+
+static void dequantize_row_q6_K(const block_q6_K * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k);
+static void quantize_row_q6_K_ref(const float * GGML_RESTRICT x, block_q6_K * GGML_RESTRICT y, int64_t k);
+static void quantize_row_q6_K(const float * GGML_RESTRICT x, void * GGML_RESTRICT vy, int64_t k);
+static void ggml_vec_dot_q6_K_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc);
+
+static float ggml_table_f32_f16[1 << 16];
+
+static struct ggml_compute_params params;
+
+//mulmat dispatch table: src1 (f32) rows are first converted with .from_float to
+//.vec_dot_type, then .vec_dot produces one output element per (src0 row, src1 column);
+//entries left NULL are not wired up in this port, so only f32 and q6_K have working
+//vec_dot kernels here
+static const struct ggml_type_traits_cpu type_traits_cpu[GGML_TYPE_COUNT] = {
+    [GGML_TYPE_F32] = {
+        .vec_dot      = (ggml_vec_dot_t) ggml_vec_dot_f32,
+        .vec_dot_type = GGML_TYPE_F32,
+        .nrows        = 1,
+    },
+    [GGML_TYPE_F16] = {
+        .from_float   = NULL,
+        .vec_dot      = NULL,
+        .vec_dot_type = GGML_TYPE_F16,
+        .nrows        = 1,
+    },
+
[GGML_TYPE_Q4_0] = { + .from_float = NULL, + .vec_dot = NULL, + .vec_dot_type = GGML_TYPE_Q8_0, +#if defined (__ARM_FEATURE_MATMUL_INT8) + .nrows = 2, +#else + .nrows = 1, +#endif + }, + [GGML_TYPE_Q4_1] = { + .from_float = NULL, + .vec_dot = NULL, + .vec_dot_type = GGML_TYPE_Q8_1, +#if defined (__ARM_FEATURE_MATMUL_INT8) + .nrows = 2, +#else + .nrows = 1, +#endif + }, + [GGML_TYPE_Q5_0] = { + .from_float = NULL, + .vec_dot = NULL, + .vec_dot_type = GGML_TYPE_Q8_0, + .nrows = 1, + }, + [GGML_TYPE_Q5_1] = { + .from_float = NULL, + .vec_dot = NULL, + .vec_dot_type = GGML_TYPE_Q8_1, + .nrows = 1, + }, + [GGML_TYPE_Q8_0] = { + .from_float = NULL, + .vec_dot = NULL, + .vec_dot_type = GGML_TYPE_Q8_0, +#if defined (__ARM_FEATURE_MATMUL_INT8) + .nrows = 2, +#else + .nrows = 1, +#endif + }, + [GGML_TYPE_Q8_1] = { + .from_float = NULL, + .vec_dot_type = GGML_TYPE_Q8_1, + .nrows = 1, + }, + [GGML_TYPE_Q2_K] = { + .from_float = NULL, + .vec_dot = NULL, + .vec_dot_type = GGML_TYPE_Q8_K, + .nrows = 1, + }, + [GGML_TYPE_Q3_K] = { + .from_float = NULL, + .vec_dot = NULL, + .vec_dot_type = GGML_TYPE_Q8_K, + .nrows = 1, + }, + [GGML_TYPE_Q4_K] = { + .from_float = NULL, + .vec_dot = NULL, + .vec_dot_type = GGML_TYPE_Q8_K, + .nrows = 1, + }, + [GGML_TYPE_Q5_K] = { + .from_float = NULL, + .vec_dot = NULL, + .vec_dot_type = GGML_TYPE_Q8_K, + .nrows = 1, + }, + [GGML_TYPE_Q6_K] = { + .from_float = quantize_row_q6_K, + .vec_dot = ggml_vec_dot_q6_K_q8_K, + .vec_dot_type = GGML_TYPE_Q8_K, + .nrows = 1, + }, +}; + +static const struct ggml_type_traits type_traits[GGML_TYPE_COUNT] = { + [GGML_TYPE_I8] = { + .type_name = "i8", + .blck_size = 1, + .type_size = sizeof(int8_t), + .is_quantized = false, + }, + [GGML_TYPE_I16] = { + .type_name = "i16", + .blck_size = 1, + .type_size = sizeof(int16_t), + .is_quantized = false, + }, + [GGML_TYPE_I32] = { + .type_name = "i32", + .blck_size = 1, + .type_size = sizeof(int32_t), + .is_quantized = false, + }, + [GGML_TYPE_I64] = { + .type_name = "i64", + .blck_size = 1, + .type_size = sizeof(int64_t), + .is_quantized = false, + }, + [GGML_TYPE_F64] = { + .type_name = "f64", + .blck_size = 1, + .type_size = sizeof(double), + .is_quantized = false, + }, + [GGML_TYPE_F32] = { + .type_name = "f32", + .blck_size = 1, + .type_size = sizeof(float), + .is_quantized = false, + }, + [GGML_TYPE_F16] = { + .type_name = "f16", + .blck_size = 1, + .type_size = sizeof(ggml_fp16_t), + .is_quantized = false, + .to_float = NULL, + .from_float_ref = NULL, + }, + [GGML_TYPE_Q4_0] = { + .type_name = "q4_0", + .blck_size = QK4_0, + .type_size = sizeof(block_q4_0), + .is_quantized = true, + .to_float = NULL, + .from_float_ref = NULL, + }, + [GGML_TYPE_Q4_1] = { + .type_name = "q4_1", + .blck_size = QK4_1, + .type_size = sizeof(block_q4_1), + .is_quantized = true, + .to_float = NULL, + .from_float_ref = NULL, + }, + [4] = { // GGML_TYPE_Q4_2 + .type_name = "DEPRECATED", + .blck_size = 0, + .type_size = 0, + .is_quantized = false, + }, + [5] = { // GGML_TYPE_Q4_3 + .type_name = "DEPRECATED", + .blck_size = 0, + .type_size = 0, + .is_quantized = false, + }, + [GGML_TYPE_Q5_0] = { + .type_name = "q5_0", + .blck_size = QK5_0, + .type_size = sizeof(block_q5_0), + .is_quantized = true, + .to_float = NULL, + .from_float_ref = NULL, + }, + [GGML_TYPE_Q5_1] = { + .type_name = "q5_1", + .blck_size = QK5_1, + .type_size = sizeof(block_q5_1), + .is_quantized = true, + .to_float = NULL, + .from_float_ref = NULL, + }, + [GGML_TYPE_Q8_0] = { + .type_name = "q8_0", + .blck_size = QK8_0, + .type_size = 
sizeof(block_q8_0),
+        .is_quantized   = true,
+        .to_float       = NULL,
+        .from_float_ref = NULL,
+    },
+    [GGML_TYPE_Q8_1] = {
+        .type_name      = "q8_1",
+        .blck_size      = QK8_1,
+        .type_size      = sizeof(block_q8_1),
+        .is_quantized   = true,
+        .from_float_ref = NULL,
+    },
+    [GGML_TYPE_Q2_K] = {
+        .type_name      = "q2_K",
+        .blck_size      = QK_K,
+        .type_size      = sizeof(block_q2_K),
+        .is_quantized   = true,
+        .to_float       = NULL,
+        .from_float_ref = NULL,
+    },
+    [GGML_TYPE_Q3_K] = {
+        .type_name      = "q3_K",
+        .blck_size      = QK_K,
+        .type_size      = sizeof(block_q3_K),
+        .is_quantized   = true,
+        .to_float       = NULL,
+        .from_float_ref = NULL,
+    },
+    [GGML_TYPE_Q4_K] = {
+        .type_name      = "q4_K",
+        .blck_size      = QK_K,
+        .type_size      = sizeof(block_q4_K),
+        .is_quantized   = true,
+        .to_float       = NULL,
+        .from_float_ref = NULL,
+    },
+    [GGML_TYPE_Q5_K] = {
+        .type_name      = "q5_K",
+        .blck_size      = QK_K,
+        .type_size      = sizeof(block_q5_K),
+        .is_quantized   = true,
+        .to_float       = NULL,
+        .from_float_ref = NULL,
+    },
+    [GGML_TYPE_Q6_K] = {
+        .type_name      = "q6_K",
+        .blck_size      = QK_K,
+        .type_size      = sizeof(block_q6_K),
+        .is_quantized   = true,
+        .to_float       = (ggml_to_float_t) dequantize_row_q6_K,
+        .from_float_ref = (ggml_from_float_t) quantize_row_q6_K_ref,
+    },
+};
+
+// =================================================================================================
+// section-2: ggml-hexagon kernel's internal troubleshooting functions
+// =================================================================================================
+static void ggmlhexagon_log_internal(int level, const char * file, const char * func, int line, const char * format, ...) {
+#if !GGMLHEXAGON_DEBUG
+    return;
+#endif
+    UNUSED(level);
+    UNUSED(file);
+    static char s_ggmlhexagon_log_internal_buf[GGMLHEXAGON_LOGBUF_LEN];
+    va_list args;
+    va_start(args, format);
+    int len_prefix = snprintf(s_ggmlhexagon_log_internal_buf, GGMLHEXAGON_LOGBUF_LEN, "[%s, %d]: ",
+                              func, line);
+    int len = vsnprintf(s_ggmlhexagon_log_internal_buf + len_prefix,
+                        GGMLHEXAGON_LOGBUF_LEN - len_prefix, format, args);
+    if (len < (GGMLHEXAGON_LOGBUF_LEN - len_prefix)) {
+        FARF(ALWAYS, "%s\n", s_ggmlhexagon_log_internal_buf);
+    }
+    va_end(args);
+}
+
+static void ggmlhexagon_dump_tensor_elements(const ggml_tensor * tensor) {
+    float value = 0;
+    char tmpbuf[GGMLHEXAGON_LOGBUF_LEN];
+    size_t buflen = 0;
+    if (tensor->type == GGML_TYPE_F32) {
+        memset(tmpbuf, 0, sizeof(tmpbuf));
+        for (int h = 0; h < tensor->ne[3]; h++) {
+            for (int i = 0; i < tensor->ne[2]; i++) {
+                for (int j = 0; j < tensor->ne[1]; j++) {
+                    for (int k = 0; k < tensor->ne[0]; k++) {
+                        //index assuming a contiguous f32 tensor with ne[0] as the innermost dimension
+                        value = ((float *) tensor->data)[((h * tensor->ne[2] + i) * tensor->ne[1] + j) * tensor->ne[0] + k];
+                        buflen += snprintf(tmpbuf + buflen, GGMLHEXAGON_LOGBUF_LEN - buflen, "%-4.2f\t", value);
+                    }
+                    buflen += snprintf(tmpbuf + buflen, GGMLHEXAGON_LOGBUF_LEN - buflen, "\n");
+                }
+            }
+        }
+        GGMLHEXAGON_LOG_DEBUG("\n%s\n", tmpbuf);
+    }
+
+    GGMLHEXAGON_LOG_DEBUG("\n");
+}
+
+static void ggmlhexagon_dump_tensor(const ggml_tensor * tensor, int dump_tensor_data) {
+    GGMLHEXAGON_LOG_DEBUG("ne = %5d x %5d x %5d x %5d , nb = (%5zi, %5zi, %5zi, %5zi)\n",
+                          tensor->ne[0], tensor->ne[1], tensor->ne[2], tensor->ne[3],
+                          tensor->nb[0], tensor->nb[1], tensor->nb[2], tensor->nb[3]);
+
+    if ((1 == dump_tensor_data) && (ggml_nbytes(tensor) < 320)) {
+        ggmlhexagon_dump_tensor_elements(tensor);
+    }
+}
+
+// =================================================================================================
+// section-3: tiny ggml-dsp: a customized ggml on Hexagon cDSP, ported from the original ggml
+// ================================================================================================= +static const struct ggml_type_traits_cpu * ggml_get_type_traits_cpu(enum ggml_type type) { + return &type_traits_cpu[type]; +} + +static void ggml_vec_dot_f32(int n, float * GGML_RESTRICT s, size_t bs, const float * GGML_RESTRICT x, + size_t bx, const float *GGML_RESTRICT y, size_t by, int nrc) { + assert(nrc == 1); + UNUSED(nrc); + UNUSED(bx); + UNUSED(by); + UNUSED(bs); + ggml_float sumf = 0.0; + for (int i = 0; i < n; ++i) { + sumf += (ggml_float) (x[i] * y[i]); + } + *s = sumf; +} + +inline static void ggml_vec_mul_f32 (const int n, float * z, const float * x, const float * y) { + for (int i = 0; i < n; ++i) z[i] = x[i]*y[i]; +} + +inline static void ggml_vec_div_f32 (const int n, float * z, const float * x, const float * y) { + for (int i = 0; i < n; ++i) z[i] = x[i]/y[i]; +} + +inline static void ggml_vec_sub_f32 (const int n, float * z, const float * x, const float * y) { + for (int i = 0; i < n; ++i) z[i] = x[i] - y[i]; +} + +static const struct ggml_type_traits * ggml_get_type_traits(enum ggml_type type) { + return &type_traits[type]; +} + +static int64_t ggml_blck_size(enum ggml_type type) { + return type_traits[type].blck_size; +} + +static size_t ggml_type_size(enum ggml_type type) { + return type_traits[type].type_size; +} + +static size_t ggml_row_size(enum ggml_type type, int64_t ne) { + assert(ne % ggml_blck_size(type) == 0); + return ggml_type_size(type)*ne/ggml_blck_size(type); +} + +static size_t ggml_nbytes(const struct ggml_tensor * tensor) { + size_t nbytes; + const size_t blck_size = ggml_blck_size(tensor->type); + if (blck_size == 1) { + nbytes = ggml_type_size(tensor->type); + for (int i = 0; i < GGML_MAX_DIMS; ++i) { + nbytes += (tensor->ne[i] - 1)*tensor->nb[i]; + } + } + else { + nbytes = tensor->ne[0]*tensor->nb[0]/blck_size; + for (int i = 1; i < GGML_MAX_DIMS; ++i) { + nbytes += (tensor->ne[i] - 1)*tensor->nb[i]; + } + } + + return nbytes; +} + +static size_t ggml_nbytes_pad(const struct ggml_tensor * tensor) { + return GGML_PAD(ggml_nbytes(tensor), GGML_MEM_ALIGN); +} + +static double ggml_type_sizef(enum ggml_type type) { + return ((double)(type_traits[type].type_size))/type_traits[type].blck_size; +} + +static const char * ggml_type_name(enum ggml_type type) { + return type < GGML_TYPE_COUNT ? type_traits[type].type_name : "NONE"; +} + +static bool ggml_is_quantized(enum ggml_type type) { + return type_traits[type].is_quantized; +} + +static bool ggml_is_empty(const struct ggml_tensor * tensor) { + for (int i = 0; i < GGML_MAX_DIMS; ++i) { + if (tensor->ne[i] == 0) { + return true; + } + } + return false; +} + +static bool ggml_can_repeat(const struct ggml_tensor * t0, const struct ggml_tensor * t1) { + static_assert(GGML_MAX_DIMS == 4, "GGML_MAX_DIMS is not 4 - update this function"); + + return ggml_is_empty(t0) ? 
ggml_is_empty(t1) : + (t1->ne[0]%t0->ne[0] == 0) && + (t1->ne[1]%t0->ne[1] == 0) && + (t1->ne[2]%t0->ne[2] == 0) && + (t1->ne[3]%t0->ne[3] == 0); +} + +static bool ggml_are_same_shape(const struct ggml_tensor * t0, const struct ggml_tensor * t1) { + static_assert(GGML_MAX_DIMS == 4, "GGML_MAX_DIMS is not 4 - update this function"); + return + (t0->ne[0] == t1->ne[0]) && + (t0->ne[1] == t1->ne[1]) && + (t0->ne[2] == t1->ne[2]) && + (t0->ne[3] == t1->ne[3]); +} + +static int64_t ggml_nrows(const struct ggml_tensor * tensor) { + static_assert(GGML_MAX_DIMS == 4, "GGML_MAX_DIMS is not 4 - update this function"); + + return tensor->ne[1]*tensor->ne[2]*tensor->ne[3]; +} + +static bool ggml_is_transposed(const struct ggml_tensor * tensor) { + return tensor->nb[0] > tensor->nb[1]; +} + +static bool ggml_is_contiguous_n(const struct ggml_tensor * tensor, int n) { + size_t next_nb = ggml_type_size(tensor->type); + if (tensor->ne[0] != ggml_blck_size(tensor->type) && tensor->nb[0] != next_nb) { + return false; + } + next_nb *= tensor->ne[0]/ggml_blck_size(tensor->type); + for (int i = 1; i < GGML_MAX_DIMS; i++) { + if (tensor->ne[i] != 1) { + if (i > n) { + if (tensor->nb[i] != next_nb) { + return false; + } + next_nb *= tensor->ne[i]; + } else { + // this dimension does not need to be contiguous + next_nb = tensor->ne[i]*tensor->nb[i]; + } + } + } + return true; +} + +static int64_t ggml_nelements(const struct ggml_tensor * tensor) { + static_assert(GGML_MAX_DIMS == 4, "GGML_MAX_DIMS is not 4 - update this function"); + + return tensor->ne[0]*tensor->ne[1]*tensor->ne[2]*tensor->ne[3]; +} + +static bool ggml_is_contiguous_0(const struct ggml_tensor * tensor) { + return ggml_is_contiguous_n(tensor, 0); +} + +static bool ggml_is_contiguous(const struct ggml_tensor * tensor) { + return ggml_is_contiguous_0(tensor); +} + +inline static void ggml_vec_add_f32 (const int n, float * z, const float * x, const float * y) { + for (int i = 0; i < n; ++i) z[i] = x[i] + y[i]; +} + +static void ggml_abort(const char * file, int line, const char * fmt, ...) { + GGMLHEXAGON_LOG_DEBUG("enter ggml_abort"); + abort(); + return; +} + +// FP16 <-> FP32 +static inline float fp32_from_bits(uint32_t w) { + union { + uint32_t as_bits; + float as_value; + } fp32; + fp32.as_bits = w; + return fp32.as_value; +} + +static inline uint32_t fp32_to_bits(float f) { + union { + float as_value; + uint32_t as_bits; + } fp32; + fp32.as_value = f; + return fp32.as_bits; +} + +static inline float ggml_compute_fp16_to_fp32(ggml_fp16_t h) { + const uint32_t w = (uint32_t) h << 16; + const uint32_t sign = w & UINT32_C(0x80000000); + const uint32_t two_w = w + w; + + const uint32_t exp_offset = UINT32_C(0xE0) << 23; +#if (defined(__STDC_VERSION__) && (__STDC_VERSION__ >= 199901L) || defined(__GNUC__) && !defined(__STRICT_ANSI__)) && (!defined(__cplusplus) || __cplusplus >= 201703L) + const float exp_scale = 0x1.0p-112f; +#else + const float exp_scale = fp32_from_bits(UINT32_C(0x7800000)); +#endif + const float normalized_value = fp32_from_bits((two_w >> 4) + exp_offset) * exp_scale; + + const uint32_t magic_mask = UINT32_C(126) << 23; + const float magic_bias = 0.5f; + const float denormalized_value = fp32_from_bits((two_w >> 17) | magic_mask) - magic_bias; + + const uint32_t denormalized_cutoff = UINT32_C(1) << 27; + const uint32_t result = sign | + (two_w < denormalized_cutoff ? 
fp32_to_bits(denormalized_value) : fp32_to_bits(normalized_value)); + return fp32_from_bits(result); +} + +static inline ggml_fp16_t ggml_compute_fp32_to_fp16(float f) { +#if (defined(__STDC_VERSION__) && (__STDC_VERSION__ >= 199901L) || defined(__GNUC__) && !defined(__STRICT_ANSI__)) && (!defined(__cplusplus) || __cplusplus >= 201703L) + const float scale_to_inf = 0x1.0p+112f; + const float scale_to_zero = 0x1.0p-110f; +#else + const float scale_to_inf = fp32_from_bits(UINT32_C(0x77800000)); + const float scale_to_zero = fp32_from_bits(UINT32_C(0x08800000)); +#endif + float base = (fabsf(f) * scale_to_inf) * scale_to_zero; + + const uint32_t w = fp32_to_bits(f); + const uint32_t shl1_w = w + w; + const uint32_t sign = w & UINT32_C(0x80000000); + uint32_t bias = shl1_w & UINT32_C(0xFF000000); + if (bias < UINT32_C(0x71000000)) { + bias = UINT32_C(0x71000000); + } + + base = fp32_from_bits((bias >> 1) + UINT32_C(0x07800000)) + base; + const uint32_t bits = fp32_to_bits(base); + const uint32_t exp_bits = (bits >> 13) & UINT32_C(0x00007C00); + const uint32_t mantissa_bits = bits & UINT32_C(0x00000FFF); + const uint32_t nonsign = exp_bits + mantissa_bits; + return (sign >> 16) | (shl1_w > UINT32_C(0xFF000000) ? UINT16_C(0x7E00) : nonsign); +} + +static inline float ggml_lookup_fp16_to_fp32(ggml_fp16_t f) { + uint16_t s; + memcpy(&s, &f, sizeof(uint16_t)); + return ggml_table_f32_f16[s]; +} + +static inline void ggml_init(void) { + for (int i = 0; i < (1 << 16); ++i) { + union { + uint16_t u16; + ggml_fp16_t fp16; + } u = {i}; + ggml_table_f32_f16[i] = GGML_COMPUTE_FP16_TO_FP32(u.fp16); + } + + //FIXME:HVX multithreading should be utilized in hexagon-kernels + params.ith = 0; + params.nth = 1; + //FIXME:hardcode buffer size + params.wsize = 512 * 1024 * 1024; + params.wdata = (char*)malloc(params.wsize); + GGML_ASSERT(NULL != params.wdata); +} + +static inline void ggml_deinit(void) { + free(params.wdata); + params.wdata = NULL; + params.wsize = 0; +} + +static inline int nearest_int(float fval) { + assert(fabsf(fval) <= 4194303.f); + float val = fval + 12582912.f; + int i; memcpy(&i, &val, sizeof(int)); + return (i & 0x007fffff) - 0x00400000; +} + +static float make_qx_quants(int n, int nmax, const float * GGML_RESTRICT x, int8_t * GGML_RESTRICT L, int rmse_type, + const float * GGML_RESTRICT qw) { + float max = 0; + float amax = 0; + for (int i = 0; i < n; ++i) { + float ax = fabsf(x[i]); + if (ax > amax) { amax = ax; max = x[i]; } + } + if (amax < GROUP_MAX_EPS) { // all zero + for (int i = 0; i < n; ++i) { + L[i] = 0; + } + return 0.f; + } + float iscale = -nmax / max; + if (rmse_type == 0) { + for (int i = 0; i < n; ++i) { + int l = nearest_int(iscale * x[i]); + L[i] = nmax + MAX(-nmax, MIN(nmax-1, l)); + } + return 1/iscale; + } + bool return_early = false; + if (rmse_type < 0) { + rmse_type = -rmse_type; + return_early = true; + } + float sumlx = 0; + float suml2 = 0; +#ifdef HAVE_BUGGY_APPLE_LINKER + // use 'volatile' to prevent unroll and work around a bug in Apple ld64 1015.7 + for (volatile int i = 0; i < n; ++i) { +#else + for (int i = 0; i < n; ++i) { +#endif + int l = nearest_int(iscale * x[i]); + l = MAX(-nmax, MIN(nmax-1, l)); + L[i] = l + nmax; + float w = qw ? qw[i] : rmse_type == 1 ? x[i] * x[i] : rmse_type == 2 ? 1 : rmse_type == 3 ? fabsf(x[i]) : sqrtf(fabsf(x[i])); + sumlx += w*x[i]*l; + suml2 += w*l*l; + } + float scale = suml2 ? sumlx/suml2 : 0.0f; + if (return_early) return suml2 > 0 ? 
0.5f*(scale + 1/iscale) : 1/iscale; + float best = scale * sumlx; + for (int is = -9; is <= 9; ++is) { + if (is == 0) { + continue; + } + iscale = -(nmax + 0.1f*is) / max; + sumlx = suml2 = 0; + for (int i = 0; i < n; ++i) { + int l = nearest_int(iscale * x[i]); + l = MAX(-nmax, MIN(nmax-1, l)); + float w = qw ? qw[i] : rmse_type == 1 ? x[i] * x[i] : rmse_type == 2 ? 1 : rmse_type == 3 ? fabsf(x[i]) : sqrtf(fabsf(x[i])); + sumlx += w*x[i]*l; + suml2 += w*l*l; + } + if (suml2 > 0 && sumlx*sumlx > best*suml2) { + for (int i = 0; i < n; ++i) { + int l = nearest_int(iscale * x[i]); + L[i] = nmax + MAX(-nmax, MIN(nmax-1, l)); + } + scale = sumlx/suml2; best = scale*sumlx; + } + } + return scale; +} + +static void dequantize_row_q6_K(const block_q6_K * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k) { + assert(k % QK_K == 0); + const int64_t nb = k / QK_K; + + for (int i = 0; i < nb; i++) { + const float d = GGML_FP16_TO_FP32(x[i].d); + + const uint8_t * GGML_RESTRICT ql = x[i].ql; + const uint8_t * GGML_RESTRICT qh = x[i].qh; + const int8_t * GGML_RESTRICT sc = x[i].scales; + + for (int n = 0; n < QK_K; n += 128) { + for (int l = 0; l < 32; ++l) { + int is = l/16; + const int8_t q1 = (int8_t)((ql[l + 0] & 0xF) | (((qh[l] >> 0) & 3) << 4)) - 32; + const int8_t q2 = (int8_t)((ql[l + 32] & 0xF) | (((qh[l] >> 2) & 3) << 4)) - 32; + const int8_t q3 = (int8_t)((ql[l + 0] >> 4) | (((qh[l] >> 4) & 3) << 4)) - 32; + const int8_t q4 = (int8_t)((ql[l + 32] >> 4) | (((qh[l] >> 6) & 3) << 4)) - 32; + y[l + 0] = d * sc[is + 0] * q1; + y[l + 32] = d * sc[is + 2] * q2; + y[l + 64] = d * sc[is + 4] * q3; + y[l + 96] = d * sc[is + 6] * q4; + } + y += 128; + ql += 64; + qh += 32; + sc += 8; + } + } +} + +static void quantize_row_q6_K_ref(const float * GGML_RESTRICT x, block_q6_K * GGML_RESTRICT y, int64_t k) { + assert(k % QK_K == 0); + const int64_t nb = k / QK_K; + + int8_t L[QK_K]; + float scales[QK_K/16]; + + for (int i = 0; i < nb; i++) { + + float max_scale = 0; + float max_abs_scale = 0; + + for (int ib = 0; ib < QK_K/16; ++ib) { + + const float scale = make_qx_quants(16, 32, x + 16*ib, L + 16*ib, 1, NULL); + scales[ib] = scale; + + const float abs_scale = fabsf(scale); + if (abs_scale > max_abs_scale) { + max_abs_scale = abs_scale; + max_scale = scale; + } + + } + + if (max_abs_scale < GROUP_MAX_EPS) { + memset(&y[i], 0, sizeof(block_q6_K)); + y[i].d = GGML_FP32_TO_FP16(0.f); + x += QK_K; + continue; + } + + float iscale = -128.f/max_scale; + y[i].d = GGML_FP32_TO_FP16(1/iscale); + for (int ib = 0; ib < QK_K/16; ++ib) { + y[i].scales[ib] = MIN(127, nearest_int(iscale*scales[ib])); + } + + for (int j = 0; j < QK_K/16; ++j) { + float d = GGML_FP16_TO_FP32(y[i].d) * y[i].scales[j]; + if (!d) { + continue; + } + for (int ii = 0; ii < 16; ++ii) { + int l = nearest_int(x[16*j + ii]/d); + l = MAX(-32, MIN(31, l)); + L[16*j + ii] = l + 32; + } + } + + uint8_t * GGML_RESTRICT ql = y[i].ql; + uint8_t * GGML_RESTRICT qh = y[i].qh; + for (int j = 0; j < QK_K; j += 128) { + for (int l = 0; l < 32; ++l) { + const uint8_t q1 = L[j + l + 0] & 0xF; + const uint8_t q2 = L[j + l + 32] & 0xF; + const uint8_t q3 = L[j + l + 64] & 0xF; + const uint8_t q4 = L[j + l + 96] & 0xF; + ql[l+ 0] = q1 | (q3 << 4); + ql[l+32] = q2 | (q4 << 4); + qh[l] = (L[j + l] >> 4) | ((L[j + l + 32] >> 4) << 2) | ((L[j + l + 64] >> 4) << 4) | ((L[j + l + 96] >> 4) << 6); + } + ql += 64; + qh += 32; + } + + x += QK_K; + } +} + +static void quantize_row_q6_K(const float * GGML_RESTRICT x, void * GGML_RESTRICT vy, int64_t k) { + assert(k % QK_K == 
0);
+    block_q6_K * GGML_RESTRICT y = vy;
+    quantize_row_q6_K_ref(x, y, k);
+}
+
+static void ggml_vec_dot_q6_K_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc) {
+    assert(n % QK_K == 0);
+    assert(nrc == 1);
+    UNUSED(nrc);
+    UNUSED(bx);
+    UNUSED(by);
+    UNUSED(bs);
+
+    const block_q6_K * GGML_RESTRICT x = vx;
+    const block_q8_K * GGML_RESTRICT y = vy;
+
+    const int nb = n / QK_K;
+
+    int8_t  aux8[QK_K];
+    int16_t aux16[8];
+    float   sums [8];
+    int32_t aux32[8];
+    memset(sums, 0, 8*sizeof(float));
+
+    float sumf = 0;
+    for (int i = 0; i < nb; ++i) {
+        const uint8_t * GGML_RESTRICT q4 = x[i].ql;
+        const uint8_t * GGML_RESTRICT qh = x[i].qh;
+        const int8_t  * GGML_RESTRICT q8 = y[i].qs;
+        memset(aux32, 0, 8*sizeof(int32_t));
+        int8_t * GGML_RESTRICT a = aux8;
+        for (int j = 0; j < QK_K; j += 128) {
+            for (int l = 0; l < 32; ++l) {
+                a[l +  0] = (int8_t)((q4[l +  0] & 0xF) | (((qh[l] >> 0) & 3) << 4)) - 32;
+                a[l + 32] = (int8_t)((q4[l + 32] & 0xF) | (((qh[l] >> 2) & 3) << 4)) - 32;
+                a[l + 64] = (int8_t)((q4[l +  0] >>  4) | (((qh[l] >> 4) & 3) << 4)) - 32;
+                a[l + 96] = (int8_t)((q4[l + 32] >>  4) | (((qh[l] >> 6) & 3) << 4)) - 32;
+            }
+            a  += 128;
+            q4 += 64;
+            qh += 32;
+        }
+        a = aux8;
+        int is = 0;
+        for (int j = 0; j < QK_K/16; ++j) {
+            int scale = x[i].scales[is++];
+            for (int l = 0; l < 8; ++l) aux16[l] = q8[l] * a[l];
+            for (int l = 0; l < 8; ++l) aux32[l] += scale * aux16[l];
+            q8 += 8; a += 8;
+            for (int l = 0; l < 8; ++l) aux16[l] = q8[l] * a[l];
+            for (int l = 0; l < 8; ++l) aux32[l] += scale * aux16[l];
+            q8 += 8; a += 8;
+        }
+        const float d = GGML_FP16_TO_FP32(x[i].d) * y[i].d;
+        for (int l = 0; l < 8; ++l) sums[l] += d * aux32[l];
+    }
+    for (int l = 0; l < 8; ++l) sumf += sums[l];
+    *s = sumf;
+}
+
+static inline uint64 hexagon_perf_get_time_us(void) {
+    unsigned long long count;
+    asm volatile (" %0 = c31:30 " : "=r"(count));
+    //c31:30 holds the 19.2 MHz QTimer tick count: ticks * 10 / 192 converts to microseconds
+    return (uint64)(count) * 10ull / 192ull;
+}
+
+static void ggml_time_init(void) {
+}
+
+static int64_t ggml_time_ms(void) {
+    return hexagon_perf_get_time_us() / 1000;
+}
+
+static int64_t ggml_time_us(void) {
+    return hexagon_perf_get_time_us();
+}
+
+// =================================================================================================
+// section-4: ggml-hexagon kernel helper functions
+// =================================================================================================
+static int32 g_thread_counts = 1;
+int ggmlop_dsp_open(const char * uri, remote_handle64 * handle) {
+    void * tptr = NULL;
+    FARF(HIGH, "uri %s", uri);
+    //allocate a 1-byte token whose address serves as the remote handle
+    tptr = (void *)malloc(1);
+    *handle = (remote_handle64)tptr;
+    assert(*handle);
+
+    ggml_init();
+
+    GGMLHEXAGON_LOG_DEBUG("api_version = 0x%x", qurt_api_version());
+    GGMLHEXAGON_LOG_DEBUG("hvx units = %d", qurt_hvx_get_units());
+    qurt_arch_version_t vers;
+    qurt_sysenv_get_arch_version(&vers);
+    GGMLHEXAGON_LOG_DEBUG("arch_version=0x%x", vers.arch_version);
+    qurt_sysenv_app_heap_t aheap;
+    qurt_sysenv_get_app_heap(&aheap);
+    GGMLHEXAGON_LOG_DEBUG("aheap.heap_base=0x%x, aheap.heap_limit=0x%x", aheap.heap_base, aheap.heap_limit);
+    qurt_sysenv_max_hthreads_t mhwt;
+    qurt_sysenv_get_max_hw_threads(&mhwt);
+    GGMLHEXAGON_LOG_DEBUG("max hardware threads=%d", mhwt.max_hthreads);
+
+    return 0;
+}
+
+int ggmlop_dsp_close(remote_handle64 handle) {
+    if (handle)
+        free((void*)handle);
+
+    ggml_deinit();
+
+    return 0;
+}
+
+AEEResult ggmlop_dsp_setclocks(remote_handle64 handle, int32 power_level, int32 latency, int32 dcvs_enabled,
int32 thread_counts) { + GGMLHEXAGON_LOG_DEBUG("enter %s", __func__ ); + HAP_power_request_t request; + memset(&request, 0, sizeof(HAP_power_request_t)); + request.type = HAP_power_set_apptype; + request.apptype = HAP_POWER_COMPUTE_CLIENT_CLASS; + + g_thread_counts = thread_counts; + + void * ggmop_ctx = (void*)(handle); + int retval = HAP_power_set(ggmop_ctx, &request); + if (retval) { + GGMLHEXAGON_LOG_DEBUG("failed first power vote"); + return AEE_EFAILED; + } + + //configure clocks & DCVS mode + memset(&request, 0, sizeof(HAP_power_request_t)); + request.type = HAP_power_set_DCVS_v2; + request.dcvs_v2.dcvs_enable = TRUE; + request.dcvs_v2.dcvs_params.target_corner = (HAP_dcvs_voltage_corner_t)power_level; + if (dcvs_enabled) { + request.dcvs_v2.dcvs_params.min_corner = HAP_DCVS_VCORNER_DISABLE; + request.dcvs_v2.dcvs_params.max_corner = HAP_DCVS_VCORNER_DISABLE; + } else { + request.dcvs_v2.dcvs_params.min_corner = request.dcvs_v2.dcvs_params.target_corner; + request.dcvs_v2.dcvs_params.max_corner = request.dcvs_v2.dcvs_params.target_corner; + } + request.dcvs_v2.dcvs_option = HAP_DCVS_V2_PERFORMANCE_MODE; + request.dcvs_v2.set_dcvs_params = TRUE; + request.dcvs_v2.set_latency = TRUE; + request.dcvs_v2.latency = latency; + retval = HAP_power_set(ggmop_ctx, &request); + if (retval) { + GGMLHEXAGON_LOG_DEBUG("failed to vote for performance mode"); + return AEE_EFAILED; + } + + memset(&request, 0, sizeof(HAP_power_request_t)); + request.type = HAP_power_set_HVX; + request.hvx.power_up = TRUE; + retval = HAP_power_set(ggmop_ctx, &request); + if (retval) { + GGMLHEXAGON_LOG_DEBUG("failed to vote for HVX power"); + return AEE_EFAILED; + } + GGMLHEXAGON_LOG_DEBUG("leave %s", __func__ ); + return AEE_SUCCESS; +} + +// ================================================================================================= +// section-5: ggml-hexagon kernel functions: offload ggmlop to cDSP through Hexagon C API and SIMD instructions +// ================================================================================================= +inline static void ggmlhexagon_dsp_add_f32 (const int n, float * z, const float * x, const float * y) { + HVX_Vector * va; + HVX_Vector * vb; + HVX_Vector * vc; + HVX_Vector qf32; + const int FLOATS_PER_VECTOR = 128 / sizeof(float); + const int block = n / FLOATS_PER_VECTOR; + const int left = n % FLOATS_PER_VECTOR; + const int blocks = block * FLOATS_PER_VECTOR; + + if (0 == block) { + for (size_t i = 0; i < n; ++i) + z[i] = x[i] + y[i]; + + return; + } + + if ((((uintptr_t)z | (uintptr_t)x | (uintptr_t)y) % ALIGN_128_BYTE) != 0) { + GGMLHEXAGON_LOG_DEBUG("memaddress mismatch alignment 128 bytes z:%p x:%p y:%p", z, x, y); + for (size_t i = 0; i < n; ++i) + z[i] = x[i] + y[i]; + + return; + } + + va = (HVX_Vector *)x; + vb = (HVX_Vector *)y; + vc = (HVX_Vector *)z; + for (size_t i = 0; i < block; ++i) { + qf32 = Q6_Vqf32_vadd_VsfVsf(*va++, *vb++); + *vc = Q6_Vsf_equals_Vqf32(qf32); + vc++; + } + + if (left > 0) { + for (size_t i = 0; i < left; ++i) + z[i + blocks] = x[i + blocks] + y[i + blocks]; + } +} + +static void ggml_compute_forward_add_f32( + const struct ggml_tensor * src0, + const struct ggml_tensor * src1, + struct ggml_tensor * dst) { + GGMLHEXAGON_LOG_DEBUG("enter %s", __func__ ); + uint64_t start_time = ggml_time_us(); + + memcpy(dst->ne, src1->ne, 16); + memcpy(dst->nb, src1->nb, 16); + ggmlhexagon_dump_tensor(src0, 1); + ggmlhexagon_dump_tensor(src1, 1); + ggmlhexagon_dump_tensor(dst, 1); + + GGML_ASSERT(ggml_can_repeat(src1, src0) && 
ggml_are_same_shape(src0, dst)); + + const int ith = 0; + const int nth = 1; + + const int nr = ggml_nrows(src0); + GGML_TENSOR_BINARY_OP_LOCALS + + GGML_ASSERT( nb0 == sizeof(float)); + GGML_ASSERT(nb00 == sizeof(float)); + + const int dr = (nr + nth - 1)/nth; + const int ir0 = dr*ith; + const int ir1 = MIN(ir0 + dr, nr); + if (nb10 == sizeof(float)) { + for (int ir = ir0; ir < ir1; ++ir) { + // src1 is broadcastable across src0 and dst in i1, i2, i3 + const int32_t i03 = ir/(ne02*ne01); + const int32_t i02 = (ir - i03*ne02*ne01)/ne01; + const int32_t i01 = (ir - i03*ne02*ne01 - i02*ne01); + + const int32_t i13 = i03 % ne13; + const int32_t i12 = i02 % ne12; + const int32_t i11 = i01 % ne11; + const int32_t nr0 = ne00 / ne10; + + float * dst_ptr = (float *) ((char *) dst->data + i03*nb3 + i02*nb2 + i01*nb1 ); + float * src0_ptr = (float *) ((char *) src0->data + i03*nb03 + i02*nb02 + i01*nb01); + float * src1_ptr = (float *) ((char *) src1->data + i13*nb13 + i12*nb12 + i11*nb11); + for (int32_t r = 0; r < nr0; ++r) { + ggmlhexagon_dsp_add_f32(ne10, dst_ptr + r*ne10, src0_ptr + r*ne10, src1_ptr); + } + } + } else { + // src1 is not contiguous + for (int ir = ir0; ir < ir1; ++ir) { + // src1 is broadcastable across src0 and dst in i1, i2, i3 + const int32_t i03 = ir/(ne02*ne01); + const int32_t i02 = (ir - i03*ne02*ne01)/ne01; + const int32_t i01 = (ir - i03*ne02*ne01 - i02*ne01); + + const int32_t i13 = i03 % ne13; + const int32_t i12 = i02 % ne12; + const int32_t i11 = i01 % ne11; + + float * dst_ptr = (float *) ((char *) dst->data + i03*nb3 + i02*nb2 + i01*nb1 ); + float * src0_ptr = (float *) ((char *) src0->data + i03*nb03 + i02*nb02 + i01*nb01); + + for (int32_t i0 = 0; i0 < ne0; ++i0) { + const int32_t i10 = i0 % ne10; + float * src1_ptr = (float *) ((char *) src1->data + i13*nb13 + i12*nb12 + i11*nb11 + i10*nb10); + + dst_ptr[i0] = src0_ptr[i0] + *src1_ptr; + } + } + } + + uint64_t end_time = ggml_time_us(); + uint64_t duration = (end_time - start_time); + GGMLHEXAGON_LOG_DEBUG("duration %llu us", duration); +#if !GGMLHEXAGON_DEBUG + UNUSED(duration); +#endif + + GGMLHEXAGON_LOG_DEBUG("leave %s", __func__ ); +} + +//FIXME: failed with test-backend-ops when disable ion rpc mempool +int ggmlop_dsp_add(remote_handle64 h, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) +{ + GGMLHEXAGON_LOG_DEBUG("enter %s\n", __func__); + switch (src0->type) { + case GGML_TYPE_F32: + { + if (src1->type == GGML_TYPE_F32) { + ggml_compute_forward_add_f32(src0, src1, dst); + } else { + GGML_ABORT("fatal error"); + } + break; + } + default: + { + GGML_ABORT("fatal error"); + } + } + GGMLHEXAGON_LOG_DEBUG("leave %s\n", __func__); + return 0; +} + +static void ggml_compute_forward_mul_mat_one_chunk( + const struct ggml_compute_params * params, + const ggml_tensor * src0, + const ggml_tensor * src1, + struct ggml_tensor * dst, + const enum ggml_type type, + const int32_t num_rows_per_vec_dot, + const int32_t ir0_start, + const int32_t ir0_end, + const int32_t ir1_start, + const int32_t ir1_end) { + ggmlhexagon_dump_tensor(src0, 0); + ggmlhexagon_dump_tensor(src1, 0); + ggmlhexagon_dump_tensor(dst, 0); + + dst->ne[0] = src0->ne[1]; + dst->ne[1] = src1->ne[1]; + dst->ne[2] = src1->ne[2]; + dst->ne[3] = src1->ne[3]; + + dst->nb[0] = ggml_type_size(src1->type); + dst->nb[1] = dst->nb[0] * (dst->ne[0] / ggml_blck_size(src1->type)); + dst->nb[2] = dst->nb[1] * dst->ne[1]; + dst->nb[3] = dst->nb[2] * dst->ne[2]; + ggmlhexagon_dump_tensor(dst, 0); + + GGML_TENSOR_BINARY_OP_LOCALS + + const bool 
src1_cont = ggml_is_contiguous(src1); + + ggml_vec_dot_t const vec_dot = type_traits_cpu[type].vec_dot; + enum ggml_type const vec_dot_type = type_traits_cpu[type].vec_dot_type; + + // broadcast factors + const int32_t r2 = ne12 / ne02; + const int32_t r3 = ne13 / ne03; + + if (ir0_start >= ir0_end || ir1_start >= ir1_end) { + return; + } + + const void * wdata = (src1->type == vec_dot_type) ? src1->data : params->wdata; + const size_t row_size = ggml_row_size(vec_dot_type, ne10); + + assert(ne12 % ne02 == 0); + assert(ne13 % ne03 == 0); + + // block-tiling attempt + const int32_t blck_0 = 16; + const int32_t blck_1 = 16; + + const size_t src1_col_stride = src1_cont || src1->type != vec_dot_type ? row_size : nb11; + + // attempt to reduce false-sharing (does not seem to make a difference) + // 16 * 2, accounting for mmla kernels + float tmp[32]; + + for (int32_t iir1 = ir1_start; iir1 < ir1_end; iir1 += blck_1) { + for (int32_t iir0 = ir0_start; iir0 < ir0_end; iir0 += blck_0) { + for (int32_t ir1 = iir1; ir1 < iir1 + blck_1 && ir1 < ir1_end; ir1 += num_rows_per_vec_dot) { + const int32_t i13 = (ir1 / (ne12 * ne1)); + const int32_t i12 = (ir1 - i13 * ne12 * ne1) / ne1; + const int32_t i11 = (ir1 - i13 * ne12 * ne1 - i12 * ne1); + + // broadcast src0 into src1 + const int32_t i03 = i13 / r3; + const int32_t i02 = i12 / r2; + + const int32_t i1 = i11; + const int32_t i2 = i12; + const int32_t i3 = i13; + + const char * src0_row = (const char*)src0->data + (0 + i02 * nb02 + i03 * nb03); + + // desc: when src1 is not a contiguous memory block we have to calculate the offset using the strides + // if it is, then we have either copied the data to params->wdata and made it contiguous or we are using + // the original src1 data pointer, so we should index using the indices directly + // TODO: this is a bit of a hack, we should probably have a better way to handle this + const char * src1_col = (const char*)wdata + + (src1_cont || src1->type != vec_dot_type + ? (i11 + i12 * ne11 + i13 * ne12 * ne11) * row_size + : (i11 * nb11 + i12 * nb12 + i13 * nb13)); + float * dst_col = (float*)((char*)dst->data + (i1 * nb1 + i2 * nb2 + i3 * nb3)); + + //for (int32_t ir0 = iir0; ir0 < iir0 + blck_0 && ir0 < ir0_end; ++ir0) { + // vec_dot(ne00, &dst_col[ir0], src0_row + ir0*nb01, src1_col); + //} + + for (int32_t ir0 = iir0; ir0 < iir0 + blck_0 && ir0 < ir0_end; ir0 += num_rows_per_vec_dot) { + vec_dot(ne00, &tmp[ir0 - iir0], (num_rows_per_vec_dot > 1 ? 16 : 0), src0_row + ir0 * nb01, (num_rows_per_vec_dot > 1 ? nb01 : 0), src1_col, (num_rows_per_vec_dot > 1 ? 
src1_col_stride : 0), num_rows_per_vec_dot); + } + + for (int cn = 0; cn < num_rows_per_vec_dot; ++cn) { + memcpy(&dst_col[iir0 + cn * nb1 / nb0], tmp + (cn * 16), (MIN(iir0 + blck_0, ir0_end) - iir0) * sizeof(float)); + } + } + } + } +} + +//FIXME: only support fp32 mulmat on cDSP +static int ggmlop_dsp_mulmat_singlethread(remote_handle64 h, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) { + GGMLHEXAGON_LOG_DEBUG("enter %s", __func__ ); + ggmlhexagon_dump_tensor(src0, 0); + ggmlhexagon_dump_tensor(src1, 0); + ggmlhexagon_dump_tensor(dst, 0); + + dst->ne[0] = src0->ne[1]; + dst->ne[1] = src1->ne[1]; + dst->ne[2] = src1->ne[2]; + dst->ne[3] = src1->ne[3]; + + dst->nb[0] = ggml_type_size(src1->type); + dst->nb[1] = dst->nb[0] * (dst->ne[0] / ggml_blck_size(src1->type)); + dst->nb[2] = dst->nb[1] * dst->ne[1]; + dst->nb[3] = dst->nb[2] * dst->ne[2]; + ggmlhexagon_dump_tensor(dst, 0); + + GGML_TENSOR_BINARY_OP_LOCALS + + enum ggml_type const vec_dot_type = type_traits_cpu[src0->type].vec_dot_type; + ggml_from_float_t const from_float = type_traits_cpu[vec_dot_type].from_float; + int32_t const vec_dot_num_rows = type_traits_cpu[src0->type].nrows; + const int ith = 0; + const int nth = 1; + + GGML_ASSERT(ne0 == ne01); + GGML_ASSERT(ne1 == ne11); + GGML_ASSERT(ne2 == ne12); + GGML_ASSERT(ne3 == ne13); + + // we don't support permuted src0 or src1 + GGML_ASSERT(nb00 == ggml_type_size(src0->type)); + GGML_ASSERT(nb10 == ggml_type_size(src1->type)); + + // dst cannot be transposed or permuted + GGML_ASSERT(nb0 == sizeof(float)); + GGML_ASSERT(nb0 <= nb1); + GGML_ASSERT(nb1 <= nb2); + GGML_ASSERT(nb2 <= nb3); + +#if 0 //naive algorithm for fp32, can pass various case in UT + { + //ggml_dump_tensor(src0); + //ggml_dump_tensor(src1); + + float * a = (float*)src0->data; + float * b = (float*)src1->data; + float * c = (float*)dst->data; + int M = src0->ne[1]; + int K = src0->ne[0]; + int N = src1->ne[1]; + float sum = 0; + for (int i = 0; i < M; i++) { + for (int j = 0; j < N; j++) { + sum = 0; + for (int h = 0; h < K; h++) { + sum += a[i * K + h] * b[h * N + j]; + } + c[i * N + j] = sum; + } + } + return 0; + } +#endif + + if (src1->type != vec_dot_type) { + size_t wsize = ggml_row_size(vec_dot_type, ggml_nelements(src1)); + GGML_ASSERT(wsize < params.wsize); + } + + if (src1->type != vec_dot_type) { + char * wdata = params.wdata; + + const size_t nbw0 = ggml_type_size(vec_dot_type); + const size_t nbw1 = ggml_row_size(vec_dot_type, ne10); + const size_t nbw2 = nbw1*ne11; + const size_t nbw3 = nbw2*ne12; + + assert(params.wsize >= ne13*nbw3); + GGML_ASSERT(src1->type == GGML_TYPE_F32); + + for (int64_t i13 = 0; i13 < ne13; ++i13) { + for (int64_t i12 = 0; i12 < ne12; ++i12) { + for (int64_t i11 = 0; i11 < ne11; ++i11) { + size_t bs = ggml_blck_size(vec_dot_type); + int64_t ne10_block_start = (ith * ne10/bs) / nth; + int64_t ne10_block_end = ((ith + 1) * ne10/bs) / nth; + from_float((float *)((char *) src1->data + i13*nb13 + i12*nb12 + i11*nb11 + ne10_block_start*bs*nb10), + (void *) (wdata + i13*nbw3 + i12*nbw2 + i11*nbw1 + ne10_block_start*nbw0), + (ne10_block_end - ne10_block_start) * bs); + } + } + } + } + + // This is the size of the first dimension of the result, so we can iterate that way. (see the ASSERT above, these are the same numbers) + const int32_t nr0 = ne0; + + // This is the size of the rest of the dimensions of the result + const int32_t nr1 = ne1 * ne2 * ne3; + + // Now select a reasonable chunk size. 
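+    // worked example with illustrative sizes: for nr0 = 4096, nr1 = 512 and the default
+    // chunk_size = 16 below, nchunk0 = 256 and nchunk1 = 32, i.e. 8192 chunks of
+    // dr0 = dr1 = 16 rows each; this port runs single-threaded (ith = 0, nth = 1),
+    // so the loop below simply visits all chunks in order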
+    int chunk_size = 16;
+
+    // We need to step up the size if it's small
+    if (nr0 == 1 || nr1 == 1) {
+        chunk_size = 64;
+    }
+
+    // The number of chunks in the 0/1 dim, i.e. CEIL(nr0/chunk_size)
+    int32_t nchunk0 = (nr0 + chunk_size - 1) / chunk_size;
+    int32_t nchunk1 = (nr1 + chunk_size - 1) / chunk_size;
+
+    // If the chunking is poor for the number of threads on this setup, scrap the whole plan. Re-chunk it by thread.
+    // Also, chunking by thread was measured to perform better on NUMA systems. See https://github.com/ggml-org/llama.cpp/pull/6915
+    // In theory, chunking should be just as useful on NUMA and non NUMA systems, but testing disagreed with that.
+    if (nchunk0 * nchunk1 < 4) {
+        // too few chunks to be worth splitting: fall back to a single chunk covering the whole matrix
+        nchunk0 = 1;
+        nchunk1 = 1;
+    }
+
+    // The number of elements in each chunk
+    const int32_t dr0 = (nr0 + nchunk0 - 1) / nchunk0;
+    const int32_t dr1 = (nr1 + nchunk1 - 1) / nchunk1;
+
+    // this port is single-threaded, so the chunks are simply processed in order
+    int current_chunk = 0;
+
+    while (current_chunk < nchunk0 * nchunk1) {
+        const int32_t ith0 = current_chunk % nchunk0;
+        const int32_t ith1 = current_chunk / nchunk0;
+
+        const int32_t ir0_start = dr0 * ith0;
+        const int32_t ir0_end   = MIN(ir0_start + dr0, nr0);
+
+        const int32_t ir1_start = dr1 * ith1;
+        const int32_t ir1_end   = MIN(ir1_start + dr1, nr1);
+
+        // dot kernels can handle 1 row and col at a time, but mmla kernels can process 2 rows and cols
+        int32_t num_rows_per_vec_dot = vec_dot_num_rows;
+
+        // these checks are needed to avoid crossing dim1 boundaries
+        // can be optimized, but the logic would become more complicated, so keeping it like this for simplicity
+        if ((nr0 % 2 != 0) || (ne11 % 2 != 0) || ((ir0_end - ir0_start) % 2 != 0) || ((ir1_end - ir1_start) % 2 != 0)) {
+            num_rows_per_vec_dot = 1;
+        }
+        ggml_compute_forward_mul_mat_one_chunk(&params, src0, src1, dst, src0->type, num_rows_per_vec_dot, ir0_start, ir0_end, ir1_start, ir1_end);
+
+        if (1 >= nchunk0 * nchunk1) {
+            break;
+        }
+        current_chunk++;
+    }
+
+    GGMLHEXAGON_LOG_DEBUG("leave %s", __func__ );
+    return 0;
+}
+
+//TODO: multithreaded mulmat is not implemented yet, this is a stub
+int ggmlop_dsp_mulmat_multithread(remote_handle64 h, const struct dsptensor * src0, const struct dsptensor * src1, dsptensor * dst) {
+    GGMLHEXAGON_LOG_DEBUG("enter %s", __func__ );
+    GGMLHEXAGON_LOG_DEBUG("leave %s", __func__ );
+    return 0;
+}
+
+int ggmlop_dsp_mulmat(remote_handle64 h, const struct dsptensor * src0, const struct dsptensor * src1, dsptensor * dst) {
+    if (g_thread_counts > 1) {
+        return ggmlop_dsp_mulmat_multithread(h, src0, src1, dst);
+    } else {
+        return ggmlop_dsp_mulmat_singlethread(h, src0, src1, dst);
+    }
+}
+
+int ggmlop_dsp_softmax(remote_handle64 h, const dsptensor * src0, const dsptensor * src1, dsptensor * dst) {
+    GGMLHEXAGON_LOG_DEBUG("enter %s", __func__ );
+    GGMLHEXAGON_LOG_DEBUG("leave %s", __func__ );
+    return 0;
+}
+
+int ggmlop_dsp_rmsnorm(remote_handle64 h, const dsptensor * src0, const dsptensor * src1, dsptensor * dst) {
+    GGMLHEXAGON_LOG_DEBUG("enter %s", __func__ );
+    GGMLHEXAGON_LOG_DEBUG("leave %s", __func__ );
+    return 0;
+}
+
+int ggmlop_dsp_pool2d(remote_handle64 h, const dsptensor * src0, const dsptensor * src1, dsptensor * dst) {
+    GGMLHEXAGON_LOG_DEBUG("enter %s", __func__ );
+    GGMLHEXAGON_LOG_DEBUG("leave %s", __func__ );
+    return 0;
+}
diff --git 
a/ggml/src/ggml-hexagon/kernels/ggml-dsp.h b/ggml/src/ggml-hexagon/kernels/ggml-dsp.h new file mode 100644 index 0000000000000..c77e45391205e --- /dev/null +++ b/ggml/src/ggml-hexagon/kernels/ggml-dsp.h @@ -0,0 +1,328 @@ +#pragma once + +#include +#include +#include +#include +#include +#include + +#ifdef __cplusplus +extern "C" { +#endif + +#define GGML_MAX_DIMS 4 + +#define ALIGN_128_BYTE 128 + +#define GGML_UNUSED(x) (void)(x) + +#define UNUSED GGML_UNUSED + +#define GGML_PAD(x, n) (((x) + (n) - 1) & ~((n) - 1)) + +#define GGML_ABORT(...) ggml_abort(__FILE__, __LINE__, __VA_ARGS__) + +#define GGML_ASSERT(x) if (!(x)) GGML_ABORT("GGML_ASSERT(%s) failed", #x) + +#define MIN(a, b) ((a) < (b) ? (a) : (b)) +#define MAX(a, b) ((a) > (b) ? (a) : (b)) + +#if UINTPTR_MAX == 0xFFFFFFFF +#define GGML_MEM_ALIGN 4 +#else +#define GGML_MEM_ALIGN 16 +#endif + +#define GGML_RESTRICT + +#define static_assert(a, b) do { } while (0) + +#define GROUP_MAX_EPS 1e-15f + +// QK = number of values after dequantization +// QK_K = super-block size +#define QK_K 256 +#define K_SCALE_SIZE 12 + +#define GGML_COMPUTE_FP16_TO_FP32(x) ggml_compute_fp16_to_fp32(x) +#define GGML_COMPUTE_FP32_TO_FP16(x) ggml_compute_fp32_to_fp16(x) +#define GGML_FP32_TO_FP16(x) GGML_COMPUTE_FP32_TO_FP16(x) +#define GGML_FP16_TO_FP32(x) ggml_lookup_fp16_to_fp32(x) + +//NPU performance will be slower when enable GGMLHEXAGON_DEBUG +#ifdef NDEBUG +#define GGMLHEXAGON_DEBUG 0 +#else +#define GGMLHEXAGON_DEBUG 1 +#endif + +#define GGMLHEXAGON_LOGBUF_LEN 4096 +#define GGMLHEXAGON_TMPBUF_LEN 256 +#if GGMLHEXAGON_DEBUG +#define GGMLHEXAGON_LOG_DEBUG(...) ggmlhexagon_log_internal(GGMLHEXAGON_LOG_LEVEL_DEBUG, __FILE__, __FUNCTION__, __LINE__, __VA_ARGS__) +#else +#define GGMLHEXAGON_LOG_DEBUG(...) +#endif + +#define GGML_TENSOR_LOCALS_1(type, prefix, pointer, array) \ + const type prefix##0 = (pointer)->array[0]; \ + GGML_UNUSED(prefix##0); +#define GGML_TENSOR_LOCALS_2(type, prefix, pointer, array) \ + GGML_TENSOR_LOCALS_1 (type, prefix, pointer, array) \ + const type prefix##1 = (pointer)->array[1]; \ + GGML_UNUSED(prefix##1); +#define GGML_TENSOR_LOCALS_3(type, prefix, pointer, array) \ + GGML_TENSOR_LOCALS_2 (type, prefix, pointer, array) \ + const type prefix##2 = (pointer)->array[2]; \ + GGML_UNUSED(prefix##2); +#define GGML_TENSOR_LOCALS(type, prefix, pointer, array) \ + GGML_TENSOR_LOCALS_3 (type, prefix, pointer, array) \ + const type prefix##3 = (pointer)->array[3]; \ + GGML_UNUSED(prefix##3); + +#define GGML_TENSOR_UNARY_OP_LOCALS \ + GGML_TENSOR_LOCALS(int64_t, ne0, src0, ne) \ + GGML_TENSOR_LOCALS(size_t, nb0, src0, nb) \ + GGML_TENSOR_LOCALS(int64_t, ne, dst, ne) \ + GGML_TENSOR_LOCALS(size_t, nb, dst, nb) + +#define GGML_TENSOR_BINARY_OP_LOCALS \ + GGML_TENSOR_LOCALS(int64_t, ne0, src0, ne) \ + GGML_TENSOR_LOCALS(size_t, nb0, src0, nb) \ + GGML_TENSOR_LOCALS(int64_t, ne1, src1, ne) \ + GGML_TENSOR_LOCALS(size_t, nb1, src1, nb) \ + GGML_TENSOR_LOCALS(int64_t, ne, dst, ne) \ + GGML_TENSOR_LOCALS(size_t, nb, dst, nb) + +#define GGML_TENSOR_BINARY_OP_LOCALS01 \ + GGML_TENSOR_LOCALS(int64_t, ne0, src0, ne) \ + GGML_TENSOR_LOCALS(size_t, nb0, src0, nb) \ + GGML_TENSOR_LOCALS(int64_t, ne1, src1, ne) \ + GGML_TENSOR_LOCALS(size_t, nb1, src1, nb) + +enum ggmlhexagon_log_level { + GGMLHEXAGON_LOG_LEVEL_NONE = 0, + GGMLHEXAGON_LOG_LEVEL_DEBUG = 1, + GGMLHEXAGON_LOG_LEVEL_INFO = 2, + GGMLHEXAGON_LOG_LEVEL_WARN = 3, + GGMLHEXAGON_LOG_LEVEL_ERROR = 4, + GGMLHEXAGON_LOG_LEVEL_CONT = 5, +}; + +enum ggml_type { + GGML_TYPE_F32 = 0, + GGML_TYPE_F16 = 
1, + GGML_TYPE_Q4_0 = 2, + GGML_TYPE_Q4_1 = 3, + // GGML_TYPE_Q4_2 = 4, support has been removed + // GGML_TYPE_Q4_3 = 5, support has been removed + GGML_TYPE_Q5_0 = 6, + GGML_TYPE_Q5_1 = 7, + GGML_TYPE_Q8_0 = 8, + GGML_TYPE_Q8_1 = 9, + GGML_TYPE_Q2_K = 10, + GGML_TYPE_Q3_K = 11, + GGML_TYPE_Q4_K = 12, + GGML_TYPE_Q5_K = 13, + GGML_TYPE_Q6_K = 14, + GGML_TYPE_Q8_K = 15, + GGML_TYPE_IQ2_XXS = 16, + GGML_TYPE_IQ2_XS = 17, + GGML_TYPE_IQ3_XXS = 18, + GGML_TYPE_IQ1_S = 19, + GGML_TYPE_IQ4_NL = 20, + GGML_TYPE_IQ3_S = 21, + GGML_TYPE_IQ2_S = 22, + GGML_TYPE_IQ4_XS = 23, + GGML_TYPE_I8 = 24, + GGML_TYPE_I16 = 25, + GGML_TYPE_I32 = 26, + GGML_TYPE_I64 = 27, + GGML_TYPE_F64 = 28, + GGML_TYPE_IQ1_M = 29, + GGML_TYPE_BF16 = 30, + // GGML_TYPE_Q4_0_4_4 = 31, support has been removed from gguf files + // GGML_TYPE_Q4_0_4_8 = 32, + // GGML_TYPE_Q4_0_8_8 = 33, + GGML_TYPE_TQ1_0 = 34, + GGML_TYPE_TQ2_0 = 35, + // GGML_TYPE_IQ4_NL_4_4 = 36, + // GGML_TYPE_IQ4_NL_4_8 = 37, + // GGML_TYPE_IQ4_NL_8_8 = 38, + GGML_TYPE_COUNT = 39, +}; + +typedef double ggml_float; +typedef uint16_t ggml_fp16_t; +typedef uint16_t ggml_half; +typedef uint32_t ggml_half2; +typedef void (*ggml_vec_dot_t) (int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT x, size_t bx, + const void * GGML_RESTRICT y, size_t by, int nrc); +typedef void (*ggml_from_float_t)(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int64_t k); + +typedef void (*ggml_to_float_t) (const void * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k); +typedef void (*ggml_from_float_t)(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int64_t k); + +struct ggml_compute_params { + // ith = thread index, nth = number of threads + int ith, nth; + + // work buffer for all threads + size_t wsize; + void * wdata; +}; + +#define QK4_0 32 +typedef struct { + ggml_half d; // delta + uint8_t qs[QK4_0 / 2]; // nibbles / quants +} block_q4_0; + +#define QK4_1 32 +typedef struct { + union { + struct { + ggml_half d; // delta + ggml_half m; // min + } GGML_COMMON_AGGR_S; + ggml_half2 dm; + } GGML_COMMON_AGGR_U; + uint8_t qs[QK4_1 / 2]; // nibbles / quants +} block_q4_1; + +#define QK5_0 32 +typedef struct { + ggml_half d; // delta + uint8_t qh[4]; // 5-th bit of quants + uint8_t qs[QK5_0 / 2]; // nibbles / quants +} block_q5_0; + +#define QK5_1 32 +typedef struct { + union { + struct { + ggml_half d; // delta + ggml_half m; // min + } GGML_COMMON_AGGR_S; + ggml_half2 dm; + } GGML_COMMON_AGGR_U; + uint8_t qh[4]; // 5-th bit of quants + uint8_t qs[QK5_1 / 2]; // nibbles / quants +} block_q5_1; + +#define QK8_0 32 +typedef struct { + ggml_half d; // delta + int8_t qs[QK8_0]; // quants +} block_q8_0; + +#define QK8_1 32 +typedef struct { + union { + struct { + ggml_half d; // delta + ggml_half s; // d * sum(qs[i]) + } GGML_COMMON_AGGR_S; + ggml_half2 ds; + } GGML_COMMON_AGGR_U; + int8_t qs[QK8_1]; // quants +} block_q8_1; + +// 2-bit quantization +// weight is represented as x = a * q + b +// 16 blocks of 16 elements each +// Effectively 2.625 bits per weight +typedef struct { + uint8_t scales[QK_K/16]; // scales and mins, quantized with 4 bits + uint8_t qs[QK_K/4]; // quants + union { + struct { + ggml_half d; // super-block scale for quantized scales + ggml_half dmin; // super-block scale for quantized mins + } GGML_COMMON_AGGR_S; + ggml_half2 dm; + } GGML_COMMON_AGGR_U; +} block_q2_K; + +// 3-bit quantization +// weight is represented as x = a * q +// 16 blocks of 16 elements each +// Effectively 3.4375 bits per weight +typedef struct { + uint8_t 
hmask[QK_K/8]; // quants - high bit + uint8_t qs[QK_K/4]; // quants - low 2 bits + uint8_t scales[12]; // scales, quantized with 6 bits + ggml_half d; // super-block scale +} block_q3_K; + +// 4-bit quantization +// 8 blocks of 32 elements each +// weight is represented as x = a * q + b +// Effectively 4.5 bits per weight +typedef struct { + union { + struct { + ggml_half d; // super-block scale for quantized scales + ggml_half dmin; // super-block scale for quantized mins + } GGML_COMMON_AGGR_S; + ggml_half2 dm; + } GGML_COMMON_AGGR_U; + uint8_t scales[K_SCALE_SIZE]; // scales and mins, quantized with 6 bits + uint8_t qs[QK_K/2]; // 4--bit quants +} block_q4_K; + +// 5-bit quantization +// 8 blocks of 32 elements each +// weight is represented as x = a * q + b +// Effectively 5.5 bits per weight +typedef struct { + union { + struct { + ggml_half d; // super-block scale for quantized scales + ggml_half dmin; // super-block scale for quantized mins + } GGML_COMMON_AGGR_S; + ggml_half2 dm; + } GGML_COMMON_AGGR_U; + uint8_t scales[K_SCALE_SIZE]; // scales and mins, quantized with 6 bits + uint8_t qh[QK_K/8]; // quants, high bit + uint8_t qs[QK_K/2]; // quants, low 4 bits +} block_q5_K; + +// 6-bit quantization +// weight is represented as x = a * q +// 16 blocks of 16 elements each +// Effectively 6.5625 bits per weight +typedef struct { + uint8_t ql[QK_K/2]; // quants, lower 4 bits + uint8_t qh[QK_K/4]; // quants, upper 2 bits + int8_t scales[QK_K/16]; // scales, quantized with 8 bits + ggml_half d; // super-block scale +} block_q6_K; + +typedef struct { + float d; // delta + int8_t qs[QK_K]; // quants + int16_t bsums[QK_K/16]; // sum of quants in groups of 16 +} block_q8_K; + +struct ggml_type_traits { + const char * type_name; + int64_t blck_size; + int64_t blck_size_interleave; // interleave elements in blocks + size_t type_size; + bool is_quantized; + ggml_to_float_t to_float; + ggml_from_float_t from_float_ref; +}; + +struct ggml_type_traits_cpu { + ggml_from_float_t from_float; + ggml_vec_dot_t vec_dot; + enum ggml_type vec_dot_type; + int64_t nrows; // number of rows to process simultaneously +}; + +#ifdef __cplusplus +} +#endif diff --git a/ggml/src/ggml-hexagon/kernels/ggmlop_ap_skel.c b/ggml/src/ggml-hexagon/kernels/ggmlop_ap_skel.c new file mode 100644 index 0000000000000..b0a660ce96a79 --- /dev/null +++ b/ggml/src/ggml-hexagon/kernels/ggmlop_ap_skel.c @@ -0,0 +1,463 @@ +//qidl copyright +//qidl nested=false +#include "ggmlop_ap_skel.h" +#include +#ifndef _WIN32 +#include "HAP_farf.h" +#include +#endif //_WIN32 for HAP_farf +#ifndef _ALLOCATOR_H +#define _ALLOCATOR_H + +#include +#include + +typedef struct _heap _heap; +struct _heap { + _heap* pPrev; + const char* loc; + uint64_t buf; +}; + +typedef struct _allocator { + _heap* pheap; + uint8_t* stack; + uint8_t* stackEnd; + int nSize; +} _allocator; + +_ATTRIBUTE_UNUSED +static __inline int _heap_alloc(_heap** ppa, const char* loc, int size, void** ppbuf) { + _heap* pn = 0; + pn = MALLOC((size_t)size + sizeof(_heap) - sizeof(uint64_t)); + if(pn != 0) { + pn->pPrev = *ppa; + pn->loc = loc; + *ppa = pn; + *ppbuf = (void*)&(pn->buf); + return 0; + } else { + return -1; + } +} +#define _ALIGN_SIZE(x, y) (((x) + (y-1)) & ~(y-1)) + +_ATTRIBUTE_UNUSED +static __inline int _allocator_alloc(_allocator* me, + const char* loc, + int size, + unsigned int al, + void** ppbuf) { + if(size < 0) { + return -1; + } else if (size == 0) { + *ppbuf = 0; + return 0; + } + if((_ALIGN_SIZE((uintptr_t)me->stackEnd, al) + (size_t)size) < 
(uintptr_t)me->stack + (size_t)me->nSize) { + *ppbuf = (uint8_t*)_ALIGN_SIZE((uintptr_t)me->stackEnd, al); + me->stackEnd = (uint8_t*)_ALIGN_SIZE((uintptr_t)me->stackEnd, al) + size; + return 0; + } else { + return _heap_alloc(&me->pheap, loc, size, ppbuf); + } +} + +_ATTRIBUTE_UNUSED +static __inline void _allocator_deinit(_allocator* me) { + _heap* pa = me->pheap; + while(pa != 0) { + _heap* pn = pa; + const char* loc = pn->loc; + (void)loc; + pa = pn->pPrev; + FREE(pn); + } +} + +_ATTRIBUTE_UNUSED +static __inline void _allocator_init(_allocator* me, uint8_t* stack, int stackSize) { + me->stack = stack; + me->stackEnd = stack + stackSize; + me->nSize = stackSize; + me->pheap = 0; +} + + +#endif // _ALLOCATOR_H + +#ifndef SLIM_H +#define SLIM_H + +#include + +//a C data structure for the idl types that can be used to implement +//static and dynamic language bindings fairly efficiently. +// +//the goal is to have a minimal ROM and RAM footprint and without +//doing too many allocations. A good way to package these things seemed +//like the module boundary, so all the idls within one module can share +//all the type references. + + +#define PARAMETER_IN 0x0 +#define PARAMETER_OUT 0x1 +#define PARAMETER_INOUT 0x2 +#define PARAMETER_ROUT 0x3 +#define PARAMETER_INROUT 0x4 + +//the types that we get from idl +#define TYPE_OBJECT 0x0 +#define TYPE_INTERFACE 0x1 +#define TYPE_PRIMITIVE 0x2 +#define TYPE_ENUM 0x3 +#define TYPE_STRING 0x4 +#define TYPE_WSTRING 0x5 +#define TYPE_STRUCTURE 0x6 +#define TYPE_UNION 0x7 +#define TYPE_ARRAY 0x8 +#define TYPE_SEQUENCE 0x9 + +//these require the pack/unpack to recurse +//so it's a hint to those languages that can optimize in cases where +//recursion isn't necessary. +#define TYPE_COMPLEX_STRUCTURE (0x10 | TYPE_STRUCTURE) +#define TYPE_COMPLEX_UNION (0x10 | TYPE_UNION) +#define TYPE_COMPLEX_ARRAY (0x10 | TYPE_ARRAY) +#define TYPE_COMPLEX_SEQUENCE (0x10 | TYPE_SEQUENCE) + + +typedef struct Type Type; + +#define INHERIT_TYPE\ + int32_t nativeSize; /*in the simple case its the same as wire size and alignment*/\ + union {\ + struct {\ + const uintptr_t p1;\ + const uintptr_t p2;\ + } _cast;\ + struct {\ + uint32_t iid;\ + uint32_t bNotNil;\ + } object;\ + struct {\ + const Type *arrayType;\ + int32_t nItems;\ + } array;\ + struct {\ + const Type *seqType;\ + int32_t nMaxLen;\ + } seqSimple; \ + struct {\ + uint32_t bFloating;\ + uint32_t bSigned;\ + } prim; \ + const SequenceType* seqComplex;\ + const UnionType *unionType;\ + const StructType *structType;\ + int32_t stringMaxLen;\ + uint8_t bInterfaceNotNil;\ + } param;\ + uint8_t type;\ + uint8_t nativeAlignment\ + +typedef struct UnionType UnionType; +typedef struct StructType StructType; +typedef struct SequenceType SequenceType; +struct Type { + INHERIT_TYPE; +}; + +struct SequenceType { + const Type * seqType; + uint32_t nMaxLen; + uint32_t inSize; + uint32_t routSizePrimIn; + uint32_t routSizePrimROut; +}; + +//byte offset from the start of the case values for +//this unions case value array. 
it MUST be aligned +//at the alignment requrements for the descriptor +// +//if negative it means that the unions cases are +//simple enumerators, so the value read from the descriptor +//can be used directly to find the correct case +typedef union CaseValuePtr CaseValuePtr; +union CaseValuePtr { + const uint8_t* value8s; + const uint16_t* value16s; + const uint32_t* value32s; + const uint64_t* value64s; +}; + +//these are only used in complex cases +//so I pulled them out of the type definition as references to make +//the type smaller +struct UnionType { + const Type *descriptor; + uint32_t nCases; + const CaseValuePtr caseValues; + const Type * const *cases; + int32_t inSize; + int32_t routSizePrimIn; + int32_t routSizePrimROut; + uint8_t inAlignment; + uint8_t routAlignmentPrimIn; + uint8_t routAlignmentPrimROut; + uint8_t inCaseAlignment; + uint8_t routCaseAlignmentPrimIn; + uint8_t routCaseAlignmentPrimROut; + uint8_t nativeCaseAlignment; + uint8_t bDefaultCase; +}; + +struct StructType { + uint32_t nMembers; + const Type * const *members; + int32_t inSize; + int32_t routSizePrimIn; + int32_t routSizePrimROut; + uint8_t inAlignment; + uint8_t routAlignmentPrimIn; + uint8_t routAlignmentPrimROut; +}; + +typedef struct Parameter Parameter; +struct Parameter { + INHERIT_TYPE; + uint8_t mode; + uint8_t bNotNil; +}; + +#define SLIM_IFPTR32(is32,is64) (sizeof(uintptr_t) == 4 ? (is32) : (is64)) +#define SLIM_SCALARS_IS_DYNAMIC(u) (((u) & 0x00ffffff) == 0x00ffffff) + +typedef struct Method Method; +struct Method { + uint32_t uScalars; //no method index + int32_t primInSize; + int32_t primROutSize; + int maxArgs; + int numParams; + const Parameter * const *params; + uint8_t primInAlignment; + uint8_t primROutAlignment; +}; + +typedef struct Interface Interface; + +struct Interface { + int nMethods; + const Method * const *methodArray; + int nIIds; + const uint32_t *iids; + const uint16_t* methodStringArray; + const uint16_t* methodStrings; + const char* strings; +}; + + +#endif //SLIM_H + + +#ifndef _GGMLOP_SLIM_H +#define _GGMLOP_SLIM_H +#include + +#ifndef __QAIC_SLIM +#define __QAIC_SLIM(ff) ff +#endif +#ifndef __QAIC_SLIM_EXPORT +#define __QAIC_SLIM_EXPORT +#endif + +static const Type types[5]; +static const Type* const typeArrays[7] = {&(types[0]),&(types[1]),&(types[1]),&(types[0]),&(types[2]),&(types[0]),&(types[3])}; +static const StructType structTypes[1] = {{0x7,&(typeArrays[0]),0x70,0x4,0x6c,0x4,0x4,0x4}}; +static const Type types[5] = {{0x4,{{(const uintptr_t)0,(const uintptr_t)1}}, 2,0x4},{0x10,{{(const uintptr_t)&(types[0]),(const uintptr_t)0x4}}, 8,0x4},{0x40,{{(const uintptr_t)&(types[0]),(const uintptr_t)0x10}}, 8,0x4},{SLIM_IFPTR32(0x8,0x10),{{(const uintptr_t)&(types[4]),(const uintptr_t)0x0}}, 9,SLIM_IFPTR32(0x4,0x8)},{0x4,{{(const uintptr_t)0,(const uintptr_t)1}}, 2,0x4}}; +static const Parameter parameters[6] = {{SLIM_IFPTR32(0x8,0x10),{{(const uintptr_t)0x0,0}}, 4,SLIM_IFPTR32(0x4,0x8),0,0},{SLIM_IFPTR32(0x4,0x8),{{(const uintptr_t)0xdeadc0de,(const uintptr_t)0}}, 0,SLIM_IFPTR32(0x4,0x8),3,0},{SLIM_IFPTR32(0x4,0x8),{{(const uintptr_t)0xdeadc0de,(const uintptr_t)0}}, 0,SLIM_IFPTR32(0x4,0x8),0,0},{0x4,{{(const uintptr_t)0,(const uintptr_t)1}}, 2,0x4,0,0},{SLIM_IFPTR32(0x74,0x80),{{(const uintptr_t)&(structTypes[0]),0}}, 22,SLIM_IFPTR32(0x4,0x8),0,0},{SLIM_IFPTR32(0x74,0x80),{{(const uintptr_t)&(structTypes[0]),0}}, 22,SLIM_IFPTR32(0x4,0x8),3,0}}; +static const Parameter* const parameterArrays[9] = 
{(&(parameters[4])),(&(parameters[4])),(&(parameters[5])),(&(parameters[3])),(&(parameters[3])),(&(parameters[3])),(&(parameters[0])),(&(parameters[1])),(&(parameters[2]))}; +static const Method methods[4] = {{REMOTE_SCALARS_MAKEX(0,0,0x2,0x0,0x0,0x1),0x4,0x0,2,2,(&(parameterArrays[6])),0x4,0x1},{REMOTE_SCALARS_MAKEX(0,0,0x0,0x0,0x1,0x0),0x0,0x0,1,1,(&(parameterArrays[8])),0x1,0x0},{REMOTE_SCALARS_MAKEX(0,0,0x1,0x0,0x0,0x0),0xc,0x0,3,3,(&(parameterArrays[3])),0x4,0x0},{REMOTE_SCALARS_MAKEX(0,0,0x3,0x2,0x0,0x0),0xe4,0x6c,3,3,(&(parameterArrays[0])),0x4,0x4}}; +static const Method* const methodArrays[8] = {&(methods[0]),&(methods[1]),&(methods[2]),&(methods[3]),&(methods[3]),&(methods[3]),&(methods[3]),&(methods[3])}; +static const char strings[167] = "dsp_setclocks\0dsp_rmsnorm\0dsp_softmax\0dcvs_enable\0power_level\0dsp_pool2d\0dsp_mulmat\0op_params\0dsp_add\0latency\0flags\0close\0src1\0data\0type\0src0\0open\0dst\0uri\0op\0nb\0ne\0h\0"; +static const uint16_t methodStrings[134] = {62,137,132,161,158,155,84,110,127,122,132,161,158,155,84,110,127,147,132,161,158,155,84,110,127,14,137,132,161,158,155,84,110,127,122,132,161,158,155,84,110,127,147,132,161,158,155,84,110,127,26,137,132,161,158,155,84,110,127,122,132,161,158,155,84,110,127,147,132,161,158,155,84,110,127,73,137,132,161,158,155,84,110,127,122,132,161,158,155,84,110,127,147,132,161,158,155,84,110,127,94,137,132,161,158,155,84,110,127,122,132,161,158,155,84,110,127,147,132,161,158,155,84,110,127,0,50,102,38,142,151,164,116,164}; +static const uint16_t methodStringsArrays[8] = {129,132,125,100,75,50,25,0}; +__QAIC_SLIM_EXPORT const Interface __QAIC_SLIM(ggmlop_slim) = {8,&(methodArrays[0]),0,0,&(methodStringsArrays [0]),methodStrings,strings}; +#endif //_GGMLOP_SLIM_H + + +#ifdef __cplusplus +extern "C" { +#endif +__QAIC_STUB_EXPORT int __QAIC_STUB(ggmlop_dsp_open)(const char* uri, remote_handle64* h) __QAIC_STUB_ATTRIBUTE { + return __QAIC_REMOTE(remote_handle64_open)(uri, h); +} +__QAIC_STUB_EXPORT int __QAIC_STUB(ggmlop_dsp_close)(remote_handle64 h) __QAIC_STUB_ATTRIBUTE { + return __QAIC_REMOTE(remote_handle64_close)(h); +} +static __inline int _stub_method(remote_handle64 _handle, uint32_t _mid, uint32_t _in0[1], uint32_t _in1[1], uint32_t _in2[1], uint32_t _in3[1]) { + remote_arg _pra[1] = {0}; + uint32_t _primIn[4]= {0}; + int _nErr = 0; + _pra[0].buf.pv = (void*)_primIn; + _pra[0].buf.nLen = sizeof(_primIn); + _COPY(_primIn, 0, _in0, 0, 4); + _COPY(_primIn, 4, _in1, 0, 4); + _COPY(_primIn, 8, _in2, 0, 4); + _COPY(_primIn, 12,_in3, 0, 4); + _TRY_FARF(_nErr, __QAIC_REMOTE(remote_handle64_invoke)(_handle, REMOTE_SCALARS_MAKEX(0, _mid, 1, 0, 0, 0), _pra)); + _CATCH_FARF(_nErr) { + _QAIC_FARF(RUNTIME_ERROR, "ERROR 0x%x: handle=0x%"PRIx64", scalar=0x%x, method ID=%d: %s failed\n", _nErr , _handle, REMOTE_SCALARS_MAKEX(0, _mid, 1, 0, 0, 0), _mid, __func__); + } + return _nErr; +} +__QAIC_STUB_EXPORT AEEResult __QAIC_STUB(ggmlop_dsp_setclocks)(remote_handle64 _handle, int32 power_level, int32 latency, int32 dcvs_enable, int32 threads) __QAIC_STUB_ATTRIBUTE { + uint32_t _mid = 2; + return _stub_method(_handle, _mid, (uint32_t*)&power_level, (uint32_t*)&latency, (uint32_t*)&dcvs_enable, (uint32_t*)&threads); +} +static __inline int _stub_unpack(_ATTRIBUTE_UNUSED remote_arg* _praROutPost, _ATTRIBUTE_UNUSED remote_arg* _ppraROutPost[1], _ATTRIBUTE_UNUSED void* _primROut, _ATTRIBUTE_UNUSED uint32_t _rout0[1], _ATTRIBUTE_UNUSED uint32_t _rout1[4], _ATTRIBUTE_UNUSED uint32_t _rout2[4], _ATTRIBUTE_UNUSED uint32_t _rout3[1], 
_ATTRIBUTE_UNUSED uint32_t _rout4[16], _ATTRIBUTE_UNUSED uint32_t _rout5[1], _ATTRIBUTE_UNUSED char* _rout6[1], _ATTRIBUTE_UNUSED uint32_t _rout6Len[1]) { + int _nErr = 0; + remote_arg* _praROutPostStart = _praROutPost; + remote_arg** _ppraROutPostStart = _ppraROutPost; + _ppraROutPost = &_praROutPost; + _COPY(_rout0, 0, _primROut, 0, 4); + _COPY(_rout1, 0, _primROut, 4, 16); + _COPY(_rout2, 0, _primROut, 20, 16); + _COPY(_rout3, 0, _primROut, 36, 4); + _COPY(_rout4, 0, _primROut, 40, 64); + _COPY(_rout5, 0, _primROut, 104, 4); + _ppraROutPostStart[0] += (_praROutPost - _praROutPostStart) +1; + return _nErr; +} +static __inline int _stub_pack(_ATTRIBUTE_UNUSED _allocator* _al, _ATTRIBUTE_UNUSED remote_arg* _praIn, _ATTRIBUTE_UNUSED remote_arg* _ppraIn[1], _ATTRIBUTE_UNUSED remote_arg* _praROut, _ATTRIBUTE_UNUSED remote_arg* _ppraROut[1], _ATTRIBUTE_UNUSED remote_arg* _praHIn, _ATTRIBUTE_UNUSED remote_arg* _ppraHIn[1], _ATTRIBUTE_UNUSED remote_arg* _praHROut, _ATTRIBUTE_UNUSED remote_arg* _ppraHROut[1], _ATTRIBUTE_UNUSED void* _primIn, _ATTRIBUTE_UNUSED void* _primROut, _ATTRIBUTE_UNUSED uint32_t _rout0[1], _ATTRIBUTE_UNUSED uint32_t _rout1[4], _ATTRIBUTE_UNUSED uint32_t _rout2[4], _ATTRIBUTE_UNUSED uint32_t _rout3[1], _ATTRIBUTE_UNUSED uint32_t _rout4[16], _ATTRIBUTE_UNUSED uint32_t _rout5[1], _ATTRIBUTE_UNUSED char* _rout6[1], _ATTRIBUTE_UNUSED uint32_t _rout6Len[1]) { + int _nErr = 0; + remote_arg* _praInStart = _praIn; + remote_arg** _ppraInStart = _ppraIn; + remote_arg* _praROutStart = _praROut; + remote_arg** _ppraROutStart = _ppraROut; + _ppraIn = &_praIn; + _ppraROut = &_praROut; + _COPY(_primIn, 0, _rout6Len, 0, 4); + _praROut[0].buf.pv = _rout6[0]; + _praROut[0].buf.nLen = (4 * _rout6Len[0]); + _ppraInStart[0] += (_praIn - _praInStart) + 0; + _ppraROutStart[0] += (_praROut - _praROutStart) +1; + return _nErr; +} +static __inline int _stub_pack_1(_ATTRIBUTE_UNUSED _allocator* _al, _ATTRIBUTE_UNUSED remote_arg* _praIn, _ATTRIBUTE_UNUSED remote_arg* _ppraIn[1], _ATTRIBUTE_UNUSED remote_arg* _praROut, _ATTRIBUTE_UNUSED remote_arg* _ppraROut[1], _ATTRIBUTE_UNUSED remote_arg* _praHIn, _ATTRIBUTE_UNUSED remote_arg* _ppraHIn[1], _ATTRIBUTE_UNUSED remote_arg* _praHROut, _ATTRIBUTE_UNUSED remote_arg* _ppraHROut[1], _ATTRIBUTE_UNUSED void* _primIn, _ATTRIBUTE_UNUSED void* _primROut, _ATTRIBUTE_UNUSED uint32_t _in0[1], _ATTRIBUTE_UNUSED uint32_t _in1[4], _ATTRIBUTE_UNUSED uint32_t _in2[4], _ATTRIBUTE_UNUSED uint32_t _in3[1], _ATTRIBUTE_UNUSED uint32_t _in4[16], _ATTRIBUTE_UNUSED uint32_t _in5[1], _ATTRIBUTE_UNUSED char* _in6[1], _ATTRIBUTE_UNUSED uint32_t _in6Len[1]) { + int _nErr = 0; + remote_arg* _praInStart = _praIn; + remote_arg** _ppraInStart = _ppraIn; + remote_arg* _praROutStart = _praROut; + remote_arg** _ppraROutStart = _ppraROut; + _ppraIn = &_praIn; + _ppraROut = &_praROut; + _COPY(_primIn, 0, _in0, 0, 4); + _COPY(_primIn, 4, _in1, 0, 16); + _COPY(_primIn, 20, _in2, 0, 16); + _COPY(_primIn, 36, _in3, 0, 4); + _COPY(_primIn, 40, _in4, 0, 64); + _COPY(_primIn, 104, _in5, 0, 4); + _COPY(_primIn, 108, _in6Len, 0, 4); + _praIn[0].buf.pv = (void*) _in6[0]; + _praIn[0].buf.nLen = (4 * _in6Len[0]); + _ppraInStart[0] += (_praIn - _praInStart) + 1; + _ppraROutStart[0] += (_praROut - _praROutStart) +0; + return _nErr; +} +static __inline void _count(int _numIn[1], int _numROut[1], int _numInH[1], int _numROutH[1], _ATTRIBUTE_UNUSED uint32_t _rout0[1], _ATTRIBUTE_UNUSED uint32_t _rout1[4], _ATTRIBUTE_UNUSED uint32_t _rout2[4], _ATTRIBUTE_UNUSED uint32_t _rout3[1], _ATTRIBUTE_UNUSED uint32_t 
_rout4[16], _ATTRIBUTE_UNUSED uint32_t _rout5[1], _ATTRIBUTE_UNUSED char* _rout6[1], _ATTRIBUTE_UNUSED uint32_t _rout6Len[1]) { + _numIn[0] += 0; + _numROut[0] += 1; + _numInH[0] += 0; + _numROutH[0] += 0; +} +static __inline void _count_1(int _numIn[1], int _numROut[1], int _numInH[1], int _numROutH[1], _ATTRIBUTE_UNUSED uint32_t _in0[1], _ATTRIBUTE_UNUSED uint32_t _in1[4], _ATTRIBUTE_UNUSED uint32_t _in2[4], _ATTRIBUTE_UNUSED uint32_t _in3[1], _ATTRIBUTE_UNUSED uint32_t _in4[16], _ATTRIBUTE_UNUSED uint32_t _in5[1], _ATTRIBUTE_UNUSED char* _in6[1], _ATTRIBUTE_UNUSED uint32_t _in6Len[1]) { + _numIn[0] += 1; + _numROut[0] += 0; + _numInH[0] += 0; + _numROutH[0] += 0; +} +static __inline int _stub_method_1(remote_handle64 _handle, uint32_t _mid, uintptr_t _in0[SLIM_IFPTR32(29, 16)], uintptr_t _in1[SLIM_IFPTR32(29, 16)], uintptr_t _rout2[SLIM_IFPTR32(29, 16)]) { + remote_arg* _pra = 0; + int _numIn[1] = {0}; + int _numROut[1] = {0}; + int _numInH[1] = {0}; + int _numROutH[1] = {0}; + _allocator _al[1] = {{0}}; + uint32_t _primIn[57]= {0}; + uint32_t _primROut[27]= {0}; + remote_arg* _praIn = 0; + remote_arg* _praROut = 0; + remote_arg* _praROutPost = 0; + remote_arg** _ppraROutPost = &_praROutPost; + remote_arg** _ppraIn = &_praIn; + remote_arg** _ppraROut = &_praROut; + remote_arg* _praHIn = 0; + remote_arg** _ppraHIn = &_praHIn; + remote_arg* _praHROut = 0; + remote_arg** _ppraHROut = &_praHROut; + int _nErr = 0; + _numIn[0] = 0; + _numROut[0] = 0; + _numInH[0] = 0; + _numROutH[0] = 0; + _count_1(_numIn, _numROut, _numInH, _numROutH, (uint32_t*)&(((uint32_t*)_in0)[0]), (uint32_t*)&(((uint32_t*)_in0)[1]), (uint32_t*)&(((uint32_t*)_in0)[5]), (uint32_t*)&(((uint32_t*)_in0)[9]), (uint32_t*)&(((uint32_t*)_in0)[10]), (uint32_t*)&(((uint32_t*)_in0)[26]), SLIM_IFPTR32((char**)&(((uint32_t*)_in0)[27]), (char**)&(((uint64_t*)_in0)[14])), SLIM_IFPTR32((uint32_t*)&(((uint32_t*)_in0)[28]), (uint32_t*)&(((uint32_t*)_in0)[30]))); + _count_1(_numIn, _numROut, _numInH, _numROutH, (uint32_t*)&(((uint32_t*)_in1)[0]), (uint32_t*)&(((uint32_t*)_in1)[1]), (uint32_t*)&(((uint32_t*)_in1)[5]), (uint32_t*)&(((uint32_t*)_in1)[9]), (uint32_t*)&(((uint32_t*)_in1)[10]), (uint32_t*)&(((uint32_t*)_in1)[26]), SLIM_IFPTR32((char**)&(((uint32_t*)_in1)[27]), (char**)&(((uint64_t*)_in1)[14])), SLIM_IFPTR32((uint32_t*)&(((uint32_t*)_in1)[28]), (uint32_t*)&(((uint32_t*)_in1)[30]))); + _count(_numIn, _numROut, _numInH, _numROutH, (uint32_t*)&(((uint32_t*)_rout2)[0]), (uint32_t*)&(((uint32_t*)_rout2)[1]), (uint32_t*)&(((uint32_t*)_rout2)[5]), (uint32_t*)&(((uint32_t*)_rout2)[9]), (uint32_t*)&(((uint32_t*)_rout2)[10]), (uint32_t*)&(((uint32_t*)_rout2)[26]), SLIM_IFPTR32((char**)&(((uint32_t*)_rout2)[27]), (char**)&(((uint64_t*)_rout2)[14])), SLIM_IFPTR32((uint32_t*)&(((uint32_t*)_rout2)[28]), (uint32_t*)&(((uint32_t*)_rout2)[30]))); + if(_numIn[0]>=255){ + return AEE_EUNSUPPORTED; + } + if(_numROut[0]>=255){ + return AEE_EUNSUPPORTED; + } + _allocator_init(_al, 0, 0); + _QAIC_ALLOCATE(_nErr, _al, ((((((((_numIn[0] + _numROut[0]) + _numInH[0]) + _numROutH[0]) + 1) + 1) + 0) + 0) * sizeof(_pra[0])), 4, _pra); + _QAIC_ASSERT(_nErr, _pra); + _pra[0].buf.pv = (void*)_primIn; + _pra[0].buf.nLen = sizeof(_primIn); + _pra[(_numIn[0] + 1)].buf.pv = (void*)_primROut; + _pra[(_numIn[0] + 1)].buf.nLen = sizeof(_primROut); + _praIn = (_pra + 1); + _praROut = (_praIn + _numIn[0] + 1); + _praROutPost = _praROut; + if(_praHIn == 0) + { + _praHIn = ((_praROut + _numROut[0]) + 1); + } + if(_praHROut == 0) + (_praHROut = _praHIn + _numInH[0] + 0); + 
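+    // Layout note (a reading aid derived from the pack/unpack calls below, not
+    // part of the generated wire protocol): _primIn is 57 uint32s (228 bytes);
+    // the fixed part of one dsptensor is 112 bytes (type 4 + ne 16 + nb 16 +
+    // op 4 + op_params 64 + flags 4 + data_len 4), so src0 occupies bytes
+    // 0..111, src1 bytes 112..223, and dst contributes only its 4-byte data_len
+    // at offset 224. The tensor payloads travel as separate remote_arg buffers,
+    // and _primROut (108 bytes) carries dst's fixed part back without data_len.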
_TRY(_nErr, _stub_pack_1(_al, (_praIn + 0), _ppraIn, (_praROut + 0), _ppraROut, _praHIn, _ppraHIn, _praHROut, _ppraHROut, ((char*)_primIn + 0), 0, (uint32_t*)&(((uint32_t*)_in0)[0]), (uint32_t*)&(((uint32_t*)_in0)[1]), (uint32_t*)&(((uint32_t*)_in0)[5]), (uint32_t*)&(((uint32_t*)_in0)[9]), (uint32_t*)&(((uint32_t*)_in0)[10]), (uint32_t*)&(((uint32_t*)_in0)[26]), SLIM_IFPTR32((char**)&(((uint32_t*)_in0)[27]), (char**)&(((uint64_t*)_in0)[14])), SLIM_IFPTR32((uint32_t*)&(((uint32_t*)_in0)[28]), (uint32_t*)&(((uint32_t*)_in0)[30])))); + _TRY(_nErr, _stub_pack_1(_al, (_praIn + 0), _ppraIn, (_praROut + 0), _ppraROut, _praHIn, _ppraHIn, _praHROut, _ppraHROut, ((char*)_primIn + 112), 0, (uint32_t*)&(((uint32_t*)_in1)[0]), (uint32_t*)&(((uint32_t*)_in1)[1]), (uint32_t*)&(((uint32_t*)_in1)[5]), (uint32_t*)&(((uint32_t*)_in1)[9]), (uint32_t*)&(((uint32_t*)_in1)[10]), (uint32_t*)&(((uint32_t*)_in1)[26]), SLIM_IFPTR32((char**)&(((uint32_t*)_in1)[27]), (char**)&(((uint64_t*)_in1)[14])), SLIM_IFPTR32((uint32_t*)&(((uint32_t*)_in1)[28]), (uint32_t*)&(((uint32_t*)_in1)[30])))); + _TRY(_nErr, _stub_pack(_al, (_praIn + 0), _ppraIn, (_praROut + 0), _ppraROut, _praHIn, _ppraHIn, _praHROut, _ppraHROut, ((char*)_primIn + 224), ((char*)_primROut + 0), (uint32_t*)&(((uint32_t*)_rout2)[0]), (uint32_t*)&(((uint32_t*)_rout2)[1]), (uint32_t*)&(((uint32_t*)_rout2)[5]), (uint32_t*)&(((uint32_t*)_rout2)[9]), (uint32_t*)&(((uint32_t*)_rout2)[10]), (uint32_t*)&(((uint32_t*)_rout2)[26]), SLIM_IFPTR32((char**)&(((uint32_t*)_rout2)[27]), (char**)&(((uint64_t*)_rout2)[14])), SLIM_IFPTR32((uint32_t*)&(((uint32_t*)_rout2)[28]), (uint32_t*)&(((uint32_t*)_rout2)[30])))); + _QAIC_ASSERT(_nErr, (_numInH[0] + 0) <= 15); + _QAIC_ASSERT(_nErr, (_numROutH[0] + 0) <= 15); + _TRY_FARF(_nErr, __QAIC_REMOTE(remote_handle64_invoke)(_handle, REMOTE_SCALARS_MAKEX(0, _mid, (_numIn[0] + 1), (_numROut[0] + 1), (_numInH[0] + 0), (_numROutH[0] + 0)), _pra)); + _TRY(_nErr, _stub_unpack((_praROutPost + 0), _ppraROutPost, ((char*)_primROut + 0), (uint32_t*)&(((uint32_t*)_rout2)[0]), (uint32_t*)&(((uint32_t*)_rout2)[1]), (uint32_t*)&(((uint32_t*)_rout2)[5]), (uint32_t*)&(((uint32_t*)_rout2)[9]), (uint32_t*)&(((uint32_t*)_rout2)[10]), (uint32_t*)&(((uint32_t*)_rout2)[26]), SLIM_IFPTR32((char**)&(((uint32_t*)_rout2)[27]), (char**)&(((uint64_t*)_rout2)[14])), SLIM_IFPTR32((uint32_t*)&(((uint32_t*)_rout2)[28]), (uint32_t*)&(((uint32_t*)_rout2)[30])))); + _QAIC_CATCH(_nErr) {} + _CATCH_FARF(_nErr) { + _QAIC_FARF(RUNTIME_ERROR, "ERROR 0x%x: handle=0x%"PRIx64", scalar=0x%x, method ID=%d: %s failed\n", _nErr , _handle, REMOTE_SCALARS_MAKEX(0, _mid, (_numIn[0] + 1), (_numROut[0] + 1), (_numInH[0] + 0), (_numROutH[0] + 0)), _mid, __func__); + } + _allocator_deinit(_al); + return _nErr; +} +__QAIC_STUB_EXPORT int __QAIC_STUB(ggmlop_dsp_add)(remote_handle64 _handle, const dsptensor* src0, const dsptensor* src1, dsptensor* dst) __QAIC_STUB_ATTRIBUTE { + uint32_t _mid = 3; + return _stub_method_1(_handle, _mid, (uintptr_t*)src0, (uintptr_t*)src1, (uintptr_t*)dst); +} +__QAIC_STUB_EXPORT int __QAIC_STUB(ggmlop_dsp_mulmat)(remote_handle64 _handle, const dsptensor* src0, const dsptensor* src1, dsptensor* dst) __QAIC_STUB_ATTRIBUTE { + uint32_t _mid = 4; + return _stub_method_1(_handle, _mid, (uintptr_t*)src0, (uintptr_t*)src1, (uintptr_t*)dst); +} +__QAIC_STUB_EXPORT int __QAIC_STUB(ggmlop_dsp_softmax)(remote_handle64 _handle, const dsptensor* src0, const dsptensor* src1, dsptensor* dst) __QAIC_STUB_ATTRIBUTE { + uint32_t _mid = 5; + return _stub_method_1(_handle, 
_mid, (uintptr_t*)src0, (uintptr_t*)src1, (uintptr_t*)dst); +} +__QAIC_STUB_EXPORT int __QAIC_STUB(ggmlop_dsp_rmsnorm)(remote_handle64 _handle, const dsptensor* src0, const dsptensor* src1, dsptensor* dst) __QAIC_STUB_ATTRIBUTE { + uint32_t _mid = 6; + return _stub_method_1(_handle, _mid, (uintptr_t*)src0, (uintptr_t*)src1, (uintptr_t*)dst); +} +__QAIC_STUB_EXPORT int __QAIC_STUB(ggmlop_dsp_pool2d)(remote_handle64 _handle, const dsptensor* src0, const dsptensor* src1, dsptensor* dst) __QAIC_STUB_ATTRIBUTE { + uint32_t _mid = 7; + return _stub_method_1(_handle, _mid, (uintptr_t*)src0, (uintptr_t*)src1, (uintptr_t*)dst); +} diff --git a/ggml/src/ggml-hexagon/kernels/ggmlop_ap_skel.h b/ggml/src/ggml-hexagon/kernels/ggmlop_ap_skel.h new file mode 100644 index 0000000000000..f189c48d0238b --- /dev/null +++ b/ggml/src/ggml-hexagon/kernels/ggmlop_ap_skel.h @@ -0,0 +1,287 @@ +#ifndef _GGMLOP_H +#define _GGMLOP_H +//qidl copyright +//qidl nested=false +#include +#include +#include +#include + + +#ifndef __QAIC_HEADER +#define __QAIC_HEADER(ff) ff +#endif //__QAIC_HEADER + +#ifndef __QAIC_HEADER_EXPORT +#define __QAIC_HEADER_EXPORT +#endif // __QAIC_HEADER_EXPORT + +#ifndef __QAIC_HEADER_ATTRIBUTE +#define __QAIC_HEADER_ATTRIBUTE +#endif // __QAIC_HEADER_ATTRIBUTE + +#ifndef __QAIC_IMPL +#define __QAIC_IMPL(ff) ff +#endif //__QAIC_IMPL + +#ifndef __QAIC_IMPL_EXPORT +#define __QAIC_IMPL_EXPORT +#endif // __QAIC_IMPL_EXPORT + +#ifndef __QAIC_IMPL_ATTRIBUTE +#define __QAIC_IMPL_ATTRIBUTE +#endif // __QAIC_IMPL_ATTRIBUTE +#ifndef _QAIC_ENV_H +#define _QAIC_ENV_H + +#include +#ifdef _WIN32 +#include "qtest_stdlib.h" +#else +#define MALLOC malloc +#define FREE free +#endif + +#ifdef __GNUC__ +#ifdef __clang__ +#pragma GCC diagnostic ignored "-Wunknown-pragmas" +#else +#pragma GCC diagnostic ignored "-Wpragmas" +#endif +#pragma GCC diagnostic ignored "-Wuninitialized" +#pragma GCC diagnostic ignored "-Wunused-parameter" +#pragma GCC diagnostic ignored "-Wunused-function" +#endif + +#ifndef _ATTRIBUTE_UNUSED + +#ifdef _WIN32 +#define _ATTRIBUTE_UNUSED +#else +#define _ATTRIBUTE_UNUSED __attribute__ ((unused)) +#endif + +#endif // _ATTRIBUTE_UNUSED + +#ifndef _ATTRIBUTE_VISIBILITY + +#ifdef _WIN32 +#define _ATTRIBUTE_VISIBILITY +#else +#define _ATTRIBUTE_VISIBILITY __attribute__ ((visibility("default"))) +#endif + +#endif // _ATTRIBUTE_VISIBILITY + +#ifndef __QAIC_REMOTE +#define __QAIC_REMOTE(ff) ff +#endif //__QAIC_REMOTE + +#ifndef __QAIC_HEADER +#define __QAIC_HEADER(ff) ff +#endif //__QAIC_HEADER + +#ifndef __QAIC_HEADER_EXPORT +#define __QAIC_HEADER_EXPORT +#endif // __QAIC_HEADER_EXPORT + +#ifndef __QAIC_HEADER_ATTRIBUTE +#define __QAIC_HEADER_ATTRIBUTE +#endif // __QAIC_HEADER_ATTRIBUTE + +#ifndef __QAIC_IMPL +#define __QAIC_IMPL(ff) ff +#endif //__QAIC_IMPL + +#ifndef __QAIC_IMPL_EXPORT +#define __QAIC_IMPL_EXPORT +#endif // __QAIC_IMPL_EXPORT + +#ifndef __QAIC_IMPL_ATTRIBUTE +#define __QAIC_IMPL_ATTRIBUTE +#endif // __QAIC_IMPL_ATTRIBUTE + +#ifndef __QAIC_STUB +#define __QAIC_STUB(ff) ff +#endif //__QAIC_STUB + +#ifndef __QAIC_STUB_EXPORT +#define __QAIC_STUB_EXPORT +#endif // __QAIC_STUB_EXPORT + +#ifndef __QAIC_STUB_ATTRIBUTE +#define __QAIC_STUB_ATTRIBUTE +#endif // __QAIC_STUB_ATTRIBUTE + +#ifndef __QAIC_SKEL +#define __QAIC_SKEL(ff) ff +#endif //__QAIC_SKEL__ + +#ifndef __QAIC_SKEL_EXPORT +#define __QAIC_SKEL_EXPORT +#endif // __QAIC_SKEL_EXPORT + +#ifndef __QAIC_SKEL_ATTRIBUTE +#define __QAIC_SKEL_ATTRIBUTE +#endif // __QAIC_SKEL_ATTRIBUTE + +#ifdef __QAIC_DEBUG__ + #ifndef 
__QAIC_DBG_PRINTF__ + #include + #define __QAIC_DBG_PRINTF__( ee ) do { printf ee ; } while(0) + #endif +#else + #define __QAIC_DBG_PRINTF__( ee ) (void)0 +#endif + + +#define _OFFSET(src, sof) ((void*)(((char*)(src)) + (sof))) + +#define _COPY(dst, dof, src, sof, sz) \ + do {\ + struct __copy { \ + char ar[sz]; \ + };\ + *(struct __copy*)_OFFSET(dst, dof) = *(struct __copy*)_OFFSET(src, sof);\ + } while (0) + +#define _COPYIF(dst, dof, src, sof, sz) \ + do {\ + if(_OFFSET(dst, dof) != _OFFSET(src, sof)) {\ + _COPY(dst, dof, src, sof, sz); \ + } \ + } while (0) + +_ATTRIBUTE_UNUSED +static __inline void _qaic_memmove(void* dst, void* src, int size) { + int i = 0; + for(i = 0; i < size; ++i) { + ((char*)dst)[i] = ((char*)src)[i]; + } +} + +#define _MEMMOVEIF(dst, src, sz) \ + do {\ + if(dst != src) {\ + _qaic_memmove(dst, src, sz);\ + } \ + } while (0) + + +#define _ASSIGN(dst, src, sof) \ + do {\ + dst = OFFSET(src, sof); \ + } while (0) + +#define _STD_STRLEN_IF(str) (str == 0 ? 0 : strlen(str)) + +#include "AEEStdErr.h" + +#ifdef _WIN32 +#define _QAIC_FARF(level, msg, ...) (void)0 +#else +#define _QAIC_FARF(level, msg, ...) (void)0 +#endif //_WIN32 for _QAIC_FARF + +#define _TRY(ee, func) \ + do { \ + if (AEE_SUCCESS != ((ee) = func)) {\ + __QAIC_DBG_PRINTF__((__FILE__ ":%d:error:%d:%s\n", __LINE__, (int)(ee),#func));\ + goto ee##bail;\ + } \ + } while (0) + +#define _TRY_FARF(ee, func) \ + do { \ + if (AEE_SUCCESS != ((ee) = func)) {\ + goto ee##farf##bail;\ + } \ + } while (0) + +#define _QAIC_CATCH(exception) exception##bail: if (exception != AEE_SUCCESS) + +#define _CATCH_FARF(exception) exception##farf##bail: if (exception != AEE_SUCCESS) + +#define _QAIC_ASSERT(nErr, ff) _TRY(nErr, 0 == (ff) ? AEE_EBADPARM : AEE_SUCCESS) + +#ifdef __QAIC_DEBUG__ +#define _QAIC_ALLOCATE(nErr, pal, size, alignment, pv) _TRY(nErr, _allocator_alloc(pal, __FILE_LINE__, size, alignment, (void**)&pv));\ + _QAIC_ASSERT(nErr,pv || !(size)) +#else +#define _QAIC_ALLOCATE(nErr, pal, size, alignment, pv) _TRY(nErr, _allocator_alloc(pal, 0, size, alignment, (void**)&pv));\ + _QAIC_ASSERT(nErr,pv || !(size)) +#endif + + +#endif // _QAIC_ENV_H + +#ifdef __cplusplus +extern "C" { +#endif +#if !defined(__QAIC_STRING1_OBJECT_DEFINED__) && !defined(__STRING1_OBJECT__) +#define __QAIC_STRING1_OBJECT_DEFINED__ +#define __STRING1_OBJECT__ +typedef struct _cstring1_s { + char* data; + int dataLen; +} _cstring1_t; + +#endif /* __QAIC_STRING1_OBJECT_DEFINED__ */ +/// Enabling stub-skel mismatch check feature in the auto-gen files. +/// Please refer to the IDL documentation for more details on the feature. +/// It is fully supported only on Kailua and later targets. +#define IDL_VERSION "0.0.1" +typedef struct dsptensor dsptensor; +struct dsptensor { + int32_t type; + int32_t ne[4]; + int32_t nb[4]; + int32_t op; + int32_t op_params[16]; + int32_t flags; + void * data; + int data_len; +}; +/** + * Opens the handle in the specified domain. If this is the first + * handle, this creates the session. Typically this means opening + * the device, aka open("/dev/adsprpc-smd"), then calling ioctl + * device APIs to create a PD on the DSP to execute our code in, + * then asking that PD to dlopen the .so and dlsym the skel function. + * + * @param uri, _URI"&_dom=aDSP" + * _URI is a QAIC generated uri, or + * "file:///?_skel_handle_invoke&_modver=1.0" + * If the _dom parameter is not present, _dom=DEFAULT is assumed + * but not forwarded. 
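+ *
+ * A minimal calling sketch (illustrative only; the cDSP domain suffix and
+ * the error handling are assumptions, and ggmlop_URI is defined at the end
+ * of this header):
+ *   remote_handle64 h = 0;
+ *   int err = ggmlop_dsp_open(ggmlop_URI "&_dom=cdsp", &h);
+ *   if (0 == err) { ... ggmlop_dsp_close(h); }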
+ * Reserved uri keys: + * [0]: first unamed argument is the skel invoke function + * _dom: execution domain name, _dom=mDSP/aDSP/DEFAULT + * _modver: module version, _modver=1.0 + * _*: any other key name starting with an _ is reserved + * Unknown uri keys/values are forwarded as is. + * @param h, resulting handle + * @retval, 0 on success + */ +__QAIC_HEADER_EXPORT int __QAIC_HEADER(ggmlop_dsp_open)(const char* uri, remote_handle64* h) __QAIC_HEADER_ATTRIBUTE; +/** + * Closes a handle. If this is the last handle to close, the session + * is closed as well, releasing all the allocated resources. + + * @param h, the handle to close + * @retval, 0 on success, should always succeed + */ +__QAIC_HEADER_EXPORT int __QAIC_HEADER(ggmlop_dsp_close)(remote_handle64 h) __QAIC_HEADER_ATTRIBUTE; +__QAIC_HEADER_EXPORT AEEResult __QAIC_HEADER(ggmlop_dsp_setclocks)(remote_handle64 _h, int32 power_level, int32 latency, int32 dcvs_enable, int32 threads) __QAIC_HEADER_ATTRIBUTE; +__QAIC_HEADER_EXPORT int __QAIC_HEADER(ggmlop_dsp_add)(remote_handle64 _h, const dsptensor* src0, const dsptensor* src1, dsptensor* dst) __QAIC_HEADER_ATTRIBUTE; +__QAIC_HEADER_EXPORT int __QAIC_HEADER(ggmlop_dsp_mulmat)(remote_handle64 _h, const dsptensor* src0, const dsptensor* src1, dsptensor* dst) __QAIC_HEADER_ATTRIBUTE; +__QAIC_HEADER_EXPORT int __QAIC_HEADER(ggmlop_dsp_softmax)(remote_handle64 _h, const dsptensor* src0, const dsptensor* src1, dsptensor* dst) __QAIC_HEADER_ATTRIBUTE; +__QAIC_HEADER_EXPORT int __QAIC_HEADER(ggmlop_dsp_rmsnorm)(remote_handle64 _h, const dsptensor* src0, const dsptensor* src1, dsptensor* dst) __QAIC_HEADER_ATTRIBUTE; +__QAIC_HEADER_EXPORT int __QAIC_HEADER(ggmlop_dsp_pool2d)(remote_handle64 _h, const dsptensor* src0, const dsptensor* src1, dsptensor* dst) __QAIC_HEADER_ATTRIBUTE; +#ifndef ggmlop_URI +#define ggmlop_URI "file:///libggmlop_skel.so?ggmlop_skel_handle_invoke&_modver=1.0&_idlver=0.0.1" +#endif /*ggmlop_URI*/ +#ifdef __cplusplus +} +#endif +#endif //_GGMLOP_H diff --git a/ggml/src/ggml-hexagon/kernels/ggmlop_cdsp_skel.c b/ggml/src/ggml-hexagon/kernels/ggmlop_cdsp_skel.c new file mode 100644 index 0000000000000..1e9d31a72319d --- /dev/null +++ b/ggml/src/ggml-hexagon/kernels/ggmlop_cdsp_skel.c @@ -0,0 +1,621 @@ +//qidl copyright +//qidl nested=false +#include "ggmlop_ap_skel.h" + +#include +#ifndef _WIN32 +#include "HAP_farf.h" +#endif //_WIN32 for HAP_farf +#ifndef _ALLOCATOR_H +#define _ALLOCATOR_H + +#include +#include + +typedef struct _heap _heap; +struct _heap { + _heap* pPrev; + const char* loc; + uint64_t buf; +}; + +typedef struct _allocator { + _heap* pheap; + uint8_t* stack; + uint8_t* stackEnd; + int nSize; +} _allocator; + +_ATTRIBUTE_UNUSED +static __inline int _heap_alloc(_heap** ppa, const char* loc, int size, void** ppbuf) { + _heap* pn = 0; + pn = MALLOC((size_t)size + sizeof(_heap) - sizeof(uint64_t)); + if(pn != 0) { + pn->pPrev = *ppa; + pn->loc = loc; + *ppa = pn; + *ppbuf = (void*)&(pn->buf); + return 0; + } else { + return -1; + } +} +#define _ALIGN_SIZE(x, y) (((x) + (y-1)) & ~(y-1)) + +_ATTRIBUTE_UNUSED +static __inline int _allocator_alloc(_allocator* me, + const char* loc, + int size, + unsigned int al, + void** ppbuf) { + if(size < 0) { + return -1; + } else if (size == 0) { + *ppbuf = 0; + return 0; + } + if((_ALIGN_SIZE((uintptr_t)me->stackEnd, al) + (size_t)size) < (uintptr_t)me->stack + (size_t)me->nSize) { + *ppbuf = (uint8_t*)_ALIGN_SIZE((uintptr_t)me->stackEnd, al); + me->stackEnd = (uint8_t*)_ALIGN_SIZE((uintptr_t)me->stackEnd, al) + size; + return 
0; + } else { + return _heap_alloc(&me->pheap, loc, size, ppbuf); + } +} + +_ATTRIBUTE_UNUSED +static __inline void _allocator_deinit(_allocator* me) { + _heap* pa = me->pheap; + while(pa != 0) { + _heap* pn = pa; + const char* loc = pn->loc; + (void)loc; + pa = pn->pPrev; + FREE(pn); + } +} + +_ATTRIBUTE_UNUSED +static __inline void _allocator_init(_allocator* me, uint8_t* stack, int stackSize) { + me->stack = stack; + me->stackEnd = stack + stackSize; + me->nSize = stackSize; + me->pheap = 0; +} + + +#endif // _ALLOCATOR_H + +#ifndef SLIM_H +#define SLIM_H + +#include + +//a C data structure for the idl types that can be used to implement +//static and dynamic language bindings fairly efficiently. +// +//the goal is to have a minimal ROM and RAM footprint and without +//doing too many allocations. A good way to package these things seemed +//like the module boundary, so all the idls within one module can share +//all the type references. + + +#define PARAMETER_IN 0x0 +#define PARAMETER_OUT 0x1 +#define PARAMETER_INOUT 0x2 +#define PARAMETER_ROUT 0x3 +#define PARAMETER_INROUT 0x4 + +//the types that we get from idl +#define TYPE_OBJECT 0x0 +#define TYPE_INTERFACE 0x1 +#define TYPE_PRIMITIVE 0x2 +#define TYPE_ENUM 0x3 +#define TYPE_STRING 0x4 +#define TYPE_WSTRING 0x5 +#define TYPE_STRUCTURE 0x6 +#define TYPE_UNION 0x7 +#define TYPE_ARRAY 0x8 +#define TYPE_SEQUENCE 0x9 + +//these require the pack/unpack to recurse +//so it's a hint to those languages that can optimize in cases where +//recursion isn't necessary. +#define TYPE_COMPLEX_STRUCTURE (0x10 | TYPE_STRUCTURE) +#define TYPE_COMPLEX_UNION (0x10 | TYPE_UNION) +#define TYPE_COMPLEX_ARRAY (0x10 | TYPE_ARRAY) +#define TYPE_COMPLEX_SEQUENCE (0x10 | TYPE_SEQUENCE) + + +typedef struct Type Type; + +#define INHERIT_TYPE\ + int32_t nativeSize; /*in the simple case its the same as wire size and alignment*/\ + union {\ + struct {\ + const uintptr_t p1;\ + const uintptr_t p2;\ + } _cast;\ + struct {\ + uint32_t iid;\ + uint32_t bNotNil;\ + } object;\ + struct {\ + const Type *arrayType;\ + int32_t nItems;\ + } array;\ + struct {\ + const Type *seqType;\ + int32_t nMaxLen;\ + } seqSimple; \ + struct {\ + uint32_t bFloating;\ + uint32_t bSigned;\ + } prim; \ + const SequenceType* seqComplex;\ + const UnionType *unionType;\ + const StructType *structType;\ + int32_t stringMaxLen;\ + uint8_t bInterfaceNotNil;\ + } param;\ + uint8_t type;\ + uint8_t nativeAlignment\ + +typedef struct UnionType UnionType; +typedef struct StructType StructType; +typedef struct SequenceType SequenceType; +struct Type { + INHERIT_TYPE; +}; + +struct SequenceType { + const Type * seqType; + uint32_t nMaxLen; + uint32_t inSize; + uint32_t routSizePrimIn; + uint32_t routSizePrimROut; +}; + +//byte offset from the start of the case values for +//this unions case value array. 
it MUST be aligned +//at the alignment requrements for the descriptor +// +//if negative it means that the unions cases are +//simple enumerators, so the value read from the descriptor +//can be used directly to find the correct case +typedef union CaseValuePtr CaseValuePtr; +union CaseValuePtr { + const uint8_t* value8s; + const uint16_t* value16s; + const uint32_t* value32s; + const uint64_t* value64s; +}; + +//these are only used in complex cases +//so I pulled them out of the type definition as references to make +//the type smaller +struct UnionType { + const Type *descriptor; + uint32_t nCases; + const CaseValuePtr caseValues; + const Type * const *cases; + int32_t inSize; + int32_t routSizePrimIn; + int32_t routSizePrimROut; + uint8_t inAlignment; + uint8_t routAlignmentPrimIn; + uint8_t routAlignmentPrimROut; + uint8_t inCaseAlignment; + uint8_t routCaseAlignmentPrimIn; + uint8_t routCaseAlignmentPrimROut; + uint8_t nativeCaseAlignment; + uint8_t bDefaultCase; +}; + +struct StructType { + uint32_t nMembers; + const Type * const *members; + int32_t inSize; + int32_t routSizePrimIn; + int32_t routSizePrimROut; + uint8_t inAlignment; + uint8_t routAlignmentPrimIn; + uint8_t routAlignmentPrimROut; +}; + +typedef struct Parameter Parameter; +struct Parameter { + INHERIT_TYPE; + uint8_t mode; + uint8_t bNotNil; +}; + +#define SLIM_IFPTR32(is32,is64) (sizeof(uintptr_t) == 4 ? (is32) : (is64)) +#define SLIM_SCALARS_IS_DYNAMIC(u) (((u) & 0x00ffffff) == 0x00ffffff) + +typedef struct Method Method; +struct Method { + uint32_t uScalars; //no method index + int32_t primInSize; + int32_t primROutSize; + int maxArgs; + int numParams; + const Parameter * const *params; + uint8_t primInAlignment; + uint8_t primROutAlignment; +}; + +typedef struct Interface Interface; + +struct Interface { + int nMethods; + const Method * const *methodArray; + int nIIds; + const uint32_t *iids; + const uint16_t* methodStringArray; + const uint16_t* methodStrings; + const char* strings; +}; + + +#endif //SLIM_H + + +#ifndef _GGMLOP_SLIM_H +#define _GGMLOP_SLIM_H +#include + +#ifndef __QAIC_SLIM +#define __QAIC_SLIM(ff) ff +#endif +#ifndef __QAIC_SLIM_EXPORT +#define __QAIC_SLIM_EXPORT +#endif + +static const Type types[5]; +static const Type* const typeArrays[7] = {&(types[0]),&(types[1]),&(types[1]),&(types[0]),&(types[2]),&(types[0]),&(types[3])}; +static const StructType structTypes[1] = {{0x7,&(typeArrays[0]),0x70,0x4,0x6c,0x4,0x4,0x4}}; +static const Type types[5] = {{0x4,{{(const uintptr_t)0,(const uintptr_t)1}}, 2,0x4},{0x10,{{(const uintptr_t)&(types[0]),(const uintptr_t)0x4}}, 8,0x4},{0x40,{{(const uintptr_t)&(types[0]),(const uintptr_t)0x10}}, 8,0x4},{SLIM_IFPTR32(0x8,0x10),{{(const uintptr_t)&(types[4]),(const uintptr_t)0x0}}, 9,SLIM_IFPTR32(0x4,0x8)},{0x4,{{(const uintptr_t)0,(const uintptr_t)1}}, 2,0x4}}; +static const Parameter parameters[6] = {{SLIM_IFPTR32(0x8,0x10),{{(const uintptr_t)0x0,0}}, 4,SLIM_IFPTR32(0x4,0x8),0,0},{SLIM_IFPTR32(0x4,0x8),{{(const uintptr_t)0xdeadc0de,(const uintptr_t)0}}, 0,SLIM_IFPTR32(0x4,0x8),3,0},{SLIM_IFPTR32(0x4,0x8),{{(const uintptr_t)0xdeadc0de,(const uintptr_t)0}}, 0,SLIM_IFPTR32(0x4,0x8),0,0},{0x4,{{(const uintptr_t)0,(const uintptr_t)1}}, 2,0x4,0,0},{SLIM_IFPTR32(0x74,0x80),{{(const uintptr_t)&(structTypes[0]),0}}, 22,SLIM_IFPTR32(0x4,0x8),0,0},{SLIM_IFPTR32(0x74,0x80),{{(const uintptr_t)&(structTypes[0]),0}}, 22,SLIM_IFPTR32(0x4,0x8),3,0}}; +static const Parameter* const parameterArrays[9] = 
{(&(parameters[4])),(&(parameters[4])),(&(parameters[5])),(&(parameters[3])),(&(parameters[3])),(&(parameters[3])),(&(parameters[0])),(&(parameters[1])),(&(parameters[2]))}; +static const Method methods[4] = {{REMOTE_SCALARS_MAKEX(0,0,0x2,0x0,0x0,0x1),0x4,0x0,2,2,(&(parameterArrays[6])),0x4,0x1},{REMOTE_SCALARS_MAKEX(0,0,0x0,0x0,0x1,0x0),0x0,0x0,1,1,(&(parameterArrays[8])),0x1,0x0},{REMOTE_SCALARS_MAKEX(0,0,0x1,0x0,0x0,0x0),0xc,0x0,3,3,(&(parameterArrays[3])),0x4,0x0},{REMOTE_SCALARS_MAKEX(0,0,0x3,0x2,0x0,0x0),0xe4,0x6c,3,3,(&(parameterArrays[0])),0x4,0x4}}; +static const Method* const methodArrays[8] = {&(methods[0]),&(methods[1]),&(methods[2]),&(methods[3]),&(methods[3]),&(methods[3]),&(methods[3]),&(methods[3])}; +static const char strings[167] = "dsp_setclocks\0dsp_rmsnorm\0dsp_softmax\0dcvs_enable\0power_level\0dsp_pool2d\0dsp_mulmat\0op_params\0dsp_add\0latency\0flags\0close\0src1\0data\0type\0src0\0open\0dst\0uri\0op\0nb\0ne\0h\0"; +static const uint16_t methodStrings[134] = {62,137,132,161,158,155,84,110,127,122,132,161,158,155,84,110,127,147,132,161,158,155,84,110,127,14,137,132,161,158,155,84,110,127,122,132,161,158,155,84,110,127,147,132,161,158,155,84,110,127,26,137,132,161,158,155,84,110,127,122,132,161,158,155,84,110,127,147,132,161,158,155,84,110,127,73,137,132,161,158,155,84,110,127,122,132,161,158,155,84,110,127,147,132,161,158,155,84,110,127,94,137,132,161,158,155,84,110,127,122,132,161,158,155,84,110,127,147,132,161,158,155,84,110,127,0,50,102,38,142,151,164,116,164}; +static const uint16_t methodStringsArrays[8] = {129,132,125,100,75,50,25,0}; +__QAIC_SLIM_EXPORT const Interface __QAIC_SLIM(ggmlop_slim) = {8,&(methodArrays[0]),0,0,&(methodStringsArrays [0]),methodStrings,strings}; +#endif //_GGMLOP_SLIM_H +extern int adsp_mmap_fd_getinfo(int, uint32_t *); +#ifdef __cplusplus +extern "C" { +#endif +_ATTRIBUTE_VISIBILITY uint32_t ggmlop_skel_handle_invoke_qaic_version = 10048; +_ATTRIBUTE_VISIBILITY char ggmlop_skel_handle_invoke_uri[77+1]="file:///libggmlop_skel.so?ggmlop_skel_handle_invoke&_modver=1.0&_idlver=0.0.1"; +static __inline int _skel_pack(_ATTRIBUTE_UNUSED remote_arg* _praROutPost, _ATTRIBUTE_UNUSED remote_arg* _ppraROutPost[1], _ATTRIBUTE_UNUSED void* _primROut, _ATTRIBUTE_UNUSED uint32_t _rout0[1], _ATTRIBUTE_UNUSED uint32_t _rout1[4], _ATTRIBUTE_UNUSED uint32_t _rout2[4], _ATTRIBUTE_UNUSED uint32_t _rout3[1], _ATTRIBUTE_UNUSED uint32_t _rout4[16], _ATTRIBUTE_UNUSED uint32_t _rout5[1], _ATTRIBUTE_UNUSED char* _rout6[1], _ATTRIBUTE_UNUSED uint32_t _rout6Len[1]) { + int _nErr = 0; + remote_arg* _praROutPostStart = _praROutPost; + remote_arg** _ppraROutPostStart = _ppraROutPost; + _ppraROutPost = &_praROutPost; + _COPY(_primROut, 0, _rout0, 0, 4); + _COPY(_primROut, 4, _rout1, 0, 16); + _COPY(_primROut, 20, _rout2, 0, 16); + _COPY(_primROut, 36, _rout3, 0, 4); + _COPY(_primROut, 40, _rout4, 0, 64); + _COPY(_primROut, 104, _rout5, 0, 4); + _ppraROutPostStart[0] += (_praROutPost - _praROutPostStart) +1; + return _nErr; +} +static __inline int _skel_unpack(_ATTRIBUTE_UNUSED _allocator* _al, _ATTRIBUTE_UNUSED remote_arg* _praIn, _ATTRIBUTE_UNUSED remote_arg* _ppraIn[1], _ATTRIBUTE_UNUSED remote_arg* _praROut, _ATTRIBUTE_UNUSED remote_arg* _ppraROut[1], _ATTRIBUTE_UNUSED remote_arg* _praHIn, _ATTRIBUTE_UNUSED remote_arg* _ppraHIn[1], _ATTRIBUTE_UNUSED remote_arg* _praHROut, _ATTRIBUTE_UNUSED remote_arg* _ppraHROut[1], _ATTRIBUTE_UNUSED void* _primIn, _ATTRIBUTE_UNUSED void* _primROut, _ATTRIBUTE_UNUSED uint32_t _rout0[1], _ATTRIBUTE_UNUSED uint32_t _rout1[4], 
_ATTRIBUTE_UNUSED uint32_t _rout2[4], _ATTRIBUTE_UNUSED uint32_t _rout3[1], _ATTRIBUTE_UNUSED uint32_t _rout4[16], _ATTRIBUTE_UNUSED uint32_t _rout5[1], _ATTRIBUTE_UNUSED char* _rout6[1], _ATTRIBUTE_UNUSED uint32_t _rout6Len[1]) { + int _nErr = 0; + remote_arg* _praInStart = _praIn; + remote_arg** _ppraInStart = _ppraIn; + remote_arg* _praROutStart = _praROut; + remote_arg** _ppraROutStart = _ppraROut; + _ppraIn = &_praIn; + _ppraROut = &_praROut; + _COPY(_rout6Len, 0, _primIn, 0, 4); + _QAIC_ASSERT(_nErr, ((_praROut[0].buf.nLen / 4)) >= (size_t)(_rout6Len[0])); + _rout6[0] = _praROut[0].buf.pv; + _ppraInStart[0] += (_praIn - _praInStart) + 0; + _ppraROutStart[0] += (_praROut - _praROutStart) +1; + _QAIC_CATCH(_nErr) {} + return _nErr; +} +static __inline int _skel_unpack_1(_ATTRIBUTE_UNUSED _allocator* _al, _ATTRIBUTE_UNUSED remote_arg* _praIn, _ATTRIBUTE_UNUSED remote_arg* _ppraIn[1], _ATTRIBUTE_UNUSED remote_arg* _praROut, _ATTRIBUTE_UNUSED remote_arg* _ppraROut[1], _ATTRIBUTE_UNUSED remote_arg* _praHIn, _ATTRIBUTE_UNUSED remote_arg* _ppraHIn[1], _ATTRIBUTE_UNUSED remote_arg* _praHROut, _ATTRIBUTE_UNUSED remote_arg* _ppraHROut[1], _ATTRIBUTE_UNUSED void* _primIn, _ATTRIBUTE_UNUSED void* _primROut, _ATTRIBUTE_UNUSED uint32_t _in0[1], _ATTRIBUTE_UNUSED uint32_t _in1[4], _ATTRIBUTE_UNUSED uint32_t _in2[4], _ATTRIBUTE_UNUSED uint32_t _in3[1], _ATTRIBUTE_UNUSED uint32_t _in4[16], _ATTRIBUTE_UNUSED uint32_t _in5[1], _ATTRIBUTE_UNUSED char* _in6[1], _ATTRIBUTE_UNUSED uint32_t _in6Len[1]) { + int _nErr = 0; + remote_arg* _praInStart = _praIn; + remote_arg** _ppraInStart = _ppraIn; + remote_arg* _praROutStart = _praROut; + remote_arg** _ppraROutStart = _ppraROut; + _ppraIn = &_praIn; + _ppraROut = &_praROut; + _COPY(_in0, 0, _primIn, 0, 4); + _COPY(_in1, 0, _primIn, 4, 16); + _COPY(_in2, 0, _primIn, 20, 16); + _COPY(_in3, 0, _primIn, 36, 4); + _COPY(_in4, 0, _primIn, 40, 64); + _COPY(_in5, 0, _primIn, 104, 4); + _COPY(_in6Len, 0, _primIn, 108, 4); + _QAIC_ASSERT(_nErr, ((_praIn[0].buf.nLen / 4)) >= (size_t)(_in6Len[0])); + _in6[0] = _praIn[0].buf.pv; + _ppraInStart[0] += (_praIn - _praInStart) + 1; + _ppraROutStart[0] += (_praROut - _praROutStart) +0; + _QAIC_CATCH(_nErr) {} + return _nErr; +} +static __inline int _skel_method(int (*_pfn)(remote_handle64, const dsptensor*, const dsptensor*, dsptensor*), remote_handle64 _h, uint32_t _sc, remote_arg* _pra) { + remote_arg* _praEnd = 0; + uintptr_t _in0[SLIM_IFPTR32(29, 16)] = {0}; + uintptr_t _in1[SLIM_IFPTR32(29, 16)] = {0}; + uintptr_t _rout2[SLIM_IFPTR32(29, 16)] = {0}; + uint32_t* _primIn= 0; + int _numIn[1] = {0}; + uint32_t* _primROut= 0; + int _numInH[1] = {0}; + int _numROut[1] = {0}; + remote_arg* _praIn = 0; + remote_arg* _praROut = 0; + remote_arg* _praROutPost = 0; + remote_arg** _ppraROutPost = &_praROutPost; + _allocator _al[1] = {{0}}; + remote_arg** _ppraIn = &_praIn; + remote_arg** _ppraROut = &_praROut; + remote_arg* _praHIn = 0; + remote_arg** _ppraHIn = &_praHIn; + remote_arg* _praHROut = 0; + remote_arg** _ppraHROut = &_praHROut; + int _nErr = 0; + _praEnd = ((_pra + REMOTE_SCALARS_INBUFS(_sc)) + REMOTE_SCALARS_OUTBUFS(_sc) + REMOTE_SCALARS_INHANDLES(_sc) + REMOTE_SCALARS_OUTHANDLES(_sc)); + _QAIC_ASSERT(_nErr, REMOTE_SCALARS_INBUFS(_sc)>=1); + _QAIC_ASSERT(_nErr, REMOTE_SCALARS_OUTBUFS(_sc)>=1); + _QAIC_ASSERT(_nErr, REMOTE_SCALARS_INHANDLES(_sc)==0); + _QAIC_ASSERT(_nErr, REMOTE_SCALARS_OUTHANDLES(_sc)==0); + _QAIC_ASSERT(_nErr, (_pra + ((1 + 1) + (((0 + 0) + 0) + 0))) <= _praEnd); + _numIn[0] = (REMOTE_SCALARS_INBUFS(_sc) - 
1); + _QAIC_ASSERT(_nErr, _pra[0].buf.nLen >= 228); + _primIn = _pra[0].buf.pv; + _QAIC_ASSERT(_nErr, _pra[(_numIn[0] + 1)].buf.nLen >= 108); + _primROut = _pra[(_numIn[0] + 1)].buf.pv; + _numInH[0] = REMOTE_SCALARS_INHANDLES(_sc); + _numROut[0] = REMOTE_SCALARS_OUTBUFS(_sc); + _praIn = (_pra + 1); + _praROut = (_praIn + _numIn[0] + 1); + _praROutPost = _praROut; + _allocator_init(_al, 0, 0); + if(_praHIn == 0) + { + _praHIn = ((_praROut + _numROut[0]) + 1); + } + if(_praHROut == 0) + (_praHROut = _praHIn + _numInH[0] + 0); + _TRY(_nErr, _skel_unpack_1(_al, (_praIn + 0), _ppraIn, (_praROut + 0), _ppraROut, _praHIn, _ppraHIn, _praHROut, _ppraHROut, ((char*)_primIn + 0), 0, (uint32_t*)&(((uint32_t*)_in0)[0]), (uint32_t*)&(((uint32_t*)_in0)[1]), (uint32_t*)&(((uint32_t*)_in0)[5]), (uint32_t*)&(((uint32_t*)_in0)[9]), (uint32_t*)&(((uint32_t*)_in0)[10]), (uint32_t*)&(((uint32_t*)_in0)[26]), SLIM_IFPTR32((char**)&(((uint32_t*)_in0)[27]), (char**)&(((uint64_t*)_in0)[14])), SLIM_IFPTR32((uint32_t*)&(((uint32_t*)_in0)[28]), (uint32_t*)&(((uint32_t*)_in0)[30])))); + _TRY(_nErr, _skel_unpack_1(_al, (_praIn + 0), _ppraIn, (_praROut + 0), _ppraROut, _praHIn, _ppraHIn, _praHROut, _ppraHROut, ((char*)_primIn + 112), 0, (uint32_t*)&(((uint32_t*)_in1)[0]), (uint32_t*)&(((uint32_t*)_in1)[1]), (uint32_t*)&(((uint32_t*)_in1)[5]), (uint32_t*)&(((uint32_t*)_in1)[9]), (uint32_t*)&(((uint32_t*)_in1)[10]), (uint32_t*)&(((uint32_t*)_in1)[26]), SLIM_IFPTR32((char**)&(((uint32_t*)_in1)[27]), (char**)&(((uint64_t*)_in1)[14])), SLIM_IFPTR32((uint32_t*)&(((uint32_t*)_in1)[28]), (uint32_t*)&(((uint32_t*)_in1)[30])))); + _TRY(_nErr, _skel_unpack(_al, (_praIn + 0), _ppraIn, (_praROut + 0), _ppraROut, _praHIn, _ppraHIn, _praHROut, _ppraHROut, ((char*)_primIn + 224), ((char*)_primROut + 0), (uint32_t*)&(((uint32_t*)_rout2)[0]), (uint32_t*)&(((uint32_t*)_rout2)[1]), (uint32_t*)&(((uint32_t*)_rout2)[5]), (uint32_t*)&(((uint32_t*)_rout2)[9]), (uint32_t*)&(((uint32_t*)_rout2)[10]), (uint32_t*)&(((uint32_t*)_rout2)[26]), SLIM_IFPTR32((char**)&(((uint32_t*)_rout2)[27]), (char**)&(((uint64_t*)_rout2)[14])), SLIM_IFPTR32((uint32_t*)&(((uint32_t*)_rout2)[28]), (uint32_t*)&(((uint32_t*)_rout2)[30])))); + _TRY(_nErr, _pfn(_h, (const dsptensor*)_in0, (const dsptensor*)_in1, (dsptensor*)_rout2)); + _TRY(_nErr, _skel_pack((_praROutPost + 0), _ppraROutPost, ((char*)_primROut + 0), (uint32_t*)&(((uint32_t*)_rout2)[0]), (uint32_t*)&(((uint32_t*)_rout2)[1]), (uint32_t*)&(((uint32_t*)_rout2)[5]), (uint32_t*)&(((uint32_t*)_rout2)[9]), (uint32_t*)&(((uint32_t*)_rout2)[10]), (uint32_t*)&(((uint32_t*)_rout2)[26]), SLIM_IFPTR32((char**)&(((uint32_t*)_rout2)[27]), (char**)&(((uint64_t*)_rout2)[14])), SLIM_IFPTR32((uint32_t*)&(((uint32_t*)_rout2)[28]), (uint32_t*)&(((uint32_t*)_rout2)[30])))); + _QAIC_CATCH(_nErr) {} + _allocator_deinit(_al); + return _nErr; +} +static __inline int _skel_method_1(int (*_pfn)(remote_handle64, int32, int32, int32, int32), remote_handle64 _h, uint32_t _sc, remote_arg* _pra) { + remote_arg* _praEnd = 0; + uint32_t _in0[1] = {0}; + uint32_t _in1[1] = {0}; + uint32_t _in2[1] = {0}; + uint32_t _in3[1] = {0}; + uint32_t* _primIn= 0; + int _nErr = 0; + _praEnd = ((_pra + REMOTE_SCALARS_INBUFS(_sc)) + REMOTE_SCALARS_OUTBUFS(_sc) + REMOTE_SCALARS_INHANDLES(_sc) + REMOTE_SCALARS_OUTHANDLES(_sc)); + _QAIC_ASSERT(_nErr, REMOTE_SCALARS_INBUFS(_sc)==1); + _QAIC_ASSERT(_nErr, REMOTE_SCALARS_OUTBUFS(_sc)==0); + _QAIC_ASSERT(_nErr, REMOTE_SCALARS_INHANDLES(_sc)==0); + _QAIC_ASSERT(_nErr, REMOTE_SCALARS_OUTHANDLES(_sc)==0); + 
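+    // The matching stub packs the four int32 arguments (power_level, latency,
+    // dcvs_enable, threads) at offsets 0/4/8/12 of the single input buffer, so
+    // at least 16 bytes must be present for the copies below.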
_QAIC_ASSERT(_nErr, (_pra + ((1 + 0) + (((0 + 0) + 0) + 0))) <= _praEnd);
+    _QAIC_ASSERT(_nErr, _pra[0].buf.nLen >= 16);
+    _primIn = _pra[0].buf.pv;
+    _COPY(_in0, 0, _primIn, 0, 4);
+    _COPY(_in1, 0, _primIn, 4, 4);
+    _COPY(_in2, 0, _primIn, 8, 4);
+    _COPY(_in3, 0, _primIn, 12, 4);
+    _TRY(_nErr, _pfn(_h, (int32)*_in0, (int32)*_in1, (int32)*_in2, (int32)*_in3));
+    _QAIC_CATCH(_nErr) {}
+    return _nErr;
+}
+static __inline int _skel_method_2(int (*_pfn)(remote_handle64), uint32_t _sc, remote_arg* _pra) {
+    remote_arg* _praEnd = 0;
+    remote_handle64 _in0[1] = {0};
+    remote_arg* _praRHandleIn = _pra + REMOTE_SCALARS_INBUFS(_sc) + REMOTE_SCALARS_OUTBUFS(_sc);
+    int _nErr = 0;
+    _praEnd = ((_pra + REMOTE_SCALARS_INBUFS(_sc)) + REMOTE_SCALARS_OUTBUFS(_sc) + REMOTE_SCALARS_INHANDLES(_sc) + REMOTE_SCALARS_OUTHANDLES(_sc));
+    _QAIC_ASSERT(_nErr, REMOTE_SCALARS_INBUFS(_sc)==0);
+    _QAIC_ASSERT(_nErr, REMOTE_SCALARS_OUTBUFS(_sc)==0);
+    _QAIC_ASSERT(_nErr, REMOTE_SCALARS_INHANDLES(_sc)==1);
+    _QAIC_ASSERT(_nErr, REMOTE_SCALARS_OUTHANDLES(_sc)==0);
+    _QAIC_ASSERT(_nErr, (_pra + ((0 + 0) + (((1 + 0) + 0) + 0))) <= _praEnd);
+    _COPY(_in0, 0, &(_praRHandleIn[0].h64), 0, sizeof(remote_handle64));
+    _TRY(_nErr, _pfn((remote_handle64)*_in0));
+    _QAIC_CATCH(_nErr) {}
+    return _nErr;
+}
+static __inline int _compare_versions(char* stub_ver, char* skel_ver, int* result) {
+    unsigned long int major_stub = 0, minor_stub = 0, patch_stub = 0;
+    unsigned long int major_skel = 0, minor_skel = 0, patch_skel = 0;
+    char *saveptr1 = NULL;
+    char *token1 = NULL;
+    char *saveptr2 = NULL;
+    char *token2 = NULL;
+    int i=0;
+    for (i=0, token1 = strtok_r(stub_ver, ".", &saveptr1); i<3 && token1 != NULL; i++, token1 = strtok_r(NULL, ".", &saveptr1))
+    {
+        unsigned long int tn = strtoul(token1, NULL,10);
+        if( tn > 999)
+        {
+            *result=-1;
+            return 0;
+        }
+        else
+        {
+            if(i==0) major_stub=tn;
+            if(i==1) minor_stub=tn;
+            if(i==2) patch_stub=tn;
+        }
+    }
+    for (i=0, token2 = strtok_r(skel_ver, ".", &saveptr2); i<3 && token2 != NULL; i++, token2 = strtok_r(NULL, ".", &saveptr2))
+    {
+        unsigned long int tn = strtoul(token2, NULL,10);
+        if( tn > 999)
+        {
+            *result=-1;
+            return 0;
+        }
+        else
+        {
+            if(i==0) major_skel=tn;
+            if(i==1) minor_skel=tn;
+            if(i==2) patch_skel=tn;
+        }
+    }
+    // the stub is compatible when its version is not newer than the skel's
+    if(major_stub<major_skel)
+    {
+        *result=1;
+        return 0;
+    }
+    if(major_stub==major_skel)
+    {
+        if(minor_stub<minor_skel)
+        {
+            *result=1;
+            return 0;
+        }
+        if((minor_stub==minor_skel)&&(patch_skel>=patch_stub))
+        {
+            *result=1;
+            return 0;
+        }
+    }
+    *result=-1;
+    return 0;
+}
+static __inline int _stub_skel_version_check(char*_in0, int* resVal) {
+    int _nErr = 0;
+    char* p = strstr(_in0, "_idlver=");
+    if(!p)
+    {
+        *resVal = -1;
+        return 0;
+    }
+    p+=8;
+    int i=0,len=0, comVer=0,num_delimit=0, updtInxStub=0, updtInxSkel=0;
+    for(i=0;i<strlen(p);i++)
+    {
+        if(num_delimit>2)
+        {
+            *resVal = -1;
+            return 0;
+        }
+        if ((p[i]>='0' && p[i]<='9') || (p[i]=='.'))
+        {
+            len++;
+            if(p[i]=='.')
+            {
+                num_delimit++;
+            }
+        }
+        else if(p[i]=='&')
+        {
+            break;
+        }
+        else
+        {
+            *resVal = -1;
+            return 0;
+        }
+    }
+    char* stubVer=(char*)MALLOC(len+1);
+    _QAIC_ASSERT(_nErr, stubVer!=NULL);
+    for(i=0;i<len;i++)
+    {
+        if((p[i]>='0' && p[i]<='9') || (p[i]=='.'))
+        {
+            stubVer[updtInxStub]=p[i];
+            updtInxStub++;
+        }
+        else if(p[i]=='&')
+        {
+            break;
+        }
+    }
+    stubVer[len]='\0';
+    char* skelVer=(char*)MALLOC(strlen(IDL_VERSION)+1);
+    _QAIC_ASSERT(_nErr, skelVer!=NULL);
+    for(i=0;i< strlen(IDL_VERSION);i++)
+    {
+        skelVer[updtInxSkel]=IDL_VERSION[i];
+        updtInxSkel++;
+    }
+    skelVer[strlen(IDL_VERSION)]='\0';
+    _TRY(_nErr, _compare_versions(stubVer, skelVer, &comVer));
+    *resVal = 0;
+    if (comVer==-1)
+    {
+        *resVal = -1;
+    }
+    FREE(stubVer);
+    FREE(skelVer);
+    _QAIC_CATCH(_nErr) {}
+    return 0;
+}
+static __inline int _skel_method_3(int
(*_pfn)(const char*, remote_handle64*), uint32_t _sc, remote_arg* _pra) { + remote_arg* _praEnd = 0; + char* _in0[1] = {0}; + uint32_t _in0Len[1] = {0}; + remote_handle64 _rout1[1] = {0}; + uint32_t* _primIn= 0; + remote_arg* _praRHandleROut = _pra + REMOTE_SCALARS_INBUFS(_sc) + REMOTE_SCALARS_OUTBUFS(_sc) + REMOTE_SCALARS_INHANDLES(_sc) ; + remote_arg* _praIn = 0; + int _nErr = 0; + _praEnd = ((_pra + REMOTE_SCALARS_INBUFS(_sc)) + REMOTE_SCALARS_OUTBUFS(_sc) + REMOTE_SCALARS_INHANDLES(_sc) + REMOTE_SCALARS_OUTHANDLES(_sc)); + _QAIC_ASSERT(_nErr, REMOTE_SCALARS_INBUFS(_sc)==2); + _QAIC_ASSERT(_nErr, REMOTE_SCALARS_OUTBUFS(_sc)==0); + _QAIC_ASSERT(_nErr, REMOTE_SCALARS_INHANDLES(_sc)==0); + _QAIC_ASSERT(_nErr, REMOTE_SCALARS_OUTHANDLES(_sc)==1); + _QAIC_ASSERT(_nErr, (_pra + ((2 + 0) + (((0 + 1) + 0) + 0))) <= _praEnd); + _QAIC_ASSERT(_nErr, _pra[0].buf.nLen >= 4); + _primIn = _pra[0].buf.pv; + _COPY(_in0Len, 0, _primIn, 0, 4); + _praIn = (_pra + 1); + _QAIC_ASSERT(_nErr, ((_praIn[0].buf.nLen / 1)) >= (size_t)(_in0Len[0])); + _in0[0] = _praIn[0].buf.pv; + _QAIC_ASSERT(_nErr, (_in0Len[0] > 0) && (_in0[0][(_in0Len[0] - 1)] == 0)); + int resVal; + _TRY(_nErr, _stub_skel_version_check(*_in0, &resVal)); + if(resVal==-1) + { + return AEE_ESTUBSKELVERMISMATCH; + } + _TRY(_nErr, _pfn((const char*)*_in0, (remote_handle64*)_rout1)); + _COPY(&(_praRHandleROut[0].h64), 0, _rout1, 0, sizeof(remote_handle64)); + _QAIC_CATCH(_nErr) {} + return _nErr; +} +__QAIC_SKEL_EXPORT int __QAIC_SKEL(ggmlop_skel_handle_invoke)(remote_handle64 _h, uint32_t _sc, remote_arg* _pra) __QAIC_SKEL_ATTRIBUTE { + switch(REMOTE_SCALARS_METHOD(_sc)){ + case 0: + return _skel_method_3(__QAIC_IMPL(ggmlop_dsp_open), _sc, _pra); + case 1: + return _skel_method_2(__QAIC_IMPL(ggmlop_dsp_close), _sc, _pra); + case 2: + return _skel_method_1(__QAIC_IMPL(ggmlop_dsp_setclocks), _h, _sc, _pra); + case 3: + return _skel_method(__QAIC_IMPL(ggmlop_dsp_add), _h, _sc, _pra); + case 4: + return _skel_method(__QAIC_IMPL(ggmlop_dsp_mulmat), _h, _sc, _pra); + case 5: + return _skel_method(__QAIC_IMPL(ggmlop_dsp_softmax), _h, _sc, _pra); + case 6: + return _skel_method(__QAIC_IMPL(ggmlop_dsp_rmsnorm), _h, _sc, _pra); + case 7: + return _skel_method(__QAIC_IMPL(ggmlop_dsp_pool2d), _h, _sc, _pra); + } + return AEE_EUNSUPPORTED; +} diff --git a/scripts/build-run-android.sh b/scripts/build-run-android.sh new file mode 100755 index 0000000000000..b686c4abf321f --- /dev/null +++ b/scripts/build-run-android.sh @@ -0,0 +1,431 @@ +#!/bin/bash +# build llama.cpp + ggml-hexagon for Snapdragon mobile SoC equipped Android phone on Linux + +set -e + +PWD=`pwd` +ANDROID_PLATFORM=android-34 +ANDROID_NDK=${PWD}/android-ndk-r26c +REMOTE_PATH=/data/local/tmp/ +GGUF_MODEL_NAME=/sdcard/gemma-3-4b-it-Q8_0.gguf +GGUF_MODEL_NAME=/sdcard/qwen1_5-1_8b-chat-q4_0.gguf + +#QNN SDK could be found at: +#https://www.qualcomm.com/developer/software/qualcomm-ai-engine-direct-sdk +#https://developer.qualcomm.com/software/hexagon-dsp-sdk/tools +QNN_SDK_URL=https://www.qualcomm.com/developer/software/qualcomm-ai-engine-direct-sdk +QNN_SDK_INSTALL_PATH=/opt/qcom/aistack/qairt/ +QNN_SDK_VERSION=2.32.0.250228 +QNN_SDK_VERSION=2.33.0.250327 +QNN_SDK_PATH=${QNN_SDK_INSTALL_PATH}/${QNN_SDK_VERSION} + +#5.5.3.0 should be also ok +HEXAGON_SDK_PATH=/opt/qcom/Hexagon_SDK/6.2.0.1 +#available htp arch version: +#v68 --- Snapdragon 888 +#v69 --- Snapdragon 8 Gen1 +#v73 --- Snapdragon 8 Gen2 +#v75 --- Snapdragon 8 Gen3 +#v79 --- Snapdragon 8 Elite(aka Gen4) +HTP_ARCH_VERSION=v75 
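+#HTP_ARCH_VERSION and HTP_ARCH_VERSION_a must stay in sync: update_qnn_libs
+#uses the uppercase form in the library names (libQnnHtp${HTP_ARCH_VERSION_a}Stub.so)
+#and the lowercase form in the lib directory (hexagon-${HTP_ARCH_VERSION}/unsigned)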
+#HTP_ARCH_VERSION_a=V75 + +HTP_ARCH_VERSION=v79 +HTP_ARCH_VERSION_a=V79 + +#running_params=" -mg 2 -ngl 99 " +#running_params=" -mg 2 -ngl 99 -t 8 -fa 1 " +running_params=" -mg 2 -ngl 99 -t 8 " + +function dump_vars() +{ + echo -e "ANDROID_NDK: ${ANDROID_NDK}" + echo -e "QNN_SDK_PATH: ${QNN_SDK_PATH}" + echo -e "HEXAGON_SDK_PATH: ${HEXAGON_SDK_PATH}" +} + + +function show_pwd() +{ + echo -e "current working path:$(pwd)\n" +} + + +function check_hexagon_sdk() +{ + if [ ! -d ${HEXAGON_SDK_PATH} ]; then + echo -e "HEXAGON_SDK_PATH ${HEXAGON_SDK_PATH} does not exist, please install it first...\n" + exit 1 + else + printf "Qualcomm Hexagon SDK already exists:${HEXAGON_SDK_PATH} \n\n" + fi +} + + +function check_and_download_qnn_sdk() +{ + is_qnn_sdk_exist=1 + + if [ ! -d ${QNN_SDK_PATH} ]; then + echo -e "QNN_SDK_PATH ${QNN_SDK_PATH} does not exist, downloading it from ${QNN_SDK_URL}...\n" + is_qnn_sdk_exist=0 + fi + + if [ ! -f ${QNN_SDK_PATH}/sdk.yaml ]; then + is_qnn_sdk_exist=0 + fi + + if [ ${is_qnn_sdk_exist} -eq 0 ]; then + echo "sudo mkdir -p ${QNN_SDK_INSTALL_PATH}" + sudo mkdir -p ${QNN_SDK_INSTALL_PATH} + if [ ! -f v${QNN_SDK_VERSION}.zip ]; then + wget --no-config --quiet --show-progress -O v${QNN_SDK_VERSION}.zip https://softwarecenter.qualcomm.com/api/download/software/sdks/Qualcomm_AI_Runtime_Community/All/${QNN_SDK_VERSION}/v${QNN_SDK_VERSION}.zip + fi + #set -e is active, so run unzip inside the condition instead of checking $? afterwards + if ! unzip v${QNN_SDK_VERSION}.zip; then + printf "failed to unzip Qualcomm QNN SDK to %s \n" "${QNN_SDK_PATH}" + exit 1 + fi + sudo mv qairt/${QNN_SDK_VERSION} ${QNN_SDK_INSTALL_PATH}/ + printf "Qualcomm QNN SDK saved to ${QNN_SDK_PATH} \n\n" + sudo rm -rf qairt + else + printf "Qualcomm QNN SDK already exists:${QNN_SDK_PATH} \n\n" + fi +} + + +function check_and_download_ndk() +{ + is_android_ndk_exist=1 + + if [ ! -d ${ANDROID_NDK} ]; then + is_android_ndk_exist=0 + fi + + if [ ! -f ${ANDROID_NDK}/build/cmake/android.toolchain.cmake ]; then + is_android_ndk_exist=0 + fi + + if [ ${is_android_ndk_exist} -eq 0 ]; then + + if [ ! -f android-ndk-r26c-linux.zip ]; then + wget --no-config --quiet --show-progress -O android-ndk-r26c-linux.zip https://dl.google.com/android/repository/android-ndk-r26c-linux.zip + fi + + #same pattern as above: check unzip directly because set -e is active + if ! unzip android-ndk-r26c-linux.zip; then + printf "failed to unzip android ndk to %s \n" "${ANDROID_NDK}" + exit 1 + fi + + printf "android ndk saved to ${ANDROID_NDK} \n\n" + else + printf "android ndk already exists:${ANDROID_NDK} \n\n" + fi +} + + +function build_arm64() +{ + cmake -H. -B./out/android -DCMAKE_BUILD_TYPE=Release -DGGML_OPENMP=OFF -DCMAKE_TOOLCHAIN_FILE=${ANDROID_NDK}/build/cmake/android.toolchain.cmake -DANDROID_ABI=arm64-v8a -DANDROID_PLATFORM=latest -DCMAKE_C_FLAGS=-march=armv8.7-a -DGGML_HEXAGON=ON -DLLAMA_CURL=OFF -DQNN_SDK_PATH=${QNN_SDK_PATH} -DHEXAGON_SDK_PATH=${HEXAGON_SDK_PATH} -DHTP_ARCH_VERSION=${HTP_ARCH_VERSION} + cd out/android + make -j16 + show_pwd + + cd - +} + +function build_arm64_debug() +{ + cmake -H.
-B./out/android -DCMAKE_BUILD_TYPE=Debug -DGGML_OPENMP=OFF -DCMAKE_TOOLCHAIN_FILE=${ANDROID_NDK}/build/cmake/android.toolchain.cmake -DANDROID_ABI=arm64-v8a -DANDROID_PLATFORM=latest -DCMAKE_C_FLAGS=-march=armv8.7-a -DGGML_HEXAGON=ON -DLLAMA_CURL=OFF -DQNN_SDK_PATH=${QNN_SDK_PATH} -DHEXAGON_SDK_PATH=${HEXAGON_SDK_PATH} -DHTP_ARCH_VERSION=${HTP_ARCH_VERSION} + cd out/android + make -j16 + show_pwd + + cd - +} + + +function remove_temp_dir() +{ + if [ -d out/android ]; then + echo "remove out/android directory in `pwd`" + rm -rf out/android + fi +} + + +function check_qnn_libs() +{ + #reuse the cached qnn libs on Android phone; run ls inside the condition because set -e is active + if adb shell ls ${REMOTE_PATH}/libQnnCpu.so ${REMOTE_PATH}/libQnnGpu.so ${REMOTE_PATH}/libQnnHtp.so; then + printf "QNN libs already exist on Android phone\n" + else + update_qnn_libs + fi + update_qnn_cfg +} + + +function update_qnn_libs() +{ + adb push ${QNN_SDK_PATH}/lib/aarch64-android/libQnnSystem.so ${REMOTE_PATH}/ + adb push ${QNN_SDK_PATH}/lib/aarch64-android/libQnnCpu.so ${REMOTE_PATH}/ + adb push ${QNN_SDK_PATH}/lib/aarch64-android/libQnnGpu.so ${REMOTE_PATH}/ + + adb push ${QNN_SDK_PATH}/lib/aarch64-android/libQnnHtp.so ${REMOTE_PATH}/ + adb push ${QNN_SDK_PATH}/lib/aarch64-android/libQnnHtpNetRunExtensions.so ${REMOTE_PATH}/ + adb push ${QNN_SDK_PATH}/lib/aarch64-android/libQnnHtpPrepare.so ${REMOTE_PATH}/ + adb push ${QNN_SDK_PATH}/lib/aarch64-android/libQnnHtp${HTP_ARCH_VERSION_a}Stub.so ${REMOTE_PATH}/ + adb push ${QNN_SDK_PATH}/lib/hexagon-${HTP_ARCH_VERSION}/unsigned/libQnnHtp${HTP_ARCH_VERSION_a}Skel.so ${REMOTE_PATH}/ +} + + +function update_qnn_cfg() +{ + adb push ./scripts/ggml-hexagon.cfg ${REMOTE_PATH}/ +} + + +function build_ggml_hexagon() +{ + show_pwd + check_and_download_ndk + check_and_download_qnn_sdk + check_hexagon_sdk + dump_vars + remove_temp_dir + build_arm64 +} + +function build_ggml_hexagon_debug() +{ + show_pwd + check_and_download_ndk + check_and_download_qnn_sdk + check_hexagon_sdk + dump_vars + remove_temp_dir + build_arm64_debug +} + + +function prepare_run_on_phone() +{ + if [ $# != 1 ]; then + echo "invalid param" + return + fi + program=$1 + + check_qnn_libs + + if [ -f ./out/android/bin/libggml-cpu.so ]; then + adb push ./out/android/bin/*.so ${REMOTE_PATH}/ + fi + adb push ./out/android/bin/${program} ${REMOTE_PATH}/ + adb shell chmod +x ${REMOTE_PATH}/${program} +} + +function run_llamacli() +{ + prepare_run_on_phone llama-cli + + adb shell "cd ${REMOTE_PATH} \ + && export LD_LIBRARY_PATH=${REMOTE_PATH} \ + && ${REMOTE_PATH}/llama-cli ${running_params} -no-cnv -m ${GGUF_MODEL_NAME} -p \"introduce the movie Once Upon a Time in America briefly.\n\"" + +} + + +function run_llamabench() +{ + prepare_run_on_phone llama-bench + + adb shell "cd ${REMOTE_PATH} \ + && export LD_LIBRARY_PATH=${REMOTE_PATH} \ + && ${REMOTE_PATH}/llama-bench ${running_params} -m ${GGUF_MODEL_NAME}" + +} + + +function run_test-ops() +{ + prepare_run_on_phone test-backend-ops + + adb shell "cd ${REMOTE_PATH} \ + && export LD_LIBRARY_PATH=${REMOTE_PATH} \ + && ${REMOTE_PATH}/test-backend-ops test" + +} + +function run_test-op() +{ + prepare_run_on_phone test-backend-ops + + echo "adb shell cd ${REMOTE_PATH} \ + && export LD_LIBRARY_PATH=${REMOTE_PATH} \ + && ${REMOTE_PATH}/test-backend-ops test -o $opname " + + echo -e "\n" + adb shell "cd ${REMOTE_PATH} \ + && export LD_LIBRARY_PATH=${REMOTE_PATH} \ + && ${REMOTE_PATH}/test-backend-ops test -o $opname " + +} + + +function print_oplist() +{ +oplist="DUP + ADD
+ ADD1 + ACC + SUB + MUL + DIV + SQR + SQRT + LOG + SIN + COS + SUM + SUM_ROWS + MEAN + ARGMAX + COUNT_EQUAL + REPEAT + REPEAT_BACK + CONCAT + SILU_BACK + NORM + RMS_NORM + RMS_NORM_BACK + GROUP_NORM + + MUL_MAT + MUL_MAT_ID + OUT_PROD + + SCALE + SET + CPY + CONT + RESHAPE + VIEW + PERMUTE + TRANSPOSE + GET_ROWS + GET_ROWS_BACK + DIAG + DIAG_MASK_INF + DIAG_MASK_ZERO + SOFT_MAX + SOFT_MAX_BACK + ROPE + ROPE_BACK + CLAMP + CONV_TRANSPOSE_1D + IM2COL + IM2COL_BACK + CONV_TRANSPOSE_2D + POOL_1D + POOL_2D + POOL_2D_BACK + UPSCALE + PAD + PAD_REFLECT_1D + ARANGE + TIMESTEP_EMBEDDING + ARGSORT + LEAKY_RELU + + FLASH_ATTN_EXT + FLASH_ATTN_BACK + SSM_CONV + SSM_SCAN + WIN_PART + WIN_UNPART + GET_REL_POS + ADD_REL_POS + RWKV_WKV6 + GATED_LINEAR_ATTN" + +echo "opname list: " +echo "${oplist}" +} + +function show_usage() +{ + echo "Usage:" + echo " $0 help" + echo " $0 print_oplist" + echo " $0 build" + echo " $0 build_debug (enable debug log for developers on ARM-AP side and cDSP side)" + echo " $0 updateqnnlib" + echo " $0 run_testops" + echo " $0 run_testop [ADD/MUL_MAT]" + echo " $0 run_llamacli" + echo " $0 run_llamabench" + + echo -e "\n\n\n" +} + + +show_pwd + +check_and_download_ndk +check_and_download_qnn_sdk +check_hexagon_sdk + +if [ $# == 0 ]; then + show_usage + exit 1 +elif [ $# == 1 ]; then + if [ "$1" == "-h" ]; then + show_usage + exit 0 + elif [ "$1" == "help" ]; then + show_usage + exit 0 + elif [ "$1" == "print_oplist" ]; then + print_oplist + exit 0 + elif [ "$1" == "build" ]; then + build_ggml_hexagon + exit 0 + elif [ "$1" == "build_debug" ]; then + build_ggml_hexagon_debug + exit 0 + elif [ "$1" == "run_testops" ]; then + run_test-ops + exit 0 + elif [ "$1" == "run_llamacli" ]; then + run_llamacli + exit 0 + elif [ "$1" == "run_llamabench" ]; then + run_llamabench + exit 0 + elif [ "$1" == "updateqnnlib" ]; then + update_qnn_libs + exit 0 + else + show_usage + exit 1 + fi +elif [ $# == 2 ]; then + opname=$2 +#TODO: check opname in oplist +#opname can be found via print_oplist: + + run_test-op + exit 0 +else + show_usage + exit 1 +fi diff --git a/scripts/ggml-hexagon.cfg b/scripts/ggml-hexagon.cfg new file mode 100644 index 0000000000000..6e5a37e03c520 --- /dev/null +++ b/scripts/ggml-hexagon.cfg @@ -0,0 +1,108 @@ +# +# Copyright (c) 2023-2025 The ggml authors +# +# Permission is hereby granted, free of charge, to any person obtaining a copy +# of this software and associated documentation files (the "Software"), to +# deal in the Software without restriction, including without limitation the +# rights to use, copy, modify, merge, publish, distribute, sublicense, and/or +# sell copies of the Software, and to permit persons to whom the Software is +# furnished to do so, subject to the following conditions: +# +# The above copyright notice and this permission notice shall be included in +# all copies or substantial portions of the Software. +# +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +# FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS +# IN THE SOFTWARE.
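+# +#note: scripts/build-run-android.sh above pushes this file to /data/local/tmp/ on the phone (see update_qnn_cfg), where the ggml-hexagon backend presumably reads it at runtime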
+# +# runtime configuration for ggml-hexagon backend +# +[general] +#version of ggml-hexagon.cpp on ARM-AP side +version = "1.04" +#version of ggml-dsp.c on cDSP side +ggmldsp_version = "0.61" + +#0: HEXAGON_BACKEND_QNNCPU +#1: HEXAGON_BACKEND_QNNGPU +#2: HEXAGON_BACKEND_QNNNPU / HEXAGON_BACKEND_CDSP +#3: default ggml backend +hexagon_backend = 2 +# 0: hwaccel approach through HWACCEL_QNN: offload ggml op to QNN +# 1: hwaccel approach through HWACCEL_QNN_SINGLEGRAPH: mapping entire ggml cgraph to a single QNN graph +# 2: hwaccel approach through HWACCEL_CDSP: offload ggml op to cDSP directly +hwaccel_approach = 2 +# +#attention: +# a. HWACCEL_QNN_SINGLEGRAPH is not supported at the moment; +# b. the following combinations are valid: +# 1: hwaccel_approach = 2 AND hexagon_backend = 2 (this is the default setting) +# 2: hwaccel_approach = 0 AND hexagon_backend = 2 (QNNNPU) +# 3: hwaccel_approach = 0 AND hexagon_backend = 1 (QNNGPU) +# 4: hwaccel_approach = 0 AND hexagon_backend = 0 (QNNCPU) +# 5: hwaccel_approach = 2 AND hexagon_backend = 3 +# 6: hwaccel_approach = 0 AND hexagon_backend = 3 +# +#generally speaking, we only need to focus on b-1 and b-2 in this PR. + + +#enable/disable offloading quantized-type mulmat +#quantized-type mulmat works fine through HWACCEL_QNN at the moment +#quantized-type mulmat doesn't work correctly through HWACCEL_CDSP at the moment +#this item makes mulmat performance comparison easy +enable_q_mulmat = 0 + + +# enable/disable printing tensor info in op functions +print_tensors_info = 0 +# enable/disable dumping op info in handle_op +dump_op_info = 0 + + +# enable/disable perf measurement of op functions +# enabled by default +enable_perf = 1 + + +# enable/disable the profiler feature to visually compare NPU performance between HWACCEL_CDSP and HWACCEL_QNN +# disabled by default +enable_profiler = 0 +#threshold duration of the NPU performance profiler, in seconds +profiler_duration = 5 +#threshold count of the NPU performance profiler +profiler_counts = 200 +#attention: +# NPU performance might be slower when enable_profiler = 1 because of the file I/O in this feature; +# ensure enable_perf = 1 when setting enable_profiler = 1; + + +#hwaccel approach through QNN(offload ggml op to QNN-NPU) +[qnn] +# enable/disable QNN SDK's internal log; very helpful for troubleshooting the HWACCEL_QNN approach +print_qnn_internal_log = 0 + +hvx_threads = 8 +vtcm_size_in_mb = 8 +enable_dlbc = 1 +precision_mode = "fp16" + + +#hwaccel approach through cDSP(offload ggml op to Hexagon cDSP directly) +[cdsp] +#enable/disable rpc ion memory pool +enable_rpc_ion_mempool = 1 + +#enable/disable offloading all quantized types of mulmat to cDSP +enable_all_q_mulmat = 0 +#attention: +#ensure enable_q_mulmat = 1 when setting enable_all_q_mulmat = 1 + +#multi-threading on cDSP side +# 0 or 1: disable multi-threading on cDSP side (single thread) +# 2-8: thread count on cDSP side +thread_counts = 1
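+ +#example (a sketch, not part of the original file): per "attention" item b-2 in [general] above, +#comparing direct cDSP offload against the QNN-NPU path only requires flipping two keys: +# hwaccel_approach = 0 +# hexagon_backend = 2 +#and then re-running, e.g., "./scripts/build-run-android.sh run_llamabench" on the host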