
Commit e4b0d8c

ggml-qnn: rebase to upstream
1 parent 0a34837 commit e4b0d8c

4 files changed: +92 -126 lines

CMakeLists.txt (+11)
@@ -7,6 +7,16 @@ set(CMAKE_WARN_UNUSED_CLI YES)
 
 set(CMAKE_EXPORT_COMPILE_COMMANDS ON)
 
+if(CMAKE_SYSTEM_NAME STREQUAL "Android")
+    set(TARGET_SNAPDRAGON8GEN3 ON)
+    if(TARGET_SNAPDRAGON8GEN3)
+        # works fine on Snapdragon 8 Gen 3, with a 5x-10x performance gain (76.64 tokens per second) through the default ggml backend
+        add_definitions(-march=armv8.7-a)
+        add_definitions(-mcpu=cortex-x1)
+        add_definitions(-mtune=cortex-x1)
+    endif()
+endif()
+
 if (NOT XCODE AND NOT MSVC AND NOT CMAKE_BUILD_TYPE)
     set(CMAKE_BUILD_TYPE Release CACHE STRING "Build type" FORCE)
     set_property(CACHE CMAKE_BUILD_TYPE PROPERTY STRINGS "Debug" "Release" "MinSizeRel" "RelWithDebInfo")
@@ -117,6 +127,7 @@ llama_option_depr(WARNING LLAMA_RPC GGML_RPC)
 llama_option_depr(WARNING LLAMA_SYCL GGML_SYCL)
 llama_option_depr(WARNING LLAMA_SYCL_F16 GGML_SYCL_F16)
 llama_option_depr(WARNING LLAMA_CANN GGML_CANN)
+llama_option_depr(WARNING LLAMA_QNN GGML_QNN)
 
 if (NOT MSVC)
     if (LLAMA_SANITIZE_THREAD)

ggml/src/ggml-qnn/ggml-qnn.cpp (+60 -59)
@@ -14,13 +14,16 @@
  * section-6 QNN helper function
  * section-7 ggml-qnn backend helper function / class
  * section-8 implementation of ggml-qnn backend according to ggml's backend subsystem
- * section-9 implementation of offload ggml op to QNN backend
- * section-10 illustrate why the second approach is actual an fake at the moment
+ * section-9 implementation of the general approach, i.e. the first tech approach
+ * section-10 implementation of the second tech approach: mapping the entire ggml cgraph to a single QNN graph
  *
  * currently provide following ggml op' QNN backend implementation:
- * - GGML_OP_ADD: this is a simple skeleton, can expand other ggml ops according to expertise
- * - GGML_OP_MUL: this is a simple skeleton, can expand other ggml ops according to expertise
- * - GGML_OP_MUL_MAT:this is a complicated skeleton, can expand other complex ggml ops accordingly
+ * - GGML_OP_ADD/GGML_OP_SUB/GGML_OP_MUL/GGML_OP_DIV:
+ *   a simple skeleton; other ggml ops can be expanded from it according to expertise
+ * - GGML_OP_LOG/GGML_OP_SQRT:
+ *   a simple skeleton; other ggml ops can be expanded from it according to expertise
+ * - GGML_OP_MUL_MAT:
+ *   a complicated skeleton; other complex ggml ops can be expanded from it accordingly
  *
  * Permission is hereby granted, free of charge, to any person obtaining a copy
  * of this software and associated documentation files (the "Software"), to
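Note: the "skeleton" wording above means each listed op is wired through a thin dispatch layer that maps a ggml op onto a QNN op type. A minimal illustrative sketch of that mapping follows; the helper name and the exact QNN op-type strings are assumptions for illustration, not the backend's actual table (the real backend keeps a qnn_op_caps table).

#include "ggml.h"

// hypothetical ggml-op -> QNN-op-type mapping, for illustration only
static const char * example_qnn_optype(enum ggml_op op) {
    switch (op) {
        case GGML_OP_ADD:     return "ElementWiseAdd";        // assumed QNN op type name
        case GGML_OP_SUB:     return "ElementWiseSubtract";   // assumed
        case GGML_OP_MUL:     return "ElementWiseMultiply";   // assumed
        case GGML_OP_DIV:     return "ElementWiseDivide";     // assumed
        case GGML_OP_LOG:     return "ElementWiseLog";        // assumed
        case GGML_OP_SQRT:    return "ElementWiseSquareRoot"; // assumed
        case GGML_OP_MUL_MAT: return "MatMul";                // assumed
        default:              return nullptr;                 // not offloaded by the skeleton
    }
}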
@@ -80,7 +83,6 @@
 #include <unordered_set>
 #include <utility>
 #include <future>
-#include <chrono>
 #if (defined __ANDROID__) || (defined ANDROID)
 #include "android/log.h"
 #endif
@@ -186,7 +188,6 @@ static void ggml_qnn_diag_mask(ggml_backend_qnn_context * ctx, ggml_tensor * dst
 
 #define GGMLQNN_MEM_ADD(alignment) (sizeof (size_t) + alignment)
 #define GGMLQNN_MEM_MASK(alignment) ((uintptr_t)alignment - 1)
-#define TENSOR_DUMP(tensor) ggmlqnn_tensor_dump(tensor, #tensor)
 #define GQCGT ggmlqnn_create_general_tensor
 #define QNN_VER_PTR(x) (&((x).v1))
 #define RPCMEM_DEFAULT_FLAGS 1
@@ -260,7 +261,7 @@ using qnn_ptensors_t = std::vector< Qnn_Tensor_t *>;
 using qnn_singlenode_res_t = std::tuple<Qnn_GraphHandle_t, qnn_ptensors_t>;
 
 //QNN resource management for the second technical approach(mapping the entire cgraph to a single QNN graph)
-using qnn_tensors_t = std::vector< Qnn_Tensor_t * >;
+using qnn_tensors_t = std::vector< Qnn_Tensor_t >;
 using qnn_cgraph_node_t = std::tuple<std::string, Qnn_GraphHandle_t>;
 using qnn_cgraph_nodes_t = std::vector<qnn_cgraph_node_t>;
 using qnn_multinode_res_t = std::tuple<Qnn_GraphHandle_t, qnn_cgraph_nodes_t, qnn_ptensors_t, qnn_tensors_t, qnn_tensors_t>;
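Note: qnn_multinode_res_t bundles everything the second approach needs per cached graph; the diff below only shows std::get<0> being used. A hedged sketch of a lookup helper, assuming the QNN SDK headers and the using-declarations above are in scope (the helper name is hypothetical):

#include <map>
#include <string>
#include <tuple>

// hypothetical lookup: return the cached graph handle, or nullptr if this
// cgraph has not been composed into a QNN graph yet
static Qnn_GraphHandle_t example_lookup_graph(std::map<std::string, qnn_multinode_res_t> & cache,
                                              const std::string & key) {
    auto it = cache.find(key);
    if (it == cache.end())
        return nullptr;
    return std::get<0>(it->second); // field 0 holds the Qnn_GraphHandle_t
}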
@@ -325,11 +326,6 @@ struct ggml_backend_qnn_context {
     size_t work_size;
     size_t desired_size;
     int n_threads;
-
-#if 1//ndef NDEBUG
-    std::atomic_uint32_t supported_op_count = 0;
-    std::atomic_uint32_t unsupported_op_count = 0;
-#endif
 };
 
 struct qnn_op_caps {
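Note: the deleted counters tallied how many ops the backend accepted versus rejected. If that diagnostic is ever wanted again outside the context struct, a minimal free-standing sketch (all names assumed, not part of the backend):

#include <atomic>
#include <cstdint>
#include <cstdio>

// hypothetical replacement for the removed per-context counters
static std::atomic_uint32_t g_supported_op_count{0};
static std::atomic_uint32_t g_unsupported_op_count{0};

static void example_tally_op(bool supported) {
    (supported ? g_supported_op_count : g_unsupported_op_count)++;
}

static void example_dump_tally() {
    fprintf(stderr, "supported %u / unsupported %u ops\n",
            g_supported_op_count.load(), g_unsupported_op_count.load());
}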
@@ -370,8 +366,6 @@ static struct qnn_parameter g_qnn_params = {
 #if defined(__ANDROID__)
     //Android command line program
     .qnn_runtimelib_path = "/data/local/tmp/",
-    //Android KanTV standard APP
-    // .qnn_runtimelib_path = "/data/data/com.cdeos.kantv/qnnlib/",
 #elif defined(__linux__)
     .qnn_runtimelib_path = "/tmp/",
 #elif defined(_WIN32)
@@ -1066,46 +1060,6 @@ static void ggmlqnn_load_cfg() {
     }
 }
 
-static void ggmlqnn_tensor_dump_elements(const ggml_tensor * tensor) {
-    float value = 0;
-    std::ostringstream tmposs;
-    if (tensor->type == GGML_TYPE_F32) {
-        for (int h = 0; h < tensor->ne[3]; h++) {
-            for (int i = 0; i < tensor->ne[2]; i++) {
-                for (int j = 0; j < tensor->ne[1]; j++) {
-                    for (int k = 0; k < tensor->ne[0]; k++) {
-                        value = ((float *) tensor->data)[h * tensor->ne[2] + i * tensor->ne[1] +
-                                                         j * tensor->ne[0] + k];
-                        tmposs << std::setw(8) << std::fixed << std::setprecision(2) << value
-                               << " ";
-                    }
-                    if (strlen(tmposs.str().c_str()) <= (4096 - 96)) {
-                        GGMLQNN_LOG_DEBUG("%s\n", tmposs.str().c_str());
-                    }
-                    tmposs.clear();
-                    tmposs.str("");
-                }
-            }
-        }
-    }
-
-    GGMLQNN_LOG_DEBUG("\n");
-}
-
-
-static void ggmlqnn_tensor_dump(const ggml_tensor * tensor, const char * name) {
-    GGMLQNN_LOG_DEBUG("dump ggml tensor %s(%s)\n", name, tensor->name);
-    GGMLQNN_LOG_DEBUG("%15s: type = %i (%5s) ne = %5" PRIi64 " x %5" PRIi64 " x %5" PRIi64 " x %5" PRIi64", nb = (%5zi, %5zi, %5zi, %5zi)\n",
-                      name,
-                      tensor->type, ggml_type_name(tensor->type),
-                      tensor->ne[0], tensor->ne[1], tensor->ne[2], tensor->ne[3],
-                      tensor->nb[0], tensor->nb[1], tensor->nb[2], tensor->nb[2]);
-    ggmlqnn_tensor_dump_elements(tensor);
-
-    GGMLQNN_LOG_DEBUG("\n");
-}
-
 // =================================================================================================
 // section-6: QNN helper function
 // =================================================================================================
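Note: two latent bugs leave with this helper: it computed flat element indices as h * ne[2] + i * ne[1] + j * ne[0] + k, which is only correct for degenerate shapes, and the header line printed nb[2] twice instead of nb[3]. If a dump helper is reintroduced later, a stride-correct sketch for F32 tensors (helper name assumed) would look like:

#include <cstdint>
#include <cstdio>
#include "ggml.h"

// walk the tensor through its byte strides nb[], so non-contiguous views dump correctly
static void example_dump_f32(const ggml_tensor * t) {
    if (t->type != GGML_TYPE_F32)
        return;
    for (int64_t i3 = 0; i3 < t->ne[3]; i3++)
        for (int64_t i2 = 0; i2 < t->ne[2]; i2++)
            for (int64_t i1 = 0; i1 < t->ne[1]; i1++) {
                for (int64_t i0 = 0; i0 < t->ne[0]; i0++) {
                    // nb[] are byte strides, so index through a char pointer
                    const char * p = (const char *) t->data
                                     + i3 * t->nb[3] + i2 * t->nb[2]
                                     + i1 * t->nb[1] + i0 * t->nb[0];
                    printf("%8.2f ", *(const float *) p);
                }
                printf("\n");
            }
}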
@@ -2698,7 +2652,7 @@ int qnn_instance::qnn_init(const QnnSaver_Config_t ** saver_config) {
     std::filesystem::path full_path(std::string(g_qnn_params.qnn_runtimelib_path) + "libcdsprpc.so");
     full_path /= std::filesystem::path("libcdsprpc.so").filename();
     _rpc_lib_handle = dlopen(full_path.string().c_str(), RTLD_NOW | RTLD_LOCAL);
-    if (!_rpc_lib_handle) {
+    if (nullptr == _rpc_lib_handle) {
         GGMLQNN_LOG_WARN("failed to load %s\n", full_path.c_str());
         _rpc_lib_handle = dlopen("libcdsprpc.so", RTLD_NOW | RTLD_LOCAL);
     }
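Note: the functional content here is unchanged; only the style of the null check differs. For context, the surrounding logic is the standard POSIX dlopen try-specific-path-then-fallback pattern, sketched self-contained below (function name assumed):

#include <dlfcn.h>
#include <cstdio>

// try the configured runtime directory first, then the default linker search path
static void * example_load_rpc_lib(const char * full_path) {
    void * handle = dlopen(full_path, RTLD_NOW | RTLD_LOCAL);
    if (nullptr == handle) {
        fprintf(stderr, "failed to load %s: %s\n", full_path, dlerror());
        handle = dlopen("libcdsprpc.so", RTLD_NOW | RTLD_LOCAL);
    }
    return handle;
}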
@@ -5083,9 +5037,56 @@ void ggml_qnn_rope(ggml_backend_qnn_context * ctx, ggml_tensor * dst) {
 // =================================================================================================
 // details: https://github.com/ggml-org/llama.cpp/pull/12326#issuecomment-2712838649
 // ref: https://github.com/kantv-ai/kantv/blob/kantv-poc-with-qnn/core/ggml/jni/Inception_v3.cpp#L20634
-static enum ggml_status
-ggml_backend_qnn_graph_compute_special(ggml_backend_t backend, struct ggml_cgraph *cgraph) {
-    enum ggml_status ggml_result = GGML_STATUS_SUCCESS;
+static enum ggml_status ggml_backend_qnn_graph_compute_special(ggml_backend_t backend, struct ggml_cgraph * cgraph) {
+    enum ggml_status ggml_result = GGML_STATUS_SUCCESS;
+    Qnn_ErrorHandle_t qnn_error = QNN_SUCCESS;
+    qnn_perf op_perf = qnn_perf("ggml_backend_qnn_graph_compute_special");
+    qnn_instance * instance = nullptr;
+    Qnn_GraphHandle_t graph_handle = nullptr;
+    ggml_backend_qnn_context * ctx = (ggml_backend_qnn_context *) backend->context;
+    instance = ctx->instance;
+    QNN_INTERFACE_VER_TYPE qnn_raw_interface = ctx->raw_interface;
+    op_perf.start();
+
+    // at this point we have the entire ggml cgraph, or at least a cgraph that contains multiple nodes
+    GGMLQNN_LOG_DEBUG("qnn device %d(%s)", ctx->device, ggml_backend_qnn_get_devname(ctx->device));
+    GGMLQNN_LOG_DEBUG("cgraph->n_nodes %d", cgraph->n_nodes);
+    int num_nodes = std::min(5, cgraph->n_nodes);
+    //for (int i = 0; i < cgraph->n_nodes; i++) {
+    for (int i = 0; i < num_nodes; i++) {
+        ggml_tensor * node = cgraph->nodes[i];
+        GGMLQNN_LOG_DEBUG("%s: op %s (%s)\n", __func__, node->name, ggml_op_name(node->op));
+    }
+
+    // offload the ggml cgraph to a single QNN graph
+    std::string graph_name;
+    ggmlqnn_get_graphkey_from_cgraph(cgraph, graph_name);
+    if (graph_name == "")
+        return GGML_STATUS_SUCCESS;
+    if (ctx->qnn_multinode_graph_map.find(graph_name) != ctx->qnn_multinode_graph_map.end()) {
+        GGMLQNN_LOG_DEBUG("graph name %s already created", graph_name.c_str());
+        // retrieve computational resources from the cached QNN graph
+        qnn_multinode_res_t & graph_res = ctx->qnn_multinode_graph_map[graph_name];
+        graph_handle = std::get<0>(graph_res);
+    } else {
+        // create a new QNN graph
+        GGMLQNN_LOG_INFO("graph name %s", graph_name.c_str());
+        qnn_error = instance->init_qnn_graph(graph_name, static_cast<QNNBackend>(ctx->device), 8, 4);
+        if (QNN_SUCCESS != qnn_error) {
+            GGMLQNN_LOG_WARN("can't create qnn graph handle with graph name %s, error = %d(%s)\n",
+                             graph_name.c_str(), qnn_error, ggmlqnn_get_qnnerror_string(qnn_error));
+            return ggml_result;
+        }
+        graph_handle = instance->get_qnn_graph_handle();
+        //TBD: compose the op configs into a single QNN graph
+
+        // finalize the QNN graph
+        CHECK_QNN_API(qnn_error, qnn_raw_interface.graphFinalize(graph_handle, nullptr, nullptr));
+
+        //TBD: cache the QNN graph
+    }
+    //TBD: execute the QNN graph
+    GGMLQNN_LOG_DEBUG("the second inference approach, \"mapping cgraph to QNN graph\", is still a work in progress");
 
     return ggml_result;
 }
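Note: the cache above is keyed by whatever ggmlqnn_get_graphkey_from_cgraph produces; its body is not part of this diff. A plausible sketch of such a key builder, derived only from how the key is used here (an assumption, not the actual implementation):

#include <string>
#include "ggml.h"

// concatenate op names and shapes so that structurally identical cgraphs
// map to the same cached QNN graph
static void example_graphkey_from_cgraph(const struct ggml_cgraph * cgraph, std::string & key) {
    key.clear();
    for (int i = 0; i < cgraph->n_nodes; i++) {
        const struct ggml_tensor * node = cgraph->nodes[i];
        key += ggml_op_name(node->op);
        for (int d = 0; d < GGML_MAX_DIMS; d++) {
            key += "_";
            key += std::to_string(node->ne[d]);
        }
    }
}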

scripts/build-run-android.sh (+10 -66)
@@ -7,7 +7,6 @@ PWD=`pwd`
 ANDROID_PLATFORM=android-34
 ANDROID_NDK=${PWD}/android-ndk-r26c
 REMOTE_PATH=/data/local/tmp/
-GGUF_MODEL_NAME=/sdcard/deepseek-r1-distill-qwen-1.5b-q4_0.gguf
 GGUF_MODEL_NAME=/sdcard/qwen1_5-1_8b-chat-q4_0.gguf
 
 #QNN SDK could be found at:
#QNN SDK could be found at:
@@ -18,8 +17,7 @@ QNN_SDK_INSTALL_PATH=/opt/qcom/aistack/qairt/
1817
QNN_SDK_VERSION=2.32.0.250228
1918
QNN_SDK_PATH=${QNN_SDK_INSTALL_PATH}/${QNN_SDK_VERSION}
2019

21-
#default is QNN NPU
22-
qnnbackend=2
20+
qnnparams=" -mg 2 -ngl 99 "
2321

2422
function dump_vars()
2523
{
@@ -188,7 +186,7 @@ function run_llamacli()
 
     adb shell "cd ${REMOTE_PATH} \
         && export LD_LIBRARY_PATH=${REMOTE_PATH} \
-        && ${REMOTE_PATH}/llama-cli -mg ${qnnbackend} -ngl 99 -no-cnv -m ${GGUF_MODEL_NAME} -p \"introduce the movie Once Upon a Time in America briefly.\n\""
+        && ${REMOTE_PATH}/llama-cli ${qnnparams} -no-cnv -m ${GGUF_MODEL_NAME} -p \"introduce the movie Once Upon a Time in America briefly.\n\""
 
 }
 
@@ -199,12 +197,11 @@ function run_llamabench()
 
     adb shell "cd ${REMOTE_PATH} \
         && export LD_LIBRARY_PATH=${REMOTE_PATH} \
-        && ${REMOTE_PATH}/llama-bench -mg ${qnnbackend} -m ${GGUF_MODEL_NAME}"
+        && ${REMOTE_PATH}/llama-bench ${qnnparams} -m ${GGUF_MODEL_NAME}"
 
 }
 
 
-#refer to:https://github.com/ggml-org/llama.cpp/pull/12155
 function run_test-ops()
 {
     prepare_run_on_phone test-backend-ops
@@ -215,37 +212,6 @@ function run_test-ops()
 
 }
 
-function run_test-op()
-{
-    prepare_run_on_phone test-backend-ops
-
-    qnnbackendname=qnn-cpu
-    case $qnnbackend in
-        0)
-            qnnbackendname=qnn-cpu
-            ;;
-        1)
-            qnnbackendname=qnn-gpu
-            ;;
-        2)
-            qnnbackendname=qnn-npu
-            ;;
-        *)
-            qnnbackendname=qnn-cpu
-            ;;
-    esac
-
-    #debug
-    echo "adb shell cd ${REMOTE_PATH} \
-        && export LD_LIBRARY_PATH=${REMOTE_PATH} \
-        && ${REMOTE_PATH}/test-backend-ops test -o $opname -b $qnnbackendname "
-
-    echo "\n"
-    adb shell "cd ${REMOTE_PATH} \
-        && export LD_LIBRARY_PATH=${REMOTE_PATH} \
-        && ${REMOTE_PATH}/test-backend-ops test -o $opname -b $qnnbackendname "
-
-}
 
 function print_oplist()
 {
@@ -335,9 +301,8 @@ function show_usage()
     echo "  $0 build"
     echo "  $0 updateqnnlib"
     echo "  $0 run_testops"
-    echo "  $0 run_testop [ADD/MUL/MUL_MAT......(op from print_oplist)] [0 (QNN_CPU) / 1 (QNN_GPU) / 2 (QNN_NPU)]"
-    echo "  $0 run_llamacli 0 (QNN_CPU) / 1 (QNN_GPU) / 2 (QNN_NPU) / 3 (ggml)"
-    echo "  $0 run_llamabench 0 (QNN_CPU) / 1 (QNN_GPU) / 2 (QNN_NPU) / 3 (ggml)"
+    echo "  $0 run_llamacli"
+    echo "  $0 run_llamabench"
 
     echo -e "\n\n\n"
 }
@@ -367,40 +332,19 @@ elif [ $# == 1 ]; then
     elif [ "$1" == "run_testops" ]; then
        run_test-ops
        exit 0
-
-    elif [ "$1" == "updateqnnlib" ]; then
-        update_qnn_libs
-        exit 0
-    else
-        show_usage
-        exit 1
-    fi
-elif [ $# == 2 ]; then
-    qnnbackend=$2
-    if [ ${qnnbackend} -gt 3 ]; then
-        show_usage
-        exit 1
-    fi
-
-    if [ "$1" == "run_llamacli" ]; then
+    elif [ "$1" == "run_llamacli" ]; then
        run_llamacli
        exit 0
     elif [ "$1" == "run_llamabench" ]; then
        run_llamabench
        exit 0
-    fi
-elif [ $# == 3 ]; then
-    opname=$2
-#TODO: check opname in oplist
-#opname can be found via print_oplist:
-
-    qnnbackend=$3
-    if [ ${qnnbackend} -gt 3 ]; then
+    elif [ "$1" == "updateqnnlib" ]; then
+        update_qnn_libs
+        exit 0
+    else
        show_usage
        exit 1
     fi
-    run_test-op
-    exit 0
 else
    show_usage
    exit 1
scripts/ggml-qnn.cfg (+11 -1)
@@ -1,9 +1,19 @@
 [general]
+#0: QNN-CPU backend
+#1: QNN-GPU backend
+#2: QNN-NPU(htp) backend
+#3: default ggml backend
+qnn_backend = 2
+
 # enable/disable QNN's internal log
 print_qnn_internal_log = 0
+
 # 0: general approach,similar to ggml-sycl or ggml-cann
 # 1: mapping entire ggml cgraph to QNN graph
 inference_approach = 0
 
 [npu]
-npu_inference_datatype = "fp16"
+hvx_threads = 4
+vtcm_size_in_mb = 8
+enable_dlbc = 1
+precision_mode = "fp16"
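Note: the renamed [npu] keys line up with QNN HTP performance knobs (HVX thread count, VTCM size, DLBC, precision). A hedged sketch of the in-memory settings the loader might fill from this file; the struct name and layout are assumptions mirroring the values above, while the actual parsing lives in ggmlqnn_load_cfg in ggml-qnn.cpp:

#include <string>

// assumed in-memory mirror of ggml-qnn.cfg, defaults taken from the file above
struct example_qnn_settings {
    int qnn_backend = 2;                 // [general] 0/1/2 = QNN CPU/GPU/NPU, 3 = default ggml backend
    int print_qnn_internal_log = 0;      // [general]
    int inference_approach = 0;          // [general] 0 = general approach, 1 = cgraph -> single QNN graph
    int hvx_threads = 4;                 // [npu]
    int vtcm_size_in_mb = 8;             // [npu]
    int enable_dlbc = 1;                 // [npu]
    std::string precision_mode = "fp16"; // [npu]
};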
