Commit 1e98561
ggml-qnn: rebase to upstream
1 parent 0a34837

File tree: 4 files changed, +87 -122 lines

  CMakeLists.txt
  ggml/src/ggml-qnn/ggml-qnn.cpp
  scripts/build-run-android.sh
  scripts/ggml-qnn.cfg

CMakeLists.txt (+12)

@@ -6,6 +6,17 @@ include(CheckIncludeFileCXX)
 set(CMAKE_WARN_UNUSED_CLI YES)
 
 set(CMAKE_EXPORT_COMPILE_COMMANDS ON)
+set(CMAKE_VERBOSE_MAKEFILE on)
+
+if(CMAKE_SYSTEM_NAME STREQUAL "Android")
+    set(TARGET_SNAPDRAGON8GEN3 ON)
+    if(TARGET_SNAPDRAGON8GEN3)
+        #works fine on Snapdragon 8Gen3 with 5x-10x(76.64 tokens per second) performance gain through the default ggml backend
+        add_definitions(-march=armv8.7-a)
+        add_definitions(-mcpu=cortex-x1)
+        add_definitions(-mtune=cortex-x1)
+    endif()
+endif()
 
 if (NOT XCODE AND NOT MSVC AND NOT CMAKE_BUILD_TYPE)
     set(CMAKE_BUILD_TYPE Release CACHE STRING "Build type" FORCE)

@@ -117,6 +128,7 @@ llama_option_depr(WARNING LLAMA_RPC GGML_RPC)
 llama_option_depr(WARNING LLAMA_SYCL GGML_SYCL)
 llama_option_depr(WARNING LLAMA_SYCL_F16 GGML_SYCL_F16)
 llama_option_depr(WARNING LLAMA_CANN GGML_CANN)
+llama_option_depr(WARNING LLAMA_QNN GGML_QNN)
 
 if (NOT MSVC)
     if (LLAMA_SANITIZE_THREAD)

ggml/src/ggml-qnn/ggml-qnn.cpp (+54 -55)
@@ -14,8 +14,8 @@
  * section-6 QNN helper function
  * section-7 ggml-qnn backend helper function / class
  * section-8 implementation of ggml-qnn backend according to ggml's backend subsystem
- * section-9 implementation of offload ggml op to QNN backend
- * section-10 illustrate why the second approach is actual an fake at the moment
+ * section-9 implementation of general approach or the first tech approach
+ * section-10 implementation of the second tech approach:mapping the entire ggml cgraph to a single QNN graph
  *
  * currently provide following ggml op' QNN backend implementation:
  * - GGML_OP_ADD: this is a simple skeleton, can expand other ggml ops according to expertise

@@ -186,7 +186,6 @@ static void ggml_qnn_diag_mask(ggml_backend_qnn_context * ctx, ggml_tensor * dst
 
 #define GGMLQNN_MEM_ADD(alignment) (sizeof (size_t) + alignment)
 #define GGMLQNN_MEM_MASK(alignment) ((uintptr_t)alignment - 1)
-#define TENSOR_DUMP(tensor) ggmlqnn_tensor_dump(tensor, #tensor)
 #define GQCGT ggmlqnn_create_general_tensor
 #define QNN_VER_PTR(x) (&((x).v1))
 #define RPCMEM_DEFAULT_FLAGS 1
@@ -260,7 +259,7 @@ using qnn_ptensors_t = std::vector< Qnn_Tensor_t *>;
 using qnn_singlenode_res_t = std::tuple<Qnn_GraphHandle_t, qnn_ptensors_t>;
 
 //QNN resource management for the second technical approach(mapping the entire cgraph to a single QNN graph)
-using qnn_tensors_t = std::vector< Qnn_Tensor_t * >;
+using qnn_tensors_t = std::vector< Qnn_Tensor_t >;
 using qnn_cgraph_node_t = std::tuple<std::string, Qnn_GraphHandle_t>;
 using qnn_cgraph_nodes_t = std::vector<qnn_cgraph_node_t>;
 using qnn_multinode_res_t = std::tuple<Qnn_GraphHandle_t, qnn_cgraph_nodes_t, qnn_ptensors_t, qnn_tensors_t, qnn_tensors_t>;
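
As a reading aid, a minimal sketch of how the five slots of qnn_multinode_res_t might be pulled apart with std::get inside this file. Only slot 0 (the graph handle) is confirmed by the code below; the interpretation of the other four slots is an assumption based on the type names above, and the helper name is hypothetical, not part of this commit.

// hypothetical helper, not part of this commit: inspect the cached multi-node resources
static void ggmlqnn_inspect_multinode_res(const qnn_multinode_res_t & res) {
    Qnn_GraphHandle_t graph_handle        = std::get<0>(res); // the single QNN graph for the whole cgraph
    const qnn_cgraph_nodes_t & nodes      = std::get<1>(res); // assumption: one (name, graph handle) entry per composed node
    const qnn_ptensors_t & graph_ptensors = std::get<2>(res); // assumption: QNN tensors owned by this graph
    const qnn_tensors_t & graph_inputs    = std::get<3>(res); // assumption: graph-level input tensors
    const qnn_tensors_t & graph_outputs   = std::get<4>(res); // assumption: graph-level output tensors
    GGMLQNN_LOG_DEBUG("graph %p: %zu nodes, %zu tensors, %zu inputs, %zu outputs",
                      graph_handle, nodes.size(), graph_ptensors.size(),
                      graph_inputs.size(), graph_outputs.size());
}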
@@ -325,11 +324,6 @@ struct ggml_backend_qnn_context {
     size_t work_size;
     size_t desired_size;
     int n_threads;
-
-#if 1//ndef NDEBUG
-    std::atomic_uint32_t supported_op_count = 0;
-    std::atomic_uint32_t unsupported_op_count = 0;
-#endif
 };
 
 struct qnn_op_caps {

@@ -370,8 +364,6 @@ static struct qnn_parameter g_qnn_params = {
 #if defined(__ANDROID__)
     //Android command line program
     .qnn_runtimelib_path = "/data/local/tmp/",
-    //Android KanTV standard APP
-    // .qnn_runtimelib_path = "/data/data/com.cdeos.kantv/qnnlib/",
 #elif defined(__linux__)
     .qnn_runtimelib_path = "/tmp/",
 #elif defined(_WIN32)

@@ -1066,46 +1058,6 @@ static void ggmlqnn_load_cfg() {
     }
 }
 
-static void ggmlqnn_tensor_dump_elements(const ggml_tensor * tensor) {
-    float value = 0;
-    std::ostringstream tmposs;
-    if (tensor->type == GGML_TYPE_F32) {
-        for (int h = 0; h < tensor->ne[3]; h++) {
-            for (int i = 0; i < tensor->ne[2]; i++) {
-                for (int j = 0; j < tensor->ne[1]; j++) {
-                    for (int k = 0; k < tensor->ne[0]; k++) {
-                        value = ((float *) tensor->data)[h * tensor->ne[2] + i * tensor->ne[1] +
-                                                         j * tensor->ne[0] + k];
-                        tmposs << std::setw(8) << std::fixed << std::setprecision(2) << value
-                               << " ";
-                    }
-                    if (strlen(tmposs.str().c_str()) <= (4096 - 96)) {
-                        GGMLQNN_LOG_DEBUG("%s\n", tmposs.str().c_str());
-                    }
-                    tmposs.clear();
-                    tmposs.str("");
-                }
-            }
-        }
-    }
-
-    GGMLQNN_LOG_DEBUG("\n");
-}
-
-
-static void ggmlqnn_tensor_dump(const ggml_tensor * tensor, const char * name) {
-    GGMLQNN_LOG_DEBUG("dump ggml tensor %s(%s)\n", name, tensor->name);
-    GGMLQNN_LOG_DEBUG("%15s: type = %i (%5s) ne = %5" PRIi64 " x %5" PRIi64 " x %5" PRIi64 " x %5" PRIi64", nb = (%5zi, %5zi, %5zi, %5zi)\n",
-                      name,
-                      tensor->type, ggml_type_name(tensor->type),
-                      tensor->ne[0], tensor->ne[1], tensor->ne[2], tensor->ne[3],
-                      tensor->nb[0], tensor->nb[1], tensor->nb[2], tensor->nb[2]);
-    ggmlqnn_tensor_dump_elements(tensor);
-
-    GGMLQNN_LOG_DEBUG("\n");
-}
-
-
 // =================================================================================================
 // section-6: QNN helper function
 // =================================================================================================

@@ -2698,7 +2650,7 @@ int qnn_instance::qnn_init(const QnnSaver_Config_t ** saver_config) {
     std::filesystem::path full_path(std::string(g_qnn_params.qnn_runtimelib_path) + "libcdsprpc.so");
     full_path /= std::filesystem::path("libcdsprpc.so").filename();
     _rpc_lib_handle = dlopen(full_path.string().c_str(), RTLD_NOW | RTLD_LOCAL);
-    if (!_rpc_lib_handle) {
+    if (nullptr == _rpc_lib_handle) {
        GGMLQNN_LOG_WARN("failed to load %s\n", full_path.c_str());
        _rpc_lib_handle = dlopen("libcdsprpc.so", RTLD_NOW | RTLD_LOCAL);
    }
@@ -5083,9 +5035,56 @@ void ggml_qnn_rope(ggml_backend_qnn_context * ctx, ggml_tensor * dst) {
 // =================================================================================================
 // details: https://github.com/ggml-org/llama.cpp/pull/12326#issuecomment-2712838649
 // ref: https://github.com/kantv-ai/kantv/blob/kantv-poc-with-qnn/core/ggml/jni/Inception_v3.cpp#L20634
-static enum ggml_status
-ggml_backend_qnn_graph_compute_special(ggml_backend_t backend, struct ggml_cgraph *cgraph) {
-    enum ggml_status ggml_result = GGML_STATUS_SUCCESS;
+static enum ggml_status ggml_backend_qnn_graph_compute_special(ggml_backend_t backend, struct ggml_cgraph * cgraph) {
+    enum ggml_status ggml_result = GGML_STATUS_SUCCESS;
+    Qnn_ErrorHandle_t qnn_error = QNN_SUCCESS;
+    qnn_perf op_perf = qnn_perf("ggml_backend_qnn_graph_compute_special");
+    qnn_instance * instance = nullptr;
+    Qnn_GraphHandle_t graph_handle = nullptr;
+    ggml_backend_qnn_context * ctx = (ggml_backend_qnn_context *) backend->context;
+    instance = ctx->instance;
+    QNN_INTERFACE_VER_TYPE qnn_raw_interface = ctx->raw_interface;
+    op_perf.start();
+
+    //now we got the entire ggml cgraph or a ggml cgraph which contains multiple nodes
+    GGMLQNN_LOG_DEBUG("qnn device %d(%s)", ctx->device, ggml_backend_qnn_get_devname(ctx->device));
+    GGMLQNN_LOG_DEBUG("cgraph->n_nodes %d", cgraph->n_nodes);
+    int num_nodes = std::min(5, cgraph->n_nodes);
+    //for (int i = 0; i < cgraph->n_nodes; i++) {
+    for (int i = 0; i < num_nodes; i++) {
+        ggml_tensor * node = cgraph->nodes[i];
+        GGMLQNN_LOG_DEBUG("%s: op %s (%s)\n", __func__, node->name, ggml_op_name(node->op));
+    }
+
+    //now we'll offload the ggml cgraph to a single QNN graph
+    std::string graph_name;
+    ggmlqnn_get_graphkey_from_cgraph(cgraph, graph_name);
+    if (graph_name == "")
+        return GGML_STATUS_SUCCESS;
+    if (ctx->qnn_multinode_graph_map.find(graph_name) != ctx->qnn_multinode_graph_map.end()) {
+        GGMLQNN_LOG_DEBUG("graph name %s already create", graph_name.c_str());
+        //retrieve computational resource from cached QNN graph
+        qnn_multinode_res_t &graph_res = ctx->qnn_multinode_graph_map[graph_name];
+        graph_handle = std::get<0>(graph_res);
+    } else {
+        //create QNN graph
+        GGMLQNN_LOG_INFO("graph name %s", graph_name.c_str());
+        qnn_error = instance->init_qnn_graph(graph_name, static_cast<QNNBackend>(ctx->device), 8, 4);
+        if (QNN_SUCCESS != qnn_error) {
+            GGMLQNN_LOG_WARN("can't create qnn graph handle with graph name %s, error = %d(%s)\n", graph_name.c_str(), qnn_error,
+                             ggmlqnn_get_qnnerror_string(qnn_error));
+            return ggml_result;
+        }
+        graph_handle = instance->get_qnn_graph_handle();
+        //TBD: compose a single opcfg QNN graph
+
+        //finalize QNN graph
+        CHECK_QNN_API(qnn_error, qnn_raw_interface.graphFinalize(graph_handle, nullptr, nullptr));
+
+        //TBD: cache QNN graph
+    }
+    //TBD: exec QNN graph
+    GGMLQNN_LOG_DEBUG("the second inference approach \"mapping cgraph to QNN graph\" will be seen in the future");
 
     return ggml_result;
 }
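
The execution step is still marked TBD above. As a rough, hedged illustration only (an assumption about where this is headed, not code from this commit), the cached graph-level input/output tensors from qnn_multinode_res_t could eventually be handed to QNN's graphExecute; graph_inputs and graph_outputs are hypothetical names, and the variables ctx, graph_name, graph_handle, qnn_error, and qnn_raw_interface mirror the function above:

    // hypothetical sketch, not part of this commit: run the finalized QNN graph
    // using the graph-level input/output tensors assumed to be cached in qnn_multinode_res_t
    qnn_multinode_res_t & graph_res = ctx->qnn_multinode_graph_map[graph_name];
    qnn_tensors_t & graph_inputs    = std::get<3>(graph_res); // assumption: cgraph inputs
    qnn_tensors_t & graph_outputs   = std::get<4>(graph_res); // assumption: cgraph outputs
    CHECK_QNN_API(qnn_error, qnn_raw_interface.graphExecute(graph_handle,
                              graph_inputs.data(),  static_cast<uint32_t>(graph_inputs.size()),
                              graph_outputs.data(), static_cast<uint32_t>(graph_outputs.size()),
                              nullptr, nullptr));

Note that this only works because this commit changes qnn_tensors_t to hold Qnn_Tensor_t by value, so .data() yields the contiguous Qnn_Tensor_t array that graphExecute expects.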

scripts/build-run-android.sh (+10 -66)
@@ -7,7 +7,6 @@ PWD=`pwd`
 ANDROID_PLATFORM=android-34
 ANDROID_NDK=${PWD}/android-ndk-r26c
 REMOTE_PATH=/data/local/tmp/
-GGUF_MODEL_NAME=/sdcard/deepseek-r1-distill-qwen-1.5b-q4_0.gguf
 GGUF_MODEL_NAME=/sdcard/qwen1_5-1_8b-chat-q4_0.gguf
 
 #QNN SDK could be found at:

@@ -18,8 +17,7 @@ QNN_SDK_INSTALL_PATH=/opt/qcom/aistack/qairt/
 QNN_SDK_VERSION=2.32.0.250228
 QNN_SDK_PATH=${QNN_SDK_INSTALL_PATH}/${QNN_SDK_VERSION}
 
-#default is QNN NPU
-qnnbackend=2
+qnnparams=" -mg 2 -ngl 99 "
 
 function dump_vars()
 {

@@ -188,7 +186,7 @@ function run_llamacli()
 
     adb shell "cd ${REMOTE_PATH} \
         && export LD_LIBRARY_PATH=${REMOTE_PATH} \
-        && ${REMOTE_PATH}/llama-cli -mg ${qnnbackend} -ngl 99 -no-cnv -m ${GGUF_MODEL_NAME} -p \"introduce the movie Once Upon a Time in America briefly.\n\""
+        && ${REMOTE_PATH}/llama-cli ${qnnparams} -no-cnv -m ${GGUF_MODEL_NAME} -p \"introduce the movie Once Upon a Time in America briefly.\n\""
 
 }
 

@@ -199,12 +197,11 @@ function run_llamabench()
 
     adb shell "cd ${REMOTE_PATH} \
         && export LD_LIBRARY_PATH=${REMOTE_PATH} \
-        && ${REMOTE_PATH}/llama-bench -mg ${qnnbackend} -m ${GGUF_MODEL_NAME}"
+        && ${REMOTE_PATH}/llama-bench ${qnnparams} -m ${GGUF_MODEL_NAME}"
 
 }
 
 
-#refer to:https://github.com/ggml-org/llama.cpp/pull/12155
 function run_test-ops()
 {
     prepare_run_on_phone test-backend-ops

@@ -215,37 +212,6 @@ function run_test-ops()
 
 }
 
-function run_test-op()
-{
-    prepare_run_on_phone test-backend-ops
-
-    qnnbackendname=qnn-cpu
-    case $qnnbackend in
-        0)
-            qnnbackendname=qnn-cpu
-            ;;
-        1)
-            qnnbackendname=qnn-gpu
-            ;;
-        2)
-            qnnbackendname=qnn-npu
-            ;;
-        *)
-            qnnbackendname=qnn-cpu
-            ;;
-    esac
-
-    #debug
-    echo "adb shell cd ${REMOTE_PATH} \
-        && export LD_LIBRARY_PATH=${REMOTE_PATH} \
-        && ${REMOTE_PATH}/test-backend-ops test -o $opname -b $qnnbackendname "
-
-    echo "\n"
-    adb shell "cd ${REMOTE_PATH} \
-        && export LD_LIBRARY_PATH=${REMOTE_PATH} \
-        && ${REMOTE_PATH}/test-backend-ops test -o $opname -b $qnnbackendname "
-
-}
 
 function print_oplist()
 {

@@ -335,9 +301,8 @@ function show_usage()
    echo " $0 build"
    echo " $0 updateqnnlib"
    echo " $0 run_testops"
-   echo " $0 run_testop [ADD/MUL/MUL_MAT......(op from print_oplist)] [0 (QNN_CPU) / 1 (QNN_GPU) / 2 (QNN_NPU)]"
-   echo " $0 run_llamacli 0 (QNN_CPU) / 1 (QNN_GPU) / 2 (QNN_NPU) / 3 (ggml)"
-   echo " $0 run_llamabench 0 (QNN_CPU) / 1 (QNN_GPU) / 2 (QNN_NPU) / 3 (ggml)"
+   echo " $0 run_llamacli"
+   echo " $0 run_llamabench"
 
    echo -e "\n\n\n"
 }

@@ -367,40 +332,19 @@ elif [ $# == 1 ]; then
    elif [ "$1" == "run_testops" ]; then
        run_test-ops
        exit 0
-
-   elif [ "$1" == "updateqnnlib" ]; then
-       update_qnn_libs
-       exit 0
-   else
-       show_usage
-       exit 1
-   fi
-elif [ $# == 2 ]; then
-   qnnbackend=$2
-   if [ ${qnnbackend} -gt 3 ]; then
-       show_usage
-       exit 1
-   fi
-
-   if [ "$1" == "run_llamacli" ]; then
+   elif [ "$1" == "run_llamacli" ]; then
        run_llamacli
        exit 0
    elif [ "$1" == "run_llamabench" ]; then
        run_llamabench
        exit 0
-   fi
-elif [ $# == 3 ]; then
-   opname=$2
-   #TODO: check opname in oplist
-   #opname can be found via print_oplist:
-
-   qnnbackend=$3
-   if [ ${qnnbackend} -gt 3 ]; then
+   elif [ "$1" == "updateqnnlib" ]; then
+       update_qnn_libs
+       exit 0
+   else
        show_usage
        exit 1
    fi
-   run_test-op
-   exit 0
 else
    show_usage
    exit 1

scripts/ggml-qnn.cfg (+11 -1)
@@ -1,9 +1,19 @@
 [general]
+#0: QNN-CPU backend
+#1: QNN-GPU backend
+#2: QNN-NPU(htp) backend
+#3: default ggml backend
+qnn_backend = 2
+
 # enable/disable QNN's internal log
 print_qnn_internal_log = 0
+
 # 0: general approach,similar to ggml-sycl or ggml-cann
 # 1: mapping entire ggml cgraph to QNN graph
 inference_approach = 0
 
 [npu]
-npu_inference_datatype = "fp16"
+hvx_threads = 4
+vtcm_size_in_mb = 8
+enable_dlbc = 1
+precision_mode = "fp16"
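
For quick reference, scripts/ggml-qnn.cfg as it should read after this hunk is applied (assembled from the diff above; the backend selection and inference approach now live in the config file instead of script arguments):

[general]
#0: QNN-CPU backend
#1: QNN-GPU backend
#2: QNN-NPU(htp) backend
#3: default ggml backend
qnn_backend = 2

# enable/disable QNN's internal log
print_qnn_internal_log = 0

# 0: general approach,similar to ggml-sycl or ggml-cann
# 1: mapping entire ggml cgraph to QNN graph
inference_approach = 0

[npu]
hvx_threads = 4
vtcm_size_in_mb = 8
enable_dlbc = 1
precision_mode = "fp16"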
