|
14 | 14 | * section-6 QNN helper function
|
15 | 15 | * section-7 ggml-qnn backend helper function / class
|
16 | 16 | * section-8 implementation of ggml-qnn backend according to ggml's backend subsystem
|
17 |
| - * section-9 implementation of offload ggml op to QNN backend |
18 |
| - * section-10 illustrate why the second approach is actual an fake at the moment |
| 17 | + * section-9 implementation of the general approach (the first tech approach) |
| 18 | + * section-10 implementation of the second tech approach: mapping the entire ggml cgraph to a single QNN graph |
19 | 19 | *
|
20 | 20 | * currently provide following ggml op' QNN backend implementation:
|
21 |
| - * - GGML_OP_ADD: this is a simple skeleton, can expand other ggml ops according to expertise |
22 |
| - * - GGML_OP_MUL: this is a simple skeleton, can expand other ggml ops according to expertise |
23 |
| - * - GGML_OP_MUL_MAT:this is a complicated skeleton, can expand other complex ggml ops accordingly |
| 21 | + * - GGML_OP_ADD/GGML_OP_SUB/GGML_OP_MUL/GGML_OP_DIV: |
| 22 | + * this is a simple skeleton, can expand other ggml ops according to expertise |
| 23 | + * - GGML_OP_LOG/GGML_OP_SQRT: |
| 24 | + * this is a simple skeleton, can expand other ggml ops according to expertise |
| 25 | + * - GGML_OP_MUL_MAT: |
| 26 | + * this is a complicated skeleton, can expand other complex ggml ops accordingly |
24 | 27 | *
|
25 | 28 | * Permission is hereby granted, free of charge, to any person obtaining a copy
|
26 | 29 | * of this software and associated documentation files (the "Software"), to
|
|
80 | 83 | #include <unordered_set>
|
81 | 84 | #include <utility>
|
82 | 85 | #include <future>
|
83 |
| -#include <chrono> |
84 | 86 | #if (defined __ANDROID__) || (defined ANDROID)
|
85 | 87 | #include "android/log.h"
|
86 | 88 | #endif
|
@@ -186,7 +188,6 @@ static void ggml_qnn_diag_mask(ggml_backend_qnn_context * ctx, ggml_tensor * dst
|
186 | 188 |
|
187 | 189 | #define GGMLQNN_MEM_ADD(alignment) (sizeof (size_t) + alignment)
|
188 | 190 | #define GGMLQNN_MEM_MASK(alignment) ((uintptr_t)alignment - 1)
|
189 |
| -#define TENSOR_DUMP(tensor) ggmlqnn_tensor_dump(tensor, #tensor) |
190 | 191 | #define GQCGT ggmlqnn_create_general_tensor
|
191 | 192 | #define QNN_VER_PTR(x) (&((x).v1))
|
192 | 193 | #define RPCMEM_DEFAULT_FLAGS 1
|
@@ -260,7 +261,7 @@ using qnn_ptensors_t = std::vector< Qnn_Tensor_t *>;
|
260 | 261 | using qnn_singlenode_res_t = std::tuple<Qnn_GraphHandle_t, qnn_ptensors_t>;
|
261 | 262 |
|
262 | 263 | //QNN resource management for the second technical approach(mapping the entire cgraph to a single QNN graph)
|
263 |
| -using qnn_tensors_t = std::vector< Qnn_Tensor_t * >; |
| 264 | +using qnn_tensors_t = std::vector< Qnn_Tensor_t >; |
264 | 265 | using qnn_cgraph_node_t = std::tuple<std::string, Qnn_GraphHandle_t>;
|
265 | 266 | using qnn_cgraph_nodes_t = std::vector<qnn_cgraph_node_t>;
|
266 | 267 | using qnn_multinode_res_t = std::tuple<Qnn_GraphHandle_t, qnn_cgraph_nodes_t, qnn_ptensors_t, qnn_tensors_t, qnn_tensors_t>;
|
@@ -325,11 +326,6 @@ struct ggml_backend_qnn_context {
|
325 | 326 | size_t work_size;
|
326 | 327 | size_t desired_size;
|
327 | 328 | int n_threads;
|
328 |
| - |
329 |
| -#if 1//ndef NDEBUG |
330 |
| - std::atomic_uint32_t supported_op_count = 0; |
331 |
| - std::atomic_uint32_t unsupported_op_count = 0; |
332 |
| -#endif |
333 | 329 | };
|
334 | 330 |
|
335 | 331 | struct qnn_op_caps {
|
@@ -370,8 +366,6 @@ static struct qnn_parameter g_qnn_params = {
|
370 | 366 | #if defined(__ANDROID__)
|
371 | 367 | //Android command line program
|
372 | 368 | .qnn_runtimelib_path = "/data/local/tmp/",
|
373 |
| -//Android KanTV standard APP |
374 |
| -// .qnn_runtimelib_path = "/data/data/com.cdeos.kantv/qnnlib/", |
375 | 369 | #elif defined(__linux__)
|
376 | 370 | .qnn_runtimelib_path = "/tmp/",
|
377 | 371 | #elif defined(_WIN32)
|
@@ -1066,46 +1060,6 @@ static void ggmlqnn_load_cfg() {
|
1066 | 1060 | }
|
1067 | 1061 | }
|
1068 | 1062 |
|
1069 |
| -static void ggmlqnn_tensor_dump_elements(const ggml_tensor * tensor) { |
1070 |
| - float value = 0; |
1071 |
| - std::ostringstream tmposs; |
1072 |
| - if (tensor->type == GGML_TYPE_F32) { |
1073 |
| - for (int h = 0; h < tensor->ne[3]; h++) { |
1074 |
| - for (int i = 0; i < tensor->ne[2]; i++) { |
1075 |
| - for (int j = 0; j < tensor->ne[1]; j++) { |
1076 |
| - for (int k = 0; k < tensor->ne[0]; k++) { |
1077 |
| - value = ((float *) tensor->data)[h * tensor->ne[2] + i * tensor->ne[1] + |
1078 |
| - j * tensor->ne[0] + k]; |
1079 |
| - tmposs << std::setw(8) << std::fixed << std::setprecision(2) << value |
1080 |
| - << " "; |
1081 |
| - } |
1082 |
| - if (strlen(tmposs.str().c_str()) <= (4096 - 96)) { |
1083 |
| - GGMLQNN_LOG_DEBUG("%s\n", tmposs.str().c_str()); |
1084 |
| - } |
1085 |
| - tmposs.clear(); |
1086 |
| - tmposs.str(""); |
1087 |
| - } |
1088 |
| - } |
1089 |
| - } |
1090 |
| - } |
1091 |
| - |
1092 |
| - GGMLQNN_LOG_DEBUG("\n"); |
1093 |
| -} |
1094 |
| - |
1095 |
| - |
1096 |
| -static void ggmlqnn_tensor_dump(const ggml_tensor * tensor, const char * name) { |
1097 |
| - GGMLQNN_LOG_DEBUG("dump ggml tensor %s(%s)\n", name, tensor->name); |
1098 |
| - GGMLQNN_LOG_DEBUG("%15s: type = %i (%5s) ne = %5" PRIi64 " x %5" PRIi64 " x %5" PRIi64 " x %5" PRIi64", nb = (%5zi, %5zi, %5zi, %5zi)\n", |
1099 |
| - name, |
1100 |
| - tensor->type, ggml_type_name(tensor->type), |
1101 |
| - tensor->ne[0], tensor->ne[1], tensor->ne[2], tensor->ne[3], |
1102 |
| - tensor->nb[0], tensor->nb[1], tensor->nb[2], tensor->nb[2]); |
1103 |
| - ggmlqnn_tensor_dump_elements(tensor); |
1104 |
| - |
1105 |
| - GGMLQNN_LOG_DEBUG("\n"); |
1106 |
| -} |
1107 |
| - |
1108 |
| - |
1109 | 1063 | // =================================================================================================
|
1110 | 1064 | // section-6: QNN helper function
|
1111 | 1065 | // =================================================================================================
|
@@ -2698,7 +2652,7 @@ int qnn_instance::qnn_init(const QnnSaver_Config_t ** saver_config) {
|
2698 | 2652 | std::filesystem::path full_path(std::string(g_qnn_params.qnn_runtimelib_path) + "libcdsprpc.so");
|
2699 | 2653 | full_path /= std::filesystem::path("libcdsprpc.so").filename();
|
2700 | 2654 | _rpc_lib_handle = dlopen(full_path.string().c_str(), RTLD_NOW | RTLD_LOCAL);
|
2701 |
| - if (!_rpc_lib_handle) { |
| 2655 | + if (nullptr == _rpc_lib_handle) { |
2702 | 2656 | GGMLQNN_LOG_WARN("failed to load %s\n", full_path.c_str());
|
2703 | 2657 | _rpc_lib_handle = dlopen("libcdsprpc.so", RTLD_NOW | RTLD_LOCAL);
|
2704 | 2658 | }
|
@@ -5083,9 +5037,56 @@ void ggml_qnn_rope(ggml_backend_qnn_context * ctx, ggml_tensor * dst) {
|
5083 | 5037 | // =================================================================================================
|
5084 | 5038 | // details: https://github.com/ggml-org/llama.cpp/pull/12326#issuecomment-2712838649
|
5085 | 5039 | // ref: https://github.com/kantv-ai/kantv/blob/kantv-poc-with-qnn/core/ggml/jni/Inception_v3.cpp#L20634
|
5086 |
| -static enum ggml_status |
5087 |
| -ggml_backend_qnn_graph_compute_special(ggml_backend_t backend, struct ggml_cgraph *cgraph) { |
5088 |
| - enum ggml_status ggml_result = GGML_STATUS_SUCCESS; |
| 5040 | +static enum ggml_status ggml_backend_qnn_graph_compute_special(ggml_backend_t backend, struct ggml_cgraph * cgraph) { |
| 5041 | + enum ggml_status ggml_result = GGML_STATUS_SUCCESS; |
| 5042 | + Qnn_ErrorHandle_t qnn_error = QNN_SUCCESS; |
| 5043 | + qnn_perf op_perf = qnn_perf("ggml_backend_qnn_graph_compute_special"); |
| 5044 | + qnn_instance * instance = nullptr; |
| 5045 | + Qnn_GraphHandle_t graph_handle = nullptr; |
| 5046 | + ggml_backend_qnn_context * ctx = (ggml_backend_qnn_context *) backend->context; |
| 5047 | + instance = ctx->instance; |
| 5048 | + QNN_INTERFACE_VER_TYPE qnn_raw_interface = ctx->raw_interface; |
| 5049 | + op_perf.start(); |
| 5050 | + |
| 5051 | + //now we have the entire ggml cgraph, or a ggml cgraph which contains multiple nodes |
| 5052 | + GGMLQNN_LOG_DEBUG("qnn device %d(%s)", ctx->device, ggml_backend_qnn_get_devname(ctx->device)); |
| 5053 | + GGMLQNN_LOG_DEBUG("cgraph->n_nodes %d", cgraph->n_nodes); |
| 5054 | + int num_nodes = std::min(5, cgraph->n_nodes); |
| 5055 | + //for (int i = 0; i < cgraph->n_nodes; i++) { |
| 5056 | + for (int i = 0; i < num_nodes; i++) { |
| 5057 | + ggml_tensor * node = cgraph->nodes[i]; |
| 5058 | + GGMLQNN_LOG_DEBUG("%s: op %s (%s)\n", __func__, node->name, ggml_op_name(node->op)); |
| 5059 | + } |
| 5060 | + |
| 5061 | + //now we'll offload the ggml cgraph to a single QNN graph |
| 5062 | + std::string graph_name; |
| 5063 | + ggmlqnn_get_graphkey_from_cgraph(cgraph, graph_name); |
| 5064 | + if (graph_name == "") |
| 5065 | + return GGML_STATUS_SUCCESS; |
| 5066 | + if (ctx->qnn_multinode_graph_map.find(graph_name) != ctx->qnn_multinode_graph_map.end()) { |
| 5067 | + GGMLQNN_LOG_DEBUG("graph name %s already create", graph_name.c_str()); |
| 5068 | + //retrieve computational resource from cached QNN graph |
| 5069 | + qnn_multinode_res_t &graph_res = ctx->qnn_multinode_graph_map[graph_name]; |
| 5070 | + graph_handle = std::get<0>(graph_res); |
| 5071 | + } else { |
| 5072 | + //create QNN graph |
| 5073 | + GGMLQNN_LOG_INFO("graph name %s", graph_name.c_str()); |
| 5074 | + qnn_error = instance->init_qnn_graph(graph_name, static_cast<QNNBackend>(ctx->device), 8, 4); |
| 5075 | + if (QNN_SUCCESS != qnn_error) { |
| 5076 | + GGMLQNN_LOG_WARN("can't create qnn graph handle with graph name %s, error = %d(%s)\n", graph_name.c_str(), qnn_error, |
| 5077 | + ggmlqnn_get_qnnerror_string(qnn_error)); |
| 5078 | + return ggml_result; |
| 5079 | + } |
| 5080 | + graph_handle = instance->get_qnn_graph_handle(); |
| 5081 | + //TBD: compose a single opcfg QNN graph |
| 5082 | + |
| 5083 | + //finalize QNN graph |
| 5084 | + CHECK_QNN_API(qnn_error, qnn_raw_interface.graphFinalize(graph_handle, nullptr, nullptr)); |
| 5085 | + |
| 5086 | + //TBD: cache QNN graph |
| 5087 | + } |
| 5088 | + //TBD: exec QNN graph |
| 5089 | + GGMLQNN_LOG_DEBUG("the second inference approach \"mapping cgraph to QNN graph\" will be seen in the future"); |
5089 | 5090 |
|
5090 | 5091 | return ggml_result;
|
5091 | 5092 | }
|
0 commit comments