feat: run on win #24


Merged · 40 commits · Feb 24, 2025

Commits (40)

0601eeb
move qnn_instance function implementation into cpp
chraac Feb 6, 2025
792790f
wip
chraac Feb 6, 2025
8b736fd
wip
chraac Feb 6, 2025
c667db2
move dl related function into separated file
chraac Feb 7, 2025
05df736
use cast op for gpu
chraac Feb 7, 2025
2520e59
Revert "use cast op for gpu"
chraac Feb 8, 2025
f56519c
Reapply "use cast op for gpu"
chraac Feb 8, 2025
9b871c3
fix compiling error in win
chraac Feb 10, 2025
e5df438
fix align_alloc in win
chraac Feb 10, 2025
071865d
fix compiling error
chraac Feb 10, 2025
954a7b1
add get sys free/total mem for win
chraac Feb 10, 2025
5ff8ec9
wip
chraac Feb 10, 2025
1bfa4ac
suppress warning in win
chraac Feb 11, 2025
d50facf
Merge branch 'dev-refactoring' into dev-run-on-win
chraac Feb 12, 2025
c72178f
add missing chrono header
chraac Feb 12, 2025
a6b67b5
Merge branch 'dev-refactoring' into dev-run-on-win
chraac Feb 13, 2025
c39a84e
set the correct qnn lib name for windows
chraac Feb 14, 2025
5d96028
add flag to control cpu backend
chraac Feb 14, 2025
02263e9
wip
chraac Feb 14, 2025
03a6d59
wip
chraac Feb 14, 2025
49ada21
Revert "Reapply "use cast op for gpu""
chraac Feb 15, 2025
f38cf10
fix compiling error for linux build
chraac Feb 15, 2025
bb70a73
fix cdsprpc dynamic library name
chraac Feb 16, 2025
afd8926
wip
chraac Feb 17, 2025
16cb2d4
skip rpc load fail
chraac Feb 17, 2025
e107e4a
fix page_align_alloc
chraac Feb 17, 2025
a0ba1be
suppress some warning in gcc
chraac Feb 18, 2025
7bce5cc
wip
chraac Feb 18, 2025
f18020a
reuse align to function
chraac Feb 18, 2025
a39688e
more log
chraac Feb 18, 2025
9faa80d
add log and fix warning
chraac Feb 18, 2025
31ce4aa
wip
chraac Feb 18, 2025
8bc4215
fix asan errors and memory leaks
chraac Feb 19, 2025
868f594
fix the get_io_tensors_from_graph
chraac Feb 19, 2025
bf48a56
improve comment
chraac Feb 19, 2025
faf45ad
print GGML_QNN_DEFAULT_LIB_SEARCH_PATH
chraac Feb 19, 2025
3165121
revert some unused changes
chraac Feb 20, 2025
1ecca44
move library search path setter into qnn module
chraac Feb 20, 2025
63b35f9
fix android library loading
chraac Feb 20, 2025
98e5835
skip qnn_device_get_platform_info for npu emulator
chraac Feb 20, 2025
18 changes: 15 additions & 3 deletions ggml/src/ggml-qnn/CMakeLists.txt
@@ -4,12 +4,15 @@ if(CMAKE_SYSTEM_NAME STREQUAL "Android")
find_library(LOG_LIB log)
set(QNN_LINK_LIBRARIES ${LOG_LIB})
set(QNN_DEFAULT_LIB_SEARCH_PATH "/data/local/tmp/" CACHE STRING "customized library search path for QNN backend")
elseif(CMAKE_SYSTEM_NAME STREQUAL "Windows" OR CMAKE_SYSTEM_NAME STREQUAL "Linux")
set(QNN_DEFAULT_LIB_SEARCH_PATH "" CACHE STRING "customized library search path for QNN backend")
else()
message(FATAL_ERROR "QNN now only available on Android")
message(FATAL_ERROR "QNN now only available on Android, Windows and Linux")
endif()

if(NOT DEFINED GGML_QNN_SDK_PATH)
# try read from environment variable
# TODO: create a function to search for the SDK path
if(DEFINED ENV{QNN_SDK_PATH})
set(GGML_QNN_SDK_PATH $ENV{QNN_SDK_PATH})
else()
@@ -29,5 +32,14 @@ ggml_add_backend_library(ggml-qnn
target_include_directories(ggml-qnn PRIVATE ${GGML_QNN_SDK_PATH}/include/QNN ${CMAKE_CURRENT_LIST_DIR})
target_link_libraries(ggml-qnn PRIVATE ${QNN_LINK_LIBRARIES})

string(REGEX REPLACE "/$" "" GGML_QNN_DEFAULT_LIB_SEARCH_PATH "${QNN_DEFAULT_LIB_SEARCH_PATH}")
target_compile_definitions(ggml-qnn PRIVATE GGML_QNN_DEFAULT_LIB_SEARCH_PATH="${QNN_DEFAULT_LIB_SEARCH_PATH}/")
if(NOT "${QNN_DEFAULT_LIB_SEARCH_PATH}" STREQUAL "")
string(REGEX REPLACE "/$" "" QNN_DEFAULT_LIB_SEARCH_PATH "${QNN_DEFAULT_LIB_SEARCH_PATH}")
endif()

message("GGML_QNN_DEFAULT_LIB_SEARCH_PATH: ${QNN_DEFAULT_LIB_SEARCH_PATH}")
target_compile_definitions(ggml-qnn PRIVATE GGML_QNN_DEFAULT_LIB_SEARCH_PATH="${QNN_DEFAULT_LIB_SEARCH_PATH}")

if(GGML_QNN_ENABLE_CPU_BACKEND)
message("GGML_QNN_ENABLE_CPU_BACKEND is enabled")
target_compile_definitions(ggml-qnn PRIVATE GGML_QNN_ENABLE_CPU_BACKEND)
endif()
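
The GGML_QNN_DEFAULT_LIB_SEARCH_PATH definition above now carries the search path without a trailing slash, so any runtime code that builds a full library path has to add the separator itself. A minimal sketch of that joining step (the helper name and its use are hypothetical, not code from this PR):

```cpp
#include <string>

#ifndef GGML_QNN_DEFAULT_LIB_SEARCH_PATH
#    define GGML_QNN_DEFAULT_LIB_SEARCH_PATH ""
#endif

// Hypothetical helper: combine the compile-time search path with a
// platform-specific library name such as kQnnNpuLibName.
static std::string make_lib_path(const std::string &lib_name) {
    std::string base = GGML_QNN_DEFAULT_LIB_SEARCH_PATH;
    if (!base.empty() && base.back() != '/') {
        base += '/';  // the CMake regex strips a trailing slash, so re-add the separator here
    }
    return base + lib_name;  // e.g. "/data/local/tmp/libQnnHtp.so" on Android
}
```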
2 changes: 1 addition & 1 deletion ggml/src/ggml-qnn/backend-ops.cpp
@@ -389,7 +389,7 @@ bool ggml_qnn_supports_tensor(ggml_backend_qnn_device_context *ctx, const ggml_t
case GGML_TYPE_F16:
case GGML_TYPE_Q8_0:
case GGML_TYPE_Q4_0:
if (!(ctx->supported_types & (1 << tensor->type))) {
if (!(ctx->supported_types & (uint64_t(1) << tensor->type))) {
QNN_LOG_DEBUG("[%s]unsupported data type %s, supported_types: 0x%x", qnn::get_backend_name(ctx->device),
ggml_type_name(tensor->type), ctx->supported_types);
return false;
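
The cast in this hunk widens the shift to 64 bits: with a plain int literal, `1 << tensor->type` is undefined once a type id reaches 31 or more, so the mask silently breaks for higher type ids. A standalone sketch of the same idea (the helper names are illustrative, not the ggml-qnn API):

```cpp
#include <cstdint>

// Build a per-type bit in a 64-bit mask; safe for type ids in [0, 63],
// whereas a plain `1 << type` is only valid for ids below 31.
constexpr std::uint64_t type_bit(int type) {
    return std::uint64_t(1) << type;
}

// Check a type id against a 64-bit supported-types mask.
constexpr bool supports(std::uint64_t supported_types, int type) {
    return (supported_types & type_bit(type)) != 0;
}
```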
3 changes: 3 additions & 0 deletions ggml/src/ggml-qnn/buffer.hpp
@@ -133,11 +133,14 @@ class qnn_mem_buffer : public qnn_buffer_interface {
if (data) {
memcpy(_buffer, data, size);
}

QNN_LOG_DEBUG("alloc buffer: %p, size: %ld", _buffer, size);
}

explicit qnn_mem_buffer(size_t size) : qnn_mem_buffer(nullptr, size) {}

~qnn_mem_buffer() {
QNN_LOG_DEBUG("free buffer: %p, size: %ld", _buffer, _size);
// the free will do nothing if the _buffer is nullptr
qnn::align_free(_buffer);
}
71 changes: 71 additions & 0 deletions ggml/src/ggml-qnn/dl_loader.hpp
@@ -0,0 +1,71 @@
#pragma once

#ifdef __linux__
#include <dlfcn.h>
#include <fcntl.h>
#elif defined(_WIN32)
#define WIN32_LEAN_AND_MEAN
#ifndef NOMINMAX
#define NOMINMAX
#endif
#include <windows.h>
#endif

#include <string>

namespace qnn {

#ifdef __linux__
typedef void *dl_handler_t;

inline qnn::dl_handler_t dl_load(const std::string &lib_path) {
return dlopen(lib_path.c_str(), RTLD_NOW | RTLD_LOCAL);
}

inline void *dl_sym(qnn::dl_handler_t handle, const std::string &symbol) { return dlsym(handle, symbol.c_str()); }

inline bool dl_unload(qnn::dl_handler_t handle) { return dlclose(handle) == 0; }

inline const char *dl_error() { return dlerror(); }
#elif defined(_WIN32)
using dl_handler_t = HMODULE;

inline qnn::dl_handler_t dl_load(const std::string &lib_path) {
// suppress error dialogs for missing DLLs
auto old_mode = SetErrorMode(SEM_FAILCRITICALERRORS);
SetErrorMode(old_mode | SEM_FAILCRITICALERRORS);

auto handle = LoadLibraryA(lib_path.c_str()); // TODO: use wstring version for unicode paths

SetErrorMode(old_mode);
return handle;
}

inline void *dl_sym(qnn::dl_handler_t handle, const std::string &symbol) {
auto old_mode = SetErrorMode(SEM_FAILCRITICALERRORS);
SetErrorMode(old_mode | SEM_FAILCRITICALERRORS);

void *p = (void *)GetProcAddress(handle, symbol.c_str());

SetErrorMode(old_mode);
return p;
}

inline bool dl_unload(qnn::dl_handler_t handle) {
FreeLibrary(handle);
return true;
}

inline const char *dl_error() {
// TODO: implement dl_error for Windows
return nullptr;
}

#endif

template <typename Fn>
Fn dl_sym_typed(qnn::dl_handler_t handle, const std::string &function_name) {
return reinterpret_cast<Fn>(dl_sym(handle, function_name));
}

} // namespace qnn
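
A minimal usage sketch for the loader shim above; the library name, symbol, and function signature are placeholders for illustration, not real QNN entry points:

```cpp
#include <cstdio>

#include "dl_loader.hpp"

using compute_fn = int (*)(int);  // hypothetical signature of the loaded symbol

int main() {
    // On Windows this would be a DLL name such as "example.dll".
    auto handle = qnn::dl_load("libexample.so");
    if (!handle) {
        const char *err = qnn::dl_error();
        std::printf("dl_load failed: %s\n", err ? err : "(no error string on this platform)");
        return 1;
    }

    // Resolve and call a symbol through the typed wrapper.
    if (auto fn = qnn::dl_sym_typed<compute_fn>(handle, "example_compute")) {
        std::printf("result: %d\n", fn(42));
    }

    qnn::dl_unload(handle);
    return 0;
}
```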
72 changes: 19 additions & 53 deletions ggml/src/ggml-qnn/ggml-qnn.cpp
@@ -1,23 +1,7 @@
#include "ggml-qnn.h"

#include <cassert>
#include <chrono>
#include <condition_variable>
#include <fstream>
#include <functional>
#include <iostream>
#include <list>
#include <memory>
#include <mutex>
#include <queue>
#include <random>
#include <regex>
#include <set>
#include <sstream>
#include <thread>
#include <tuple>
#include <unordered_set>
#include <utility>
#include <vector>

#include "ggml-backend-impl.h"
@@ -44,6 +28,16 @@

namespace {

#ifdef _WIN32
constexpr const char *kQnnCpuLibName = "QnnCpu.dll";
constexpr const char *kQnnGpuLibName = "QnnGpu.dll";
constexpr const char *kQnnNpuLibName = "QnnHtp.dll";
#else
constexpr const char *kQnnCpuLibName = "libQnnCpu.so";
constexpr const char *kQnnGpuLibName = "libQnnGpu.so";
constexpr const char *kQnnNpuLibName = "libQnnHtp.so";
#endif

struct qnn_device_caps {
const char *name;
const char *description;
@@ -59,23 +53,23 @@ constexpr const qnn_device_caps kDeviceCaps[] = {
// https://docs.qualcomm.com/bundle/publicresource/topics/80-63442-50/CpuOpDefSupplement.html#matmul
"qnn-cpu",
"Qualcomm Kryo CPU",
"libQnnCpu.so",
kQnnCpuLibName,
GGML_BACKEND_DEVICE_TYPE_CPU,
(1 << GGML_TYPE_I8) | (1 << GGML_TYPE_F32),
},
{
// https://docs.qualcomm.com/bundle/publicresource/topics/80-63442-50/GpuOpDefSupplement.html#matmul
"qnn-gpu",
"Qualcomm Adreno GPU",
"libQnnGpu.so",
kQnnGpuLibName,
GGML_BACKEND_DEVICE_TYPE_GPU,
(1 << GGML_TYPE_F32) | (1 << GGML_TYPE_F16),
},
{
// https://docs.qualcomm.com/bundle/publicresource/topics/80-63442-50/HtpOpDefSupplement.html#matmul
"qnn-npu",
"Qualcomm NPU",
"libQnnHtp.so",
kQnnNpuLibName,
GGML_BACKEND_DEVICE_TYPE_ACCEL,
(1 << GGML_TYPE_F32) | (1 << GGML_TYPE_F16) | (1 << GGML_TYPE_I16) | (1 << GGML_TYPE_I8),
},
@@ -214,6 +208,8 @@ void ggml_backend_qnn_free(ggml_backend_t backend) {
instance->qnn_finalize();
instance.reset();
}

delete backend;
}

bool ggml_backend_qnn_cpy_tensor_async(ggml_backend_t backend_src, ggml_backend_t backend_dst, const ggml_tensor *src,
@@ -332,42 +328,10 @@ ggml_backend_t ggml_backend_qnn_init_with_device_context(ggml_backend_dev_t dev,
const auto device = dev_ctx->device;
QNN_LOG_DEBUG("device %s", qnn::get_backend_name(device));
QNN_LOG_DEBUG("extend_lib_search_path %s", extend_lib_search_path);
std::string path = extend_lib_search_path;

// TODO: Fix this for other platforms
#if defined(__ANDROID__) || defined(ANDROID)
if (device == QNN_BACKEND_NPU) {
if (setenv("LD_LIBRARY_PATH",
(path + ":/vendor/dsp/cdsp:/vendor/lib64:/vendor/dsp/"
"dsp:/vendor/dsp/images")
.c_str(),
1) == 0) {
QNN_LOG_DEBUG("QNN NPU backend setenv successfully");
} else {
QNN_LOG_ERROR("QNN NPU backend setenv failure");
}
if (setenv("ADSP_LIBRARY_PATH",
(path + ";/vendor/dsp/cdsp;/vendor/lib/rfsa/adsp;/system/lib/"
"rfsa/adsp;/vendor/dsp/dsp;/vendor/dsp/images;/dsp")
.c_str(),
1) == 0) {
QNN_LOG_DEBUG("QNN NPU backend setenv successfully");
} else {
QNN_LOG_ERROR("QNN NPU backend setenv failure");
}
} else {
if (setenv("LD_LIBRARY_PATH", path.c_str(), 1) == 0) {
QNN_LOG_DEBUG("%s backend setenv successfully", qnn::get_backend_name(device));
} else {
QNN_LOG_ERROR("%s backend setenv failure", qnn::get_backend_name(device));
}
}
#endif

auto instance = std::make_shared<qnn::qnn_instance>(path, dev_ctx->lib_name, "ggml");
auto instance = std::make_shared<qnn::qnn_instance>(extend_lib_search_path, dev_ctx->lib_name);
auto result = instance->qnn_init(nullptr);
if (result != 0) {
QNN_LOG_WARN("init qnn subsystem failed with qnn backend %s, pls check why", qnn::get_backend_name(device));
QNN_LOG_WARN("failed to init qnn backend %s", qnn::get_backend_name(device));
return nullptr;
}
auto qnn_interface = instance->get_qnn_interface();
@@ -466,13 +430,15 @@ struct ggml_backend_qnn_reg_impl : ggml_backend_reg {
QNN_LOG_DEBUG("qnn backend registry init");
for (size_t i = 0; i < QNN_BACKEND_COUNT; i++) {
const auto device_enum = (QNNBackend)(QNN_BACKEND_COUNT - 1 - i); // init from the last device, i.e. NPU
#ifndef GGML_QNN_ENABLE_CPU_BACKEND
if (device_enum == QNN_BACKEND_CPU) {
/*
* here we skip the initialization of CPU device,
* cause it'll block unsupported ops fallback to ggml cpu backend
*/
continue;
}
#endif

device_contexts.emplace_back(std::make_unique<ggml_backend_qnn_device_context>(
/* .device = */ device_enum, // init from the last device, i.e. NPU
73 changes: 57 additions & 16 deletions ggml/src/ggml-qnn/graph.cpp
@@ -1,7 +1,7 @@

#include "graph.hpp"

#include <set>
#include <algorithm>
#include <unordered_map>

#include "ggml-impl.h"
@@ -106,13 +29,29 @@ bool bind_src_tensors(ggml_tensor *op, qnn::qnn_tensor_array_t &tensor_wrappers,
return true;
}

/**
* @brief Extracts input and output tensors from a computational graph.
*
* This function identifies the input and output tensors of a computational graph by analyzing the connectivity between
* tensor nodes. It does this by iterating over each node in the graph, using a connectivity map that associates every
* tensor with its number of incoming connections (in_degree), outgoing connections (out_degree), and an insertion index
* that preserves order. The insertion index is used later to sort the tensors in their original discovery order.
*
* TODO: this algorithm is not perfect and may not work for all cases. It assumes that the tensors are
* connected in a way that allows for unambiguous categorization.
*/
int get_io_tensors_from_graph(const ggml_cgraph *cgraph, qnn::ggml_tensor_array_t &inputs,
qnn::ggml_tensor_array_t &outputs) {
using ggml_tensor_set_t = std::set<ggml_tensor *>;
struct _tensor_connectivity_info {
size_t in_degree = 0;
size_t out_degree = 0;
size_t insert_index = 0;
};

ggml_tensor_set_t input_set;
ggml_tensor_set_t output_set;
ggml_tensor_set_t visited_set;
using ggml_tensor_connectivity_map_t = std::unordered_map<ggml_tensor *, _tensor_connectivity_info>;

ggml_tensor_connectivity_map_t connectivity_map;
int rank = 0;
for (int i = 0; i < cgraph->n_nodes; i++) {
ggml_tensor *dst = cgraph->nodes[i];
@@ -126,25 +142,50 @@ int get_io_tensors_from_graph(const ggml_cgraph *cgraph, qnn::ggml_tensor_array_
}

rank = std::max(rank, ggml_n_dims(dst));
input_set.erase(dst);
if (!visited_set.count(dst)) {
output_set.insert(dst);
visited_set.insert(dst);
if (connectivity_map.count(dst) == 0) {
connectivity_map[dst] = {
1, // in-degree, at least 1
0,
connectivity_map.size(),
};
} else {
++(connectivity_map[dst].in_degree);
}

for (size_t i = 0; i < GGML_MAX_DIMS && dst->src[i]; ++i) {
auto *src = dst->src[i];
rank = std::max(rank, ggml_n_dims(src));
output_set.erase(src);
if (!visited_set.count(src)) {
input_set.insert(src);
visited_set.insert(src);

if (connectivity_map.count(src) == 0) {
connectivity_map[src] = {
0,
1, // out-degree, at least 1
connectivity_map.size(),
};
} else {
++(connectivity_map[src].out_degree);
}
}
}

inputs.assign(input_set.begin(), input_set.end());
outputs.assign(output_set.begin(), output_set.end());
for (const auto &kv : connectivity_map) {
if (kv.second.in_degree == 0) {
inputs.push_back(kv.first);
}

if (kv.second.out_degree == 0) {
outputs.push_back(kv.first);
}
}

std::sort(inputs.begin(), inputs.end(), [&connectivity_map](ggml_tensor *lhs, ggml_tensor *rhs) {
return connectivity_map[lhs].insert_index < connectivity_map[rhs].insert_index;
});

std::sort(outputs.begin(), outputs.end(), [&connectivity_map](ggml_tensor *lhs, ggml_tensor *rhs) {
return connectivity_map[lhs].insert_index < connectivity_map[rhs].insert_index;
});

return rank;
}

@@ -187,7 +228,7 @@ qnn_graph::qnn_graph(const std::string &graph_name, QNNBackend device, std::shar

QnnHtpGraph_CustomConfig_t vtcm_config;
vtcm_config.option = QNN_HTP_GRAPH_CONFIG_OPTION_VTCM_SIZE;
vtcm_config.vtcmSizeInMB = vtcm_size_in_mb;
vtcm_config.vtcmSizeInMB = (uint32_t)vtcm_size_in_mb;
QnnGraph_Config_t graph_vtcm_config;
graph_vtcm_config.option = QNN_GRAPH_CONFIG_OPTION_CUSTOM;
graph_vtcm_config.customConfig = &vtcm_config;
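
To make the in-degree/out-degree idea from the get_io_tensors_from_graph comment concrete, here is a standalone sketch on a toy graph. The node type below is a simplified stand-in for ggml_tensor, not the actual structure, and the real implementation additionally sorts the results by insertion index to preserve discovery order:

```cpp
#include <cstdio>
#include <string>
#include <unordered_map>
#include <vector>

struct node {
    std::string name;
    std::vector<node *> src;  // incoming edges (operands)
};

struct degree {
    size_t in = 0;   // how often the node is produced inside the graph
    size_t out = 0;  // how often the node is consumed inside the graph
};

int main() {
    // Toy graph: add(mul(a, b), b)
    node a{"a", {}}, b{"b", {}};
    node mul{"mul", {&a, &b}};
    node add{"add", {&mul, &b}};
    std::vector<node *> graph = {&mul, &add};

    std::unordered_map<node *, degree> deg;
    for (node *dst : graph) {
        ++deg[dst].in;  // dst is produced by this op
        for (node *src : dst->src) {
            ++deg[src].out;  // src is consumed by this op
        }
    }

    // Nodes never produced inside the graph are inputs (a, b);
    // nodes never consumed are outputs (add).
    for (const auto &kv : deg) {
        if (kv.second.in == 0) std::printf("input:  %s\n", kv.first->name.c_str());
        if (kv.second.out == 0) std::printf("output: %s\n", kv.first->name.c_str());
    }
    return 0;
}
```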