
feat: perf opt part4 #43

Merged
merged 50 commits on May 27, 2025

Commits (50)
00890e5
wip
chraac May 16, 2025
d2d35e4
refactor: rewrite dequantize_row_q4_0 by intrinsic
chraac May 16, 2025
5149473
log for debug
chraac May 17, 2025
58e8bcf
fix q4 intrinsic
chraac May 19, 2025
f6f2277
small opt
chraac May 19, 2025
8f8906f
wip
chraac May 19, 2025
688b3c6
wip
chraac May 19, 2025
3fc25f2
add vtcm_quota_size
chraac May 20, 2025
e6229ec
add perf log for hexagon-npu backend
chraac May 20, 2025
7f47761
wip
chraac May 20, 2025
9001c4f
add log
chraac May 20, 2025
416ce0d
sync after a specific op
chraac May 21, 2025
9cdfef7
increase worker thread priority
chraac May 21, 2025
137a3cc
fix unbalanced thread slice
chraac May 21, 2025
e58e921
small slice to fit in vtcm cache
chraac May 21, 2025
00ee0cc
limit the supported row element size
chraac May 21, 2025
035749f
opt 4_0 dequant
chraac May 22, 2025
4d4d239
fix q4 dequant
chraac May 22, 2025
142ab93
add power_utils
chraac May 22, 2025
2415498
add rms_norm
chraac May 22, 2025
c5749a6
wip
chraac May 23, 2025
96d2c18
enable rms_norm f32
chraac May 23, 2025
35df13a
fix rms_norm with param
chraac May 23, 2025
2b76030
fix compiling flags
chraac May 23, 2025
bd70076
use float
chraac May 23, 2025
84d5e0e
fix small row size
chraac May 23, 2025
bb949a3
vectorized rms norm
chraac May 23, 2025
68a6d76
wip
chraac May 23, 2025
1d94db3
read 2 vectors
chraac May 23, 2025
edfb963
rename
chraac May 23, 2025
83160d3
add perf log on update
chraac May 24, 2025
1fce9b1
set empty tensors handle also
chraac May 24, 2025
897bd47
merge some rpc functions
chraac May 24, 2025
bf62a51
opt param update
chraac May 24, 2025
4bacb3f
wip
chraac May 24, 2025
d65475d
print more log
chraac May 24, 2025
4acf419
add struct for update param config
chraac May 24, 2025
a42a9d9
add npu_device_graph_set_tensor_with_param
chraac May 24, 2025
8352d47
merge tensor and params update
chraac May 24, 2025
2caea38
wip
chraac May 25, 2025
1ee710b
wip
chraac May 25, 2025
8c027ba
make as template to reuse
chraac May 25, 2025
57252c0
vectorize dequantize_row_q8_0
chraac May 26, 2025
fe08014
opt
chraac May 26, 2025
8409dd1
avoid using union to store q data
chraac May 26, 2025
435caa2
wip
chraac May 26, 2025
fafe1ad
wip
chraac May 26, 2025
411097e
Merge branch 'dev-refactoring' into dev-perf-opt-part4
chraac May 27, 2025
d8bd368
wip
chraac May 27, 2025
99fc1e3
Merge branch 'dev-refactoring' into dev-perf-opt-part4
chraac May 27, 2025
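
Several of the commits above (rewrite dequantize_row_q4_0 by intrinsic, opt 4_0 dequant, vectorize dequantize_row_q8_0) vectorize block dequantization with HVX intrinsics. For reference, a minimal scalar sketch of Q4_0 dequantization is shown below; it follows ggml's documented Q4_0 block layout and the f16-to-f32 lookup table passed through graph::compute(), and is not the backend's HVX implementation.

```cpp
#include <cstddef>
#include <cstdint>

// Reference (scalar) Q4_0 dequantization, shown only to illustrate what the
// intrinsic commits vectorize. Block layout follows ggml's Q4_0:
// 32 4-bit quants plus one fp16 scale per block.
struct block_q4_0 {
    uint16_t d;       // scale, stored as raw IEEE fp16 bits
    uint8_t  qs[16];  // 32 nibbles, two quants per byte
};

void dequantize_row_q4_0_ref(const block_q4_0 * x, float * y, size_t k,
                             const float * f16_to_f32_table /* 65536-entry LUT */) {
    const size_t nb = k / 32;  // number of blocks in the row
    for (size_t i = 0; i < nb; ++i) {
        const float d = f16_to_f32_table[x[i].d];
        for (size_t j = 0; j < 16; ++j) {
            y[i * 32 + j]      = ((x[i].qs[j] & 0x0F) - 8) * d;  // low nibble
            y[i * 32 + j + 16] = ((x[i].qs[j] >> 4) - 8) * d;    // high nibble
        }
    }
}
```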
7 changes: 6 additions & 1 deletion ggml/src/ggml-qnn/npu/CMakeLists.txt
@@ -231,6 +231,11 @@ else()

build_idl(idl/hexagon_npu.idl hexagon_npu_skel_OBJS)

add_subdirectory(${HEXAGON_SDK_ROOT}/libs/qprintf qprintf_dir)
target_include_directories(hexagon_npu_skel_OBJS PUBLIC
${HEXAGON_SDK_ROOT}/libs/qprintf/inc/
)

# disable warnings for the skel
set_source_files_properties(
${skel_srcs}
@@ -239,12 +244,12 @@
)

add_library(hexagon_npu_skel SHARED $<TARGET_OBJECTS:hexagon_npu_skel_OBJS>)

target_link_libraries(hexagon_npu_skel
${HEXAGON_LIB_DIR}/${HEXAGON_ARCH}/G0/pic/libc++abi.a
${HEXAGON_LIB_DIR}/${HEXAGON_ARCH}/G0/pic/libc++.a
)
set_target_properties(hexagon_npu_skel PROPERTIES OUTPUT_NAME "hexagon_npu_skel_${HEXAGON_ARCH}")
target_link_libraries(hexagon_npu_skel qprintf_static)

copy_binaries(hexagon_npu_skel)
endif()
52 changes: 35 additions & 17 deletions ggml/src/ggml-qnn/npu/device/device.cpp
@@ -106,6 +106,7 @@ int npu_device_open(const char * uri, remote_handle64 * h) {
}

*h = reinterpret_cast<remote_handle64>(context);
DEVICE_LOG_INFO("NPU device context created: %p", (void *) *h);
return AEE_SUCCESS;
}

Expand All @@ -117,6 +118,7 @@ int npu_device_close(remote_handle64 h) {
}

delete context;
DEVICE_LOG_INFO("NPU device context destroyed: %p", (void *) h);
return AEE_SUCCESS;
}

@@ -130,6 +132,12 @@ AEEResult npu_device_device_support_op(remote_handle64 _h, const npu_device_tens
const npu_device_tensor_spec * src1, const npu_device_tensor_spec * dst,
npu_device_tensor_op op, boolean * is_supported) {
NPU_UNUSED(_h);

if (!src0 || !src1 || !dst || !is_supported) {
DEVICE_LOG_ERROR("npu_device_device_support_op: Invalid arguments");
return AEE_EINVARGS;
}

*is_supported = hexagon::support_op(*src0, *src1, *dst, op);
return AEE_SUCCESS;
}
@@ -147,28 +155,15 @@ AEEResult npu_device_tensor_init(remote_handle64 _h, const npu_device_tensor_con
return AEE_SUCCESS;
}

AEEResult npu_device_tensor_set_src(remote_handle64 _h, npu_device_tensor_handle_t tensor_handle, uint64_t index,
npu_device_tensor_handle_t src) {
AEEResult npu_device_tensor_update_params(remote_handle64 _h, npu_device_tensor_handle_t tensor_handle,
const npu_device_tensor_update_config * config) {
NPU_UNUSED(_h);
auto * tensor = tensor_from_handle(tensor_handle);
if (!tensor) {
return AEE_EINVHANDLE;
}

auto * src_tensor = tensor_from_handle(src);
tensor->set_src(index, src_tensor);
return AEE_SUCCESS;
}

AEEResult npu_device_tensor_set_op(remote_handle64 _h, npu_device_tensor_handle_t tensor_handle,
npu_device_tensor_op op) {
NPU_UNUSED(_h);
auto * tensor = tensor_from_handle(tensor_handle);
if (!tensor) {
if (!tensor || !config) {
return AEE_EINVHANDLE;
}

tensor->set_op(op);
tensor->update_config(*config);
return AEE_SUCCESS;
}

@@ -206,6 +201,29 @@ AEEResult npu_device_graph_set_tensor(remote_handle64 _h, npu_device_graph_handl
return AEE_SUCCESS;
}

AEEResult npu_device_graph_set_tensor_with_param(remote_handle64 _h, npu_device_graph_handle_t graph_handle,
const npu_device_tensor_handle_t * tensor_handles,
int tensor_handlesLen,
const npu_device_tensor_update_config * tensor_params,
int tensor_paramsLen) {
NPU_UNUSED(_h);
auto * graph = graph_from_handle(graph_handle);
if (!graph || !tensor_handles || tensor_handlesLen <= 0 || !tensor_params ||
tensor_handlesLen != tensor_paramsLen) {
return AEE_EINVHANDLE;
}

graph->set_tensor(tensor_handles, tensor_handlesLen);
for (int i = 0; i < tensor_handlesLen; ++i) {
auto * tensor = tensor_from_handle(tensor_handles[i]);
if (tensor) {
tensor->update_config(tensor_params[i]);
}
}

return AEE_SUCCESS;
}

AEEResult npu_device_graph_compute(remote_handle64 _h, npu_device_graph_handle_t graph_handle) {
auto dev_ctx = device_context_from_handle(_h);
if (!dev_ctx) {
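The new npu_device_graph_set_tensor_with_param RPC above batches what previously took one npu_device_tensor_set_src/npu_device_tensor_set_op call per tensor. The fields of npu_device_tensor_update_config live in the hexagon_npu IDL and are not part of this diff; the sketch below only illustrates a possible host-side call pattern against the signature shown above, and the wrapper itself is hypothetical.

```cpp
#include <vector>

// Host-side sketch (illustrative): one batched RPC instead of per-tensor calls.
// The RPC signature and the AEE_* codes follow the diff; how each
// npu_device_tensor_update_config is filled is assumed, not shown here.
AEEResult set_graph_tensors(remote_handle64 h, npu_device_graph_handle_t graph,
                            const std::vector<npu_device_tensor_handle_t> &        handles,
                            const std::vector<npu_device_tensor_update_config> &   configs) {
    if (handles.size() != configs.size()) {
        return AEE_EINVARGS;  // parallel arrays must match, mirroring the device-side check
    }
    return npu_device_graph_set_tensor_with_param(h, graph,
                                                  handles.data(), (int) handles.size(),
                                                  configs.data(), (int) configs.size());
}
```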
15 changes: 10 additions & 5 deletions ggml/src/ggml-qnn/npu/device/graph.cpp
@@ -10,7 +10,8 @@
namespace hexagon {

graph::graph() noexcept {
DEVICE_LOG_DEBUG("graph(%p) created\n", (void *) this);
_vtcm_quota_size = hexagon::vtcm_mem::get_avail_block_size(); // TODO: move to device init?
DEVICE_LOG_DEBUG("graph(%p) created: vtcm quota size: %zu\n", (void *) this, _vtcm_quota_size);
}

graph::~graph() noexcept {
@@ -45,6 +46,8 @@ bool graph::compute(default_thread_pool * thread_pool, const float * f16_to_f32_
}

DEVICE_LOG_DEBUG("graph(%p) compute\n", (void *) this);

DEVICE_SCOPED_PERFORMANCE_TRACKER("[%p]compute", (void *) this);
_f16_to_f32_table = f16_to_f32_table;
if (thread_pool) {
thread_pool->sync_execute(reinterpret_cast<default_thread_pool::task_type>(&graph::thread_pool_task), this);
@@ -61,6 +64,8 @@ void graph::thread_pool_task(default_thread_pool * pool, size_t thread_idx, size
}

void graph::compute_impl(default_thread_pool * pool, size_t thread_idx, size_t thread_count) {
hexagon::compute_params params = { thread_idx, thread_count, _vtcm_quota_size / thread_count, _f16_to_f32_table };

for (size_t i = 0; i < _tensor_count; ++i) {
auto * dst = _tensors[i];
auto op = dst->get_op();
@@ -69,14 +74,14 @@ void graph::compute_impl(default_thread_pool * pool, size_t thread_idx, size_t t
DEVICE_LOG_ERROR("graph(%p) tensor[%zu] op %d not supported\n", (void *) this, i, op);
return;
}

hexagon::compute_params params = { thread_idx, thread_count, _f16_to_f32_table };
if (!func(dst, &params)) {
DEVICE_LOG_ERROR("graph(%p) tensor[%zu] op %d compute failed\n", (void *) this, i, op);
}

// TODO: figure out which ops need to sync
if (pool) {
DEVICE_SCOPED_PERFORMANCE_TRACKER("[%p]sync_thread, tidx: %zu", (void *) this, thread_idx);

const bool should_sync = requires_thread_barrier(op);
if (pool && should_sync && i < _tensor_count - 1) {
pool->sync_thread();
}
dst->invalidate();
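compute_impl() now synchronizes the thread pool only after ops flagged by requires_thread_barrier(), rather than after every op. That function's definition is not part of this diff; a plausible shape, assuming only ops whose output is consumed across thread slices need a barrier, might look like the following (the op names are hypothetical, not the backend's enum):

```cpp
// Sketch only: which ops force a pool-wide barrier before the next op starts.
// The op enum values and the chosen set of ops are assumptions, not this diff.
inline bool requires_thread_barrier(npu_device_tensor_op op) {
    switch (op) {
        case NPU_OP_MUL_MAT:   // output rows are read by other threads' slices
        case NPU_OP_RMS_NORM:  // downstream ops consume the whole normalized tensor
            return true;
        default:
            return false;      // element-wise ops keep the per-thread row split
    }
}
```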
1 change: 1 addition & 0 deletions ggml/src/ggml-qnn/npu/device/graph.hpp
@@ -25,6 +25,7 @@ class graph {

std::unique_ptr<tensor *[]> _tensors;
size_t _tensor_count = 0;
size_t _vtcm_quota_size = 0;
const float * _f16_to_f32_table = nullptr;

DISABLE_COPY_AND_MOVE(graph);
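The new _vtcm_quota_size member, divided by thread_count in compute_impl(), gives each worker an equal VTCM budget; together with the "small slice to fit in vtcm cache" commit, this suggests each thread walks its rows in slices sized to that budget. A generic sketch of such slicing, with hypothetical helper and parameter names, could look like this:

```cpp
#include <algorithm>
#include <cstddef>

// Illustrative sketch, assuming each worker processes its row range in chunks
// that fit its per-thread VTCM quota. process_rows and all parameters are
// hypothetical; this is not the backend's actual slicing code.
template <typename Fn>
void for_each_vtcm_slice(size_t first_row, size_t last_row, size_t row_bytes,
                         size_t per_thread_vtcm_quota, Fn && process_rows) {
    const size_t rows_per_slice = std::max<size_t>(1, per_thread_vtcm_quota / row_bytes);
    for (size_t row = first_row; row < last_row; row += rows_per_slice) {
        process_rows(row, std::min(last_row, row + rows_per_slice));
    }
}
```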