
feat: perf opt part4 #43

Merged
merged 50 commits on May 27, 2025

Commits (50)
00890e5
wip
chraac May 16, 2025
d2d35e4
refactor: rewrite dequantize_row_q4_0 by intrinsic
chraac May 16, 2025
5149473
log for debug
chraac May 17, 2025
58e8bcf
fix q4 intrinsic
chraac May 19, 2025
f6f2277
small opt
chraac May 19, 2025
8f8906f
wip
chraac May 19, 2025
688b3c6
wip
chraac May 19, 2025
3fc25f2
add vtcm_quota_size
chraac May 20, 2025
e6229ec
add perf log for hexagon-npu backend
chraac May 20, 2025
7f47761
wip
chraac May 20, 2025
9001c4f
add log
chraac May 20, 2025
416ce0d
sync after a specific op
chraac May 21, 2025
9cdfef7
increase worker thread priority
chraac May 21, 2025
137a3cc
fix unbalanced thread slice
chraac May 21, 2025
e58e921
small slice to fit in vtcm cache
chraac May 21, 2025
00ee0cc
limit the supported row element size
chraac May 21, 2025
035749f
opt 4_0 dequant
chraac May 22, 2025
4d4d239
fix q4 dequant
chraac May 22, 2025
142ab93
add power_utils
chraac May 22, 2025
2415498
add rms_norm
chraac May 22, 2025
c5749a6
wip
chraac May 23, 2025
96d2c18
enable rms_norm f32
chraac May 23, 2025
35df13a
fix rms_norm with param
chraac May 23, 2025
2b76030
fix compiling flags
chraac May 23, 2025
bd70076
use float
chraac May 23, 2025
84d5e0e
fix small row size
chraac May 23, 2025
bb949a3
vectorized rms norm
chraac May 23, 2025
68a6d76
wip
chraac May 23, 2025
1d94db3
read 2 vectors
chraac May 23, 2025
edfb963
rename
chraac May 23, 2025
83160d3
add perf log on update
chraac May 24, 2025
1fce9b1
set empty tensors handle also
chraac May 24, 2025
897bd47
merge some rpc functions
chraac May 24, 2025
bf62a51
opt param update
chraac May 24, 2025
4bacb3f
wip
chraac May 24, 2025
d65475d
print more log
chraac May 24, 2025
4acf419
add struct for update param config
chraac May 24, 2025
a42a9d9
add npu_device_graph_set_tensor_with_param
chraac May 24, 2025
8352d47
merge tensor and params update
chraac May 24, 2025
2caea38
wip
chraac May 25, 2025
1ee710b
wip
chraac May 25, 2025
8c027ba
make as template to reuse
chraac May 25, 2025
57252c0
vectorize dequantize_row_q8_0
chraac May 26, 2025
fe08014
opt
chraac May 26, 2025
8409dd1
avoid using union to store q data
chraac May 26, 2025
435caa2
wip
chraac May 26, 2025
fafe1ad
wip
chraac May 26, 2025
411097e
Merge branch 'dev-refactoring' into dev-perf-opt-part4
chraac May 27, 2025
d8bd368
wip
chraac May 27, 2025
99fc1e3
Merge branch 'dev-refactoring' into dev-perf-opt-part4
chraac May 27, 2025
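
Several of the commits above (rewrite dequantize_row_q4_0 by intrinsic, opt 4_0 dequant, vectorize dequantize_row_q8_0) vectorize block dequantization with HVX intrinsics. For reference, a minimal scalar sketch of Q4_0 dequantization is shown below; it follows ggml's documented Q4_0 block layout and the f16-to-f32 lookup table passed through graph::compute(), and is not the backend's HVX implementation.

```cpp
#include <cstddef>
#include <cstdint>

// Reference (scalar) Q4_0 dequantization, shown only to illustrate what the
// intrinsic commits vectorize. Block layout follows ggml's Q4_0:
// 32 4-bit quants plus one fp16 scale per block.
struct block_q4_0 {
    uint16_t d;       // scale, stored as raw IEEE fp16 bits
    uint8_t  qs[16];  // 32 nibbles, two quants per byte
};

void dequantize_row_q4_0_ref(const block_q4_0 * x, float * y, size_t k,
                             const float * f16_to_f32_table /* 65536-entry LUT */) {
    const size_t nb = k / 32;  // number of blocks in the row
    for (size_t i = 0; i < nb; ++i) {
        const float d = f16_to_f32_table[x[i].d];
        for (size_t j = 0; j < 16; ++j) {
            y[i * 32 + j]      = ((x[i].qs[j] & 0x0F) - 8) * d;  // low nibble
            y[i * 32 + j + 16] = ((x[i].qs[j] >> 4) - 8) * d;    // high nibble
        }
    }
}
```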
7 changes: 6 additions & 1 deletion ggml/src/ggml-qnn/npu/CMakeLists.txt
@@ -231,6 +231,11 @@ else()

build_idl(idl/hexagon_npu.idl hexagon_npu_skel_OBJS)

add_subdirectory(${HEXAGON_SDK_ROOT}/libs/qprintf qprintf_dir)
target_include_directories(hexagon_npu_skel_OBJS PUBLIC
${HEXAGON_SDK_ROOT}/libs/qprintf/inc/
)

# disable warnings for the skel
set_source_files_properties(
${skel_srcs}
@@ -239,12 +244,12 @@
)

add_library(hexagon_npu_skel SHARED $<TARGET_OBJECTS:hexagon_npu_skel_OBJS>)

target_link_libraries(hexagon_npu_skel
${HEXAGON_LIB_DIR}/${HEXAGON_ARCH}/G0/pic/libc++abi.a
${HEXAGON_LIB_DIR}/${HEXAGON_ARCH}/G0/pic/libc++.a
)
set_target_properties(hexagon_npu_skel PROPERTIES OUTPUT_NAME "hexagon_npu_skel_${HEXAGON_ARCH}")
target_link_libraries(hexagon_npu_skel qprintf_static)

copy_binaries(hexagon_npu_skel)
endif()
52 changes: 35 additions & 17 deletions ggml/src/ggml-qnn/npu/device/device.cpp
@@ -106,6 +106,7 @@ int npu_device_open(const char * uri, remote_handle64 * h) {
}

*h = reinterpret_cast<remote_handle64>(context);
DEVICE_LOG_INFO("NPU device context created: %p", (void *) *h);
return AEE_SUCCESS;
}

Expand All @@ -117,6 +118,7 @@ int npu_device_close(remote_handle64 h) {
}

delete context;
DEVICE_LOG_INFO("NPU device context destroyed: %p", (void *) h);
return AEE_SUCCESS;
}

@@ -130,6 +132,12 @@ AEEResult npu_device_device_support_op(remote_handle64 _h, const npu_device_tens
const npu_device_tensor_spec * src1, const npu_device_tensor_spec * dst,
npu_device_tensor_op op, boolean * is_supported) {
NPU_UNUSED(_h);

if (!src0 || !src1 || !dst || !is_supported) {
DEVICE_LOG_ERROR("npu_device_device_support_op: Invalid arguments");
return AEE_EINVARGS;
}

*is_supported = hexagon::support_op(*src0, *src1, *dst, op);
return AEE_SUCCESS;
}
@@ -147,28 +155,15 @@ AEEResult npu_device_tensor_init(remote_handle64 _h, const npu_device_tensor_con
return AEE_SUCCESS;
}

AEEResult npu_device_tensor_set_src(remote_handle64 _h, npu_device_tensor_handle_t tensor_handle, uint64_t index,
npu_device_tensor_handle_t src) {
AEEResult npu_device_tensor_update_params(remote_handle64 _h, npu_device_tensor_handle_t tensor_handle,
const npu_device_tensor_update_config * config) {
NPU_UNUSED(_h);
auto * tensor = tensor_from_handle(tensor_handle);
if (!tensor) {
return AEE_EINVHANDLE;
}

auto * src_tensor = tensor_from_handle(src);
tensor->set_src(index, src_tensor);
return AEE_SUCCESS;
}

AEEResult npu_device_tensor_set_op(remote_handle64 _h, npu_device_tensor_handle_t tensor_handle,
npu_device_tensor_op op) {
NPU_UNUSED(_h);
auto * tensor = tensor_from_handle(tensor_handle);
if (!tensor) {
if (!tensor || !config) {
return AEE_EINVHANDLE;
}

tensor->set_op(op);
tensor->update_config(*config);
return AEE_SUCCESS;
}

@@ -206,6 +201,29 @@ AEEResult npu_device_graph_set_tensor(remote_handle64 _h, npu_device_graph_handl
return AEE_SUCCESS;
}

AEEResult npu_device_graph_set_tensor_with_param(remote_handle64 _h, npu_device_graph_handle_t graph_handle,
const npu_device_tensor_handle_t * tensor_handles,
int tensor_handlesLen,
const npu_device_tensor_update_config * tensor_params,
int tensor_paramsLen) {
NPU_UNUSED(_h);
auto * graph = graph_from_handle(graph_handle);
if (!graph || !tensor_handles || tensor_handlesLen <= 0 || !tensor_params ||
tensor_handlesLen != tensor_paramsLen) {
return AEE_EINVHANDLE;
}

graph->set_tensor(tensor_handles, tensor_handlesLen);
for (int i = 0; i < tensor_handlesLen; ++i) {
auto * tensor = tensor_from_handle(tensor_handles[i]);
if (tensor) {
tensor->update_config(tensor_params[i]);
}
}

return AEE_SUCCESS;
}

AEEResult npu_device_graph_compute(remote_handle64 _h, npu_device_graph_handle_t graph_handle) {
auto dev_ctx = device_context_from_handle(_h);
if (!dev_ctx) {
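The new npu_device_graph_set_tensor_with_param RPC above batches what previously took one npu_device_tensor_set_src/npu_device_tensor_set_op call per tensor. The fields of npu_device_tensor_update_config live in the hexagon_npu IDL and are not part of this diff; the sketch below only illustrates a possible host-side call pattern against the signature shown above, and the wrapper itself is hypothetical.

```cpp
#include <vector>

// Host-side sketch (illustrative): one batched RPC instead of per-tensor calls.
// The RPC signature and the AEE_* codes follow the diff; how each
// npu_device_tensor_update_config is filled is assumed, not shown here.
AEEResult set_graph_tensors(remote_handle64 h, npu_device_graph_handle_t graph,
                            const std::vector<npu_device_tensor_handle_t> &        handles,
                            const std::vector<npu_device_tensor_update_config> &   configs) {
    if (handles.size() != configs.size()) {
        return AEE_EINVARGS;  // parallel arrays must match, mirroring the device-side check
    }
    return npu_device_graph_set_tensor_with_param(h, graph,
                                                  handles.data(), (int) handles.size(),
                                                  configs.data(), (int) configs.size());
}
```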
15 changes: 10 additions & 5 deletions ggml/src/ggml-qnn/npu/device/graph.cpp
@@ -10,7 +10,8 @@
namespace hexagon {

graph::graph() noexcept {
DEVICE_LOG_DEBUG("graph(%p) created\n", (void *) this);
_vtcm_quota_size = hexagon::vtcm_mem::get_avail_block_size(); // TODO: move to device init?
DEVICE_LOG_DEBUG("graph(%p) created: vtcm quota size: %zu\n", (void *) this, _vtcm_quota_size);
}

graph::~graph() noexcept {
@@ -45,6 +46,8 @@ bool graph::compute(default_thread_pool * thread_pool, const float * f16_to_f32_
}

DEVICE_LOG_DEBUG("graph(%p) compute\n", (void *) this);

DEVICE_SCOPED_PERFORMANCE_TRACKER("[%p]compute", (void *) this);
_f16_to_f32_table = f16_to_f32_table;
if (thread_pool) {
thread_pool->sync_execute(reinterpret_cast<default_thread_pool::task_type>(&graph::thread_pool_task), this);
@@ -61,6 +64,8 @@ void graph::thread_pool_task(default_thread_pool * pool, size_t thread_idx, size
}

void graph::compute_impl(default_thread_pool * pool, size_t thread_idx, size_t thread_count) {
hexagon::compute_params params = { thread_idx, thread_count, _vtcm_quota_size / thread_count, _f16_to_f32_table };

for (size_t i = 0; i < _tensor_count; ++i) {
auto * dst = _tensors[i];
auto op = dst->get_op();
@@ -69,14 +74,14 @@ void graph::compute_impl(default_thread_pool * pool, size_t thread_idx, size_t t
DEVICE_LOG_ERROR("graph(%p) tensor[%zu] op %d not supported\n", (void *) this, i, op);
return;
}

hexagon::compute_params params = { thread_idx, thread_count, _f16_to_f32_table };
if (!func(dst, &params)) {
DEVICE_LOG_ERROR("graph(%p) tensor[%zu] op %d compute failed\n", (void *) this, i, op);
}

// TODO: figure out which ops need to sync
if (pool) {
DEVICE_SCOPED_PERFORMANCE_TRACKER("[%p]sync_thread, tidx: %zu", (void *) this, thread_idx);

const bool should_sync = requires_thread_barrier(op);
if (pool && should_sync && i < _tensor_count - 1) {
pool->sync_thread();
}
dst->invalidate();
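compute_impl() now synchronizes the thread pool only after ops flagged by requires_thread_barrier(), rather than after every op. That function's definition is not part of this diff; a plausible shape, assuming only ops whose output is consumed across thread slices need a barrier, might look like the following (the op names are hypothetical, not the backend's enum):

```cpp
// Sketch only: which ops force a pool-wide barrier before the next op starts.
// The op enum values and the chosen set of ops are assumptions, not this diff.
inline bool requires_thread_barrier(npu_device_tensor_op op) {
    switch (op) {
        case NPU_OP_MUL_MAT:   // output rows are read by other threads' slices
        case NPU_OP_RMS_NORM:  // downstream ops consume the whole normalized tensor
            return true;
        default:
            return false;      // element-wise ops keep the per-thread row split
    }
}
```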
1 change: 1 addition & 0 deletions ggml/src/ggml-qnn/npu/device/graph.hpp
@@ -25,6 +25,7 @@ class graph {

std::unique_ptr<tensor *[]> _tensors;
size_t _tensor_count = 0;
size_t _vtcm_quota_size = 0;
const float * _f16_to_f32_table = nullptr;

DISABLE_COPY_AND_MOVE(graph);
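The new _vtcm_quota_size member, divided by thread_count in compute_impl(), gives each worker an equal VTCM budget; together with the "small slice to fit in vtcm cache" commit, this suggests each thread walks its rows in slices sized to that budget. A generic sketch of such slicing, with hypothetical helper and parameter names, could look like this:

```cpp
#include <algorithm>
#include <cstddef>

// Illustrative sketch, assuming each worker processes its row range in chunks
// that fit its per-thread VTCM quota. process_rows and all parameters are
// hypothetical; this is not the backend's actual slicing code.
template <typename Fn>
void for_each_vtcm_slice(size_t first_row, size_t last_row, size_t row_bytes,
                         size_t per_thread_vtcm_quota, Fn && process_rows) {
    const size_t rows_per_slice = std::max<size_t>(1, per_thread_vtcm_quota / row_bytes);
    for (size_t row = first_row; row < last_row; row += rows_per_slice) {
        process_rows(row, std::min(last_row, row + rows_per_slice));
    }
}
```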