From 5e99646b50fc81a7e6301f1f40238ead7f25a464 Mon Sep 17 00:00:00 2001 From: 0cc4m Date: Mon, 18 Nov 2024 11:02:43 +0100 Subject: [PATCH 001/245] Vulkan: Fix device info output format specifiers (#10366) * Vulkan: Fix device info output format specifiers * Vulkan: Use zu printf specifier for size_t instead of ld --- ggml/src/ggml-vulkan/ggml-vulkan.cpp | 9 ++++----- 1 file changed, 4 insertions(+), 5 deletions(-) diff --git a/ggml/src/ggml-vulkan/ggml-vulkan.cpp b/ggml/src/ggml-vulkan/ggml-vulkan.cpp index 19951d9b6d60b..ef067356ec794 100644 --- a/ggml/src/ggml-vulkan/ggml-vulkan.cpp +++ b/ggml/src/ggml-vulkan/ggml-vulkan.cpp @@ -1764,11 +1764,11 @@ static void ggml_vk_print_gpu_info(size_t idx) { fp16 = fp16 && vk12_features.shaderFloat16; std::string device_name = props2.properties.deviceName.data(); - GGML_LOG_DEBUG("ggml_vulkan: %d = %s (%s) | uma: %d | fp16: %d | warp size: %d\n", - idx, device_name.c_str(), driver_props.driverName, uma, fp16, subgroup_size); + GGML_LOG_DEBUG("ggml_vulkan: %zu = %s (%s) | uma: %d | fp16: %d | warp size: %zu\n", + idx, device_name.c_str(), driver_props.driverName.data(), uma, fp16, subgroup_size); if (props2.properties.deviceType == vk::PhysicalDeviceType::eCpu) { - std::cerr << "ggml_vulkan: Warning: Device type is CPU. This is probably not the device you want." << std::endl; + GGML_LOG_DEBUG("ggml_vulkan: Warning: Device type is CPU. This is probably not the device you want.\n"); } } @@ -1937,8 +1937,7 @@ void ggml_vk_instance_init() { vk_instance.device_indices.push_back(0); } } - GGML_LOG_DEBUG("ggml_vulkan: Found %d Vulkan devices:\n", vk_instance.device_indices.size()); - + GGML_LOG_DEBUG("ggml_vulkan: Found %zu Vulkan devices:\n", vk_instance.device_indices.size()); for (size_t i = 0; i < vk_instance.device_indices.size(); i++) { ggml_vk_print_gpu_info(i); From 9e4af2d0aa6229d218ade02e04fb2aa8b8ef3ab5 Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Mon, 18 Nov 2024 16:08:20 +0200 Subject: [PATCH 002/245] flake.lock: Update (#10346) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Flake lock file updates: • Updated input 'nixpkgs': 'github:NixOS/nixpkgs/4aa36568d413aca0ea84a1684d2d46f55dbabad7?narHash=sha256-Zwl8YgTVJTEum%2BL%2B0zVAWvXAGbWAuXHax3KzuejaDyo%3D' (2024-11-05) → 'github:NixOS/nixpkgs/5e4fbfb6b3de1aa2872b76d49fafc942626e2add?narHash=sha256-OZiZ3m8SCMfh3B6bfGC/Bm4x3qc1m2SVEAlkV6iY7Yg%3D' (2024-11-15) Co-authored-by: github-actions[bot] --- flake.lock | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/flake.lock b/flake.lock index 470363a2f8fad..ee8cf07e37220 100644 --- a/flake.lock +++ b/flake.lock @@ -20,11 +20,11 @@ }, "nixpkgs": { "locked": { - "lastModified": 1730785428, - "narHash": "sha256-Zwl8YgTVJTEum+L+0zVAWvXAGbWAuXHax3KzuejaDyo=", + "lastModified": 1731676054, + "narHash": "sha256-OZiZ3m8SCMfh3B6bfGC/Bm4x3qc1m2SVEAlkV6iY7Yg=", "owner": "NixOS", "repo": "nixpkgs", - "rev": "4aa36568d413aca0ea84a1684d2d46f55dbabad7", + "rev": "5e4fbfb6b3de1aa2872b76d49fafc942626e2add", "type": "github" }, "original": { From f3d8b637010bc48dc4eaf2f754a151f119847186 Mon Sep 17 00:00:00 2001 From: Jeff Bolz Date: Mon, 18 Nov 2024 08:28:42 -0600 Subject: [PATCH 003/245] vulkan: remove use of null initializer (#10372) Seems like this isn't working for vulkan-over-metal when the array is sized by a spec constant. Maybe a spirv-cross limitation? --- ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vec.comp | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vec.comp b/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vec.comp index ddbac5d2c9334..970aac6ef72cc 100644 --- a/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vec.comp +++ b/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vec.comp @@ -5,8 +5,6 @@ #endif #extension GL_EXT_shader_explicit_arithmetic_types_int32 : require -#extension GL_EXT_null_initializer : enable - #include "mul_mat_vec_base.comp" layout(local_size_x_id = 0, local_size_y = 1, local_size_z = 1) in; @@ -57,7 +55,11 @@ void compute_outputs(const uint32_t first_row, const uint32_t num_rows) { y_offset = QUANT_R == 1 ? 1 : QUANT_K/2; - FLOAT_TYPE temp[NUM_ROWS] = {}; + FLOAT_TYPE temp[NUM_ROWS]; + + for (uint i = 0; i < NUM_ROWS; ++i) { + temp[i] = FLOAT_TYPE(0); + } const int unroll_count = 8; From 154a3916e95c79f808ae9e44180818e941ea3440 Mon Sep 17 00:00:00 2001 From: bandoti <141645996+bandoti@users.noreply.github.com> Date: Mon, 18 Nov 2024 11:23:58 -0400 Subject: [PATCH 004/245] Skip searching root path for cross-compile builds (#10383) --- cmake/llama-config.cmake.in | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/cmake/llama-config.cmake.in b/cmake/llama-config.cmake.in index ef68417b412cf..a7f1efb88f2aa 100644 --- a/cmake/llama-config.cmake.in +++ b/cmake/llama-config.cmake.in @@ -66,11 +66,15 @@ endif() find_library(ggml_LIBRARY ggml REQUIRED - HINTS ${LLAMA_LIB_DIR}) + HINTS ${LLAMA_LIB_DIR} + NO_CMAKE_FIND_ROOT_PATH +) find_library(llama_LIBRARY llama REQUIRED - HINTS ${LLAMA_LIB_DIR}) + HINTS ${LLAMA_LIB_DIR} + NO_CMAKE_FIND_ROOT_PATH +) set(_llama_link_deps "${ggml_LIBRARY}" "@GGML_LINK_LIBRARIES@") set(_llama_transient_defines "@GGML_TRANSIENT_DEFINES@") From caea39669db1ae4096b9621ecab249cf1d8336b3 Mon Sep 17 00:00:00 2001 From: Diego Devesa Date: Mon, 18 Nov 2024 18:43:40 +0100 Subject: [PATCH 005/245] cuda : only use native when supported by cmake (#10389) --- ggml/src/ggml-cuda/CMakeLists.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/ggml/src/ggml-cuda/CMakeLists.txt b/ggml/src/ggml-cuda/CMakeLists.txt index e592f79896db2..25858ece8ddd7 100644 --- a/ggml/src/ggml-cuda/CMakeLists.txt +++ b/ggml/src/ggml-cuda/CMakeLists.txt @@ -12,7 +12,7 @@ if (CUDAToolkit_FOUND) # 61 == Pascal, __dp4a instruction (per-byte integer dot product) # 70 == V100, FP16 tensor cores # 75 == Turing, int8 tensor cores - if (GGML_NATIVE AND CUDAToolkit_VERSION VERSION_GREATER_EQUAL "11.6") + if (GGML_NATIVE AND CUDAToolkit_VERSION VERSION_GREATER_EQUAL "11.6" AND CMAKE_VERSION VERSION_GREATER_EQUAL "3.24") set(CMAKE_CUDA_ARCHITECTURES "native") elseif(GGML_CUDA_F16 OR GGML_CUDA_DMMV_F16) set(CMAKE_CUDA_ARCHITECTURES "60;61;70;75") From 74b246bc63633bbd6e649172f1a525085793968c Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Alberto=20Cabrera=20P=C3=A9rez?= Date: Tue, 19 Nov 2024 00:50:04 +0000 Subject: [PATCH 006/245] sycl: Revert MUL_MAT_OP support changes (#10385) From e975c44bc3a64d240155c007d2615237b633d5d1 Mon Sep 17 00:00:00 2001 From: Jeff Bolz Date: Tue, 19 Nov 2024 01:25:17 -0600 Subject: [PATCH 007/245] vulkan: Optimize soft_max (#10301) * vulkan: Optimize soft_max Large soft_max could already saturate memory, but small/medium sizes were pretty slow. The bulk of the gains for them comes from using a smaller workgroup size, and making the workgroup size match the subgroup size also makes the barriers much cheaper. Cache some values in locals to avoid refetching/recomputing. And stamp out a few "template instantiations" so smaller cases will fully unroll. Add a missing early return for OOB rows. This happens when there are more than 512 rows and the dispatch is 512 x H. * vulkan: Further soft_max optimizations Restore the workgroup size of 512 case, use it for >1024. Use unrollable loops for more iteration counts. --- ggml/src/ggml-vulkan/ggml-vulkan.cpp | 13 +- .../ggml-vulkan/vulkan-shaders/soft_max.comp | 112 ++++++++++++++---- tests/test-backend-ops.cpp | 8 ++ 3 files changed, 106 insertions(+), 27 deletions(-) diff --git a/ggml/src/ggml-vulkan/ggml-vulkan.cpp b/ggml/src/ggml-vulkan/ggml-vulkan.cpp index ef067356ec794..21fee2f3d7533 100644 --- a/ggml/src/ggml-vulkan/ggml-vulkan.cpp +++ b/ggml/src/ggml-vulkan/ggml-vulkan.cpp @@ -218,6 +218,7 @@ struct vk_device_struct { vk_pipeline pipeline_tanh_f32; vk_pipeline pipeline_diag_mask_inf_f32; vk_pipeline pipeline_soft_max_f32, pipeline_soft_max_f32_f16; + vk_pipeline pipeline_soft_max_f32_wg512, pipeline_soft_max_f32_f16_wg512; vk_pipeline pipeline_rope_norm_f32, pipeline_rope_norm_f16; vk_pipeline pipeline_rope_neox_f32, pipeline_rope_neox_f16; vk_pipeline pipeline_argsort_f32; @@ -388,6 +389,7 @@ struct vk_op_soft_max_push_constants { float m0; float m1; uint32_t n_head_log2; + uint32_t nrows_x; }; struct vk_op_argsort_push_constants { @@ -1497,8 +1499,10 @@ static void ggml_vk_load_shaders(vk_device& device) { ggml_vk_create_pipeline(device, device->pipeline_diag_mask_inf_f32, "diag_mask_inf_f32", diag_mask_inf_f32_len, diag_mask_inf_f32_data, "main", 2, sizeof(vk_op_diag_mask_push_constants), {512, 1, 1}, {}, 1); - ggml_vk_create_pipeline(device, device->pipeline_soft_max_f32, "soft_max_f32", soft_max_f32_len, soft_max_f32_data, "main", 3, sizeof(vk_op_soft_max_push_constants), {1, 1, 1}, {}, 1); - ggml_vk_create_pipeline(device, device->pipeline_soft_max_f32_f16, "soft_max_f32_f16", soft_max_f32_f16_len, soft_max_f32_f16_data, "main", 3, sizeof(vk_op_soft_max_push_constants), {1, 1, 1}, {}, 1); + ggml_vk_create_pipeline(device, device->pipeline_soft_max_f32, "soft_max_f32", soft_max_f32_len, soft_max_f32_data, "main", 3, sizeof(vk_op_soft_max_push_constants), {1, 1, 1}, { device->subgroup_size }, 1); + ggml_vk_create_pipeline(device, device->pipeline_soft_max_f32_wg512, "soft_max_f32_wg512", soft_max_f32_len, soft_max_f32_data, "main", 3, sizeof(vk_op_soft_max_push_constants), {1, 1, 1}, { 512 }, 1); + ggml_vk_create_pipeline(device, device->pipeline_soft_max_f32_f16, "soft_max_f32_f16", soft_max_f32_f16_len, soft_max_f32_f16_data, "main", 3, sizeof(vk_op_soft_max_push_constants), {1, 1, 1}, { device->subgroup_size }, 1); + ggml_vk_create_pipeline(device, device->pipeline_soft_max_f32_f16_wg512, "soft_max_f32_f16_wg512", soft_max_f32_f16_len, soft_max_f32_f16_data, "main", 3, sizeof(vk_op_soft_max_push_constants), {1, 1, 1}, { 512 }, 1); ggml_vk_create_pipeline(device, device->pipeline_rope_norm_f32, "rope_norm_f32", rope_norm_f32_len, rope_norm_f32_data, "main", 4, sizeof(vk_op_rope_push_constants), {1, 512, 1}, {}, 1); ggml_vk_create_pipeline(device, device->pipeline_rope_norm_f16, "rope_norm_f16", rope_norm_f16_len, rope_norm_f16_data, "main", 4, sizeof(vk_op_rope_push_constants), {1, 512, 1}, {}, 1); @@ -3932,10 +3936,10 @@ static vk_pipeline ggml_vk_op_get_pipeline(ggml_backend_vk_context * ctx, const GGML_ASSERT(!src1 || src1->type == GGML_TYPE_F32 || src1->type == GGML_TYPE_F16); if (src0->type == GGML_TYPE_F32 && (src1 == nullptr || src1->type == GGML_TYPE_F32) && dst->type == GGML_TYPE_F32) { - return ctx->device->pipeline_soft_max_f32; + return src0->ne[0] > 1024 ? ctx->device->pipeline_soft_max_f32_wg512 : ctx->device->pipeline_soft_max_f32; } if (src0->type == GGML_TYPE_F32 && src1->type == GGML_TYPE_F16 && dst->type == GGML_TYPE_F32) { - return ctx->device->pipeline_soft_max_f32_f16; + return src0->ne[0] > 1024 ? ctx->device->pipeline_soft_max_f32_f16_wg512 : ctx->device->pipeline_soft_max_f32_f16; } return nullptr; case GGML_OP_ROPE: @@ -4581,6 +4585,7 @@ static void ggml_vk_soft_max(ggml_backend_vk_context * ctx, vk_context& subctx, scale, max_bias, m0, m1, n_head_log2, + nrows_x, }, dryrun); } diff --git a/ggml/src/ggml-vulkan/vulkan-shaders/soft_max.comp b/ggml/src/ggml-vulkan/vulkan-shaders/soft_max.comp index 0bd51ecab5870..f9727679ec5f2 100644 --- a/ggml/src/ggml-vulkan/vulkan-shaders/soft_max.comp +++ b/ggml/src/ggml-vulkan/vulkan-shaders/soft_max.comp @@ -1,6 +1,7 @@ #version 450 -#extension GL_EXT_shader_16bit_storage : require +#extension GL_EXT_shader_explicit_arithmetic_types_float16 : require +#extension GL_EXT_control_flow_attributes : enable layout (push_constant) uniform parameter { @@ -11,14 +12,13 @@ layout (push_constant) uniform parameter float m0; float m1; uint n_head_log2; + uint nrows_x; } p; #include "types.comp" -#extension GL_EXT_control_flow_attributes : enable -#define BLOCK_SIZE 512 - -layout(local_size_x = BLOCK_SIZE, local_size_y = 1, local_size_z = 1) in; +layout(constant_id = 0) const uint BLOCK_SIZE = 32; +layout(local_size_x_id = 0, local_size_y = 1, local_size_z = 1) in; layout (binding = 0) readonly buffer X {A_TYPE data_a[];}; layout (binding = 1) readonly buffer Y {B_TYPE data_b[];}; @@ -26,11 +26,18 @@ layout (binding = 2) buffer D {D_TYPE data_d[];}; shared FLOAT_TYPE vals[BLOCK_SIZE]; -void main() { +// num_iters is the number of BLOCK_SIZE loop iterations we need to iterate +// over all the columns. The main function tries to pass a constant here, +// as if it were a template function, to allow unrolling. +void soft_max(uint num_iters) { const uint tid = gl_LocalInvocationID.x; const uint rowx = gl_WorkGroupID.z * 262144 + gl_WorkGroupID.y * 512 + gl_WorkGroupID.x; const uint rowy = rowx % p.KY; + if (rowx >= p.nrows_x) { + return; + } + float slope = 1.0f; // ALiBi @@ -46,19 +53,37 @@ void main() { // Find max FLOAT_TYPE max_val = uintBitsToFloat(0xFF800000); - [[unroll]] for (uint col0 = 0; col0 < p.KX; col0 += BLOCK_SIZE) { + // Cache values while we compute the max, so we don't need to read them + // again when we're ready to compute exp(x-max). + const uint DATA_CACHE_SIZE = 16; + FLOAT_TYPE data_cache[DATA_CACHE_SIZE]; + + [[unroll]] for (uint col0 = 0, idx = 0; idx < num_iters; col0 += BLOCK_SIZE, ++idx) { const uint col = col0 + tid; - if (col >= p.KX) { - break; + FLOAT_TYPE a = FLOAT_TYPE(0); + if (col < p.KX) { + a = data_a[rowx * p.KX + col]; } - max_val = max(max_val, FLOAT_TYPE(data_a[rowx * p.KX + col]) * p.scale + (p.KY > 0 ? slope * FLOAT_TYPE(data_b[rowy * p.KX + col]) : FLOAT_TYPE(0.0f))); + FLOAT_TYPE b = FLOAT_TYPE(0); + if (p.KY > 0 && col < p.KX) { + b = data_b[rowy * p.KX + col]; + } + + FLOAT_TYPE v = a * p.scale + slope * b; + + max_val = max(max_val, v); + + if (idx < DATA_CACHE_SIZE) { + data_cache[idx] = v; + } } - vals[tid] = max_val; + // reduce across the workgroup + vals[tid] = max_val; barrier(); - [[unroll]] for (int s = BLOCK_SIZE / 2; s > 0; s >>= 1) { + [[unroll]] for (uint s = BLOCK_SIZE / 2; s > 0; s >>= 1) { if (tid < s) { vals[tid] = max(vals[tid], vals[tid + s]); } @@ -68,39 +93,80 @@ void main() { max_val = vals[0]; barrier(); - // Sum up values - vals[tid] = FLOAT_TYPE(0.0f); + FLOAT_TYPE sum = FLOAT_TYPE(0.0f); - [[unroll]] for (uint col0 = 0; col0 < p.KX; col0 += BLOCK_SIZE) { + // Compute sum{exp(x - max)} + [[unroll]] for (uint col0 = 0, idx = 0; idx < num_iters; col0 += BLOCK_SIZE, ++idx) { const uint col = col0 + tid; if (col >= p.KX) { break; } + // compute exp(a*scale+b*slope), add it to sum, and cache the new value + // in data_cache if possible. const uint i = rowx * p.KX + col; - const FLOAT_TYPE val = exp(FLOAT_TYPE(data_a[i]) * p.scale + (p.KY > 0 ? slope * FLOAT_TYPE(data_b[rowy * p.KX + col]) : FLOAT_TYPE(0.0f)) - max_val); - vals[tid] += val; - data_d[i] = D_TYPE(val); + FLOAT_TYPE val; + if (idx < DATA_CACHE_SIZE) { + val = exp(data_cache[idx] - max_val); + } else { + val = exp(FLOAT_TYPE(data_a[i]) * p.scale + (p.KY > 0 ? slope * FLOAT_TYPE(data_b[rowy * p.KX + col]) : FLOAT_TYPE(0.0f)) - max_val); + } + sum += val; + if (idx < DATA_CACHE_SIZE) { + data_cache[idx] = val; + } else { + data_d[i] = D_TYPE(val); + } } + // reduce across the workgroup + vals[tid] = sum; barrier(); - [[unroll]] for (int s = BLOCK_SIZE / 2; s > 0; s >>= 1) { + [[unroll]] for (uint s = BLOCK_SIZE / 2; s > 0; s >>= 1) { if (tid < s) { vals[tid] += vals[tid + s]; } barrier(); } + sum = vals[0]; - const D_TYPE divisor = D_TYPE(vals[0]); + FLOAT_TYPE rcpdivisor = 1.0/sum; - [[unroll]] for (uint col0 = 0; col0 < p.KX; col0 += BLOCK_SIZE) { + [[unroll]] for (uint col0 = 0, idx = 0; idx < num_iters; col0 += BLOCK_SIZE, ++idx) { const uint col = col0 + tid; if (col >= p.KX) { - break; + continue; + } + + if (idx < DATA_CACHE_SIZE) { + data_d[rowx*p.KX + col] = D_TYPE(data_cache[idx] * rcpdivisor); + } else { + data_d[rowx*p.KX + col] *= D_TYPE(rcpdivisor); } + } +} - data_d[rowx*p.KX + col] /= divisor; +void main() { + // instantiate the soft_max function for several different + // dimensions, to allow loop unrolling + uint num_blocks = (p.KX + BLOCK_SIZE - 1) / BLOCK_SIZE; + if (num_blocks > 32) { + soft_max(num_blocks); + } else if (num_blocks > 16) { + soft_max(32); + } else if (num_blocks > 8) { + soft_max(16); + } else if (num_blocks > 4) { + soft_max(8); + } else if (num_blocks == 4) { + soft_max(4); + } else if (num_blocks == 3) { + soft_max(3); + } else if (num_blocks == 2) { + soft_max(2); + } else if (num_blocks == 1) { + soft_max(1); } } diff --git a/tests/test-backend-ops.cpp b/tests/test-backend-ops.cpp index f8a59b6df175f..01ac7166e0e22 100644 --- a/tests/test-backend-ops.cpp +++ b/tests/test-backend-ops.cpp @@ -3823,6 +3823,14 @@ static std::vector> make_test_cases_perf() { test_cases.emplace_back(new test_cpy(GGML_TYPE_F32, GGML_TYPE_F16, {512, 3072, 1, 1})); + test_cases.emplace_back(new test_soft_max(GGML_TYPE_F32, {4096, 4096, 5, 1}, false, 1.0f, 0.0f)); + test_cases.emplace_back(new test_soft_max(GGML_TYPE_F32, {77, 4096, 5, 1}, false, 1.0f, 0.0f)); + test_cases.emplace_back(new test_soft_max(GGML_TYPE_F32, {1024, 1024, 10, 1}, false, 1.0f, 0.0f)); + test_cases.emplace_back(new test_soft_max(GGML_TYPE_F32, {77, 1024, 10, 1}, false, 1.0f, 0.0f)); + test_cases.emplace_back(new test_soft_max(GGML_TYPE_F32, {256, 256, 20, 1}, false, 1.0f, 0.0f)); + test_cases.emplace_back(new test_soft_max(GGML_TYPE_F32, {64, 64, 20, 1}, false, 1.0f, 0.0f)); + test_cases.emplace_back(new test_soft_max(GGML_TYPE_F32, {77, 64, 20, 1}, false, 1.0f, 0.0f)); + for (int bs : {1, 512}) { for (ggml_type type_a : all_types) { for (ggml_type type_b : {GGML_TYPE_F32}) { From 9a59aa2e6bba59feda3e6a8d22bcfa1181516206 Mon Sep 17 00:00:00 2001 From: Romain Biessy Date: Tue, 19 Nov 2024 09:02:23 +0100 Subject: [PATCH 008/245] sycl : Add option to set the SYCL architecture for all targets (#10266) * Add option to set the SYCL architecture for all targets * Convert GGML_SYCL_HIP_TARGET to the more generic GGML_SYCL_ARCH option * Document that setting GGML_SYCL_ARCH can improve the performance --- docs/backend/SYCL.md | 12 ++++++++---- ggml/CMakeLists.txt | 2 ++ ggml/src/ggml-sycl/CMakeLists.txt | 10 +++++++--- 3 files changed, 17 insertions(+), 7 deletions(-) diff --git a/docs/backend/SYCL.md b/docs/backend/SYCL.md index 5a67d10a7a6ac..6c180f18860ef 100644 --- a/docs/backend/SYCL.md +++ b/docs/backend/SYCL.md @@ -317,12 +317,14 @@ export CPLUS_INCLUDE_DIR=/path/to/oneMKL/buildWithCublas/include:$CPLUS_INCLUDE_ export CPLUS_INCLUDE_DIR=/path/to/oneMKL/include:$CPLUS_INCLUDE_DIR # Build LLAMA with Nvidia BLAS acceleration through SYCL +# Setting GGML_SYCL_DEVICE_ARCH is optional but can improve performance +GGML_SYCL_DEVICE_ARCH=sm_80 # Example architecture # Option 1: Use FP32 (recommended for better performance in most cases) -cmake -B build -DGGML_SYCL=ON -DGGML_SYCL_TARGET=NVIDIA -DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icpx +cmake -B build -DGGML_SYCL=ON -DGGML_SYCL_TARGET=NVIDIA -DGGML_SYCL_DEVICE_ARCH=${GGML_SYCL_DEVICE_ARCH} -DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icpx # Option 2: Use FP16 -cmake -B build -DGGML_SYCL=ON -DGGML_SYCL_TARGET=NVIDIA -DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icpx -DGGML_SYCL_F16=ON +cmake -B build -DGGML_SYCL=ON -DGGML_SYCL_TARGET=NVIDIA -DGGML_SYCL_DEVICE_ARCH=${GGML_SYCL_DEVICE_ARCH} -DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icpx -DGGML_SYCL_F16=ON # build all binary cmake --build build --config Release -j -v @@ -340,8 +342,9 @@ export CPLUS_INCLUDE_DIR=/path/to/oneMKL/buildWithrocBLAS/include:$CPLUS_INCLUDE ## AMD # Use FP32, FP16 is not supported -# Find your GGML_SYCL_HIP_TARGET with rocminfo, under the key 'Name:' -cmake -B build -DGGML_SYCL=ON -DGGML_SYCL_TARGET=AMD -DGGML_SYCL_HIP_TARGET=${GGML_SYCL_HIP_TARGET} -DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icpx +# Find your GGML_SYCL_DEVICE_ARCH with rocminfo, under the key 'Name:' +GGML_SYCL_DEVICE_ARCH=gfx90a # Example architecture +cmake -B build -DGGML_SYCL=ON -DGGML_SYCL_TARGET=AMD -DGGML_SYCL_DEVICE_ARCH=${GGML_SYCL_DEVICE_ARCH} -DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icpx # build all binary cmake --build build --config Release -j -v @@ -684,6 +687,7 @@ use 1 SYCL GPUs: [0] with Max compute units:512 |--------------------|---------------------------------------|---------------------------------------------| | GGML_SYCL | ON (mandatory) | Enable build with SYCL code path.
FP32 path - recommended for better perforemance than FP16 on quantized model| | GGML_SYCL_TARGET | INTEL *(default)* \| NVIDIA \| AMD | Set the SYCL target device type. | +| GGML_SYCL_DEVICE_ARCH | Optional (except for AMD) | Set the SYCL device architecture, optional except for AMD. Setting the device architecture can improve the performance. See the table [--offload-arch](https://github.com/intel/llvm/blob/sycl/sycl/doc/design/OffloadDesign.md#--offload-arch) for a list of valid architectures. | | GGML_SYCL_F16 | OFF *(default)* \|ON *(optional)* | Enable FP16 build with SYCL code path. | | CMAKE_C_COMPILER | `icx` *(Linux)*, `icx/cl` *(Windows)* | Set `icx` compiler for SYCL code path. | | CMAKE_CXX_COMPILER | `icpx` *(Linux)*, `icx` *(Windows)* | Set `icpx/icx` compiler for SYCL code path. | diff --git a/ggml/CMakeLists.txt b/ggml/CMakeLists.txt index a82818d607f5e..b16a0e9ada7ed 100644 --- a/ggml/CMakeLists.txt +++ b/ggml/CMakeLists.txt @@ -164,6 +164,8 @@ option(GGML_SYCL "ggml: use SYCL" option(GGML_SYCL_F16 "ggml: use 16 bit floats for sycl calculations" OFF) set (GGML_SYCL_TARGET "INTEL" CACHE STRING "ggml: sycl target device") +set (GGML_SYCL_DEVICE_ARCH "" CACHE STRING + "ggml: sycl device architecture") # extra artifacts option(GGML_BUILD_TESTS "ggml: build tests" ${GGML_STANDALONE}) diff --git a/ggml/src/ggml-sycl/CMakeLists.txt b/ggml/src/ggml-sycl/CMakeLists.txt index 03bf3cb929b32..d1d0ff83d636c 100644 --- a/ggml/src/ggml-sycl/CMakeLists.txt +++ b/ggml/src/ggml-sycl/CMakeLists.txt @@ -72,10 +72,14 @@ else() set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -fsycl-targets=nvptx64-nvidia-cuda") target_link_libraries(ggml-sycl PRIVATE sycl pthread m dl onemkl) elseif (GGML_SYCL_TARGET STREQUAL "AMD") - if (GGML_SYCL_HIP_TARGET STREQUAL "") - message(ERROR "Can't enable SYCL hip backend, GGML_SYCL_HIP_TARGET has not been set.") + if (NOT GGML_SYCL_DEVICE_ARCH) + message(ERROR "Can't enable SYCL hip backend, GGML_SYCL_DEVICE_ARCH has not been set.") endif() - set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -fsycl-targets=amdgcn-amd-amdhsa -Xsycl-target-backend --offload-arch=${GGML_SYCL_HIP_TARGET}") + set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -fsycl-targets=amdgcn-amd-amdhsa") target_link_libraries(ggml-sycl PRIVATE sycl pthread m dl onemkl) endif() + + if (GGML_SYCL_DEVICE_ARCH) + set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -Xsycl-target-backend --offload-arch=${GGML_SYCL_DEVICE_ARCH}") + endif() endif() From 07a1be41edddc6801bbedf43dc7a6b9073b112e3 Mon Sep 17 00:00:00 2001 From: Shane A Date: Tue, 19 Nov 2024 01:04:08 -0800 Subject: [PATCH 009/245] llama : add OLMo November 2024 support (#10394) * Add OLMo November 2024 constants * Add OLMo November 2024 converter * Add loading of OLMo November 2024 tensors and hyper parameters * Add building of OLMo November 2024 model --- convert_hf_to_gguf.py | 5 + gguf-py/gguf/constants.py | 18 ++++ gguf-py/gguf/tensor_mapping.py | 28 ++--- src/llama.cpp | 186 +++++++++++++++++++++++++++++++++ 4 files changed, 223 insertions(+), 14 deletions(-) diff --git a/convert_hf_to_gguf.py b/convert_hf_to_gguf.py index 39afa5ef4f272..9f4b8154b88a8 100755 --- a/convert_hf_to_gguf.py +++ b/convert_hf_to_gguf.py @@ -3040,6 +3040,11 @@ def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iter return [(self.map_tensor_name(name), data_torch)] +@Model.register("Olmo1124ForCausalLM") +class Olmo1124Model(Model): + model_arch = gguf.MODEL_ARCH.OLMO_1124 + + @Model.register("OlmoeForCausalLM") class OlmoeModel(Model): model_arch = gguf.MODEL_ARCH.OLMOE diff --git a/gguf-py/gguf/constants.py b/gguf-py/gguf/constants.py index bc2b649d1a83d..d83b72f761951 100644 --- a/gguf-py/gguf/constants.py +++ b/gguf-py/gguf/constants.py @@ -243,6 +243,7 @@ class MODEL_ARCH(IntEnum): COMMAND_R = auto() DBRX = auto() OLMO = auto() + OLMO_1124 = auto() OLMOE = auto() OPENELM = auto() ARCTIC = auto() @@ -404,6 +405,7 @@ class MODEL_TENSOR(IntEnum): MODEL_ARCH.COMMAND_R: "command-r", MODEL_ARCH.DBRX: "dbrx", MODEL_ARCH.OLMO: "olmo", + MODEL_ARCH.OLMO_1124: "olmo_1124", MODEL_ARCH.OLMOE: "olmoe", MODEL_ARCH.OPENELM: "openelm", MODEL_ARCH.ARCTIC: "arctic", @@ -1069,6 +1071,22 @@ class MODEL_TENSOR(IntEnum): MODEL_TENSOR.FFN_DOWN, MODEL_TENSOR.FFN_UP, ], + MODEL_ARCH.OLMO_1124: [ + MODEL_TENSOR.TOKEN_EMBD, + MODEL_TENSOR.OUTPUT_NORM, + MODEL_TENSOR.OUTPUT, + MODEL_TENSOR.ATTN_Q, + MODEL_TENSOR.ATTN_K, + MODEL_TENSOR.ATTN_V, + MODEL_TENSOR.ATTN_OUT, + MODEL_TENSOR.ATTN_POST_NORM, + MODEL_TENSOR.ATTN_Q_NORM, + MODEL_TENSOR.ATTN_K_NORM, + MODEL_TENSOR.FFN_POST_NORM, + MODEL_TENSOR.FFN_GATE, + MODEL_TENSOR.FFN_DOWN, + MODEL_TENSOR.FFN_UP, + ], MODEL_ARCH.OLMOE: [ MODEL_TENSOR.TOKEN_EMBD, MODEL_TENSOR.OUTPUT_NORM, diff --git a/gguf-py/gguf/tensor_mapping.py b/gguf-py/gguf/tensor_mapping.py index f4a787c56993a..4cbd39e03e942 100644 --- a/gguf-py/gguf/tensor_mapping.py +++ b/gguf-py/gguf/tensor_mapping.py @@ -13,7 +13,7 @@ class TensorNameMap: "transformer.wte", # gpt2 gpt-j mpt refact qwen dbrx jais exaone "transformer.word_embeddings", # falcon "word_embeddings", # bloom - "model.embed_tokens", # llama-hf nemotron olmoe + "model.embed_tokens", # llama-hf nemotron olmoe olmo_1124 "tok_embeddings", # llama-pth "embeddings.word_embeddings", # bert nomic-bert "language_model.embedding.word_embeddings", # persimmon @@ -54,7 +54,7 @@ class TensorNameMap: # Output MODEL_TENSOR.OUTPUT: ( "embed_out", # gptneox - "lm_head", # gpt2 mpt falcon llama-hf baichuan qwen mamba dbrx jais nemotron exaone olmoe + "lm_head", # gpt2 mpt falcon llama-hf baichuan qwen mamba dbrx jais nemotron exaone olmoe olmo_1124 "output", # llama-pth bloom internlm2 "word_embeddings_for_head", # persimmon "lm_head.linear", # phi2 @@ -66,7 +66,7 @@ class TensorNameMap: MODEL_TENSOR.OUTPUT_NORM: ( "gpt_neox.final_layer_norm", # gptneox "transformer.ln_f", # gpt2 gpt-j falcon jais exaone - "model.norm", # llama-hf baichuan internlm2 olmoe + "model.norm", # llama-hf baichuan internlm2 olmoe olmo_1124 "norm", # llama-pth "transformer.norm_f", # mpt dbrx "ln_f", # refact bloom qwen gpt2 @@ -145,7 +145,7 @@ class TensorNameMap: # Attention query MODEL_TENSOR.ATTN_Q: ( - "model.layers.{bid}.self_attn.q_proj", # llama-hf nemotron olmoe + "model.layers.{bid}.self_attn.q_proj", # llama-hf nemotron olmoe olmo_1124 "layers.{bid}.attention.wq", # llama-pth "encoder.layer.{bid}.attention.self.query", # bert "transformer.h.{bid}.attn.q_proj", # gpt-j @@ -157,7 +157,7 @@ class TensorNameMap: # Attention key MODEL_TENSOR.ATTN_K: ( - "model.layers.{bid}.self_attn.k_proj", # llama-hf nemotron olmoe + "model.layers.{bid}.self_attn.k_proj", # llama-hf nemotron olmoe olmo_1124 "layers.{bid}.attention.wk", # llama-pth "encoder.layer.{bid}.attention.self.key", # bert "transformer.h.{bid}.attn.k_proj", # gpt-j @@ -170,7 +170,7 @@ class TensorNameMap: # Attention value MODEL_TENSOR.ATTN_V: ( - "model.layers.{bid}.self_attn.v_proj", # llama-hf nemotron olmoe + "model.layers.{bid}.self_attn.v_proj", # llama-hf nemotron olmoe olmo_1124 "layers.{bid}.attention.wv", # llama-pth "encoder.layer.{bid}.attention.self.value", # bert "transformer.h.{bid}.attn.v_proj", # gpt-j @@ -188,7 +188,7 @@ class TensorNameMap: "transformer.blocks.{bid}.attn.out_proj", # mpt "transformer.h.{bid}.self_attention.dense", # falcon "h.{bid}.self_attention.dense", # bloom - "model.layers.{bid}.self_attn.o_proj", # llama-hf nemotron olmoe + "model.layers.{bid}.self_attn.o_proj", # llama-hf nemotron olmoe olmo_1124 "layers.{bid}.attention.wo", # llama-pth "encoder.layer.{bid}.attention.output.dense", # bert "transformer.h.{bid}.attn.out_proj", # gpt-j @@ -215,7 +215,7 @@ class TensorNameMap: ), MODEL_TENSOR.ATTN_POST_NORM: ( - "model.layers.{bid}.post_attention_layernorm", # gemma2 + "model.layers.{bid}.post_attention_layernorm", # gemma2 olmo_1124 ), # Rotary embeddings @@ -250,7 +250,7 @@ class TensorNameMap: # Post feed-forward norm MODEL_TENSOR.FFN_POST_NORM: ( - "model.layers.{bid}.post_feedforward_layernorm", # gemma2 + "model.layers.{bid}.post_feedforward_layernorm", # gemma2 olmo_1124 ), MODEL_TENSOR.FFN_GATE_INP: ( @@ -273,7 +273,7 @@ class TensorNameMap: "transformer.blocks.{bid}.ffn.up_proj", # mpt "transformer.h.{bid}.mlp.dense_h_to_4h", # falcon "h.{bid}.mlp.dense_h_to_4h", # bloom - "model.layers.{bid}.mlp.up_proj", # llama-hf refact nemotron + "model.layers.{bid}.mlp.up_proj", # llama-hf refact nemotron olmo_1124 "layers.{bid}.feed_forward.w3", # llama-pth "encoder.layer.{bid}.intermediate.dense", # bert "transformer.h.{bid}.mlp.fc_in", # gpt-j @@ -314,7 +314,7 @@ class TensorNameMap: # Feed-forward gate MODEL_TENSOR.FFN_GATE: ( - "model.layers.{bid}.mlp.gate_proj", # llama-hf refact + "model.layers.{bid}.mlp.gate_proj", # llama-hf refact olmo_1124 "layers.{bid}.feed_forward.w1", # llama-pth "transformer.h.{bid}.mlp.w2", # qwen "transformer.h.{bid}.mlp.c_fc2", # jais @@ -346,7 +346,7 @@ class TensorNameMap: "transformer.blocks.{bid}.ffn.down_proj", # mpt "transformer.h.{bid}.mlp.dense_4h_to_h", # falcon "h.{bid}.mlp.dense_4h_to_h", # bloom - "model.layers.{bid}.mlp.down_proj", # llama-hf nemotron + "model.layers.{bid}.mlp.down_proj", # llama-hf nemotron olmo_1124 "layers.{bid}.feed_forward.w2", # llama-pth "encoder.layer.{bid}.output.dense", # bert "transformer.h.{bid}.mlp.fc_out", # gpt-j @@ -383,7 +383,7 @@ class TensorNameMap: MODEL_TENSOR.ATTN_Q_NORM: ( "language_model.encoder.layers.{bid}.self_attention.q_layernorm", "model.layers.{bid}.self_attn.q_layernorm", # persimmon - "model.layers.{bid}.self_attn.q_norm", # cohere olmoe chameleon + "model.layers.{bid}.self_attn.q_norm", # cohere olmoe chameleon olmo_1124 "transformer.blocks.{bid}.attn.q_ln", # sea-lion "encoder.layer.{bid}.attention.self.layer_norm_q", # jina-bert-v2 "transformer.layers.{bid}.attn.q_norm", # openelm @@ -392,7 +392,7 @@ class TensorNameMap: MODEL_TENSOR.ATTN_K_NORM: ( "language_model.encoder.layers.{bid}.self_attention.k_layernorm", "model.layers.{bid}.self_attn.k_layernorm", # persimmon - "model.layers.{bid}.self_attn.k_norm", # cohere olmoe chameleon + "model.layers.{bid}.self_attn.k_norm", # cohere olmoe chameleon olmo_1124 "transformer.blocks.{bid}.attn.k_ln", # sea-lion "encoder.layer.{bid}.attention.self.layer_norm_k", # jina-bert-v2 "transformer.layers.{bid}.attn.k_norm", # openelm diff --git a/src/llama.cpp b/src/llama.cpp index 25d5caeb3f80e..1ac755c729c93 100644 --- a/src/llama.cpp +++ b/src/llama.cpp @@ -183,6 +183,7 @@ enum llm_arch { LLM_ARCH_COMMAND_R, LLM_ARCH_DBRX, LLM_ARCH_OLMO, + LLM_ARCH_OLMO_1124, LLM_ARCH_OLMOE, LLM_ARCH_OPENELM, LLM_ARCH_ARCTIC, @@ -236,6 +237,7 @@ static const std::map LLM_ARCH_NAMES = { { LLM_ARCH_COMMAND_R, "command-r" }, { LLM_ARCH_DBRX, "dbrx" }, { LLM_ARCH_OLMO, "olmo" }, + { LLM_ARCH_OLMO_1124, "olmo_1124" }, { LLM_ARCH_OLMOE, "olmoe" }, { LLM_ARCH_OPENELM, "openelm" }, { LLM_ARCH_ARCTIC, "arctic" }, @@ -1211,6 +1213,25 @@ static const std::map> LLM_TENSOR_N { LLM_TENSOR_FFN_UP, "blk.%d.ffn_up" }, }, }, + { + LLM_ARCH_OLMO_1124, + { + { LLM_TENSOR_TOKEN_EMBD, "token_embd" }, + { LLM_TENSOR_OUTPUT_NORM, "output_norm" }, + { LLM_TENSOR_OUTPUT, "output" }, + { LLM_TENSOR_ATTN_Q, "blk.%d.attn_q" }, + { LLM_TENSOR_ATTN_K, "blk.%d.attn_k" }, + { LLM_TENSOR_ATTN_V, "blk.%d.attn_v" }, + { LLM_TENSOR_ATTN_OUT, "blk.%d.attn_output" }, + { LLM_TENSOR_ATTN_POST_NORM, "blk.%d.post_attention_norm" }, + { LLM_TENSOR_ATTN_Q_NORM, "blk.%d.attn_q_norm" }, + { LLM_TENSOR_ATTN_K_NORM, "blk.%d.attn_k_norm" }, + { LLM_TENSOR_FFN_POST_NORM, "blk.%d.post_ffw_norm" }, + { LLM_TENSOR_FFN_GATE, "blk.%d.ffn_gate" }, + { LLM_TENSOR_FFN_DOWN, "blk.%d.ffn_down" }, + { LLM_TENSOR_FFN_UP, "blk.%d.ffn_up" }, + }, + }, { LLM_ARCH_OLMOE, { @@ -5881,6 +5902,17 @@ static void llm_load_hparams( default: model.type = e_model::MODEL_UNKNOWN; } } break; + case LLM_ARCH_OLMO_1124: + { + ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps); + + switch (hparams.n_layer) { + case 16: model.type = e_model::MODEL_1B; break; + case 32: model.type = e_model::MODEL_7B; break; + case 40: model.type = e_model::MODEL_13B; break; + default: model.type = e_model::MODEL_UNKNOWN; + } + } break; case LLM_ARCH_OLMOE: { ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps); @@ -8568,6 +8600,31 @@ static bool llm_load_tensors( layer.ffn_up = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, 0); } } break; + case LLM_ARCH_OLMO_1124: + { + model.tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0); + + // output + model.output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0); + model.output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, 0); + + for (int i = 0; i < n_layer; ++i) { + auto & layer = model.layers[i]; + + layer.wq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "weight", i), {n_embd, n_embd}, 0); + layer.wk = create_tensor(tn(LLM_TENSOR_ATTN_K, "weight", i), {n_embd, n_embd_gqa}, 0); + layer.wv = create_tensor(tn(LLM_TENSOR_ATTN_V, "weight", i), {n_embd, n_embd_gqa}, 0); + layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd}, 0); + layer.attn_q_norm = create_tensor(tn(LLM_TENSOR_ATTN_Q_NORM, "weight", i), {n_embd}, 0); + layer.attn_k_norm = create_tensor(tn(LLM_TENSOR_ATTN_K_NORM, "weight", i), {n_embd}, 0); + layer.attn_post_norm = create_tensor(tn(LLM_TENSOR_ATTN_POST_NORM, "weight", i), {n_embd}, 0); + + layer.ffn_gate = create_tensor(tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff}, 0); + layer.ffn_up = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, 0); + layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd}, 0); + layer.ffn_post_norm = create_tensor(tn(LLM_TENSOR_FFN_POST_NORM, "weight", i), {n_embd}, 0); + } + } break; case LLM_ARCH_OLMOE: { model.tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0); @@ -14433,6 +14490,130 @@ struct llm_build_context { return gf; } + struct ggml_cgraph * build_olmo_1124() { + struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, llama_model_max_nodes(model), false); + + // mutable variable, needed during the last layer of the computation to skip unused tokens + int32_t n_tokens = this->n_tokens; + + const int64_t n_embd_head = hparams.n_embd_head_v; + GGML_ASSERT(n_embd_head == hparams.n_embd_head_k); + GGML_ASSERT(n_embd_head == hparams.n_rot); + + struct ggml_tensor * cur; + struct ggml_tensor * inpL; + + inpL = llm_build_inp_embd(ctx0, lctx, hparams, ubatch, model.tok_embd, cb); + + // inp_pos - contains the positions + struct ggml_tensor * inp_pos = build_inp_pos(); + + // KQ_mask (mask for 1 head, it will be broadcasted to all heads) + struct ggml_tensor * KQ_mask = build_inp_KQ_mask(); + + for (int il = 0; il < n_layer; ++il) { + struct ggml_tensor * inpSA = inpL; + + cur = inpL; + + // self_attention + { + // compute Q and K and RoPE them + struct ggml_tensor * Qcur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wq, cur); + cb(Qcur, "Qcur", il); + + struct ggml_tensor * Kcur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wk, cur); + cb(Kcur, "Kcur", il); + + struct ggml_tensor * Vcur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wv, cur); + cb(Vcur, "Vcur", il); + + Qcur = llm_build_norm(ctx0, Qcur, hparams, model.layers[il].attn_q_norm, NULL, + LLM_NORM_RMS, cb, il); + cb(Qcur, "Qcur_normed", il); + + Kcur = llm_build_norm(ctx0, Kcur, hparams, model.layers[il].attn_k_norm, NULL, + LLM_NORM_RMS, cb, il); + cb(Kcur, "Kcur_normed", il); + + Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens); + Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens); + + Qcur = ggml_rope_ext( + ctx0, Qcur, inp_pos, nullptr, + n_rot, rope_type, n_ctx_orig, freq_base, freq_scale, + ext_factor, attn_factor, beta_fast, beta_slow + ); + cb(Qcur, "Qcur_rope", il); + + Kcur = ggml_rope_ext( + ctx0, Kcur, inp_pos, nullptr, + n_rot, rope_type, n_ctx_orig, freq_base, freq_scale, + ext_factor, attn_factor, beta_fast, beta_slow + ); + cb(Kcur, "Kcur_rope", il); + + cur = llm_build_kv(ctx0, lctx, kv_self, gf, + model.layers[il].wo, NULL, + Kcur, Vcur, Qcur, KQ_mask, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il); + } + + cur = llm_build_norm(ctx0, cur, hparams, + model.layers[il].attn_post_norm, NULL, + LLM_NORM_RMS, cb, il); + cb(cur, "attn_post_norm", il); + + if (il == n_layer - 1) { + // skip computing output for unused tokens + struct ggml_tensor * inp_out_ids = build_inp_out_ids(); + n_tokens = n_outputs; + cur = ggml_get_rows(ctx0, cur, inp_out_ids); + inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids); + } + + struct ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpSA); + cb(ffn_inp, "ffn_inp", il); + + // feed-forward network + cur = llm_build_ffn(ctx0, lctx, ffn_inp, + model.layers[il].ffn_up, NULL, NULL, + model.layers[il].ffn_gate, NULL, NULL, + model.layers[il].ffn_down, NULL, NULL, + NULL, + LLM_FFN_SILU, LLM_FFN_PAR, cb, il); + cb(cur, "ffn_out", il); + + cur = llm_build_norm(ctx0, cur, hparams, + model.layers[il].ffn_post_norm, NULL, + LLM_NORM_RMS, cb, -1); + cb(cur, "ffn_post_norm", -1); + + cur = ggml_add(ctx0, cur, ffn_inp); + cb(cur, "ffn_out", il); + + cur = lctx.cvec.apply_to(ctx0, cur, il); + cb(cur, "l_out", il); + + // input for next layer + inpL = cur; + } + + cur = inpL; + + cur = llm_build_norm(ctx0, cur, hparams, + model.output_norm, NULL, + LLM_NORM_RMS, cb, -1); + cb(cur, "result_norm", -1); + + // lm_head + cur = llm_build_lora_mm(lctx, ctx0, model.output, cur); + cb(cur, "result_output", -1); + + ggml_build_forward_expand(gf, cur); + + return gf; + } + // based on the build_qwen2moe() function, changes: // * removed shared experts // * removed bias @@ -16625,6 +16806,10 @@ static struct ggml_cgraph * llama_build_graph( { result = llm.build_olmo(); } break; + case LLM_ARCH_OLMO_1124: + { + result = llm.build_olmo_1124(); + } break; case LLM_ARCH_OLMOE: { result = llm.build_olmoe(); @@ -19894,6 +20079,7 @@ enum llama_rope_type llama_rope_type(const struct llama_model * model) { case LLM_ARCH_QWEN: case LLM_ARCH_QWEN2: case LLM_ARCH_QWEN2MOE: + case LLM_ARCH_OLMO_1124: case LLM_ARCH_OLMOE: case LLM_ARCH_PHI2: case LLM_ARCH_PHI3: From a9069ecb974b988efa4af0f68fbe5444a59a06b2 Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Tue, 19 Nov 2024 13:29:26 +0200 Subject: [PATCH 010/245] llama : add check for KV cache shifts (#10401) ggml-ci --- common/common.cpp | 6 ++++++ include/llama.h | 3 +++ src/llama.cpp | 6 +++++- 3 files changed, 14 insertions(+), 1 deletion(-) diff --git a/common/common.cpp b/common/common.cpp index 93037462127d5..d314523db4c62 100644 --- a/common/common.cpp +++ b/common/common.cpp @@ -875,6 +875,12 @@ struct common_init_result common_init_from_params(common_params & params) { return iparams; } + if (params.ctx_shift && !llama_kv_cache_can_shift(lctx)) { + LOG_ERR("%s: KV cache shifting is not supported for this model (--no-context-shift to disable)'\n", __func__); + llama_free_model(model); + return iparams; + } + if (!params.control_vectors.empty()) { if (params.control_vector_layer_start <= 0) params.control_vector_layer_start = 1; if (params.control_vector_layer_end <= 0) params.control_vector_layer_end = llama_n_layer(model); diff --git a/include/llama.h b/include/llama.h index bc268e7996baa..90791d5f5ea12 100644 --- a/include/llama.h +++ b/include/llama.h @@ -667,6 +667,9 @@ extern "C" { // Apply the KV cache updates (such as K-shifts, defragmentation, etc.) LLAMA_API void llama_kv_cache_update(struct llama_context * ctx); + // Check if the context supports KV cache shifting + LLAMA_API bool llama_kv_cache_can_shift(struct llama_context * ctx); + // // State / sessions // diff --git a/src/llama.cpp b/src/llama.cpp index 1ac755c729c93..bfc2b1d5afcbb 100644 --- a/src/llama.cpp +++ b/src/llama.cpp @@ -18222,7 +18222,7 @@ static void llama_kv_cache_update_internal(struct llama_context & lctx) { // apply K-shift if needed if (lctx.model.hparams.rope_type != LLAMA_ROPE_TYPE_NONE && lctx.kv_self.has_shift) { - if (lctx.model.arch == LLM_ARCH_DEEPSEEK2) { // not supported due to MLA + if (!llama_kv_cache_can_shift(&lctx)) { GGML_ABORT("Deepseek2 does not support K-shift"); } @@ -20471,6 +20471,10 @@ void llama_kv_cache_update(struct llama_context * ctx) { llama_kv_cache_update_internal(*ctx); } +bool llama_kv_cache_can_shift(struct llama_context * ctx) { + return ctx->model.arch != LLM_ARCH_DEEPSEEK2; // not supported due to MLA +} + // deprecated size_t llama_get_state_size(struct llama_context * ctx) { return llama_state_get_size(ctx); From 2641f52eef6b1d0a919d7e7aa5d9f1f28ea335b5 Mon Sep 17 00:00:00 2001 From: Diego Devesa Date: Tue, 19 Nov 2024 14:29:38 +0100 Subject: [PATCH 011/245] cuda : fix CUDA_FLAGS not being applied (#10403) --- ggml/src/ggml-blas/CMakeLists.txt | 1 - ggml/src/ggml-cuda/CMakeLists.txt | 2 +- 2 files changed, 1 insertion(+), 2 deletions(-) diff --git a/ggml/src/ggml-blas/CMakeLists.txt b/ggml/src/ggml-blas/CMakeLists.txt index ffe39e8bd1c97..e2cbabf0dae74 100644 --- a/ggml/src/ggml-blas/CMakeLists.txt +++ b/ggml/src/ggml-blas/CMakeLists.txt @@ -75,7 +75,6 @@ if (BLAS_FOUND) message(STATUS "BLAS found, Includes: ${BLAS_INCLUDE_DIRS}") - #add_compile_options(${BLAS_LINKER_FLAGS}) target_compile_options(ggml-blas PRIVATE ${BLAS_LINKER_FLAGS}) if (${BLAS_INCLUDE_DIRS} MATCHES "mkl" AND (${GGML_BLAS_VENDOR} MATCHES "Generic" OR ${GGML_BLAS_VENDOR} MATCHES "Intel")) diff --git a/ggml/src/ggml-cuda/CMakeLists.txt b/ggml/src/ggml-cuda/CMakeLists.txt index 25858ece8ddd7..e1482a269d698 100644 --- a/ggml/src/ggml-cuda/CMakeLists.txt +++ b/ggml/src/ggml-cuda/CMakeLists.txt @@ -149,7 +149,7 @@ if (CUDAToolkit_FOUND) list(APPEND CUDA_FLAGS -Xcompiler ${CUDA_CXX_FLAGS_JOINED}) endif() - add_compile_options("$<$:${CUDA_FLAGS}>") + target_compile_options(ggml-cuda PRIVATE "$<$:${CUDA_FLAGS}>") else() message(FATAL_ERROR "CUDA Toolkit not found") endif() From 3e2334e02a78130fc4c7f9684c1a816a217cec5d Mon Sep 17 00:00:00 2001 From: bandoti <141645996+bandoti@users.noreply.github.com> Date: Tue, 19 Nov 2024 12:10:30 -0400 Subject: [PATCH 012/245] Add required ggml-base and backend libs to cmake pkg (#10407) --- cmake/llama-config.cmake.in | 52 +++++++++++++++++++++++-------------- ggml/CMakeLists.txt | 8 ++---- ggml/src/CMakeLists.txt | 2 +- 3 files changed, 35 insertions(+), 27 deletions(-) diff --git a/cmake/llama-config.cmake.in b/cmake/llama-config.cmake.in index a7f1efb88f2aa..28a8c18b65176 100644 --- a/cmake/llama-config.cmake.in +++ b/cmake/llama-config.cmake.in @@ -3,17 +3,11 @@ set(LLAMA_BUILD_COMMIT @LLAMA_BUILD_COMMIT@) set(LLAMA_BUILD_NUMBER @LLAMA_BUILD_NUMBER@) set(LLAMA_SHARED_LIB @BUILD_SHARED_LIBS@) -set(GGML_BLAS @GGML_BLAS@) -set(GGML_CUDA @GGML_CUDA@) -set(GGML_METAL @GGML_METAL@) -set(GGML_HIP @GGML_HIP@) set(GGML_ACCELERATE @GGML_ACCELERATE@) -set(GGML_VULKAN @GGML_VULKAN@) set(GGML_VULKAN_CHECK_RESULTS @GGML_VULKAN_CHECK_RESULTS@) set(GGML_VULKAN_DEBUG @GGML_VULKAN_DEBUG@) set(GGML_VULKAN_MEMORY_DEBUG @GGML_VULKAN_MEMORY_DEBUG@) set(GGML_VULKAN_VALIDATE @GGML_VULKAN_VALIDATE@) -set(GGML_SYCL @GGML_SYCL@) set(GGML_OPENMP @GGML_OPENMP@) @PACKAGE_INIT@ @@ -22,10 +16,39 @@ set_and_check(LLAMA_INCLUDE_DIR "@PACKAGE_LLAMA_INCLUDE_INSTALL_DIR@") set_and_check(LLAMA_LIB_DIR "@PACKAGE_LLAMA_LIB_INSTALL_DIR@") set_and_check(LLAMA_BIN_DIR "@PACKAGE_LLAMA_BIN_INSTALL_DIR@") -# Ensure transient dependencies satisfied - find_package(Threads REQUIRED) +set(_llama_transient_defines "@GGML_TRANSIENT_DEFINES@") +set(_llama_link_deps "") +foreach(_ggml_lib ggml ggml-base) + string(REPLACE "-" "_" _ggml_lib_var "${_ggml_lib}_LIBRARY") + find_library(${_ggml_lib_var} ${_ggml_lib} + REQUIRED + HINTS ${LLAMA_LIB_DIR} + NO_CMAKE_FIND_ROOT_PATH + ) + list(APPEND _llama_link_deps "${${_ggml_lib_var}}") + message(STATUS "Found ${${_ggml_lib_var}}") +endforeach() + +foreach(backend amx blas cann cpu cuda hip kompute metal musa rpc sycl vulkan) + string(TOUPPER "GGML_${backend}" backend_id) + set(_ggml_lib "ggml-${backend}") + string(REPLACE "-" "_" _ggml_lib_var "${_ggml_lib}_LIBRARY") + + find_library(${_ggml_lib_var} ${_ggml_lib} + HINTS ${LLAMA_LIB_DIR} + NO_CMAKE_FIND_ROOT_PATH + ) + if(${_ggml_lib_var}) + list(APPEND _llama_link_deps "${${_ggml_lib_var}}") + set(${backend_id} ON) + message(STATUS "Found backend ${${_ggml_lib_var}}") + else() + set(${backend_id} OFF) + endif() +endforeach() + if (APPLE AND GGML_ACCELERATE) find_library(ACCELERATE_FRAMEWORK Accelerate REQUIRED) endif() @@ -48,7 +71,7 @@ if (GGML_VULKAN) find_package(Vulkan REQUIRED) endif() -if (GGML_HIPBLAS) +if (GGML_HIP) find_package(hip REQUIRED) find_package(hipblas REQUIRED) find_package(rocblas REQUIRED) @@ -63,24 +86,13 @@ if (GGML_OPENMP) find_package(OpenMP REQUIRED) endif() - -find_library(ggml_LIBRARY ggml - REQUIRED - HINTS ${LLAMA_LIB_DIR} - NO_CMAKE_FIND_ROOT_PATH -) - find_library(llama_LIBRARY llama REQUIRED HINTS ${LLAMA_LIB_DIR} NO_CMAKE_FIND_ROOT_PATH ) -set(_llama_link_deps "${ggml_LIBRARY}" "@GGML_LINK_LIBRARIES@") -set(_llama_transient_defines "@GGML_TRANSIENT_DEFINES@") - add_library(llama UNKNOWN IMPORTED) - set_target_properties(llama PROPERTIES INTERFACE_INCLUDE_DIRECTORIES "${LLAMA_INCLUDE_DIR}" diff --git a/ggml/CMakeLists.txt b/ggml/CMakeLists.txt index b16a0e9ada7ed..9ab91421a7d25 100644 --- a/ggml/CMakeLists.txt +++ b/ggml/CMakeLists.txt @@ -235,12 +235,8 @@ set_target_properties(ggml PROPERTIES PUBLIC_HEADER "${GGML_PUBLIC_HEADERS}") #if (GGML_METAL) # set_target_properties(ggml PROPERTIES RESOURCE "${CMAKE_CURRENT_SOURCE_DIR}/src/ggml-metal.metal") #endif() -install(TARGETS ggml PUBLIC_HEADER) - -if (BUILD_SHARED_LIBS) - install(TARGETS ggml LIBRARY) - install(TARGETS ggml-base LIBRARY) -endif() +install(TARGETS ggml LIBRARY PUBLIC_HEADER) +install(TARGETS ggml-base LIBRARY) # FIXME: this should be done in the backend cmake files if (GGML_METAL) diff --git a/ggml/src/CMakeLists.txt b/ggml/src/CMakeLists.txt index ae7d3abc8de32..8df0e85c0d092 100644 --- a/ggml/src/CMakeLists.txt +++ b/ggml/src/CMakeLists.txt @@ -239,8 +239,8 @@ function(ggml_add_backend backend) if (${BUILD_SHARED_LIBS}) target_compile_definitions(${backend_target} PRIVATE GGML_BACKEND_BUILD) target_compile_definitions(${backend_target} PUBLIC GGML_BACKEND_SHARED) - install(TARGETS ${backend_target} LIBRARY) endif() + install(TARGETS ${backend_target} LIBRARY) target_link_libraries(ggml PUBLIC ${backend_target}) string(TOUPPER "GGML_USE_${backend}" backend_use) target_compile_definitions(ggml PUBLIC ${backend_use}) From 1c592f6c64749aeae23e242bc87af28658f5a558 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E8=95=AD=E6=BE=A7=E9=82=A6?= <45505768+shou692199@users.noreply.github.com> Date: Wed, 20 Nov 2024 01:42:00 +0800 Subject: [PATCH 013/245] cmake: force MSVC compiler charset to utf-8 (#9989) --- CMakeLists.txt | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/CMakeLists.txt b/CMakeLists.txt index 93c60ef431d66..994e61e45fedd 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -46,6 +46,13 @@ if (WIN32) add_compile_definitions(_CRT_SECURE_NO_WARNINGS) endif() +if ("${CMAKE_CXX_COMPILER_ID}" STREQUAL "MSVC") + add_compile_options("$<$:/source-charset:utf-8>") + add_compile_options("$<$:/source-charset:utf-8>") + add_compile_options("$<$:/execution-charset:utf-8>") + add_compile_options("$<$:/execution-charset:utf-8>") +endif() + # # option list # From 0354ccbbf23fb8f53cbea565e3dc6258d61ec807 Mon Sep 17 00:00:00 2001 From: PAB Date: Mon, 18 Nov 2024 10:02:49 +0100 Subject: [PATCH 014/245] metal : add `GGML_UNARY_OP_ELU` kernel (ggml/1018) --- ggml/src/ggml-metal/ggml-metal.m | 15 +++++++++++++++ ggml/src/ggml-metal/ggml-metal.metal | 8 ++++++++ 2 files changed, 23 insertions(+) diff --git a/ggml/src/ggml-metal/ggml-metal.m b/ggml/src/ggml-metal/ggml-metal.m index 58fee4bfd1296..d1abb3cef0ec4 100644 --- a/ggml/src/ggml-metal/ggml-metal.m +++ b/ggml/src/ggml-metal/ggml-metal.m @@ -126,6 +126,7 @@ static void ggml_backend_metal_device_rel(struct ggml_backend_metal_device_conte GGML_METAL_KERNEL_TYPE_GELU_QUICK_4, GGML_METAL_KERNEL_TYPE_SILU, GGML_METAL_KERNEL_TYPE_SILU_4, + GGML_METAL_KERNEL_TYPE_ELU, GGML_METAL_KERNEL_TYPE_SOFT_MAX_F16, GGML_METAL_KERNEL_TYPE_SOFT_MAX_F16_4, GGML_METAL_KERNEL_TYPE_SOFT_MAX_F32, @@ -649,6 +650,7 @@ @implementation GGMLMetalClass GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_GELU_QUICK_4, gelu_quick_4, true); GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_SILU, silu, true); GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_SILU_4, silu_4, true); + GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_ELU, elu, true); GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_SOFT_MAX_F16, soft_max_f16, has_simdgroup_reduction); GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_SOFT_MAX_F16_4, soft_max_f16_4, has_simdgroup_reduction); GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_SOFT_MAX_F32, soft_max_f32, has_simdgroup_reduction); @@ -968,6 +970,7 @@ static bool ggml_metal_supports_op(const struct ggml_backend_metal_device_contex case GGML_UNARY_OP_GELU: case GGML_UNARY_OP_GELU_QUICK: case GGML_UNARY_OP_SILU: + case GGML_UNARY_OP_ELU: return ggml_is_contiguous(op->src[0]); default: return false; @@ -1589,6 +1592,18 @@ static void ggml_metal_encode_node( [encoder dispatchThreadgroups:MTLSizeMake(n, 1, 1) threadsPerThreadgroup:MTLSizeMake(1, 1, 1)]; } break; + case GGML_UNARY_OP_ELU: + { + id pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_ELU].pipeline; + + [encoder setComputePipelineState:pipeline]; + [encoder setBuffer:id_src0 offset:offs_src0 atIndex:0]; + [encoder setBuffer:id_dst offset:offs_dst atIndex:1]; + + const int64_t n = ggml_nelements(dst); + + [encoder dispatchThreadgroups:MTLSizeMake(n, 1, 1) threadsPerThreadgroup:MTLSizeMake(1, 1, 1)]; + } break; default: { GGML_LOG_WARN("%s: node %3d, op = %8s not implemented\n", __func__, idx, ggml_op_name(dst->op)); diff --git a/ggml/src/ggml-metal/ggml-metal.metal b/ggml/src/ggml-metal/ggml-metal.metal index 86fdf1c18cfb6..819b20ba89bec 100644 --- a/ggml/src/ggml-metal/ggml-metal.metal +++ b/ggml/src/ggml-metal/ggml-metal.metal @@ -782,6 +782,14 @@ kernel void kernel_silu_4( dst[tpig] = x / (1.0f + exp(-x)); } +kernel void kernel_elu( + device const float * src0, + device float * dst, + uint tpig[[thread_position_in_grid]]) { + device const float & x = src0[tpig]; + dst[tpig] = (x > 0.0f) ? x : (exp(x) - 1.0f); +} + kernel void kernel_sqr( device const float * src0, device float * dst, From 0cc8a630406819e250c13ef0e101ae90280db2b2 Mon Sep 17 00:00:00 2001 From: Plamen Minev Date: Mon, 18 Nov 2024 15:02:27 +0200 Subject: [PATCH 015/245] metal : fox offset integer overflows in im2col (ggml/1015) -- While running StableDiffusion.cpp locally with Metal some offsets overflow and results in incorrect calculations --- ggml/src/ggml-metal/ggml-metal.metal | 52 ++++++++++++++++++---------- 1 file changed, 33 insertions(+), 19 deletions(-) diff --git a/ggml/src/ggml-metal/ggml-metal.metal b/ggml/src/ggml-metal/ggml-metal.metal index 819b20ba89bec..971f5054bce2a 100644 --- a/ggml/src/ggml-metal/ggml-metal.metal +++ b/ggml/src/ggml-metal/ggml-metal.metal @@ -2145,20 +2145,34 @@ kernel void kernel_im2col( uint3 tgpg[[threadgroups_per_grid]], uint3 tpitg[[thread_position_in_threadgroup]], uint3 ntg[[threads_per_threadgroup]]) { - const int32_t iiw = tgpig[2] * s0 + tpitg[2] * d0 - p0; - const int32_t iih = tgpig[1] * s1 + tpitg[1] * d1 - p1; +// const int64_t IC = tgpg[0]; + const int64_t OH = tgpg[1]; + const int64_t OW = tgpg[2]; - const int32_t offset_dst = - (tpitg[0] * tgpg[1] * tgpg[2] + tgpig[1] * tgpg[2] + tgpig[2]) * CHW + - (tgpig[0] * (ntg[1] * ntg[2]) + tpitg[1] * ntg[2] + tpitg[2]); +// const int64_t N = ntg[0]; + const int64_t KH = ntg[1]; + const int64_t KW = ntg[2]; + + const int64_t in = tpitg[0]; + const int64_t ikh = tpitg[1]; + const int64_t ikw = tpitg[2]; + + const int64_t iic = tgpig[0]; + const int64_t ioh = tgpig[1]; + const int64_t iow = tgpig[2]; + + const int64_t iiw = iow*s0 + ikw*d0 - p0; + const int64_t iih = ioh*s1 + ikh*d1 - p1; + + const int64_t offset_dst = (in*OH*OW + ioh*OW + iow)*CHW + (iic*(KH*KW) + ikh*KW + ikw); device T * pdst = (device T *) (dst); if (iih < 0 || iih >= IH || iiw < 0 || iiw >= IW) { pdst[offset_dst] = 0.0f; } else { - const int32_t offset_src = tpitg[0] * ofs0 + tgpig[0] * ofs1; - pdst[offset_dst] = x[offset_src + iih * IW + iiw]; + const int64_t offset_src = in*ofs0 + iic*ofs1 + iih*IW + iiw; + pdst[offset_dst] = x[offset_src]; } } @@ -2209,25 +2223,25 @@ kernel void kernel_im2col_ext( uint3 tgpg[[threadgroups_per_grid]], // tgpg[0] = D x IC x KH x KW, CHW = IC x KH x KW uint3 tpitg[[thread_position_in_threadgroup]], uint3 ntg[[threads_per_threadgroup]]) { // [M, 1, 1] - const int32_t KHW = KH * KW; // KHW == ntg[1] * ntg[2], KW == ntg[2] + const int64_t KHW = KH * KW; // KHW == ntg[1] * ntg[2], KW == ntg[2] - const int32_t d = tgpig[0] / CHW; - const int32_t chw = tgpig[0] % CHW; - const int32_t tgpig_0 = chw / KHW; // 0 ~ (IC - 1) - const int32_t HW = tgpig[0] % KHW; + const int64_t d = tgpig[0] / CHW; + const int64_t chw = tgpig[0] % CHW; + const int64_t tgpig_0 = chw / KHW; // 0 ~ (IC - 1) + const int64_t HW = tgpig[0] % KHW; - const int32_t tpitg_0 = (d * ntg[0]) + tpitg[0]; + const int64_t tpitg_0 = (d * ntg[0]) + tpitg[0]; if (tpitg_0 >= N) { return; } - const int32_t tpitg_1 = HW / KW; - const int32_t tpitg_2 = HW % KW; + const int64_t tpitg_1 = HW / KW; + const int64_t tpitg_2 = HW % KW; - const int32_t iiw = tgpig[2] * s0 + tpitg_2 * d0 - p0; - const int32_t iih = tgpig[1] * s1 + tpitg_1 * d1 - p1; + const int64_t iiw = tgpig[2] * s0 + tpitg_2 * d0 - p0; + const int64_t iih = tgpig[1] * s1 + tpitg_1 * d1 - p1; - const int32_t offset_dst = + const int64_t offset_dst = (tpitg_0 * tgpg[1] * tgpg[2] + tgpig[1] * tgpg[2] + tgpig[2]) * CHW + (tgpig_0 * KHW + tpitg_1 * KW + tpitg_2); @@ -2236,7 +2250,7 @@ kernel void kernel_im2col_ext( if (iih < 0 || iih >= IH || iiw < 0 || iiw >= IW) { pdst[offset_dst] = 0.0f; } else { - const int32_t offset_src = tpitg_0 * ofs0 + tgpig_0 * ofs1; + const int64_t offset_src = tpitg_0 * ofs0 + tgpig_0 * ofs1; pdst[offset_dst] = x[offset_src + iih * IW + iiw]; } } From 72819e7be07aa8c425974f7aedd112ee113c3b60 Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Tue, 19 Nov 2024 19:15:50 +0200 Subject: [PATCH 016/245] sync : ggml --- scripts/sync-ggml.last | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/scripts/sync-ggml.last b/scripts/sync-ggml.last index 6ddb71ab133bf..e9bd2dbb05e2c 100644 --- a/scripts/sync-ggml.last +++ b/scripts/sync-ggml.last @@ -1 +1 @@ -9d0708e863f3aa2fc1eb0b75d433303c30bd0dbc +2884dd72fea8922910fe53387c3d17ab928d3a8e From 514b1936973f366552f77367e6c1db7ff225fec5 Mon Sep 17 00:00:00 2001 From: haopeng <657407891@qq.com> Date: Wed, 20 Nov 2024 04:10:31 +0800 Subject: [PATCH 017/245] add cmake rvv support (#10411) --- ggml/CMakeLists.txt | 1 + ggml/src/ggml-cpu/CMakeLists.txt | 5 +++++ 2 files changed, 6 insertions(+) diff --git a/ggml/CMakeLists.txt b/ggml/CMakeLists.txt index 9ab91421a7d25..2d32da1b6d879 100644 --- a/ggml/CMakeLists.txt +++ b/ggml/CMakeLists.txt @@ -109,6 +109,7 @@ if (NOT MSVC) endif() option(GGML_LASX "ggml: enable lasx" ON) option(GGML_LSX "ggml: enable lsx" ON) +option(GGML_RVV "ggml: enable rvv" ON) option(GGML_SVE "ggml: enable SVE" OFF) if (WIN32) diff --git a/ggml/src/ggml-cpu/CMakeLists.txt b/ggml/src/ggml-cpu/CMakeLists.txt index cef41a0743cef..2880523331dbd 100644 --- a/ggml/src/ggml-cpu/CMakeLists.txt +++ b/ggml/src/ggml-cpu/CMakeLists.txt @@ -244,6 +244,11 @@ elseif (${CMAKE_SYSTEM_PROCESSOR} MATCHES "loongarch64") if (GGML_LSX) list(APPEND ARCH_FLAGS -mlsx) endif() +elseif (${CMAKE_SYSTEM_PROCESSOR} MATCHES "riscv64") + message(STATUS "RISC-V detected") + if (GGML_RVV) + list(APPEND ARCH_FLAGS -march=rv64gcv -mabi=lp64d) + endif() else() message(STATUS "Unknown architecture") endif() From 142d3c28aab76cda1449e0290bf232574fb10bb0 Mon Sep 17 00:00:00 2001 From: Anthony Van de Gejuchte Date: Tue, 19 Nov 2024 23:18:17 +0100 Subject: [PATCH 018/245] Fix missing file renames in Makefile due to changes in commit ae8de6d50a (#10413) --- Makefile | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/Makefile b/Makefile index 95110d4eb7d8b..5c899438515e1 100644 --- a/Makefile +++ b/Makefile @@ -730,10 +730,10 @@ GLSLC_CMD = glslc _ggml_vk_genshaders_cmd = $(shell pwd)/vulkan-shaders-gen _ggml_vk_header = ggml/src/ggml-vulkan-shaders.hpp _ggml_vk_source = ggml/src/ggml-vulkan-shaders.cpp -_ggml_vk_input_dir = ggml/src/vulkan-shaders +_ggml_vk_input_dir = ggml/src/ggml-vulkan/vulkan-shaders _ggml_vk_shader_deps = $(echo $(_ggml_vk_input_dir)/*.comp) -ggml/src/ggml-vulkan.o: ggml/src/ggml-vulkan.cpp ggml/include/ggml-vulkan.h $(_ggml_vk_header) $(_ggml_vk_source) +ggml/src/ggml-vulkan.o: ggml/src/ggml-vulkan/ggml-vulkan.cpp ggml/include/ggml-vulkan.h $(_ggml_vk_header) $(_ggml_vk_source) $(CXX) $(CXXFLAGS) $(shell pkg-config --cflags vulkan) -c $< -o $@ $(_ggml_vk_header): $(_ggml_vk_source) @@ -745,8 +745,8 @@ $(_ggml_vk_source): $(_ggml_vk_shader_deps) vulkan-shaders-gen --target-hpp $(_ggml_vk_header) \ --target-cpp $(_ggml_vk_source) -vulkan-shaders-gen: ggml/src/vulkan-shaders/vulkan-shaders-gen.cpp - $(CXX) $(CXXFLAGS) -o $@ $(LDFLAGS) ggml/src/vulkan-shaders/vulkan-shaders-gen.cpp +vulkan-shaders-gen: ggml/src/ggml-vulkan/vulkan-shaders/vulkan-shaders-gen.cpp + $(CXX) $(CXXFLAGS) -o $@ $(LDFLAGS) ggml/src/ggml-vulkan/vulkan-shaders/vulkan-shaders-gen.cpp endif # GGML_VULKAN From 68b25ede0c67a92ea5f7beb8d08be4ffa664ec74 Mon Sep 17 00:00:00 2001 From: Neo Zhang Jianyu Date: Wed, 20 Nov 2024 13:54:25 +0800 Subject: [PATCH 019/245] update rel to 4040 (#10395) Co-authored-by: arthw <14088817+arthw@users.noreply.github.com> --- docs/backend/SYCL.md | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/docs/backend/SYCL.md b/docs/backend/SYCL.md index 6c180f18860ef..96c12d8ab8e84 100644 --- a/docs/backend/SYCL.md +++ b/docs/backend/SYCL.md @@ -34,10 +34,10 @@ The SYCL backend would be broken by some PRs due to no online CI. The following release is verified with good quality: -|Commit ID|Tag|Release|Verified Platform| -|-|-|-|-| -|c16f01b97558440cc9df1472f8093d088338bf6c| ||Arc770/Linux/oneAPI 2024.2| -|fb76ec31a9914b7761c1727303ab30380fd4f05c|b3038 |[llama-b3038-bin-win-sycl-x64.zip](https://github.com/ggerganov/llama.cpp/releases/download/b3038/llama-b3038-bin-win-sycl-x64.zip) |Arc770/Linux/oneAPI 2024.1
MTL Arc GPU/Windows 11/oneAPI 2024.1| +|Commit ID|Tag|Release|Verified Platform| Update date| +|-|-|-|-|-| +|3bcd40b3c593d14261fb2abfabad3c0fb5b9e318|b4040 |[llama-b4040-bin-win-sycl-x64.zip](https://github.com/ggerganov/llama.cpp/releases/download/b4040/llama-b4040-bin-win-sycl-x64.zip) |Arc770/Linux/oneAPI 2024.1
MTL Arc GPU/Windows 11/oneAPI 2024.1| 2024-11-19| +|fb76ec31a9914b7761c1727303ab30380fd4f05c|b3038 |[llama-b3038-bin-win-sycl-x64.zip](https://github.com/ggerganov/llama.cpp/releases/download/b3038/llama-b3038-bin-win-sycl-x64.zip) |Arc770/Linux/oneAPI 2024.1
MTL Arc GPU/Windows 11/oneAPI 2024.1|| For CI and performance test summary, please refer to [llama.cpp CI for SYCL Backend](https://github.com/NeoZhangJianyu/ci_log/tree/main/llama.cpp). From a77f70500f3b4b2c1efb35326d0a3a435b9b54d0 Mon Sep 17 00:00:00 2001 From: Jeff Bolz Date: Wed, 20 Nov 2024 01:11:00 -0600 Subject: [PATCH 020/245] vulkan: further optimize mul_mat_vec using larger loads (#10387) * vulkan: Use pipeline_robustness to disable robustness in mul_mat_vec. Add some early returns for nonexistent rows in mul_mat_vec shaders. These can only be hit when dispatching a 2D grid of workgroups. Fix the logic for the 2D grid of workgroups to round up. Enable the pipeline robustness extension if it's available, and use it to disable robustness for these pipelines. The instructions to do the bounds checking contend for the same ALU resources as the bit twiddling dequant instructions. * vulkan: Add GLSL structure aliases for quant types to allow larger loads In Vulkan it's not possible to cast pointer types, so instead you have to declare an aliased binding for the memory with a different type. This commit adds aliases for the quant formats using 16b ints, and in a few places where the struct size is a multiple of 4 also using 32b ints. Currently only q4_k's aliases are used, but others will be used in subsequent commits. * vulkan: use larger loads in q5_k and q6_k shaders. Similar to the optimization I did in q4_k recently, this vectorizes some loads and reduces the number of bit twiddling instructions. * vulkan: use larger K step per iteration in mul_mat_vec. Add vec4 dequantization functions, and use them to do K=8 per iteration in mul_mat_vec. This uses 16b loads for the quant values and 128b loads for B which helps reduce the load on the memory system. The K_PER_ITER==2 logic is still there, just for F16/F32, and really only because they support unaligned sizes. Tweak the num_iters/unrolling logic to be simpler and catch a couple missed unrolling opportunities. --- ggml/src/ggml-vulkan/ggml-vulkan.cpp | 101 +++++++----- .../vulkan-shaders/dequant_funcs.comp | 48 ++++++ .../vulkan-shaders/mul_mat_vec.comp | 82 ++++++++-- .../vulkan-shaders/mul_mat_vec_base.comp | 3 + .../vulkan-shaders/mul_mat_vec_q2_k.comp | 4 + .../vulkan-shaders/mul_mat_vec_q3_k.comp | 4 + .../vulkan-shaders/mul_mat_vec_q4_k.comp | 34 ++--- .../vulkan-shaders/mul_mat_vec_q5_k.comp | 144 +++++++++++------- .../vulkan-shaders/mul_mat_vec_q6_k.comp | 69 ++++++--- .../src/ggml-vulkan/vulkan-shaders/types.comp | 113 +++++++++++++- .../vulkan-shaders/vulkan-shaders-gen.cpp | 6 +- 11 files changed, 459 insertions(+), 149 deletions(-) diff --git a/ggml/src/ggml-vulkan/ggml-vulkan.cpp b/ggml/src/ggml-vulkan/ggml-vulkan.cpp index 21fee2f3d7533..ca71da2f7b7f5 100644 --- a/ggml/src/ggml-vulkan/ggml-vulkan.cpp +++ b/ggml/src/ggml-vulkan/ggml-vulkan.cpp @@ -158,6 +158,7 @@ struct vk_device_struct { std::string name; uint64_t max_memory_allocation_size; bool fp16; + bool pipeline_robustness; vk::Device device; uint32_t vendor_id; vk_queue compute_queue; @@ -654,7 +655,7 @@ static uint32_t compile_count = 0; static std::mutex compile_count_mutex; static std::condition_variable compile_count_cond; -static void ggml_vk_create_pipeline_func(vk_device& device, vk_pipeline& pipeline, const std::string name, size_t spv_size, const void* spv_data, const std::string entrypoint, uint32_t parameter_count, uint32_t push_constant_size, std::array wg_denoms, std::vector specialization_constants, uint32_t align) { +static void ggml_vk_create_pipeline_func(vk_device& device, vk_pipeline& pipeline, const std::string name, size_t spv_size, const void* spv_data, const std::string entrypoint, uint32_t parameter_count, uint32_t push_constant_size, std::array wg_denoms, std::vector specialization_constants, uint32_t align, bool disable_robustness) { VK_LOG_DEBUG("ggml_vk_create_pipeline(" << device->name << ", " << name << ", " << entrypoint << ", " << parameter_count << ", " << push_constant_size << ", (" << wg_denoms[0] << "," << wg_denoms[1] << "," << wg_denoms[2] << "), specialization_constants, " << align << ")"); GGML_ASSERT(parameter_count > 0); GGML_ASSERT(wg_denoms[0] > 0 && wg_denoms[1] > 0 && wg_denoms[2] > 0); // NOLINT @@ -724,6 +725,15 @@ static void ggml_vk_create_pipeline_func(vk_device& device, vk_pipeline& pipelin vk::PipelineCreateFlags(), pipeline_shader_create_info, pipeline->layout); + + vk::PipelineRobustnessCreateInfoEXT rci; + + if (device->pipeline_robustness && disable_robustness) { + rci.storageBuffers = vk::PipelineRobustnessBufferBehaviorEXT::eDisabled; + rci.uniformBuffers = vk::PipelineRobustnessBufferBehaviorEXT::eDisabled; + compute_pipeline_create_info.setPNext(&rci); + } + pipeline->pipeline = device->device.createComputePipeline(VK_NULL_HANDLE, compute_pipeline_create_info).value; { @@ -1261,7 +1271,7 @@ static void ggml_vk_load_shaders(vk_device& device) { device->pipeline_dequant_mul_mat_mat_id[GGML_TYPE_IQ4_NL] = std::make_shared(); std::vector> compiles; - auto const &ggml_vk_create_pipeline = [&](vk_device& device, vk_pipeline& pipeline, const std::string &name, size_t spv_size, const void* spv_data, const std::string &entrypoint, uint32_t parameter_count, uint32_t push_constant_size, std::array wg_denoms, const std::vector& specialization_constants, uint32_t align) { + auto const &ggml_vk_create_pipeline = [&](vk_device& device, vk_pipeline& pipeline, const std::string &name, size_t spv_size, const void* spv_data, const std::string &entrypoint, uint32_t parameter_count, uint32_t push_constant_size, std::array wg_denoms, const std::vector& specialization_constants, uint32_t align, bool disable_robustness = false) { { // wait until fewer than N compiles are in progress uint32_t N = std::max(1u, std::thread::hardware_concurrency()); @@ -1271,7 +1281,7 @@ static void ggml_vk_load_shaders(vk_device& device) { } compile_count++; } - compiles.push_back(std::async(ggml_vk_create_pipeline_func, std::ref(device), std::ref(pipeline), name, spv_size, spv_data, entrypoint, parameter_count, push_constant_size, wg_denoms, specialization_constants, align)); + compiles.push_back(std::async(ggml_vk_create_pipeline_func, std::ref(device), std::ref(pipeline), name, spv_size, spv_data, entrypoint, parameter_count, push_constant_size, wg_denoms, specialization_constants, align, disable_robustness)); }; if (device->fp16) { @@ -1370,45 +1380,45 @@ static void ggml_vk_load_shaders(vk_device& device) { // computing two rows per workgroup is a benefit for Q4_0 -> Q5_1, but not for Q8_0. ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_f32_f32[GGML_TYPE_F32 ], "mul_mat_vec_f32_f32_f32", mul_mat_vec_f32_f32_f32_len, mul_mat_vec_f32_f32_f32_data, "main", 3, sizeof(vk_mat_vec_push_constants), {2, 1, 1}, {device->subgroup_size, 2}, 1); ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_f32_f32[GGML_TYPE_F16 ], "mul_mat_vec_f16_f32_f32", mul_mat_vec_f16_f32_f32_len, mul_mat_vec_f16_f32_f32_data, "main", 3, sizeof(vk_mat_vec_push_constants), {2, 1, 1}, {device->subgroup_size, 2}, 1); - ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_f32_f32[GGML_TYPE_Q4_0], "mul_mat_vec_q4_0_f32_f32", mul_mat_vec_q4_0_f32_f32_len, mul_mat_vec_q4_0_f32_f32_data, "main", 3, sizeof(vk_mat_vec_push_constants), {2, 1, 1}, {device->subgroup_size, 2}, 1); - ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_f32_f32[GGML_TYPE_Q4_1], "mul_mat_vec_q4_1_f32_f32", mul_mat_vec_q4_1_f32_f32_len, mul_mat_vec_q4_1_f32_f32_data, "main", 3, sizeof(vk_mat_vec_push_constants), {2, 1, 1}, {device->subgroup_size, 2}, 1); - ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_f32_f32[GGML_TYPE_Q5_0], "mul_mat_vec_q5_0_f32_f32", mul_mat_vec_q5_0_f32_f32_len, mul_mat_vec_q5_0_f32_f32_data, "main", 3, sizeof(vk_mat_vec_push_constants), {2, 1, 1}, {device->subgroup_size, 2}, 1); - ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_f32_f32[GGML_TYPE_Q5_1], "mul_mat_vec_q5_1_f32_f32", mul_mat_vec_q5_1_f32_f32_len, mul_mat_vec_q5_1_f32_f32_data, "main", 3, sizeof(vk_mat_vec_push_constants), {2, 1, 1}, {device->subgroup_size, 2}, 1); - ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_f32_f32[GGML_TYPE_Q8_0], "mul_mat_vec_q8_0_f32_f32", mul_mat_vec_q8_0_f32_f32_len, mul_mat_vec_q8_0_f32_f32_data, "main", 3, sizeof(vk_mat_vec_push_constants), {1, 1, 1}, {device->subgroup_size, 1}, 1); - ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_f32_f32[GGML_TYPE_Q2_K], "mul_mat_vec_q2_k_f32_f32", mul_mat_vec_q2_k_f32_f32_len, mul_mat_vec_q2_k_f32_f32_data, "main", 3, sizeof(vk_mat_vec_push_constants), {1, 1, 1}, {device->subgroup_size}, 1); - ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_f32_f32[GGML_TYPE_Q3_K], "mul_mat_vec_q3_k_f32_f32", mul_mat_vec_q3_k_f32_f32_len, mul_mat_vec_q3_k_f32_f32_data, "main", 3, sizeof(vk_mat_vec_push_constants), {1, 1, 1}, {device->subgroup_size}, 1); - ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_f32_f32[GGML_TYPE_Q4_K], "mul_mat_vec_q4_k_f32_f32", mul_mat_vec_q4_k_f32_f32_len, mul_mat_vec_q4_k_f32_f32_data, "main", 3, sizeof(vk_mat_vec_push_constants), {1, 1, 1}, {device->subgroup_size}, 1); - ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_f32_f32[GGML_TYPE_Q5_K], "mul_mat_vec_q5_k_f32_f32", mul_mat_vec_q5_k_f32_f32_len, mul_mat_vec_q5_k_f32_f32_data, "main", 3, sizeof(vk_mat_vec_push_constants), {1, 1, 1}, {device->subgroup_size}, 1); - ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_f32_f32[GGML_TYPE_Q6_K], "mul_mat_vec_q6_k_f32_f32", mul_mat_vec_q6_k_f32_f32_len, mul_mat_vec_q6_k_f32_f32_data, "main", 3, sizeof(vk_mat_vec_push_constants), {1, 1, 1}, {device->subgroup_size}, 1); - ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_f32_f32[GGML_TYPE_IQ4_NL], "mul_mat_vec_iq4_nl_f32_f32", mul_mat_vec_iq4_nl_f32_f32_len, mul_mat_vec_iq4_nl_f32_f32_data, "main", 3, sizeof(vk_mat_vec_push_constants), {2, 1, 1}, {device->subgroup_size, 2}, 1); + ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_f32_f32[GGML_TYPE_Q4_0], "mul_mat_vec_q4_0_f32_f32", mul_mat_vec_q4_0_f32_f32_len, mul_mat_vec_q4_0_f32_f32_data, "main", 3, sizeof(vk_mat_vec_push_constants), {2, 1, 1}, {device->subgroup_size, 2}, 1, true); + ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_f32_f32[GGML_TYPE_Q4_1], "mul_mat_vec_q4_1_f32_f32", mul_mat_vec_q4_1_f32_f32_len, mul_mat_vec_q4_1_f32_f32_data, "main", 3, sizeof(vk_mat_vec_push_constants), {2, 1, 1}, {device->subgroup_size, 2}, 1, true); + ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_f32_f32[GGML_TYPE_Q5_0], "mul_mat_vec_q5_0_f32_f32", mul_mat_vec_q5_0_f32_f32_len, mul_mat_vec_q5_0_f32_f32_data, "main", 3, sizeof(vk_mat_vec_push_constants), {2, 1, 1}, {device->subgroup_size, 2}, 1, true); + ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_f32_f32[GGML_TYPE_Q5_1], "mul_mat_vec_q5_1_f32_f32", mul_mat_vec_q5_1_f32_f32_len, mul_mat_vec_q5_1_f32_f32_data, "main", 3, sizeof(vk_mat_vec_push_constants), {2, 1, 1}, {device->subgroup_size, 2}, 1, true); + ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_f32_f32[GGML_TYPE_Q8_0], "mul_mat_vec_q8_0_f32_f32", mul_mat_vec_q8_0_f32_f32_len, mul_mat_vec_q8_0_f32_f32_data, "main", 3, sizeof(vk_mat_vec_push_constants), {1, 1, 1}, {device->subgroup_size, 1}, 1, true); + ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_f32_f32[GGML_TYPE_Q2_K], "mul_mat_vec_q2_k_f32_f32", mul_mat_vec_q2_k_f32_f32_len, mul_mat_vec_q2_k_f32_f32_data, "main", 3, sizeof(vk_mat_vec_push_constants), {1, 1, 1}, {device->subgroup_size}, 1, true); + ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_f32_f32[GGML_TYPE_Q3_K], "mul_mat_vec_q3_k_f32_f32", mul_mat_vec_q3_k_f32_f32_len, mul_mat_vec_q3_k_f32_f32_data, "main", 3, sizeof(vk_mat_vec_push_constants), {1, 1, 1}, {device->subgroup_size}, 1, true); + ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_f32_f32[GGML_TYPE_Q4_K], "mul_mat_vec_q4_k_f32_f32", mul_mat_vec_q4_k_f32_f32_len, mul_mat_vec_q4_k_f32_f32_data, "main", 3, sizeof(vk_mat_vec_push_constants), {1, 1, 1}, {device->subgroup_size}, 1, true); + ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_f32_f32[GGML_TYPE_Q5_K], "mul_mat_vec_q5_k_f32_f32", mul_mat_vec_q5_k_f32_f32_len, mul_mat_vec_q5_k_f32_f32_data, "main", 3, sizeof(vk_mat_vec_push_constants), {1, 1, 1}, {device->subgroup_size}, 1, true); + ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_f32_f32[GGML_TYPE_Q6_K], "mul_mat_vec_q6_k_f32_f32", mul_mat_vec_q6_k_f32_f32_len, mul_mat_vec_q6_k_f32_f32_data, "main", 3, sizeof(vk_mat_vec_push_constants), {1, 1, 1}, {device->subgroup_size}, 1, true); + ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_f32_f32[GGML_TYPE_IQ4_NL], "mul_mat_vec_iq4_nl_f32_f32", mul_mat_vec_iq4_nl_f32_f32_len, mul_mat_vec_iq4_nl_f32_f32_data, "main", 3, sizeof(vk_mat_vec_push_constants), {2, 1, 1}, {device->subgroup_size, 2}, 1, true); ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_f16_f32[GGML_TYPE_F32 ], "mul_mat_vec_f32_f16_f32", mul_mat_vec_f32_f16_f32_len, mul_mat_vec_f32_f16_f32_data, "main", 3, sizeof(vk_mat_vec_push_constants), {2, 1, 1}, {device->subgroup_size, 2}, 1); ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_f16_f32[GGML_TYPE_F16 ], "mul_mat_vec_f16_f16_f32", mul_mat_vec_f16_f16_f32_len, mul_mat_vec_f16_f16_f32_data, "main", 3, sizeof(vk_mat_vec_push_constants), {2, 1, 1}, {device->subgroup_size, 2}, 1); - ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_f16_f32[GGML_TYPE_Q4_0], "mul_mat_vec_q4_0_f16_f32", mul_mat_vec_q4_0_f16_f32_len, mul_mat_vec_q4_0_f16_f32_data, "main", 3, sizeof(vk_mat_vec_push_constants), {2, 1, 1}, {device->subgroup_size, 2}, 1); - ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_f16_f32[GGML_TYPE_Q4_1], "mul_mat_vec_q4_1_f16_f32", mul_mat_vec_q4_1_f16_f32_len, mul_mat_vec_q4_1_f16_f32_data, "main", 3, sizeof(vk_mat_vec_push_constants), {2, 1, 1}, {device->subgroup_size, 2}, 1); - ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_f16_f32[GGML_TYPE_Q5_0], "mul_mat_vec_q5_0_f16_f32", mul_mat_vec_q5_0_f16_f32_len, mul_mat_vec_q5_0_f16_f32_data, "main", 3, sizeof(vk_mat_vec_push_constants), {2, 1, 1}, {device->subgroup_size, 2}, 1); - ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_f16_f32[GGML_TYPE_Q5_1], "mul_mat_vec_q5_1_f16_f32", mul_mat_vec_q5_1_f16_f32_len, mul_mat_vec_q5_1_f16_f32_data, "main", 3, sizeof(vk_mat_vec_push_constants), {2, 1, 1}, {device->subgroup_size, 2}, 1); - ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_f16_f32[GGML_TYPE_Q8_0], "mul_mat_vec_q8_0_f16_f32", mul_mat_vec_q8_0_f16_f32_len, mul_mat_vec_q8_0_f16_f32_data, "main", 3, sizeof(vk_mat_vec_push_constants), {1, 1, 1}, {device->subgroup_size, 1}, 1); - ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_f16_f32[GGML_TYPE_Q2_K], "mul_mat_vec_q2_k_f16_f32", mul_mat_vec_q2_k_f16_f32_len, mul_mat_vec_q2_k_f16_f32_data, "main", 3, sizeof(vk_mat_vec_push_constants), {1, 1, 1}, {device->subgroup_size}, 1); - ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_f16_f32[GGML_TYPE_Q3_K], "mul_mat_vec_q3_k_f16_f32", mul_mat_vec_q3_k_f16_f32_len, mul_mat_vec_q3_k_f16_f32_data, "main", 3, sizeof(vk_mat_vec_push_constants), {1, 1, 1}, {device->subgroup_size}, 1); - ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_f16_f32[GGML_TYPE_Q4_K], "mul_mat_vec_q4_k_f16_f32", mul_mat_vec_q4_k_f16_f32_len, mul_mat_vec_q4_k_f16_f32_data, "main", 3, sizeof(vk_mat_vec_push_constants), {1, 1, 1}, {device->subgroup_size}, 1); - ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_f16_f32[GGML_TYPE_Q5_K], "mul_mat_vec_q5_k_f16_f32", mul_mat_vec_q5_k_f16_f32_len, mul_mat_vec_q5_k_f16_f32_data, "main", 3, sizeof(vk_mat_vec_push_constants), {1, 1, 1}, {device->subgroup_size}, 1); - ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_f16_f32[GGML_TYPE_Q6_K], "mul_mat_vec_q6_k_f16_f32", mul_mat_vec_q6_k_f16_f32_len, mul_mat_vec_q6_k_f16_f32_data, "main", 3, sizeof(vk_mat_vec_push_constants), {1, 1, 1}, {device->subgroup_size}, 1); - ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_f16_f32[GGML_TYPE_IQ4_NL], "mul_mat_vec_iq4_nl_f16_f32", mul_mat_vec_iq4_nl_f16_f32_len, mul_mat_vec_iq4_nl_f16_f32_data, "main", 3, sizeof(vk_mat_vec_push_constants), {2, 1, 1}, {device->subgroup_size}, 1); + ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_f16_f32[GGML_TYPE_Q4_0], "mul_mat_vec_q4_0_f16_f32", mul_mat_vec_q4_0_f16_f32_len, mul_mat_vec_q4_0_f16_f32_data, "main", 3, sizeof(vk_mat_vec_push_constants), {2, 1, 1}, {device->subgroup_size, 2}, 1, true); + ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_f16_f32[GGML_TYPE_Q4_1], "mul_mat_vec_q4_1_f16_f32", mul_mat_vec_q4_1_f16_f32_len, mul_mat_vec_q4_1_f16_f32_data, "main", 3, sizeof(vk_mat_vec_push_constants), {2, 1, 1}, {device->subgroup_size, 2}, 1, true); + ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_f16_f32[GGML_TYPE_Q5_0], "mul_mat_vec_q5_0_f16_f32", mul_mat_vec_q5_0_f16_f32_len, mul_mat_vec_q5_0_f16_f32_data, "main", 3, sizeof(vk_mat_vec_push_constants), {2, 1, 1}, {device->subgroup_size, 2}, 1, true); + ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_f16_f32[GGML_TYPE_Q5_1], "mul_mat_vec_q5_1_f16_f32", mul_mat_vec_q5_1_f16_f32_len, mul_mat_vec_q5_1_f16_f32_data, "main", 3, sizeof(vk_mat_vec_push_constants), {2, 1, 1}, {device->subgroup_size, 2}, 1, true); + ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_f16_f32[GGML_TYPE_Q8_0], "mul_mat_vec_q8_0_f16_f32", mul_mat_vec_q8_0_f16_f32_len, mul_mat_vec_q8_0_f16_f32_data, "main", 3, sizeof(vk_mat_vec_push_constants), {1, 1, 1}, {device->subgroup_size, 1}, 1, true); + ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_f16_f32[GGML_TYPE_Q2_K], "mul_mat_vec_q2_k_f16_f32", mul_mat_vec_q2_k_f16_f32_len, mul_mat_vec_q2_k_f16_f32_data, "main", 3, sizeof(vk_mat_vec_push_constants), {1, 1, 1}, {device->subgroup_size}, 1, true); + ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_f16_f32[GGML_TYPE_Q3_K], "mul_mat_vec_q3_k_f16_f32", mul_mat_vec_q3_k_f16_f32_len, mul_mat_vec_q3_k_f16_f32_data, "main", 3, sizeof(vk_mat_vec_push_constants), {1, 1, 1}, {device->subgroup_size}, 1, true); + ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_f16_f32[GGML_TYPE_Q4_K], "mul_mat_vec_q4_k_f16_f32", mul_mat_vec_q4_k_f16_f32_len, mul_mat_vec_q4_k_f16_f32_data, "main", 3, sizeof(vk_mat_vec_push_constants), {1, 1, 1}, {device->subgroup_size}, 1, true); + ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_f16_f32[GGML_TYPE_Q5_K], "mul_mat_vec_q5_k_f16_f32", mul_mat_vec_q5_k_f16_f32_len, mul_mat_vec_q5_k_f16_f32_data, "main", 3, sizeof(vk_mat_vec_push_constants), {1, 1, 1}, {device->subgroup_size}, 1, true); + ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_f16_f32[GGML_TYPE_Q6_K], "mul_mat_vec_q6_k_f16_f32", mul_mat_vec_q6_k_f16_f32_len, mul_mat_vec_q6_k_f16_f32_data, "main", 3, sizeof(vk_mat_vec_push_constants), {1, 1, 1}, {device->subgroup_size}, 1, true); + ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_f16_f32[GGML_TYPE_IQ4_NL], "mul_mat_vec_iq4_nl_f16_f32", mul_mat_vec_iq4_nl_f16_f32_len, mul_mat_vec_iq4_nl_f16_f32_data, "main", 3, sizeof(vk_mat_vec_push_constants), {2, 1, 1}, {device->subgroup_size}, 1, true); ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_id_f32[GGML_TYPE_F32 ], "mul_mat_vec_id_f32_f32", mul_mat_vec_id_f32_f32_len, mul_mat_vec_id_f32_f32_data, "main", 4, sizeof(vk_mat_vec_id_push_constants), {2, 1, 1}, {device->subgroup_size, 2}, 1); ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_id_f32[GGML_TYPE_F16 ], "mul_mat_vec_id_f16_f32", mul_mat_vec_id_f16_f32_len, mul_mat_vec_id_f16_f32_data, "main", 4, sizeof(vk_mat_vec_id_push_constants), {2, 1, 1}, {device->subgroup_size, 2}, 1); - ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_id_f32[GGML_TYPE_Q4_0], "mul_mat_vec_id_q4_0_f32", mul_mat_vec_id_q4_0_f32_len, mul_mat_vec_id_q4_0_f32_data, "main", 4, sizeof(vk_mat_vec_id_push_constants), {2, 1, 1}, {device->subgroup_size, 2}, 1); - ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_id_f32[GGML_TYPE_Q4_1], "mul_mat_vec_id_q4_1_f32", mul_mat_vec_id_q4_1_f32_len, mul_mat_vec_id_q4_1_f32_data, "main", 4, sizeof(vk_mat_vec_id_push_constants), {2, 1, 1}, {device->subgroup_size, 2}, 1); - ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_id_f32[GGML_TYPE_Q5_0], "mul_mat_vec_id_q5_0_f32", mul_mat_vec_id_q5_0_f32_len, mul_mat_vec_id_q5_0_f32_data, "main", 4, sizeof(vk_mat_vec_id_push_constants), {2, 1, 1}, {device->subgroup_size, 2}, 1); - ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_id_f32[GGML_TYPE_Q5_1], "mul_mat_vec_id_q5_1_f32", mul_mat_vec_id_q5_1_f32_len, mul_mat_vec_id_q5_1_f32_data, "main", 4, sizeof(vk_mat_vec_id_push_constants), {2, 1, 1}, {device->subgroup_size, 2}, 1); - ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_id_f32[GGML_TYPE_Q8_0], "mul_mat_vec_id_q8_0_f32", mul_mat_vec_id_q8_0_f32_len, mul_mat_vec_id_q8_0_f32_data, "main", 4, sizeof(vk_mat_vec_id_push_constants), {1, 1, 1}, {device->subgroup_size, 1}, 1); - ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_id_f32[GGML_TYPE_Q2_K], "mul_mat_vec_id_q2_k_f32", mul_mat_vec_id_q2_k_f32_len, mul_mat_vec_id_q2_k_f32_data, "main", 4, sizeof(vk_mat_vec_id_push_constants), {1, 1, 1}, {device->subgroup_size}, 1); - ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_id_f32[GGML_TYPE_Q3_K], "mul_mat_vec_id_q3_k_f32", mul_mat_vec_id_q3_k_f32_len, mul_mat_vec_id_q3_k_f32_data, "main", 4, sizeof(vk_mat_vec_id_push_constants), {1, 1, 1}, {device->subgroup_size}, 1); - ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_id_f32[GGML_TYPE_Q4_K], "mul_mat_vec_id_q4_k_f32", mul_mat_vec_id_q4_k_f32_len, mul_mat_vec_id_q4_k_f32_data, "main", 4, sizeof(vk_mat_vec_id_push_constants), {1, 1, 1}, {device->subgroup_size}, 1); - ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_id_f32[GGML_TYPE_Q5_K], "mul_mat_vec_id_q5_k_f32", mul_mat_vec_id_q5_k_f32_len, mul_mat_vec_id_q5_k_f32_data, "main", 4, sizeof(vk_mat_vec_id_push_constants), {1, 1, 1}, {device->subgroup_size}, 1); - ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_id_f32[GGML_TYPE_Q6_K], "mul_mat_vec_id_q6_k_f32", mul_mat_vec_id_q6_k_f32_len, mul_mat_vec_id_q6_k_f32_data, "main", 4, sizeof(vk_mat_vec_id_push_constants), {1, 1, 1}, {device->subgroup_size}, 1); - ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_id_f32[GGML_TYPE_IQ4_NL], "mul_mat_vec_id_iq4_nl_f32", mul_mat_vec_id_iq4_nl_f32_len, mul_mat_vec_id_iq4_nl_f32_data, "main", 4, sizeof(vk_mat_vec_id_push_constants), {2, 1, 1}, {device->subgroup_size, 2}, 1); + ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_id_f32[GGML_TYPE_Q4_0], "mul_mat_vec_id_q4_0_f32", mul_mat_vec_id_q4_0_f32_len, mul_mat_vec_id_q4_0_f32_data, "main", 4, sizeof(vk_mat_vec_id_push_constants), {2, 1, 1}, {device->subgroup_size, 2}, 1, true); + ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_id_f32[GGML_TYPE_Q4_1], "mul_mat_vec_id_q4_1_f32", mul_mat_vec_id_q4_1_f32_len, mul_mat_vec_id_q4_1_f32_data, "main", 4, sizeof(vk_mat_vec_id_push_constants), {2, 1, 1}, {device->subgroup_size, 2}, 1, true); + ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_id_f32[GGML_TYPE_Q5_0], "mul_mat_vec_id_q5_0_f32", mul_mat_vec_id_q5_0_f32_len, mul_mat_vec_id_q5_0_f32_data, "main", 4, sizeof(vk_mat_vec_id_push_constants), {2, 1, 1}, {device->subgroup_size, 2}, 1, true); + ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_id_f32[GGML_TYPE_Q5_1], "mul_mat_vec_id_q5_1_f32", mul_mat_vec_id_q5_1_f32_len, mul_mat_vec_id_q5_1_f32_data, "main", 4, sizeof(vk_mat_vec_id_push_constants), {2, 1, 1}, {device->subgroup_size, 2}, 1, true); + ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_id_f32[GGML_TYPE_Q8_0], "mul_mat_vec_id_q8_0_f32", mul_mat_vec_id_q8_0_f32_len, mul_mat_vec_id_q8_0_f32_data, "main", 4, sizeof(vk_mat_vec_id_push_constants), {1, 1, 1}, {device->subgroup_size, 1}, 1, true); + ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_id_f32[GGML_TYPE_Q2_K], "mul_mat_vec_id_q2_k_f32", mul_mat_vec_id_q2_k_f32_len, mul_mat_vec_id_q2_k_f32_data, "main", 4, sizeof(vk_mat_vec_id_push_constants), {1, 1, 1}, {device->subgroup_size}, 1, true); + ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_id_f32[GGML_TYPE_Q3_K], "mul_mat_vec_id_q3_k_f32", mul_mat_vec_id_q3_k_f32_len, mul_mat_vec_id_q3_k_f32_data, "main", 4, sizeof(vk_mat_vec_id_push_constants), {1, 1, 1}, {device->subgroup_size}, 1, true); + ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_id_f32[GGML_TYPE_Q4_K], "mul_mat_vec_id_q4_k_f32", mul_mat_vec_id_q4_k_f32_len, mul_mat_vec_id_q4_k_f32_data, "main", 4, sizeof(vk_mat_vec_id_push_constants), {1, 1, 1}, {device->subgroup_size}, 1, true); + ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_id_f32[GGML_TYPE_Q5_K], "mul_mat_vec_id_q5_k_f32", mul_mat_vec_id_q5_k_f32_len, mul_mat_vec_id_q5_k_f32_data, "main", 4, sizeof(vk_mat_vec_id_push_constants), {1, 1, 1}, {device->subgroup_size}, 1, true); + ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_id_f32[GGML_TYPE_Q6_K], "mul_mat_vec_id_q6_k_f32", mul_mat_vec_id_q6_k_f32_len, mul_mat_vec_id_q6_k_f32_data, "main", 4, sizeof(vk_mat_vec_id_push_constants), {1, 1, 1}, {device->subgroup_size}, 1, true); + ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_id_f32[GGML_TYPE_IQ4_NL], "mul_mat_vec_id_iq4_nl_f32", mul_mat_vec_id_iq4_nl_f32_len, mul_mat_vec_id_iq4_nl_f32_data, "main", 4, sizeof(vk_mat_vec_id_push_constants), {2, 1, 1}, {device->subgroup_size, 2}, 1, true); // dequant shaders ggml_vk_create_pipeline(device, device->pipeline_dequant[GGML_TYPE_F32 ], "f32_to_f16", dequant_f32_len, dequant_f32_data, "main", 2, 5 * sizeof(uint32_t), {256 * 16, 1, 1}, {}, 1); @@ -1591,12 +1601,15 @@ static vk_device ggml_vk_get_device(size_t idx) { bool fp16_storage = false; bool fp16_compute = false; + bool pipeline_robustness = false; for (const auto& properties : ext_props) { if (strcmp("VK_KHR_16bit_storage", properties.extensionName) == 0) { fp16_storage = true; } else if (strcmp("VK_KHR_shader_float16_int8", properties.extensionName) == 0) { fp16_compute = true; + } else if (strcmp("VK_EXT_pipeline_robustness", properties.extensionName) == 0) { + pipeline_robustness = true; } } @@ -1642,10 +1655,22 @@ static vk_device ggml_vk_get_device(size_t idx) { vk12_features.sType = VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_VULKAN_1_2_FEATURES; vk11_features.pNext = &vk12_features; + VkPhysicalDevicePipelineRobustnessFeaturesEXT pl_robustness_features; + pl_robustness_features.pNext = nullptr; + pl_robustness_features.sType = VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_PIPELINE_ROBUSTNESS_FEATURES_EXT; + pl_robustness_features.pipelineRobustness = VK_FALSE; + + if (pipeline_robustness) { + vk12_features.pNext = &pl_robustness_features; + device_extensions.push_back("VK_EXT_pipeline_robustness"); + } + vkGetPhysicalDeviceFeatures2(device->physical_device, &device_features2); device->fp16 = device->fp16 && vk12_features.shaderFloat16; + device->pipeline_robustness = pl_robustness_features.pipelineRobustness; + if (!vk11_features.storageBuffer16BitAccess) { std::cerr << "ggml_vulkan: device " << GGML_VK_NAME << idx << " does not support 16-bit storage." << std::endl; throw std::runtime_error("Unsupported device"); @@ -3190,7 +3215,7 @@ static void ggml_vk_mul_mat_vec_q_f16(ggml_backend_vk_context * ctx, vk_context& if (ne01 > max_groups_x) { groups_z = 64; - groups_x /= groups_z; + groups_x = CEIL_DIV(groups_x, groups_z); } // compute @@ -3767,7 +3792,7 @@ static void ggml_vk_mul_mat_vec_id_q_f16(ggml_backend_vk_context * ctx, vk_conte if (ne01 > max_groups_x) { groups_z = 64; - groups_x /= groups_z; + groups_x = CEIL_DIV(groups_x, groups_z); } // compute diff --git a/ggml/src/ggml-vulkan/vulkan-shaders/dequant_funcs.comp b/ggml/src/ggml-vulkan/vulkan-shaders/dequant_funcs.comp index d5b989735bc0b..5fc1ba4ad364b 100644 --- a/ggml/src/ggml-vulkan/vulkan-shaders/dequant_funcs.comp +++ b/ggml/src/ggml-vulkan/vulkan-shaders/dequant_funcs.comp @@ -2,6 +2,15 @@ #extension GL_EXT_shader_explicit_arithmetic_types_int8 : require #endif +#include "types.comp" + +#if defined(A_TYPE_PACKED16) +layout (binding = 0) readonly buffer A_PACKED16 {A_TYPE_PACKED16 data_a_packed16[];}; +#endif +#if defined(A_TYPE_PACKED32) +layout (binding = 0) readonly buffer A_PACKED32 {A_TYPE_PACKED32 data_a_packed32[];}; +#endif + #if defined(DATA_A_F32) vec2 dequantize(uint ib, uint iqs, uint a_offset) { return vec2(data_a[a_offset + ib], data_a[a_offset + ib + 1]); @@ -20,6 +29,11 @@ vec2 dequantize(uint ib, uint iqs, uint a_offset) { const uint vui = uint(data_a[a_offset + ib].qs[iqs]); return (vec2(vui & 0xF, vui >> 4) - 8.0f) * d; } +vec4 dequantize4(uint ib, uint iqs, uint a_offset) { + const float d = float(data_a_packed16[a_offset + ib].d); + const uint vui = uint(data_a_packed16[a_offset + ib].qs[iqs/2]); + return (vec4(vui & 0xF, (vui >> 4) & 0xF, (vui >> 8) & 0xF, (vui >> 12) & 0xF) - 8.0f) * d; +} #endif #if defined(DATA_A_Q4_1) @@ -29,6 +43,12 @@ vec2 dequantize(uint ib, uint iqs, uint a_offset) { const uint vui = uint(data_a[a_offset + ib].qs[iqs]); return vec2(vui & 0xF, vui >> 4) * d + m; } +vec4 dequantize4(uint ib, uint iqs, uint a_offset) { + const float d = float(data_a_packed16[a_offset + ib].d); + const float m = float(data_a_packed16[a_offset + ib].m); + const uint vui = uint(data_a_packed16[a_offset + ib].qs[iqs/2]); + return vec4(vui & 0xF, (vui >> 4) & 0xF, (vui >> 8) & 0xF, (vui >> 12) & 0xF) * d + m; +} #endif #if defined(DATA_A_Q5_0) @@ -39,6 +59,14 @@ vec2 dequantize(uint ib, uint iqs, uint a_offset) { const uint vui = uint(data_a[a_offset + ib].qs[iqs]); return (vec2((vui & 0xF) | qh.x, (vui >> 4) | qh.y) - 16.0f) * d; } +vec4 dequantize4(uint ib, uint iqs, uint a_offset) { + const float d = float(data_a_packed16[a_offset + ib].d); + const uint uint_qh = uint(data_a_packed16[a_offset + ib].qh[1]) << 16 | data_a_packed16[a_offset + ib].qh[0]; + const ivec2 qh0 = ivec2(((uint_qh >> iqs) << 4) & 0x10, (uint_qh >> (iqs + 12)) & 0x10); + const ivec2 qh1 = ivec2(((uint_qh >> (iqs + 1)) << 4) & 0x10, (uint_qh >> (iqs + 13)) & 0x10); + const uint vui = uint(data_a_packed16[a_offset + ib].qs[iqs/2]); + return (vec4(((vui >> 0) & 0xF) | qh0.x, ((vui >> 4) & 0xF) | qh0.y, ((vui >> 8) & 0xF) | qh1.x, ((vui >> 12) & 0xF) | qh1.y) - 16.0f) * d; +} #endif #if defined(DATA_A_Q5_1) @@ -50,6 +78,15 @@ vec2 dequantize(uint ib, uint iqs, uint a_offset) { const uint vui = uint(data_a[a_offset + ib].qs[iqs]); return vec2((vui & 0xF) | qh.x, (vui >> 4) | qh.y) * d + m; } +vec4 dequantize4(uint ib, uint iqs, uint a_offset) { + const float d = float(data_a_packed16[a_offset + ib].d); + const float m = float(data_a_packed16[a_offset + ib].m); + const uint uint_qh = data_a_packed16[a_offset + ib].qh; + const ivec2 qh0 = ivec2(((uint_qh >> iqs) << 4) & 0x10, (uint_qh >> (iqs + 12)) & 0x10); + const ivec2 qh1 = ivec2(((uint_qh >> (iqs + 1)) << 4) & 0x10, (uint_qh >> (iqs + 13)) & 0x10); + const uint vui = uint(data_a_packed16[a_offset + ib].qs[iqs/2]); + return vec4(((vui >> 0) & 0xF) | qh0.x, ((vui >> 4) & 0xF) | qh0.y, ((vui >> 8) & 0xF) | qh1.x, ((vui >> 12) & 0xF) | qh1.y) * d + m; +} #endif #if defined(DATA_A_Q8_0) @@ -57,6 +94,12 @@ vec2 dequantize(uint ib, uint iqs, uint a_offset) { const float d = float(data_a[a_offset + ib].d); return vec2(int(data_a[a_offset + ib].qs[iqs]), int(data_a[a_offset + ib].qs[iqs + 1])) * d; } +vec4 dequantize4(uint ib, uint iqs, uint a_offset) { + const float d = float(data_a_packed16[a_offset + ib].d); + uint32_t v0 = data_a_packed16[a_offset + ib].qs[iqs/2]; + uint32_t v1 = data_a_packed16[a_offset + ib].qs[iqs/2 + 1]; + return vec4(int8_t(v0 & 0xFF), int8_t((v0 >> 8) & 0xFF), int8_t(v1 & 0xFF), int8_t((v1 >> 8) & 0xFF)) * d; +} #endif #if defined(DATA_A_IQ4_NL) @@ -65,4 +108,9 @@ vec2 dequantize(uint ib, uint iqs, uint a_offset) { const uint vui = uint(data_a[a_offset + ib].qs[iqs]); return vec2(kvalues_iq4nl[vui & 0xF], kvalues_iq4nl[vui >> 4]) * d; } +vec4 dequantize4(uint ib, uint iqs, uint a_offset) { + const float d = float(data_a_packed16[a_offset + ib].d); + const uint vui = uint(data_a_packed16[a_offset + ib].qs[iqs/2]); + return vec4(kvalues_iq4nl[vui & 0xF], kvalues_iq4nl[(vui >> 4) & 0xF], kvalues_iq4nl[(vui >> 8) & 0xF], kvalues_iq4nl[(vui >> 12) & 0xF]) * d; +} #endif diff --git a/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vec.comp b/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vec.comp index 970aac6ef72cc..00807a060c46b 100644 --- a/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vec.comp +++ b/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vec.comp @@ -3,7 +3,7 @@ #ifdef FLOAT16 #extension GL_EXT_shader_explicit_arithmetic_types_float16 : require #endif -#extension GL_EXT_shader_explicit_arithmetic_types_int32 : require +#extension GL_EXT_shader_explicit_arithmetic_types : require #include "mul_mat_vec_base.comp" @@ -12,16 +12,48 @@ layout(local_size_x_id = 0, local_size_y = 1, local_size_z = 1) in; layout (constant_id = 0) const uint BLOCK_SIZE = 32; layout (constant_id = 1) const uint NUM_ROWS = 1; +#if !defined(DATA_A_F32) && !defined(DATA_A_F16) +#define K_PER_ITER 8 +#else +#define K_PER_ITER 2 +#endif + + uint a_offset, b_offset, d_offset, y_offset; shared FLOAT_TYPE tmpsh[NUM_ROWS][BLOCK_SIZE]; void iter(inout FLOAT_TYPE temp[NUM_ROWS], const uint first_row, const uint num_rows, const uint tid, const uint i, bool lastiter) { - const uint col = i*BLOCK_SIZE + 2*tid; + const uint col = i*BLOCK_SIZE + K_PER_ITER*tid; const uint iqs = (col%QUANT_K)/QUANT_R; // quant index const uint iybs = col - col%QUANT_K; // y block start index +#if K_PER_ITER == 8 +#if QUANT_R == 2 + B_TYPE_VEC4 bv02 = data_b_v4[(b_offset + iybs + iqs) / 4]; + B_TYPE_VEC4 bv13 = data_b_v4[(b_offset + iybs + iqs + y_offset) / 4]; + FLOAT_TYPE b0 = FLOAT_TYPE(bv02.x); + FLOAT_TYPE b1 = FLOAT_TYPE(bv13.x); + FLOAT_TYPE b2 = FLOAT_TYPE(bv02.y); + FLOAT_TYPE b3 = FLOAT_TYPE(bv13.y); + FLOAT_TYPE b4 = FLOAT_TYPE(bv02.z); + FLOAT_TYPE b5 = FLOAT_TYPE(bv13.z); + FLOAT_TYPE b6 = FLOAT_TYPE(bv02.w); + FLOAT_TYPE b7 = FLOAT_TYPE(bv13.w); +#else + B_TYPE_VEC4 bv0 = data_b_v4[(b_offset + iybs + iqs) / 4]; + B_TYPE_VEC4 bv1 = data_b_v4[(b_offset + iybs + iqs) / 4 + 1]; + FLOAT_TYPE b0 = FLOAT_TYPE(bv0.x); + FLOAT_TYPE b1 = FLOAT_TYPE(bv0.y); + FLOAT_TYPE b2 = FLOAT_TYPE(bv0.z); + FLOAT_TYPE b3 = FLOAT_TYPE(bv0.w); + FLOAT_TYPE b4 = FLOAT_TYPE(bv1.x); + FLOAT_TYPE b5 = FLOAT_TYPE(bv1.y); + FLOAT_TYPE b6 = FLOAT_TYPE(bv1.z); + FLOAT_TYPE b7 = FLOAT_TYPE(bv1.w); +#endif +#else // Check if the second of the pair of elements is OOB, and don't fetch B or // accumulate it. We still fetch a pair of elements for A, which is fine for // quantized formats since they'll be within the same block. We should @@ -34,9 +66,24 @@ void iter(inout FLOAT_TYPE temp[NUM_ROWS], const uint first_row, const uint num_ if (!OOB) { b1 = FLOAT_TYPE(data_b[b_offset + iybs + iqs + y_offset]); } +#endif [[unroll]] for (uint n = 0; n < num_rows; ++n) { const uint ib = ((first_row + n)*p.ncols + col)/QUANT_K; // block index +#if K_PER_ITER == 8 + const vec4 v = dequantize4(ib, iqs, a_offset); + const vec4 v2 = dequantize4(ib, iqs+(4/QUANT_R), a_offset); + + // matrix multiplication + temp[n] = fma(FLOAT_TYPE(v.x), b0, temp[n]); + temp[n] = fma(FLOAT_TYPE(v.y), b1, temp[n]); + temp[n] = fma(FLOAT_TYPE(v.z), b2, temp[n]); + temp[n] = fma(FLOAT_TYPE(v.w), b3, temp[n]); + temp[n] = fma(FLOAT_TYPE(v2.x), b4, temp[n]); + temp[n] = fma(FLOAT_TYPE(v2.y), b5, temp[n]); + temp[n] = fma(FLOAT_TYPE(v2.z), b6, temp[n]); + temp[n] = fma(FLOAT_TYPE(v2.w), b7, temp[n]); +#else const vec2 v = dequantize(ib, iqs, a_offset); // matrix multiplication @@ -44,6 +91,7 @@ void iter(inout FLOAT_TYPE temp[NUM_ROWS], const uint first_row, const uint num_ if (!OOB) { temp[n] = fma(FLOAT_TYPE(v.y), b1, temp[n]); } +#endif } } @@ -61,22 +109,33 @@ void compute_outputs(const uint32_t first_row, const uint32_t num_rows) { temp[i] = FLOAT_TYPE(0); } - const int unroll_count = 8; - - const uint num_iters = (p.ncols >= 2*tid) ? ((p.ncols - 2*tid + BLOCK_SIZE - 1) / BLOCK_SIZE) : 0; - const uint unrolled_iters = num_iters & ~(2*unroll_count - 1); + uint num_iters = p.ncols / (K_PER_ITER * BLOCK_SIZE); + if (num_iters * K_PER_ITER * BLOCK_SIZE + K_PER_ITER*tid < p.ncols) { + num_iters++; + } + int unroll_count = 4; + uint unrolled_iters = num_iters & ~(unroll_count - 1); uint i = 0; while (i < unrolled_iters) { // Manually partially unroll the loop [[unroll]] for (uint k = 0; k < unroll_count; ++k) { - iter(temp, first_row, num_rows, tid, i, false); - i += 2; + iter(temp, first_row, num_rows, tid, i*K_PER_ITER, false); + i++; + } + } + unroll_count = 2; + unrolled_iters = num_iters & ~(unroll_count - 1); + while (i < unrolled_iters) { + // Manually partially unroll the loop + [[unroll]] for (uint k = 0; k < unroll_count; ++k) { + iter(temp, first_row, num_rows, tid, i*K_PER_ITER, false); + i++; } } while (i < num_iters) { - iter(temp, first_row, num_rows, tid, i, true); - i += 2; + iter(temp, first_row, num_rows, tid, i*K_PER_ITER, true); + i++; } // sum up partial sums and write back result @@ -106,6 +165,9 @@ void main() { if (first_row + NUM_ROWS <= p.stride_d) { compute_outputs(first_row, NUM_ROWS); } else { + if (first_row >= p.stride_d) { + return; + } compute_outputs(first_row, p.stride_d - first_row); } } diff --git a/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vec_base.comp b/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vec_base.comp index 5920bc93641c8..8d0a5791374d3 100644 --- a/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vec_base.comp +++ b/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vec_base.comp @@ -12,6 +12,9 @@ layout (binding = 0) readonly buffer A {A_TYPE data_a[];}; layout (binding = 1) readonly buffer B {B_TYPE data_b[];}; +layout (binding = 1) readonly buffer BV2 {B_TYPE_VEC2 data_b_v2[];}; +layout (binding = 1) readonly buffer BV4 {B_TYPE_VEC4 data_b_v4[];}; + layout (binding = 2) writeonly buffer D {D_TYPE data_d[];}; #ifdef MUL_MAT_ID layout (binding = 3) readonly buffer IDS {int data_ids[];}; diff --git a/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vec_q2_k.comp b/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vec_q2_k.comp index ec8eadcd5828a..e2625d32b902d 100644 --- a/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vec_q2_k.comp +++ b/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vec_q2_k.comp @@ -9,6 +9,10 @@ shared FLOAT_TYPE tmp[32]; void main() { const uint row = gl_WorkGroupID.x + gl_NumWorkGroups.x * gl_WorkGroupID.z; + if (row >= p.stride_d) { + return; + } + uint a_offset, b_offset, d_offset; get_offsets(a_offset, b_offset, d_offset); diff --git a/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vec_q3_k.comp b/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vec_q3_k.comp index 3ca4ad85a5ca0..a28804533c320 100644 --- a/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vec_q3_k.comp +++ b/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vec_q3_k.comp @@ -9,6 +9,10 @@ shared FLOAT_TYPE tmp[32]; void main() { const uint row = gl_WorkGroupID.x + gl_NumWorkGroups.x * gl_WorkGroupID.z; + if (row >= p.stride_d) { + return; + } + uint a_offset, b_offset, d_offset; get_offsets(a_offset, b_offset, d_offset); diff --git a/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vec_q4_k.comp b/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vec_q4_k.comp index b7c9b722d126b..5846f2e86f3e3 100644 --- a/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vec_q4_k.comp +++ b/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vec_q4_k.comp @@ -8,30 +8,14 @@ layout(local_size_x = 32, local_size_y = 1, local_size_z = 1) in; shared FLOAT_TYPE tmp[32]; -// Declare aliased versions of A and B bindings that can use 16b/32b loads for -// the quantized values, and vec4 loads for B. -struct block_q4_K_u32 -{ - f16vec2 d; - uint32_t scales[3*QUANT_K/64/4]; - uint32_t qs[QUANT_K/2/4]; -}; - -struct block_q4_K_u16 -{ - f16vec2 d; - uint16_t scales[3*QUANT_K/64/2]; - uint16_t qs[QUANT_K/2/2]; -}; - -layout (binding = 0) readonly buffer A_u32 {block_q4_K_u32 data_a_u32[];}; -layout (binding = 0) readonly buffer A_u16 {block_q4_K_u16 data_a_u16[];}; -layout (binding = 1) readonly buffer BV4 {B_TYPE_VEC4 data_b_v4[];}; - // This shader assumes K_QUANTS_PER_ITERATION == 2 for alignment of loads void main() { const uint row = gl_WorkGroupID.x + gl_NumWorkGroups.x * gl_WorkGroupID.z; + if (row >= p.stride_d) { + return; + } + uint a_offset, b_offset, d_offset; get_offsets(a_offset, b_offset, d_offset); @@ -64,9 +48,9 @@ void main() { const FLOAT_TYPE dall = FLOAT_TYPE(d.x); const FLOAT_TYPE dmin = FLOAT_TYPE(d.y); - uint32_t scale0_u32 = data_a_u16[ib0 + i].scales[v_im ]; - uint32_t scale4_u32 = data_a_u16[ib0 + i].scales[v_im + 2]; - uint32_t scale8_u32 = data_a_u16[ib0 + i].scales[v_im + 4]; + uint32_t scale0_u32 = data_a_packed16[ib0 + i].scales[v_im ]; + uint32_t scale4_u32 = data_a_packed16[ib0 + i].scales[v_im + 2]; + uint32_t scale8_u32 = data_a_packed16[ib0 + i].scales[v_im + 4]; uvec4 scale0 = uvec4(unpack8(scale0_u32)); uvec4 scale4 = uvec4(unpack8(scale4_u32)); uvec4 scale8 = uvec4(unpack8(scale8_u32)); @@ -80,8 +64,8 @@ void main() { const uint32_t sc6 = (((scale8.x >> 4) & 0x0f) | ((scale4.x & 0xc0) >> 2)); const uint32_t sc7 = (((scale8.y >> 4) & 0x0f) | ((scale4.y & 0xc0) >> 2)); - uint32_t qs0_u32 = data_a_u32[ib0 + i].qs[q_offset / 4]; - uint32_t qs64_u32 = data_a_u32[ib0 + i].qs[q_offset / 4 + 16]; + uint32_t qs0_u32 = data_a_packed32[ib0 + i].qs[q_offset / 4]; + uint32_t qs64_u32 = data_a_packed32[ib0 + i].qs[q_offset / 4 + 16]; uint32_t qs0_u32_lo4 = qs0_u32 & 0x0F0F0F0F; uint32_t qs0_u32_hi4 = (qs0_u32 >> 4) & 0x0F0F0F0F; diff --git a/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vec_q5_k.comp b/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vec_q5_k.comp index 2306785af4226..22a6bfae4fb61 100644 --- a/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vec_q5_k.comp +++ b/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vec_q5_k.comp @@ -1,5 +1,7 @@ #version 450 +#extension GL_EXT_shader_explicit_arithmetic_types : require + #include "mul_mat_vec_base.comp" layout(local_size_x = 32, local_size_y = 1, local_size_z = 1) in; @@ -9,6 +11,10 @@ shared FLOAT_TYPE tmp[32]; void main() { const uint row = gl_WorkGroupID.x + gl_NumWorkGroups.x * gl_WorkGroupID.z; + if (row >= p.stride_d) { + return; + } + uint a_offset, b_offset, d_offset; get_offsets(a_offset, b_offset, d_offset); @@ -31,70 +37,106 @@ void main() { const uint8_t hm1 = uint8_t(1 << (2*v_im)); const uint8_t hm2 = uint8_t(hm1 << 4); - tmp[16 * ix + tid] = FLOAT_TYPE(0.0); // partial sum for thread in warp + FLOAT_TYPE temp = FLOAT_TYPE(0.0); // partial sum for thread in warp [[unroll]] for (uint i = ix; i < num_blocks_per_row; i += 2) { const uint y1_idx = i * QUANT_K + y_offset; const uint y2_idx = y1_idx + 128; - const FLOAT_TYPE dall = FLOAT_TYPE(data_a[ib0 + i].d.x); - const FLOAT_TYPE dmin = FLOAT_TYPE(data_a[ib0 + i].d.y); - - const uint8_t sc0 = uint8_t( data_a[ib0 + i].scales[v_im * 2 ] & 0x3f); - const uint8_t sc1 = uint8_t( data_a[ib0 + i].scales[v_im * 2 + 1] & 0x3f); - const uint8_t sc2 = uint8_t( data_a[ib0 + i].scales[v_im * 2 + 4] & 0x3f); - const uint8_t sc3 = uint8_t( data_a[ib0 + i].scales[v_im * 2 + 5] & 0x3f); - const uint8_t sc4 = uint8_t(( data_a[ib0 + i].scales[v_im * 2 + 8] & 0x0f) | ((data_a[ib0 + i].scales[v_im * 2 ] & 0xc0) >> 2)); - const uint8_t sc5 = uint8_t(( data_a[ib0 + i].scales[v_im * 2 + 9] & 0x0f) | ((data_a[ib0 + i].scales[v_im * 2 + 1] & 0xc0) >> 2)); - const uint8_t sc6 = uint8_t(((data_a[ib0 + i].scales[v_im * 2 + 8] >> 4) & 0x0f) | ((data_a[ib0 + i].scales[v_im * 2 + 4] & 0xc0) >> 2)); - const uint8_t sc7 = uint8_t(((data_a[ib0 + i].scales[v_im * 2 + 9] >> 4) & 0x0f) | ((data_a[ib0 + i].scales[v_im * 2 + 5] & 0xc0) >> 2)); - - const uint8_t q4_0 = uint8_t(data_a[ib0 + i].qs[q_offset ] & 0xf); - const uint8_t q4_1 = uint8_t(data_a[ib0 + i].qs[q_offset + 1] & 0xf); - const uint8_t q4_2 = uint8_t(data_a[ib0 + i].qs[q_offset + 16] & 0xf); - const uint8_t q4_3 = uint8_t(data_a[ib0 + i].qs[q_offset + 17] & 0xf); - const uint8_t q4_4 = uint8_t(data_a[ib0 + i].qs[q_offset ] >> 4); - const uint8_t q4_5 = uint8_t(data_a[ib0 + i].qs[q_offset + 1] >> 4); - const uint8_t q4_6 = uint8_t(data_a[ib0 + i].qs[q_offset + 16] >> 4); - const uint8_t q4_7 = uint8_t(data_a[ib0 + i].qs[q_offset + 17] >> 4); - const uint8_t q4_8 = uint8_t(data_a[ib0 + i].qs[q_offset + 64] & 0xf); - const uint8_t q4_9 = uint8_t(data_a[ib0 + i].qs[q_offset + 65] & 0xf); - const uint8_t q4_10 = uint8_t(data_a[ib0 + i].qs[q_offset + 80] & 0xf); - const uint8_t q4_11 = uint8_t(data_a[ib0 + i].qs[q_offset + 81] & 0xf); - const uint8_t q4_12 = uint8_t(data_a[ib0 + i].qs[q_offset + 64] >> 4); - const uint8_t q4_13 = uint8_t(data_a[ib0 + i].qs[q_offset + 65] >> 4); - const uint8_t q4_14 = uint8_t(data_a[ib0 + i].qs[q_offset + 80] >> 4); - const uint8_t q4_15 = uint8_t(data_a[ib0 + i].qs[q_offset + 81] >> 4); + f16vec2 d = data_a[ib0 + i].d; + const FLOAT_TYPE dall = FLOAT_TYPE(d.x); + const FLOAT_TYPE dmin = FLOAT_TYPE(d.y); + + uint32_t scale0_u32 = data_a_packed16[ib0 + i].scales[v_im ]; + uint32_t scale4_u32 = data_a_packed16[ib0 + i].scales[v_im + 2]; + uint32_t scale8_u32 = data_a_packed16[ib0 + i].scales[v_im + 4]; + uvec4 scale0 = uvec4(unpack8(scale0_u32)); + uvec4 scale4 = uvec4(unpack8(scale4_u32)); + uvec4 scale8 = uvec4(unpack8(scale8_u32)); + + const uint32_t sc0 = ( scale0.x & 0x3f); + const uint32_t sc1 = ( scale0.y & 0x3f); + const uint32_t sc2 = ( scale4.x & 0x3f); + const uint32_t sc3 = ( scale4.y & 0x3f); + const uint32_t sc4 = (( scale8.x & 0x0f) | ((scale0.x & 0xc0) >> 2)); + const uint32_t sc5 = (( scale8.y & 0x0f) | ((scale0.y & 0xc0) >> 2)); + const uint32_t sc6 = (((scale8.x >> 4) & 0x0f) | ((scale4.x & 0xc0) >> 2)); + const uint32_t sc7 = (((scale8.y >> 4) & 0x0f) | ((scale4.y & 0xc0) >> 2)); + + uint32_t qs0_16_u32 = uint32_t(data_a_packed16[ib0 + i].qs[q_offset / 2]) | (uint32_t(data_a_packed16[ib0 + i].qs[q_offset / 2 + 8]) << 16); + uint32_t qs64_80_u32 = uint32_t(data_a_packed16[ib0 + i].qs[q_offset / 2 + 32]) | (uint32_t(data_a_packed16[ib0 + i].qs[q_offset / 2 + 40]) << 16); + + uint32_t qs0_16_u32_lo4 = qs0_16_u32 & 0x0F0F0F0F; + uint32_t qs0_16_u32_hi4 = (qs0_16_u32 >> 4) & 0x0F0F0F0F; + uint32_t qs64_80_u32_lo4 = qs64_80_u32 & 0x0F0F0F0F; + uint32_t qs64_80_u32_hi4 = (qs64_80_u32 >> 4) & 0x0F0F0F0F; + + uvec4 qs0_16_lo4 = uvec4(unpack8(qs0_16_u32_lo4)); + uvec4 qs64_80_lo4 = uvec4(unpack8(qs64_80_u32_lo4)); + uvec4 qs0_16_hi4 = uvec4(unpack8(qs0_16_u32_hi4)); + uvec4 qs64_80_hi4 = uvec4(unpack8(qs64_80_u32_hi4)); + + const uint32_t q4_0 = qs0_16_lo4.x; + const uint32_t q4_1 = qs0_16_lo4.y; + const uint32_t q4_2 = qs0_16_lo4.z; + const uint32_t q4_3 = qs0_16_lo4.w; + const uint32_t q4_4 = qs0_16_hi4.x; + const uint32_t q4_5 = qs0_16_hi4.y; + const uint32_t q4_6 = qs0_16_hi4.z; + const uint32_t q4_7 = qs0_16_hi4.w; + const uint32_t q4_8 = qs64_80_lo4.x; + const uint32_t q4_9 = qs64_80_lo4.y; + const uint32_t q4_10 = qs64_80_lo4.z; + const uint32_t q4_11 = qs64_80_lo4.w; + const uint32_t q4_12 = qs64_80_hi4.x; + const uint32_t q4_13 = qs64_80_hi4.y; + const uint32_t q4_14 = qs64_80_hi4.z; + const uint32_t q4_15 = qs64_80_hi4.w; + + B_TYPE_VEC2 by10 = data_b_v2[(b_offset + y1_idx) / 2]; + B_TYPE_VEC2 by116 = data_b_v2[(b_offset + y1_idx) / 2 + 8]; + B_TYPE_VEC2 by132 = data_b_v2[(b_offset + y1_idx) / 2 + 16]; + B_TYPE_VEC2 by148 = data_b_v2[(b_offset + y1_idx) / 2 + 24]; + B_TYPE_VEC2 by20 = data_b_v2[(b_offset + y2_idx) / 2]; + B_TYPE_VEC2 by216 = data_b_v2[(b_offset + y2_idx) / 2 + 8]; + B_TYPE_VEC2 by232 = data_b_v2[(b_offset + y2_idx) / 2 + 16]; + B_TYPE_VEC2 by248 = data_b_v2[(b_offset + y2_idx) / 2 + 24]; + + uint32_t qh0 = data_a_packed16[ib0 + i].qh[l0 / 2]; + uint32_t qh1 = qh0 >> 8; + uint32_t qh16 = data_a_packed16[ib0 + i].qh[l0 / 2 + 8]; + uint32_t qh17 = qh16 >> 8; const FLOAT_TYPE sx = - fma(FLOAT_TYPE(data_b[b_offset + y1_idx ]), (q4_0 + (((data_a[ib0 + i].qh[l0 ] & hm1) != 0) ? 16 : 0)), - fma(FLOAT_TYPE(data_b[b_offset + y1_idx + 1]), (q4_1 + (((data_a[ib0 + i].qh[l0 + 1] & hm1) != 0) ? 16 : 0)), - fma(FLOAT_TYPE(data_b[b_offset + y1_idx + 16]), (q4_2 + (((data_a[ib0 + i].qh[l0 + 16] & hm1) != 0) ? 16 : 0)), - FLOAT_TYPE(data_b[b_offset + y1_idx + 17]) * (q4_3 + (((data_a[ib0 + i].qh[l0 + 17] & hm1) != 0) ? 16 : 0))))); + fma(FLOAT_TYPE(by10.x), (q4_0 + (((qh0 & hm1) != 0) ? 16 : 0)), + fma(FLOAT_TYPE(by10.y), (q4_1 + (((qh1 & hm1) != 0) ? 16 : 0)), + fma(FLOAT_TYPE(by116.x), (q4_2 + (((qh16 & hm1) != 0) ? 16 : 0)), + FLOAT_TYPE(by116.y) * (q4_3 + (((qh17 & hm1) != 0) ? 16 : 0))))); const FLOAT_TYPE sy = - fma(FLOAT_TYPE(data_b[b_offset + y1_idx + 32]), (q4_4 + (((data_a[ib0 + i].qh[l0 ] & (hm1 << 1)) != 0) ? 16 : 0)), - fma(FLOAT_TYPE(data_b[b_offset + y1_idx + 33]), (q4_5 + (((data_a[ib0 + i].qh[l0 + 1] & (hm1 << 1)) != 0) ? 16 : 0)), - fma(FLOAT_TYPE(data_b[b_offset + y1_idx + 48]), (q4_6 + (((data_a[ib0 + i].qh[l0 + 16] & (hm1 << 1)) != 0) ? 16 : 0)), - FLOAT_TYPE(data_b[b_offset + y1_idx + 49]) * (q4_7 + (((data_a[ib0 + i].qh[l0 + 17] & (hm1 << 1)) != 0) ? 16 : 0))))); + fma(FLOAT_TYPE(by132.x), (q4_4 + (((qh0 & (hm1 << 1)) != 0) ? 16 : 0)), + fma(FLOAT_TYPE(by132.y), (q4_5 + (((qh1 & (hm1 << 1)) != 0) ? 16 : 0)), + fma(FLOAT_TYPE(by148.x), (q4_6 + (((qh16 & (hm1 << 1)) != 0) ? 16 : 0)), + FLOAT_TYPE(by148.y) * (q4_7 + (((qh17 & (hm1 << 1)) != 0) ? 16 : 0))))); const FLOAT_TYPE sz = - fma(FLOAT_TYPE(data_b[b_offset + y2_idx ]), (q4_8 + (((data_a[ib0 + i].qh[l0 ] & hm2) != 0) ? 16 : 0)), - fma(FLOAT_TYPE(data_b[b_offset + y2_idx + 1]), (q4_9 + (((data_a[ib0 + i].qh[l0 + 1] & hm2) != 0) ? 16 : 0)), - fma(FLOAT_TYPE(data_b[b_offset + y2_idx + 16]), (q4_10 + (((data_a[ib0 + i].qh[l0 + 16] & hm2) != 0) ? 16 : 0)), - FLOAT_TYPE(data_b[b_offset + y2_idx + 17]) * (q4_11 + (((data_a[ib0 + i].qh[l0 + 17] & hm2) != 0) ? 16 : 0))))); + fma(FLOAT_TYPE(by20.x), (q4_8 + (((qh0 & hm2) != 0) ? 16 : 0)), + fma(FLOAT_TYPE(by20.y), (q4_9 + (((qh1 & hm2) != 0) ? 16 : 0)), + fma(FLOAT_TYPE(by216.x), (q4_10 + (((qh16 & hm2) != 0) ? 16 : 0)), + FLOAT_TYPE(by216.y) * (q4_11 + (((qh17 & hm2) != 0) ? 16 : 0))))); const FLOAT_TYPE sw = - fma(FLOAT_TYPE(data_b[b_offset + y2_idx + 32]), (q4_12 + (((data_a[ib0 + i].qh[l0 ] & (hm2 << 1)) != 0) ? 16 : 0)), - fma(FLOAT_TYPE(data_b[b_offset + y2_idx + 33]), (q4_13 + (((data_a[ib0 + i].qh[l0 + 1] & (hm2 << 1)) != 0) ? 16 : 0)), - fma(FLOAT_TYPE(data_b[b_offset + y2_idx + 48]), (q4_14 + (((data_a[ib0 + i].qh[l0 + 16] & (hm2 << 1)) != 0) ? 16 : 0)), - FLOAT_TYPE(data_b[b_offset + y2_idx + 49]) * (q4_15 + (((data_a[ib0 + i].qh[l0 + 17] & (hm2 << 1)) != 0) ? 16 : 0))))); + fma(FLOAT_TYPE(by232.x), (q4_12 + (((qh0 & (hm2 << 1)) != 0) ? 16 : 0)), + fma(FLOAT_TYPE(by232.y), (q4_13 + (((qh1 & (hm2 << 1)) != 0) ? 16 : 0)), + fma(FLOAT_TYPE(by248.x), (q4_14 + (((qh16 & (hm2 << 1)) != 0) ? 16 : 0)), + FLOAT_TYPE(by248.y) * (q4_15 + (((qh17 & (hm2 << 1)) != 0) ? 16 : 0))))); const FLOAT_TYPE smin = - fma(FLOAT_TYPE(data_b[b_offset + y1_idx ]) + FLOAT_TYPE(data_b[b_offset + y1_idx + 1 ]) + FLOAT_TYPE(data_b[b_offset + y1_idx + 16]) + FLOAT_TYPE(data_b[b_offset + y1_idx + 17]), sc2, - fma(FLOAT_TYPE(data_b[b_offset + y1_idx + 32]) + FLOAT_TYPE(data_b[b_offset + y1_idx + 33]) + FLOAT_TYPE(data_b[b_offset + y1_idx + 48]) + FLOAT_TYPE(data_b[b_offset + y1_idx + 49]), sc3, - fma(FLOAT_TYPE(data_b[b_offset + y2_idx ]) + FLOAT_TYPE(data_b[b_offset + y2_idx + 1 ]) + FLOAT_TYPE(data_b[b_offset + y2_idx + 16]) + FLOAT_TYPE(data_b[b_offset + y2_idx + 17]), sc6, - (FLOAT_TYPE(data_b[b_offset + y2_idx + 32]) + FLOAT_TYPE(data_b[b_offset + y2_idx + 33]) + FLOAT_TYPE(data_b[b_offset + y2_idx + 48]) + FLOAT_TYPE(data_b[b_offset + y2_idx + 49])) * sc7))); - const uint tmp_idx = 16 * ix + tid; - tmp[tmp_idx] = fma(dall, fma(sx, sc0, fma(sy, sc1, fma(sz, sc4, sw * sc5))), fma(-dmin, smin, tmp[tmp_idx])); + fma(FLOAT_TYPE(by10.x) + FLOAT_TYPE(by10.y) + FLOAT_TYPE(by116.x) + FLOAT_TYPE(by116.y), sc2, + fma(FLOAT_TYPE(by132.x) + FLOAT_TYPE(by132.y) + FLOAT_TYPE(by148.x) + FLOAT_TYPE(by148.y), sc3, + fma(FLOAT_TYPE(by20.x) + FLOAT_TYPE(by20.y) + FLOAT_TYPE(by216.x) + FLOAT_TYPE(by216.y), sc6, + (FLOAT_TYPE(by232.x) + FLOAT_TYPE(by232.y) + FLOAT_TYPE(by248.x) + FLOAT_TYPE(by248.y)) * sc7))); + temp = fma(dall, fma(sx, sc0, fma(sy, sc1, fma(sz, sc4, sw * sc5))), fma(-dmin, smin, temp)); } + tmp[gl_LocalInvocationID.x] = temp; + // sum up partial sums and write back result barrier(); [[unroll]] for (uint s = 16; s > 0; s >>= 1) { diff --git a/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vec_q6_k.comp b/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vec_q6_k.comp index 95c286eeb17e1..0b392d68d04d0 100644 --- a/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vec_q6_k.comp +++ b/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vec_q6_k.comp @@ -1,5 +1,7 @@ #version 450 +#extension GL_EXT_shader_explicit_arithmetic_types : require + #include "mul_mat_vec_base.comp" layout(local_size_x = 32, local_size_y = 1, local_size_z = 1) in; @@ -9,6 +11,10 @@ shared FLOAT_TYPE tmp[32]; void main() { const uint row = gl_WorkGroupID.x + gl_NumWorkGroups.x * gl_WorkGroupID.z; + if (row >= p.stride_d) { + return; + } + uint a_offset, b_offset, d_offset; get_offsets(a_offset, b_offset, d_offset); @@ -36,41 +42,66 @@ void main() { const uint s_offset = 8*v_im + is; const uint y_offset = 128*v_im + l0; - tmp[16 * ix + tid] = FLOAT_TYPE(0.0); // partial sum for thread in warp + FLOAT_TYPE temp = FLOAT_TYPE(0.0); // partial sum for thread in warp [[unroll]] for (uint i = ix; i < num_blocks_per_row; i += K_QUANTS_PER_ITERATION) { const uint y_idx = i * QUANT_K + y_offset; const FLOAT_TYPE d = FLOAT_TYPE(data_a[ib0 + i].d); -#if K_QUANTS_PER_ITERATION == 1 - const uint tmp_idx = 16 * ix + tid; - tmp[tmp_idx] = fma(FLOAT_TYPE(data_b[b_offset + y_idx + 0]) * FLOAT_TYPE(data_a[ib0 + i].scales[s_offset + 0]) * d, FLOAT_TYPE(int8_t((data_a[ib0 + i].ql[ql_offset + 0] & 0xF) | ((data_a[ib0 + i].qh[qh_offset + 0] & 0x03) << 4)) - 32), - fma(FLOAT_TYPE(data_b[b_offset + y_idx + 16]) * FLOAT_TYPE(data_a[ib0 + i].scales[s_offset + 1]) * d, FLOAT_TYPE(int8_t((data_a[ib0 + i].ql[ql_offset + 16] & 0xF) | ((data_a[ib0 + i].qh[qh_offset + 16] & 0x03) << 4)) - 32), - fma(FLOAT_TYPE(data_b[b_offset + y_idx + 32]) * FLOAT_TYPE(data_a[ib0 + i].scales[s_offset + 2]) * d, FLOAT_TYPE(int8_t((data_a[ib0 + i].ql[ql_offset + 32] & 0xF) | ((data_a[ib0 + i].qh[qh_offset + 0] & 0x0c) << 2)) - 32), - fma(FLOAT_TYPE(data_b[b_offset + y_idx + 48]) * FLOAT_TYPE(data_a[ib0 + i].scales[s_offset + 3]) * d, FLOAT_TYPE(int8_t((data_a[ib0 + i].ql[ql_offset + 48] & 0xF) | ((data_a[ib0 + i].qh[qh_offset + 16] & 0x0c) << 2)) - 32), - fma(FLOAT_TYPE(data_b[b_offset + y_idx + 64]) * FLOAT_TYPE(data_a[ib0 + i].scales[s_offset + 4]) * d, FLOAT_TYPE(int8_t((data_a[ib0 + i].ql[ql_offset + 0] >> 4) | ((data_a[ib0 + i].qh[qh_offset + 0] & 0x30) >> 0)) - 32), - fma(FLOAT_TYPE(data_b[b_offset + y_idx + 80]) * FLOAT_TYPE(data_a[ib0 + i].scales[s_offset + 5]) * d, FLOAT_TYPE(int8_t((data_a[ib0 + i].ql[ql_offset + 16] >> 4) | ((data_a[ib0 + i].qh[qh_offset + 16] & 0x30) >> 0)) - 32), - fma(FLOAT_TYPE(data_b[b_offset + y_idx + 96]) * FLOAT_TYPE(data_a[ib0 + i].scales[s_offset + 6]) * d, FLOAT_TYPE(int8_t((data_a[ib0 + i].ql[ql_offset + 32] >> 4) | ((data_a[ib0 + i].qh[qh_offset + 0] & 0xc0) >> 2)) - 32), - fma(FLOAT_TYPE(data_b[b_offset + y_idx +112]) * FLOAT_TYPE(data_a[ib0 + i].scales[s_offset + 7]) * d, FLOAT_TYPE(int8_t((data_a[ib0 + i].ql[ql_offset + 48] >> 4) | ((data_a[ib0 + i].qh[qh_offset + 16] & 0xc0) >> 2)) - 32), tmp[tmp_idx])))))))); -#else + FLOAT_TYPE scales[4]; + scales[0] = FLOAT_TYPE(data_a[ib0 + i].scales[s_offset + 0]); + scales[1] = FLOAT_TYPE(data_a[ib0 + i].scales[s_offset + 2]); + scales[2] = FLOAT_TYPE(data_a[ib0 + i].scales[s_offset + 4]); + scales[3] = FLOAT_TYPE(data_a[ib0 + i].scales[s_offset + 6]); + + uint32_t ql0_u32 = uint32_t(data_a_packed16[ib0 + i].ql[ql_offset / 2]) | (uint32_t(data_a_packed16[ib0 + i].ql[ql_offset / 2 + 1]) << 16); + uint32_t ql32_u32 = uint32_t(data_a_packed16[ib0 + i].ql[ql_offset / 2 + 16]) | (uint32_t(data_a_packed16[ib0 + i].ql[ql_offset / 2 + 17]) << 16); + + uint32_t ql0_u32_lo4 = ql0_u32 & 0x0F0F0F0F; + uint32_t ql0_u32_hi4 = (ql0_u32 >> 4) & 0x0F0F0F0F; + uint32_t ql32_u32_lo4 = ql32_u32 & 0x0F0F0F0F; + uint32_t ql32_u32_hi4 = (ql32_u32 >> 4) & 0x0F0F0F0F; + + uint32_t qh_u32 = uint32_t(data_a_packed16[ib0 + i].qh[qh_offset / 2]) | (uint32_t(data_a_packed16[ib0 + i].qh[qh_offset / 2 + 1]) << 16); + uint32_t qh0_u32 = (qh_u32 & 0x03030303) << 4; + uint32_t qh2_u32 = (qh_u32 & 0x0C0C0C0C) << 2; + uint32_t qh4_u32 = (qh_u32 & 0x30303030) << 0; + uint32_t qh6_u32 = (qh_u32 & 0xC0C0C0C0) >> 2; + + uint32_t q0_u32 = ql0_u32_lo4 | qh0_u32; + uint32_t q1_u32 = ql32_u32_lo4 | qh2_u32; + uint32_t q2_u32 = ql0_u32_hi4 | qh4_u32; + uint32_t q3_u32 = ql32_u32_hi4 | qh6_u32; + + uvec4 q0 = uvec4(unpack8(q0_u32)); + uvec4 q1 = uvec4(unpack8(q1_u32)); + uvec4 q2 = uvec4(unpack8(q2_u32)); + uvec4 q3 = uvec4(unpack8(q3_u32)); + + B_TYPE_VEC4 by0 = data_b_v4[(b_offset + y_idx) / 4]; + B_TYPE_VEC4 by32 = data_b_v4[(b_offset + y_idx) / 4 + 8]; + B_TYPE_VEC4 by64 = data_b_v4[(b_offset + y_idx) / 4 + 16]; + B_TYPE_VEC4 by96 = data_b_v4[(b_offset + y_idx) / 4 + 24]; + FLOAT_TYPE sum = FLOAT_TYPE(0.0); [[unroll]] for (int l = 0; l < 4; ++l) { - sum = fma(FLOAT_TYPE(data_b[b_offset + y_idx + l+ 0]) * FLOAT_TYPE(data_a[ib0 + i].scales[s_offset + 0]) * d, FLOAT_TYPE(int8_t((data_a[ib0 + i].ql[ql_offset + l+ 0] & 0xF) | (((data_a[ib0 + i].qh[qh_offset + l] >> 0) & 3) << 4)) - 32), - fma(FLOAT_TYPE(data_b[b_offset + y_idx + l+32]) * FLOAT_TYPE(data_a[ib0 + i].scales[s_offset + 2]) * d, FLOAT_TYPE(int8_t((data_a[ib0 + i].ql[ql_offset + l+32] & 0xF) | (((data_a[ib0 + i].qh[qh_offset + l] >> 2) & 3) << 4)) - 32), - fma(FLOAT_TYPE(data_b[b_offset + y_idx + l+64]) * FLOAT_TYPE(data_a[ib0 + i].scales[s_offset + 4]) * d, FLOAT_TYPE(int8_t((data_a[ib0 + i].ql[ql_offset + l+ 0] >> 4) | (((data_a[ib0 + i].qh[qh_offset + l] >> 4) & 3) << 4)) - 32), - fma(FLOAT_TYPE(data_b[b_offset + y_idx + l+96]) * FLOAT_TYPE(data_a[ib0 + i].scales[s_offset + 6]) * d, FLOAT_TYPE(int8_t((data_a[ib0 + i].ql[ql_offset + l+32] >> 4) | (((data_a[ib0 + i].qh[qh_offset + l] >> 6) & 3) << 4)) - 32), sum)))); + sum = fma(FLOAT_TYPE(by0[l]) * scales[0], FLOAT_TYPE(int8_t(q0[l]) - 32), + fma(FLOAT_TYPE(by32[l]) * scales[1], FLOAT_TYPE(int8_t(q1[l]) - 32), + fma(FLOAT_TYPE(by64[l]) * scales[2], FLOAT_TYPE(int8_t(q2[l]) - 32), + fma(FLOAT_TYPE(by96[l]) * scales[3], FLOAT_TYPE(int8_t(q3[l]) - 32), sum)))); } - tmp[16 * ix + tid] += sum; -#endif + temp += sum * d; } + tmp[gl_LocalInvocationID.x] = temp; + // sum up partial sums and write back result barrier(); [[unroll]] for (uint s = 16; s > 0; s >>= 1) { if (tid < s) { tmp[tid] += tmp[tid + s]; - } + } barrier(); } if (tid == 0) { diff --git a/ggml/src/ggml-vulkan/vulkan-shaders/types.comp b/ggml/src/ggml-vulkan/vulkan-shaders/types.comp index 21dce72fc7dfb..7a34820bc2789 100644 --- a/ggml/src/ggml-vulkan/vulkan-shaders/types.comp +++ b/ggml/src/ggml-vulkan/vulkan-shaders/types.comp @@ -1,6 +1,8 @@ -#if !defined(DATA_A_F32) && !defined(DATA_A_F16) -#extension GL_EXT_shader_explicit_arithmetic_types_int8 : require -#endif + +#if !defined(GGML_TYPES_COMP) +#define GGML_TYPES_COMP + +#extension GL_EXT_shader_explicit_arithmetic_types : require #if defined(DATA_A_F32) #define QUANT_K 1 @@ -38,8 +40,14 @@ struct block_q4_0 float16_t d; uint8_t qs[16]; }; +struct block_q4_0_packed16 +{ + float16_t d; + uint16_t qs[16/2]; +}; #define A_TYPE block_q4_0 +#define A_TYPE_PACKED16 block_q4_0_packed16 #endif #if defined(DATA_A_Q4_1) @@ -54,7 +62,15 @@ struct block_q4_1 uint8_t qs[16]; }; +struct block_q4_1_packed16 +{ + float16_t d; + float16_t m; + uint16_t qs[16/2]; +}; + #define A_TYPE block_q4_1 +#define A_TYPE_PACKED16 block_q4_1_packed16 #endif #if defined(DATA_A_Q5_0) @@ -70,7 +86,15 @@ struct block_q5_0 uint8_t qs[16]; }; +struct block_q5_0_packed16 +{ + float16_t d; + uint16_t qh[2]; + uint16_t qs[16/2]; +}; + #define A_TYPE block_q5_0 +#define A_TYPE_PACKED16 block_q5_0_packed16 #endif #if defined(DATA_A_Q5_1) @@ -87,7 +111,16 @@ struct block_q5_1 uint8_t qs[16]; }; +struct block_q5_1_packed16 +{ + float16_t d; + float16_t m; + uint qh; + uint16_t qs[16/2]; +}; + #define A_TYPE block_q5_1 +#define A_TYPE_PACKED16 block_q5_1_packed16 #endif #if defined(DATA_A_Q8_0) @@ -100,8 +133,14 @@ struct block_q8_0 float16_t d; int8_t qs[32]; }; +struct block_q8_0_packed16 +{ + float16_t d; + uint16_t qs[32/2]; +}; #define A_TYPE block_q8_0 +#define A_TYPE_PACKED16 block_q8_0_packed16 #endif // K-quants @@ -116,7 +155,23 @@ struct block_q2_K f16vec2 d; }; +struct block_q2_K_packed16 +{ + uint16_t scales[QUANT_K/16/2]; + uint16_t qs[QUANT_K/4/2]; + f16vec2 d; +}; + +struct block_q2_K_packed32 +{ + uint32_t scales[QUANT_K/16/4]; + uint32_t qs[QUANT_K/4/4]; + f16vec2 d; +}; + #define A_TYPE block_q2_K +#define A_TYPE_PACKED16 block_q2_K_packed16 +#define A_TYPE_PACKED32 block_q2_K_packed32 #endif #if defined(DATA_A_Q3_K) @@ -131,7 +186,16 @@ struct block_q3_K float16_t d; }; +struct block_q3_K_packed16 +{ + uint16_t hmask[QUANT_K/8/2]; + uint16_t qs[QUANT_K/4/2]; + uint16_t scales[12/2]; + float16_t d; +}; + #define A_TYPE block_q3_K +#define A_TYPE_PACKED16 block_q3_K_packed16 #endif #if defined(DATA_A_Q4_K) @@ -145,7 +209,23 @@ struct block_q4_K uint8_t qs[QUANT_K/2]; }; +struct block_q4_K_packed16 +{ + f16vec2 d; + uint16_t scales[3*QUANT_K/64/2]; + uint16_t qs[QUANT_K/2/2]; +}; + +struct block_q4_K_packed32 +{ + f16vec2 d; + uint32_t scales[3*QUANT_K/64/4]; + uint32_t qs[QUANT_K/2/4]; +}; + #define A_TYPE block_q4_K +#define A_TYPE_PACKED16 block_q4_K_packed16 +#define A_TYPE_PACKED32 block_q4_K_packed32 #endif #if defined(DATA_A_Q5_K) @@ -160,7 +240,16 @@ struct block_q5_K uint8_t qs[QUANT_K/2]; }; +struct block_q5_K_packed16 +{ + f16vec2 d; + uint16_t scales[12/2]; + uint16_t qh[QUANT_K/8/2]; + uint16_t qs[QUANT_K/2/2]; +}; + #define A_TYPE block_q5_K +#define A_TYPE_PACKED16 block_q5_K_packed16 #endif #if defined(DATA_A_Q6_K) @@ -175,7 +264,16 @@ struct block_q6_K float16_t d; }; +struct block_q6_K_packed16 +{ + uint16_t ql[QUANT_K/2/2]; + uint16_t qh[QUANT_K/4/2]; + int8_t scales[QUANT_K/16]; + float16_t d; +}; + #define A_TYPE block_q6_K +#define A_TYPE_PACKED16 block_q6_K_packed16 #endif // IQuants @@ -191,10 +289,19 @@ struct block_iq4_nl uint8_t qs[QUANT_K/2]; }; +struct block_iq4_nl_packed16 +{ + float16_t d; + uint16_t qs[QUANT_K/2/2]; +}; + #define A_TYPE block_iq4_nl +#define A_TYPE_PACKED16 block_iq4_nl_packed16 const int8_t kvalues_iq4nl[16] = { int8_t(-127), int8_t(-104), int8_t(-83), int8_t(-65), int8_t(-49), int8_t(-35), int8_t(-22), int8_t(-10), int8_t(1), int8_t(13), int8_t(25), int8_t(38), int8_t(53), int8_t(69), int8_t(89), int8_t(113) }; #endif + +#endif // !defined(GGML_TYPES_COMP) diff --git a/ggml/src/ggml-vulkan/vulkan-shaders/vulkan-shaders-gen.cpp b/ggml/src/ggml-vulkan/vulkan-shaders/vulkan-shaders-gen.cpp index fe3e4cb39ff47..f753109556f93 100644 --- a/ggml/src/ggml-vulkan/vulkan-shaders/vulkan-shaders-gen.cpp +++ b/ggml/src/ggml-vulkan/vulkan-shaders/vulkan-shaders-gen.cpp @@ -317,10 +317,10 @@ void process_shaders() { std::string data_a_key = "DATA_A_" + to_uppercase(tname); std::string shader = (string_ends_with(tname, "_k")) ? "mul_mat_vec_" + tname + ".comp" : "mul_mat_vec.comp"; - string_to_spv("mul_mat_vec_" + tname + "_f32_f32", shader, merge_maps(base_dict, {{data_a_key, "1"}, {"B_TYPE", "float"}, {"B_TYPE_VEC4", "vec4"}, {"D_TYPE", "float"}})); - string_to_spv("mul_mat_vec_" + tname + "_f16_f32", shader, merge_maps(base_dict, {{data_a_key, "1"}, {"B_TYPE", "float16_t"}, {"B_TYPE_VEC4", "f16vec4"}, {"D_TYPE", "float"}})); + string_to_spv("mul_mat_vec_" + tname + "_f32_f32", shader, merge_maps(base_dict, {{data_a_key, "1"}, {"B_TYPE", "float"}, {"B_TYPE_VEC2", "vec2"}, {"B_TYPE_VEC4", "vec4"}, {"D_TYPE", "float"}})); + string_to_spv("mul_mat_vec_" + tname + "_f16_f32", shader, merge_maps(base_dict, {{data_a_key, "1"}, {"B_TYPE", "float16_t"}, {"B_TYPE_VEC2", "f16vec2"}, {"B_TYPE_VEC4", "f16vec4"}, {"D_TYPE", "float"}})); - string_to_spv("mul_mat_vec_id_" + tname + "_f32", shader, merge_maps(base_dict, {{"MUL_MAT_ID", "1"}, {data_a_key, "1"}, {"B_TYPE", "float"}, {"B_TYPE_VEC4", "vec4"}, {"D_TYPE", "float"}})); + string_to_spv("mul_mat_vec_id_" + tname + "_f32", shader, merge_maps(base_dict, {{"MUL_MAT_ID", "1"}, {data_a_key, "1"}, {"B_TYPE", "float"}, {"B_TYPE_VEC2", "vec2"}, {"B_TYPE_VEC4", "vec4"}, {"D_TYPE", "float"}})); // Dequant shaders if (tname != "f16") { From c396183f8359140ccd5ace3600fcc704a54b5db5 Mon Sep 17 00:00:00 2001 From: Jeff Bolz Date: Wed, 20 Nov 2024 01:40:18 -0600 Subject: [PATCH 021/245] vulkan: copy iq4_nl LUT into shared memory (#10409) --- .../ggml-vulkan/vulkan-shaders/dequant_iq4_nl.comp | 2 ++ .../ggml-vulkan/vulkan-shaders/get_rows_quant.comp | 4 ++++ .../src/ggml-vulkan/vulkan-shaders/mul_mat_vec.comp | 4 ++++ ggml/src/ggml-vulkan/vulkan-shaders/mul_mm.comp | 4 ++++ ggml/src/ggml-vulkan/vulkan-shaders/types.comp | 13 ++++++++++++- .../vulkan-shaders/vulkan-shaders-gen.cpp | 6 +++--- 6 files changed, 29 insertions(+), 4 deletions(-) diff --git a/ggml/src/ggml-vulkan/vulkan-shaders/dequant_iq4_nl.comp b/ggml/src/ggml-vulkan/vulkan-shaders/dequant_iq4_nl.comp index 34ef3da30b82c..8de14fc03f102 100644 --- a/ggml/src/ggml-vulkan/vulkan-shaders/dequant_iq4_nl.comp +++ b/ggml/src/ggml-vulkan/vulkan-shaders/dequant_iq4_nl.comp @@ -10,6 +10,8 @@ layout (binding = 1) writeonly buffer D {D_TYPE data_b[];}; void main() { const uint i = gl_WorkGroupID.x * 4 + gl_LocalInvocationID.x / 64; + init_iq4nl_shmem(); + const uint tid = gl_LocalInvocationID.x % 64; const uint il = tid/32; const uint ir = tid%32; diff --git a/ggml/src/ggml-vulkan/vulkan-shaders/get_rows_quant.comp b/ggml/src/ggml-vulkan/vulkan-shaders/get_rows_quant.comp index 8d30b63c165b2..7f608315b6844 100644 --- a/ggml/src/ggml-vulkan/vulkan-shaders/get_rows_quant.comp +++ b/ggml/src/ggml-vulkan/vulkan-shaders/get_rows_quant.comp @@ -12,6 +12,10 @@ void main() { const uint i11 = (gl_GlobalInvocationID.z)/p.ne12; const uint i12 = (gl_GlobalInvocationID.z)%p.ne12; +#if defined(DATA_A_IQ4_NL) + init_iq4nl_shmem(); +#endif + if (i00 >= p.ne00) { return; } diff --git a/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vec.comp b/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vec.comp index 00807a060c46b..2d5b8e4661312 100644 --- a/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vec.comp +++ b/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vec.comp @@ -161,6 +161,10 @@ void compute_outputs(const uint32_t first_row, const uint32_t num_rows) { void main() { const uint first_row = NUM_ROWS * (gl_WorkGroupID.x + gl_NumWorkGroups.x * gl_WorkGroupID.z); +#if defined(DATA_A_IQ4_NL) + init_iq4nl_shmem(); +#endif + // do NUM_ROWS at a time, unless there aren't enough remaining rows if (first_row + NUM_ROWS <= p.stride_d) { compute_outputs(first_row, NUM_ROWS); diff --git a/ggml/src/ggml-vulkan/vulkan-shaders/mul_mm.comp b/ggml/src/ggml-vulkan/vulkan-shaders/mul_mm.comp index fffdd18189d55..2ff5c430519bc 100644 --- a/ggml/src/ggml-vulkan/vulkan-shaders/mul_mm.comp +++ b/ggml/src/ggml-vulkan/vulkan-shaders/mul_mm.comp @@ -75,6 +75,10 @@ shared u16vec2 row_ids[3072]; #endif void main() { +#if defined(DATA_A_IQ4_NL) + init_iq4nl_shmem(); +#endif + #ifdef MUL_MAT_ID const uint expert_idx = gl_GlobalInvocationID.z; #else diff --git a/ggml/src/ggml-vulkan/vulkan-shaders/types.comp b/ggml/src/ggml-vulkan/vulkan-shaders/types.comp index 7a34820bc2789..bc28e0ab857aa 100644 --- a/ggml/src/ggml-vulkan/vulkan-shaders/types.comp +++ b/ggml/src/ggml-vulkan/vulkan-shaders/types.comp @@ -298,10 +298,21 @@ struct block_iq4_nl_packed16 #define A_TYPE block_iq4_nl #define A_TYPE_PACKED16 block_iq4_nl_packed16 -const int8_t kvalues_iq4nl[16] = { +const int8_t kvalues_iq4nl_const[16] = { int8_t(-127), int8_t(-104), int8_t(-83), int8_t(-65), int8_t(-49), int8_t(-35), int8_t(-22), int8_t(-10), int8_t(1), int8_t(13), int8_t(25), int8_t(38), int8_t(53), int8_t(69), int8_t(89), int8_t(113) }; + +shared FLOAT_TYPE kvalues_iq4nl[16]; + +void init_iq4nl_shmem() +{ + // copy the table into shared memory and sync + if (gl_LocalInvocationIndex.x < 16) { + kvalues_iq4nl[gl_LocalInvocationIndex.x] = FLOAT_TYPE(kvalues_iq4nl_const[gl_LocalInvocationIndex.x]); + } + barrier(); +} #endif #endif // !defined(GGML_TYPES_COMP) diff --git a/ggml/src/ggml-vulkan/vulkan-shaders/vulkan-shaders-gen.cpp b/ggml/src/ggml-vulkan/vulkan-shaders/vulkan-shaders-gen.cpp index f753109556f93..6bbe8e96edd9f 100644 --- a/ggml/src/ggml-vulkan/vulkan-shaders/vulkan-shaders-gen.cpp +++ b/ggml/src/ggml-vulkan/vulkan-shaders/vulkan-shaders-gen.cpp @@ -331,11 +331,11 @@ void process_shaders() { shader = (tname == "f32" || tname == "f16") ? "get_rows.comp" : "get_rows_quant.comp"; if (tname == "f16") { - string_to_spv("get_rows_" + tname, shader, {{data_a_key, "1"}, {"B_TYPE", "int"}, {"D_TYPE", "float16_t"}, {"OPTIMIZATION_ERROR_WORKAROUND", "1"}}); + string_to_spv("get_rows_" + tname, shader, merge_maps(base_dict, {{data_a_key, "1"}, {"B_TYPE", "int"}, {"D_TYPE", "float16_t"}, {"OPTIMIZATION_ERROR_WORKAROUND", "1"}})); } else { - string_to_spv("get_rows_" + tname, shader, {{data_a_key, "1"}, {"B_TYPE", "int"}, {"D_TYPE", "float16_t"}}); + string_to_spv("get_rows_" + tname, shader, merge_maps(base_dict, {{data_a_key, "1"}, {"B_TYPE", "int"}, {"D_TYPE", "float16_t"}})); } - string_to_spv("get_rows_" + tname + "_f32", shader, {{data_a_key, "1"}, {"B_TYPE", "int"}, {"D_TYPE", "float"}}); + string_to_spv("get_rows_" + tname + "_f32", shader, merge_maps(base_dict, {{data_a_key, "1"}, {"B_TYPE", "int"}, {"D_TYPE", "float"}})); } } From b08b75c5622f10976e351f4d6d790cfa9fc2e110 Mon Sep 17 00:00:00 2001 From: Diego Devesa Date: Wed, 20 Nov 2024 12:57:53 +0100 Subject: [PATCH 022/245] llama : add .clang-format file (#10415) --- .clang-format | 161 ++++++ examples/llama-bench/llama-bench.cpp | 702 +++++++++++++++------------ 2 files changed, 551 insertions(+), 312 deletions(-) create mode 100644 .clang-format diff --git a/.clang-format b/.clang-format new file mode 100644 index 0000000000000..45232b80ed8cd --- /dev/null +++ b/.clang-format @@ -0,0 +1,161 @@ +--- +Language: Cpp +AlignAfterOpenBracket: Align +AlignArrayOfStructures: Left +AlignConsecutiveAssignments: AcrossComments +AlignConsecutiveBitFields: AcrossComments +AlignConsecutiveDeclarations: AcrossComments +AlignConsecutiveMacros: AcrossComments +# AlignConsecutiveShortCaseStatements: AcrossComments +AlignEscapedNewlines: Left # LeftWithLastLine +AlignOperands: Align +AlignTrailingComments: + Kind: Always + OverEmptyLines: 1 +AllowAllArgumentsOnNextLine: true +AllowAllParametersOfDeclarationOnNextLine: false +# AllowBreakBeforeNoexceptSpecifier: OnlyWithParen +AllowShortBlocksOnASingleLine: Never +AllowShortCaseLabelsOnASingleLine: false +AllowShortFunctionsOnASingleLine: Inline +AllowShortIfStatementsOnASingleLine: Never +AllowShortLambdasOnASingleLine: Inline +AllowShortLoopsOnASingleLine: false +AlwaysBreakBeforeMultilineStrings: true +BinPackArguments: true +BinPackParameters: true # OnePerLine +BitFieldColonSpacing: Both +BreakBeforeBraces: Custom # Attach +BraceWrapping: + AfterCaseLabel: true + AfterClass: false + AfterControlStatement: false + AfterEnum: false + AfterFunction: false + AfterNamespace: false + AfterObjCDeclaration: false + AfterStruct: false + AfterUnion: false + AfterExternBlock: false + BeforeCatch: false + BeforeElse: false + BeforeLambdaBody: false + BeforeWhile: false + IndentBraces: false + SplitEmptyFunction: false + SplitEmptyRecord: false + SplitEmptyNamespace: false +# BreakAdjacentStringLiterals: true +BreakAfterAttributes: Never +BreakBeforeBinaryOperators: None +BreakBeforeInlineASMColon: OnlyMultiline +BreakBeforeTernaryOperators: false +# BreakBinaryOperations: Never +BreakConstructorInitializers: AfterColon +# BreakFunctionDefinitionParameters: false +BreakInheritanceList: AfterComma +BreakStringLiterals: true +# BreakTemplateDeclarations: Yes +ColumnLimit: 120 +CommentPragmas: '^ IWYU pragma:' +CompactNamespaces: false +ConstructorInitializerIndentWidth: 4 +ContinuationIndentWidth: 4 +Cpp11BracedListStyle: false +DerivePointerAlignment: false +DisableFormat: false +EmptyLineBeforeAccessModifier: Leave +EmptyLineAfterAccessModifier: Never +ExperimentalAutoDetectBinPacking: false +FixNamespaceComments: true +IncludeBlocks: Regroup +IncludeCategories: + - Regex: '^<.*\.h>' + Priority: 1 + SortPriority: 0 + - Regex: '^<.*' + Priority: 2 + SortPriority: 0 + - Regex: '.*' + Priority: 3 + SortPriority: 0 +IncludeIsMainRegex: '([-_](test|unittest))?$' +IncludeIsMainSourceRegex: '' +IndentAccessModifiers: false +IndentCaseBlocks: true +IndentCaseLabels: true +IndentExternBlock: NoIndent +IndentGotoLabels: false +IndentPPDirectives: AfterHash +IndentWidth: 4 +IndentWrappedFunctionNames: false +InsertBraces: true # NOTE: may lead to incorrect formatting +InsertNewlineAtEOF: true +JavaScriptQuotes: Leave +JavaScriptWrapImports: true +KeepEmptyLinesAtTheStartOfBlocks: false +LambdaBodyIndentation: Signature +LineEnding: LF +MacroBlockBegin: '' +MacroBlockEnd: '' +MaxEmptyLinesToKeep: 1 +NamespaceIndentation: None +ObjCBinPackProtocolList: Auto +ObjCBlockIndentWidth: 4 +ObjCSpaceAfterProperty: true +ObjCSpaceBeforeProtocolList: true +PPIndentWidth: -1 +PackConstructorInitializers: CurrentLine +PenaltyBreakAssignment: 2 +PenaltyBreakBeforeFirstCallParameter: 1 +PenaltyBreakComment: 300 +PenaltyBreakFirstLessLess: 120 +PenaltyBreakString: 1000 +PenaltyBreakTemplateDeclaration: 10 +PenaltyExcessCharacter: 1000000 +PenaltyReturnTypeOnItsOwnLine: 200 +PointerAlignment: Middle +QualifierAlignment: Left +#QualifierOrder: ['static', 'inline', 'friend', 'constexpr', 'const', 'volatile', 'type', 'restrict'] +RawStringFormats: + - Language: Cpp + Delimiters: + - cc + - CC + - cpp + - Cpp + - CPP + - 'c++' + - 'C++' + CanonicalDelimiter: '' +ReferenceAlignment: Middle +ReflowComments: false # IndentOnly +SeparateDefinitionBlocks: Always +SortIncludes: CaseInsensitive +SortUsingDeclarations: LexicographicNumeric +SpaceAfterCStyleCast: true +SpaceAfterLogicalNot: false +SpaceAfterTemplateKeyword: true +SpaceBeforeAssignmentOperators: true +SpaceBeforeCpp11BracedList: false +SpaceBeforeCtorInitializerColon: true +SpaceBeforeInheritanceColon: true +SpaceBeforeParens: ControlStatements +SpaceBeforeRangeBasedForLoopColon: true +SpaceInEmptyBlock: false +SpaceInEmptyParentheses: false +SpacesBeforeTrailingComments: 2 +SpacesInAngles: Never +SpacesInContainerLiterals: true +SpacesInLineCommentPrefix: + Minimum: 1 + Maximum: -1 +SpacesInParentheses: false +SpacesInSquareBrackets: false +SpaceBeforeSquareBrackets: false +Standard: c++17 +TabWidth: 4 +UseTab: Never +WhitespaceSensitiveMacros: ['STRINGIZE'] +... + diff --git a/examples/llama-bench/llama-bench.cpp b/examples/llama-bench/llama-bench.cpp index 8f4e0e206a27e..3dc84a75cbec7 100644 --- a/examples/llama-bench/llama-bench.cpp +++ b/examples/llama-bench/llama-bench.cpp @@ -6,28 +6,28 @@ #include #include #include +#include #include #include -#include #include #include #include #include #include #include -#include #include +#include +#include "common.h" #include "ggml.h" #include "llama.h" -#include "common.h" #ifdef _WIN32 -#define WIN32_LEAN_AND_MEAN -#ifndef NOMINMAX -# define NOMINMAX -#endif -#include +# define WIN32_LEAN_AND_MEAN +# ifndef NOMINMAX +# define NOMINMAX +# endif +# include #endif // utils @@ -36,8 +36,7 @@ static uint64_t get_time_ns() { return std::chrono::nanoseconds(clock::now().time_since_epoch()).count(); } -template -static std::string join(const std::vector & values, const std::string & delim) { +template static std::string join(const std::vector & values, const std::string & delim) { std::ostringstream str; for (size_t i = 0; i < values.size(); i++) { str << values[i]; @@ -48,38 +47,35 @@ static std::string join(const std::vector & values, const std::string & delim return str.str(); } -template -static std::vector transform_to_str(const std::vector & values, F f) { +template static std::vector transform_to_str(const std::vector & values, F f) { std::vector str_values; std::transform(values.begin(), values.end(), std::back_inserter(str_values), f); return str_values; } -template -static T avg(const std::vector & v) { +template static T avg(const std::vector & v) { if (v.empty()) { return 0; } T sum = std::accumulate(v.begin(), v.end(), T(0)); - return sum / (T)v.size(); + return sum / (T) v.size(); } -template -static T stdev(const std::vector & v) { +template static T stdev(const std::vector & v) { if (v.size() <= 1) { return 0; } - T mean = avg(v); + T mean = avg(v); T sq_sum = std::inner_product(v.begin(), v.end(), v.begin(), T(0)); - T stdev = std::sqrt(sq_sum / (T)(v.size() - 1) - mean * mean * (T)v.size() / (T)(v.size() - 1)); + T stdev = std::sqrt(sq_sum / (T) (v.size() - 1) - mean * mean * (T) v.size() / (T) (v.size() - 1)); return stdev; } static std::string get_cpu_info() { std::vector cpu_list; for (size_t i = 0; i < ggml_backend_dev_count(); i++) { - auto * dev = ggml_backend_dev_get(i); - auto dev_type = ggml_backend_dev_type(dev); + auto * dev = ggml_backend_dev_get(i); + auto dev_type = ggml_backend_dev_type(dev); if (dev_type == GGML_BACKEND_DEVICE_TYPE_CPU || dev_type == GGML_BACKEND_DEVICE_TYPE_ACCEL) { cpu_list.push_back(ggml_backend_dev_description(dev)); } @@ -90,8 +86,8 @@ static std::string get_cpu_info() { static std::string get_gpu_info() { std::vector gpu_list; for (size_t i = 0; i < ggml_backend_dev_count(); i++) { - auto * dev = ggml_backend_dev_get(i); - auto dev_type = ggml_backend_dev_type(dev); + auto * dev = ggml_backend_dev_get(i); + auto dev_type = ggml_backend_dev_type(dev); if (dev_type == GGML_BACKEND_DEVICE_TYPE_GPU) { gpu_list.push_back(ggml_backend_dev_description(dev)); } @@ -100,17 +96,24 @@ static std::string get_gpu_info() { } // command line params -enum output_formats {NONE, CSV, JSON, JSONL, MARKDOWN, SQL}; +enum output_formats { NONE, CSV, JSON, JSONL, MARKDOWN, SQL }; static const char * output_format_str(output_formats format) { switch (format) { - case NONE: return "none"; - case CSV: return "csv"; - case JSON: return "json"; - case JSONL: return "jsonl"; - case MARKDOWN: return "md"; - case SQL: return "sql"; - default: GGML_ABORT("invalid output format"); + case NONE: + return "none"; + case CSV: + return "csv"; + case JSON: + return "json"; + case JSONL: + return "jsonl"; + case MARKDOWN: + return "md"; + case SQL: + return "sql"; + default: + GGML_ABORT("invalid output format"); } } @@ -135,10 +138,14 @@ static bool output_format_from_str(const std::string & s, output_formats & forma static const char * split_mode_str(llama_split_mode mode) { switch (mode) { - case LLAMA_SPLIT_MODE_NONE: return "none"; - case LLAMA_SPLIT_MODE_LAYER: return "layer"; - case LLAMA_SPLIT_MODE_ROW: return "row"; - default: GGML_ABORT("invalid split mode"); + case LLAMA_SPLIT_MODE_NONE: + return "none"; + case LLAMA_SPLIT_MODE_LAYER: + return "layer"; + case LLAMA_SPLIT_MODE_ROW: + return "row"; + default: + GGML_ABORT("invalid split mode"); } } @@ -149,59 +156,59 @@ static std::string pair_str(const std::pair & p) { } struct cmd_params { - std::vector model; - std::vector n_prompt; - std::vector n_gen; + std::vector model; + std::vector n_prompt; + std::vector n_gen; std::vector> n_pg; - std::vector n_batch; - std::vector n_ubatch; - std::vector type_k; - std::vector type_v; - std::vector n_threads; - std::vector cpu_mask; - std::vector cpu_strict; - std::vector poll; - std::vector n_gpu_layers; - std::vector rpc_servers; - std::vector split_mode; - std::vector main_gpu; - std::vector no_kv_offload; - std::vector flash_attn; - std::vector> tensor_split; - std::vector use_mmap; - std::vector embeddings; - ggml_numa_strategy numa; - int reps; - ggml_sched_priority prio; - int delay; - bool verbose; - bool progress; - output_formats output_format; - output_formats output_format_stderr; + std::vector n_batch; + std::vector n_ubatch; + std::vector type_k; + std::vector type_v; + std::vector n_threads; + std::vector cpu_mask; + std::vector cpu_strict; + std::vector poll; + std::vector n_gpu_layers; + std::vector rpc_servers; + std::vector split_mode; + std::vector main_gpu; + std::vector no_kv_offload; + std::vector flash_attn; + std::vector> tensor_split; + std::vector use_mmap; + std::vector embeddings; + ggml_numa_strategy numa; + int reps; + ggml_sched_priority prio; + int delay; + bool verbose; + bool progress; + output_formats output_format; + output_formats output_format_stderr; }; static const cmd_params cmd_params_defaults = { - /* model */ {"models/7B/ggml-model-q4_0.gguf"}, - /* n_prompt */ {512}, - /* n_gen */ {128}, + /* model */ { "models/7B/ggml-model-q4_0.gguf" }, + /* n_prompt */ { 512 }, + /* n_gen */ { 128 }, /* n_pg */ {}, - /* n_batch */ {2048}, - /* n_ubatch */ {512}, - /* type_k */ {GGML_TYPE_F16}, - /* type_v */ {GGML_TYPE_F16}, - /* n_threads */ {cpu_get_num_math()}, - /* cpu_mask */ {"0x0"}, - /* cpu_strict */ {false}, - /* poll */ {50}, - /* n_gpu_layers */ {99}, - /* rpc_servers */ {""}, - /* split_mode */ {LLAMA_SPLIT_MODE_LAYER}, - /* main_gpu */ {0}, - /* no_kv_offload */ {false}, - /* flash_attn */ {false}, - /* tensor_split */ {std::vector(llama_max_devices(), 0.0f)}, - /* use_mmap */ {true}, - /* embeddings */ {false}, + /* n_batch */ { 2048 }, + /* n_ubatch */ { 512 }, + /* type_k */ { GGML_TYPE_F16 }, + /* type_v */ { GGML_TYPE_F16 }, + /* n_threads */ { cpu_get_num_math() }, + /* cpu_mask */ { "0x0" }, + /* cpu_strict */ { false }, + /* poll */ { 50 }, + /* n_gpu_layers */ { 99 }, + /* rpc_servers */ { "" }, + /* split_mode */ { LLAMA_SPLIT_MODE_LAYER }, + /* main_gpu */ { 0 }, + /* no_kv_offload */ { false }, + /* flash_attn */ { false }, + /* tensor_split */ { std::vector(llama_max_devices(), 0.0f) }, + /* use_mmap */ { true }, + /* embeddings */ { false }, /* numa */ GGML_NUMA_STRATEGY_DISABLED, /* reps */ 5, /* prio */ GGML_SCHED_PRIO_NORMAL, @@ -218,38 +225,59 @@ static void print_usage(int /* argc */, char ** argv) { printf("options:\n"); printf(" -h, --help\n"); printf(" -m, --model (default: %s)\n", join(cmd_params_defaults.model, ",").c_str()); - printf(" -p, --n-prompt (default: %s)\n", join(cmd_params_defaults.n_prompt, ",").c_str()); + printf(" -p, --n-prompt (default: %s)\n", + join(cmd_params_defaults.n_prompt, ",").c_str()); printf(" -n, --n-gen (default: %s)\n", join(cmd_params_defaults.n_gen, ",").c_str()); - printf(" -pg (default: %s)\n", join(transform_to_str(cmd_params_defaults.n_pg, pair_str), ",").c_str()); - printf(" -b, --batch-size (default: %s)\n", join(cmd_params_defaults.n_batch, ",").c_str()); - printf(" -ub, --ubatch-size (default: %s)\n", join(cmd_params_defaults.n_ubatch, ",").c_str()); - printf(" -ctk, --cache-type-k (default: %s)\n", join(transform_to_str(cmd_params_defaults.type_k, ggml_type_name), ",").c_str()); - printf(" -ctv, --cache-type-v (default: %s)\n", join(transform_to_str(cmd_params_defaults.type_v, ggml_type_name), ",").c_str()); - printf(" -t, --threads (default: %s)\n", join(cmd_params_defaults.n_threads, ",").c_str()); - printf(" -C, --cpu-mask (default: %s)\n", join(cmd_params_defaults.cpu_mask, ",").c_str()); - printf(" --cpu-strict <0|1> (default: %s)\n", join(cmd_params_defaults.cpu_strict, ",").c_str()); + printf(" -pg (default: %s)\n", + join(transform_to_str(cmd_params_defaults.n_pg, pair_str), ",").c_str()); + printf(" -b, --batch-size (default: %s)\n", + join(cmd_params_defaults.n_batch, ",").c_str()); + printf(" -ub, --ubatch-size (default: %s)\n", + join(cmd_params_defaults.n_ubatch, ",").c_str()); + printf(" -ctk, --cache-type-k (default: %s)\n", + join(transform_to_str(cmd_params_defaults.type_k, ggml_type_name), ",").c_str()); + printf(" -ctv, --cache-type-v (default: %s)\n", + join(transform_to_str(cmd_params_defaults.type_v, ggml_type_name), ",").c_str()); + printf(" -t, --threads (default: %s)\n", + join(cmd_params_defaults.n_threads, ",").c_str()); + printf(" -C, --cpu-mask (default: %s)\n", + join(cmd_params_defaults.cpu_mask, ",").c_str()); + printf(" --cpu-strict <0|1> (default: %s)\n", + join(cmd_params_defaults.cpu_strict, ",").c_str()); printf(" --poll <0...100> (default: %s)\n", join(cmd_params_defaults.poll, ",").c_str()); - printf(" -ngl, --n-gpu-layers (default: %s)\n", join(cmd_params_defaults.n_gpu_layers, ",").c_str()); + printf(" -ngl, --n-gpu-layers (default: %s)\n", + join(cmd_params_defaults.n_gpu_layers, ",").c_str()); if (llama_supports_rpc()) { - printf(" -rpc, --rpc (default: %s)\n", join(cmd_params_defaults.rpc_servers, ",").c_str()); + printf(" -rpc, --rpc (default: %s)\n", + join(cmd_params_defaults.rpc_servers, ",").c_str()); } - printf(" -sm, --split-mode (default: %s)\n", join(transform_to_str(cmd_params_defaults.split_mode, split_mode_str), ",").c_str()); - printf(" -mg, --main-gpu (default: %s)\n", join(cmd_params_defaults.main_gpu, ",").c_str()); - printf(" -nkvo, --no-kv-offload <0|1> (default: %s)\n", join(cmd_params_defaults.no_kv_offload, ",").c_str()); - printf(" -fa, --flash-attn <0|1> (default: %s)\n", join(cmd_params_defaults.flash_attn, ",").c_str()); - printf(" -mmp, --mmap <0|1> (default: %s)\n", join(cmd_params_defaults.use_mmap, ",").c_str()); + printf(" -sm, --split-mode (default: %s)\n", + join(transform_to_str(cmd_params_defaults.split_mode, split_mode_str), ",").c_str()); + printf(" -mg, --main-gpu (default: %s)\n", + join(cmd_params_defaults.main_gpu, ",").c_str()); + printf(" -nkvo, --no-kv-offload <0|1> (default: %s)\n", + join(cmd_params_defaults.no_kv_offload, ",").c_str()); + printf(" -fa, --flash-attn <0|1> (default: %s)\n", + join(cmd_params_defaults.flash_attn, ",").c_str()); + printf(" -mmp, --mmap <0|1> (default: %s)\n", + join(cmd_params_defaults.use_mmap, ",").c_str()); printf(" --numa (default: disabled)\n"); - printf(" -embd, --embeddings <0|1> (default: %s)\n", join(cmd_params_defaults.embeddings, ",").c_str()); + printf(" -embd, --embeddings <0|1> (default: %s)\n", + join(cmd_params_defaults.embeddings, ",").c_str()); printf(" -ts, --tensor-split (default: 0)\n"); printf(" -r, --repetitions (default: %d)\n", cmd_params_defaults.reps); printf(" --prio <0|1|2|3> (default: %d)\n", cmd_params_defaults.prio); printf(" --delay <0...N> (seconds) (default: %d)\n", cmd_params_defaults.delay); - printf(" -o, --output (default: %s)\n", output_format_str(cmd_params_defaults.output_format)); - printf(" -oe, --output-err (default: %s)\n", output_format_str(cmd_params_defaults.output_format_stderr)); + printf(" -o, --output (default: %s)\n", + output_format_str(cmd_params_defaults.output_format)); + printf(" -oe, --output-err (default: %s)\n", + output_format_str(cmd_params_defaults.output_format_stderr)); printf(" -v, --verbose (default: %s)\n", cmd_params_defaults.verbose ? "1" : "0"); printf(" --progress (default: %s)\n", cmd_params_defaults.progress ? "1" : "0"); printf("\n"); - printf("Multiple values can be given for each parameter by separating them with ',' or by specifying the parameter multiple times.\n"); + printf( + "Multiple values can be given for each parameter by separating them with ',' or by specifying the parameter " + "multiple times.\n"); } static ggml_type ggml_type_from_name(const std::string & s) { @@ -281,22 +309,21 @@ static ggml_type ggml_type_from_name(const std::string & s) { return GGML_TYPE_COUNT; } - static cmd_params parse_cmd_params(int argc, char ** argv) { - cmd_params params; - std::string arg; - bool invalid_param = false; - const std::string arg_prefix = "--"; - const char split_delim = ','; - - params.verbose = cmd_params_defaults.verbose; - params.output_format = cmd_params_defaults.output_format; + cmd_params params; + std::string arg; + bool invalid_param = false; + const std::string arg_prefix = "--"; + const char split_delim = ','; + + params.verbose = cmd_params_defaults.verbose; + params.output_format = cmd_params_defaults.output_format; params.output_format_stderr = cmd_params_defaults.output_format_stderr; - params.reps = cmd_params_defaults.reps; - params.numa = cmd_params_defaults.numa; - params.prio = cmd_params_defaults.prio; - params.delay = cmd_params_defaults.delay; - params.progress = cmd_params_defaults.progress; + params.reps = cmd_params_defaults.reps; + params.numa = cmd_params_defaults.numa; + params.prio = cmd_params_defaults.prio; + params.delay = cmd_params_defaults.delay; + params.progress = cmd_params_defaults.progress; for (int i = 1; i < argc; i++) { arg = argv[i]; @@ -338,7 +365,7 @@ static cmd_params parse_cmd_params(int argc, char ** argv) { invalid_param = true; break; } - params.n_pg.push_back({std::stoi(p[0]), std::stoi(p[1])}); + params.n_pg.push_back({ std::stoi(p[0]), std::stoi(p[1]) }); } else if (arg == "-b" || arg == "--batch-size") { if (++i >= argc) { invalid_param = true; @@ -358,7 +385,7 @@ static cmd_params parse_cmd_params(int argc, char ** argv) { invalid_param = true; break; } - auto p = string_split(argv[i], split_delim); + auto p = string_split(argv[i], split_delim); std::vector types; for (const auto & t : p) { ggml_type gt = ggml_type_from_name(t); @@ -377,7 +404,7 @@ static cmd_params parse_cmd_params(int argc, char ** argv) { invalid_param = true; break; } - auto p = string_split(argv[i], split_delim); + auto p = string_split(argv[i], split_delim); std::vector types; for (const auto & t : p) { ggml_type gt = ggml_type_from_name(t); @@ -437,7 +464,7 @@ static cmd_params parse_cmd_params(int argc, char ** argv) { invalid_param = true; break; } - auto p = string_split(argv[i], split_delim); + auto p = string_split(argv[i], split_delim); std::vector modes; for (const auto & m : p) { llama_split_mode mode; @@ -476,10 +503,16 @@ static cmd_params parse_cmd_params(int argc, char ** argv) { break; } else { std::string value(argv[i]); - /**/ if (value == "distribute" || value == "" ) { params.numa = GGML_NUMA_STRATEGY_DISTRIBUTE; } - else if (value == "isolate") { params.numa = GGML_NUMA_STRATEGY_ISOLATE; } - else if (value == "numactl") { params.numa = GGML_NUMA_STRATEGY_NUMACTL; } - else { invalid_param = true; break; } + /**/ if (value == "distribute" || value == "") { + params.numa = GGML_NUMA_STRATEGY_DISTRIBUTE; + } else if (value == "isolate") { + params.numa = GGML_NUMA_STRATEGY_ISOLATE; + } else if (value == "numactl") { + params.numa = GGML_NUMA_STRATEGY_NUMACTL; + } else { + invalid_param = true; + break; + } } } else if (arg == "-fa" || arg == "--flash-attn") { if (++i >= argc) { @@ -509,9 +542,9 @@ static cmd_params parse_cmd_params(int argc, char ** argv) { } for (auto ts : string_split(argv[i], split_delim)) { // split string by ; and / - const std::regex regex{R"([;/]+)"}; - std::sregex_token_iterator it{ts.begin(), ts.end(), regex, -1}; - std::vector split_arg{it, {}}; + const std::regex regex{ R"([;/]+)" }; + std::sregex_token_iterator it{ ts.begin(), ts.end(), regex, -1 }; + std::vector split_arg{ it, {} }; GGML_ASSERT(split_arg.size() <= llama_max_devices()); std::vector tensor_split(llama_max_devices()); @@ -570,52 +603,94 @@ static cmd_params parse_cmd_params(int argc, char ** argv) { } // set defaults - if (params.model.empty()) { params.model = cmd_params_defaults.model; } - if (params.n_prompt.empty()) { params.n_prompt = cmd_params_defaults.n_prompt; } - if (params.n_gen.empty()) { params.n_gen = cmd_params_defaults.n_gen; } - if (params.n_pg.empty()) { params.n_pg = cmd_params_defaults.n_pg; } - if (params.n_batch.empty()) { params.n_batch = cmd_params_defaults.n_batch; } - if (params.n_ubatch.empty()) { params.n_ubatch = cmd_params_defaults.n_ubatch; } - if (params.type_k.empty()) { params.type_k = cmd_params_defaults.type_k; } - if (params.type_v.empty()) { params.type_v = cmd_params_defaults.type_v; } - if (params.n_gpu_layers.empty()) { params.n_gpu_layers = cmd_params_defaults.n_gpu_layers; } - if (params.rpc_servers.empty()) { params.rpc_servers = cmd_params_defaults.rpc_servers; } - if (params.split_mode.empty()) { params.split_mode = cmd_params_defaults.split_mode; } - if (params.main_gpu.empty()) { params.main_gpu = cmd_params_defaults.main_gpu; } - if (params.no_kv_offload.empty()){ params.no_kv_offload = cmd_params_defaults.no_kv_offload; } - if (params.flash_attn.empty()) { params.flash_attn = cmd_params_defaults.flash_attn; } - if (params.tensor_split.empty()) { params.tensor_split = cmd_params_defaults.tensor_split; } - if (params.use_mmap.empty()) { params.use_mmap = cmd_params_defaults.use_mmap; } - if (params.embeddings.empty()) { params.embeddings = cmd_params_defaults.embeddings; } - if (params.n_threads.empty()) { params.n_threads = cmd_params_defaults.n_threads; } - if (params.cpu_mask.empty()) { params.cpu_mask = cmd_params_defaults.cpu_mask; } - if (params.cpu_strict.empty()) { params.cpu_strict = cmd_params_defaults.cpu_strict; } - if (params.poll.empty()) { params.poll = cmd_params_defaults.poll; } + if (params.model.empty()) { + params.model = cmd_params_defaults.model; + } + if (params.n_prompt.empty()) { + params.n_prompt = cmd_params_defaults.n_prompt; + } + if (params.n_gen.empty()) { + params.n_gen = cmd_params_defaults.n_gen; + } + if (params.n_pg.empty()) { + params.n_pg = cmd_params_defaults.n_pg; + } + if (params.n_batch.empty()) { + params.n_batch = cmd_params_defaults.n_batch; + } + if (params.n_ubatch.empty()) { + params.n_ubatch = cmd_params_defaults.n_ubatch; + } + if (params.type_k.empty()) { + params.type_k = cmd_params_defaults.type_k; + } + if (params.type_v.empty()) { + params.type_v = cmd_params_defaults.type_v; + } + if (params.n_gpu_layers.empty()) { + params.n_gpu_layers = cmd_params_defaults.n_gpu_layers; + } + if (params.rpc_servers.empty()) { + params.rpc_servers = cmd_params_defaults.rpc_servers; + } + if (params.split_mode.empty()) { + params.split_mode = cmd_params_defaults.split_mode; + } + if (params.main_gpu.empty()) { + params.main_gpu = cmd_params_defaults.main_gpu; + } + if (params.no_kv_offload.empty()) { + params.no_kv_offload = cmd_params_defaults.no_kv_offload; + } + if (params.flash_attn.empty()) { + params.flash_attn = cmd_params_defaults.flash_attn; + } + if (params.tensor_split.empty()) { + params.tensor_split = cmd_params_defaults.tensor_split; + } + if (params.use_mmap.empty()) { + params.use_mmap = cmd_params_defaults.use_mmap; + } + if (params.embeddings.empty()) { + params.embeddings = cmd_params_defaults.embeddings; + } + if (params.n_threads.empty()) { + params.n_threads = cmd_params_defaults.n_threads; + } + if (params.cpu_mask.empty()) { + params.cpu_mask = cmd_params_defaults.cpu_mask; + } + if (params.cpu_strict.empty()) { + params.cpu_strict = cmd_params_defaults.cpu_strict; + } + if (params.poll.empty()) { + params.poll = cmd_params_defaults.poll; + } return params; } struct cmd_params_instance { - std::string model; - int n_prompt; - int n_gen; - int n_batch; - int n_ubatch; - ggml_type type_k; - ggml_type type_v; - int n_threads; - std::string cpu_mask; - bool cpu_strict; - int poll; - int n_gpu_layers; - std::string rpc_servers; - llama_split_mode split_mode; - int main_gpu; - bool no_kv_offload; - bool flash_attn; + std::string model; + int n_prompt; + int n_gen; + int n_batch; + int n_ubatch; + ggml_type type_k; + ggml_type type_v; + int n_threads; + std::string cpu_mask; + bool cpu_strict; + int poll; + int n_gpu_layers; + std::string rpc_servers; + llama_split_mode split_mode; + int main_gpu; + bool no_kv_offload; + bool flash_attn; std::vector tensor_split; - bool use_mmap; - bool embeddings; + bool use_mmap; + bool embeddings; llama_model_params to_llama_mparams() const { llama_model_params mparams = llama_model_default_params(); @@ -624,35 +699,31 @@ struct cmd_params_instance { if (!rpc_servers.empty()) { mparams.rpc_servers = rpc_servers.c_str(); } - mparams.split_mode = split_mode; - mparams.main_gpu = main_gpu; + mparams.split_mode = split_mode; + mparams.main_gpu = main_gpu; mparams.tensor_split = tensor_split.data(); - mparams.use_mmap = use_mmap; + mparams.use_mmap = use_mmap; return mparams; } bool equal_mparams(const cmd_params_instance & other) const { - return model == other.model && - n_gpu_layers == other.n_gpu_layers && - rpc_servers == other.rpc_servers && - split_mode == other.split_mode && - main_gpu == other.main_gpu && - use_mmap == other.use_mmap && + return model == other.model && n_gpu_layers == other.n_gpu_layers && rpc_servers == other.rpc_servers && + split_mode == other.split_mode && main_gpu == other.main_gpu && use_mmap == other.use_mmap && tensor_split == other.tensor_split; } llama_context_params to_llama_cparams() const { llama_context_params cparams = llama_context_default_params(); - cparams.n_ctx = n_prompt + n_gen; - cparams.n_batch = n_batch; - cparams.n_ubatch = n_ubatch; - cparams.type_k = type_k; - cparams.type_v = type_v; + cparams.n_ctx = n_prompt + n_gen; + cparams.n_batch = n_batch; + cparams.n_ubatch = n_ubatch; + cparams.type_k = type_k; + cparams.type_v = type_v; cparams.offload_kqv = !no_kv_offload; - cparams.flash_attn = flash_attn; - cparams.embeddings = embeddings; + cparams.flash_attn = flash_attn; + cparams.embeddings = embeddings; return cparams; } @@ -662,6 +733,7 @@ static std::vector get_cmd_params_instances(const cmd_param std::vector instances; // this ordering minimizes the number of times that each model needs to be reloaded + // clang-format off for (const auto & m : params.model) for (const auto & nl : params.n_gpu_layers) for (const auto & rpc : params.rpc_servers) @@ -767,100 +839,94 @@ static std::vector get_cmd_params_instances(const cmd_param instances.push_back(instance); } } + // clang-format on return instances; } struct test { static const std::string build_commit; - static const int build_number; + static const int build_number; static const std::string cpu_info; static const std::string gpu_info; - std::string model_filename; - std::string model_type; - uint64_t model_size; - uint64_t model_n_params; - int n_batch; - int n_ubatch; - int n_threads; - std::string cpu_mask; - bool cpu_strict; - int poll; - ggml_type type_k; - ggml_type type_v; - int n_gpu_layers; - llama_split_mode split_mode; - int main_gpu; - bool no_kv_offload; - bool flash_attn; - std::vector tensor_split; - bool use_mmap; - bool embeddings; - int n_prompt; - int n_gen; - std::string test_time; - std::vector samples_ns; + std::string model_filename; + std::string model_type; + uint64_t model_size; + uint64_t model_n_params; + int n_batch; + int n_ubatch; + int n_threads; + std::string cpu_mask; + bool cpu_strict; + int poll; + ggml_type type_k; + ggml_type type_v; + int n_gpu_layers; + llama_split_mode split_mode; + int main_gpu; + bool no_kv_offload; + bool flash_attn; + std::vector tensor_split; + bool use_mmap; + bool embeddings; + int n_prompt; + int n_gen; + std::string test_time; + std::vector samples_ns; test(const cmd_params_instance & inst, const llama_model * lmodel, const llama_context * ctx) { model_filename = inst.model; char buf[128]; llama_model_desc(lmodel, buf, sizeof(buf)); - model_type = buf; - model_size = llama_model_size(lmodel); + model_type = buf; + model_size = llama_model_size(lmodel); model_n_params = llama_model_n_params(lmodel); - n_batch = inst.n_batch; - n_ubatch = inst.n_ubatch; - n_threads = inst.n_threads; - cpu_mask = inst.cpu_mask; - cpu_strict = inst.cpu_strict; - poll = inst.poll; - type_k = inst.type_k; - type_v = inst.type_v; - n_gpu_layers = inst.n_gpu_layers; - split_mode = inst.split_mode; - main_gpu = inst.main_gpu; - no_kv_offload = inst.no_kv_offload; - flash_attn = inst.flash_attn; - tensor_split = inst.tensor_split; - use_mmap = inst.use_mmap; - embeddings = inst.embeddings; - n_prompt = inst.n_prompt; - n_gen = inst.n_gen; + n_batch = inst.n_batch; + n_ubatch = inst.n_ubatch; + n_threads = inst.n_threads; + cpu_mask = inst.cpu_mask; + cpu_strict = inst.cpu_strict; + poll = inst.poll; + type_k = inst.type_k; + type_v = inst.type_v; + n_gpu_layers = inst.n_gpu_layers; + split_mode = inst.split_mode; + main_gpu = inst.main_gpu; + no_kv_offload = inst.no_kv_offload; + flash_attn = inst.flash_attn; + tensor_split = inst.tensor_split; + use_mmap = inst.use_mmap; + embeddings = inst.embeddings; + n_prompt = inst.n_prompt; + n_gen = inst.n_gen; // RFC 3339 date-time format - time_t t = time(NULL); + time_t t = time(NULL); std::strftime(buf, sizeof(buf), "%FT%TZ", gmtime(&t)); test_time = buf; (void) ctx; } - uint64_t avg_ns() const { - return ::avg(samples_ns); - } + uint64_t avg_ns() const { return ::avg(samples_ns); } - uint64_t stdev_ns() const { - return ::stdev(samples_ns); - } + uint64_t stdev_ns() const { return ::stdev(samples_ns); } std::vector get_ts() const { - int n_tokens = n_prompt + n_gen; + int n_tokens = n_prompt + n_gen; std::vector ts; - std::transform(samples_ns.begin(), samples_ns.end(), std::back_inserter(ts), [n_tokens](uint64_t t) { return 1e9 * n_tokens / t; }); + std::transform(samples_ns.begin(), samples_ns.end(), std::back_inserter(ts), + [n_tokens](uint64_t t) { return 1e9 * n_tokens / t; }); return ts; } - double avg_ts() const { - return ::avg(get_ts()); - } + double avg_ts() const { return ::avg(get_ts()); } - double stdev_ts() const { - return ::stdev(get_ts()); - } + double stdev_ts() const { return ::stdev(get_ts()); } static std::string get_backend() { std::vector backends; for (size_t i = 0; i < ggml_backend_reg_count(); i++) { - auto * reg = ggml_backend_reg_get(i); + auto * reg = ggml_backend_reg_get(i); std::string name = ggml_backend_reg_name(reg); if (name != "CPU") { backends.push_back(ggml_backend_reg_name(reg)); @@ -871,36 +937,27 @@ struct test { static const std::vector & get_fields() { static const std::vector fields = { - "build_commit", "build_number", - "cpu_info", "gpu_info", "backends", - "model_filename", "model_type", "model_size", "model_n_params", - "n_batch", "n_ubatch", - "n_threads", "cpu_mask", "cpu_strict", "poll", - "type_k", "type_v", - "n_gpu_layers", "split_mode", - "main_gpu", "no_kv_offload", "flash_attn", - "tensor_split", "use_mmap", "embeddings", - "n_prompt", "n_gen", "test_time", - "avg_ns", "stddev_ns", - "avg_ts", "stddev_ts", + "build_commit", "build_number", "cpu_info", "gpu_info", "backends", "model_filename", + "model_type", "model_size", "model_n_params", "n_batch", "n_ubatch", "n_threads", + "cpu_mask", "cpu_strict", "poll", "type_k", "type_v", "n_gpu_layers", + "split_mode", "main_gpu", "no_kv_offload", "flash_attn", "tensor_split", "use_mmap", + "embeddings", "n_prompt", "n_gen", "test_time", "avg_ns", "stddev_ns", + "avg_ts", "stddev_ts", }; return fields; } - enum field_type {STRING, BOOL, INT, FLOAT}; + enum field_type { STRING, BOOL, INT, FLOAT }; static field_type get_field_type(const std::string & field) { - if (field == "build_number" || field == "n_batch" || field == "n_ubatch" || - field == "n_threads" || field == "poll" || - field == "model_size" || field == "model_n_params" || - field == "n_gpu_layers" || field == "main_gpu" || - field == "n_prompt" || field == "n_gen" || - field == "avg_ns" || field == "stddev_ns") { + if (field == "build_number" || field == "n_batch" || field == "n_ubatch" || field == "n_threads" || + field == "poll" || field == "model_size" || field == "model_n_params" || field == "n_gpu_layers" || + field == "main_gpu" || field == "n_prompt" || field == "n_gen" || field == "avg_ns" || + field == "stddev_ns") { return INT; } - if (field == "f16_kv" || field == "no_kv_offload" || - field == "cpu_strict" || - field == "flash_attn" || field == "use_mmap" || field == "embeddings") { + if (field == "f16_kv" || field == "no_kv_offload" || field == "cpu_strict" || field == "flash_attn" || + field == "use_mmap" || field == "embeddings") { return BOOL; } if (field == "avg_ts" || field == "stddev_ts") { @@ -911,7 +968,7 @@ struct test { std::vector get_values() const { std::string tensor_split_str; - int max_nonzero = 0; + int max_nonzero = 0; for (size_t i = 0; i < llama_max_devices(); i++) { if (tensor_split[i] > 0) { max_nonzero = i; @@ -925,29 +982,47 @@ struct test { tensor_split_str += "/"; } } - std::vector values = { - build_commit, std::to_string(build_number), - cpu_info, gpu_info, get_backend(), - model_filename, model_type, std::to_string(model_size), std::to_string(model_n_params), - std::to_string(n_batch), std::to_string(n_ubatch), - std::to_string(n_threads), cpu_mask, std::to_string(cpu_strict), std::to_string(poll), - ggml_type_name(type_k), ggml_type_name(type_v), - std::to_string(n_gpu_layers), split_mode_str(split_mode), - std::to_string(main_gpu), std::to_string(no_kv_offload), std::to_string(flash_attn), - tensor_split_str, std::to_string(use_mmap), std::to_string(embeddings), - std::to_string(n_prompt), std::to_string(n_gen), test_time, - std::to_string(avg_ns()), std::to_string(stdev_ns()), - std::to_string(avg_ts()), std::to_string(stdev_ts()) - }; + std::vector values = { build_commit, + std::to_string(build_number), + cpu_info, + gpu_info, + get_backend(), + model_filename, + model_type, + std::to_string(model_size), + std::to_string(model_n_params), + std::to_string(n_batch), + std::to_string(n_ubatch), + std::to_string(n_threads), + cpu_mask, + std::to_string(cpu_strict), + std::to_string(poll), + ggml_type_name(type_k), + ggml_type_name(type_v), + std::to_string(n_gpu_layers), + split_mode_str(split_mode), + std::to_string(main_gpu), + std::to_string(no_kv_offload), + std::to_string(flash_attn), + tensor_split_str, + std::to_string(use_mmap), + std::to_string(embeddings), + std::to_string(n_prompt), + std::to_string(n_gen), + test_time, + std::to_string(avg_ns()), + std::to_string(stdev_ns()), + std::to_string(avg_ts()), + std::to_string(stdev_ts()) }; return values; } std::map get_map() const { std::map map; - auto fields = get_fields(); - auto values = get_values(); - std::transform(fields.begin(), fields.end(), values.begin(), - std::inserter(map, map.end()), std::make_pair); + auto fields = get_fields(); + auto values = get_values(); + std::transform(fields.begin(), fields.end(), values.begin(), std::inserter(map, map.end()), + std::make_pair); return map; } }; @@ -961,9 +1036,12 @@ struct printer { virtual ~printer() {} FILE * fout; + virtual void print_header(const cmd_params & params) { (void) params; } + virtual void print_test(const test & t) = 0; - virtual void print_footer() { } + + virtual void print_footer() {} }; struct csv_printer : public printer { @@ -979,7 +1057,7 @@ struct csv_printer : public printer { return escaped; } - void print_header(const cmd_params & params) override { + void print_header(const cmd_params & params) override { std::vector fields = test::get_fields(); fprintf(fout, "%s\n", join(fields, ",").c_str()); (void) params; @@ -992,7 +1070,6 @@ struct csv_printer : public printer { } }; - static std::string escape_json(const std::string & value) { std::string escaped; for (auto c : value) { @@ -1000,7 +1077,7 @@ static std::string escape_json(const std::string & value) { escaped += "\\\""; } else if (c == '\\') { escaped += "\\\\"; - } else if (c <= 0x1f) { + } else if (c <= 0x1f) { char buf[8]; snprintf(buf, sizeof(buf), "\\u%04x", c); escaped += buf; @@ -1033,7 +1110,8 @@ struct json_printer : public printer { void print_fields(const std::vector & fields, const std::vector & values) { assert(fields.size() == values.size()); for (size_t i = 0; i < fields.size(); i++) { - fprintf(fout, " \"%s\": %s,\n", fields.at(i).c_str(), format_json_value(fields.at(i), values.at(i)).c_str()); + fprintf(fout, " \"%s\": %s,\n", fields.at(i).c_str(), + format_json_value(fields.at(i), values.at(i)).c_str()); } } @@ -1051,12 +1129,9 @@ struct json_printer : public printer { fflush(fout); } - void print_footer() override { - fprintf(fout, "\n]\n"); - } + void print_footer() override { fprintf(fout, "\n]\n"); } }; - struct jsonl_printer : public printer { void print_fields(const std::vector & fields, const std::vector & values) { assert(fields.size() == values.size()); @@ -1116,7 +1191,7 @@ struct markdown_printer : public printer { return 13; } - int width = std::max((int)field.length(), 10); + int width = std::max((int) field.length(), 10); if (test::get_field_type(field) == test::STRING) { return -width; @@ -1230,18 +1305,18 @@ struct markdown_printer : public printer { fprintf(fout, "|"); for (const auto & field : fields) { std::string value; - char buf[128]; + char buf[128]; if (field == "model") { value = t.model_type; } else if (field == "size") { - if (t.model_size < 1024*1024*1024) { + if (t.model_size < 1024 * 1024 * 1024) { snprintf(buf, sizeof(buf), "%.2f MiB", t.model_size / 1024.0 / 1024.0); } else { snprintf(buf, sizeof(buf), "%.2f GiB", t.model_size / 1024.0 / 1024.0 / 1024.0); } value = buf; } else if (field == "params") { - if (t.model_n_params < 1000*1000*1000) { + if (t.model_n_params < 1000 * 1000 * 1000) { snprintf(buf, sizeof(buf), "%.2f M", t.model_n_params / 1e6); } else { snprintf(buf, sizeof(buf), "%.2f B", t.model_n_params / 1e9); @@ -1303,7 +1378,8 @@ struct sql_printer : public printer { std::vector fields = test::get_fields(); fprintf(fout, "CREATE TABLE IF NOT EXISTS test (\n"); for (size_t i = 0; i < fields.size(); i++) { - fprintf(fout, " %s %s%s\n", fields.at(i).c_str(), get_sql_field_type(fields.at(i)).c_str(), i < fields.size() - 1 ? "," : ""); + fprintf(fout, " %s %s%s\n", fields.at(i).c_str(), get_sql_field_type(fields.at(i)).c_str(), + i < fields.size() - 1 ? "," : ""); } fprintf(fout, ");\n"); fprintf(fout, "\n"); @@ -1324,8 +1400,8 @@ struct sql_printer : public printer { static void test_prompt(llama_context * ctx, int n_prompt, int n_batch, int n_threads) { llama_set_n_threads(ctx, n_threads, n_threads); - const llama_model * model = llama_get_model(ctx); - const int32_t n_vocab = llama_n_vocab(model); + const llama_model * model = llama_get_model(ctx); + const int32_t n_vocab = llama_n_vocab(model); std::vector tokens(n_batch); @@ -1333,7 +1409,7 @@ static void test_prompt(llama_context * ctx, int n_prompt, int n_batch, int n_th while (n_processed < n_prompt) { int n_tokens = std::min(n_prompt - n_processed, n_batch); - tokens[0] = n_processed == 0 && llama_add_bos_token(model) ? llama_token_bos(model) : std::rand() % n_vocab; + tokens[0] = n_processed == 0 && llama_add_bos_token(model) ? llama_token_bos(model) : std::rand() % n_vocab; for (int i = 1; i < n_tokens; i++) { tokens[i] = std::rand() % n_vocab; } @@ -1347,8 +1423,8 @@ static void test_prompt(llama_context * ctx, int n_prompt, int n_batch, int n_th static void test_gen(llama_context * ctx, int n_gen, int n_threads) { llama_set_n_threads(ctx, n_threads, n_threads); - const llama_model * model = llama_get_model(ctx); - const int32_t n_vocab = llama_n_vocab(model); + const llama_model * model = llama_get_model(ctx); + const int32_t n_vocab = llama_n_vocab(model); llama_token token = llama_add_bos_token(model) ? llama_token_bos(model) : std::rand() % n_vocab; @@ -1411,7 +1487,7 @@ int main(int argc, char ** argv) { set_process_priority(params.prio); // initialize printer - std::unique_ptr p = create_printer(params.output_format); + std::unique_ptr p = create_printer(params.output_format); std::unique_ptr p_err = create_printer(params.output_format_stderr); if (p) { @@ -1426,13 +1502,13 @@ int main(int argc, char ** argv) { std::vector params_instances = get_cmd_params_instances(params); - llama_model * lmodel = nullptr; + llama_model * lmodel = nullptr; const cmd_params_instance * prev_inst = nullptr; - int params_idx = 0; + int params_idx = 0; auto params_count = params_instances.size(); for (const auto & inst : params_instances) { - params_idx ++; + params_idx++; if (params.progress) { fprintf(stderr, "llama-bench: benchmark %d/%ld: starting\n", params_idx, params_count); } @@ -1475,7 +1551,7 @@ int main(int argc, char ** argv) { tpp.poll = t.poll; tpp.prio = params.prio; - struct ggml_threadpool* threadpool = ggml_threadpool_new(&tpp); + struct ggml_threadpool * threadpool = ggml_threadpool_new(&tpp); if (!threadpool) { fprintf(stderr, "%s: threadpool create failed : n_threads %d\n", __func__, tpp.n_threads); exit(1); @@ -1505,13 +1581,15 @@ int main(int argc, char ** argv) { if (t.n_prompt > 0) { if (params.progress) { - fprintf(stderr, "llama-bench: benchmark %d/%ld: prompt run %d/%d\n", params_idx, params_count, i + 1, params.reps); + fprintf(stderr, "llama-bench: benchmark %d/%ld: prompt run %d/%d\n", params_idx, params_count, + i + 1, params.reps); } test_prompt(ctx, t.n_prompt, t.n_batch, t.n_threads); } if (t.n_gen > 0) { if (params.progress) { - fprintf(stderr, "llama-bench: benchmark %d/%ld: generation run %d/%d\n", params_idx, params_count, i + 1, params.reps); + fprintf(stderr, "llama-bench: benchmark %d/%ld: generation run %d/%d\n", params_idx, params_count, + i + 1, params.reps); } test_gen(ctx, t.n_gen, t.n_threads); } From f10765723276d96a318cfdb44edaa9232f3172b4 Mon Sep 17 00:00:00 2001 From: bandoti <141645996+bandoti@users.noreply.github.com> Date: Wed, 20 Nov 2024 12:22:19 -0400 Subject: [PATCH 023/245] cmake: add link dependencies to cmake find pkg (#10433) * cmake pkg: find accelerate, openmp, memkind libs * cmake pkg: find BLAS libs * try BLAS_LIBRARIES instead * Add BLAS link opts * Add more link deps. and set GGML_ vars --- cmake/llama-config.cmake.in | 136 +++++++++++++++++++++++++++--------- 1 file changed, 104 insertions(+), 32 deletions(-) diff --git a/cmake/llama-config.cmake.in b/cmake/llama-config.cmake.in index 28a8c18b65176..5c55bc6b822a6 100644 --- a/cmake/llama-config.cmake.in +++ b/cmake/llama-config.cmake.in @@ -3,12 +3,60 @@ set(LLAMA_BUILD_COMMIT @LLAMA_BUILD_COMMIT@) set(LLAMA_BUILD_NUMBER @LLAMA_BUILD_NUMBER@) set(LLAMA_SHARED_LIB @BUILD_SHARED_LIBS@) +set(GGML_STATIC @GGML_STATIC@) +set(GGML_NATIVE @GGML_NATIVE@) +set(GGML_LTO @GGML_LTO@) +set(GGML_CCACHE @GGML_CCACHE@) +set(GGML_AVX @GGML_AVX@) +set(GGML_AVX2 @GGML_AVX2@) +set(GGML_AVX512 @GGML_AVX512@) +set(GGML_AVX512_VBMI @GGML_AVX512_VBMI@) +set(GGML_AVX512_VNNI @GGML_AVX512_VNNI@) +set(GGML_AVX512_BF16 @GGML_AVX512_BF16@) +set(GGML_AMX_TILE @GGML_AMX_TILE@) +set(GGML_AMX_INT8 @GGML_AMX_INT8@) +set(GGML_AMX_BF16 @GGML_AMX_BF16@) +set(GGML_FMA @GGML_FMA@) +set(GGML_LASX @GGML_LASX@) +set(GGML_LSX @GGML_LSX@) +set(GGML_RVV @GGML_RVV@) +set(GGML_SVE @GGML_SVE@) + set(GGML_ACCELERATE @GGML_ACCELERATE@) +set(GGML_OPENMP @GGML_OPENMP@) +set(GGML_CPU_HBM @GGML_CPU_HBM@) +set(GGML_BLAS_VENDOR @GGML_BLAS_VENDOR@) + +set(GGML_CUDA_FORCE_MMQ @GGML_CUDA_FORCE_MMQ@) +set(GGML_CUDA_FORCE_CUBLAS @GGML_CUDA_FORCE_CUBLAS@) +set(GGML_CUDA_F16 @GGML_CUDA_F16@) +set(GGML_CUDA_PEER_MAX_BATCH_SIZE @GGML_CUDA_PEER_MAX_BATCH_SIZE@) +set(GGML_CUDA_NO_PEER_COPY @GGML_CUDA_NO_PEER_COPY@) +set(GGML_CUDA_NO_VMM @GGML_CUDA_NO_VMM@) +set(GGML_CUDA_FA_ALL_QUANTS @GGML_CUDA_FA_ALL_QUANTS@) +set(GGML_CUDA_GRAPHS @GGML_CUDA_GRAPHS@) + +set(GGML_HIP_UMA @GGML_HIP_UMA@) + set(GGML_VULKAN_CHECK_RESULTS @GGML_VULKAN_CHECK_RESULTS@) -set(GGML_VULKAN_DEBUG @GGML_VULKAN_DEBUG@) -set(GGML_VULKAN_MEMORY_DEBUG @GGML_VULKAN_MEMORY_DEBUG@) -set(GGML_VULKAN_VALIDATE @GGML_VULKAN_VALIDATE@) -set(GGML_OPENMP @GGML_OPENMP@) +set(GGML_VULKAN_DEBUG @GGML_VULKAN_DEBUG@) +set(GGML_VULKAN_MEMORY_DEBUG @GGML_VULKAN_MEMORY_DEBUG@) +set(GGML_VULKAN_SHADER_DEBUG_INFO @GGML_VULKAN_SHADER_DEBUG_INFO@) +set(GGML_VULKAN_PERF @GGML_VULKAN_PERF@) +set(GGML_VULKAN_VALIDATE @GGML_VULKAN_VALIDATE@) +set(GGML_VULKAN_RUN_TESTS @GGML_VULKAN_RUN_TESTS@) + +set(GGML_METAL_USE_BF16 @GGML_METAL_USE_BF16@) +set(GGML_METAL_NDEBUG @GGML_METAL_NDEBUG@) +set(GGML_METAL_SHADER_DEBUG @GGML_METAL_SHADER_DEBUG@) +set(GGML_METAL_EMBED_LIBRARY @GGML_METAL_EMBED_LIBRARY@) +set(GGML_METAL_MACOSX_VERSION_MIN @GGML_METAL_MACOSX_VERSION_MIN@) +set(GGML_METAL_STD @GGML_METAL_STD@) + +set(GGML_SYCL_F16 @GGML_SYCL_F16@) +set(GGML_SYCL_TARGET @GGML_SYCL_TARGET@) +set(GGML_SYCL_DEVICE_ARCH @GGML_SYCL_DEVICE_ARCH@) + @PACKAGE_INIT@ @@ -20,6 +68,7 @@ find_package(Threads REQUIRED) set(_llama_transient_defines "@GGML_TRANSIENT_DEFINES@") set(_llama_link_deps "") +set(_llama_link_opts "") foreach(_ggml_lib ggml ggml-base) string(REPLACE "-" "_" _ggml_lib_var "${_ggml_lib}_LIBRARY") find_library(${_ggml_lib_var} ${_ggml_lib} @@ -49,41 +98,63 @@ foreach(backend amx blas cann cpu cuda hip kompute metal musa rpc sycl vulkan) endif() endforeach() -if (APPLE AND GGML_ACCELERATE) - find_library(ACCELERATE_FRAMEWORK Accelerate REQUIRED) -endif() +if (NOT LLAMA_SHARED_LIB) + if (APPLE AND GGML_ACCELERATE) + find_library(ACCELERATE_FRAMEWORK Accelerate REQUIRED) + list(APPEND _llama_link_deps ${ACCELERATE_FRAMEWORK}) + endif() -if (GGML_BLAS) - find_package(BLAS REQUIRED) -endif() + if (GGML_OPENMP) + find_package(OpenMP REQUIRED) + list(APPEND _llama_link_deps OpenMP::OpenMP_C OpenMP::OpenMP_CXX) + endif() -if (GGML_CUDA) - find_package(CUDAToolkit REQUIRED) -endif() + if (GGML_CPU_HBM) + find_library(memkind memkind REQUIRED) + list(APPEND _llama_link_deps memkind) + endif() -if (GGML_METAL) - find_library(FOUNDATION_LIBRARY Foundation REQUIRED) - find_library(METAL_FRAMEWORK Metal REQUIRED) - find_library(METALKIT_FRAMEWORK MetalKit REQUIRED) -endif() + if (GGML_BLAS) + find_package(BLAS REQUIRED) + list(APPEND _llama_link_deps ${BLAS_LIBRARIES}) + list(APPEND _llama_link_opts ${BLAS_LINKER_FLAGS}) + endif() -if (GGML_VULKAN) - find_package(Vulkan REQUIRED) -endif() + if (GGML_CUDA) + find_package(CUDAToolkit REQUIRED) + endif() -if (GGML_HIP) - find_package(hip REQUIRED) - find_package(hipblas REQUIRED) - find_package(rocblas REQUIRED) -endif() + if (GGML_METAL) + find_library(FOUNDATION_LIBRARY Foundation REQUIRED) + find_library(METAL_FRAMEWORK Metal REQUIRED) + find_library(METALKIT_FRAMEWORK MetalKit REQUIRED) + list(APPEND _llama_link_deps ${FOUNDATION_LIBRARY} + ${METAL_FRAMEWORK} ${METALKIT_FRAMEWORK}) + endif() -if (GGML_SYCL) - find_package(IntelSYCL REQUIRED) - find_package(MKL REQUIRED) -endif() + if (GGML_VULKAN) + find_package(Vulkan REQUIRED) + list(APPEND _llama_link_deps Vulkan::Vulkan) + endif() -if (GGML_OPENMP) - find_package(OpenMP REQUIRED) + if (GGML_HIP) + find_package(hip REQUIRED) + find_package(hipblas REQUIRED) + find_package(rocblas REQUIRED) + list(APPEND _llama_link_deps hip::host roc::rocblas roc::hipblas) + endif() + + if (GGML_SYCL) + find_package(DNNL) + if (${DNNL_FOUND} AND GGML_SYCL_TARGET STREQUAL "INTEL") + list(APPEND _llama_link_deps DNNL::dnnl) + endif() + if (WIN32) + find_package(IntelSYCL REQUIRED) + find_package(MKL REQUIRED) + list(APPEND _llama_link_deps IntelSYCL::SYCL_CXX MKL::MKL MKL::MKL_SYCL) + endif() + endif() endif() find_library(llama_LIBRARY llama @@ -97,6 +168,7 @@ set_target_properties(llama PROPERTIES INTERFACE_INCLUDE_DIRECTORIES "${LLAMA_INCLUDE_DIR}" INTERFACE_LINK_LIBRARIES "${_llama_link_deps}" + INTERFACE_LINK_OPTIONS "${_llama_link_opts}" INTERFACE_COMPILE_DEFINITIONS "${_llama_transient_defines}" IMPORTED_LINK_INTERFACE_LANGUAGES "CXX" IMPORTED_LOCATION "${llama_LIBRARY}" From bd3170323695e91dc0bf50620661c582e583c1a2 Mon Sep 17 00:00:00 2001 From: Jeff Bolz Date: Wed, 20 Nov 2024 13:47:36 -0600 Subject: [PATCH 024/245] vulkan: predicate max operation in soft_max shaders/soft_max (#10437) Fixes #10434 --- ggml/src/ggml-vulkan/vulkan-shaders/soft_max.comp | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/ggml/src/ggml-vulkan/vulkan-shaders/soft_max.comp b/ggml/src/ggml-vulkan/vulkan-shaders/soft_max.comp index f9727679ec5f2..6e20b6411cadc 100644 --- a/ggml/src/ggml-vulkan/vulkan-shaders/soft_max.comp +++ b/ggml/src/ggml-vulkan/vulkan-shaders/soft_max.comp @@ -73,7 +73,9 @@ void soft_max(uint num_iters) { FLOAT_TYPE v = a * p.scale + slope * b; - max_val = max(max_val, v); + if (col < p.KX) { + max_val = max(max_val, v); + } if (idx < DATA_CACHE_SIZE) { data_cache[idx] = v; From 6aefea28c2d65550b64b6f81ce39dd2993529f0c Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Johannes=20G=C3=A4=C3=9Fler?= Date: Wed, 20 Nov 2024 14:56:04 +0100 Subject: [PATCH 025/245] ggml-opt: fix data corruption (ggml/1022) --- ggml/src/ggml-backend.cpp | 2 + ggml/src/ggml-impl.h | 3 + ggml/src/ggml-opt.cpp | 147 +++++++++++++++++-------------------- ggml/src/ggml.c | 94 ++++++++++++++---------- tests/test-backend-ops.cpp | 1 - 5 files changed, 129 insertions(+), 118 deletions(-) diff --git a/ggml/src/ggml-backend.cpp b/ggml/src/ggml-backend.cpp index 9dcde8d11952a..3433d082e775b 100644 --- a/ggml/src/ggml-backend.cpp +++ b/ggml/src/ggml-backend.cpp @@ -252,6 +252,7 @@ void ggml_backend_tensor_get_async(ggml_backend_t backend, const struct ggml_ten } void ggml_backend_tensor_set(struct ggml_tensor * tensor, const void * data, size_t offset, size_t size) { + GGML_ASSERT(tensor); ggml_backend_buffer_t buf = tensor->view_src ? tensor->view_src->buffer : tensor->buffer; if (size == 0) { @@ -266,6 +267,7 @@ void ggml_backend_tensor_set(struct ggml_tensor * tensor, const void * data, siz } void ggml_backend_tensor_get(const struct ggml_tensor * tensor, void * data, size_t offset, size_t size) { + GGML_ASSERT(tensor); ggml_backend_buffer_t buf = tensor->view_src ? tensor->view_src->buffer : tensor->buffer; if (size == 0) { diff --git a/ggml/src/ggml-impl.h b/ggml/src/ggml-impl.h index 92a64fe5a5930..3965be78755a1 100644 --- a/ggml/src/ggml-impl.h +++ b/ggml/src/ggml-impl.h @@ -295,6 +295,9 @@ struct ggml_cgraph { enum ggml_cgraph_eval_order order; }; +// returns a slice of cgraph with nodes [i0, i1) +// the slice does not have leafs or gradients +// if you need the gradients, get them from the original graph struct ggml_cgraph ggml_graph_view(struct ggml_cgraph * cgraph, int i0, int i1); // Memory allocation diff --git a/ggml/src/ggml-opt.cpp b/ggml/src/ggml-opt.cpp index 040205a31bab0..7c3e24103a250 100644 --- a/ggml/src/ggml-opt.cpp +++ b/ggml/src/ggml-opt.cpp @@ -14,51 +14,51 @@ #include struct ggml_opt_dataset { - struct ggml_context * ctx; - ggml_backend_buffer_t buf; - struct ggml_tensor * data; - struct ggml_tensor * labels; + struct ggml_context * ctx = nullptr; + ggml_backend_buffer_t buf = nullptr; + struct ggml_tensor * data = nullptr; + struct ggml_tensor * labels = nullptr; - int64_t ndata; - int64_t ndata_shard; - size_t nbs_data; - size_t nbs_labels; + int64_t ndata = -1; + int64_t ndata_shard = -1; + size_t nbs_data = -1; + size_t nbs_labels = -1; std::vector permutation; }; struct ggml_opt_context { - ggml_backend_sched_t backend_sched; - ggml_cgraph * allocated_graph; - ggml_cgraph * allocated_graph_copy; - struct ggml_context * ctx_static; - struct ggml_context * ctx_static_cpu; - struct ggml_context * ctx_compute; - struct ggml_context * ctx_copy; - ggml_backend_buffer_t buf_static; - ggml_backend_buffer_t buf_static_cpu; + ggml_backend_sched_t backend_sched = nullptr; + ggml_cgraph * allocated_graph = nullptr; + ggml_cgraph * allocated_graph_copy = nullptr; + struct ggml_context * ctx_static = nullptr; + struct ggml_context * ctx_static_cpu = nullptr; + struct ggml_context * ctx_compute = nullptr; + struct ggml_context * ctx_copy = nullptr; + ggml_backend_buffer_t buf_static = nullptr; + ggml_backend_buffer_t buf_static_cpu = nullptr; std::mt19937 rng; - struct ggml_tensor * inputs; - struct ggml_tensor * outputs; - struct ggml_tensor * labels; + struct ggml_tensor * inputs = nullptr; + struct ggml_tensor * outputs = nullptr; + struct ggml_tensor * labels = nullptr; - struct ggml_tensor * loss; - struct ggml_tensor * pred; - struct ggml_tensor * ncorrect; + struct ggml_tensor * loss = nullptr; + struct ggml_tensor * pred = nullptr; + struct ggml_tensor * ncorrect = nullptr; - struct ggml_cgraph * gf; - struct ggml_cgraph * gb_grad; - struct ggml_cgraph * gb_opt; + struct ggml_cgraph * gf = nullptr; + struct ggml_cgraph * gb_grad = nullptr; + struct ggml_cgraph * gb_opt = nullptr; - int64_t iter; - int32_t opt_period; - int32_t opt_i; - bool loss_per_datapoint; + int64_t iter = 1; + int32_t opt_period = 1; + int32_t opt_i = 0; + bool loss_per_datapoint = false; - ggml_opt_get_optimizer_params get_opt_pars; - void * get_opt_pars_ud; - struct ggml_tensor * adamw_params; + ggml_opt_get_optimizer_params get_opt_pars = nullptr; + void * get_opt_pars_ud = nullptr; + struct ggml_tensor * adamw_params = nullptr; }; struct ggml_opt_result { @@ -67,8 +67,8 @@ struct ggml_opt_result { std::vector pred; int64_t ncorrect = 0; - bool loss_per_datapoint = false; - int64_t opt_period = -1; + int64_t opt_period = -1; + bool loss_per_datapoint = false; }; // ====== Dataset ====== @@ -188,11 +188,11 @@ struct ggml_opt_optimizer_params ggml_opt_get_default_optimizer_params(void * us } struct ggml_opt_params ggml_opt_default_params( - ggml_backend_sched_t backend_sched, - struct ggml_context * ctx_compute, - struct ggml_tensor * inputs, - struct ggml_tensor * outputs, - enum ggml_opt_loss_type loss_type) { + ggml_backend_sched_t backend_sched, + struct ggml_context * ctx_compute, + struct ggml_tensor * inputs, + struct ggml_tensor * outputs, + enum ggml_opt_loss_type loss_type) { return { /*backend_sched =*/ backend_sched, /*ctx_compute =*/ ctx_compute, @@ -237,25 +237,33 @@ static ggml_tensor * map_tensor(std::map & tensor_ return new_tensor; } -static ggml_cgraph * dup_graph(ggml_context * ctx, ggml_cgraph * graph) { +static ggml_cgraph * dup_graph(ggml_context * ctx, ggml_cgraph * src) { std::map tensor_map; - ggml_cgraph * new_graph = ggml_new_graph_custom(ctx, GGML_DEFAULT_GRAPH_SIZE, /*grads =*/ true); + ggml_cgraph * dst = ggml_new_graph_custom(ctx, src->size, /*grads =*/ true); - for (int i = 0; i < graph->n_leafs; i++) { - ggml_build_forward_expand(new_graph, map_tensor(tensor_map, ctx, graph->leafs[i])); + for (int i = 0; i < src->n_leafs; i++) { + ggml_build_forward_expand(dst, map_tensor(tensor_map, ctx, src->leafs[i])); } - for (int i = 0; i < graph->n_nodes; i++) { - ggml_build_forward_expand(new_graph, map_tensor(tensor_map, ctx, graph->nodes[i])); + GGML_ASSERT(dst->n_leafs == src->n_leafs); + for (int i = 0; i < src->n_nodes; i++) { + ggml_build_forward_expand(dst, map_tensor(tensor_map, ctx, src->nodes[i])); } - for (int i = 0; i < graph->n_nodes; ++i) { - const size_t igrad_src = ggml_hash_find(&graph->visited_hash_set, graph->nodes[i]); - const size_t igrad_dst = ggml_hash_find(&new_graph->visited_hash_set, new_graph->nodes[i]); - graph->grads[igrad_dst] = new_graph->grads[igrad_src]; - graph->grad_accs[igrad_dst] = new_graph->grad_accs[igrad_src]; + GGML_ASSERT(dst->n_nodes == src->n_nodes); + for (int i = 0; i < src->n_nodes; ++i) { + const size_t igrad_src = ggml_hash_find(&src->visited_hash_set, src->nodes[i]); + const size_t igrad_dst = ggml_hash_find(&dst->visited_hash_set, dst->nodes[i]); + + GGML_ASSERT(igrad_src != GGML_HASHSET_FULL); + GGML_ASSERT(ggml_bitset_get(src->visited_hash_set.used, igrad_src)); + GGML_ASSERT(igrad_dst != GGML_HASHSET_FULL); + GGML_ASSERT(ggml_bitset_get(dst->visited_hash_set.used, igrad_dst)); + + dst->grads[igrad_dst] = src->grads[igrad_src]; + dst->grad_accs[igrad_dst] = src->grad_accs[igrad_src]; } - return new_graph; + return dst; } static void ggml_opt_alloc_graph(ggml_opt_context_t opt_ctx, ggml_cgraph * graph) { @@ -284,18 +292,13 @@ static void ggml_opt_alloc_graph(ggml_opt_context_t opt_ctx, ggml_cgraph * graph ggml_opt_context_t ggml_opt_init(struct ggml_opt_params params) { ggml_opt_context_t result = new struct ggml_opt_context; - result->backend_sched = params.backend_sched; - result->allocated_graph = nullptr; - result->allocated_graph_copy = nullptr; - result->ctx_compute = params.ctx_compute; - result->ctx_copy = nullptr; - result->inputs = params.inputs; - result->outputs = params.outputs; - result->iter = 1; - result->opt_period = params.opt_period; - result->opt_i = 0; - result->get_opt_pars = params.get_opt_pars; - result->get_opt_pars_ud = params.get_opt_pars_ud; + result->backend_sched = params.backend_sched; + result->ctx_compute = params.ctx_compute; + result->inputs = params.inputs; + result->outputs = params.outputs; + result->opt_period = params.opt_period; + result->get_opt_pars = params.get_opt_pars; + result->get_opt_pars_ud = params.get_opt_pars_ud; GGML_ASSERT(result->inputs->data && "the inputs must be allocated statically"); GGML_ASSERT(result->opt_period >= 1); @@ -348,7 +351,6 @@ ggml_opt_context_t ggml_opt_init(struct ggml_opt_params params) { switch (params.loss_type) { case GGML_OPT_LOSS_TYPE_MEAN: { - result->labels = nullptr; result->loss = ggml_sum(result->ctx_static, result->outputs); ggml_set_name(result->loss, "loss_sum"); const float scale = 1.0f / (result->opt_period * ggml_nelements(result->outputs)); @@ -358,7 +360,6 @@ ggml_opt_context_t ggml_opt_init(struct ggml_opt_params params) { break; } case GGML_OPT_LOSS_TYPE_SUM: { - result->labels = nullptr; result->loss = ggml_sum(result->ctx_static, result->outputs); ggml_set_name(result->loss, "loss_sum"); result->loss_per_datapoint = false; @@ -413,14 +414,7 @@ ggml_opt_context_t ggml_opt_init(struct ggml_opt_params params) { } if (params.build_type == GGML_OPT_BUILD_TYPE_FORWARD) { - result->gb_grad = nullptr; - result->gb_opt = nullptr; - result->buf_static = ggml_backend_alloc_ctx_tensors(result->ctx_static, ggml_backend_sched_get_backend(result->backend_sched, 0)); - result->buf_static_cpu = nullptr; - - ggml_opt_alloc_graph(result, result->gf); - return result; } @@ -429,14 +423,8 @@ ggml_opt_context_t ggml_opt_init(struct ggml_opt_params params) { ggml_build_backward_expand(result->ctx_static, result->ctx_compute, result->gb_grad, accumulate); if (params.build_type == GGML_OPT_BUILD_TYPE_GRAD) { - result->gb_opt = nullptr; - result->buf_static = ggml_backend_alloc_ctx_tensors(result->ctx_static, ggml_backend_sched_get_backend(result->backend_sched, 0)); - result->buf_static_cpu = nullptr; - - ggml_opt_alloc_graph(result, result->gb_grad); ggml_graph_reset(result->gb_grad); - return result; } @@ -466,7 +454,6 @@ ggml_opt_context_t ggml_opt_init(struct ggml_opt_params params) { result->buf_static_cpu = ggml_backend_alloc_ctx_tensors_from_buft(result->ctx_static_cpu, ggml_backend_cpu_buffer_type()); - ggml_opt_alloc_graph(result, result->gb_opt); ggml_graph_reset(result->gb_opt); return result; diff --git a/ggml/src/ggml.c b/ggml/src/ggml.c index c797172abe3d5..c0cad48ac645e 100644 --- a/ggml/src/ggml.c +++ b/ggml/src/ggml.c @@ -5019,8 +5019,10 @@ static void ggml_hash_map_free(struct hash_map * map) { } // utility functions to change gradients -// if a is in acc_table, modify gradients in-place and mark result as gradient accumulator -// else if a is in zero_table, replace a +// isrc is the index of tensor in cgraph->visited_has_set.keys +// the corresponding gradient (accumulators) are also at position isrc +// if tensor has a gradient accumulator, modify that accumulator in-place +// else if there is no gradient for tensor, set the corresponding value // else, just add/subtract/etc. the gradients static void ggml_add_or_set( @@ -5028,11 +5030,14 @@ static void ggml_add_or_set( struct ggml_cgraph * cgraph, size_t isrc, struct ggml_tensor * tensor) { + struct ggml_tensor * src = cgraph->visited_hash_set.keys[isrc]; + GGML_ASSERT(src); if (cgraph->grads[isrc]) { - cgraph->grads[isrc] = ggml_add_impl(ctx, cgraph->grads[isrc], tensor, cgraph->grad_accs[isrc]); + cgraph->grads[isrc] = ggml_add_impl(ctx, cgraph->grads[isrc], tensor, /*inplace =*/ cgraph->grad_accs[isrc]); } else { cgraph->grads[isrc] = tensor; } + ggml_format_name(cgraph->grads[isrc], "grad for %s", src->name); ggml_build_forward_expand(cgraph, cgraph->grads[isrc]); } @@ -5040,18 +5045,20 @@ static void ggml_acc_or_set( struct ggml_context * ctx, struct ggml_cgraph * cgraph, size_t isrc, - struct ggml_tensor * src, struct ggml_tensor * tensor, const size_t nb1, const size_t nb2, const size_t nb3, const size_t offset) { + struct ggml_tensor * src = cgraph->visited_hash_set.keys[isrc]; + GGML_ASSERT(src); if (cgraph->grads[isrc]) { cgraph->grads[isrc] = ggml_acc_impl(ctx, cgraph->grads[isrc], tensor, nb1, nb2, nb3, offset, cgraph->grad_accs[isrc]); } else { struct ggml_tensor * a_zero = ggml_scale(ctx, src, 0.0f); // FIXME this is going to produce NaN if a contains inf/NaN cgraph->grads[isrc] = ggml_acc_impl(ctx, a_zero, tensor, nb1, nb2, nb3, offset, false); } + ggml_format_name(cgraph->grads[isrc], "grad for %s", cgraph->visited_hash_set.keys[isrc]->name); ggml_build_forward_expand(cgraph, cgraph->grads[isrc]); } @@ -5059,13 +5066,15 @@ static void ggml_add1_or_set( struct ggml_context * ctx, struct ggml_cgraph * cgraph, size_t isrc, - struct ggml_tensor * src, struct ggml_tensor * tensor) { + struct ggml_tensor * src = cgraph->visited_hash_set.keys[isrc]; + GGML_ASSERT(src); if (cgraph->grads[isrc]) { cgraph->grads[isrc] = ggml_add1_impl(ctx, cgraph->grads[isrc], tensor, cgraph->grad_accs[isrc]); } else { cgraph->grads[isrc] = ggml_repeat(ctx, tensor, src); } + ggml_format_name(cgraph->grads[isrc], "grad for %s", src->name); ggml_build_forward_expand(cgraph, cgraph->grads[isrc]); } @@ -5074,11 +5083,14 @@ static void ggml_sub_or_set( struct ggml_cgraph * cgraph, size_t isrc, struct ggml_tensor * tensor) { + struct ggml_tensor * src = cgraph->visited_hash_set.keys[isrc]; + GGML_ASSERT(src); if (cgraph->grads[isrc]) { cgraph->grads[isrc] = ggml_sub_impl(ctx, cgraph->grads[isrc], tensor, cgraph->grad_accs[isrc]); } else { cgraph->grads[isrc] = ggml_neg(ctx, tensor); } + ggml_format_name(cgraph->grads[isrc], "grad for %s", src->name); ggml_build_forward_expand(cgraph, cgraph->grads[isrc]); } @@ -5095,12 +5107,12 @@ static void ggml_compute_backward( struct ggml_tensor * src1 = tensor->src[1]; struct ggml_tensor * src2 = tensor->src[2]; struct ggml_hash_set * hash_set = &cgraph->visited_hash_set; - const size_t isrc0 = ggml_hash_find(hash_set, src0); - const size_t isrc1 = ggml_hash_find(hash_set, src1); - const size_t isrc2 = ggml_hash_find(hash_set, src2); - const bool src0_needs_grads = isrc0 != GGML_HASHSET_FULL && ggml_bitset_get(hash_set->used, isrc0) && grads_needed[isrc0]; - const bool src1_needs_grads = isrc1 != GGML_HASHSET_FULL && ggml_bitset_get(hash_set->used, isrc1) && grads_needed[isrc1]; - const bool src2_needs_grads = isrc2 != GGML_HASHSET_FULL && ggml_bitset_get(hash_set->used, isrc2) && grads_needed[isrc2]; + const size_t isrc0 = src0 ? ggml_hash_find(hash_set, src0) : (size_t) -1; + const size_t isrc1 = src1 ? ggml_hash_find(hash_set, src1) : (size_t) -1; + const size_t isrc2 = src2 ? ggml_hash_find(hash_set, src2) : (size_t) -1; + const bool src0_needs_grads = src0 && isrc0 != GGML_HASHSET_FULL && ggml_bitset_get(hash_set->used, isrc0) && grads_needed[isrc0]; + const bool src1_needs_grads = src1 && isrc1 != GGML_HASHSET_FULL && ggml_bitset_get(hash_set->used, isrc1) && grads_needed[isrc1]; + const bool src2_needs_grads = src2 && isrc2 != GGML_HASHSET_FULL && ggml_bitset_get(hash_set->used, isrc2) && grads_needed[isrc2]; switch (tensor->op) { case GGML_OP_DUP: { @@ -5200,7 +5212,7 @@ static void ggml_compute_backward( } break; case GGML_OP_SUM: { if (src0_needs_grads) { - ggml_add1_or_set(ctx, cgraph, isrc0, src0, grad); + ggml_add1_or_set(ctx, cgraph, isrc0, grad); } } break; case GGML_OP_SUM_ROWS: { @@ -5210,7 +5222,7 @@ static void ggml_compute_backward( } break; case GGML_OP_MEAN: { if (src0_needs_grads) { - ggml_add1_or_set(ctx, cgraph, isrc0, src0, ggml_scale_impl(ctx, grad, 1.0f/src0->ne[0], false)); + ggml_add1_or_set(ctx, cgraph, isrc0, ggml_scale_impl(ctx, grad, 1.0f/src0->ne[0], false)); } } break; case GGML_OP_REPEAT: { @@ -5363,7 +5375,7 @@ static void ggml_compute_backward( nb3 = (nb3 / n0) * ng; } - ggml_acc_or_set(ctx, cgraph, isrc0, src0, grad, nb1, nb2, nb3, offset); + ggml_acc_or_set(ctx, cgraph, isrc0, grad, nb1, nb2, nb3, offset); } } break; case GGML_OP_PERMUTE: { @@ -5597,10 +5609,9 @@ void ggml_build_backward_expand( const int n_nodes_f = cgraph->n_nodes; - const size_t hash_size = ggml_hash_size(2*cgraph->size); - memset(cgraph->grads, 0, hash_size*sizeof(struct ggml_tensor *)); - memset(cgraph->grad_accs, 0, hash_size*sizeof(struct ggml_tensor *)); - bool * grads_needed = calloc(hash_size, sizeof(bool)); + memset(cgraph->grads, 0, cgraph->visited_hash_set.size*sizeof(struct ggml_tensor *)); + memset(cgraph->grad_accs, 0, cgraph->visited_hash_set.size*sizeof(struct ggml_tensor *)); + bool * grads_needed = calloc(cgraph->visited_hash_set.size, sizeof(bool)); { bool any_params = false; @@ -5621,7 +5632,7 @@ void ggml_build_backward_expand( continue; } - bool node_needs_grad = node->flags & GGML_TENSOR_FLAG_PARAM; + bool node_needs_grad = (node->flags & GGML_TENSOR_FLAG_PARAM) || (node->flags & GGML_TENSOR_FLAG_LOSS); bool ignore_src[GGML_MAX_SRC] = {false}; switch (node->op) { // gradients in node->src[0] for one reason or another have no effect on output gradients @@ -5638,7 +5649,7 @@ void ggml_build_backward_expand( } break; // gradients in node->src[1] for one reason or another have no effect on output gradients - case GGML_OP_CPY: // gradients in CPY target are irrelevant + case GGML_OP_CPY: // gradients in CPY target are irrelevant case GGML_OP_GET_ROWS: // row indices not differentiable case GGML_OP_GET_ROWS_BACK: // same as for GET_ROWS case GGML_OP_ROPE: // positions not differentiable @@ -5665,9 +5676,12 @@ void ggml_build_backward_expand( node->op == GGML_OP_RESHAPE || node->op == GGML_OP_PERMUTE || node->op == GGML_OP_TRANSPOSE); const size_t igrad = ggml_hash_find(&cgraph->visited_hash_set, node); + GGML_ASSERT(igrad != GGML_HASHSET_FULL); + GGML_ASSERT(ggml_bitset_get(cgraph->visited_hash_set.used, igrad)); if ((accumulate && (node->flags & GGML_TENSOR_FLAG_PARAM)) || (node->flags & GGML_TENSOR_FLAG_LOSS)) { - cgraph->grads[igrad] = ggml_dup_tensor(ctx_static, node); - cgraph->grad_accs[igrad] = cgraph->grads[igrad]; + cgraph->grad_accs[igrad] = ggml_dup_tensor(ctx_static, node); + cgraph->grads[igrad] = cgraph->grad_accs[igrad]; + ggml_format_name(cgraph->grad_accs[igrad], "grad acc for %s", node->name); } grads_needed[igrad] = true; } @@ -5761,15 +5775,15 @@ struct ggml_cgraph * ggml_new_graph(struct ggml_context * ctx) { struct ggml_cgraph ggml_graph_view(struct ggml_cgraph * cgraph0, int i0, int i1) { struct ggml_cgraph cgraph = { - /*.size =*/ 0, - /*.n_nodes =*/ i1 - i0, - /*.n_leafs =*/ 0, - /*.nodes =*/ cgraph0->nodes + i0, - /*.grads =*/ cgraph0->grads ? cgraph0->grads + i0 : NULL, - /*.grad_accs =*/ cgraph0->grad_accs ? cgraph0->grad_accs + i0 : NULL, - /*.leafs =*/ NULL, - /*.hash_table =*/ { 0, NULL, NULL }, - /*.order =*/ cgraph0->order, + /*.size =*/ 0, + /*.n_nodes =*/ i1 - i0, + /*.n_leafs =*/ 0, + /*.nodes =*/ cgraph0->nodes + i0, + /*.grads =*/ NULL, // gradients would need visited_hash_set + /*.grad_accs =*/ NULL, + /*.leafs =*/ NULL, + /*.visited_hash_set =*/ { 0, NULL, NULL }, + /*.order =*/ cgraph0->order, }; return cgraph; @@ -5799,12 +5813,22 @@ void ggml_graph_cpy(struct ggml_cgraph * src, struct ggml_cgraph * dst) { } } + if (dst->grads) { + memset(dst->grads, 0, dst->visited_hash_set.size*sizeof(struct ggml_tensor *)); + memset(dst->grad_accs, 0, dst->visited_hash_set.size*sizeof(struct ggml_tensor *)); + } if (src->grads) { GGML_ASSERT(dst->grads != NULL); GGML_ASSERT(dst->grad_accs != NULL); for (int i = 0; i < src->n_nodes; ++i) { const size_t igrad_src = ggml_hash_find(&src->visited_hash_set, src->nodes[i]); const size_t igrad_dst = ggml_hash_find(&dst->visited_hash_set, dst->nodes[i]); + + GGML_ASSERT(igrad_src != GGML_HASHSET_FULL); + GGML_ASSERT(ggml_bitset_get(src->visited_hash_set.used, igrad_src)); + GGML_ASSERT(igrad_dst != GGML_HASHSET_FULL); + GGML_ASSERT(ggml_bitset_get(dst->visited_hash_set.used, igrad_dst)); + dst->grads[igrad_dst] = src->grads[igrad_src]; dst->grad_accs[igrad_dst] = src->grad_accs[igrad_src]; } @@ -5839,12 +5863,8 @@ void ggml_graph_reset(struct ggml_cgraph * cgraph) { if (node->op == GGML_OP_OPT_STEP_ADAMW) { // clear momenta - if (node->src[2]->data) { - ggml_set_zero(node->src[2]); - } - if (node->src[3]->data) { - ggml_set_zero(node->src[3]); - } + ggml_set_zero(node->src[2]); + ggml_set_zero(node->src[3]); } // initial gradients of loss should be 1, 0 otherwise diff --git a/tests/test-backend-ops.cpp b/tests/test-backend-ops.cpp index 01ac7166e0e22..37342c156d7c1 100644 --- a/tests/test-backend-ops.cpp +++ b/tests/test-backend-ops.cpp @@ -819,7 +819,6 @@ struct test_case { } } - // TODO: refactor so that this check is only needed once for (ggml_tensor * t = ggml_get_first_tensor(ctx); t != NULL; t = ggml_get_next_tensor(ctx, t)) { if (!ggml_backend_supports_op(backend, t)) { printf("not supported [%s] ", ggml_backend_name(backend)); From 6f97400020f1550dcb2e02314b161c19768dc05b Mon Sep 17 00:00:00 2001 From: slaren Date: Wed, 20 Nov 2024 13:25:08 +0100 Subject: [PATCH 026/245] ggml/sched : do not skip views in pre-assignments --- ggml/src/ggml-backend.cpp | 3 --- 1 file changed, 3 deletions(-) diff --git a/ggml/src/ggml-backend.cpp b/ggml/src/ggml-backend.cpp index 3433d082e775b..45da0c27daffa 100644 --- a/ggml/src/ggml-backend.cpp +++ b/ggml/src/ggml-backend.cpp @@ -886,9 +886,6 @@ static void ggml_backend_sched_split_graph(ggml_backend_sched_t sched, struct gg for (int i = 0; i < graph->n_nodes; i++) { struct ggml_tensor * node = graph->nodes[i]; int * node_backend_id = &tensor_backend_id(node); - if (ggml_is_view_op(node->op)) { - continue; - } // do not overwrite user assignments if (*node_backend_id == -1) { *node_backend_id = ggml_backend_sched_backend_id_from_cur(sched, node); From 3882f3fd7a5ef1248306b4d7f844dff8032ecd98 Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Thu, 21 Nov 2024 09:22:11 +0200 Subject: [PATCH 027/245] sync : ggml --- scripts/sync-ggml.last | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/scripts/sync-ggml.last b/scripts/sync-ggml.last index e9bd2dbb05e2c..d101d2b57f8fa 100644 --- a/scripts/sync-ggml.last +++ b/scripts/sync-ggml.last @@ -1 +1 @@ -2884dd72fea8922910fe53387c3d17ab928d3a8e +6fcbd60bc72ac3f7ad43f78c87e535f2e6206f58 From 6fb5f1a95925383ec52878934ec14ccad018ab88 Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Thu, 21 Nov 2024 10:22:47 +0200 Subject: [PATCH 028/245] llama : handle KV shift for recurrent models (#10402) ggml-ci --- src/llama.cpp | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/src/llama.cpp b/src/llama.cpp index bfc2b1d5afcbb..407274ba8cc63 100644 --- a/src/llama.cpp +++ b/src/llama.cpp @@ -18220,13 +18220,13 @@ static void llama_kv_cache_defrag_internal(struct llama_context & lctx) { static void llama_kv_cache_update_internal(struct llama_context & lctx) { bool need_reserve = false; - // apply K-shift if needed - if (lctx.model.hparams.rope_type != LLAMA_ROPE_TYPE_NONE && lctx.kv_self.has_shift) { + if (lctx.kv_self.has_shift) { if (!llama_kv_cache_can_shift(&lctx)) { - GGML_ABORT("Deepseek2 does not support K-shift"); + GGML_ABORT("The current context does not support K-shift"); } - { + // apply K-shift if needed + if (lctx.model.hparams.rope_type != LLAMA_ROPE_TYPE_NONE) { ggml_backend_sched_reset(lctx.sched.get()); ggml_cgraph * gf = llama_build_graph_k_shift(lctx); @@ -20472,7 +20472,7 @@ void llama_kv_cache_update(struct llama_context * ctx) { } bool llama_kv_cache_can_shift(struct llama_context * ctx) { - return ctx->model.arch != LLM_ARCH_DEEPSEEK2; // not supported due to MLA + return !ctx->kv_self.recurrent && ctx->model.arch != LLM_ARCH_DEEPSEEK2; // not supported due to MLA } // deprecated From 708842d161de9db97f2add74bdd4e99b66a15a14 Mon Sep 17 00:00:00 2001 From: Diego Devesa Date: Thu, 21 Nov 2024 18:18:50 +0100 Subject: [PATCH 029/245] cuda : optimize argmax (#10441) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit * cuda : optimize argmax * remove unused parameter ggml-ci * fixup : use full warps ggml-ci * Apply suggestions from code review Co-authored-by: Johannes Gäßler * fix ub * ggml : check ne00 <= INT32_MAX in argmax and argsort --------- Co-authored-by: Johannes Gäßler --- ggml/src/ggml-cuda/argmax.cu | 96 +++++++++++++++++++--------------- ggml/src/ggml-cuda/common.cuh | 30 +++++------ ggml/src/ggml-cuda/quantize.cu | 8 +-- ggml/src/ggml.c | 2 + tests/test-backend-ops.cpp | 29 ++++++++++ 5 files changed, 104 insertions(+), 61 deletions(-) diff --git a/ggml/src/ggml-cuda/argmax.cu b/ggml/src/ggml-cuda/argmax.cu index aab04eca7a385..5340eedc08916 100644 --- a/ggml/src/ggml-cuda/argmax.cu +++ b/ggml/src/ggml-cuda/argmax.cu @@ -1,57 +1,69 @@ -#include "common.cuh" +#include +#include + #include "argmax.cuh" +#include "common.cuh" #include "sum.cuh" -#include +static __global__ void argmax_f32(const float * __restrict__ x, int32_t * __restrict__ dst, const int64_t ncols) { + const int64_t row = blockIdx.x; -static __global__ void argmax_f32( - const float * x, int32_t * dst, const int64_t ncols, const int64_t nrows) { + float maxval = -FLT_MAX; + int argmax = -1; + const float * rowx = x + row * ncols; - int argmax_thread = 0; - const int64_t row0 = (int64_t)blockIdx.x*WARP_SIZE; + for (int32_t col = threadIdx.x; col < ncols; col += blockDim.x) { + const float val = rowx[col]; + if (val > maxval) { + maxval = val; + argmax = col; + } + } #pragma unroll - for (int64_t row1 = 0; row1 < WARP_SIZE; ++row1) { - const int64_t row = row0 + row1; - - if (row >= nrows) { - break; + for (int offset = 16; offset > 0; offset >>= 1) { + const float val = __shfl_xor_sync(0xFFFFFFFF, maxval, offset, WARP_SIZE); + const int col = __shfl_xor_sync(0xFFFFFFFF, argmax, offset, WARP_SIZE); + if (val > maxval) { + maxval = val; + argmax = col; } + } - float maxval = -FLT_MAX; - int argmax = -1; - - for (int32_t col = threadIdx.x; col < ncols; col += WARP_SIZE) { - const float val = x[row*ncols + col]; - const int bigger = val > maxval; - const int not_bigger = bigger ^ 0x00000001; - - maxval = maxval*not_bigger + val*bigger; - argmax = argmax*not_bigger + col*bigger; + const int n_warps = blockDim.x / WARP_SIZE; + const int lane_id = threadIdx.x % WARP_SIZE; + const int warp_id = threadIdx.x / WARP_SIZE; + if (n_warps > 1) { + constexpr int max_warps = 1024 / WARP_SIZE; + __shared__ float shared_maxval[max_warps]; + __shared__ int shared_argmax[max_warps]; + if (lane_id == 0) { + shared_maxval[warp_id] = maxval; + shared_argmax[warp_id] = argmax; } + __syncthreads(); + + if (warp_id == 0) { + if (lane_id < n_warps) { + maxval = shared_maxval[lane_id]; + argmax = shared_argmax[lane_id]; + } #pragma unroll - for (int mask = 16; mask > 0; mask >>= 1) { - const float val = __shfl_xor_sync(0xFFFFFFFF, maxval, mask, WARP_SIZE); - const int col = __shfl_xor_sync(0xFFFFFFFF, argmax, mask, WARP_SIZE); - const int bigger = val > maxval; - const int not_bigger = bigger ^ 0x00000001; - - maxval = maxval*not_bigger + val*bigger; - argmax = argmax*not_bigger + col*bigger; + for (int offset = 16; offset > 0; offset >>= 1) { + const float val = __shfl_xor_sync(0xFFFFFFFF, maxval, offset, WARP_SIZE); + const int col = __shfl_xor_sync(0xFFFFFFFF, argmax, offset, WARP_SIZE); + if (val > maxval) { + maxval = val; + argmax = col; + } + } } - - const int store = row1 == threadIdx.x; - argmax_thread += store*argmax; } - const int row = row0 + threadIdx.x; - - if (row >= nrows) { - return; + if (warp_id == 0 && lane_id == 0) { + dst[row] = argmax; } - - dst[row] = argmax_thread; } void ggml_cuda_argmax(ggml_backend_cuda_context & ctx, ggml_tensor * dst) { @@ -70,10 +82,10 @@ void ggml_cuda_argmax(ggml_backend_cuda_context & ctx, ggml_tensor * dst) { cudaStream_t stream = ctx.stream(); - const int64_t num_blocks = (nrows + WARP_SIZE - 1) / WARP_SIZE; - - const dim3 blocks_dim(WARP_SIZE, 1, 1); + const int64_t num_blocks = nrows; + const int64_t num_threads = std::min(1024, (ne00 + WARP_SIZE - 1) / WARP_SIZE * WARP_SIZE); + const dim3 blocks_dim(num_threads, 1, 1); const dim3 blocks_num(num_blocks, 1, 1); - argmax_f32<<>>(src0_d, dst_d, ne00, nrows); + argmax_f32<<>>(src0_d, dst_d, ne00); } diff --git a/ggml/src/ggml-cuda/common.cuh b/ggml/src/ggml-cuda/common.cuh index e146c691c6f87..b0dd16066b4ba 100644 --- a/ggml/src/ggml-cuda/common.cuh +++ b/ggml/src/ggml-cuda/common.cuh @@ -180,8 +180,8 @@ static __device__ __forceinline__ int warp_reduce_sum(int x) { return __reduce_add_sync(0xffffffff, x); #else #pragma unroll - for (int mask = 16; mask > 0; mask >>= 1) { - x += __shfl_xor_sync(0xffffffff, x, mask, 32); + for (int offset = 16; offset > 0; offset >>= 1) { + x += __shfl_xor_sync(0xffffffff, x, offset, 32); } return x; #endif // !(defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__)) && __CUDA_ARCH__ >= CC_AMPERE @@ -189,17 +189,17 @@ static __device__ __forceinline__ int warp_reduce_sum(int x) { static __device__ __forceinline__ float warp_reduce_sum(float x) { #pragma unroll - for (int mask = 16; mask > 0; mask >>= 1) { - x += __shfl_xor_sync(0xffffffff, x, mask, 32); + for (int offset = 16; offset > 0; offset >>= 1) { + x += __shfl_xor_sync(0xffffffff, x, offset, 32); } return x; } static __device__ __forceinline__ float2 warp_reduce_sum(float2 a) { #pragma unroll - for (int mask = 16; mask > 0; mask >>= 1) { - a.x += __shfl_xor_sync(0xffffffff, a.x, mask, 32); - a.y += __shfl_xor_sync(0xffffffff, a.y, mask, 32); + for (int offset = 16; offset > 0; offset >>= 1) { + a.x += __shfl_xor_sync(0xffffffff, a.x, offset, 32); + a.y += __shfl_xor_sync(0xffffffff, a.y, offset, 32); } return a; } @@ -209,16 +209,16 @@ static __device__ __forceinline__ half2 warp_reduce_sum(half2 a) { #if defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__) #pragma unroll - for (int mask = 16; mask > 0; mask >>= 1) { - const half2 a_other = __shfl_xor_sync(0xffffffff, a, mask, 32); + for (int offset = 16; offset > 0; offset >>= 1) { + const half2 a_other = __shfl_xor_sync(0xffffffff, a, offset, 32); reinterpret_cast(a.x) += __low2half(a_other); reinterpret_cast(a.y) += __high2half(a_other); } return a; #else #pragma unroll - for (int mask = 16; mask > 0; mask >>= 1) { - a = __hadd2(a, __shfl_xor_sync(0xffffffff, a, mask, 32)); + for (int offset = 16; offset > 0; offset >>= 1) { + a = __hadd2(a, __shfl_xor_sync(0xffffffff, a, offset, 32)); } return a; #endif // defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__) @@ -231,8 +231,8 @@ static __device__ __forceinline__ half2 warp_reduce_sum(half2 a) { static __device__ __forceinline__ float warp_reduce_max(float x) { #pragma unroll - for (int mask = 16; mask > 0; mask >>= 1) { - x = fmaxf(x, __shfl_xor_sync(0xffffffff, x, mask, 32)); + for (int offset = 16; offset > 0; offset >>= 1) { + x = fmaxf(x, __shfl_xor_sync(0xffffffff, x, offset, 32)); } return x; } @@ -275,8 +275,8 @@ static __device__ __forceinline__ half2 ggml_cuda_hmax2(const half2 a, const hal static __device__ __forceinline__ half2 warp_reduce_max(half2 x) { #if !(defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__)) && __CUDA_ARCH__ >= CC_PASCAL #pragma unroll - for (int mask = 16; mask > 0; mask >>= 1) { - x = ggml_cuda_hmax2(x, __shfl_xor_sync(0xffffffff, x, mask, 32)); + for (int offset = 16; offset > 0; offset >>= 1) { + x = ggml_cuda_hmax2(x, __shfl_xor_sync(0xffffffff, x, offset, 32)); } return x; #else diff --git a/ggml/src/ggml-cuda/quantize.cu b/ggml/src/ggml-cuda/quantize.cu index 45408ce8684e4..1702e4ce2feba 100644 --- a/ggml/src/ggml-cuda/quantize.cu +++ b/ggml/src/ggml-cuda/quantize.cu @@ -69,8 +69,8 @@ static __global__ void quantize_mmq_q8_1( // Exchange max. abs. value between vals_per_scale/4 threads. #pragma unroll - for (int mask = vals_per_scale/8; mask > 0; mask >>= 1) { - amax = fmaxf(amax, __shfl_xor_sync(0xFFFFFFFF, amax, mask, WARP_SIZE)); + for (int offset = vals_per_scale/8; offset > 0; offset >>= 1) { + amax = fmaxf(amax, __shfl_xor_sync(0xFFFFFFFF, amax, offset, WARP_SIZE)); } float sum; @@ -79,8 +79,8 @@ static __global__ void quantize_mmq_q8_1( // Exchange calculate sum across vals_per_sum/4 threads. #pragma unroll - for (int mask = vals_per_sum/8; mask > 0; mask >>= 1) { - sum += __shfl_xor_sync(0xFFFFFFFF, sum, mask, WARP_SIZE); + for (int offset = vals_per_sum/8; offset > 0; offset >>= 1) { + sum += __shfl_xor_sync(0xFFFFFFFF, sum, offset, WARP_SIZE); } } diff --git a/ggml/src/ggml.c b/ggml/src/ggml.c index c0cad48ac645e..545009438a83f 100644 --- a/ggml/src/ggml.c +++ b/ggml/src/ggml.c @@ -2255,6 +2255,7 @@ struct ggml_tensor * ggml_argmax( struct ggml_context * ctx, struct ggml_tensor * a) { GGML_ASSERT(ggml_is_matrix(a)); + GGML_ASSERT(a->ne[0] <= INT32_MAX); struct ggml_tensor * result = ggml_new_tensor_1d(ctx, GGML_TYPE_I32, a->ne[1]); @@ -4138,6 +4139,7 @@ struct ggml_tensor * ggml_argsort( struct ggml_context * ctx, struct ggml_tensor * a, enum ggml_sort_order order) { + GGML_ASSERT(a->ne[0] <= INT32_MAX); struct ggml_tensor * result = ggml_new_tensor(ctx, GGML_TYPE_I32, GGML_MAX_DIMS, a->ne); ggml_set_op_params_i32(result, 0, (int32_t) order); diff --git a/tests/test-backend-ops.cpp b/tests/test-backend-ops.cpp index 37342c156d7c1..b2b5705243ea2 100644 --- a/tests/test-backend-ops.cpp +++ b/tests/test-backend-ops.cpp @@ -1154,6 +1154,26 @@ struct test_argmax : public test_case { return out; } + void initialize_tensors(ggml_context * ctx) override { + std::random_device rd; + std::default_random_engine rng(rd()); + for (ggml_tensor * t = ggml_get_first_tensor(ctx); t != NULL; t = ggml_get_next_tensor(ctx, t)) { + if (t->type == GGML_TYPE_F32) { + // initialize with unique values to avoid ties + for (int64_t r = 0; r < ggml_nrows(t); r++) { + std::vector data(t->ne[0]); + for (int i = 0; i < t->ne[0]; i++) { + data[i] = i; + } + std::shuffle(data.begin(), data.end(), rng); + ggml_backend_tensor_set(t, data.data(), r * t->nb[1], t->ne[0] * sizeof(float)); + } + } else { + init_tensor_uniform(t); + } + } + } + double max_nmse_err() override { return 0.0; } @@ -3440,6 +3460,11 @@ static std::vector> make_test_cases_eval() { test_cases.emplace_back(new test_conv_transpose_1d({2,1,1,1}, {3,1,1,1}, 1, 0, 1)); test_cases.emplace_back(new test_argmax()); + test_cases.emplace_back(new test_argmax(GGML_TYPE_F32, {32, 1, 1, 1})); + test_cases.emplace_back(new test_argmax(GGML_TYPE_F32, {100, 10, 1, 1})); + test_cases.emplace_back(new test_argmax(GGML_TYPE_F32, {1024, 10, 1, 1})); + test_cases.emplace_back(new test_argmax(GGML_TYPE_F32, {2000, 10, 1, 1})); + test_cases.emplace_back(new test_count_equal()); for (int ne3 : {1, 3}) { // CUDA backward pass only supports ne3 == 1 @@ -3830,6 +3855,10 @@ static std::vector> make_test_cases_perf() { test_cases.emplace_back(new test_soft_max(GGML_TYPE_F32, {64, 64, 20, 1}, false, 1.0f, 0.0f)); test_cases.emplace_back(new test_soft_max(GGML_TYPE_F32, {77, 64, 20, 1}, false, 1.0f, 0.0f)); + test_cases.emplace_back(new test_argmax(GGML_TYPE_F32, {32, 10, 1, 1})); + test_cases.emplace_back(new test_argmax(GGML_TYPE_F32, {1024, 10, 1, 1})); + test_cases.emplace_back(new test_argmax(GGML_TYPE_F32, {32000, 512, 1, 1})); + for (int bs : {1, 512}) { for (ggml_type type_a : all_types) { for (ggml_type type_b : {GGML_TYPE_F32}) { From e763d5c3726cf1de0fd140946a33cd3afa61491a Mon Sep 17 00:00:00 2001 From: leo-pony Date: Fri, 22 Nov 2024 14:07:20 +0800 Subject: [PATCH 030/245] CANN: Support Ascend310P to accelerate F32 and F16 Model (#10216) * CANN Support Ascend310P to accelerate F32 and F16 Model * Add compile option soc type macro ASCEND_310P to ggml-cann lib * Remove unused code * Remove the ascend soc_type hard code compile option in CMakelist.txt --- ggml/src/ggml-cann/CMakeLists.txt | 29 ++++++++++++++++ ggml/src/ggml-cann/aclnn_ops.cpp | 18 ++++++++++ ggml/src/ggml-cann/kernels/CMakeLists.txt | 7 ++-- ggml/src/ggml-cann/kernels/dup.cpp | 32 +++++++++++++----- ggml/src/ggml-cann/kernels/get_row_f16.cpp | 37 +++++++++++++-------- ggml/src/ggml-cann/kernels/get_row_f32.cpp | 36 ++++++++++++-------- ggml/src/ggml-cann/kernels/get_row_q4_0.cpp | 5 ++- 7 files changed, 123 insertions(+), 41 deletions(-) diff --git a/ggml/src/ggml-cann/CMakeLists.txt b/ggml/src/ggml-cann/CMakeLists.txt index c8e15c6d40ce9..756200b893d02 100644 --- a/ggml/src/ggml-cann/CMakeLists.txt +++ b/ggml/src/ggml-cann/CMakeLists.txt @@ -3,6 +3,33 @@ if ("cann${CANN_INSTALL_DIR}" STREQUAL "cann" AND DEFINED ENV{ASCEND_TOOLKIT_HOM message(STATUS "CANN: updated CANN_INSTALL_DIR from ASCEND_TOOLKIT_HOME=$ENV{ASCEND_TOOLKIT_HOME}") endif() +# Auto-detech Soc type and Soc version, if detect failed, will abort build +set(SOC_VERSION "") +function(detect_ascend_soc_type SOC_VERSION) + execute_process( + COMMAND bash -c "npu-smi info|awk -F' ' 'NF > 0 && NR==7 {print $3}'" + OUTPUT_VARIABLE npu_info + RESULT_VARIABLE npu_result + OUTPUT_STRIP_TRAILING_WHITESPACE + ) + if("${npu_info}" STREQUAL "" OR ${npu_result}) + message(FATAL_ERROR "Auto-detech ascend soc type failed, please specify manually or check ascend device working normally.") + endif() + set(${SOC_VERSION} "Ascend${npu_info}" PARENT_SCOPE) +endfunction() + +if(NOT SOC_TYPE) + detect_ascend_soc_type(SOC_VERSION) + set(SOC_TYPE "${SOC_VERSION}") + message(STATUS "CANN: SOC_VERSION auto-detected is:${SOC_VERSION}") +else() + string(TOLOWER ${SOC_TYPE} SOC_VERSION) +endif() + +# Construct Soc specify compile option: ASCEND_#Soc_Major_SN. Such as ASCEND_910B, ASCEND310P. +string(REGEX MATCH "[0-9]+[a-zA-Z]" SOC_TYPE_MAJOR_SN "${SOC_VERSION}") +set(SOC_TYPE_COMPILE_OPTION "ASCEND_${SOC_TYPE_MAJOR_SN}") + if (CANN_INSTALL_DIR) # Only Support Linux. if (NOT UNIX) @@ -39,6 +66,8 @@ if (CANN_INSTALL_DIR) target_include_directories(ggml-cann PRIVATE . .. ${CANN_INCLUDE_DIRS}) target_link_directories(ggml-cann PRIVATE ${CANN_INSTALL_DIR}/lib64) + target_compile_definitions(ggml-cann PRIVATE "-D${SOC_TYPE_COMPILE_OPTION}") + message(STATUS "CANN: CANN_INCLUDE_DIRS = ${CANN_INCLUDE_DIRS}") message(STATUS "CANN: CANN_LIBRARIES = ${CANN_LIBRARIES}") else() diff --git a/ggml/src/ggml-cann/aclnn_ops.cpp b/ggml/src/ggml-cann/aclnn_ops.cpp index a4ec8418e2ab3..1f4ee986ceb89 100644 --- a/ggml/src/ggml-cann/aclnn_ops.cpp +++ b/ggml/src/ggml-cann/aclnn_ops.cpp @@ -2312,6 +2312,14 @@ void ggml_cann_get_rows(ggml_backend_cann_context& ctx, ggml_tensor* dst) { switch (src0->type) { case GGML_TYPE_F32: + { +#ifdef ASCEND_310P + // Special operation for get_row_f32 kernel of 310P: clear the content of dest data buffer when row is not aligned to 32 bytes + if ((src0->ne[0] % 8) != 0) { + size_t dst_len = src1->ne[0] * src1->ne[1] * src1->ne[2] * src0->ne[0] * ggml_type_size(GGML_TYPE_F32); + ACL_CHECK(aclrtMemset((char*)dst->data, dst_len, 0, dst_len)); + } +#endif aclrtlaunch_ascendc_get_row_f32( 24, ctx.stream(), src0->data, src1->data, dst->data, ((ggml_tensor*)src0->extra)->ne, @@ -2320,7 +2328,16 @@ void ggml_cann_get_rows(ggml_backend_cann_context& ctx, ggml_tensor* dst) { ((ggml_tensor*)src1->extra)->nb, ((ggml_tensor*)dst->extra)->ne, ((ggml_tensor*)dst->extra)->nb); break; + } case GGML_TYPE_F16: + { +#ifdef ASCEND_310P + // Special operation for get_row_f16 kernel of 310P: clear the content of dest data buffer when row is not aligned to 32 bytes + if ((src0->ne[0] % 16) != 0) { + size_t dst_len = src1->ne[0] * src1->ne[1] * src1->ne[2] * src0->ne[0] * ggml_type_size(GGML_TYPE_F32); // out is also f32, even input is f16 + ACL_CHECK(aclrtMemset((char*)dst->data, dst_len, 0, dst_len)); + } +#endif aclrtlaunch_ascendc_get_row_f16( 24, ctx.stream(), src0->data, src1->data, dst->data, ((ggml_tensor*)src0->extra)->ne, @@ -2329,6 +2346,7 @@ void ggml_cann_get_rows(ggml_backend_cann_context& ctx, ggml_tensor* dst) { ((ggml_tensor*)src1->extra)->nb, ((ggml_tensor*)dst->extra)->ne, ((ggml_tensor*)dst->extra)->nb); break; + } case GGML_TYPE_Q4_0: aclrtlaunch_ascendc_get_row_q4_0( 24, ctx.stream(), src0->data, src1->data, dst->data, diff --git a/ggml/src/ggml-cann/kernels/CMakeLists.txt b/ggml/src/ggml-cann/kernels/CMakeLists.txt index 5b4fef91b5877..6a4e17cce54c9 100644 --- a/ggml/src/ggml-cann/kernels/CMakeLists.txt +++ b/ggml/src/ggml-cann/kernels/CMakeLists.txt @@ -1,7 +1,3 @@ -if (NOT SOC_TYPE) - set (SOC_TYPE "Ascend910B3") -endif() - file(GLOB SRC_FILES get_row_f32.cpp get_row_f16.cpp @@ -13,7 +9,6 @@ file(GLOB SRC_FILES dup.cpp ) -string(TOLOWER ${SOC_TYPE} SOC_VERSION) set(ASCEND_CANN_PACKAGE_PATH ${CANN_INSTALL_DIR}) set(RUN_MODE "npu" CACHE STRING "run mode: npu/sim") @@ -30,4 +25,6 @@ ascendc_library(ascendc_kernels STATIC ${SRC_FILES} ) +message(STATUS "CANN: compile ascend kernels witch SOC_VERSION:${SOC_VERSION}.") +ascendc_compile_definitions(ascendc_kernels PRIVATE "-D${SOC_TYPE_COMPILE_OPTION}") # ascendc_compile_definitions(ascendc_kernels PRIVATE -DASCENDC_DUMP) diff --git a/ggml/src/ggml-cann/kernels/dup.cpp b/ggml/src/ggml-cann/kernels/dup.cpp index e2c651152f486..99f03e05883aa 100644 --- a/ggml/src/ggml-cann/kernels/dup.cpp +++ b/ggml/src/ggml-cann/kernels/dup.cpp @@ -5,6 +5,7 @@ using namespace AscendC; #define BUFFER_NUM 2 +const int64_t SUPPORTED_MAX_DIM = 65535; // currently the limit of max block dim supportted by dup kernel is 65535template template class DupByRows { @@ -19,6 +20,7 @@ class DupByRows { // Input has four dims. int64_t op_block_num = GetBlockNum(); int64_t op_block_idx = GetBlockIdx(); + assert(op_block_idx < SUPPORTED_MAX_DIM && op_block_idx >= 0, "Invalid block index:%d, max is:%d\n", op_block_idx, SUPPORTED_MAX_DIM); // param num_rows = input_ne_ub[1] * input_ne_ub[2] * input_ne_ub[3]; @@ -51,24 +53,36 @@ class DupByRows { __aicore__ inline void copy_in() { LocalTensor src_local = src_queue.AllocTensor(); - - DataCopyExtParams dataCopyParams; - dataCopyParams.blockCount = 1; - dataCopyParams.blockLen = num_elem * sizeof(SRC_T); - DataCopyPadExtParams padParams; - DataCopyPad(src_local, src_gm, dataCopyParams, padParams); - + const size_t elem_per_block = 32 / sizeof(SRC_T); + size_t tail = num_elem % elem_per_block; + size_t cpy_elements_len = tail > 0 ? num_elem + 1 : num_elem; + DataCopy(src_local, src_gm, cpy_elements_len); src_queue.EnQue(src_local); } __aicore__ inline void copy_out() { LocalTensor dst_local = dst_queue.DeQue(); - +#ifdef ASCEND_310P + const size_t elem_per_block = 32 / sizeof(DST_T); + size_t tail = num_elem % elem_per_block; + size_t len = num_elem & ~(elem_per_block - 1); + if (len > 0) { + DataCopy(dst_gm, dst_local, len); + } + if(tail != 0) { + for (size_t i = tail; i < elem_per_block; i++) { + dst_local[len + i].SetValue(0, 0); + } + SetAtomicAdd(); + DataCopy(dst_gm[len], dst_local[len], elem_per_block); + SetAtomicNone(); + } +#else DataCopyExtParams dataCopyParams; dataCopyParams.blockCount = 1; dataCopyParams.blockLen = num_elem * sizeof(DST_T); DataCopyPad(dst_gm, dst_local, dataCopyParams); - +#endif dst_queue.FreeTensor(dst_local); } diff --git a/ggml/src/ggml-cann/kernels/get_row_f16.cpp b/ggml/src/ggml-cann/kernels/get_row_f16.cpp index c704b5b2ec0f3..416b45104de5b 100644 --- a/ggml/src/ggml-cann/kernels/get_row_f16.cpp +++ b/ggml/src/ggml-cann/kernels/get_row_f16.cpp @@ -14,7 +14,7 @@ class GET_ROW_F16 { int64_t *output_ne_ub, size_t *output_nb_ub) { // TODO, use template for F16/f32 int64_t op_block_num = GetBlockNum(); - int64_t op_block_idx = GetBlockIdx(); + op_block_idx = GetBlockIdx(); for (int i = 0; i < 4; i++) { input_ne[i] = input_ne_ub[i]; @@ -59,32 +59,42 @@ class GET_ROW_F16 { } __aicore__ inline void copy_in(uint32_t offset, size_t len) { + size_t origin_len = len; LocalTensor input_local = input_queue.AllocTensor(); - size_t tail = len % 32; - len = len & ~31; - DataCopy(input_local, input_gm[offset], len); + const size_t elem_per_block = 32 / sizeof(half); + size_t tail = len % elem_per_block; + len = len & ~(elem_per_block - 1); if(tail != 0) { - DataCopyExtParams dataCopyParams; - dataCopyParams.blockCount = 1; - dataCopyParams.blockLen = tail * sizeof(half); - DataCopyPadExtParams padParams; - DataCopyPad(input_local[len], input_gm[offset + len], - dataCopyParams, padParams); + len += elem_per_block; } + DataCopy(input_local, input_gm[offset], len); input_queue.EnQue(input_local); } __aicore__ inline void copy_out(uint32_t offset, size_t len) { LocalTensor output_local = output_queue.DeQue(); - size_t tail = len % 32; - len = len & ~31; - DataCopy(output_gm[offset], output_local, len); + const size_t elem_per_block = 32 / sizeof(float); + size_t tail = len % elem_per_block; + len = len & ~(elem_per_block - 1); + if (len > 0) { + DataCopy(output_gm[offset], output_local, len); + } + if(tail != 0) { +#ifdef ASCEND_310P + for (size_t i = tail; i < elem_per_block; i++) { + output_local[len + i].SetValue(0, 0); + } + SetAtomicAdd(); + DataCopy(output_gm[offset + len], output_local[len], elem_per_block); + SetAtomicNone(); +#else DataCopyExtParams dataCopyParams; dataCopyParams.blockCount = 1; dataCopyParams.blockLen = tail * sizeof(float); DataCopyPad(output_gm[offset + len], output_local[len], dataCopyParams); +#endif } output_queue.FreeTensor(output_local); } @@ -150,6 +160,7 @@ class GET_ROW_F16 { GlobalTensor output_gm; TQue input_queue; TQue output_queue; + int64_t op_block_idx; }; template diff --git a/ggml/src/ggml-cann/kernels/get_row_f32.cpp b/ggml/src/ggml-cann/kernels/get_row_f32.cpp index 9db080af36998..02116905b18e4 100644 --- a/ggml/src/ggml-cann/kernels/get_row_f32.cpp +++ b/ggml/src/ggml-cann/kernels/get_row_f32.cpp @@ -13,7 +13,7 @@ class GET_ROW_F32 { int64_t *indices_ne_ub, size_t *indices_nb_ub, int64_t *output_ne_ub, size_t *output_nb_ub) { int64_t op_block_num = GetBlockNum(); - int64_t op_block_idx = GetBlockIdx(); + op_block_idx = GetBlockIdx(); for (int i = 0; i < 4; i++) { input_ne[i] = input_ne_ub[i]; @@ -55,31 +55,40 @@ class GET_ROW_F32 { __aicore__ inline void copy_in(uint32_t offset, size_t len) { LocalTensor input_local = input_queue.AllocTensor(); - size_t tail = len % 32; - len = len & ~31; - DataCopy(input_local, input_gm[offset], len); + const size_t elem_per_block = 32 / sizeof(float); + size_t tail = len % elem_per_block; + len = len & ~(elem_per_block - 1); if(tail != 0) { - DataCopyExtParams dataCopyParams; - dataCopyParams.blockCount = 1; - dataCopyParams.blockLen = tail * sizeof(float); - DataCopyPadExtParams padParams; - DataCopyPad(input_local[len], input_gm[offset + len], - dataCopyParams, padParams); + len += elem_per_block; } + DataCopy(input_local, input_gm[offset], len); input_queue.EnQue(input_local); } __aicore__ inline void copy_out(uint32_t offset, size_t len) { LocalTensor output_local = output_queue.DeQue(); - size_t tail = len % 32; - len = len & ~31; - DataCopy(output_gm[offset], output_local, len); + const size_t elem_per_block = 32 / sizeof(float); + size_t tail = len % elem_per_block; + len = len & ~(elem_per_block - 1); + if (len > 0) { + DataCopy(output_gm[offset], output_local, len); + } + if(tail != 0) { +#ifdef ASCEND_310P + for (size_t i = tail; i < elem_per_block; i++) { + output_local[len + i].SetValue(0, 0); + } + SetAtomicAdd(); + DataCopy(output_gm[offset + len], output_local[len], elem_per_block); + SetAtomicNone(); +#else DataCopyExtParams dataCopyParams; dataCopyParams.blockCount = 1; dataCopyParams.blockLen = tail * sizeof(float); DataCopyPad(output_gm[offset + len], output_local[len], dataCopyParams); +#endif } output_queue.FreeTensor(output_local); } @@ -144,6 +153,7 @@ class GET_ROW_F32 { GlobalTensor output_gm; TQue input_queue; TQue output_queue; + int64_t op_block_idx; }; template diff --git a/ggml/src/ggml-cann/kernels/get_row_q4_0.cpp b/ggml/src/ggml-cann/kernels/get_row_q4_0.cpp index a80bfeec2417d..377211096e1f5 100644 --- a/ggml/src/ggml-cann/kernels/get_row_q4_0.cpp +++ b/ggml/src/ggml-cann/kernels/get_row_q4_0.cpp @@ -110,9 +110,12 @@ class GET_ROW_Q4_0 { LocalTensor output_local = output_queue.AllocTensor(); // TODO: cast more data to speed up. +#ifdef ASCEND_310P + // TODO: 310P support quantification +#else Cast(cast_local, input_local, RoundMode::CAST_NONE, QK4_0); Cast(output_local, cast_local, RoundMode::CAST_NONE, QK4_0); - +#endif // Only mul need compile by group. half scale = scale_gm.GetValue(scale_offset); From b47ede8b5b76d9d9ef3bdca183e411a18f5ef61d Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Johannes=20G=C3=A4=C3=9Fler?= Date: Fri, 22 Nov 2024 08:32:40 +0100 Subject: [PATCH 031/245] GitHub: ask for more info in issue templates (#10426) * GitHub: ask for more info in issues [no ci] * refactor issue templates to be component-specific * more understandable issue description * add dropdown for llama.cpp module --- .github/ISSUE_TEMPLATE/01-bug-low.yml | 50 ---------- .../ISSUE_TEMPLATE/010-bug-compilation.yml | 73 ++++++++++++++ .github/ISSUE_TEMPLATE/011-bug-results.yml | 98 +++++++++++++++++++ .github/ISSUE_TEMPLATE/019-bug-misc.yml | 78 +++++++++++++++ .github/ISSUE_TEMPLATE/02-bug-medium.yml | 50 ---------- ...05-enhancement.yml => 020-enhancement.yml} | 2 +- .github/ISSUE_TEMPLATE/03-bug-high.yml | 50 ---------- .../{06-research.yml => 030-research.yml} | 2 +- .github/ISSUE_TEMPLATE/04-bug-critical.yml | 50 ---------- .../{07-refactor.yml => 040-refactor.yml} | 2 +- 10 files changed, 252 insertions(+), 203 deletions(-) delete mode 100644 .github/ISSUE_TEMPLATE/01-bug-low.yml create mode 100644 .github/ISSUE_TEMPLATE/010-bug-compilation.yml create mode 100644 .github/ISSUE_TEMPLATE/011-bug-results.yml create mode 100644 .github/ISSUE_TEMPLATE/019-bug-misc.yml delete mode 100644 .github/ISSUE_TEMPLATE/02-bug-medium.yml rename .github/ISSUE_TEMPLATE/{05-enhancement.yml => 020-enhancement.yml} (97%) delete mode 100644 .github/ISSUE_TEMPLATE/03-bug-high.yml rename .github/ISSUE_TEMPLATE/{06-research.yml => 030-research.yml} (97%) delete mode 100644 .github/ISSUE_TEMPLATE/04-bug-critical.yml rename .github/ISSUE_TEMPLATE/{07-refactor.yml => 040-refactor.yml} (95%) diff --git a/.github/ISSUE_TEMPLATE/01-bug-low.yml b/.github/ISSUE_TEMPLATE/01-bug-low.yml deleted file mode 100644 index 54785854f776e..0000000000000 --- a/.github/ISSUE_TEMPLATE/01-bug-low.yml +++ /dev/null @@ -1,50 +0,0 @@ -name: Low Severity Bugs -description: Used to report low severity bugs in llama.cpp (e.g. cosmetic issues, non critical UI glitches) -title: "Bug: " -labels: ["bug-unconfirmed", "low severity"] -body: - - type: markdown - attributes: - value: | - Thanks for taking the time to fill out this bug report! - Please include information about your system, the steps to reproduce the bug, - and the version of llama.cpp that you are using. - If possible, please provide a minimal code example that reproduces the bug. - - type: textarea - id: what-happened - attributes: - label: What happened? - description: Also tell us, what did you expect to happen? - placeholder: Tell us what you see! - validations: - required: true - - type: textarea - id: version - attributes: - label: Name and Version - description: Which executable and which version of our software are you running? (use `--version` to get a version string) - placeholder: | - $./llama-cli --version - version: 2999 (42b4109e) - built with cc (Ubuntu 11.4.0-1ubuntu1~22.04) 11.4.0 for x86_64-linux-gnu - validations: - required: true - - type: dropdown - id: operating-system - attributes: - label: What operating system are you seeing the problem on? - multiple: true - options: - - Linux - - Mac - - Windows - - BSD - - Other? (Please let us know in description) - validations: - required: false - - type: textarea - id: logs - attributes: - label: Relevant log output - description: Please copy and paste any relevant log output. This will be automatically formatted into code, so no need for backticks. - render: shell diff --git a/.github/ISSUE_TEMPLATE/010-bug-compilation.yml b/.github/ISSUE_TEMPLATE/010-bug-compilation.yml new file mode 100644 index 0000000000000..550ee1b498ab9 --- /dev/null +++ b/.github/ISSUE_TEMPLATE/010-bug-compilation.yml @@ -0,0 +1,73 @@ +name: Bug (compilation) +description: Something goes wrong when trying to compile llama.cpp. +title: "Compile bug: " +labels: ["bug-unconfirmed", "compilation"] +body: + - type: markdown + attributes: + value: > + Thanks for taking the time to fill out this bug report! + This issue template is intended for bug reports where the compilation of llama.cpp fails. + Before opening an issue, please confirm that the compilation still fails with `-DGGML_CCACHE=OFF`. + If the compilation succeeds with ccache disabled you should be able to permanently fix the issue + by clearing `~/.cache/ccache` (on Linux). + - type: textarea + id: commit + attributes: + label: Git commit + description: Which commit are you trying to compile? + placeholder: | + $git rev-parse HEAD + 84a07a17b1b08cf2b9747c633a2372782848a27f + validations: + required: true + - type: dropdown + id: operating-system + attributes: + label: Which operating systems do you know to be affected? + multiple: true + options: + - Linux + - Mac + - Windows + - BSD + - Other? (Please let us know in description) + validations: + required: true + - type: dropdown + id: backends + attributes: + label: GGML backends + description: Which GGML backends do you know to be affected? + options: [AMX, BLAS, CPU, CUDA, HIP, Kompute, Metal, Musa, RPC, SYCL, Vulkan] + multiple: true + - type: textarea + id: steps_to_reproduce + attributes: + label: Steps to Reproduce + description: > + Please tell us how to reproduce the bug and any additional information that you think could be useful for fixing it. + If you can narrow down the bug to specific compile flags, that information would be very much appreciated by us. + placeholder: > + Here are the exact commands that I used: ... + validations: + required: true + - type: textarea + id: first_bad_commit + attributes: + label: First Bad Commit + description: > + If the bug was not present on an earlier version: when did it start appearing? + If possible, please do a git bisect and identify the exact commit that introduced the bug. + validations: + required: false + - type: textarea + id: logs + attributes: + label: Relevant log output + description: > + Please copy and paste any relevant log output, including the command that you entered and any generated text. + This will be automatically formatted into code, so no need for backticks. + render: shell + validations: + required: true diff --git a/.github/ISSUE_TEMPLATE/011-bug-results.yml b/.github/ISSUE_TEMPLATE/011-bug-results.yml new file mode 100644 index 0000000000000..1adb162b79a77 --- /dev/null +++ b/.github/ISSUE_TEMPLATE/011-bug-results.yml @@ -0,0 +1,98 @@ +name: Bug (model use) +description: Something goes wrong when using a model (in general, not specific to a single llama.cpp module). +title: "Eval bug: " +labels: ["bug-unconfirmed", "model evaluation"] +body: + - type: markdown + attributes: + value: > + Thanks for taking the time to fill out this bug report! + This issue template is intended for bug reports where the model evaluation results + (i.e. the generated text) are incorrect or llama.cpp crashes during model evaluation. + If you encountered the issue while using an external UI (e.g. ollama), + please reproduce your issue using one of the examples/binaries in this repository. + The `llama-cli` binary can be used for simple and reproducible model inference. + - type: textarea + id: version + attributes: + label: Name and Version + description: Which version of our software are you running? (use `--version` to get a version string) + placeholder: | + $./llama-cli --version + version: 2999 (42b4109e) + built with cc (Ubuntu 11.4.0-1ubuntu1~22.04) 11.4.0 for x86_64-linux-gnu + validations: + required: true + - type: dropdown + id: operating-system + attributes: + label: Which operating systems do you know to be affected? + multiple: true + options: + - Linux + - Mac + - Windows + - BSD + - Other? (Please let us know in description) + validations: + required: true + - type: dropdown + id: backends + attributes: + label: GGML backends + description: Which GGML backends do you know to be affected? + options: [AMX, BLAS, CPU, CUDA, HIP, Kompute, Metal, Musa, RPC, SYCL, Vulkan] + multiple: true + - type: textarea + id: hardware + attributes: + label: Hardware + description: Which CPUs/GPUs are you using? + placeholder: > + e.g. Ryzen 5950X + 2x RTX 4090 + validations: + required: true + - type: textarea + id: model + attributes: + label: Model + description: > + Which model at which quantization were you using when encountering the bug? + If you downloaded a GGUF file off of Huggingface, please provide a link. + placeholder: > + e.g. Meta LLaMA 3.1 Instruct 8b q4_K_M + validations: + required: false + - type: textarea + id: steps_to_reproduce + attributes: + label: Steps to Reproduce + description: > + Please tell us how to reproduce the bug and any additional information that you think could be useful for fixing it. + If you can narrow down the bug to specific hardware, compile flags, or command line arguments, + that information would be very much appreciated by us. + placeholder: > + e.g. when I run llama-cli with -ngl 99 I get garbled outputs. + When I use -ngl 0 it works correctly. + Here are the exact commands that I used: ... + validations: + required: true + - type: textarea + id: first_bad_commit + attributes: + label: First Bad Commit + description: > + If the bug was not present on an earlier version: when did it start appearing? + If possible, please do a git bisect and identify the exact commit that introduced the bug. + validations: + required: false + - type: textarea + id: logs + attributes: + label: Relevant log output + description: > + Please copy and paste any relevant log output, including the command that you entered and any generated text. + This will be automatically formatted into code, so no need for backticks. + render: shell + validations: + required: true diff --git a/.github/ISSUE_TEMPLATE/019-bug-misc.yml b/.github/ISSUE_TEMPLATE/019-bug-misc.yml new file mode 100644 index 0000000000000..124cdee919089 --- /dev/null +++ b/.github/ISSUE_TEMPLATE/019-bug-misc.yml @@ -0,0 +1,78 @@ +name: Bug (misc.) +description: Something is not working the way it should (and it's not covered by any of the above cases). +title: "Misc. bug: " +labels: ["bug-unconfirmed"] +body: + - type: markdown + attributes: + value: > + Thanks for taking the time to fill out this bug report! + This issue template is intended for miscellaneous bugs that don't fit into any other category. + If you encountered the issue while using an external UI (e.g. ollama), + please reproduce your issue using one of the examples/binaries in this repository. + - type: textarea + id: version + attributes: + label: Name and Version + description: Which version of our software are you running? (use `--version` to get a version string) + placeholder: | + $./llama-cli --version + version: 2999 (42b4109e) + built with cc (Ubuntu 11.4.0-1ubuntu1~22.04) 11.4.0 for x86_64-linux-gnu + validations: + required: true + - type: dropdown + id: operating-system + attributes: + label: Which operating systems do you know to be affected? + multiple: true + options: + - Linux + - Mac + - Windows + - BSD + - Other? (Please let us know in description) + validations: + required: true + - type: dropdown + id: module + attributes: + label: Which llama.cpp modules do you know to be affected? + multiple: true + options: + - libllama (core library) + - llama-cli + - llama-server + - llama-bench + - llama-quantize + - Python/Bash scripts + - Other (Please specify in the next section) + validations: + required: true + - type: textarea + id: steps_to_reproduce + attributes: + label: Steps to Reproduce + description: > + Please tell us how to reproduce the bug and any additional information that you think could be useful for fixing it. + validations: + required: true + - type: textarea + id: first_bad_commit + attributes: + label: First Bad Commit + description: > + If the bug was not present on an earlier version: when did it start appearing? + If possible, please do a git bisect and identify the exact commit that introduced the bug. + validations: + required: false + - type: textarea + id: logs + attributes: + label: Relevant log output + description: > + Please copy and paste any relevant log output, including the command that you entered and any generated text. + This will be automatically formatted into code, so no need for backticks. + render: shell + validations: + required: true diff --git a/.github/ISSUE_TEMPLATE/02-bug-medium.yml b/.github/ISSUE_TEMPLATE/02-bug-medium.yml deleted file mode 100644 index a6285c6f05bac..0000000000000 --- a/.github/ISSUE_TEMPLATE/02-bug-medium.yml +++ /dev/null @@ -1,50 +0,0 @@ -name: Medium Severity Bug -description: Used to report medium severity bugs in llama.cpp (e.g. Malfunctioning Features but generally still useable) -title: "Bug: " -labels: ["bug-unconfirmed", "medium severity"] -body: - - type: markdown - attributes: - value: | - Thanks for taking the time to fill out this bug report! - Please include information about your system, the steps to reproduce the bug, - and the version of llama.cpp that you are using. - If possible, please provide a minimal code example that reproduces the bug. - - type: textarea - id: what-happened - attributes: - label: What happened? - description: Also tell us, what did you expect to happen? - placeholder: Tell us what you see! - validations: - required: true - - type: textarea - id: version - attributes: - label: Name and Version - description: Which executable and which version of our software are you running? (use `--version` to get a version string) - placeholder: | - $./llama-cli --version - version: 2999 (42b4109e) - built with cc (Ubuntu 11.4.0-1ubuntu1~22.04) 11.4.0 for x86_64-linux-gnu - validations: - required: true - - type: dropdown - id: operating-system - attributes: - label: What operating system are you seeing the problem on? - multiple: true - options: - - Linux - - Mac - - Windows - - BSD - - Other? (Please let us know in description) - validations: - required: false - - type: textarea - id: logs - attributes: - label: Relevant log output - description: Please copy and paste any relevant log output. This will be automatically formatted into code, so no need for backticks. - render: shell diff --git a/.github/ISSUE_TEMPLATE/05-enhancement.yml b/.github/ISSUE_TEMPLATE/020-enhancement.yml similarity index 97% rename from .github/ISSUE_TEMPLATE/05-enhancement.yml rename to .github/ISSUE_TEMPLATE/020-enhancement.yml index 58fca73183d41..02dd4f575a686 100644 --- a/.github/ISSUE_TEMPLATE/05-enhancement.yml +++ b/.github/ISSUE_TEMPLATE/020-enhancement.yml @@ -1,5 +1,5 @@ name: Enhancement -description: Used to request enhancements for llama.cpp +description: Used to request enhancements for llama.cpp. title: "Feature Request: " labels: ["enhancement"] body: diff --git a/.github/ISSUE_TEMPLATE/03-bug-high.yml b/.github/ISSUE_TEMPLATE/03-bug-high.yml deleted file mode 100644 index ff816b93769c3..0000000000000 --- a/.github/ISSUE_TEMPLATE/03-bug-high.yml +++ /dev/null @@ -1,50 +0,0 @@ -name: High Severity Bug -description: Used to report high severity bugs in llama.cpp (e.g. Malfunctioning features hindering important common workflow) -title: "Bug: " -labels: ["bug-unconfirmed", "high severity"] -body: - - type: markdown - attributes: - value: | - Thanks for taking the time to fill out this bug report! - Please include information about your system, the steps to reproduce the bug, - and the version of llama.cpp that you are using. - If possible, please provide a minimal code example that reproduces the bug. - - type: textarea - id: what-happened - attributes: - label: What happened? - description: Also tell us, what did you expect to happen? - placeholder: Tell us what you see! - validations: - required: true - - type: textarea - id: version - attributes: - label: Name and Version - description: Which executable and which version of our software are you running? (use `--version` to get a version string) - placeholder: | - $./llama-cli --version - version: 2999 (42b4109e) - built with cc (Ubuntu 11.4.0-1ubuntu1~22.04) 11.4.0 for x86_64-linux-gnu - validations: - required: true - - type: dropdown - id: operating-system - attributes: - label: What operating system are you seeing the problem on? - multiple: true - options: - - Linux - - Mac - - Windows - - BSD - - Other? (Please let us know in description) - validations: - required: false - - type: textarea - id: logs - attributes: - label: Relevant log output - description: Please copy and paste any relevant log output. This will be automatically formatted into code, so no need for backticks. - render: shell diff --git a/.github/ISSUE_TEMPLATE/06-research.yml b/.github/ISSUE_TEMPLATE/030-research.yml similarity index 97% rename from .github/ISSUE_TEMPLATE/06-research.yml rename to .github/ISSUE_TEMPLATE/030-research.yml index 3ae4e9f8caaa4..18975dbbfd0fe 100644 --- a/.github/ISSUE_TEMPLATE/06-research.yml +++ b/.github/ISSUE_TEMPLATE/030-research.yml @@ -1,5 +1,5 @@ name: Research -description: Track new technical research area +description: Track new technical research area. title: "Research: " labels: ["research 🔬"] body: diff --git a/.github/ISSUE_TEMPLATE/04-bug-critical.yml b/.github/ISSUE_TEMPLATE/04-bug-critical.yml deleted file mode 100644 index 7af42a80b3b93..0000000000000 --- a/.github/ISSUE_TEMPLATE/04-bug-critical.yml +++ /dev/null @@ -1,50 +0,0 @@ -name: Critical Severity Bug -description: Used to report critical severity bugs in llama.cpp (e.g. Crashing, Corrupted, Dataloss) -title: "Bug: " -labels: ["bug-unconfirmed", "critical severity"] -body: - - type: markdown - attributes: - value: | - Thanks for taking the time to fill out this bug report! - Please include information about your system, the steps to reproduce the bug, - and the version of llama.cpp that you are using. - If possible, please provide a minimal code example that reproduces the bug. - - type: textarea - id: what-happened - attributes: - label: What happened? - description: Also tell us, what did you expect to happen? - placeholder: Tell us what you see! - validations: - required: true - - type: textarea - id: version - attributes: - label: Name and Version - description: Which executable and which version of our software are you running? (use `--version` to get a version string) - placeholder: | - $./llama-cli --version - version: 2999 (42b4109e) - built with cc (Ubuntu 11.4.0-1ubuntu1~22.04) 11.4.0 for x86_64-linux-gnu - validations: - required: true - - type: dropdown - id: operating-system - attributes: - label: What operating system are you seeing the problem on? - multiple: true - options: - - Linux - - Mac - - Windows - - BSD - - Other? (Please let us know in description) - validations: - required: false - - type: textarea - id: logs - attributes: - label: Relevant log output - description: Please copy and paste any relevant log output. This will be automatically formatted into code, so no need for backticks. - render: shell diff --git a/.github/ISSUE_TEMPLATE/07-refactor.yml b/.github/ISSUE_TEMPLATE/040-refactor.yml similarity index 95% rename from .github/ISSUE_TEMPLATE/07-refactor.yml rename to .github/ISSUE_TEMPLATE/040-refactor.yml index 3a68d3d5355d6..b6e6ab36defd6 100644 --- a/.github/ISSUE_TEMPLATE/07-refactor.yml +++ b/.github/ISSUE_TEMPLATE/040-refactor.yml @@ -1,5 +1,5 @@ name: Refactor (Maintainers) -description: Used to track refactoring opportunities +description: Used to track refactoring opportunities. title: "Refactor: " labels: ["refactor"] body: From 4ebae9712b3c5dec20602a639d8f984fbb34986a Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E8=95=AD=E6=BE=A7=E9=82=A6?= <45505768+shou692199@users.noreply.github.com> Date: Fri, 22 Nov 2024 17:44:08 +0800 Subject: [PATCH 032/245] ci: Update oneAPI runtime dll packaging (#10428) This is the minimum runtime dll dependencies for oneAPI 2025.0 --- .github/workflows/build.yml | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/.github/workflows/build.yml b/.github/workflows/build.yml index 6ef0770f3fa2f..572f91643fbba 100644 --- a/.github/workflows/build.yml +++ b/.github/workflows/build.yml @@ -986,13 +986,14 @@ jobs: if: ${{ ( github.event_name == 'push' && github.ref == 'refs/heads/master' ) || github.event.inputs.create_release == 'true' }} run: | echo "cp oneAPI running time dll files in ${{ env.ONEAPI_ROOT }} to ./build/bin" - cp "${{ env.ONEAPI_ROOT }}/mkl/latest/bin/mkl_sycl_blas.4.dll" ./build/bin + cp "${{ env.ONEAPI_ROOT }}/mkl/latest/bin/mkl_sycl_blas.5.dll" ./build/bin cp "${{ env.ONEAPI_ROOT }}/mkl/latest/bin/mkl_core.2.dll" ./build/bin cp "${{ env.ONEAPI_ROOT }}/mkl/latest/bin/mkl_tbb_thread.2.dll" ./build/bin - cp "${{ env.ONEAPI_ROOT }}/compiler/latest/bin/pi_win_proxy_loader.dll" ./build/bin - cp "${{ env.ONEAPI_ROOT }}/compiler/latest/bin/pi_level_zero.dll" ./build/bin - cp "${{ env.ONEAPI_ROOT }}/compiler/latest/bin/sycl7.dll" ./build/bin + cp "${{ env.ONEAPI_ROOT }}/compiler/latest/bin/ur_loader.dll" ./build/bin + cp "${{ env.ONEAPI_ROOT }}/compiler/latest/bin/ur_win_proxy_loader.dll" ./build/bin + cp "${{ env.ONEAPI_ROOT }}/compiler/latest/bin/ur_adapter_level_zero.dll" ./build/bin + cp "${{ env.ONEAPI_ROOT }}/compiler/latest/bin/sycl8.dll" ./build/bin cp "${{ env.ONEAPI_ROOT }}/compiler/latest/bin/svml_dispmd.dll" ./build/bin cp "${{ env.ONEAPI_ROOT }}/compiler/latest/bin/libmmd.dll" ./build/bin cp "${{ env.ONEAPI_ROOT }}/compiler/latest/bin/libiomp5md.dll" ./build/bin From 50a43a6f44da85b62b7359195f2aa3c18f49c33f Mon Sep 17 00:00:00 2001 From: Diego Devesa Date: Sat, 23 Nov 2024 14:41:12 +0100 Subject: [PATCH 033/245] ggml : do not use ARM features not included in the build (#10457) --- ggml/src/ggml-cpu/ggml-cpu.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/ggml/src/ggml-cpu/ggml-cpu.c b/ggml/src/ggml-cpu/ggml-cpu.c index 0d23669c25bea..4b58254e7d108 100644 --- a/ggml/src/ggml-cpu/ggml-cpu.c +++ b/ggml/src/ggml-cpu/ggml-cpu.c @@ -13896,7 +13896,7 @@ int ggml_cpu_has_vsx(void) { } int ggml_cpu_has_neon(void) { -#if defined(__ARM_ARCH) +#if defined(__ARM_ARCH) && defined(__ARM_NEON) return ggml_arm_arch_features.has_neon; #else return 0; @@ -13904,7 +13904,7 @@ int ggml_cpu_has_neon(void) { } int ggml_cpu_has_sve(void) { -#if defined(__ARM_ARCH) +#if defined(__ARM_ARCH) && defined(__ARM_FEATURE_SVE) return ggml_arm_arch_features.has_sve; #else return 0; @@ -13912,7 +13912,7 @@ int ggml_cpu_has_sve(void) { } int ggml_cpu_has_matmul_int8(void) { -#if defined(__ARM_ARCH) +#if defined(__ARM_ARCH) && defined(__ARM_FEATURE_MATMUL_INT8) return ggml_arm_arch_features.has_i8mm; #else return 0; @@ -13920,7 +13920,7 @@ int ggml_cpu_has_matmul_int8(void) { } int ggml_cpu_get_sve_cnt(void) { -#if defined(__ARM_ARCH) +#if defined(__ARM_ARCH) && defined(__ARM_FEATURE_SVE) return ggml_arm_arch_features.sve_cnt; #else return 0; From 6ffef43ab048a9c3df82b7046293220039ecc8b7 Mon Sep 17 00:00:00 2001 From: momonga <146910567+mmngays@users.noreply.github.com> Date: Sun, 24 Nov 2024 09:09:22 +0900 Subject: [PATCH 034/245] fix gguf-py: Conversion error when multiple licenses are configured (#9807) * fix general.license list to str * fix join license list --------- Co-authored-by: momonga <115213907+mmnga@users.noreply.github.com> --- gguf-py/gguf/metadata.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/gguf-py/gguf/metadata.py b/gguf-py/gguf/metadata.py index 321cbcd4c5507..962c27b204464 100644 --- a/gguf-py/gguf/metadata.py +++ b/gguf-py/gguf/metadata.py @@ -545,7 +545,10 @@ def set_gguf_meta_model(self, gguf_writer: gguf.GGUFWriter): gguf_writer.add_size_label(self.size_label) if self.license is not None: - gguf_writer.add_license(self.license) + if isinstance(self.license, list): + gguf_writer.add_license(",".join(self.license)) + else: + gguf_writer.add_license(self.license) if self.license_name is not None: gguf_writer.add_license_name(self.license_name) if self.license_link is not None: From 68ebe1826ae4f27a5a440c29af7ffa48159062c1 Mon Sep 17 00:00:00 2001 From: Gabe Goodhart Date: Sun, 24 Nov 2024 02:02:34 -0700 Subject: [PATCH 035/245] convert : XLMRoberta Type Vocab Size (#10458) This matches the key in common bert-based embedding models and may have a value other than 1 in it. Branch: XLMRobertaTypeVocabSize Signed-off-by: Gabe Goodhart --- convert_hf_to_gguf.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/convert_hf_to_gguf.py b/convert_hf_to_gguf.py index 9f4b8154b88a8..80a179b86af7e 100755 --- a/convert_hf_to_gguf.py +++ b/convert_hf_to_gguf.py @@ -2707,7 +2707,7 @@ def set_vocab(self): self.gguf_writer.add_token_scores(scores) self.gguf_writer.add_token_types(toktypes) self.gguf_writer.add_add_space_prefix(add_prefix) - self.gguf_writer.add_token_type_count(1) + self.gguf_writer.add_token_type_count(self.hparams.get("type_vocab_size", 1)) self.gguf_writer.add_remove_extra_whitespaces(remove_whitespaces) if precompiled_charsmap: self.gguf_writer.add_precompiled_charsmap(precompiled_charsmap) From 823b211ffddaf80b8a0c93fd25aebdf3bb59f970 Mon Sep 17 00:00:00 2001 From: Diego Devesa Date: Sun, 24 Nov 2024 16:10:26 +0100 Subject: [PATCH 036/245] llama : fix op mul check with command-r-plus (#10476) --- src/llama.cpp | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/llama.cpp b/src/llama.cpp index 407274ba8cc63..11f3c22b73dcc 100644 --- a/src/llama.cpp +++ b/src/llama.cpp @@ -7185,12 +7185,12 @@ static bool weight_buft_supported(const llama_hparams & hparams, ggml_tensor * w } break; case GGML_OP_ADD: { - ggml_tensor * a = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, w->ne[0], 512); + ggml_tensor * a = ggml_new_tensor_4d(ctx, GGML_TYPE_F32, w->ne[0], w->ne[1], w->ne[2], w->ne[3]); op_tensor = ggml_add(ctx, a, w); } break; case GGML_OP_MUL: { - ggml_tensor * a = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, w->ne[0], 512); + ggml_tensor * a = ggml_new_tensor_4d(ctx, GGML_TYPE_F32, w->ne[0], w->ne[1], w->ne[2], w->ne[3]); op_tensor = ggml_mul(ctx, a, w); } break; case GGML_OP_DIV: From 02dc28e517c0a9232a55e03869bdbc639f7f2aee Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Sun, 24 Nov 2024 18:03:25 +0200 Subject: [PATCH 037/245] flake.lock: Update (#10470) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Flake lock file updates: • Updated input 'nixpkgs': 'github:NixOS/nixpkgs/5e4fbfb6b3de1aa2872b76d49fafc942626e2add?narHash=sha256-OZiZ3m8SCMfh3B6bfGC/Bm4x3qc1m2SVEAlkV6iY7Yg%3D' (2024-11-15) → 'github:NixOS/nixpkgs/23e89b7da85c3640bbc2173fe04f4bd114342367?narHash=sha256-y/MEyuJ5oBWrWAic/14LaIr/u5E0wRVzyYsouYY3W6w%3D' (2024-11-19) Co-authored-by: github-actions[bot] --- flake.lock | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/flake.lock b/flake.lock index ee8cf07e37220..d114f4422a36a 100644 --- a/flake.lock +++ b/flake.lock @@ -20,11 +20,11 @@ }, "nixpkgs": { "locked": { - "lastModified": 1731676054, - "narHash": "sha256-OZiZ3m8SCMfh3B6bfGC/Bm4x3qc1m2SVEAlkV6iY7Yg=", + "lastModified": 1732014248, + "narHash": "sha256-y/MEyuJ5oBWrWAic/14LaIr/u5E0wRVzyYsouYY3W6w=", "owner": "NixOS", "repo": "nixpkgs", - "rev": "5e4fbfb6b3de1aa2872b76d49fafc942626e2add", + "rev": "23e89b7da85c3640bbc2173fe04f4bd114342367", "type": "github" }, "original": { From 611f67975a9208814289434137eff1ed48f1524b Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Mon, 25 Nov 2024 09:58:41 +0200 Subject: [PATCH 038/245] speculative : refactor and add a simpler example (#10362) * speculative : refactor and add a simpler example ggml-ci * speculative : clean-up and add comments and TODOs [no ci] * speculative : manage context in common_speculative ggml-ci * speculative : simplify ggml-ci * speculative : simplify (cont) ggml-ci * speculative : add --draft-min CLI arg * speculative : minor fixup * make : build fixes * speculative : do not redraft previous drafts ggml-ci * speculative : fix the draft sampling ggml-ci * speculative : fix compile warning * common : refactor args ggml-ci * common : change defaults [no ci] * common : final touches ggml-ci --- Makefile | 1 + common/CMakeLists.txt | 2 + common/arg.cpp | 438 +++++++++--------- common/common.cpp | 76 ++- common/common.h | 41 +- common/sampling.cpp | 45 +- common/sampling.h | 23 +- common/speculative.cpp | 269 +++++++++++ common/speculative.h | 28 ++ examples/CMakeLists.txt | 1 + examples/batched/batched.cpp | 8 +- examples/infill/infill.cpp | 2 +- examples/llava/llava-cli.cpp | 2 +- examples/llava/minicpmv-cli.cpp | 2 +- examples/lookahead/lookahead.cpp | 2 +- examples/lookup/lookup-stats.cpp | 3 +- examples/lookup/lookup.cpp | 4 +- examples/main/main.cpp | 2 +- examples/parallel/parallel.cpp | 2 +- examples/retrieval/retrieval.cpp | 4 +- examples/save-load-state/save-load-state.cpp | 8 +- examples/server/server.cpp | 10 +- examples/server/utils.hpp | 57 --- examples/speculative-simple/CMakeLists.txt | 5 + examples/speculative-simple/README.md | 12 + .../speculative-simple/speculative-simple.cpp | 273 +++++++++++ examples/speculative/speculative.cpp | 30 +- tests/test-arg-parser.cpp | 4 +- 28 files changed, 1028 insertions(+), 326 deletions(-) create mode 100644 common/speculative.cpp create mode 100644 common/speculative.h create mode 100644 examples/speculative-simple/CMakeLists.txt create mode 100644 examples/speculative-simple/README.md create mode 100644 examples/speculative-simple/speculative-simple.cpp diff --git a/Makefile b/Makefile index 5c899438515e1..dd6d864ad513a 100644 --- a/Makefile +++ b/Makefile @@ -966,6 +966,7 @@ OBJ_COMMON = \ $(DIR_COMMON)/console.o \ $(DIR_COMMON)/ngram-cache.o \ $(DIR_COMMON)/sampling.o \ + $(DIR_COMMON)/speculative.o \ $(DIR_COMMON)/build-info.o \ $(DIR_COMMON)/json-schema-to-grammar.o diff --git a/common/CMakeLists.txt b/common/CMakeLists.txt index 5ab1ffa1922aa..62a8a7db5652f 100644 --- a/common/CMakeLists.txt +++ b/common/CMakeLists.txt @@ -66,6 +66,8 @@ add_library(${TARGET} STATIC ngram-cache.h sampling.cpp sampling.h + speculative.cpp + speculative.h ) if (BUILD_SHARED_LIBS) diff --git a/common/arg.cpp b/common/arg.cpp index 4115b2f7511d3..32240f21f2469 100644 --- a/common/arg.cpp +++ b/common/arg.cpp @@ -233,10 +233,11 @@ static bool common_params_parse_ex(int argc, char ** argv, common_params_context } } - postprocess_cpu_params(params.cpuparams, nullptr); + postprocess_cpu_params(params.cpuparams, nullptr); postprocess_cpu_params(params.cpuparams_batch, ¶ms.cpuparams); - postprocess_cpu_params(params.draft_cpuparams, ¶ms.cpuparams); - postprocess_cpu_params(params.draft_cpuparams_batch, ¶ms.cpuparams_batch); + + postprocess_cpu_params(params.speculative.cpuparams, ¶ms.cpuparams); + postprocess_cpu_params(params.speculative.cpuparams_batch, ¶ms.cpuparams_batch); if (params.prompt_cache_all && (params.interactive || params.interactive_first)) { throw std::invalid_argument("error: --prompt-cache-all not supported in interactive mode yet\n"); @@ -251,7 +252,7 @@ static bool common_params_parse_ex(int argc, char ** argv, common_params_context for (auto & antiprompt : params.antiprompt) { string_process_escapes(antiprompt); } - for (auto & seq_breaker : params.sparams.dry_sequence_breakers) { + for (auto & seq_breaker : params.sampling.dry_sequence_breakers) { string_process_escapes(seq_breaker); } } @@ -329,7 +330,7 @@ common_params_context common_params_parser_init(common_params & params, llama_ex std::string sampler_type_chars; std::string sampler_type_names; - for (const auto & sampler : params.sparams.samplers) { + for (const auto & sampler : params.sampling.samplers) { sampler_type_chars += common_sampler_type_to_chr(sampler); sampler_type_names += common_sampler_type_to_str(sampler) + ";"; } @@ -407,26 +408,6 @@ common_params_context common_params_parser_init(common_params & params, llama_ex } } )); - add_opt(common_arg( - {"-td", "--threads-draft"}, "N", - "number of threads to use during generation (default: same as --threads)", - [](common_params & params, int value) { - params.draft_cpuparams.n_threads = value; - if (params.draft_cpuparams.n_threads <= 0) { - params.draft_cpuparams.n_threads = std::thread::hardware_concurrency(); - } - } - ).set_examples({LLAMA_EXAMPLE_SPECULATIVE})); - add_opt(common_arg( - {"-tbd", "--threads-batch-draft"}, "N", - "number of threads to use during batch and prompt processing (default: same as --threads-draft)", - [](common_params & params, int value) { - params.draft_cpuparams_batch.n_threads = value; - if (params.draft_cpuparams_batch.n_threads <= 0) { - params.draft_cpuparams_batch.n_threads = std::thread::hardware_concurrency(); - } - } - ).set_examples({LLAMA_EXAMPLE_SPECULATIVE})); add_opt(common_arg( {"-C", "--cpu-mask"}, "M", "CPU affinity mask: arbitrarily long hex. Complements cpu-range (default: \"\")", @@ -515,108 +496,6 @@ common_params_context common_params_parser_init(common_params & params, llama_ex params.cpuparams_batch.poll = value; } )); - add_opt(common_arg( - {"-Cd", "--cpu-mask-draft"}, "M", - "Draft model CPU affinity mask. Complements cpu-range-draft (default: same as --cpu-mask)", - [](common_params & params, const std::string & mask) { - params.draft_cpuparams.mask_valid = true; - if (!parse_cpu_mask(mask, params.draft_cpuparams.cpumask)) { - throw std::invalid_argument("invalid cpumask"); - } - } - ).set_examples({LLAMA_EXAMPLE_SPECULATIVE})); - add_opt(common_arg( - {"-Crd", "--cpu-range-draft"}, "lo-hi", - "Ranges of CPUs for affinity. Complements --cpu-mask-draft", - [](common_params & params, const std::string & range) { - params.draft_cpuparams.mask_valid = true; - if (!parse_cpu_range(range, params.draft_cpuparams.cpumask)) { - throw std::invalid_argument("invalid range"); - } - } - ).set_examples({LLAMA_EXAMPLE_SPECULATIVE})); - add_opt(common_arg( - {"--cpu-strict-draft"}, "<0|1>", - "Use strict CPU placement for draft model (default: same as --cpu-strict)", - [](common_params & params, int value) { - params.draft_cpuparams.strict_cpu = value; - } - ).set_examples({LLAMA_EXAMPLE_SPECULATIVE})); - add_opt(common_arg( - {"--prio-draft"}, "N", - string_format("set draft process/thread priority : 0-normal, 1-medium, 2-high, 3-realtime (default: %d)\n", params.draft_cpuparams.priority), - [](common_params & params, int prio) { - if (prio < 0 || prio > 3) { - throw std::invalid_argument("invalid value"); - } - params.draft_cpuparams.priority = (enum ggml_sched_priority) prio; - } - ).set_examples({LLAMA_EXAMPLE_SPECULATIVE})); - add_opt(common_arg( - {"--poll-draft"}, "<0|1>", - "Use polling to wait for draft model work (default: same as --poll])", - [](common_params & params, int value) { - params.draft_cpuparams.poll = value; - } - ).set_examples({LLAMA_EXAMPLE_SPECULATIVE})); - add_opt(common_arg( - {"-Cbd", "--cpu-mask-batch-draft"}, "M", - "Draft model CPU affinity mask. Complements cpu-range-draft (default: same as --cpu-mask)", - [](common_params & params, const std::string & mask) { - params.draft_cpuparams_batch.mask_valid = true; - if (!parse_cpu_mask(mask, params.draft_cpuparams_batch.cpumask)) { - throw std::invalid_argument("invalid cpumask"); - } - } - ).set_examples({LLAMA_EXAMPLE_SPECULATIVE})); - add_opt(common_arg( - {"-Crbd", "--cpu-range-batch-draft"}, "lo-hi", - "Ranges of CPUs for affinity. Complements --cpu-mask-draft-batch)", - [](common_params & params, const std::string & range) { - params.draft_cpuparams_batch.mask_valid = true; - if (!parse_cpu_range(range, params.draft_cpuparams_batch.cpumask)) { - throw std::invalid_argument("invalid cpumask"); - } - } - ).set_examples({LLAMA_EXAMPLE_SPECULATIVE})); - add_opt(common_arg( - {"--cpu-strict-batch-draft"}, "<0|1>", - "Use strict CPU placement for draft model (default: --cpu-strict-draft)", - [](common_params & params, int value) { - params.draft_cpuparams_batch.strict_cpu = value; - } - ).set_examples({LLAMA_EXAMPLE_SPECULATIVE})); - add_opt(common_arg( - {"--prio-batch-draft"}, "N", - string_format("set draft process/thread priority : 0-normal, 1-medium, 2-high, 3-realtime (default: %d)\n", params.draft_cpuparams_batch.priority), - [](common_params & params, int prio) { - if (prio < 0 || prio > 3) { - throw std::invalid_argument("invalid value"); - } - params.draft_cpuparams_batch.priority = (enum ggml_sched_priority) prio; - } - ).set_examples({LLAMA_EXAMPLE_SPECULATIVE})); - add_opt(common_arg( - {"--poll-batch-draft"}, "<0|1>", - "Use polling to wait for draft model work (default: --poll-draft)", - [](common_params & params, int value) { - params.draft_cpuparams_batch.poll = value; - } - ).set_examples({LLAMA_EXAMPLE_SPECULATIVE})); - add_opt(common_arg( - {"--draft"}, "N", - string_format("number of tokens to draft for speculative decoding (default: %d)", params.n_draft), - [](common_params & params, int value) { - params.n_draft = value; - } - ).set_examples({LLAMA_EXAMPLE_SPECULATIVE, LLAMA_EXAMPLE_LOOKUP})); - add_opt(common_arg( - {"-ps", "--p-split"}, "N", - string_format("speculative decoding split probability (default: %.1f)", (double)params.p_split), - [](common_params & params, const std::string & value) { - params.p_split = std::stof(value); - } - ).set_examples({LLAMA_EXAMPLE_SPECULATIVE})); add_opt(common_arg( {"-lcs", "--lookup-cache-static"}, "FNAME", "path to static lookup cache to use for lookup decoding (not updated by generation)", @@ -701,7 +580,7 @@ common_params_context common_params_parser_init(common_params & params, llama_ex string_format("disable internal libllama performance timings (default: %s)", params.no_perf ? "true" : "false"), [](common_params & params) { params.no_perf = true; - params.sparams.no_perf = true; + params.sampling.no_perf = true; } ).set_env("LLAMA_ARG_NO_PERF")); add_opt(common_arg( @@ -883,155 +762,155 @@ common_params_context common_params_parser_init(common_params & params, llama_ex string_format("samplers that will be used for generation in the order, separated by \';\'\n(default: %s)", sampler_type_names.c_str()), [](common_params & params, const std::string & value) { const auto sampler_names = string_split(value, ';'); - params.sparams.samplers = common_sampler_types_from_names(sampler_names, true); + params.sampling.samplers = common_sampler_types_from_names(sampler_names, true); } ).set_sparam()); add_opt(common_arg( {"-s", "--seed"}, "SEED", - string_format("RNG seed (default: %d, use random seed for %d)", params.sparams.seed, LLAMA_DEFAULT_SEED), + string_format("RNG seed (default: %d, use random seed for %d)", params.sampling.seed, LLAMA_DEFAULT_SEED), [](common_params & params, const std::string & value) { - params.sparams.seed = std::stoul(value); + params.sampling.seed = std::stoul(value); } ).set_sparam()); add_opt(common_arg( {"--sampling-seq"}, "SEQUENCE", string_format("simplified sequence for samplers that will be used (default: %s)", sampler_type_chars.c_str()), [](common_params & params, const std::string & value) { - params.sparams.samplers = common_sampler_types_from_chars(value); + params.sampling.samplers = common_sampler_types_from_chars(value); } ).set_sparam()); add_opt(common_arg( {"--ignore-eos"}, "ignore end of stream token and continue generating (implies --logit-bias EOS-inf)", [](common_params & params) { - params.sparams.ignore_eos = true; + params.sampling.ignore_eos = true; } ).set_sparam()); add_opt(common_arg( {"--penalize-nl"}, - string_format("penalize newline tokens (default: %s)", params.sparams.penalize_nl ? "true" : "false"), + string_format("penalize newline tokens (default: %s)", params.sampling.penalize_nl ? "true" : "false"), [](common_params & params) { - params.sparams.penalize_nl = true; + params.sampling.penalize_nl = true; } ).set_sparam()); add_opt(common_arg( {"--temp"}, "N", - string_format("temperature (default: %.1f)", (double)params.sparams.temp), + string_format("temperature (default: %.1f)", (double)params.sampling.temp), [](common_params & params, const std::string & value) { - params.sparams.temp = std::stof(value); - params.sparams.temp = std::max(params.sparams.temp, 0.0f); + params.sampling.temp = std::stof(value); + params.sampling.temp = std::max(params.sampling.temp, 0.0f); } ).set_sparam()); add_opt(common_arg( {"--top-k"}, "N", - string_format("top-k sampling (default: %d, 0 = disabled)", params.sparams.top_k), + string_format("top-k sampling (default: %d, 0 = disabled)", params.sampling.top_k), [](common_params & params, int value) { - params.sparams.top_k = value; + params.sampling.top_k = value; } ).set_sparam()); add_opt(common_arg( {"--top-p"}, "N", - string_format("top-p sampling (default: %.1f, 1.0 = disabled)", (double)params.sparams.top_p), + string_format("top-p sampling (default: %.1f, 1.0 = disabled)", (double)params.sampling.top_p), [](common_params & params, const std::string & value) { - params.sparams.top_p = std::stof(value); + params.sampling.top_p = std::stof(value); } ).set_sparam()); add_opt(common_arg( {"--min-p"}, "N", - string_format("min-p sampling (default: %.1f, 0.0 = disabled)", (double)params.sparams.min_p), + string_format("min-p sampling (default: %.1f, 0.0 = disabled)", (double)params.sampling.min_p), [](common_params & params, const std::string & value) { - params.sparams.min_p = std::stof(value); + params.sampling.min_p = std::stof(value); } ).set_sparam()); add_opt(common_arg( {"--xtc-probability"}, "N", - string_format("xtc probability (default: %.1f, 0.0 = disabled)", (double)params.sparams.xtc_probability), + string_format("xtc probability (default: %.1f, 0.0 = disabled)", (double)params.sampling.xtc_probability), [](common_params & params, const std::string & value) { - params.sparams.xtc_probability = std::stof(value); + params.sampling.xtc_probability = std::stof(value); } ).set_sparam()); add_opt(common_arg( {"--xtc-threshold"}, "N", - string_format("xtc threshold (default: %.1f, 1.0 = disabled)", (double)params.sparams.xtc_threshold), + string_format("xtc threshold (default: %.1f, 1.0 = disabled)", (double)params.sampling.xtc_threshold), [](common_params & params, const std::string & value) { - params.sparams.xtc_threshold = std::stof(value); + params.sampling.xtc_threshold = std::stof(value); } ).set_sparam()); add_opt(common_arg( {"--typical"}, "N", - string_format("locally typical sampling, parameter p (default: %.1f, 1.0 = disabled)", (double)params.sparams.typ_p), + string_format("locally typical sampling, parameter p (default: %.1f, 1.0 = disabled)", (double)params.sampling.typ_p), [](common_params & params, const std::string & value) { - params.sparams.typ_p = std::stof(value); + params.sampling.typ_p = std::stof(value); } ).set_sparam()); add_opt(common_arg( {"--repeat-last-n"}, "N", - string_format("last n tokens to consider for penalize (default: %d, 0 = disabled, -1 = ctx_size)", params.sparams.penalty_last_n), + string_format("last n tokens to consider for penalize (default: %d, 0 = disabled, -1 = ctx_size)", params.sampling.penalty_last_n), [](common_params & params, int value) { - params.sparams.penalty_last_n = value; - params.sparams.n_prev = std::max(params.sparams.n_prev, params.sparams.penalty_last_n); + params.sampling.penalty_last_n = value; + params.sampling.n_prev = std::max(params.sampling.n_prev, params.sampling.penalty_last_n); } ).set_sparam()); add_opt(common_arg( {"--repeat-penalty"}, "N", - string_format("penalize repeat sequence of tokens (default: %.1f, 1.0 = disabled)", (double)params.sparams.penalty_repeat), + string_format("penalize repeat sequence of tokens (default: %.1f, 1.0 = disabled)", (double)params.sampling.penalty_repeat), [](common_params & params, const std::string & value) { - params.sparams.penalty_repeat = std::stof(value); + params.sampling.penalty_repeat = std::stof(value); } ).set_sparam()); add_opt(common_arg( {"--presence-penalty"}, "N", - string_format("repeat alpha presence penalty (default: %.1f, 0.0 = disabled)", (double)params.sparams.penalty_present), + string_format("repeat alpha presence penalty (default: %.1f, 0.0 = disabled)", (double)params.sampling.penalty_present), [](common_params & params, const std::string & value) { - params.sparams.penalty_present = std::stof(value); + params.sampling.penalty_present = std::stof(value); } ).set_sparam()); add_opt(common_arg( {"--frequency-penalty"}, "N", - string_format("repeat alpha frequency penalty (default: %.1f, 0.0 = disabled)", (double)params.sparams.penalty_freq), + string_format("repeat alpha frequency penalty (default: %.1f, 0.0 = disabled)", (double)params.sampling.penalty_freq), [](common_params & params, const std::string & value) { - params.sparams.penalty_freq = std::stof(value); + params.sampling.penalty_freq = std::stof(value); } ).set_sparam()); add_opt(common_arg( {"--dry-multiplier"}, "N", - string_format("set DRY sampling multiplier (default: %.1f, 0.0 = disabled)", (double)params.sparams.dry_multiplier), + string_format("set DRY sampling multiplier (default: %.1f, 0.0 = disabled)", (double)params.sampling.dry_multiplier), [](common_params & params, const std::string & value) { - params.sparams.dry_multiplier = std::stof(value); + params.sampling.dry_multiplier = std::stof(value); } ).set_sparam()); add_opt(common_arg( {"--dry-base"}, "N", - string_format("set DRY sampling base value (default: %.2f)", (double)params.sparams.dry_base), + string_format("set DRY sampling base value (default: %.2f)", (double)params.sampling.dry_base), [](common_params & params, const std::string & value) { float potential_base = std::stof(value); if (potential_base >= 1.0f) { - params.sparams.dry_base = potential_base; + params.sampling.dry_base = potential_base; } } ).set_sparam()); add_opt(common_arg( {"--dry-allowed-length"}, "N", - string_format("set allowed length for DRY sampling (default: %d)", params.sparams.dry_allowed_length), + string_format("set allowed length for DRY sampling (default: %d)", params.sampling.dry_allowed_length), [](common_params & params, int value) { - params.sparams.dry_allowed_length = value; + params.sampling.dry_allowed_length = value; } ).set_sparam()); add_opt(common_arg( {"--dry-penalty-last-n"}, "N", - string_format("set DRY penalty for the last n tokens (default: %d, 0 = disable, -1 = context size)", params.sparams.dry_penalty_last_n), + string_format("set DRY penalty for the last n tokens (default: %d, 0 = disable, -1 = context size)", params.sampling.dry_penalty_last_n), [](common_params & params, int value) { - params.sparams.dry_penalty_last_n = value; + params.sampling.dry_penalty_last_n = value; } ).set_sparam()); add_opt(common_arg( {"--dry-sequence-breaker"}, "STRING", string_format("add sequence breaker for DRY sampling, clearing out default breakers (%s) in the process; use \"none\" to not use any sequence breakers\n", - params.sparams.dry_sequence_breakers.empty() ? "none" : - std::accumulate(std::next(params.sparams.dry_sequence_breakers.begin()), - params.sparams.dry_sequence_breakers.end(), - std::string("'") + (params.sparams.dry_sequence_breakers[0] == "\n" ? "\\n" : params.sparams.dry_sequence_breakers[0]) + "'", + params.sampling.dry_sequence_breakers.empty() ? "none" : + std::accumulate(std::next(params.sampling.dry_sequence_breakers.begin()), + params.sampling.dry_sequence_breakers.end(), + std::string("'") + (params.sampling.dry_sequence_breakers[0] == "\n" ? "\\n" : params.sampling.dry_sequence_breakers[0]) + "'", [](const std::string& a, const std::string& b) { std::string formatted_b = (b == "\n") ? "\\n" : b; return a + ", '" + formatted_b + "'"; @@ -1040,51 +919,51 @@ common_params_context common_params_parser_init(common_params & params, llama_ex static bool defaults_cleared = false; if (!defaults_cleared) { - params.sparams.dry_sequence_breakers.clear(); + params.sampling.dry_sequence_breakers.clear(); defaults_cleared = true; } if (value == "none") { - params.sparams.dry_sequence_breakers.clear(); + params.sampling.dry_sequence_breakers.clear(); } else { - params.sparams.dry_sequence_breakers.emplace_back(value); + params.sampling.dry_sequence_breakers.emplace_back(value); } } ).set_sparam()); add_opt(common_arg( {"--dynatemp-range"}, "N", - string_format("dynamic temperature range (default: %.1f, 0.0 = disabled)", (double)params.sparams.dynatemp_range), + string_format("dynamic temperature range (default: %.1f, 0.0 = disabled)", (double)params.sampling.dynatemp_range), [](common_params & params, const std::string & value) { - params.sparams.dynatemp_range = std::stof(value); + params.sampling.dynatemp_range = std::stof(value); } ).set_sparam()); add_opt(common_arg( {"--dynatemp-exp"}, "N", - string_format("dynamic temperature exponent (default: %.1f)", (double)params.sparams.dynatemp_exponent), + string_format("dynamic temperature exponent (default: %.1f)", (double)params.sampling.dynatemp_exponent), [](common_params & params, const std::string & value) { - params.sparams.dynatemp_exponent = std::stof(value); + params.sampling.dynatemp_exponent = std::stof(value); } ).set_sparam()); add_opt(common_arg( {"--mirostat"}, "N", string_format("use Mirostat sampling.\nTop K, Nucleus and Locally Typical samplers are ignored if used.\n" - "(default: %d, 0 = disabled, 1 = Mirostat, 2 = Mirostat 2.0)", params.sparams.mirostat), + "(default: %d, 0 = disabled, 1 = Mirostat, 2 = Mirostat 2.0)", params.sampling.mirostat), [](common_params & params, int value) { - params.sparams.mirostat = value; + params.sampling.mirostat = value; } ).set_sparam()); add_opt(common_arg( {"--mirostat-lr"}, "N", - string_format("Mirostat learning rate, parameter eta (default: %.1f)", (double)params.sparams.mirostat_eta), + string_format("Mirostat learning rate, parameter eta (default: %.1f)", (double)params.sampling.mirostat_eta), [](common_params & params, const std::string & value) { - params.sparams.mirostat_eta = std::stof(value); + params.sampling.mirostat_eta = std::stof(value); } ).set_sparam()); add_opt(common_arg( {"--mirostat-ent"}, "N", - string_format("Mirostat target entropy, parameter tau (default: %.1f)", (double)params.sparams.mirostat_tau), + string_format("Mirostat target entropy, parameter tau (default: %.1f)", (double)params.sampling.mirostat_tau), [](common_params & params, const std::string & value) { - params.sparams.mirostat_tau = std::stof(value); + params.sampling.mirostat_tau = std::stof(value); } ).set_sparam()); add_opt(common_arg( @@ -1100,7 +979,7 @@ common_params_context common_params_parser_init(common_params & params, llama_ex try { if (ss >> key && ss >> sign && std::getline(ss, value_str) && (sign == '+' || sign == '-')) { const float bias = std::stof(value_str) * ((sign == '-') ? -1.0f : 1.0f); - params.sparams.logit_bias.push_back({key, bias}); + params.sampling.logit_bias.push_back({key, bias}); } else { throw std::invalid_argument("invalid input format"); } @@ -1111,9 +990,9 @@ common_params_context common_params_parser_init(common_params & params, llama_ex ).set_sparam()); add_opt(common_arg( {"--grammar"}, "GRAMMAR", - string_format("BNF-like grammar to constrain generations (see samples in grammars/ dir) (default: '%s')", params.sparams.grammar.c_str()), + string_format("BNF-like grammar to constrain generations (see samples in grammars/ dir) (default: '%s')", params.sampling.grammar.c_str()), [](common_params & params, const std::string & value) { - params.sparams.grammar = value; + params.sampling.grammar = value; } ).set_sparam()); add_opt(common_arg( @@ -1127,7 +1006,7 @@ common_params_context common_params_parser_init(common_params & params, llama_ex std::copy( std::istreambuf_iterator(file), std::istreambuf_iterator(), - std::back_inserter(params.sparams.grammar) + std::back_inserter(params.sampling.grammar) ); } ).set_sparam()); @@ -1135,7 +1014,7 @@ common_params_context common_params_parser_init(common_params & params, llama_ex {"-j", "--json-schema"}, "SCHEMA", "JSON schema to constrain generations (https://json-schema.org/), e.g. `{}` for any JSON object\nFor schemas w/ external $refs, use --grammar + example/json_schema_to_grammar.py instead", [](common_params & params, const std::string & value) { - params.sparams.grammar = json_schema_to_grammar(json::parse(value)); + params.sampling.grammar = json_schema_to_grammar(json::parse(value)); } ).set_sparam()); add_opt(common_arg( @@ -1444,17 +1323,6 @@ common_params_context common_params_parser_init(common_params & params, llama_ex } } ).set_env("LLAMA_ARG_N_GPU_LAYERS")); - add_opt(common_arg( - {"-ngld", "--gpu-layers-draft", "--n-gpu-layers-draft"}, "N", - "number of layers to store in VRAM for the draft model", - [](common_params & params, int value) { - params.n_gpu_layers_draft = value; - if (!llama_supports_gpu_offload()) { - fprintf(stderr, "warning: not compiled with GPU offload support, --gpu-layers-draft option will be ignored\n"); - fprintf(stderr, "warning: see main README.md for information on enabling GPU BLAS support\n"); - } - } - ).set_examples({LLAMA_EXAMPLE_SPECULATIVE})); add_opt(common_arg( {"-sm", "--split-mode"}, "{none,layer,row}", "how to split the model across multiple GPUs, one of:\n" @@ -1593,13 +1461,6 @@ common_params_context common_params_parser_init(common_params & params, llama_ex params.model = value; } ).set_examples({LLAMA_EXAMPLE_COMMON, LLAMA_EXAMPLE_EXPORT_LORA}).set_env("LLAMA_ARG_MODEL")); - add_opt(common_arg( - {"-md", "--model-draft"}, "FNAME", - "draft model for speculative decoding (default: unused)", - [](common_params & params, const std::string & value) { - params.model_draft = value; - } - ).set_examples({LLAMA_EXAMPLE_SPECULATIVE})); add_opt(common_arg( {"-mu", "--model-url"}, "MODEL_URL", "model download url (default: unused)", @@ -2037,5 +1898,168 @@ common_params_context common_params_parser_init(common_params & params, llama_ex } ).set_env("LLAMA_LOG_TIMESTAMPS")); + // speculative parameters + add_opt(common_arg( + {"-td", "--threads-draft"}, "N", + "number of threads to use during generation (default: same as --threads)", + [](common_params & params, int value) { + params.speculative.cpuparams.n_threads = value; + if (params.speculative.cpuparams.n_threads <= 0) { + params.speculative.cpuparams.n_threads = std::thread::hardware_concurrency(); + } + } + ).set_examples({LLAMA_EXAMPLE_SPECULATIVE})); + add_opt(common_arg( + {"-tbd", "--threads-batch-draft"}, "N", + "number of threads to use during batch and prompt processing (default: same as --threads-draft)", + [](common_params & params, int value) { + params.speculative.cpuparams_batch.n_threads = value; + if (params.speculative.cpuparams_batch.n_threads <= 0) { + params.speculative.cpuparams_batch.n_threads = std::thread::hardware_concurrency(); + } + } + ).set_examples({LLAMA_EXAMPLE_SPECULATIVE})); + add_opt(common_arg( + {"-Cd", "--cpu-mask-draft"}, "M", + "Draft model CPU affinity mask. Complements cpu-range-draft (default: same as --cpu-mask)", + [](common_params & params, const std::string & mask) { + params.speculative.cpuparams.mask_valid = true; + if (!parse_cpu_mask(mask, params.speculative.cpuparams.cpumask)) { + throw std::invalid_argument("invalid cpumask"); + } + } + ).set_examples({LLAMA_EXAMPLE_SPECULATIVE})); + add_opt(common_arg( + {"-Crd", "--cpu-range-draft"}, "lo-hi", + "Ranges of CPUs for affinity. Complements --cpu-mask-draft", + [](common_params & params, const std::string & range) { + params.speculative.cpuparams.mask_valid = true; + if (!parse_cpu_range(range, params.speculative.cpuparams.cpumask)) { + throw std::invalid_argument("invalid range"); + } + } + ).set_examples({LLAMA_EXAMPLE_SPECULATIVE})); + add_opt(common_arg( + {"--cpu-strict-draft"}, "<0|1>", + "Use strict CPU placement for draft model (default: same as --cpu-strict)", + [](common_params & params, int value) { + params.speculative.cpuparams.strict_cpu = value; + } + ).set_examples({LLAMA_EXAMPLE_SPECULATIVE})); + add_opt(common_arg( + {"--prio-draft"}, "N", + string_format("set draft process/thread priority : 0-normal, 1-medium, 2-high, 3-realtime (default: %d)\n", params.speculative.cpuparams.priority), + [](common_params & params, int prio) { + if (prio < 0 || prio > 3) { + throw std::invalid_argument("invalid value"); + } + params.speculative.cpuparams.priority = (enum ggml_sched_priority) prio; + } + ).set_examples({LLAMA_EXAMPLE_SPECULATIVE})); + add_opt(common_arg( + {"--poll-draft"}, "<0|1>", + "Use polling to wait for draft model work (default: same as --poll])", + [](common_params & params, int value) { + params.speculative.cpuparams.poll = value; + } + ).set_examples({LLAMA_EXAMPLE_SPECULATIVE})); + add_opt(common_arg( + {"-Cbd", "--cpu-mask-batch-draft"}, "M", + "Draft model CPU affinity mask. Complements cpu-range-draft (default: same as --cpu-mask)", + [](common_params & params, const std::string & mask) { + params.speculative.cpuparams_batch.mask_valid = true; + if (!parse_cpu_mask(mask, params.speculative.cpuparams_batch.cpumask)) { + throw std::invalid_argument("invalid cpumask"); + } + } + ).set_examples({LLAMA_EXAMPLE_SPECULATIVE})); + add_opt(common_arg( + {"-Crbd", "--cpu-range-batch-draft"}, "lo-hi", + "Ranges of CPUs for affinity. Complements --cpu-mask-draft-batch)", + [](common_params & params, const std::string & range) { + params.speculative.cpuparams_batch.mask_valid = true; + if (!parse_cpu_range(range, params.speculative.cpuparams_batch.cpumask)) { + throw std::invalid_argument("invalid cpumask"); + } + } + ).set_examples({LLAMA_EXAMPLE_SPECULATIVE})); + add_opt(common_arg( + {"--cpu-strict-batch-draft"}, "<0|1>", + "Use strict CPU placement for draft model (default: --cpu-strict-draft)", + [](common_params & params, int value) { + params.speculative.cpuparams_batch.strict_cpu = value; + } + ).set_examples({LLAMA_EXAMPLE_SPECULATIVE})); + add_opt(common_arg( + {"--prio-batch-draft"}, "N", + string_format("set draft process/thread priority : 0-normal, 1-medium, 2-high, 3-realtime (default: %d)\n", params.speculative.cpuparams_batch.priority), + [](common_params & params, int prio) { + if (prio < 0 || prio > 3) { + throw std::invalid_argument("invalid value"); + } + params.speculative.cpuparams_batch.priority = (enum ggml_sched_priority) prio; + } + ).set_examples({LLAMA_EXAMPLE_SPECULATIVE})); + add_opt(common_arg( + {"--poll-batch-draft"}, "<0|1>", + "Use polling to wait for draft model work (default: --poll-draft)", + [](common_params & params, int value) { + params.speculative.cpuparams_batch.poll = value; + } + ).set_examples({LLAMA_EXAMPLE_SPECULATIVE})); + add_opt(common_arg( + {"--draft-max", "--draft", "--draft-n"}, "N", + string_format("number of tokens to draft for speculative decoding (default: %d)", params.speculative.n_max), + [](common_params & params, int value) { + params.speculative.n_max = value; + } + ).set_examples({LLAMA_EXAMPLE_SPECULATIVE, LLAMA_EXAMPLE_LOOKUP, LLAMA_EXAMPLE_SERVER})); + add_opt(common_arg( + {"--draft-min", "--draft-n-min"}, "N", + string_format("minimum number of draft tokens to use for speculative decoding (default: %d)", params.speculative.n_min), + [](common_params & params, int value) { + params.speculative.n_min = value; + } + ).set_examples({LLAMA_EXAMPLE_SPECULATIVE, LLAMA_EXAMPLE_LOOKUP, LLAMA_EXAMPLE_SERVER})); + add_opt(common_arg( + {"--draft-p-split"}, "P", + string_format("speculative decoding split probability (default: %.1f)", (double)params.speculative.p_split), + [](common_params & params, const std::string & value) { + params.speculative.p_split = std::stof(value); + } + ).set_examples({LLAMA_EXAMPLE_SPECULATIVE})); + add_opt(common_arg( + {"--draft-p-min"}, "P", + string_format("minimum speculative decoding probability (greedy) (default: %.1f)", (double)params.speculative.p_min), + [](common_params & params, const std::string & value) { + params.speculative.p_min = std::stof(value); + } + ).set_examples({LLAMA_EXAMPLE_SPECULATIVE, LLAMA_EXAMPLE_SERVER})); + add_opt(common_arg( + {"-cd", "--ctx-size-draft"}, "N", + string_format("size of the prompt context for the draft model (default: %d, 0 = loaded from model)", params.speculative.n_ctx), + [](common_params & params, int value) { + params.speculative.n_ctx = value; + } + ).set_examples({LLAMA_EXAMPLE_SPECULATIVE, LLAMA_EXAMPLE_SERVER})); + add_opt(common_arg( + {"-ngld", "--gpu-layers-draft", "--n-gpu-layers-draft"}, "N", + "number of layers to store in VRAM for the draft model", + [](common_params & params, int value) { + params.speculative.n_gpu_layers = value; + if (!llama_supports_gpu_offload()) { + fprintf(stderr, "warning: not compiled with GPU offload support, --gpu-layers-draft option will be ignored\n"); + fprintf(stderr, "warning: see main README.md for information on enabling GPU BLAS support\n"); + } + } + ).set_examples({LLAMA_EXAMPLE_SPECULATIVE, LLAMA_EXAMPLE_SERVER})); + add_opt(common_arg( + {"-md", "--model-draft"}, "FNAME", + "draft model for speculative decoding (default: unused)", + [](common_params & params, const std::string & value) { + params.speculative.model = value; + } + ).set_examples({LLAMA_EXAMPLE_SPECULATIVE, LLAMA_EXAMPLE_SERVER})); + return ctx_arg; } diff --git a/common/common.cpp b/common/common.cpp index d314523db4c62..c398329d05bf5 100644 --- a/common/common.cpp +++ b/common/common.cpp @@ -536,12 +536,12 @@ std::string string_from(const struct llama_context * ctx, const struct llama_bat [](const unsigned char c) { return !std::isprint(c); }), detokenized.end()); - buf << "\n" << std::to_string(i) - << ":token '" << detokenized << "'" - << ":pos " << std::to_string(batch.pos[i]) - << ":n_seq_id " << std::to_string(batch.n_seq_id[i]) - << ":seq_id " << std::to_string(batch.seq_id[i][0]) - << ":logits " << std::to_string(batch.logits[i]); + buf << "\n" << std::to_string(i) + << ", token '" << detokenized << "'" + << ", pos " << std::to_string(batch.pos[i]) + << ", n_seq_id " << std::to_string(batch.n_seq_id[i]) + << ", seq_id " << std::to_string(batch.seq_id[i][0]) + << ", logits " << std::to_string(batch.logits[i]); } buf << " ]"; @@ -925,9 +925,9 @@ struct common_init_result common_init_from_params(common_params & params) { common_lora_adapters_apply(lctx, iparams.lora_adapters); } - if (params.sparams.ignore_eos && llama_token_eos(model) == LLAMA_TOKEN_NULL) { + if (params.sampling.ignore_eos && llama_token_eos(model) == LLAMA_TOKEN_NULL) { LOG_WRN("%s: warning: model does not have an EOS token, ignoring --ignore-eos\n", __func__); - params.sparams.ignore_eos = false; + params.sampling.ignore_eos = false; } if (params.warmup) { @@ -1490,6 +1490,66 @@ void common_batch_add( batch.n_tokens++; } +// +// Token utils +// + +size_t common_lcp(const llama_tokens & a, const llama_tokens & b) { + size_t i; + for (i = 0; i < a.size() && i < b.size() && a[i] == b[i]; i++) {} + + return i; +} + +size_t common_lcs(const llama_tokens & a, const llama_tokens & b) { + // check for empty sequences + if (a.empty() || b.empty()) { + return 0; + } + + // get the lengths of the input sequences + size_t a_len = a.size(); + size_t b_len = b.size(); + + // initialize the maximum length of the longest common subsequence (LCS) + size_t max_length = 0; + + // use two rows instead of a 2D matrix to optimize space + std::vector prev_row(b_len + 1, 0); + std::vector curr_row(b_len + 1, 0); + + // iterate through the elements of a + for (size_t i = 1; i <= a_len; i++) { + // iterate through the elements of b + for (size_t j = 1; j <= b_len; j++) { + // if elements at the current positions match + if (a[i - 1] == b[j - 1]) { + // if it's the first element of either sequences, set LCS length to 1 + if (i == 1 || j == 1) { + curr_row[j] = 1; + } else { + // increment LCS length by 1 compared to the previous element + curr_row[j] = prev_row[j - 1] + 1; + } + + // update max_length if necessary + if (curr_row[j] > max_length) { + max_length = curr_row[j]; + } + } else { + // reset LCS length if elements don't match + curr_row[j] = 0; + } + } + + // update the previous row for the next iteration + prev_row = curr_row; + } + + // return the maximum length of the LCS + return max_length; +} + // // Vocab utils // diff --git a/common/common.h b/common/common.h index 7977cc7a99a78..5c579b5abfe03 100644 --- a/common/common.h +++ b/common/common.h @@ -33,6 +33,8 @@ struct common_lora_adapter_container : common_lora_adapter_info { struct llama_lora_adapter * adapter; }; +using llama_tokens = std::vector; + // build info extern int LLAMA_BUILD_NUMBER; extern char const * LLAMA_COMMIT; @@ -101,8 +103,8 @@ enum dimre_method { DIMRE_METHOD_MEAN, }; -// sampler parameters -struct common_sampler_params { +// sampling parameters +struct common_params_sampling { uint32_t seed = LLAMA_DEFAULT_SEED; // the seed used to initialize llama_sampler int32_t n_prev = 64; // number of previous tokens to remember @@ -153,19 +155,30 @@ struct common_sampler_params { std::string print() const; }; +struct common_params_speculative { + int32_t n_ctx = 0; // draft context size + int32_t n_max = 16; // maximum number of tokens to draft during speculative decoding + int32_t n_min = 5; // minimum number of draft tokens to use for speculative decoding + int32_t n_gpu_layers = -1; // number of layers to store in VRAM for the draft model (-1 - use default) + float p_split = 0.1f; // speculative decoding split probability + float p_min = 0.9f; // minimum speculative decoding probability (greedy) + + struct cpu_params cpuparams; + struct cpu_params cpuparams_batch; + + std::string model = ""; // draft model for speculative decoding // NOLINT +}; + struct common_params { int32_t n_predict = -1; // new tokens to predict int32_t n_ctx = 4096; // context size int32_t n_batch = 2048; // logical batch size for prompt processing (must be >=32 to use BLAS) int32_t n_ubatch = 512; // physical batch size for prompt processing (must be >=32 to use BLAS) int32_t n_keep = 0; // number of tokens to keep from initial prompt - int32_t n_draft = 5; // number of tokens to draft during speculative decoding int32_t n_chunks = -1; // max number of chunks to process (-1 = unlimited) int32_t n_parallel = 1; // number of parallel sequences to decode int32_t n_sequences = 1; // number of sequences to decode - float p_split = 0.1f; // speculative decoding split probability int32_t n_gpu_layers = -1; // number of layers to store in VRAM (-1 - use default) - int32_t n_gpu_layers_draft = -1; // number of layers to store in VRAM for the draft model (-1 - use default) int32_t main_gpu = 0; // the GPU that is used for scratch and small tensors float tensor_split[128] = {0}; // how split tensors should be distributed across GPUs int32_t grp_attn_n = 1; // group-attention factor @@ -182,8 +195,6 @@ struct common_params { struct cpu_params cpuparams; struct cpu_params cpuparams_batch; - struct cpu_params draft_cpuparams; - struct cpu_params draft_cpuparams_batch; ggml_backend_sched_eval_callback cb_eval = nullptr; void * cb_eval_user_data = nullptr; @@ -195,10 +206,10 @@ struct common_params { enum llama_pooling_type pooling_type = LLAMA_POOLING_TYPE_UNSPECIFIED; // pooling type for embeddings enum llama_attention_type attention_type = LLAMA_ATTENTION_TYPE_UNSPECIFIED; // attention type for embeddings - struct common_sampler_params sparams; + struct common_params_sampling sampling; + struct common_params_speculative speculative; std::string model = ""; // model path // NOLINT - std::string model_draft = ""; // draft model for speculative decoding // NOLINT std::string model_alias = "unknown"; // model alias // NOLINT std::string model_url = ""; // model url to download // NOLINT std::string hf_token = ""; // HF token // NOLINT @@ -461,7 +472,9 @@ struct llama_model * common_load_model_from_hf(const char * repo, const char * f // clear LoRA adapters from context, then apply new list of adapters void common_lora_adapters_apply(struct llama_context * ctx, std::vector & lora_adapters); +// // Batch utils +// void common_batch_clear(struct llama_batch & batch); @@ -472,6 +485,16 @@ void common_batch_add( const std::vector & seq_ids, bool logits); +// +// Token utils +// + +// longest common prefix +size_t common_lcp(const llama_tokens & a, const llama_tokens & b); + +// longet common subsequence +size_t common_lcs(const llama_tokens & a, const llama_tokens & b); + // // Vocab utils // diff --git a/common/sampling.cpp b/common/sampling.cpp index 7922fde47d369..0c4699a89c8b2 100644 --- a/common/sampling.cpp +++ b/common/sampling.cpp @@ -99,7 +99,7 @@ struct ring_buffer { }; struct common_sampler { - common_sampler_params params; + common_params_sampling params; struct llama_sampler * grmr; struct llama_sampler * chain; @@ -125,7 +125,7 @@ struct common_sampler { } }; -std::string common_sampler_params::print() const { +std::string common_params_sampling::print() const { char result[1024]; snprintf(result, sizeof(result), @@ -141,7 +141,7 @@ std::string common_sampler_params::print() const { return std::string(result); } -struct common_sampler * common_sampler_init(const struct llama_model * model, const struct common_sampler_params & params) { +struct common_sampler * common_sampler_init(const struct llama_model * model, const struct common_params_sampling & params) { llama_sampler_chain_params lparams = llama_sampler_chain_default_params(); lparams.no_perf = params.no_perf; @@ -320,6 +320,45 @@ llama_token common_sampler_sample(struct common_sampler * gsmpl, struct llama_co return cur_p.data[cur_p.selected].id; } +std::vector common_sampler_sample_and_accept_n(struct common_sampler * gsmpl, struct llama_context * ctx, const std::vector & idxs, const llama_tokens & draft, bool grammar_first) { + GGML_ASSERT(idxs.size() == draft.size() + 1 && "idxs.size() must be draft.size() + 1"); + + std::vector result; + result.reserve(idxs.size()); + + size_t i = 0; + for (; i < draft.size(); i++) { + const llama_token id = common_sampler_sample(gsmpl, ctx, idxs[i], grammar_first); + + common_sampler_accept(gsmpl, id, true); + + result.push_back(id); + + if (draft[i] != id) { + break; + } + } + + if (i == draft.size()) { + const llama_token id = common_sampler_sample(gsmpl, ctx, idxs[i], grammar_first); + + common_sampler_accept(gsmpl, id, true); + + result.push_back(id); + } + + return result; +} + +std::vector common_sampler_sample_and_accept_n(struct common_sampler * gsmpl, struct llama_context * ctx, const llama_tokens & draft, bool grammar_first) { + std::vector idxs(draft.size() + 1); + for (size_t i = 0; i < idxs.size(); ++i) { + idxs[i] = i; + } + + return common_sampler_sample_and_accept_n(gsmpl, ctx, idxs, draft, grammar_first); +} + uint32_t common_sampler_get_seed(const struct common_sampler * gsmpl) { return llama_sampler_get_seed(gsmpl->chain); } diff --git a/common/sampling.h b/common/sampling.h index d37f25ad37c4a..348911b18888b 100644 --- a/common/sampling.h +++ b/common/sampling.h @@ -36,7 +36,7 @@ struct common_sampler; // llama_sampler API overloads -struct common_sampler * common_sampler_init(const struct llama_model * model, const struct common_sampler_params & params); +struct common_sampler * common_sampler_init(const struct llama_model * model, const struct common_params_sampling & params); void common_sampler_free(struct common_sampler * gsmpl); @@ -60,6 +60,27 @@ void common_perf_print(const struct llama_context * ctx, const struct common_sam // llama_token common_sampler_sample(struct common_sampler * gsmpl, struct llama_context * ctx, int idx, bool grammar_first = false); +// generalized version of common_sampler_sample +// +// will cross-reference the sampled tokens with a batch of draft tokens and accept those that match +// if the sampler disagrees at some point, we stop and return the accepted tokens up to now +// +// common_sampler_sample_n(gsmpl, ctx, { idx }, {}); +// +// is equivalent to +// +// common_sampler_sample(gsmpl, ctx, idx); +// common_sampler_accept(gsmpl, token, true); +// +// requires: idxs.size() == draft.size() + 1 +// +// returns at least 1 token, up to idxs.size() +// +std::vector common_sampler_sample_and_accept_n(struct common_sampler * gsmpl, struct llama_context * ctx, const std::vector & idxs, const llama_tokens & draft, bool grammar_first = false); + +// assume idxs == [ 0, 1, 2, ..., draft.size() ] +std::vector common_sampler_sample_and_accept_n(struct common_sampler * gsmpl, struct llama_context * ctx, const llama_tokens & draft, bool grammar_first = false); + uint32_t common_sampler_get_seed(const struct common_sampler * gsmpl); // helpers diff --git a/common/speculative.cpp b/common/speculative.cpp new file mode 100644 index 0000000000000..fe315a2703e9c --- /dev/null +++ b/common/speculative.cpp @@ -0,0 +1,269 @@ +#include "speculative.h" + +#include "log.h" +#include "common.h" +#include "sampling.h" + +#include + +#define SPEC_VOCAB_MAX_SIZE_DIFFERENCE 128 +#define SPEC_VOCAB_CHECK_START_TOKEN_ID 5 + +struct common_speculative { + struct llama_context * ctx; + struct common_sampler * smpl; + + llama_batch batch; + llama_tokens prompt; +}; + +struct common_speculative * common_speculative_init( + struct llama_context * ctx_dft) { + auto * result = new common_speculative { + /* .ctx = */ ctx_dft, + /* .smpl = */ nullptr, + /* .batch = */ llama_batch_init(llama_n_batch(ctx_dft), 0, 1), + /* .prompt = */ {}, + }; + + // TODO: optimize or pass from outside? +#if 0 + { + common_params_sampling params; + params.no_perf = false; + + params.top_k = 40; + params.top_p = 0.9; + + params.samplers = { + COMMON_SAMPLER_TYPE_TOP_K, + COMMON_SAMPLER_TYPE_TOP_P, + COMMON_SAMPLER_TYPE_INFILL, + }; + + result->smpl = common_sampler_init(llama_get_model(ctx_dft), params); + } +#else + { + common_params_sampling params; + params.no_perf = false; + + params.top_k = 10; + + params.samplers = { + COMMON_SAMPLER_TYPE_TOP_K, + }; + + result->smpl = common_sampler_init(llama_get_model(ctx_dft), params); + } +#endif + + return result; +} + +void common_speculative_free(struct common_speculative * spec) { + common_sampler_free(spec->smpl); + + llama_batch_free(spec->batch); + + delete spec; +} + +bool common_speculative_are_compatible( + const struct llama_context * ctx_tgt, + const struct llama_context * ctx_dft) { + const struct llama_model * model_tgt = llama_get_model(ctx_tgt); + const struct llama_model * model_dft = llama_get_model(ctx_dft); + + const bool vocab_type_tgt = llama_vocab_type(model_tgt); + LOG_DBG("%s: vocab_type tgt: %d\n", __func__, vocab_type_tgt); + + const bool vocab_type_dft = llama_vocab_type(model_dft); + LOG_DBG("%s: vocab_type dft: %d\n", __func__, vocab_type_dft); + + if (vocab_type_tgt != vocab_type_dft) { + LOG_ERR("%s: draft model vocab type must match target model to use speculation but " + "vocab_type_dft = %d while vocab_type_tgt = %d\n", __func__, vocab_type_dft, vocab_type_tgt); + return false; + } + + if (llama_add_bos_token(model_tgt) != llama_add_bos_token(model_dft) || + llama_add_eos_token(model_tgt) != llama_add_eos_token(model_dft) || + llama_token_bos(model_tgt) != llama_token_bos(model_dft) || + llama_token_eos(model_tgt) != llama_token_eos(model_dft) + ) { + LOG_ERR("%s: draft model special tokens must match target model to use speculation\n", __func__); + return false; + } + + { + const int n_vocab_tgt = llama_n_vocab(model_tgt); + const int n_vocab_dft = llama_n_vocab(model_dft); + + const int vocab_diff = std::abs(n_vocab_tgt - n_vocab_dft); + + if (vocab_diff > SPEC_VOCAB_MAX_SIZE_DIFFERENCE) { + LOG_ERR("%s: draft model vocab must closely match target model to use speculation but " + "target vocab size %d does not match draft vocab size %d - difference %d, max allowed %d\n", + __func__, n_vocab_tgt, llama_n_vocab(model_dft), vocab_diff, SPEC_VOCAB_MAX_SIZE_DIFFERENCE); + return false; + } + + for (int i = SPEC_VOCAB_CHECK_START_TOKEN_ID; i < std::min(n_vocab_tgt, n_vocab_dft); ++i) { + const char * token_text_tgt = llama_token_get_text(model_tgt, i); + const char * token_text_dft = llama_token_get_text(model_dft, i); + if (std::strcmp(token_text_tgt, token_text_dft) != 0) { + LOG_ERR("%s: draft model vocab must match target model to use speculation but " + "token %d content differs - target '%s', draft '%s'\n", __func__, i, + common_token_to_piece(ctx_tgt, i).c_str(), + common_token_to_piece(ctx_dft, i).c_str()); + return false; + } + } + } + + return true; +} + +llama_tokens common_speculative_gen_draft( + struct common_speculative * spec, + struct common_speculative_params params, + const llama_tokens & prompt_tgt, + llama_token id_last) { + auto & batch = spec->batch; + auto & ctx = spec->ctx; + auto & smpl = spec->smpl; + auto & prompt = spec->prompt; + + int reuse_i = 0; + int reuse_n = 0; + + const int n_ctx = llama_n_ctx(ctx) - params.n_draft; + + const int i_start = std::max(0, (int) prompt_tgt.size() - n_ctx); + + // reuse as much as possible from the old draft context + // ideally, the draft context should be as big as the target context and we will always reuse the entire prompt + for (int i = 0; i < (int) prompt.size(); ++i) { + int cur = 0; + while (i_start + cur < (int) prompt_tgt.size() && + i + cur < (int) prompt.size() && + prompt_tgt[i_start + cur] == prompt[i + cur]) { + cur++; + } + + if ((cur >= params.n_reuse || n_ctx >= (int) prompt_tgt.size()) && cur > reuse_n) { + reuse_i = i; + reuse_n = cur; + } + } + + LOG_DBG("%s: reuse_i = %d, reuse_n = %d, prompt = %d\n", __func__, reuse_i, reuse_n, (int) prompt.size()); + + llama_tokens result; + result.reserve(params.n_draft); + + if (reuse_n == 0) { + llama_kv_cache_clear(ctx); + + prompt.clear(); + } else { + // this happens when a previous draft has been discarded (for example, due to being too small), but the + // target model agreed with it. in this case, we simply pass back the previous results to save compute + if (reuse_i + reuse_n < (int) prompt.size() && prompt[reuse_i + reuse_n] == id_last) { + for (int i = reuse_i + reuse_n + 1; i < (int) prompt.size(); ++i) { + result.push_back(prompt[i]); + + if (params.n_draft <= (int) result.size()) { + break; + } + } + + return result; + } + + if (reuse_i > 0) { + llama_kv_cache_seq_rm (ctx, 0, 0, reuse_i); + llama_kv_cache_seq_add(ctx, 0, reuse_i, -1, -reuse_i); + + prompt.erase(prompt.begin(), prompt.begin() + reuse_i); + } + + if (reuse_n < (int) prompt.size()) { + llama_kv_cache_seq_rm (ctx, 0, reuse_n, -1); + + prompt.erase(prompt.begin() + reuse_n, prompt.end()); + } + } + + // prepare a batch to evaluate any new tokens in the prompt + common_batch_clear(batch); + + for (size_t i = i_start + reuse_n; i < prompt_tgt.size(); ++i) { + //LOG_DBG("i = %d, i_start = %d, reuse_n = %d, i - i_start = %d, id = %6d\n", i, i_start, reuse_n, i - i_start, prompt_tgt[i]); + common_batch_add(batch, prompt_tgt[i], i - i_start, { 0 }, false); + + prompt.push_back(prompt_tgt[i]); + } + + // we should rarely end-up here during normal decoding + if (batch.n_tokens > 0) { + //LOG_DBG("%s: draft prompt batch: %s\n", __func__, string_from(ctx, batch).c_str()); + + llama_decode(ctx, batch); + } + + const llama_pos n_past = prompt.size(); + + LOG_DBG("%s: n_past = %d\n", __func__, n_past); + + common_batch_clear(batch); + common_batch_add (batch, id_last, n_past, { 0 }, true); + + prompt.push_back(id_last); + + //LOG_DBG("%s: draft prompt: %s\n", __func__, string_from(ctx, prompt).c_str()); + + llama_decode(ctx, batch); + + common_sampler_reset(smpl); + + // sample n_draft tokens from the draft model + for (int i = 0; i < params.n_draft; ++i) { + common_batch_clear(batch); + + common_sampler_sample(smpl, ctx, 0, true); + + const auto * cur_p = common_sampler_get_candidates(smpl); + + for (int k = 0; k < std::min(3, (int) cur_p->size); ++k) { + LOG_DBG(" - draft candidate %3d, pos %3d: %6d (%8.3f) '%s'\n", + k, i, cur_p->data[k].id, cur_p->data[k].p, common_token_to_piece(ctx, cur_p->data[k].id).c_str()); + } + + // add drafted token for each sequence + const llama_token id = cur_p->data[0].id; + + // only collect very high-confidence draft tokens + if (cur_p->data[0].p < params.p_min) { + break; + } + + common_sampler_accept(smpl, id, true); + + result.push_back(id); + + if (params.n_draft <= (int) result.size()) { + break; + } + + common_batch_add(batch, id, n_past + i + 1, { 0 }, true); + + // evaluate the drafted tokens on the draft model + llama_decode(ctx, batch); + + prompt.push_back(id); + } + + return result; +} diff --git a/common/speculative.h b/common/speculative.h new file mode 100644 index 0000000000000..50ec0344618aa --- /dev/null +++ b/common/speculative.h @@ -0,0 +1,28 @@ +#pragma once + +#include "llama.h" +#include "common.h" + +struct common_speculative; + +struct common_speculative_params { + int n_draft = 16; // max drafted tokens + int n_reuse = 256; + + float p_min = 0.9f; // min probabiliy required to accept a token in the draft +}; + +struct common_speculative * common_speculative_init(struct llama_context * ctx_dft); + +void common_speculative_free(struct common_speculative * spec); + +bool common_speculative_are_compatible( + const struct llama_context * ctx_tgt, + const struct llama_context * ctx_dft); + +// sample up to n_draft tokens and add them to the batch using the draft model +llama_tokens common_speculative_gen_draft( + struct common_speculative * spec, + struct common_speculative_params params, + const llama_tokens & prompt, + llama_token id_last); diff --git a/examples/CMakeLists.txt b/examples/CMakeLists.txt index d63a96c1c2547..9bd099d4ef8a5 100644 --- a/examples/CMakeLists.txt +++ b/examples/CMakeLists.txt @@ -50,5 +50,6 @@ else() add_subdirectory(simple) add_subdirectory(simple-chat) add_subdirectory(speculative) + add_subdirectory(speculative-simple) add_subdirectory(tokenize) endif() diff --git a/examples/batched/batched.cpp b/examples/batched/batched.cpp index 3b554033e7ee4..ba219cd4b32ae 100644 --- a/examples/batched/batched.cpp +++ b/examples/batched/batched.cpp @@ -68,10 +68,10 @@ int main(int argc, char ** argv) { llama_sampler * smpl = llama_sampler_chain_init(sparams); - llama_sampler_chain_add(smpl, llama_sampler_init_top_k(params.sparams.top_k)); - llama_sampler_chain_add(smpl, llama_sampler_init_top_p(params.sparams.top_p, params.sparams.min_keep)); - llama_sampler_chain_add(smpl, llama_sampler_init_temp (params.sparams.temp)); - llama_sampler_chain_add(smpl, llama_sampler_init_dist (params.sparams.seed)); + llama_sampler_chain_add(smpl, llama_sampler_init_top_k(params.sampling.top_k)); + llama_sampler_chain_add(smpl, llama_sampler_init_top_p(params.sampling.top_p, params.sampling.min_keep)); + llama_sampler_chain_add(smpl, llama_sampler_init_temp (params.sampling.temp)); + llama_sampler_chain_add(smpl, llama_sampler_init_dist (params.sampling.seed)); if (ctx == NULL) { LOG_ERR("%s: error: failed to create the llama_context\n" , __func__); diff --git a/examples/infill/infill.cpp b/examples/infill/infill.cpp index 15b358dc4e854..ef700895720ff 100644 --- a/examples/infill/infill.cpp +++ b/examples/infill/infill.cpp @@ -73,7 +73,7 @@ int main(int argc, char ** argv) { common_init(); - auto & sparams = params.sparams; + auto & sparams = params.sampling; console::init(params.simple_io, params.use_color); atexit([]() { console::cleanup(); }); diff --git a/examples/llava/llava-cli.cpp b/examples/llava/llava-cli.cpp index 1610985858fc9..2691c6e6b2dd2 100644 --- a/examples/llava/llava-cli.cpp +++ b/examples/llava/llava-cli.cpp @@ -191,7 +191,7 @@ static void process_prompt(struct llava_context * ctx_llava, struct llava_image_ LOG("\n"); - struct common_sampler * smpl = common_sampler_init(ctx_llava->model, params->sparams); + struct common_sampler * smpl = common_sampler_init(ctx_llava->model, params->sampling); if (!smpl) { LOG_ERR("%s: failed to initialize sampling subsystem\n", __func__); exit(1); diff --git a/examples/llava/minicpmv-cli.cpp b/examples/llava/minicpmv-cli.cpp index cbecec343c640..e9cbb51ed90ab 100644 --- a/examples/llava/minicpmv-cli.cpp +++ b/examples/llava/minicpmv-cli.cpp @@ -237,7 +237,7 @@ static struct common_sampler * llama_init(struct llava_context * ctx_llava, comm LOG_INF("\n"); - struct common_sampler * smpl = common_sampler_init(ctx_llava->model, params->sparams); + struct common_sampler * smpl = common_sampler_init(ctx_llava->model, params->sampling); return smpl; } diff --git a/examples/lookahead/lookahead.cpp b/examples/lookahead/lookahead.cpp index 3c0ccfea2ccd7..8d0ef8b3d75e6 100644 --- a/examples/lookahead/lookahead.cpp +++ b/examples/lookahead/lookahead.cpp @@ -115,7 +115,7 @@ int main(int argc, char ** argv) { llama_batch batch = llama_batch_init(params.n_ctx, 0, W + G + 1); // target model sampling context - struct common_sampler * smpl = common_sampler_init(model, params.sparams); + struct common_sampler * smpl = common_sampler_init(model, params.sampling); // verification n-grams std::vector ngrams_cur(G); diff --git a/examples/lookup/lookup-stats.cpp b/examples/lookup/lookup-stats.cpp index 7faebe7ba11fc..dff07c075c47f 100644 --- a/examples/lookup/lookup-stats.cpp +++ b/examples/lookup/lookup-stats.cpp @@ -21,7 +21,7 @@ int main(int argc, char ** argv){ common_init(); - const int n_draft = params.n_draft; + const int n_draft = params.speculative.n_max; // init llama.cpp llama_backend_init(); @@ -40,6 +40,7 @@ int main(int argc, char ** argv){ common_ngram_cache ngram_cache_context; common_ngram_cache ngram_cache_dynamic; common_ngram_cache ngram_cache_static; + int64_t t_draft_flat_us = 0; int64_t t_draft_us = 0; diff --git a/examples/lookup/lookup.cpp b/examples/lookup/lookup.cpp index a04728b1834cc..4d92bb2385358 100644 --- a/examples/lookup/lookup.cpp +++ b/examples/lookup/lookup.cpp @@ -22,7 +22,7 @@ int main(int argc, char ** argv){ common_init(); // max. number of additional tokens to draft if match is found - const int n_draft = params.n_draft; + const int n_draft = params.speculative.n_max; const bool dump_kv_cache = params.dump_kv_cache; @@ -102,7 +102,7 @@ int main(int argc, char ** argv){ bool has_eos = false; - struct common_sampler * smpl = common_sampler_init(model, params.sparams); + struct common_sampler * smpl = common_sampler_init(model, params.sampling); std::vector draft; diff --git a/examples/main/main.cpp b/examples/main/main.cpp index 7c4ce4be2abae..957451af7ce0a 100644 --- a/examples/main/main.cpp +++ b/examples/main/main.cpp @@ -100,7 +100,7 @@ int main(int argc, char ** argv) { common_init(); - auto & sparams = params.sparams; + auto & sparams = params.sampling; // save choice to use color for later // (note for later: this is a slightly awkward choice) diff --git a/examples/parallel/parallel.cpp b/examples/parallel/parallel.cpp index 43c8f3ed56ba9..fd2b1c0112838 100644 --- a/examples/parallel/parallel.cpp +++ b/examples/parallel/parallel.cpp @@ -160,7 +160,7 @@ int main(int argc, char ** argv) { for (size_t i = 0; i < clients.size(); ++i) { auto & client = clients[i]; client.id = i; - client.smpl = common_sampler_init(model, params.sparams); + client.smpl = common_sampler_init(model, params.sampling); } std::vector tokens_system; diff --git a/examples/retrieval/retrieval.cpp b/examples/retrieval/retrieval.cpp index 1768aae510067..e78a8596d8cfe 100644 --- a/examples/retrieval/retrieval.cpp +++ b/examples/retrieval/retrieval.cpp @@ -282,8 +282,8 @@ int main(int argc, char ** argv) { return a.second > b.second; }); - LOG("Top %d similar chunks:\n", params.sparams.top_k); - for (int i = 0; i < std::min(params.sparams.top_k, (int) chunks.size()); i++) { + LOG("Top %d similar chunks:\n", params.sampling.top_k); + for (int i = 0; i < std::min(params.sampling.top_k, (int) chunks.size()); i++) { LOG("filename: %s\n", chunks[similarities[i].first].filename.c_str()); LOG("filepos: %lld\n", (long long int) chunks[similarities[i].first].filepos); LOG("similarity: %f\n", similarities[i].second); diff --git a/examples/save-load-state/save-load-state.cpp b/examples/save-load-state/save-load-state.cpp index 8c49a52a66124..2f0cf9baa32b7 100644 --- a/examples/save-load-state/save-load-state.cpp +++ b/examples/save-load-state/save-load-state.cpp @@ -9,7 +9,7 @@ int main(int argc, char ** argv) { common_params params; params.prompt = "The quick brown fox"; - params.sparams.seed = 1234; + params.sampling.seed = 1234; if (!common_params_parse(argc, argv, params, LLAMA_EXAMPLE_COMMON)) { return 1; @@ -42,7 +42,7 @@ int main(int argc, char ** argv) { llama_sampler * smpl = llama_sampler_chain_init(sparams); - llama_sampler_chain_add(smpl, llama_sampler_init_dist(params.sparams.seed)); + llama_sampler_chain_add(smpl, llama_sampler_init_dist(params.sampling.seed)); // tokenize prompt auto tokens = common_tokenize(ctx, params.prompt, true); @@ -106,7 +106,7 @@ int main(int argc, char ** argv) { llama_sampler * smpl2 = llama_sampler_chain_init(sparams); - llama_sampler_chain_add(smpl2, llama_sampler_init_dist(params.sparams.seed)); + llama_sampler_chain_add(smpl2, llama_sampler_init_dist(params.sampling.seed)); printf("\nsecond run: %s", params.prompt.c_str()); @@ -169,7 +169,7 @@ int main(int argc, char ** argv) { llama_sampler * smpl3 = llama_sampler_chain_init(sparams); - llama_sampler_chain_add(smpl3, llama_sampler_init_dist(params.sparams.seed)); + llama_sampler_chain_add(smpl3, llama_sampler_init_dist(params.sampling.seed)); printf("\nsingle seq run: %s", params.prompt.c_str()); diff --git a/examples/server/server.cpp b/examples/server/server.cpp index b8e003be9730e..6c55d65c01330 100644 --- a/examples/server/server.cpp +++ b/examples/server/server.cpp @@ -175,7 +175,7 @@ struct server_slot { // sampling json json_schema; - struct common_sampler_params sparams; + struct common_params_sampling sparams; struct common_sampler * smpl = nullptr; llama_token sampled; @@ -687,7 +687,7 @@ struct server_context { SLT_INF(slot, "new slot n_ctx_slot = %d\n", slot.n_ctx); - slot.sparams = params.sparams; + slot.sparams = params.sampling; slot.callback_on_release = [this](int) { queue_tasks.pop_deferred_task(); @@ -743,7 +743,7 @@ struct server_context { } // length of the Longest Common Subsequence between the current slot's prompt and the input prompt - int cur_lcs_len = longest_common_subsequence(slot.cache_tokens, task.prompt_tokens); + int cur_lcs_len = common_lcs(slot.cache_tokens, task.prompt_tokens); // fraction of the common subsequence length compared to the current slot's prompt length float cur_similarity = static_cast(cur_lcs_len) / static_cast(slot.cache_tokens.size()); @@ -788,7 +788,7 @@ struct server_context { bool launch_slot_with_task(server_slot & slot, const server_task & task) { slot_params default_params; // Sampling parameter defaults are loaded from the global server context (but individual requests can still override them) - auto default_sparams = params.sparams; + auto default_sparams = params.sampling; const auto & data = task.data; if (data.count("__oaicompat") != 0) { @@ -1960,7 +1960,7 @@ struct server_context { if (slot.params.cache_prompt) { // reuse any previously computed tokens that are common with the new prompt - slot.n_past = longest_common_prefix(slot.cache_tokens, prompt_tokens); + slot.n_past = common_lcp(slot.cache_tokens, prompt_tokens); // reuse chunks from the cached prompt by shifting their KV cache in the new position if (params.n_cache_reuse > 0) { diff --git a/examples/server/utils.hpp b/examples/server/utils.hpp index c47ed3e47a76d..1665e9dc37db6 100644 --- a/examples/server/utils.hpp +++ b/examples/server/utils.hpp @@ -24,7 +24,6 @@ #define DEFAULT_OAICOMPAT_MODEL "gpt-3.5-turbo-0613" using json = nlohmann::ordered_json; -using llama_tokens = std::vector; #define SLT_INF(slot, fmt, ...) LOG_INF("slot %12.*s: id %2d | task %d | " fmt, 12, __func__, (slot).id, (slot).id_task, __VA_ARGS__) #define SLT_WRN(slot, fmt, ...) LOG_WRN("slot %12.*s: id %2d | task %d | " fmt, 12, __func__, (slot).id, (slot).id_task, __VA_ARGS__) @@ -439,62 +438,6 @@ static std::string gen_chatcmplid() { // other common utils // -static size_t longest_common_prefix(const llama_tokens & a, const llama_tokens & b) { - size_t i; - for (i = 0; i < a.size() && i < b.size() && a[i] == b[i]; i++) {} - - return i; -} - -static size_t longest_common_subsequence(const llama_tokens & a, const llama_tokens & b) { - // check for empty sequences - if (a.empty() || b.empty()) { - return 0; - } - - // get the lengths of the input sequences - size_t a_len = a.size(); - size_t b_len = b.size(); - - // initialize the maximum length of the longest common subsequence (LCS) - size_t max_length = 0; - - // use two rows instead of a 2D matrix to optimize space - std::vector prev_row(b_len + 1, 0); - std::vector curr_row(b_len + 1, 0); - - // iterate through the elements of a - for (size_t i = 1; i <= a_len; i++) { - // iterate through the elements of b - for (size_t j = 1; j <= b_len; j++) { - // if elements at the current positions match - if (a[i - 1] == b[j - 1]) { - // if it's the first element of either sequences, set LCS length to 1 - if (i == 1 || j == 1) { - curr_row[j] = 1; - } else { - // increment LCS length by 1 compared to the previous element - curr_row[j] = prev_row[j - 1] + 1; - } - - // update max_length if necessary - if (curr_row[j] > max_length) { - max_length = curr_row[j]; - } - } else { - // reset LCS length if elements don't match - curr_row[j] = 0; - } - } - - // update the previous row for the next iteration - prev_row = curr_row; - } - - // return the maximum length of the LCS - return max_length; -} - static bool ends_with(const std::string & str, const std::string & suffix) { return str.size() >= suffix.size() && 0 == str.compare(str.size() - suffix.size(), suffix.size(), suffix); } diff --git a/examples/speculative-simple/CMakeLists.txt b/examples/speculative-simple/CMakeLists.txt new file mode 100644 index 0000000000000..7a3a141c27994 --- /dev/null +++ b/examples/speculative-simple/CMakeLists.txt @@ -0,0 +1,5 @@ +set(TARGET llama-speculative-simple) +add_executable(${TARGET} speculative-simple.cpp) +install(TARGETS ${TARGET} RUNTIME) +target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT}) +target_compile_features(${TARGET} PRIVATE cxx_std_11) diff --git a/examples/speculative-simple/README.md b/examples/speculative-simple/README.md new file mode 100644 index 0000000000000..e3a6c6b4aa0bf --- /dev/null +++ b/examples/speculative-simple/README.md @@ -0,0 +1,12 @@ +# llama.cpp/examples/speculative-simple + +Demonstration of basic greedy speculative decoding + +```bash +./bin/llama-speculative-simple \ + -m ../models/qwen2.5-32b-coder-instruct/ggml-model-q8_0.gguf \ + -md ../models/qwen2.5-1.5b-coder-instruct/ggml-model-q4_0.gguf \ + -f test.txt -c 0 -ngl 99 --color \ + --sampling-seq k --top-k 1 -fa --temp 0.0 \ + -ngld 99 --draft-max 16 --draft-min 5 --draft-p-min 0.9 +``` diff --git a/examples/speculative-simple/speculative-simple.cpp b/examples/speculative-simple/speculative-simple.cpp new file mode 100644 index 0000000000000..1bc7f428cf3b5 --- /dev/null +++ b/examples/speculative-simple/speculative-simple.cpp @@ -0,0 +1,273 @@ +#include "arg.h" +#include "common.h" +#include "sampling.h" +#include "speculative.h" +#include "log.h" +#include "llama.h" + +#include +#include +#include +#include + +int main(int argc, char ** argv) { + common_params params; + + if (!common_params_parse(argc, argv, params, LLAMA_EXAMPLE_SPECULATIVE)) { + return 1; + } + + if (params.n_predict < -1) { + LOG_ERR("%s: --n-predict must be >= -1\n", __func__); + return 1; + } + + common_init(); + + if (params.speculative.model.empty()) { + LOG_ERR("%s: --model-draft is required\n", __func__); + return 1; + } + + // init llama.cpp + llama_backend_init(); + llama_numa_init(params.numa); + + llama_model * model_tgt = NULL; + llama_model * model_dft = NULL; + + llama_context * ctx_tgt = NULL; + llama_context * ctx_dft = NULL; + + // load the target model + common_init_result llama_init_tgt = common_init_from_params(params); + + model_tgt = llama_init_tgt.model; + ctx_tgt = llama_init_tgt.context; + + // load the draft model + params.model = params.speculative.model; + params.n_ctx = params.speculative.n_ctx; + params.n_batch = params.speculative.n_ctx > 0 ? params.speculative.n_ctx : params.n_batch; + params.n_gpu_layers = params.speculative.n_gpu_layers; + + if (params.speculative.cpuparams.n_threads > 0) { + params.cpuparams.n_threads = params.speculative.cpuparams.n_threads; + } + + params.cpuparams_batch.n_threads = params.speculative.cpuparams_batch.n_threads; + common_init_result llama_init_dft = common_init_from_params(params); + + model_dft = llama_init_dft.model; + ctx_dft = llama_init_dft.context; + + if (!common_speculative_are_compatible(ctx_tgt, ctx_dft)) { + return 1; + } + + // Tokenize the prompt + std::vector inp; + inp = common_tokenize(ctx_tgt, params.prompt, true, true); + + if (llama_n_ctx(ctx_tgt) < (int) inp.size()) { + LOG_ERR("%s: the prompt exceeds the context size (%d tokens, ctx %d)\n", __func__, (int) inp.size(), llama_n_ctx(ctx_tgt)); + + return 1; + } + + if (llama_n_batch(ctx_tgt) < (int) inp.size()) { + LOG_ERR("%s: the prompt exceeds the batch size (%d tokens, batch %d)\n", __func__, (int) inp.size(), llama_n_batch(ctx_tgt)); + + return 1; + } + + LOG("\n\n"); + + for (auto id : inp) { + LOG("%s", common_token_to_piece(ctx_tgt, id).c_str()); + } + + // how many tokens to draft each time + int n_draft = params.speculative.n_max; + int n_draft_min = params.speculative.n_min; + + float p_min = params.speculative.p_min; + + int n_predict = 0; + int n_drafted = 0; + int n_accept = 0; + + // used to determine end of generation + bool has_eos = false; + + // ================================================ + // everything until here is standard initialization + // the relevant stuff for speculative decoding starts here + + const auto t_enc_start = ggml_time_us(); + + // target model sampling context + struct common_sampler * smpl = common_sampler_init(model_tgt, params.sampling); + + // eval the prompt + llama_decode(ctx_tgt, llama_batch_get_one(inp.data(), inp.size() - 1)); + + // note: keep the last token separate! + llama_token id_last = inp.back(); + + // all tokens currently in the target context + auto prompt_tgt = std::vector(inp.begin(), inp.end() - 1); + + int n_past = inp.size() - 1; + + // init the speculator + struct common_speculative_params params_spec; + params_spec.n_draft = n_draft; + params_spec.n_reuse = llama_n_ctx(ctx_dft) - n_draft; + params_spec.p_min = p_min; + + struct common_speculative * spec = common_speculative_init(ctx_dft); + + llama_batch batch_tgt = llama_batch_init(llama_n_batch(ctx_tgt), 0, 1); + + const auto t_enc_end = ggml_time_us(); + + const auto t_dec_start = ggml_time_us(); + + while (true) { + // optionally, generate draft tokens that can be appended to the target batch + // + // this is the most important part of the speculation. the more probable tokens that are provided here + // the better the performance will be. in theory, this computation can be performed asynchronously and even + // offloaded to a remote device. it doesn't even have to be based on an LLM. instead, it can provide tokens + // from a cache or lookup tables. + // + llama_tokens draft = common_speculative_gen_draft(spec, params_spec, prompt_tgt, id_last); + + //LOG_DBG("draft: %s\n", string_from(ctx_dft, draft).c_str()); + + // always have a token to evaluate from before - id_last + common_batch_clear(batch_tgt); + common_batch_add (batch_tgt, id_last, n_past++, { 0 }, true); + + // evaluate the target model on [id_last, draft0, draft1, ..., draftN-1] + { + // do not waste time on small drafts + if (draft.size() < n_draft_min) { + draft.clear(); + } + + for (size_t i = 0; i < draft.size(); ++i) { + common_batch_add(batch_tgt, draft[i], n_past + i, { 0 }, true); + } + + //LOG_DBG("target batch: %s\n", string_from(ctx_tgt, batch_tgt).c_str()); + + llama_decode(ctx_tgt, batch_tgt); + } + + // sample from the full target batch and return the accepted tokens based on the target sampler + // + // for each token to be accepted, the sampler would have to sample that same token + // in such cases, instead of decoding the sampled token as we normally do, we simply continue with the + // available logits from the batch and sample the next token until we run out of logits or the sampler + // disagrees with the draft + // + const auto ids = common_sampler_sample_and_accept_n(smpl, ctx_tgt, draft); + + //LOG_DBG("ids: %s\n", string_from(ctx_tgt, ids).c_str()); + + GGML_ASSERT(ids.size() > 0); // there will always be at least one accepted token + + n_past += ids.size() - 1; + n_drafted += batch_tgt.n_tokens - 1; + n_accept += ids.size() - 1; + + // process the accepted tokens and update contexts + // + // this is the standard token post-processing that we normally do + // in this case, we do it for a group of accepted tokens at once + // + { + llama_token id; + std::string token_str; + + for (size_t i = 0; i < ids.size(); ++i) { + id = ids[i]; + + ++n_predict; + + if (llama_token_is_eog(model_tgt, id)) { + has_eos = true; + break; + } + + token_str = common_token_to_piece(ctx_tgt, id); + + if (params.use_color && i + 1 < ids.size()) { + LOG("\u001b[%dm%s\u001b[37m", (36 - 0 % 6), token_str.c_str()); + } else { + LOG("%s", token_str.c_str()); + } + } + + if ((params.n_predict >= 0 && n_predict > params.n_predict) || has_eos) { + break; + } + + LOG_DBG("accepted %d/%d draft tokens, the last target token is: (%d, '%s')\n", (int) ids.size() - 1, (int) draft.size(), id, token_str.c_str()); + + { + LOG_DBG("clear kv cache from any extra tokens, n_past = %d\n", n_past); + + llama_kv_cache_seq_rm(ctx_tgt, 0, n_past, -1); + } + + prompt_tgt.push_back(id_last); + prompt_tgt.insert(prompt_tgt.end(), ids.begin(), ids.end() - 1); + + // remember the last accepted token for the next iteration + id_last = id; + } + } + + auto t_dec_end = ggml_time_us(); + + const int n_input = inp.size(); + + LOG("\n\n"); + + LOG_INF("encoded %4d tokens in %8.3f seconds, speed: %8.3f t/s\n", n_input, (t_enc_end - t_enc_start) / 1e6f, inp.size() / ((t_enc_end - t_enc_start) / 1e6f)); + LOG_INF("decoded %4d tokens in %8.3f seconds, speed: %8.3f t/s\n", n_predict, (t_dec_end - t_dec_start) / 1e6f, n_predict / ((t_dec_end - t_dec_start) / 1e6f)); + + LOG_INF("\n"); + LOG_INF("n_draft = %d\n", n_draft); + LOG_INF("n_predict = %d\n", n_predict); + LOG_INF("n_drafted = %d\n", n_drafted); + LOG_INF("n_accept = %d\n", n_accept); + LOG_INF("accept = %.3f%%\n", 100.0f * n_accept / n_drafted); + + LOG_INF("\n"); + LOG_INF("draft:\n\n"); + + llama_perf_context_print(ctx_dft); + + LOG_INF("\n"); + LOG_INF("target:\n\n"); + common_perf_print(ctx_tgt, smpl); + + common_sampler_free(smpl); + common_speculative_free(spec); + + llama_free(ctx_tgt); + llama_free_model(model_tgt); + + llama_free(ctx_dft); + llama_free_model(model_dft); + + llama_backend_free(); + + LOG("\n\n"); + + return 0; +} diff --git a/examples/speculative/speculative.cpp b/examples/speculative/speculative.cpp index f6fa4cb5d5354..20bb61de7b22a 100644 --- a/examples/speculative/speculative.cpp +++ b/examples/speculative/speculative.cpp @@ -12,7 +12,7 @@ #include #include -#define SPEC_VOCAB_MAX_SIZE_DIFFERENCE 100 +#define SPEC_VOCAB_MAX_SIZE_DIFFERENCE 128 #define SPEC_VOCAB_CHECK_START_TOKEN_ID 5 struct seq_draft { @@ -33,7 +33,7 @@ int main(int argc, char ** argv) { common_params params; // needed to get candidate probs even for temp <= 0.0 - params.sparams.n_probs = 128; + params.sampling.n_probs = 128; if (!common_params_parse(argc, argv, params, LLAMA_EXAMPLE_SPECULATIVE)) { return 1; @@ -41,7 +41,7 @@ int main(int argc, char ** argv) { common_init(); - if (params.model_draft.empty()) { + if (params.speculative.model.empty()) { LOG_ERR("%s: --model-draft is required\n", __func__); return 1; } @@ -50,9 +50,9 @@ int main(int argc, char ** argv) { const int n_seq_dft = params.n_parallel; // probability threshold for splitting a draft branch (only for n_seq_dft > 1) - const float p_split = params.p_split; + const float p_draft_split = params.speculative.p_split; - std::default_random_engine rng(params.sparams.seed == LLAMA_DEFAULT_SEED ? std::random_device()() : params.sparams.seed); + std::default_random_engine rng(params.sampling.seed == LLAMA_DEFAULT_SEED ? std::random_device()() : params.sampling.seed); std::uniform_real_distribution<> u_dist; // init llama.cpp @@ -71,13 +71,13 @@ int main(int argc, char ** argv) { ctx_tgt = llama_init_tgt.context; // load the draft model - params.model = params.model_draft; - params.n_gpu_layers = params.n_gpu_layers_draft; - if (params.draft_cpuparams.n_threads > 0) { - params.cpuparams.n_threads = params.draft_cpuparams.n_threads; + params.model = params.speculative.model; + params.n_gpu_layers = params.speculative.n_gpu_layers; + if (params.speculative.cpuparams.n_threads > 0) { + params.cpuparams.n_threads = params.speculative.cpuparams.n_threads; } - params.cpuparams_batch.n_threads = params.draft_cpuparams_batch.n_threads; + params.cpuparams_batch.n_threads = params.speculative.cpuparams_batch.n_threads; common_init_result llama_init_dft = common_init_from_params(params); model_dft = llama_init_dft.model; ctx_dft = llama_init_dft.context; @@ -165,7 +165,7 @@ int main(int argc, char ** argv) { //GGML_ASSERT(n_vocab == llama_n_vocab(model_dft)); // how many tokens to draft each time - int n_draft = params.n_draft; + int n_draft = params.speculative.n_max; int n_predict = 0; int n_drafted = 0; @@ -178,14 +178,14 @@ int main(int argc, char ** argv) { bool has_eos = false; // target model sampling context (reuse the llama_context's sampling instance) - struct common_sampler * smpl = common_sampler_init(model_tgt, params.sparams); + struct common_sampler * smpl = common_sampler_init(model_tgt, params.sampling); // draft sequence data std::vector drafts(n_seq_dft); for (int s = 0; s < n_seq_dft; ++s) { // allocate llama_sampler for each draft sequence - drafts[s].smpl = common_sampler_init(model_dft, params.sparams); + drafts[s].smpl = common_sampler_init(model_dft, params.sampling); } llama_batch batch_dft = llama_batch_init(params.n_ctx, 0, 1); @@ -225,7 +225,7 @@ int main(int argc, char ** argv) { // for stochastic sampling, attempt to match the token with the drafted tokens { bool accept = false; - if (params.sparams.temp > 0) { + if (params.sampling.temp > 0) { // stochastic verification common_sampler_sample(smpl, ctx_tgt, drafts[s_keep].i_batch_tgt[i_dft], true); @@ -489,7 +489,7 @@ int main(int argc, char ** argv) { // attempt to split the branch if the probability is high enough for (int f = 1; f < 8; ++f) { - if (n_seq_cur < n_seq_dft && cur_p->data[f].p > p_split) { + if (n_seq_cur < n_seq_dft && cur_p->data[f].p > p_draft_split) { LOG_DBG("splitting seq %3d into %3d\n", s, n_seq_cur); llama_kv_cache_seq_rm(ctx_dft, n_seq_cur, -1, -1); diff --git a/tests/test-arg-parser.cpp b/tests/test-arg-parser.cpp index 3665238b5a2d8..69604b87ceec4 100644 --- a/tests/test-arg-parser.cpp +++ b/tests/test-arg-parser.cpp @@ -70,7 +70,7 @@ int main(void) { // non-existence arg in specific example (--draft cannot be used outside llama-speculative) argv = {"binary_name", "--draft", "123"}; - assert(false == common_params_parse(argv.size(), list_str_to_char(argv).data(), params, LLAMA_EXAMPLE_SERVER)); + assert(false == common_params_parse(argv.size(), list_str_to_char(argv).data(), params, LLAMA_EXAMPLE_EMBEDDING)); printf("test-arg-parser: test valid usage\n\n"); @@ -96,7 +96,7 @@ int main(void) { // --draft cannot be used outside llama-speculative argv = {"binary_name", "--draft", "123"}; assert(true == common_params_parse(argv.size(), list_str_to_char(argv).data(), params, LLAMA_EXAMPLE_SPECULATIVE)); - assert(params.n_draft == 123); + assert(params.speculative.n_max == 123); // skip this part on windows, because setenv is not supported #ifdef _WIN32 From c9dd5ab17073cf1b993d4e8bcc3cc4056cdaaeef Mon Sep 17 00:00:00 2001 From: Neo Zhang Jianyu Date: Mon, 25 Nov 2024 17:31:10 +0800 Subject: [PATCH 039/245] [SYCL] Fix building Win package for oneAPI 2025.0 update (#10483) * fix build package for 2025.0 * debug * debug * fix * rm debug --------- Co-authored-by: arthw <14088817+arthw@users.noreply.github.com> --- .github/workflows/build.yml | 22 +++++++++++++++------- 1 file changed, 15 insertions(+), 7 deletions(-) diff --git a/.github/workflows/build.yml b/.github/workflows/build.yml index 572f91643fbba..abaf2c504c8f4 100644 --- a/.github/workflows/build.yml +++ b/.github/workflows/build.yml @@ -952,7 +952,7 @@ jobs: env: WINDOWS_BASEKIT_URL: https://registrationcenter-download.intel.com/akdlm/IRC_NAS/b380d914-366b-4b77-a74a-05e3c38b3514/intel-oneapi-base-toolkit-2025.0.0.882_offline.exe - WINDOWS_DPCPP_MKL: intel.oneapi.win.cpp-dpcpp-common:intel.oneapi.win.mkl.devel + WINDOWS_DPCPP_MKL: intel.oneapi.win.cpp-dpcpp-common:intel.oneapi.win.mkl.devel:intel.oneapi.win.dnnl:intel.oneapi.win.tbb.devel ONEAPI_ROOT: "C:/Program Files (x86)/Intel/oneAPI" steps: - name: Clone @@ -962,7 +962,8 @@ jobs: fetch-depth: 0 - name: Install - run: scripts/install-oneapi.bat $WINDOWS_BASEKIT_URL $WINDOWS_DPCPP_MKL + run: | + scripts/install-oneapi.bat $WINDOWS_BASEKIT_URL $WINDOWS_DPCPP_MKL - name: Build id: cmake_build @@ -981,27 +982,34 @@ jobs: echo "name=${SAFE_NAME}-b${BUILD_NUMBER}-${SHORT_HASH}" >> $GITHUB_OUTPUT fi - - name: Pack artifacts + - name: Build the release package id: pack_artifacts - if: ${{ ( github.event_name == 'push' && github.ref == 'refs/heads/master' ) || github.event.inputs.create_release == 'true' }} + if: ${{ ( github.event_name == 'pull_request' && github.base_ref == 'master' ) }} run: | echo "cp oneAPI running time dll files in ${{ env.ONEAPI_ROOT }} to ./build/bin" + cp "${{ env.ONEAPI_ROOT }}/mkl/latest/bin/mkl_sycl_blas.5.dll" ./build/bin cp "${{ env.ONEAPI_ROOT }}/mkl/latest/bin/mkl_core.2.dll" ./build/bin cp "${{ env.ONEAPI_ROOT }}/mkl/latest/bin/mkl_tbb_thread.2.dll" ./build/bin + cp "${{ env.ONEAPI_ROOT }}/compiler/latest/bin/ur_adapter_level_zero.dll" ./build/bin + cp "${{ env.ONEAPI_ROOT }}/compiler/latest/bin/ur_adapter_opencl.dll" ./build/bin cp "${{ env.ONEAPI_ROOT }}/compiler/latest/bin/ur_loader.dll" ./build/bin cp "${{ env.ONEAPI_ROOT }}/compiler/latest/bin/ur_win_proxy_loader.dll" ./build/bin - cp "${{ env.ONEAPI_ROOT }}/compiler/latest/bin/ur_adapter_level_zero.dll" ./build/bin + cp "${{ env.ONEAPI_ROOT }}/compiler/latest/bin/sycl8.dll" ./build/bin cp "${{ env.ONEAPI_ROOT }}/compiler/latest/bin/svml_dispmd.dll" ./build/bin cp "${{ env.ONEAPI_ROOT }}/compiler/latest/bin/libmmd.dll" ./build/bin cp "${{ env.ONEAPI_ROOT }}/compiler/latest/bin/libiomp5md.dll" ./build/bin + + cp "${{ env.ONEAPI_ROOT }}/dnnl/latest/bin/dnnl.dll" ./build/bin + cp "${{ env.ONEAPI_ROOT }}/tbb/latest/bin/tbb12.dll" ./build/bin + echo "cp oneAPI running time dll files to ./build/bin done" 7z a llama-${{ steps.tag.outputs.name }}-bin-win-sycl-x64.zip ./build/bin/* - - name: Upload artifacts - if: ${{ ( github.event_name == 'push' && github.ref == 'refs/heads/master' ) || github.event.inputs.create_release == 'true' }} + - name: Upload the release package + if: ${{ ( github.event_name == 'pull_request' && github.base_ref == 'master' ) }} uses: actions/upload-artifact@v4 with: path: llama-${{ steps.tag.outputs.name }}-bin-win-sycl-x64.zip From f966337645320faec3a0e97e6e9c7401b3aba05d Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Mon, 25 Nov 2024 15:08:04 +0200 Subject: [PATCH 040/245] metal : minor code formatting --- ggml/src/ggml-metal/ggml-metal.m | 598 +++++++++++++-------------- ggml/src/ggml-metal/ggml-metal.metal | 43 +- 2 files changed, 323 insertions(+), 318 deletions(-) diff --git a/ggml/src/ggml-metal/ggml-metal.m b/ggml/src/ggml-metal/ggml-metal.m index d1abb3cef0ec4..3a533d7f9c9af 100644 --- a/ggml/src/ggml-metal/ggml-metal.m +++ b/ggml/src/ggml-metal/ggml-metal.m @@ -1951,316 +1951,316 @@ static void ggml_metal_encode_node( } #endif - // for now the matrix-matrix multiplication kernel only works on A14+/M1+ SoCs - // AMD GPU and older A-chips will reuse matrix-vector multiplication kernel - if ([device supportsFamily:MTLGPUFamilyApple7] && - !ggml_is_transposed(src0) && - !ggml_is_transposed(src1) && - src1t == GGML_TYPE_F32 && - ne00 % 32 == 0 && ne00 >= 64 && - (ne11 > ne11_mm_min || (ggml_is_quantized(src0t) && ne12 > 1))) { - //printf("matrix: ne00 = %6d, ne01 = %6d, ne02 = %6d, ne11 = %6d, ne12 = %6d\n", ne00, ne01, ne02, ne11, ne12); - - // some Metal matrix data types require aligned pointers - // ref: https://developer.apple.com/metal/Metal-Shading-Language-Specification.pdf (Table 2.5) - switch (src0->type) { - case GGML_TYPE_F32: GGML_ASSERT(nb01 % 16 == 0); break; - case GGML_TYPE_F16: GGML_ASSERT(nb01 % 8 == 0); break; - case GGML_TYPE_BF16: GGML_ASSERT(nb01 % 8 == 0); break; - default: break; - } + // for now the matrix-matrix multiplication kernel only works on A14+/M1+ SoCs + // AMD GPU and older A-chips will reuse matrix-vector multiplication kernel + if ([device supportsFamily:MTLGPUFamilyApple7] && + !ggml_is_transposed(src0) && + !ggml_is_transposed(src1) && + src1t == GGML_TYPE_F32 && + ne00 % 32 == 0 && ne00 >= 64 && + (ne11 > ne11_mm_min || (ggml_is_quantized(src0t) && ne12 > 1))) { + //printf("matrix: ne00 = %6d, ne01 = %6d, ne02 = %6d, ne11 = %6d, ne12 = %6d\n", ne00, ne01, ne02, ne11, ne12); - id pipeline = nil; - - switch (src0->type) { - case GGML_TYPE_F32: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MM_F32_F32 ].pipeline; break; - case GGML_TYPE_F16: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MM_F16_F32 ].pipeline; break; - case GGML_TYPE_BF16: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MM_BF16_F32 ].pipeline; break; - case GGML_TYPE_Q4_0: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MM_Q4_0_F32 ].pipeline; break; - case GGML_TYPE_Q4_1: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MM_Q4_1_F32 ].pipeline; break; - case GGML_TYPE_Q5_0: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MM_Q5_0_F32 ].pipeline; break; - case GGML_TYPE_Q5_1: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MM_Q5_1_F32 ].pipeline; break; - case GGML_TYPE_Q8_0: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MM_Q8_0_F32 ].pipeline; break; - case GGML_TYPE_Q2_K: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MM_Q2_K_F32 ].pipeline; break; - case GGML_TYPE_Q3_K: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MM_Q3_K_F32 ].pipeline; break; - case GGML_TYPE_Q4_K: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MM_Q4_K_F32 ].pipeline; break; - case GGML_TYPE_Q5_K: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MM_Q5_K_F32 ].pipeline; break; - case GGML_TYPE_Q6_K: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MM_Q6_K_F32 ].pipeline; break; - case GGML_TYPE_IQ2_XXS: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MM_IQ2_XXS_F32].pipeline; break; - case GGML_TYPE_IQ2_XS: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MM_IQ2_XS_F32 ].pipeline; break; - case GGML_TYPE_IQ3_XXS: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MM_IQ3_XXS_F32].pipeline; break; - case GGML_TYPE_IQ3_S: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MM_IQ3_S_F32 ].pipeline; break; - case GGML_TYPE_IQ2_S: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MM_IQ2_S_F32 ].pipeline; break; - case GGML_TYPE_IQ1_S: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MM_IQ1_S_F32 ].pipeline; break; - case GGML_TYPE_IQ1_M: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MM_IQ1_M_F32 ].pipeline; break; - case GGML_TYPE_IQ4_NL: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MM_IQ4_NL_F32 ].pipeline; break; - case GGML_TYPE_IQ4_XS: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MM_IQ4_XS_F32 ].pipeline; break; - default: GGML_ABORT("MUL MAT-MAT not implemented"); - } + // some Metal matrix data types require aligned pointers + // ref: https://developer.apple.com/metal/Metal-Shading-Language-Specification.pdf (Table 2.5) + switch (src0->type) { + case GGML_TYPE_F32: GGML_ASSERT(nb01 % 16 == 0); break; + case GGML_TYPE_F16: GGML_ASSERT(nb01 % 8 == 0); break; + case GGML_TYPE_BF16: GGML_ASSERT(nb01 % 8 == 0); break; + default: break; + } - ggml_metal_kargs_mul_mm args = { - /*.ne00 =*/ ne00, - /*.ne02 =*/ ne02, - /*.nb01 =*/ nb01, - /*.nb02 =*/ nb02, - /*.nb03 =*/ nb03, - /*.ne12 =*/ ne12, - /*.nb10 =*/ nb10, - /*.nb11 =*/ nb11, - /*.nb12 =*/ nb12, - /*.nb13 =*/ nb13, - /*.ne0 =*/ ne0, - /*.ne1 =*/ ne1, - /*.r2 =*/ r2, - /*.r3 =*/ r3, - }; + id pipeline = nil; + + switch (src0->type) { + case GGML_TYPE_F32: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MM_F32_F32 ].pipeline; break; + case GGML_TYPE_F16: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MM_F16_F32 ].pipeline; break; + case GGML_TYPE_BF16: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MM_BF16_F32 ].pipeline; break; + case GGML_TYPE_Q4_0: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MM_Q4_0_F32 ].pipeline; break; + case GGML_TYPE_Q4_1: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MM_Q4_1_F32 ].pipeline; break; + case GGML_TYPE_Q5_0: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MM_Q5_0_F32 ].pipeline; break; + case GGML_TYPE_Q5_1: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MM_Q5_1_F32 ].pipeline; break; + case GGML_TYPE_Q8_0: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MM_Q8_0_F32 ].pipeline; break; + case GGML_TYPE_Q2_K: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MM_Q2_K_F32 ].pipeline; break; + case GGML_TYPE_Q3_K: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MM_Q3_K_F32 ].pipeline; break; + case GGML_TYPE_Q4_K: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MM_Q4_K_F32 ].pipeline; break; + case GGML_TYPE_Q5_K: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MM_Q5_K_F32 ].pipeline; break; + case GGML_TYPE_Q6_K: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MM_Q6_K_F32 ].pipeline; break; + case GGML_TYPE_IQ2_XXS: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MM_IQ2_XXS_F32].pipeline; break; + case GGML_TYPE_IQ2_XS: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MM_IQ2_XS_F32 ].pipeline; break; + case GGML_TYPE_IQ3_XXS: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MM_IQ3_XXS_F32].pipeline; break; + case GGML_TYPE_IQ3_S: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MM_IQ3_S_F32 ].pipeline; break; + case GGML_TYPE_IQ2_S: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MM_IQ2_S_F32 ].pipeline; break; + case GGML_TYPE_IQ1_S: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MM_IQ1_S_F32 ].pipeline; break; + case GGML_TYPE_IQ1_M: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MM_IQ1_M_F32 ].pipeline; break; + case GGML_TYPE_IQ4_NL: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MM_IQ4_NL_F32 ].pipeline; break; + case GGML_TYPE_IQ4_XS: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MM_IQ4_XS_F32 ].pipeline; break; + default: GGML_ABORT("MUL MAT-MAT not implemented"); + } + + ggml_metal_kargs_mul_mm args = { + /*.ne00 =*/ ne00, + /*.ne02 =*/ ne02, + /*.nb01 =*/ nb01, + /*.nb02 =*/ nb02, + /*.nb03 =*/ nb03, + /*.ne12 =*/ ne12, + /*.nb10 =*/ nb10, + /*.nb11 =*/ nb11, + /*.nb12 =*/ nb12, + /*.nb13 =*/ nb13, + /*.ne0 =*/ ne0, + /*.ne1 =*/ ne1, + /*.r2 =*/ r2, + /*.r3 =*/ r3, + }; - [encoder setComputePipelineState:pipeline]; - [encoder setBytes:&args length:sizeof(args) atIndex:0]; - [encoder setBuffer:id_src0 offset:offs_src0 atIndex:1]; - [encoder setBuffer:id_src1 offset:offs_src1 atIndex:2]; - [encoder setBuffer:id_dst offset:offs_dst atIndex:3]; + [encoder setComputePipelineState:pipeline]; + [encoder setBytes:&args length:sizeof(args) atIndex:0]; + [encoder setBuffer:id_src0 offset:offs_src0 atIndex:1]; + [encoder setBuffer:id_src1 offset:offs_src1 atIndex:2]; + [encoder setBuffer:id_dst offset:offs_dst atIndex:3]; - [encoder setThreadgroupMemoryLength:8192 atIndex:0]; - [encoder dispatchThreadgroups:MTLSizeMake( (ne11 + 31)/32, (ne01 + 63)/64, ne12*ne13) threadsPerThreadgroup:MTLSizeMake(128, 1, 1)]; - } else { - int nth0 = 32; - int nth1 = 1; - int nrows = 1; - //printf("vector: ne00 = %6d, ne01 = %6d, ne02 = %6d, ne11 = %6d, ne12 = %6d\n", ne00, ne01, ne02, ne11, ne12); + [encoder setThreadgroupMemoryLength:8192 atIndex:0]; + [encoder dispatchThreadgroups:MTLSizeMake( (ne11 + 31)/32, (ne01 + 63)/64, ne12*ne13) threadsPerThreadgroup:MTLSizeMake(128, 1, 1)]; + } else { + int nth0 = 32; + int nth1 = 1; + int nrows = 1; + //printf("vector: ne00 = %6d, ne01 = %6d, ne02 = %6d, ne11 = %6d, ne12 = %6d\n", ne00, ne01, ne02, ne11, ne12); - id pipeline = nil; + id pipeline = nil; - // use custom matrix x vector kernel - switch (src0t) { - case GGML_TYPE_F32: - { - GGML_ASSERT(src1t == GGML_TYPE_F32); - pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MV_F32_F32].pipeline; + // use custom matrix x vector kernel + switch (src0t) { + case GGML_TYPE_F32: + { + GGML_ASSERT(src1t == GGML_TYPE_F32); + pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MV_F32_F32].pipeline; + nrows = 4; + } break; + case GGML_TYPE_F16: + { + nth0 = 32; + nth1 = 1; + if (src1t == GGML_TYPE_F32) { + if (ne11 * ne12 < 4) { + pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MV_F16_F32_1ROW].pipeline; + } else if (ne00 >= 128 && ne01 >= 8 && ne00%4 == 0) { + pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MV_F16_F32_L4].pipeline; + nrows = ne11; + } else { + pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MV_F16_F32].pipeline; nrows = 4; - } break; - case GGML_TYPE_F16: - { - nth0 = 32; - nth1 = 1; - if (src1t == GGML_TYPE_F32) { - if (ne11 * ne12 < 4) { - pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MV_F16_F32_1ROW].pipeline; - } else if (ne00 >= 128 && ne01 >= 8 && ne00%4 == 0) { - pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MV_F16_F32_L4].pipeline; - nrows = ne11; - } else { - pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MV_F16_F32].pipeline; - nrows = 4; - } - } else { - pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MV_F16_F16].pipeline; - nrows = 4; - } - } break; - case GGML_TYPE_BF16: - { - nth0 = 32; - nth1 = 1; - if (src1t == GGML_TYPE_F32) { - if (ne11 * ne12 < 4) { - pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MV_BF16_F32_1ROW].pipeline; - } else if (ne00 >= 128 && ne01 >= 8 && ne00%4 == 0) { - pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MV_BF16_F32_L4].pipeline; - nrows = ne11; - } else { - pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MV_BF16_F32].pipeline; - nrows = 4; - } - } else { - pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MV_BF16_BF16].pipeline; - nrows = 4; - } - } break; - case GGML_TYPE_Q4_0: - { - nth0 = 8; - nth1 = 8; - pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MV_Q4_0_F32].pipeline; - } break; - case GGML_TYPE_Q4_1: - { - nth0 = 8; - nth1 = 8; - pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MV_Q4_1_F32].pipeline; - } break; - case GGML_TYPE_Q5_0: - { - nth0 = 8; - nth1 = 8; - pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MV_Q5_0_F32].pipeline; - } break; - case GGML_TYPE_Q5_1: - { - nth0 = 8; - nth1 = 8; - pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MV_Q5_1_F32].pipeline; - } break; - case GGML_TYPE_Q8_0: - { - nth0 = 8; - nth1 = 8; - pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MV_Q8_0_F32].pipeline; - } break; - case GGML_TYPE_Q2_K: - { - nth0 = 2; - nth1 = 32; - pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MV_Q2_K_F32].pipeline; - } break; - case GGML_TYPE_Q3_K: - { - nth0 = 2; - nth1 = 32; - pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MV_Q3_K_F32].pipeline; - } break; - case GGML_TYPE_Q4_K: - { - nth0 = 4; //1; - nth1 = 8; //32; - pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MV_Q4_K_F32].pipeline; - } break; - case GGML_TYPE_Q5_K: - { - nth0 = 2; - nth1 = 32; - pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MV_Q5_K_F32].pipeline; - } break; - case GGML_TYPE_Q6_K: - { - nth0 = 2; - nth1 = 32; - pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MV_Q6_K_F32].pipeline; - } break; - case GGML_TYPE_IQ2_XXS: - { - nth0 = 4; - nth1 = 16; - pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MV_IQ2_XXS_F32].pipeline; - } break; - case GGML_TYPE_IQ2_XS: - { - nth0 = 4; - nth1 = 16; - pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MV_IQ2_XS_F32].pipeline; - } break; - case GGML_TYPE_IQ3_XXS: - { - nth0 = 4; - nth1 = 16; - pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MV_IQ3_XXS_F32].pipeline; - } break; - case GGML_TYPE_IQ3_S: - { - nth0 = 4; - nth1 = 16; - pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MV_IQ3_S_F32].pipeline; - } break; - case GGML_TYPE_IQ2_S: - { - nth0 = 4; - nth1 = 16; - pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MV_IQ2_S_F32].pipeline; - } break; - case GGML_TYPE_IQ1_S: - { - nth0 = 4; - nth1 = 16; - pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MV_IQ1_S_F32].pipeline; - } break; - case GGML_TYPE_IQ1_M: - { - nth0 = 4; - nth1 = 16; - pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MV_IQ1_M_F32].pipeline; - } break; - case GGML_TYPE_IQ4_NL: - { - nth0 = 4; - nth1 = 16; - pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MV_IQ4_NL_F32].pipeline; - } break; - case GGML_TYPE_IQ4_XS: - { - nth0 = 4; - nth1 = 16; - pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MV_IQ4_XS_F32].pipeline; - } break; - default: - { - GGML_LOG_ERROR("Asserting on type %d\n", (int)src0t); - GGML_ABORT("not implemented"); } - }; + } else { + pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MV_F16_F16].pipeline; + nrows = 4; + } + } break; + case GGML_TYPE_BF16: + { + nth0 = 32; + nth1 = 1; + if (src1t == GGML_TYPE_F32) { + if (ne11 * ne12 < 4) { + pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MV_BF16_F32_1ROW].pipeline; + } else if (ne00 >= 128 && ne01 >= 8 && ne00%4 == 0) { + pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MV_BF16_F32_L4].pipeline; + nrows = ne11; + } else { + pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MV_BF16_F32].pipeline; + nrows = 4; + } + } else { + pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MV_BF16_BF16].pipeline; + nrows = 4; + } + } break; + case GGML_TYPE_Q4_0: + { + nth0 = 8; + nth1 = 8; + pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MV_Q4_0_F32].pipeline; + } break; + case GGML_TYPE_Q4_1: + { + nth0 = 8; + nth1 = 8; + pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MV_Q4_1_F32].pipeline; + } break; + case GGML_TYPE_Q5_0: + { + nth0 = 8; + nth1 = 8; + pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MV_Q5_0_F32].pipeline; + } break; + case GGML_TYPE_Q5_1: + { + nth0 = 8; + nth1 = 8; + pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MV_Q5_1_F32].pipeline; + } break; + case GGML_TYPE_Q8_0: + { + nth0 = 8; + nth1 = 8; + pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MV_Q8_0_F32].pipeline; + } break; + case GGML_TYPE_Q2_K: + { + nth0 = 2; + nth1 = 32; + pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MV_Q2_K_F32].pipeline; + } break; + case GGML_TYPE_Q3_K: + { + nth0 = 2; + nth1 = 32; + pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MV_Q3_K_F32].pipeline; + } break; + case GGML_TYPE_Q4_K: + { + nth0 = 4; //1; + nth1 = 8; //32; + pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MV_Q4_K_F32].pipeline; + } break; + case GGML_TYPE_Q5_K: + { + nth0 = 2; + nth1 = 32; + pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MV_Q5_K_F32].pipeline; + } break; + case GGML_TYPE_Q6_K: + { + nth0 = 2; + nth1 = 32; + pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MV_Q6_K_F32].pipeline; + } break; + case GGML_TYPE_IQ2_XXS: + { + nth0 = 4; + nth1 = 16; + pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MV_IQ2_XXS_F32].pipeline; + } break; + case GGML_TYPE_IQ2_XS: + { + nth0 = 4; + nth1 = 16; + pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MV_IQ2_XS_F32].pipeline; + } break; + case GGML_TYPE_IQ3_XXS: + { + nth0 = 4; + nth1 = 16; + pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MV_IQ3_XXS_F32].pipeline; + } break; + case GGML_TYPE_IQ3_S: + { + nth0 = 4; + nth1 = 16; + pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MV_IQ3_S_F32].pipeline; + } break; + case GGML_TYPE_IQ2_S: + { + nth0 = 4; + nth1 = 16; + pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MV_IQ2_S_F32].pipeline; + } break; + case GGML_TYPE_IQ1_S: + { + nth0 = 4; + nth1 = 16; + pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MV_IQ1_S_F32].pipeline; + } break; + case GGML_TYPE_IQ1_M: + { + nth0 = 4; + nth1 = 16; + pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MV_IQ1_M_F32].pipeline; + } break; + case GGML_TYPE_IQ4_NL: + { + nth0 = 4; + nth1 = 16; + pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MV_IQ4_NL_F32].pipeline; + } break; + case GGML_TYPE_IQ4_XS: + { + nth0 = 4; + nth1 = 16; + pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MV_IQ4_XS_F32].pipeline; + } break; + default: + { + GGML_LOG_ERROR("Asserting on type %d\n", (int)src0t); + GGML_ABORT("not implemented"); + } + }; - ggml_metal_kargs_mul_mv args = { - /*.ne00 =*/ ne00, - /*.ne01 =*/ ne01, - /*.ne02 =*/ ne02, - /*.nb00 =*/ nb00, - /*.nb01 =*/ nb01, - /*.nb02 =*/ nb02, - /*.nb03 =*/ nb03, - /*.ne10 =*/ ne10, - /*.ne11 =*/ ne11, - /*.ne12 =*/ ne12, - /*.nb10 =*/ nb10, - /*.nb11 =*/ nb11, - /*.nb12 =*/ nb12, - /*.nb13 =*/ nb13, - /*.ne0 =*/ ne0, - /*.ne1 =*/ ne1, - /*.r2 =*/ r2, - /*.r3 =*/ r3, - }; + ggml_metal_kargs_mul_mv args = { + /*.ne00 =*/ ne00, + /*.ne01 =*/ ne01, + /*.ne02 =*/ ne02, + /*.nb00 =*/ nb00, + /*.nb01 =*/ nb01, + /*.nb02 =*/ nb02, + /*.nb03 =*/ nb03, + /*.ne10 =*/ ne10, + /*.ne11 =*/ ne11, + /*.ne12 =*/ ne12, + /*.nb10 =*/ nb10, + /*.nb11 =*/ nb11, + /*.nb12 =*/ nb12, + /*.nb13 =*/ nb13, + /*.ne0 =*/ ne0, + /*.ne1 =*/ ne1, + /*.r2 =*/ r2, + /*.r3 =*/ r3, + }; - [encoder setComputePipelineState:pipeline]; - [encoder setBytes:&args length:sizeof(args) atIndex:0]; - [encoder setBuffer:id_src0 offset:offs_src0 atIndex:1]; - [encoder setBuffer:id_src1 offset:offs_src1 atIndex:2]; - [encoder setBuffer:id_dst offset:offs_dst atIndex:3]; + [encoder setComputePipelineState:pipeline]; + [encoder setBytes:&args length:sizeof(args) atIndex:0]; + [encoder setBuffer:id_src0 offset:offs_src0 atIndex:1]; + [encoder setBuffer:id_src1 offset:offs_src1 atIndex:2]; + [encoder setBuffer:id_dst offset:offs_dst atIndex:3]; - if (src0t == GGML_TYPE_Q4_0 || src0t == GGML_TYPE_Q4_1 || src0t == GGML_TYPE_Q5_0 || - src0t == GGML_TYPE_Q5_1 || src0t == GGML_TYPE_Q8_0 || src0t == GGML_TYPE_Q2_K || - src0t == GGML_TYPE_IQ1_S || src0t == GGML_TYPE_IQ1_M || src0t == GGML_TYPE_IQ2_S) { - [encoder dispatchThreadgroups:MTLSizeMake((ne01 + 7)/8, ne11, ne12*ne13) threadsPerThreadgroup:MTLSizeMake(nth0, nth1, 1)]; - } - else if (src0t == GGML_TYPE_IQ2_XXS || src0t == GGML_TYPE_IQ2_XS) { - const int mem_size = src0t == GGML_TYPE_IQ2_XXS ? 256*8+128 : 512*8+128; - [encoder setThreadgroupMemoryLength:mem_size atIndex:0]; - [encoder dispatchThreadgroups:MTLSizeMake((ne01 + 7)/8, ne11, ne12*ne13) threadsPerThreadgroup:MTLSizeMake(nth0, nth1, 1)]; - } - else if (src0t == GGML_TYPE_IQ3_XXS || src0t == GGML_TYPE_IQ3_S) { - const int mem_size = src0t == GGML_TYPE_IQ3_XXS ? 256*4+128 : 512*4; - [encoder setThreadgroupMemoryLength:mem_size atIndex:0]; - [encoder dispatchThreadgroups:MTLSizeMake((ne01 + 7)/8, ne11, ne12*ne13) threadsPerThreadgroup:MTLSizeMake(nth0, nth1, 1)]; - } - else if (src0t == GGML_TYPE_IQ4_NL || src0t == GGML_TYPE_IQ4_XS) { - const int mem_size = 32*sizeof(float); - [encoder setThreadgroupMemoryLength:mem_size atIndex:0]; - [encoder dispatchThreadgroups:MTLSizeMake((ne01 + 3)/4, ne11, ne12*ne13) threadsPerThreadgroup:MTLSizeMake(nth0, nth1, 1)]; - } - else if (src0t == GGML_TYPE_Q4_K) { - [encoder dispatchThreadgroups:MTLSizeMake((ne01 + 3)/4, ne11, ne12*ne13) threadsPerThreadgroup:MTLSizeMake(nth0, nth1, 1)]; - } - else if (src0t == GGML_TYPE_Q3_K) { - [encoder dispatchThreadgroups:MTLSizeMake((ne01 + 3)/4, ne11, ne12*ne13) threadsPerThreadgroup:MTLSizeMake(nth0, nth1, 1)]; - } - else if (src0t == GGML_TYPE_Q5_K) { - [encoder dispatchThreadgroups:MTLSizeMake((ne01 + 3)/4, ne11, ne12*ne13) threadsPerThreadgroup:MTLSizeMake(nth0, nth1, 1)]; - } - else if (src0t == GGML_TYPE_Q6_K) { - [encoder dispatchThreadgroups:MTLSizeMake((ne01 + 1)/2, ne11, ne12*ne13) threadsPerThreadgroup:MTLSizeMake(nth0, nth1, 1)]; - } else { - const int64_t ny = (ne11 + nrows - 1)/nrows; - [encoder dispatchThreadgroups:MTLSizeMake(ne01, ny, ne12*ne13) threadsPerThreadgroup:MTLSizeMake(nth0, nth1, 1)]; - } - } + if (src0t == GGML_TYPE_Q4_0 || src0t == GGML_TYPE_Q4_1 || src0t == GGML_TYPE_Q5_0 || + src0t == GGML_TYPE_Q5_1 || src0t == GGML_TYPE_Q8_0 || src0t == GGML_TYPE_Q2_K || + src0t == GGML_TYPE_IQ1_S || src0t == GGML_TYPE_IQ1_M || src0t == GGML_TYPE_IQ2_S) { + [encoder dispatchThreadgroups:MTLSizeMake((ne01 + 7)/8, ne11, ne12*ne13) threadsPerThreadgroup:MTLSizeMake(nth0, nth1, 1)]; + } + else if (src0t == GGML_TYPE_IQ2_XXS || src0t == GGML_TYPE_IQ2_XS) { + const int mem_size = src0t == GGML_TYPE_IQ2_XXS ? 256*8+128 : 512*8+128; + [encoder setThreadgroupMemoryLength:mem_size atIndex:0]; + [encoder dispatchThreadgroups:MTLSizeMake((ne01 + 7)/8, ne11, ne12*ne13) threadsPerThreadgroup:MTLSizeMake(nth0, nth1, 1)]; + } + else if (src0t == GGML_TYPE_IQ3_XXS || src0t == GGML_TYPE_IQ3_S) { + const int mem_size = src0t == GGML_TYPE_IQ3_XXS ? 256*4+128 : 512*4; + [encoder setThreadgroupMemoryLength:mem_size atIndex:0]; + [encoder dispatchThreadgroups:MTLSizeMake((ne01 + 7)/8, ne11, ne12*ne13) threadsPerThreadgroup:MTLSizeMake(nth0, nth1, 1)]; + } + else if (src0t == GGML_TYPE_IQ4_NL || src0t == GGML_TYPE_IQ4_XS) { + const int mem_size = 32*sizeof(float); + [encoder setThreadgroupMemoryLength:mem_size atIndex:0]; + [encoder dispatchThreadgroups:MTLSizeMake((ne01 + 3)/4, ne11, ne12*ne13) threadsPerThreadgroup:MTLSizeMake(nth0, nth1, 1)]; + } + else if (src0t == GGML_TYPE_Q4_K) { + [encoder dispatchThreadgroups:MTLSizeMake((ne01 + 3)/4, ne11, ne12*ne13) threadsPerThreadgroup:MTLSizeMake(nth0, nth1, 1)]; + } + else if (src0t == GGML_TYPE_Q3_K) { + [encoder dispatchThreadgroups:MTLSizeMake((ne01 + 3)/4, ne11, ne12*ne13) threadsPerThreadgroup:MTLSizeMake(nth0, nth1, 1)]; + } + else if (src0t == GGML_TYPE_Q5_K) { + [encoder dispatchThreadgroups:MTLSizeMake((ne01 + 3)/4, ne11, ne12*ne13) threadsPerThreadgroup:MTLSizeMake(nth0, nth1, 1)]; + } + else if (src0t == GGML_TYPE_Q6_K) { + [encoder dispatchThreadgroups:MTLSizeMake((ne01 + 1)/2, ne11, ne12*ne13) threadsPerThreadgroup:MTLSizeMake(nth0, nth1, 1)]; + } else { + const int64_t ny = (ne11 + nrows - 1)/nrows; + [encoder dispatchThreadgroups:MTLSizeMake(ne01, ny, ne12*ne13) threadsPerThreadgroup:MTLSizeMake(nth0, nth1, 1)]; + } + } } break; case GGML_OP_MUL_MAT_ID: { diff --git a/ggml/src/ggml-metal/ggml-metal.metal b/ggml/src/ggml-metal/ggml-metal.metal index 971f5054bce2a..eaca38864bd65 100644 --- a/ggml/src/ggml-metal/ggml-metal.metal +++ b/ggml/src/ggml-metal/ggml-metal.metal @@ -5447,12 +5447,12 @@ kernel void kernel_mul_mm( const int im = tgpig.z; // if this block is of 64x32 shape or smaller - short n_rows = (args.ne0 - r0*BLOCK_SIZE_M < BLOCK_SIZE_M) ? (args.ne0 - r0*BLOCK_SIZE_M) : BLOCK_SIZE_M; - short n_cols = (args.ne1 - r1*BLOCK_SIZE_N < BLOCK_SIZE_N) ? (args.ne1 - r1*BLOCK_SIZE_N) : BLOCK_SIZE_N; + const short n_rows = (args.ne0 - r0*BLOCK_SIZE_M < BLOCK_SIZE_M) ? (args.ne0 - r0*BLOCK_SIZE_M) : BLOCK_SIZE_M; + const short n_cols = (args.ne1 - r1*BLOCK_SIZE_N < BLOCK_SIZE_N) ? (args.ne1 - r1*BLOCK_SIZE_N) : BLOCK_SIZE_N; // a thread shouldn't load data outside of the matrix - short thread_row = ((short)tiitg/THREAD_PER_ROW) < n_rows ? ((short)tiitg/THREAD_PER_ROW) : n_rows - 1; - short thread_col = ((short)tiitg/THREAD_PER_COL) < n_cols ? ((short)tiitg/THREAD_PER_COL) : n_cols - 1; + const short thread_row = ((short)tiitg/THREAD_PER_ROW) < n_rows ? ((short)tiitg/THREAD_PER_ROW) : n_rows - 1; + const short thread_col = ((short)tiitg/THREAD_PER_COL) < n_cols ? ((short)tiitg/THREAD_PER_COL) : n_cols - 1; simdgroup_T8x8 ma[4]; simdgroup_float8x8 mb[2]; @@ -5467,20 +5467,23 @@ kernel void kernel_mul_mm( const int i12 = im%args.ne12; const int i13 = im/args.ne12; - uint64_t offset0 = (i12/args.r2)*args.nb02 + (i13/args.r3)*args.nb03; - short offset1 = il/nl; + const uint64_t offset0 = (i12/args.r2)*args.nb02 + (i13/args.r3)*args.nb03; + const short offset1 = il/nl; + + device const block_q * x = (device const block_q *)(src0 + + args.nb01*(r0*BLOCK_SIZE_M + thread_row) + offset0) + offset1; - device const block_q * x = (device const block_q *)(src0 + (r0*BLOCK_SIZE_M + thread_row)*args.nb01 + offset0) + offset1; device const float * y = (device const float *)(src1 + args.nb13*i13 + args.nb12*i12 - + args.nb11*(r1 * BLOCK_SIZE_N + thread_col) + + args.nb11*(r1*BLOCK_SIZE_N + thread_col) + args.nb10*(BLOCK_SIZE_K / THREAD_PER_COL * (tiitg % THREAD_PER_COL))); for (int loop_k = 0; loop_k < args.ne00; loop_k += BLOCK_SIZE_K) { // load data and store to threadgroup memory T4x4 temp_a; dequantize_func(x, il, temp_a); + threadgroup_barrier(mem_flags::mem_threadgroup); #pragma unroll(16) @@ -5490,44 +5493,46 @@ kernel void kernel_mul_mm( + (tiitg/THREAD_PER_ROW)%8 + (i&7)*8) = temp_a[i/4][i%4]; } - *(threadgroup float2x4 *)(sb + (tiitg % THREAD_PER_COL)*8*32 + 8*(tiitg/THREAD_PER_COL)) = *((device float2x4 *) y); + *(threadgroup float2x4 *)(sb + 32*8*(tiitg%THREAD_PER_COL) + 8*(tiitg/THREAD_PER_COL)) = *((device float2x4 *) y); il = (il + 2 < nl) ? il + 2 : il % 2; - x = (il < 2) ? x + (2+nl-1)/nl : x; + x = (il < 2) ? x + (2 + nl - 1)/nl : x; y += BLOCK_SIZE_K; threadgroup_barrier(mem_flags::mem_threadgroup); // load matrices from threadgroup memory and conduct outer products - threadgroup T * lsma = (sa + THREAD_MAT_M*SG_MAT_SIZE*(sgitg%2)); - threadgroup float * lsmb = (sb + THREAD_MAT_N*SG_MAT_SIZE*(sgitg/2)); + threadgroup const T * lsma = (sa + THREAD_MAT_M*SG_MAT_SIZE*(sgitg%2)); + threadgroup const float * lsmb = (sb + THREAD_MAT_N*SG_MAT_SIZE*(sgitg/2)); #pragma unroll(4) - for (short ik = 0; ik < BLOCK_SIZE_K / 8; ik++) { + for (short ik = 0; ik < BLOCK_SIZE_K/8; ik++) { #pragma unroll(4) for (short i = 0; i < 4; i++) { simdgroup_load(ma[i], lsma + SG_MAT_SIZE * i); } + simdgroup_barrier(mem_flags::mem_none); + #pragma unroll(2) for (short i = 0; i < 2; i++) { simdgroup_load(mb[i], lsmb + SG_MAT_SIZE * i); } - lsma += BLOCK_SIZE_M/SG_MAT_ROW * SG_MAT_SIZE; - lsmb += BLOCK_SIZE_N/SG_MAT_ROW * SG_MAT_SIZE; - #pragma unroll(8) for (short i = 0; i < 8; i++){ simdgroup_multiply_accumulate(mc[i], mb[i/4], ma[i%4], mc[i]); } + + lsma += (BLOCK_SIZE_M/SG_MAT_ROW)*SG_MAT_SIZE; + lsmb += (BLOCK_SIZE_N/SG_MAT_ROW)*SG_MAT_SIZE; } } if ((r0 + 1) * BLOCK_SIZE_M <= args.ne0 && (r1 + 1) * BLOCK_SIZE_N <= args.ne1) { device float * C = (device float *) dst + - (BLOCK_SIZE_M * r0 + 32 * (sgitg & 1)) + \ - (BLOCK_SIZE_N * r1 + 16 * (sgitg >> 1)) * args.ne0 + im*args.ne1*args.ne0; + (BLOCK_SIZE_M * r0 + 32*(sgitg & 1)) + \ + (BLOCK_SIZE_N * r1 + 16*(sgitg >> 1)) * args.ne0 + im*args.ne1*args.ne0; for (short i = 0; i < 8; i++) { simdgroup_store(mc[i], C + 8 * (i%4) + 8 * args.ne0 * (i/4), args.ne0); @@ -5536,7 +5541,7 @@ kernel void kernel_mul_mm( // block is smaller than 64x32, we should avoid writing data outside of the matrix threadgroup_barrier(mem_flags::mem_threadgroup); threadgroup float * temp_str = ((threadgroup float *) shmem) \ - + 32 * (sgitg&1) + (16 * (sgitg>>1))*BLOCK_SIZE_M; + + 32*(sgitg&1) + (16*(sgitg >> 1))*BLOCK_SIZE_M; for (short i = 0; i < 8; i++) { simdgroup_store(mc[i], temp_str + 8*(i%4) + 8*BLOCK_SIZE_M*(i/4), BLOCK_SIZE_M); } From f8abca200219a0fc7f9ccec8af5be77a15a4ad35 Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Mon, 25 Nov 2024 15:17:32 +0200 Subject: [PATCH 041/245] tests : fix compile warning --- tests/test-quantize-fns.cpp | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/tests/test-quantize-fns.cpp b/tests/test-quantize-fns.cpp index 8d0bf0470fe4e..c77c8ed1388d7 100644 --- a/tests/test-quantize-fns.cpp +++ b/tests/test-quantize-fns.cpp @@ -79,9 +79,9 @@ static float dot_product(const float * a1, const float * a2, size_t test_size) { } // Total dot product error -static float dot_product_error( - const ggml_type_traits * qfns, const ggml_type_traits_cpu * qfns_cpu, size_t test_size, const float * test_data1, const float *test_data2 -) { +static float dot_product_error(const ggml_type_traits * qfns, const ggml_type_traits_cpu * qfns_cpu, size_t test_size, const float * test_data1, const float * test_data2) { + GGML_UNUSED(qfns); + std::vector tmp_q1(2*test_size); std::vector tmp_q2(2*test_size); From fa4365c6f1697084ecbe87778177e46fbc8d9559 Mon Sep 17 00:00:00 2001 From: Diego Devesa Date: Mon, 25 Nov 2024 15:13:39 +0100 Subject: [PATCH 042/245] ggml : add support for dynamic loading of backends (#10469) * ggml : add support for dynamic loading of backends --------- Co-authored-by: Georgi Gerganov --- Makefile | 3 +- Package.swift | 3 +- common/common.cpp | 3 + examples/CMakeLists.txt | 27 +-- examples/eval-callback/CMakeLists.txt | 3 +- examples/llama-bench/llama-bench.cpp | 15 +- examples/main/main.cpp | 12 +- examples/simple-chat/simple-chat.cpp | 3 + examples/simple/simple.cpp | 4 + ggml/CMakeLists.txt | 1 + ggml/include/ggml-backend.h | 15 ++ ggml/include/ggml-cpu.h | 38 +--- ggml/include/ggml.h | 31 +++ ggml/src/CMakeLists.txt | 41 +++- ggml/src/ggml-amx/CMakeLists.txt | 10 +- ggml/src/ggml-amx/ggml-amx.cpp | 7 +- ggml/src/ggml-backend-impl.h | 44 +++-- ggml/src/ggml-backend-reg.cpp | 252 +++++++++++++++++++++++-- ggml/src/ggml-blas/CMakeLists.txt | 9 +- ggml/src/ggml-blas/ggml-blas.cpp | 7 +- ggml/src/ggml-cann/CMakeLists.txt | 6 +- ggml/src/ggml-cann/ggml-cann.cpp | 13 +- ggml/src/ggml-cpu/CMakeLists.txt | 19 +- ggml/src/ggml-cpu/ggml-cpu.c | 23 --- ggml/src/ggml-cpu/ggml-cpu.cpp | 50 +++-- ggml/src/ggml-cuda/CMakeLists.txt | 11 +- ggml/src/ggml-cuda/ggml-cuda.cu | 71 ++++++- ggml/src/ggml-hip/CMakeLists.txt | 10 +- ggml/src/ggml-kompute/CMakeLists.txt | 10 +- ggml/src/ggml-kompute/ggml-kompute.cpp | 7 +- ggml/src/ggml-metal/CMakeLists.txt | 9 +- ggml/src/ggml-metal/ggml-metal.m | 34 +++- ggml/src/ggml-musa/CMakeLists.txt | 10 +- ggml/src/ggml-rpc/CMakeLists.txt | 8 +- ggml/src/ggml-rpc/ggml-rpc.cpp | 7 +- ggml/src/ggml-sycl/CMakeLists.txt | 10 +- ggml/src/ggml-sycl/ggml-sycl.cpp | 12 +- ggml/src/ggml-vulkan/CMakeLists.txt | 12 +- ggml/src/ggml-vulkan/ggml-vulkan.cpp | 7 +- ggml/src/ggml.c | 23 +++ pocs/CMakeLists.txt | 4 +- src/llama.cpp | 77 ++++---- tests/CMakeLists.txt | 13 +- tests/test-backend-ops.cpp | 26 +-- 44 files changed, 728 insertions(+), 272 deletions(-) diff --git a/Makefile b/Makefile index dd6d864ad513a..14c05e93e7535 100644 --- a/Makefile +++ b/Makefile @@ -251,7 +251,7 @@ endif # # keep standard at C11 and C++11 -MK_CPPFLAGS = -Iggml/include -Iggml/src -Iinclude -Isrc -Icommon +MK_CPPFLAGS = -Iggml/include -Iggml/src -Iinclude -Isrc -Icommon -DGGML_USE_CPU MK_CFLAGS = -std=c11 -fPIC MK_CXXFLAGS = -std=c++11 -fPIC MK_NVCCFLAGS = -std=c++11 @@ -290,6 +290,7 @@ endif # some memory allocation are available on Linux through GNU extensions in libc ifeq ($(UNAME_S),Linux) MK_CPPFLAGS += -D_GNU_SOURCE + MK_LDFLAGS += -ldl endif # RLIMIT_MEMLOCK came in BSD, is not specified in POSIX.1, diff --git a/Package.swift b/Package.swift index 6b68aecdebec1..d9e8a4e2d21d7 100644 --- a/Package.swift +++ b/Package.swift @@ -43,7 +43,8 @@ linkerSettings.append(.linkedFramework("Accelerate")) cSettings.append( contentsOf: [ .define("GGML_USE_ACCELERATE"), - .define("GGML_USE_METAL") + .define("GGML_USE_METAL"), + .define("GGML_USE_CPU") ] ) #endif diff --git a/common/common.cpp b/common/common.cpp index c398329d05bf5..98524f7467ab4 100644 --- a/common/common.cpp +++ b/common/common.cpp @@ -377,6 +377,9 @@ void common_init() { #endif LOG_INF("build: %d (%s) with %s for %s%s\n", LLAMA_BUILD_NUMBER, LLAMA_COMMIT, LLAMA_COMPILER, LLAMA_BUILD_TARGET, build_type); + + // load dynamic backends + ggml_backend_load_all(); } std::string common_params_get_system_info(const common_params & params) { diff --git a/examples/CMakeLists.txt b/examples/CMakeLists.txt index 9bd099d4ef8a5..632409d5591b9 100644 --- a/examples/CMakeLists.txt +++ b/examples/CMakeLists.txt @@ -12,13 +12,10 @@ include_directories(${CMAKE_CURRENT_SOURCE_DIR}) if (EMSCRIPTEN) else() - add_subdirectory(cvector-generator) add_subdirectory(batched-bench) add_subdirectory(batched) - add_subdirectory(convert-llama2c-to-ggml) add_subdirectory(embedding) add_subdirectory(eval-callback) - add_subdirectory(export-lora) add_subdirectory(gbnf-validator) add_subdirectory(gguf-hash) add_subdirectory(gguf-split) @@ -27,24 +24,16 @@ else() add_subdirectory(imatrix) add_subdirectory(infill) add_subdirectory(llama-bench) - add_subdirectory(llava) add_subdirectory(lookahead) add_subdirectory(lookup) add_subdirectory(main) add_subdirectory(parallel) add_subdirectory(passkey) add_subdirectory(perplexity) - add_subdirectory(quantize-stats) add_subdirectory(quantize) add_subdirectory(retrieval) - if (GGML_RPC) - add_subdirectory(rpc) - endif() if (LLAMA_BUILD_SERVER) - add_subdirectory(server) - endif() - if (GGML_SYCL) - add_subdirectory(sycl) + add_subdirectory(server) endif() add_subdirectory(save-load-state) add_subdirectory(simple) @@ -52,4 +41,18 @@ else() add_subdirectory(speculative) add_subdirectory(speculative-simple) add_subdirectory(tokenize) + if (NOT GGML_BACKEND_DL) + # these examples use the backends directly and cannot be built with dynamic loading + add_subdirectory(convert-llama2c-to-ggml) + add_subdirectory(cvector-generator) + add_subdirectory(export-lora) + add_subdirectory(quantize-stats) + add_subdirectory(llava) + if (GGML_RPC) + add_subdirectory(rpc) + endif() + if (GGML_SYCL) + add_subdirectory(sycl) + endif() + endif() endif() diff --git a/examples/eval-callback/CMakeLists.txt b/examples/eval-callback/CMakeLists.txt index a48753d38e16e..5d1048aad74b6 100644 --- a/examples/eval-callback/CMakeLists.txt +++ b/examples/eval-callback/CMakeLists.txt @@ -5,5 +5,6 @@ target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT}) target_compile_features(${TARGET} PRIVATE cxx_std_11) set(TEST_TARGET test-eval-callback) -add_test(NAME ${TEST_TARGET} COMMAND llama-eval-callback --hf-repo ggml-org/models --hf-file tinyllamas/stories260K.gguf --model stories260K.gguf --prompt hello --seed 42 -ngl 0) +add_test(NAME ${TEST_TARGET} + COMMAND llama-eval-callback --hf-repo ggml-org/models --hf-file tinyllamas/stories260K.gguf --model stories260K.gguf --prompt hello --seed 42 -ngl 0) set_property(TEST ${TEST_TARGET} PROPERTY LABELS eval-callback curl) diff --git a/examples/llama-bench/llama-bench.cpp b/examples/llama-bench/llama-bench.cpp index 3dc84a75cbec7..bac606f471639 100644 --- a/examples/llama-bench/llama-bench.cpp +++ b/examples/llama-bench/llama-bench.cpp @@ -1477,6 +1477,17 @@ int main(int argc, char ** argv) { cmd_params params = parse_cmd_params(argc, argv); + // initialize backends + ggml_backend_load_all(); + auto * cpu_dev = ggml_backend_dev_by_type(GGML_BACKEND_DEVICE_TYPE_CPU); + if (!cpu_dev) { + fprintf(stderr, "%s: error: CPU backend is not loaded\n", __func__); + return 1; + } + auto * cpu_reg = ggml_backend_dev_backend_reg(cpu_dev); + auto * ggml_threadpool_new_fn = (decltype(ggml_threadpool_new) *) ggml_backend_reg_get_proc_address(cpu_reg, "ggml_threadpool_new"); + auto * ggml_threadpool_free_fn = (decltype(ggml_threadpool_free) *) ggml_backend_reg_get_proc_address(cpu_reg, "ggml_threadpool_free"); + // initialize llama.cpp if (!params.verbose) { llama_log_set(llama_null_log_callback, NULL); @@ -1551,7 +1562,7 @@ int main(int argc, char ** argv) { tpp.poll = t.poll; tpp.prio = params.prio; - struct ggml_threadpool * threadpool = ggml_threadpool_new(&tpp); + struct ggml_threadpool * threadpool = ggml_threadpool_new_fn(&tpp); if (!threadpool) { fprintf(stderr, "%s: threadpool create failed : n_threads %d\n", __func__, tpp.n_threads); exit(1); @@ -1612,7 +1623,7 @@ int main(int argc, char ** argv) { llama_free(ctx); - ggml_threadpool_free(threadpool); + ggml_threadpool_free_fn(threadpool); } llama_free_model(lmodel); diff --git a/examples/main/main.cpp b/examples/main/main.cpp index 957451af7ce0a..d0c28f317b8c5 100644 --- a/examples/main/main.cpp +++ b/examples/main/main.cpp @@ -165,6 +165,10 @@ int main(int argc, char ** argv) { LOG_INF("%s: llama threadpool init, n_threads = %d\n", __func__, (int) params.cpuparams.n_threads); + auto * reg = ggml_backend_dev_backend_reg(ggml_backend_dev_by_type(GGML_BACKEND_DEVICE_TYPE_CPU)); + auto * ggml_threadpool_new_fn = (decltype(ggml_threadpool_new) *) ggml_backend_reg_get_proc_address(reg, "ggml_threadpool_new"); + auto * ggml_threadpool_free_fn = (decltype(ggml_threadpool_free) *) ggml_backend_reg_get_proc_address(reg, "ggml_threadpool_free"); + struct ggml_threadpool_params tpp_batch = ggml_threadpool_params_from_cpu_params(params.cpuparams_batch); struct ggml_threadpool_params tpp = @@ -174,7 +178,7 @@ int main(int argc, char ** argv) { struct ggml_threadpool * threadpool_batch = NULL; if (!ggml_threadpool_params_match(&tpp, &tpp_batch)) { - threadpool_batch = ggml_threadpool_new(&tpp_batch); + threadpool_batch = ggml_threadpool_new_fn(&tpp_batch); if (!threadpool_batch) { LOG_ERR("%s: batch threadpool create failed : n_threads %d\n", __func__, tpp_batch.n_threads); return 1; @@ -184,7 +188,7 @@ int main(int argc, char ** argv) { tpp.paused = true; } - struct ggml_threadpool * threadpool = ggml_threadpool_new(&tpp); + struct ggml_threadpool * threadpool = ggml_threadpool_new_fn(&tpp); if (!threadpool) { LOG_ERR("%s: threadpool create failed : n_threads %d\n", __func__, tpp.n_threads); return 1; @@ -890,8 +894,8 @@ int main(int argc, char ** argv) { llama_backend_free(); - ggml_threadpool_free(threadpool); - ggml_threadpool_free(threadpool_batch); + ggml_threadpool_free_fn(threadpool); + ggml_threadpool_free_fn(threadpool_batch); return 0; } diff --git a/examples/simple-chat/simple-chat.cpp b/examples/simple-chat/simple-chat.cpp index 5f9973163732d..7f4da666b08ec 100644 --- a/examples/simple-chat/simple-chat.cpp +++ b/examples/simple-chat/simple-chat.cpp @@ -62,6 +62,9 @@ int main(int argc, char ** argv) { } }, nullptr); + // load dynamic backends + ggml_backend_load_all(); + // initialize the model llama_model_params model_params = llama_model_default_params(); model_params.n_gpu_layers = ngl; diff --git a/examples/simple/simple.cpp b/examples/simple/simple.cpp index 59760fe95db22..3288c0250a001 100644 --- a/examples/simple/simple.cpp +++ b/examples/simple/simple.cpp @@ -74,6 +74,10 @@ int main(int argc, char ** argv) { } } + // load dynamic backends + + ggml_backend_load_all(); + // initialize the model llama_model_params model_params = llama_model_default_params(); diff --git a/ggml/CMakeLists.txt b/ggml/CMakeLists.txt index 2d32da1b6d879..70b5cfdf7fbb4 100644 --- a/ggml/CMakeLists.txt +++ b/ggml/CMakeLists.txt @@ -33,6 +33,7 @@ else() endif() option(BUILD_SHARED_LIBS "ggml: build shared libraries" ${BUILD_SHARED_LIBS_DEFAULT}) +option(GGML_BACKEND_DL "ggml: build backends as dynamic libraries (requires BUILD_SHARED_LIBS)" OFF) # # option list diff --git a/ggml/include/ggml-backend.h b/ggml/include/ggml-backend.h index cef164764bb1a..19881a5059f17 100644 --- a/ggml/include/ggml-backend.h +++ b/ggml/include/ggml-backend.h @@ -190,6 +190,14 @@ extern "C" { typedef void (*ggml_backend_set_n_threads_t)(ggml_backend_t backend, int n_threads); // Get additional buffer types provided by the device (returns a NULL-terminated array) typedef ggml_backend_buffer_type_t * (*ggml_backend_dev_get_extra_bufts_t)(ggml_backend_dev_t device); + // Set the abort callback for the backend + typedef void (*ggml_backend_set_abort_callback_t)(ggml_backend_t backend, ggml_abort_callback abort_callback, void * abort_callback_data); + // Get a list of feature flags supported by the backend (returns a NULL-terminated array) + struct ggml_backend_feature { + const char * name; + const char * value; + }; + typedef struct ggml_backend_feature * (*ggml_backend_get_features_t)(ggml_backend_reg_t reg); // // Backend registry @@ -214,6 +222,13 @@ extern "C" { // = ggml_backend_dev_init(ggml_backend_dev_by_type(GPU) OR ggml_backend_dev_by_type(CPU), NULL) GGML_API ggml_backend_t ggml_backend_init_best(void); + // Load a backend from a dynamic library and register it + GGML_API ggml_backend_reg_t ggml_backend_load(const char * path); + // Unload a backend if loaded dynamically and unregister it + GGML_API void ggml_backend_unload(ggml_backend_reg_t reg); + // Load all known backends from dynamic libraries + GGML_API void ggml_backend_load_all(void); + // // Backend scheduler // diff --git a/ggml/include/ggml-cpu.h b/ggml/include/ggml-cpu.h index 7571ef9798364..a5358d047a08e 100644 --- a/ggml/include/ggml-cpu.h +++ b/ggml/include/ggml-cpu.h @@ -7,29 +7,6 @@ extern "C" { #endif - // Scheduling priorities - enum ggml_sched_priority { - GGML_SCHED_PRIO_NORMAL, - GGML_SCHED_PRIO_MEDIUM, - GGML_SCHED_PRIO_HIGH, - GGML_SCHED_PRIO_REALTIME - }; - - // Threadpool params - // Use ggml_threadpool_params_default() or ggml_threadpool_params_init() to populate the defaults - struct ggml_threadpool_params { - bool cpumask[GGML_MAX_N_THREADS]; // mask of cpu cores (all-zeros means use default affinity settings) - int n_threads; // number of threads - enum ggml_sched_priority prio; // thread priority - uint32_t poll; // polling level (0 - no polling, 100 - aggressive polling) - bool strict_cpu; // strict cpu placement - bool paused; // start in paused state - }; - - struct ggml_threadpool; // forward declaration, see ggml.c - - typedef struct ggml_threadpool * ggml_threadpool_t; - // the compute plan that needs to be prepared for ggml_graph_compute() // since https://github.com/ggerganov/ggml/issues/287 struct ggml_cplan { @@ -75,14 +52,11 @@ extern "C" { GGML_BACKEND_API float ggml_get_f32_nd(const struct ggml_tensor * tensor, int i0, int i1, int i2, int i3); GGML_BACKEND_API void ggml_set_f32_nd(const struct ggml_tensor * tensor, int i0, int i1, int i2, int i3, float value); - GGML_BACKEND_API struct ggml_threadpool_params ggml_threadpool_params_default(int n_threads); - GGML_BACKEND_API void ggml_threadpool_params_init (struct ggml_threadpool_params * p, int n_threads); - GGML_BACKEND_API bool ggml_threadpool_params_match (const struct ggml_threadpool_params * p0, const struct ggml_threadpool_params * p1); - GGML_BACKEND_API struct ggml_threadpool * ggml_threadpool_new (struct ggml_threadpool_params * params); - GGML_BACKEND_API void ggml_threadpool_free (struct ggml_threadpool * threadpool); - GGML_BACKEND_API int ggml_threadpool_get_n_threads(struct ggml_threadpool * threadpool); - GGML_BACKEND_API void ggml_threadpool_pause (struct ggml_threadpool * threadpool); - GGML_BACKEND_API void ggml_threadpool_resume (struct ggml_threadpool * threadpool); + GGML_BACKEND_API struct ggml_threadpool * ggml_threadpool_new (struct ggml_threadpool_params * params); + GGML_BACKEND_API void ggml_threadpool_free (struct ggml_threadpool * threadpool); + GGML_BACKEND_API int ggml_threadpool_get_n_threads (struct ggml_threadpool * threadpool); + GGML_BACKEND_API void ggml_threadpool_pause (struct ggml_threadpool * threadpool); + GGML_BACKEND_API void ggml_threadpool_resume (struct ggml_threadpool * threadpool); // ggml_graph_plan() has to be called before ggml_graph_compute() // when plan.work_size > 0, caller must allocate memory for plan.work_data @@ -104,10 +78,10 @@ extern "C" { GGML_BACKEND_API int ggml_cpu_has_sse3 (void); GGML_BACKEND_API int ggml_cpu_has_ssse3 (void); GGML_BACKEND_API int ggml_cpu_has_avx (void); + GGML_BACKEND_API int ggml_cpu_has_avx_vnni (void); GGML_BACKEND_API int ggml_cpu_has_avx2 (void); GGML_BACKEND_API int ggml_cpu_has_f16c (void); GGML_BACKEND_API int ggml_cpu_has_fma (void); - GGML_BACKEND_API int ggml_cpu_has_avx_vnni (void); GGML_BACKEND_API int ggml_cpu_has_avx512 (void); GGML_BACKEND_API int ggml_cpu_has_avx512_vbmi(void); GGML_BACKEND_API int ggml_cpu_has_avx512_vnni(void); diff --git a/ggml/include/ggml.h b/ggml/include/ggml.h index 69e6a24344b97..9843b09fbe83e 100644 --- a/ggml/include/ggml.h +++ b/ggml/include/ggml.h @@ -2215,6 +2215,37 @@ extern "C" { GGML_API const struct ggml_type_traits * ggml_get_type_traits(enum ggml_type type); + // ggml threadpool + // TODO: currently, only a few functions are in the base ggml API, while the rest are in the CPU backend + // the goal should be to create an API that other backends can use move everything to the ggml base + + // scheduling priorities + enum ggml_sched_priority { + GGML_SCHED_PRIO_NORMAL, + GGML_SCHED_PRIO_MEDIUM, + GGML_SCHED_PRIO_HIGH, + GGML_SCHED_PRIO_REALTIME + }; + + // threadpool params + // Use ggml_threadpool_params_default() or ggml_threadpool_params_init() to populate the defaults + struct ggml_threadpool_params { + bool cpumask[GGML_MAX_N_THREADS]; // mask of cpu cores (all-zeros means use default affinity settings) + int n_threads; // number of threads + enum ggml_sched_priority prio; // thread priority + uint32_t poll; // polling level (0 - no polling, 100 - aggressive polling) + bool strict_cpu; // strict cpu placement + bool paused; // start in paused state + }; + + struct ggml_threadpool; // forward declaration, see ggml.c + + typedef struct ggml_threadpool * ggml_threadpool_t; + + GGML_API struct ggml_threadpool_params ggml_threadpool_params_default(int n_threads); + GGML_API void ggml_threadpool_params_init (struct ggml_threadpool_params * p, int n_threads); + GGML_API bool ggml_threadpool_params_match (const struct ggml_threadpool_params * p0, const struct ggml_threadpool_params * p1); + #ifdef __cplusplus } #endif diff --git a/ggml/src/CMakeLists.txt b/ggml/src/CMakeLists.txt index 8df0e85c0d092..071508ddae021 100644 --- a/ggml/src/CMakeLists.txt +++ b/ggml/src/CMakeLists.txt @@ -202,6 +202,10 @@ endif() # ggml +if (GGML_BACKEND_DL AND NOT BUILD_SHARED_LIBS) + message(FATAL_ERROR "GGML_BACKEND_DL requires BUILD_SHARED_LIBS") +endif() + add_library(ggml-base ../include/ggml.h ../include/ggml-alloc.h @@ -226,6 +230,31 @@ add_library(ggml target_link_libraries(ggml PUBLIC ggml-base) +if (CMAKE_SYSTEM_NAME MATCHES "Linux") + target_link_libraries(ggml PRIVATE dl) +endif() + +function(ggml_add_backend_library backend) + if (GGML_BACKEND_DL) + add_library(${backend} MODULE ${ARGN}) + # write the shared library to the output directory + set_target_properties(${backend} PROPERTIES LIBRARY_OUTPUT_DIRECTORY ${CMAKE_RUNTIME_OUTPUT_DIRECTORY}) + target_compile_definitions(${backend} PRIVATE GGML_BACKEND_DL) + else() + add_library(${backend} ${ARGN}) + target_link_libraries(ggml PUBLIC ${backend}) + install(TARGETS ${backend} LIBRARY) + endif() + + target_link_libraries(${backend} PRIVATE ggml-base) + target_include_directories(${backend} PRIVATE ..) + + if (${BUILD_SHARED_LIBS}) + target_compile_definitions(${backend} PRIVATE GGML_BACKEND_BUILD) + target_compile_definitions(${backend} PUBLIC GGML_BACKEND_SHARED) + endif() +endfunction() + function(ggml_add_backend backend) string(TOUPPER "GGML_${backend}" backend_id) if (${backend_id}) @@ -236,14 +265,10 @@ function(ggml_add_backend backend) # however, currently it is necessary for AMX, since it is enabled by default on llama.cpp if (${backend_id}) message(STATUS "Including ${backend} backend") - if (${BUILD_SHARED_LIBS}) - target_compile_definitions(${backend_target} PRIVATE GGML_BACKEND_BUILD) - target_compile_definitions(${backend_target} PUBLIC GGML_BACKEND_SHARED) + if (NOT GGML_BACKEND_DL) + string(TOUPPER "GGML_USE_${backend}" backend_use) + target_compile_definitions(ggml PUBLIC ${backend_use}) endif() - install(TARGETS ${backend_target} LIBRARY) - target_link_libraries(ggml PUBLIC ${backend_target}) - string(TOUPPER "GGML_USE_${backend}" backend_use) - target_compile_definitions(ggml PUBLIC ${backend_use}) endif() endif() endfunction() @@ -256,10 +281,10 @@ ggml_add_backend(CUDA) ggml_add_backend(HIP) ggml_add_backend(Kompute) ggml_add_backend(METAL) +ggml_add_backend(MUSA) ggml_add_backend(RPC) ggml_add_backend(SYCL) ggml_add_backend(Vulkan) -ggml_add_backend(MUSA) foreach (target ggml-base ggml) target_include_directories(${target} PUBLIC $ $) diff --git a/ggml/src/ggml-amx/CMakeLists.txt b/ggml/src/ggml-amx/CMakeLists.txt index d6676f3f67b20..cf3ade6f020ed 100644 --- a/ggml/src/ggml-amx/CMakeLists.txt +++ b/ggml/src/ggml-amx/CMakeLists.txt @@ -9,12 +9,10 @@ if (CMAKE_OSX_ARCHITECTURES STREQUAL "x86_64" OR CMAKE_GENERATOR_PLATFORM_LWR MA file(GLOB GGML_SOURCES_AMX "*.cpp") - add_library(ggml-amx - ${GGML_HEADERS_AMX} - ${GGML_SOURCES_AMX}) - - target_link_libraries(ggml-amx PRIVATE ggml-base) - target_include_directories(ggml-amx PRIVATE . ..) + ggml_add_backend_library(ggml-amx + ${GGML_HEADERS_AMX} + ${GGML_SOURCES_AMX} + ) # this is duplicated from the CPU backend, since the AMX backend also depends on the architecture flags # TODO: integrate AMX backend into the CPU backend diff --git a/ggml/src/ggml-amx/ggml-amx.cpp b/ggml/src/ggml-amx/ggml-amx.cpp index 8568e7965fd2e..6bfb3da274c39 100644 --- a/ggml/src/ggml-amx/ggml-amx.cpp +++ b/ggml/src/ggml-amx/ggml-amx.cpp @@ -409,8 +409,9 @@ static const struct ggml_backend_reg_i ggml_backend_amx_reg_i = { ggml_backend_reg_t ggml_backend_amx_reg(void) { static struct ggml_backend_reg ggml_backend_amx_reg = { - /* .iface = */ ggml_backend_amx_reg_i, - /* .context = */ NULL, + /* .api_version = */ GGML_BACKEND_API_VERSION, + /* .iface = */ ggml_backend_amx_reg_i, + /* .context = */ NULL, }; return &ggml_backend_amx_reg; @@ -444,3 +445,5 @@ ggml_backend_reg_t ggml_backend_amx_reg(void) { } #endif + +GGML_BACKEND_DL_IMPL(ggml_backend_amx_reg) diff --git a/ggml/src/ggml-backend-impl.h b/ggml/src/ggml-backend-impl.h index fa8d5b7fb68c9..dff7749b416dc 100644 --- a/ggml/src/ggml-backend-impl.h +++ b/ggml/src/ggml-backend-impl.h @@ -8,6 +8,8 @@ extern "C" { #endif + #define GGML_BACKEND_API_VERSION 1 + // // Backend buffer type // @@ -63,20 +65,20 @@ extern "C" { enum ggml_backend_buffer_usage usage; }; - ggml_backend_buffer_t ggml_backend_buffer_init( + GGML_API ggml_backend_buffer_t ggml_backend_buffer_init( ggml_backend_buffer_type_t buft, struct ggml_backend_buffer_i iface, void * context, size_t size); // do not use directly, use ggml_backend_tensor_copy instead - bool ggml_backend_buffer_copy_tensor(const struct ggml_tensor * src, struct ggml_tensor * dst); + GGML_API bool ggml_backend_buffer_copy_tensor(const struct ggml_tensor * src, struct ggml_tensor * dst); // multi-buffer // buffer that contains a collection of buffers - ggml_backend_buffer_t ggml_backend_multi_buffer_alloc_buffer(ggml_backend_buffer_t * buffers, size_t n_buffers); - bool ggml_backend_buffer_is_multi_buffer(ggml_backend_buffer_t buffer); - void ggml_backend_multi_buffer_set_usage(ggml_backend_buffer_t buffer, enum ggml_backend_buffer_usage usage); + GGML_API ggml_backend_buffer_t ggml_backend_multi_buffer_alloc_buffer(ggml_backend_buffer_t * buffers, size_t n_buffers); + GGML_API bool ggml_backend_buffer_is_multi_buffer(ggml_backend_buffer_t buffer); + GGML_API void ggml_backend_multi_buffer_set_usage(ggml_backend_buffer_t buffer, enum ggml_backend_buffer_usage usage); // // Backend (stream) @@ -199,17 +201,37 @@ extern "C" { }; struct ggml_backend_reg { - // int api_version; // TODO: for dynamic loading + int api_version; // initialize to GGML_BACKEND_API_VERSION struct ggml_backend_reg_i iface; void * context; }; - // Internal backend registry API - void ggml_backend_register(ggml_backend_reg_t reg); - void ggml_backend_device_register(ggml_backend_dev_t device); - // TODO: backends can be loaded as a dynamic library, in which case it needs to export this function - // typedef ggml_backend_register_t * (*ggml_backend_init)(void); + GGML_API void ggml_backend_register(ggml_backend_reg_t reg); + GGML_API void ggml_backend_device_register(ggml_backend_dev_t device); + + // Add backend dynamic loading support to the backend + typedef ggml_backend_reg_t (*ggml_backend_init_t)(void); + + #ifdef GGML_BACKEND_DL + #ifdef __cplusplus + # define GGML_BACKEND_DL_IMPL(reg_fn) \ + extern "C" { \ + GGML_BACKEND_API ggml_backend_reg_t ggml_backend_init(void); \ + } \ + ggml_backend_reg_t ggml_backend_init(void) { \ + return reg_fn(); \ + } + #else + # define GGML_BACKEND_DL_IMPL(reg_fn) \ + GGML_BACKEND_API ggml_backend_reg_t ggml_backend_init(void); \ + ggml_backend_reg_t ggml_backend_init(void) { \ + return reg_fn(); \ + } + #endif + #else + # define GGML_BACKEND_DL_IMPL(reg_fn) + #endif #ifdef __cplusplus } diff --git a/ggml/src/ggml-backend-reg.cpp b/ggml/src/ggml-backend-reg.cpp index 63e9d82017457..43d03d7fa7385 100644 --- a/ggml/src/ggml-backend-reg.cpp +++ b/ggml/src/ggml-backend-reg.cpp @@ -1,11 +1,29 @@ #include "ggml-backend-impl.h" #include "ggml-backend.h" -#include "ggml-cpu.h" #include "ggml-impl.h" +#include #include +#include #include +#ifdef _WIN32 +# define WIN32_LEAN_AND_MEAN +# ifndef NOMINMAX +# define NOMINMAX +# endif +# include +#elif defined(__APPLE__) +# include +# include +#else +# include +# include +#endif + // Backend registry +#ifdef GGML_USE_CPU +#include "ggml-cpu.h" +#endif #ifdef GGML_USE_CUDA #include "ggml-cuda.h" @@ -43,8 +61,13 @@ #include "ggml-kompute.h" #endif +struct ggml_backend_reg_entry { + ggml_backend_reg_t reg; + void * handle; +}; + struct ggml_backend_registry { - std::vector backends; + std::vector backends; std::vector devices; ggml_backend_registry() { @@ -75,11 +98,19 @@ struct ggml_backend_registry { #ifdef GGML_USE_KOMPUTE register_backend(ggml_backend_kompute_reg()); #endif - +#ifdef GGML_USE_CPU register_backend(ggml_backend_cpu_reg()); +#endif } - void register_backend(ggml_backend_reg_t reg) { + ~ggml_backend_registry() { + while (!backends.empty()) { + // use silent since the log system may have been destroyed at this point + unload_backend(backends.back().reg, true); + } + } + + void register_backend(ggml_backend_reg_t reg, void * handle = nullptr) { if (!reg) { return; } @@ -88,7 +119,7 @@ struct ggml_backend_registry { GGML_LOG_DEBUG("%s: registered backend %s (%zu devices)\n", __func__, ggml_backend_reg_name(reg), ggml_backend_reg_dev_count(reg)); #endif - backends.push_back(reg); + backends.push_back({ reg, handle }); for (size_t i = 0; i < ggml_backend_reg_dev_count(reg); i++) { register_device(ggml_backend_reg_dev_get(reg, i)); } @@ -100,6 +131,111 @@ struct ggml_backend_registry { #endif devices.push_back(device); } + + ggml_backend_reg_t load_backend(const char * path, bool silent) { +#ifdef _WIN32 + // suppress error dialogs for missing DLLs + DWORD old_mode = SetErrorMode(SEM_FAILCRITICALERRORS); + SetErrorMode(old_mode | SEM_FAILCRITICALERRORS); + + HMODULE handle = LoadLibraryA(path); + + if (!handle) { + if (!silent) { + GGML_LOG_ERROR("%s: failed to load %s: %lu\n", __func__, path, GetLastError()); + } + SetErrorMode(old_mode); + return nullptr; + } + + ggml_backend_init_t backend_init = (ggml_backend_init_t) GetProcAddress(handle, "ggml_backend_init"); + + SetErrorMode(old_mode); + + if (!backend_init) { + if (!silent) { + GGML_LOG_ERROR("%s: failed to find ggml_backend_init in %s: %lu\n", __func__, path, GetLastError()); + } + FreeLibrary(handle); + return nullptr; + } +#else + void * handle = dlopen(path, RTLD_NOW | RTLD_LOCAL); + + if (!handle) { + if (!silent) { + GGML_LOG_ERROR("%s: failed to load %s: %s\n", __func__, path, dlerror()); + } + return nullptr; + } + + auto * backend_init = (ggml_backend_init_t) dlsym(handle, "ggml_backend_init"); + + if (!backend_init) { + if (!silent) { + GGML_LOG_ERROR("%s: failed to find ggml_backend_init in %s: %s\n", __func__, path, dlerror()); + } + dlclose(handle); + return nullptr; + } +#endif + ggml_backend_reg_t reg = backend_init(); + + if (!reg || reg->api_version != GGML_BACKEND_API_VERSION) { + if (!silent) { + if (!reg) { + GGML_LOG_ERROR("%s: failed to initialize backend from %s: ggml_backend_init returned NULL\n", __func__, path); + } else { + GGML_LOG_ERROR("%s: failed to initialize backend from %s: incompatible API version (backend: %d, current: %d)\n", + __func__, path, reg->api_version, GGML_BACKEND_API_VERSION); + } + } +#ifdef _WIN32 + FreeLibrary(handle); +#else + dlclose(handle); +#endif + return nullptr; + } + + GGML_LOG_INFO("%s: loaded %s backend from %s\n", __func__, ggml_backend_reg_name(reg), path); + register_backend(reg, handle); + return reg; + } + + void unload_backend(ggml_backend_reg_t reg, bool silent) { + auto it = std::find_if(backends.begin(), backends.end(), + [reg](ggml_backend_reg_entry entry) { return entry.reg == reg; }); + + if (it == backends.end()) { + if (!silent) { + GGML_LOG_ERROR("%s: backend not found\n", __func__); + } + return; + } + + if (!silent) { + GGML_LOG_DEBUG("%s: unloading %s backend\n", __func__, ggml_backend_reg_name(reg)); + } + + // remove devices + devices.erase( + std::remove_if(devices.begin(), devices.end(), + [reg](ggml_backend_dev_t dev) { return ggml_backend_dev_backend_reg(dev) == reg; }), + devices.end()); + + // unload library + if (it->handle) { +#ifdef _WIN32 + FreeLibrary((HMODULE) it->handle); +#else + dlclose(it->handle); +#endif + } + + // remove backend + backends.erase(it); + } }; static ggml_backend_registry & get_reg() { @@ -123,7 +259,7 @@ size_t ggml_backend_reg_count() { ggml_backend_reg_t ggml_backend_reg_get(size_t index) { GGML_ASSERT(index < ggml_backend_reg_count()); - return get_reg().backends[index]; + return get_reg().backends[index].reg; } ggml_backend_reg_t ggml_backend_reg_by_name(const char * name) { @@ -133,7 +269,7 @@ ggml_backend_reg_t ggml_backend_reg_by_name(const char * name) { return reg; } } - return NULL; + return nullptr; } // Device enumeration @@ -153,7 +289,7 @@ ggml_backend_dev_t ggml_backend_dev_by_name(const char * name) { return dev; } } - return NULL; + return nullptr; } ggml_backend_dev_t ggml_backend_dev_by_type(enum ggml_backend_dev_type type) { @@ -163,14 +299,14 @@ ggml_backend_dev_t ggml_backend_dev_by_type(enum ggml_backend_dev_type type) { return dev; } } - return NULL; + return nullptr; } // Convenience functions ggml_backend_t ggml_backend_init_by_name(const char * name, const char * params) { ggml_backend_dev_t dev = ggml_backend_dev_by_name(name); if (!dev) { - return NULL; + return nullptr; } return ggml_backend_dev_init(dev, params); } @@ -178,7 +314,7 @@ ggml_backend_t ggml_backend_init_by_name(const char * name, const char * params) ggml_backend_t ggml_backend_init_by_type(enum ggml_backend_dev_type type, const char * params) { ggml_backend_dev_t dev = ggml_backend_dev_by_type(type); if (!dev) { - return NULL; + return nullptr; } return ggml_backend_dev_init(dev, params); } @@ -189,7 +325,97 @@ ggml_backend_t ggml_backend_init_best(void) { dev = ggml_backend_dev_by_type(GGML_BACKEND_DEVICE_TYPE_CPU); } if (!dev) { - return NULL; + return nullptr; } - return ggml_backend_dev_init(dev, NULL); + return ggml_backend_dev_init(dev, nullptr); +} + +// Dynamic loading +ggml_backend_reg_t ggml_backend_load(const char * path) { + return get_reg().load_backend(path, false); +} + +void ggml_backend_unload(ggml_backend_reg_t reg) { + get_reg().unload_backend(reg, true); +} + +void ggml_backend_load_all() { + std::vector search_prefix; + + // add the executable directory to the search path + // FIXME: this is convenient for development, but it should probably be disabled in production + +#if defined(__APPLE__) + // get executable path + std::vector path; + uint32_t size; + while (true) { + size = path.size(); + if (_NSGetExecutablePath(path.data(), &size) == 0) { + break; + } + path.resize(size); + } + std::string base_path(path.data(), size); + // remove executable name + auto last_slash = base_path.find_last_of('/'); + if (last_slash != std::string::npos) { + base_path = base_path.substr(0, last_slash); + } + search_prefix.push_back(base_path + "/"); +#elif defined(__linux__) + std::string base_path = "."; + std::vector path(1024); + while (true) { + // get executable path + ssize_t len = readlink("/proc/self/exe", path.data(), path.size()); + if (len == -1) { + break; + } + if (len < (ssize_t) path.size()) { + base_path = std::string(path.data(), len); + // remove executable name + auto last_slash = base_path.find_last_of('/'); + if (last_slash != std::string::npos) { + base_path = base_path.substr(0, last_slash); + } + break; + } + path.resize(path.size() * 2); + } + + search_prefix.push_back(base_path + "/"); +#endif + + auto & reg = get_reg(); + + auto try_load = [&](const std::string & name) { + std::string os_name; +#ifdef _WIN32 + os_name = "ggml-" + name + ".dll"; +#else + os_name = "libggml-" + name + ".so"; +#endif + if (reg.load_backend(os_name.c_str(), true)) { + return; + } + for (const auto & prefix : search_prefix) { + if (reg.load_backend((prefix + os_name).c_str(), true)) { + return; + } + } + }; + + try_load("amx"); + try_load("blas"); + try_load("cann"); + try_load("cuda"); + try_load("hip"); + try_load("kompute"); + try_load("metal"); + try_load("rpc"); + try_load("sycl"); + try_load("vulkan"); + try_load("musa"); + try_load("cpu"); } diff --git a/ggml/src/ggml-blas/CMakeLists.txt b/ggml/src/ggml-blas/CMakeLists.txt index e2cbabf0dae74..0bf3c05d93a89 100644 --- a/ggml/src/ggml-blas/CMakeLists.txt +++ b/ggml/src/ggml-blas/CMakeLists.txt @@ -11,12 +11,9 @@ find_package(BLAS) if (BLAS_FOUND) message(STATUS "BLAS found, Libraries: ${BLAS_LIBRARIES}") - add_library(ggml-blas - ggml-blas.cpp - ) - - target_link_libraries(ggml-blas PRIVATE ggml-base) - target_include_directories(ggml-blas PRIVATE . ..) + ggml_add_backend_library(ggml-blas + ggml-blas.cpp + ) if (${GGML_BLAS_VENDOR} MATCHES "Apple") add_compile_definitions(ACCELERATE_NEW_LAPACK) diff --git a/ggml/src/ggml-blas/ggml-blas.cpp b/ggml/src/ggml-blas/ggml-blas.cpp index 648c9d875e346..ec158dfac6e3e 100644 --- a/ggml/src/ggml-blas/ggml-blas.cpp +++ b/ggml/src/ggml-blas/ggml-blas.cpp @@ -506,9 +506,12 @@ static const struct ggml_backend_reg_i ggml_backend_blas_reg_i = { ggml_backend_reg_t ggml_backend_blas_reg(void) { static struct ggml_backend_reg ggml_backend_blas_reg = { - /* .iface = */ ggml_backend_blas_reg_i, - /* .context = */ NULL, + /* .api_version = */ GGML_BACKEND_API_VERSION, + /* .iface = */ ggml_backend_blas_reg_i, + /* .context = */ NULL, }; return &ggml_backend_blas_reg; } + +GGML_BACKEND_DL_IMPL(ggml_backend_blas_reg) diff --git a/ggml/src/ggml-cann/CMakeLists.txt b/ggml/src/ggml-cann/CMakeLists.txt index 756200b893d02..901327185fb75 100644 --- a/ggml/src/ggml-cann/CMakeLists.txt +++ b/ggml/src/ggml-cann/CMakeLists.txt @@ -61,9 +61,9 @@ if (CANN_INSTALL_DIR) file(GLOB GGML_SOURCES_CANN "*.cpp") - add_library(ggml-cann ${GGML_SOURCES_CANN}) - target_link_libraries(ggml-cann PRIVATE ggml-base ${CANN_LIBRARIES}) - target_include_directories(ggml-cann PRIVATE . .. ${CANN_INCLUDE_DIRS}) + ggml_add_backend_library(ggml-cann ${GGML_SOURCES_CANN}) + target_link_libraries(ggml-cann PRIVATE ${CANN_LIBRARIES}) + target_include_directories(ggml-cann PRIVATE ${CANN_INCLUDE_DIRS}) target_link_directories(ggml-cann PRIVATE ${CANN_INSTALL_DIR}/lib64) target_compile_definitions(ggml-cann PRIVATE "-D${SOC_TYPE_COMPILE_OPTION}") diff --git a/ggml/src/ggml-cann/ggml-cann.cpp b/ggml/src/ggml-cann/ggml-cann.cpp index 776340881434d..d96f65936136d 100644 --- a/ggml/src/ggml-cann/ggml-cann.cpp +++ b/ggml/src/ggml-cann/ggml-cann.cpp @@ -2064,16 +2064,17 @@ ggml_backend_reg_t ggml_backend_cann_reg() { dev_ctx->name = GGML_CANN_NAME + std::to_string(i); ggml_cann_set_device(i); ggml_backend_dev_t dev = new ggml_backend_device { - /* .interface = */ ggml_backend_cann_device_interface, - /* .reg = */ ®, - /* .context = */ dev_ctx + /* .iface = */ ggml_backend_cann_device_interface, + /* .reg = */ ®, + /* .context = */ dev_ctx }; ctx->devices.push_back(dev); } reg = ggml_backend_reg { - /* .interface = */ ggml_backend_cann_reg_interface, - /* .context = */ ctx + /* .api_version = */ GGML_BACKEND_API_VERSION, + /* .iface = */ ggml_backend_cann_reg_interface, + /* .context = */ ctx }; } @@ -2126,3 +2127,5 @@ void ggml_backend_cann_get_device_memory(int32_t device, size_t* free, ggml_cann_set_device(device); ACL_CHECK(aclrtGetMemInfo(ACL_HBM_MEM, free, total)); } + +GGML_BACKEND_DL_IMPL(ggml_backend_cann_reg) diff --git a/ggml/src/ggml-cpu/CMakeLists.txt b/ggml/src/ggml-cpu/CMakeLists.txt index 2880523331dbd..c2905d1fbf4e8 100644 --- a/ggml/src/ggml-cpu/CMakeLists.txt +++ b/ggml/src/ggml-cpu/CMakeLists.txt @@ -1,14 +1,13 @@ -add_library(ggml-cpu - ggml-cpu.c - ggml-cpu.cpp - ggml-cpu-aarch64.c - ggml-cpu-aarch64.h - ggml-cpu-quants.c - ggml-cpu-quants.h - ) +ggml_add_backend_library(ggml-cpu + ggml-cpu.c + ggml-cpu.cpp + ggml-cpu-aarch64.c + ggml-cpu-aarch64.h + ggml-cpu-quants.c + ggml-cpu-quants.h + ) -target_link_libraries(ggml-cpu PRIVATE ggml-base) -target_include_directories(ggml-cpu PRIVATE . ..) +target_include_directories(ggml-cpu PRIVATE .) if (APPLE AND GGML_ACCELERATE) find_library(ACCELERATE_FRAMEWORK Accelerate) diff --git a/ggml/src/ggml-cpu/ggml-cpu.c b/ggml/src/ggml-cpu/ggml-cpu.c index 4b58254e7d108..c6ede19d9d1c0 100644 --- a/ggml/src/ggml-cpu/ggml-cpu.c +++ b/ggml/src/ggml-cpu/ggml-cpu.c @@ -13578,29 +13578,6 @@ static void ggml_graph_compute_kickoff(struct ggml_threadpool * threadpool, int #endif // GGML_USE_OPENMP -void ggml_threadpool_params_init(struct ggml_threadpool_params * p, int n_threads) { - p->n_threads = n_threads; - p->prio = 0; // default priority (usually means normal or inherited) - p->poll = 50; // hybrid-polling enabled - p->strict_cpu = false; // no strict placement (all threads share same cpumask) - p->paused = false; // threads are ready to go - memset(p->cpumask, 0, GGML_MAX_N_THREADS); // all-zero means use the default affinity (usually inherited) -} - -struct ggml_threadpool_params ggml_threadpool_params_default(int n_threads) { - struct ggml_threadpool_params p; - ggml_threadpool_params_init(&p, n_threads); - return p; -} - -bool ggml_threadpool_params_match(const struct ggml_threadpool_params * p0, const struct ggml_threadpool_params * p1) { - if (p0->n_threads != p1->n_threads ) return false; - if (p0->prio != p1->prio ) return false; - if (p0->poll != p1->poll ) return false; - if (p0->strict_cpu != p1->strict_cpu ) return false; - return memcmp(p0->cpumask, p1->cpumask, GGML_MAX_N_THREADS) == 0; -} - static struct ggml_threadpool * ggml_threadpool_new_impl( struct ggml_threadpool_params * tpp, struct ggml_cgraph * cgraph, diff --git a/ggml/src/ggml-cpu/ggml-cpu.cpp b/ggml/src/ggml-cpu/ggml-cpu.cpp index 573b7c5b9b375..febed433ada2b 100644 --- a/ggml/src/ggml-cpu/ggml-cpu.cpp +++ b/ggml/src/ggml-cpu/ggml-cpu.cpp @@ -541,16 +541,12 @@ static ggml_backend_dev_t ggml_backend_cpu_reg_get_device(ggml_backend_reg_t reg return &ggml_backend_cpu_device; } -struct ggml_backend_feature { - const char * name; - const char * value; -}; - -// Not used yet // This is intended to replace the the ggml_cpu_has_* functions when loading the CPU backend dynamically, -// and additionally to allow other backends to expose their own list of features that applications can query using the same API. +// and additionally to allow other backends to expose their own list of features that applications can query using the same API static ggml_backend_feature * ggml_backend_cpu_get_features(ggml_backend_reg_t reg) { static std::vector features = []() { + ggml_cpu_init(); + std::vector features; if (ggml_cpu_has_sse3()) { features.push_back({ "SSE3", "1" }); @@ -561,6 +557,9 @@ static ggml_backend_feature * ggml_backend_cpu_get_features(ggml_backend_reg_t r if (ggml_cpu_has_avx()) { features.push_back({ "AVX", "1" }); } + if (ggml_cpu_has_avx_vnni()) { + features.push_back({ "AVX_VNNI", "1" }); + } if (ggml_cpu_has_avx2()) { features.push_back({ "AVX2", "1" }); } @@ -570,9 +569,6 @@ static ggml_backend_feature * ggml_backend_cpu_get_features(ggml_backend_reg_t r if (ggml_cpu_has_fma()) { features.push_back({ "FMA", "1" }); } - if (ggml_cpu_has_avx_vnni()) { - features.push_back({ "AVX_VNNI", "1" }); - } if (ggml_cpu_has_avx512()) { features.push_back({ "AVX512", "1" }); } @@ -619,6 +615,10 @@ static ggml_backend_feature * ggml_backend_cpu_get_features(ggml_backend_reg_t r if (ggml_cpu_has_llamafile()) { features.push_back({ "LLAMAFILE", "1" }); } + // TODO: rename this + #ifdef GGML_USE_CPU_AARCH64 + features.push_back({ "AARCH64_REPACK", "1" }); + #endif features.push_back({ nullptr, nullptr }); @@ -637,6 +637,29 @@ static void * ggml_backend_cpu_get_proc_address(ggml_backend_reg_t reg, const ch if (strcmp(name, "ggml_backend_dev_get_extra_bufts") == 0) { return (void *)ggml_backend_cpu_get_extra_bufts; } + if (strcmp(name, "ggml_backend_get_features") == 0) { + return (void *)ggml_backend_cpu_get_features; + } + if (strcmp(name, "ggml_backend_set_abort_callback") == 0) { + return (void *)ggml_backend_cpu_set_abort_callback; + } + if (strcmp(name, "ggml_backend_cpu_numa_init") == 0) { + return (void *)ggml_numa_init; + } + if (strcmp(name, "ggml_backend_cpu_is_numa") == 0) { + return (void *)ggml_is_numa; + } + + // threadpool - TODO: move to ggml-base + if (strcmp(name, "ggml_threadpool_new") == 0) { + return (void *)ggml_threadpool_new; + } + if (strcmp(name, "ggml_threadpool_free") == 0) { + return (void *)ggml_threadpool_free; + } + if (strcmp(name, "ggml_backend_cpu_set_threadpool") == 0) { + return (void *)ggml_backend_cpu_set_threadpool; + } return NULL; @@ -655,9 +678,12 @@ ggml_backend_reg_t ggml_backend_cpu_reg(void) { ggml_cpu_init(); static struct ggml_backend_reg ggml_backend_cpu_reg = { - /* .iface = */ ggml_backend_cpu_reg_i, - /* .context = */ NULL, + /* .api_version = */ GGML_BACKEND_API_VERSION, + /* .iface = */ ggml_backend_cpu_reg_i, + /* .context = */ NULL, }; return &ggml_backend_cpu_reg; } + +GGML_BACKEND_DL_IMPL(ggml_backend_cpu_reg) diff --git a/ggml/src/ggml-cuda/CMakeLists.txt b/ggml/src/ggml-cuda/CMakeLists.txt index e1482a269d698..b0cb93e070fd3 100644 --- a/ggml/src/ggml-cuda/CMakeLists.txt +++ b/ggml/src/ggml-cuda/CMakeLists.txt @@ -46,13 +46,10 @@ if (CUDAToolkit_FOUND) list(APPEND GGML_SOURCES_CUDA ${SRCS}) endif() - add_library(ggml-cuda - ${GGML_HEADERS_CUDA} - ${GGML_SOURCES_CUDA} - ) - - target_link_libraries(ggml-cuda PRIVATE ggml-base) - target_include_directories(ggml-cuda PRIVATE . ..) + ggml_add_backend_library(ggml-cuda + ${GGML_HEADERS_CUDA} + ${GGML_SOURCES_CUDA} + ) add_compile_definitions(GGML_CUDA_PEER_MAX_BATCH_SIZE=${GGML_CUDA_PEER_MAX_BATCH_SIZE}) diff --git a/ggml/src/ggml-cuda/ggml-cuda.cu b/ggml/src/ggml-cuda/ggml-cuda.cu index dd94ab03d5b6c..2a78a4393d0f7 100644 --- a/ggml/src/ggml-cuda/ggml-cuda.cu +++ b/ggml/src/ggml-cuda/ggml-cuda.cu @@ -3126,6 +3126,61 @@ static ggml_backend_dev_t ggml_backend_cuda_reg_get_device(ggml_backend_reg_t re return ctx->devices[index]; } +static ggml_backend_feature * ggml_backend_cuda_get_features(ggml_backend_reg_t reg) { + static std::vector features = []() { + std::vector features; + #define _STRINGIFY(...) #__VA_ARGS__ + #define STRINGIFY(...) _STRINGIFY(__VA_ARGS__) + + #ifdef __CUDA_ARCH_LIST__ + features.push_back({ "ARCHS", STRINGIFY(__CUDA_ARCH_LIST__) }); + #endif + + #ifdef GGML_CUDA_FORCE_MMQ + features.push_back({ "FORCE_MMQ", "1" }); + #endif + + #ifdef GGML_CUDA_FORCE_CUBLAS + features.push_back({ "FORCE_CUBLAS", "1" }); + #endif + + #ifdef GGML_CUDA_NO_VMM + features.push_back({ "NO_VMM", "1" }); + #endif + + #ifdef GGML_CUDA_NO_PEER_COPY + features.push_back({ "NO_PEER_COPY", "1" }); + #endif + + #ifdef GGML_CUDA_F16 + features.push_back({ "F16", "1" }); + #endif + + #ifdef GGML_CUDA_USE_GRAPHS + features.push_back({ "USE_GRAPHS", "1" }); + #endif + + #ifdef GGML_CUDA_PEER_MAX_BATCH_SIZE + features.push_back({ "PEER_MAX_BATCH_SIZE", STRINGIFY(GGML_CUDA_PEER_MAX_BATCH_SIZE) }); + #endif + + #ifdef GGML_CUDA_FA_ALL_QUANTS + features.push_back({ "FA_ALL_QUANTS", "1" }); + #endif + + #undef _STRINGIFY + #undef STRINGIFY + + features.push_back({ nullptr, nullptr }); + + return features; + }(); + + return features.data(); + + GGML_UNUSED(reg); +} + static void * ggml_backend_cuda_reg_get_proc_address(ggml_backend_reg_t reg, const char * name) { GGML_UNUSED(reg); if (strcmp(name, "ggml_backend_split_buffer_type") == 0) { @@ -3137,6 +3192,9 @@ static void * ggml_backend_cuda_reg_get_proc_address(ggml_backend_reg_t reg, con if (strcmp(name, "ggml_backend_unregister_host_buffer") == 0) { return (void *)ggml_backend_cuda_unregister_host_buffer; } + if (strcmp(name, "ggml_backend_get_features") == 0) { + return (void *)ggml_backend_cuda_get_features; + } return nullptr; } @@ -3169,16 +3227,17 @@ ggml_backend_reg_t ggml_backend_cuda_reg() { dev_ctx->description = prop.name; ggml_backend_dev_t dev = new ggml_backend_device { - /* .interface = */ ggml_backend_cuda_device_interface, - /* .reg = */ ®, - /* .context = */ dev_ctx + /* .iface = */ ggml_backend_cuda_device_interface, + /* .reg = */ ®, + /* .context = */ dev_ctx }; ctx->devices.push_back(dev); } reg = ggml_backend_reg { - /* .interface = */ ggml_backend_cuda_reg_interface, - /* .context = */ ctx + /* .api_version = */ GGML_BACKEND_API_VERSION, + /* .iface = */ ggml_backend_cuda_reg_interface, + /* .context = */ ctx }; } @@ -3209,3 +3268,5 @@ ggml_backend_t ggml_backend_cuda_init(int device) { return cuda_backend; } + +GGML_BACKEND_DL_IMPL(ggml_backend_cuda_reg) diff --git a/ggml/src/ggml-hip/CMakeLists.txt b/ggml/src/ggml-hip/CMakeLists.txt index fccf8eb8440b8..b15fbd24d6b36 100644 --- a/ggml/src/ggml-hip/CMakeLists.txt +++ b/ggml/src/ggml-hip/CMakeLists.txt @@ -64,12 +64,10 @@ else() list(APPEND GGML_SOURCES_ROCM ${SRCS}) endif() -add_library(ggml-hip - ${GGML_HEADERS_ROCM} - ${GGML_SOURCES_ROCM}) - -target_link_libraries(ggml-hip PRIVATE ggml-base) -target_include_directories(ggml-hip PRIVATE . ..) +ggml_add_backend_library(ggml-hip + ${GGML_HEADERS_ROCM} + ${GGML_SOURCES_ROCM} + ) # TODO: do not use CUDA definitions for HIP target_compile_definitions(ggml PUBLIC GGML_USE_CUDA) diff --git a/ggml/src/ggml-kompute/CMakeLists.txt b/ggml/src/ggml-kompute/CMakeLists.txt index 0bd027c7f537e..dc623926c7685 100644 --- a/ggml/src/ggml-kompute/CMakeLists.txt +++ b/ggml/src/ggml-kompute/CMakeLists.txt @@ -6,13 +6,13 @@ if (NOT glslc_executable) message(FATAL_ERROR "glslc not found") endif() -add_library(ggml-kompute - ggml-kompute.cpp - ../../include/ggml-kompute.h - ) +ggml_add_backend_library(ggml-kompute + ggml-kompute.cpp + ../../include/ggml-kompute.h + ) target_link_libraries(ggml-kompute PRIVATE ggml-base kompute) -target_include_directories(ggml-kompute PRIVATE . .. ${CMAKE_CURRENT_BINARY_DIR}) +target_include_directories(ggml-kompute PRIVATE ${CMAKE_CURRENT_BINARY_DIR}) add_compile_definitions(VULKAN_HPP_DISPATCH_LOADER_DYNAMIC=1) diff --git a/ggml/src/ggml-kompute/ggml-kompute.cpp b/ggml/src/ggml-kompute/ggml-kompute.cpp index 2fea9e4cc8d38..24566404ded0f 100644 --- a/ggml/src/ggml-kompute/ggml-kompute.cpp +++ b/ggml/src/ggml-kompute/ggml-kompute.cpp @@ -2176,9 +2176,12 @@ static const struct ggml_backend_reg_i ggml_backend_kompute_reg_i = { ggml_backend_reg_t ggml_backend_kompute_reg() { static ggml_backend_reg reg = { - /* .iface = */ ggml_backend_kompute_reg_i, - /* .context = */ nullptr, + /* .api_version = */ GGML_BACKEND_API_VERSION, + /* .iface = */ ggml_backend_kompute_reg_i, + /* .context = */ nullptr, }; return ® } + +GGML_BACKEND_DL_IMPL(ggml_backend_kompute_reg) diff --git a/ggml/src/ggml-metal/CMakeLists.txt b/ggml/src/ggml-metal/CMakeLists.txt index b237d79f47ddb..1bad272068244 100644 --- a/ggml/src/ggml-metal/CMakeLists.txt +++ b/ggml/src/ggml-metal/CMakeLists.txt @@ -4,19 +4,16 @@ find_library(METALKIT_FRAMEWORK MetalKit REQUIRED) message(STATUS "Metal framework found") -add_library(ggml-metal - ggml-metal.m - ) +ggml_add_backend_library(ggml-metal + ggml-metal.m + ) target_link_libraries(ggml-metal PRIVATE - ggml-base ${FOUNDATION_LIBRARY} ${METAL_FRAMEWORK} ${METALKIT_FRAMEWORK} ) -target_include_directories(ggml-metal PRIVATE . ..) - if (GGML_METAL_NDEBUG) add_compile_definitions(GGML_METAL_NDEBUG) endif() diff --git a/ggml/src/ggml-metal/ggml-metal.m b/ggml/src/ggml-metal/ggml-metal.m index 3a533d7f9c9af..63baaf163581c 100644 --- a/ggml/src/ggml-metal/ggml-metal.m +++ b/ggml/src/ggml-metal/ggml-metal.m @@ -4372,19 +4372,45 @@ static ggml_backend_dev_t ggml_backend_metal_reg_device_get(ggml_backend_reg_t r GGML_UNUSED(index); } +static struct ggml_backend_feature g_ggml_backend_metal_features[] = { +#if defined(GGML_METAL_EMBED_LIBRARY) + { "EMBED_LIBRARY", "1" }, +#endif +#if defined(GGML_METAL_USE_BF16) + { "BF16", "1" }, +#endif + { nil, nil }, +}; + +static struct ggml_backend_feature * ggml_backend_metal_get_features(ggml_backend_reg_t reg) { + return g_ggml_backend_metal_features; + + GGML_UNUSED(reg); +} + +static void * ggml_backend_metal_get_proc_address(ggml_backend_reg_t reg, const char * name) { + if (strcmp(name, "ggml_backend_get_features") == 0) { + return (void *)ggml_backend_metal_get_features; + } + + return NULL; + + GGML_UNUSED(reg); +} static struct ggml_backend_reg_i ggml_backend_metal_reg_i = { /* .get_name = */ ggml_backend_metal_reg_get_name, /* .device_count = */ ggml_backend_metal_reg_device_count, /* .device_get = */ ggml_backend_metal_reg_device_get, - /* .get_proc_address = */ NULL, + /* .get_proc_address = */ ggml_backend_metal_get_proc_address, }; ggml_backend_reg_t ggml_backend_metal_reg(void) { // TODO: make this thread-safe somehow? { g_ggml_backend_metal_reg = (struct ggml_backend_reg) { - /* .iface = */ ggml_backend_metal_reg_i, - /* .context = */ NULL, + /* .api_version = */ GGML_BACKEND_API_VERSION, + /* .iface = */ ggml_backend_metal_reg_i, + /* .context = */ NULL, }; g_ggml_backend_metal_device = (struct ggml_backend_device) { @@ -4396,3 +4422,5 @@ ggml_backend_reg_t ggml_backend_metal_reg(void) { return &g_ggml_backend_metal_reg; } + +GGML_BACKEND_DL_IMPL(ggml_backend_metal_reg) diff --git a/ggml/src/ggml-musa/CMakeLists.txt b/ggml/src/ggml-musa/CMakeLists.txt index f3c0136920540..e1a69186e669f 100644 --- a/ggml/src/ggml-musa/CMakeLists.txt +++ b/ggml/src/ggml-musa/CMakeLists.txt @@ -47,12 +47,10 @@ if (MUSAToolkit_FOUND) set_property(SOURCE ${SOURCE} PROPERTY COMPILE_FLAGS "-x musa -mtgpu --cuda-gpu-arch=mp_21 --cuda-gpu-arch=mp_22") endforeach() - add_library(ggml-musa - ${GGML_HEADERS_MUSA} - ${GGML_SOURCES_MUSA}) - - target_link_libraries(ggml-musa PRIVATE ggml-base) - target_include_directories(ggml-musa PRIVATE . ..) + ggml_add_backend_library(ggml-musa + ${GGML_HEADERS_MUSA} + ${GGML_SOURCES_MUSA} + ) # TODO: do not use CUDA definitions for MUSA target_compile_definitions(ggml PUBLIC GGML_USE_CUDA) diff --git a/ggml/src/ggml-rpc/CMakeLists.txt b/ggml/src/ggml-rpc/CMakeLists.txt index a2d6770eb053f..f5acb8ec2cb28 100644 --- a/ggml/src/ggml-rpc/CMakeLists.txt +++ b/ggml/src/ggml-rpc/CMakeLists.txt @@ -1,10 +1,8 @@ message(STATUS "Using RPC backend") -add_library(ggml-rpc - ggml-rpc.cpp) - -target_link_libraries(ggml-rpc PRIVATE ggml-base) -target_include_directories(ggml-rpc PRIVATE . ..) +ggml_add_backend_library(ggml-rpc + ggml-rpc.cpp + ) if (WIN32) target_link_libraries(ggml-rpc PRIVATE ws2_32) diff --git a/ggml/src/ggml-rpc/ggml-rpc.cpp b/ggml/src/ggml-rpc/ggml-rpc.cpp index 47357daabdf54..43108242639a3 100644 --- a/ggml/src/ggml-rpc/ggml-rpc.cpp +++ b/ggml/src/ggml-rpc/ggml-rpc.cpp @@ -1369,8 +1369,9 @@ static const struct ggml_backend_reg_i ggml_backend_rpc_reg_i = { ggml_backend_reg_t ggml_backend_rpc_reg(void) { static struct ggml_backend_reg ggml_backend_rpc_reg = { - /* .iface = */ ggml_backend_rpc_reg_i, - /* .context = */ NULL, + /* .api_version = */ GGML_BACKEND_API_VERSION, + /* .iface = */ ggml_backend_rpc_reg_i, + /* .context = */ NULL, }; return &ggml_backend_rpc_reg; @@ -1401,3 +1402,5 @@ ggml_backend_dev_t ggml_backend_rpc_add_device(const char * endpoint) { return dev; } + +GGML_BACKEND_DL_IMPL(ggml_backend_rpc_reg) diff --git a/ggml/src/ggml-sycl/CMakeLists.txt b/ggml/src/ggml-sycl/CMakeLists.txt index d1d0ff83d636c..83f223fd7b6fc 100644 --- a/ggml/src/ggml-sycl/CMakeLists.txt +++ b/ggml/src/ggml-sycl/CMakeLists.txt @@ -16,12 +16,10 @@ endif() message(STATUS "SYCL found") #todo: AOT -add_library(ggml-sycl - ggml-sycl.cpp - ../../include/ggml-sycl.h) - -target_link_libraries(ggml-sycl PRIVATE ggml-base) -target_include_directories(ggml-sycl PRIVATE . ..) +ggml_add_backend_library(ggml-sycl + ggml-sycl.cpp + ../../include/ggml-sycl.h + ) if (GGML_SYCL_F16) if (GGML_SYCL_TARGET STREQUAL "AMD") diff --git a/ggml/src/ggml-sycl/ggml-sycl.cpp b/ggml/src/ggml-sycl/ggml-sycl.cpp index 6169dac32bcca..285b5e5a1ce31 100644 --- a/ggml/src/ggml-sycl/ggml-sycl.cpp +++ b/ggml/src/ggml-sycl/ggml-sycl.cpp @@ -4555,16 +4555,17 @@ ggml_backend_reg_t ggml_backend_sycl_reg() { dev_ctx->description = prop.get_name(); ggml_backend_dev_t dev = new ggml_backend_device { - /* .interface = */ ggml_backend_sycl_device_interface, - /* .reg = */ ®, - /* .context = */ dev_ctx + /* .iface = */ ggml_backend_sycl_device_interface, + /* .reg = */ ®, + /* .context = */ dev_ctx }; ctx->devices.push_back(dev); } reg = ggml_backend_reg { - /* .interface = */ ggml_backend_sycl_reg_interface, - /* .context = */ ctx + /* .api_version = */ GGML_BACKEND_API_VERSION, + /* .iface = */ ggml_backend_sycl_reg_interface, + /* .context = */ ctx }; } @@ -4599,3 +4600,4 @@ GGML_BACKEND_API void ggml_backend_sycl_set_single_device_mode(int main_gpu_id) fprintf(stderr, "ggml_backend_sycl_set_single_device: use single device: [%d]\n", main_gpu_id); ggml_sycl_info(main_gpu_id); } +GGML_BACKEND_DL_IMPL(ggml_backend_sycl_reg) diff --git a/ggml/src/ggml-vulkan/CMakeLists.txt b/ggml/src/ggml-vulkan/CMakeLists.txt index 1e85dd15b7ab1..ae0485e04255d 100644 --- a/ggml/src/ggml-vulkan/CMakeLists.txt +++ b/ggml/src/ggml-vulkan/CMakeLists.txt @@ -3,13 +3,13 @@ find_package(Vulkan COMPONENTS glslc REQUIRED) if (Vulkan_FOUND) message(STATUS "Vulkan found") - add_library(ggml-vulkan - ggml-vulkan.cpp - ../../include/ggml-vulkan.h - ) + ggml_add_backend_library(ggml-vulkan + ggml-vulkan.cpp + ../../include/ggml-vulkan.h + ) - target_link_libraries(ggml-vulkan PRIVATE ggml-base Vulkan::Vulkan) - target_include_directories(ggml-vulkan PRIVATE . .. ${CMAKE_CURRENT_BINARY_DIR}) + target_link_libraries(ggml-vulkan PRIVATE Vulkan::Vulkan) + target_include_directories(ggml-vulkan PRIVATE ${CMAKE_CURRENT_BINARY_DIR}) # Workaround to the "can't dereference invalidated vector iterator" bug in clang-cl debug build # Posssibly relevant: https://stackoverflow.com/questions/74748276/visual-studio-no-displays-the-correct-length-of-stdvector diff --git a/ggml/src/ggml-vulkan/ggml-vulkan.cpp b/ggml/src/ggml-vulkan/ggml-vulkan.cpp index ca71da2f7b7f5..49527fdf40e94 100644 --- a/ggml/src/ggml-vulkan/ggml-vulkan.cpp +++ b/ggml/src/ggml-vulkan/ggml-vulkan.cpp @@ -6738,8 +6738,9 @@ static const struct ggml_backend_reg_i ggml_backend_vk_reg_i = { ggml_backend_reg_t ggml_backend_vk_reg() { static ggml_backend_reg reg = { - /* .iface = */ ggml_backend_vk_reg_i, - /* .context = */ nullptr, + /* .api_version = */ GGML_BACKEND_API_VERSION, + /* .iface = */ ggml_backend_vk_reg_i, + /* .context = */ nullptr, }; return ® @@ -7365,3 +7366,5 @@ static void ggml_vk_check_results_1(ggml_tensor * tensor) { VK_LOG_DEBUG("END ggml_vk_check_results_1(" << tensor->name << ")"); } #endif + +GGML_BACKEND_DL_IMPL(ggml_backend_vk_reg) diff --git a/ggml/src/ggml.c b/ggml/src/ggml.c index 545009438a83f..4b3b847b01499 100644 --- a/ggml/src/ggml.c +++ b/ggml/src/ggml.c @@ -7571,3 +7571,26 @@ void ggml_log_set(ggml_log_callback log_callback, void * user_data) { g_logger_state.log_callback = log_callback ? log_callback : ggml_log_callback_default; g_logger_state.log_callback_user_data = user_data; } + +void ggml_threadpool_params_init(struct ggml_threadpool_params * p, int n_threads) { + p->n_threads = n_threads; + p->prio = 0; // default priority (usually means normal or inherited) + p->poll = 50; // hybrid-polling enabled + p->strict_cpu = false; // no strict placement (all threads share same cpumask) + p->paused = false; // threads are ready to go + memset(p->cpumask, 0, GGML_MAX_N_THREADS); // all-zero means use the default affinity (usually inherited) +} + +struct ggml_threadpool_params ggml_threadpool_params_default(int n_threads) { + struct ggml_threadpool_params p; + ggml_threadpool_params_init(&p, n_threads); + return p; +} + +bool ggml_threadpool_params_match(const struct ggml_threadpool_params * p0, const struct ggml_threadpool_params * p1) { + if (p0->n_threads != p1->n_threads ) return false; + if (p0->prio != p1->prio ) return false; + if (p0->poll != p1->poll ) return false; + if (p0->strict_cpu != p1->strict_cpu ) return false; + return memcmp(p0->cpumask, p1->cpumask, GGML_MAX_N_THREADS) == 0; +} diff --git a/pocs/CMakeLists.txt b/pocs/CMakeLists.txt index 03e1d2c04be65..d49d14dee4351 100644 --- a/pocs/CMakeLists.txt +++ b/pocs/CMakeLists.txt @@ -8,5 +8,7 @@ include_directories(${CMAKE_CURRENT_SOURCE_DIR}) if (EMSCRIPTEN) else() - add_subdirectory(vdot) + if (NOT GGML_BACKEND_DL) + add_subdirectory(vdot) + endif() endif() diff --git a/src/llama.cpp b/src/llama.cpp index 11f3c22b73dcc..b10d1d73eb0c6 100644 --- a/src/llama.cpp +++ b/src/llama.cpp @@ -4870,7 +4870,9 @@ struct llama_model_loader { mappings.reserve(files.size()); mmaps_used.reserve(files.size()); for (const auto & file : files) { - std::unique_ptr mapping(new llama_mmap(file.get(), prefetch ? -1 : 0, ggml_is_numa())); + auto * reg = ggml_backend_dev_backend_reg(ggml_backend_dev_by_type(GGML_BACKEND_DEVICE_TYPE_CPU)); + auto * is_numa_fn = (decltype(ggml_is_numa) *) ggml_backend_reg_get_proc_address(reg, "ggml_backend_cpu_is_numa"); + std::unique_ptr mapping(new llama_mmap(file.get(), prefetch ? -1 : 0, is_numa_fn())); mmaps_used.emplace_back(mapping->size, 0); if (mlock_mmaps) { std::unique_ptr mlock_mmap(new llama_mlock()); @@ -9199,7 +9201,7 @@ static bool llm_load_tensors( ggml_backend_dev_t dev = ggml_backend_buft_get_device(buft); if (!dev) { // FIXME: workaround for CPU backend buft having a NULL device - dev = ggml_backend_reg_dev_get(ggml_backend_cpu_reg(), 0); + dev = ggml_backend_dev_by_type(GGML_BACKEND_DEVICE_TYPE_CPU); } ggml_backend_dev_props props; ggml_backend_dev_get_props(dev, &props); @@ -17452,8 +17454,9 @@ static enum ggml_status llama_graph_compute( int n_threads, ggml_threadpool * threadpool) { if (lctx.backend_cpu != nullptr) { - ggml_backend_cpu_set_threadpool(lctx.backend_cpu, threadpool); - ggml_backend_cpu_set_abort_callback(lctx.backend_cpu, lctx.abort_callback, lctx.abort_callback_data); + auto * reg = ggml_backend_dev_backend_reg(ggml_backend_get_device(lctx.backend_cpu)); + auto * set_threadpool_fn = (decltype(ggml_backend_cpu_set_threadpool) *) ggml_backend_reg_get_proc_address(reg, "ggml_backend_cpu_set_threadpool"); + set_threadpool_fn(lctx.backend_cpu, threadpool); } // set the number of threads for all the backends @@ -19487,7 +19490,11 @@ void llama_backend_init(void) { void llama_numa_init(enum ggml_numa_strategy numa) { if (numa != GGML_NUMA_STRATEGY_DISABLED) { - ggml_numa_init(numa); + auto * dev = ggml_backend_dev_by_type(GGML_BACKEND_DEVICE_TYPE_CPU); + GGML_ASSERT(dev && "CPU backend is not loaded"); + auto * reg = ggml_backend_dev_backend_reg(dev); + auto * numa_init_fn = (decltype(ggml_numa_init) *) ggml_backend_reg_get_proc_address(reg, "ggml_backend_cpu_numa_init"); + numa_init_fn(numa); } } @@ -19761,9 +19768,6 @@ struct llama_context * llama_new_context_with_model( __func__, n_ctx_per_seq, hparams.n_ctx_train); } - ctx->abort_callback = params.abort_callback; - ctx->abort_callback_data = params.abort_callback_data; - ctx->logits_all = params.logits_all; // build worst-case graph for encoder if a model contains encoder @@ -19812,7 +19816,7 @@ struct llama_context * llama_new_context_with_model( } // add CPU backend - ctx->backend_cpu = ggml_backend_cpu_init(); + ctx->backend_cpu = ggml_backend_init_by_type(GGML_BACKEND_DEVICE_TYPE_CPU, nullptr); if (ctx->backend_cpu == nullptr) { LLAMA_LOG_ERROR("%s: failed to initialize CPU backend\n", __func__); llama_free(ctx); @@ -19832,6 +19836,8 @@ struct llama_context * llama_new_context_with_model( } } + llama_set_abort_callback(ctx, params.abort_callback, params.abort_callback_data); + if (!llama_kv_cache_init(ctx->kv_self, ctx, type_k, type_v, kv_size, cparams.offload_kqv)) { LLAMA_LOG_ERROR("%s: llama_kv_cache_init() failed for self-attention cache\n", __func__); llama_free(ctx); @@ -19877,7 +19883,8 @@ struct llama_context * llama_new_context_with_model( std::vector backend_ptrs; for (auto & backend : ctx->backends) { auto * buft = ggml_backend_get_default_buffer_type(backend.get()); - if (ggml_backend_is_cpu(backend.get()) && !model->devices.empty()) { + auto backend_type = ggml_backend_dev_type(ggml_backend_get_device(backend.get())); + if (backend_type == GGML_BACKEND_DEVICE_TYPE_CPU && !model->devices.empty()) { // use the host buffer of the first device CPU for faster transfer of the intermediate state auto * dev = model->devices[0]; auto * host_buft = ggml_backend_dev_host_buffer_type(dev); @@ -19905,7 +19912,8 @@ struct llama_context * llama_new_context_with_model( // pipeline parallelism requires support for async compute and events in all devices if (pipeline_parallel) { for (auto & backend : ctx->backends) { - if (ggml_backend_is_cpu(backend.get())) { + auto dev_type = ggml_backend_dev_type(ggml_backend_get_device(backend.get())); + if (dev_type == GGML_BACKEND_DEVICE_TYPE_CPU) { // ignore CPU backend continue; } @@ -21459,6 +21467,14 @@ int32_t llama_n_threads_batch(struct llama_context * ctx) { void llama_set_abort_callback(struct llama_context * ctx, bool (*abort_callback)(void * data), void * abort_callback_data) { ctx->abort_callback = abort_callback; ctx->abort_callback_data = abort_callback_data; + + for (auto & backend : ctx->backends) { + auto * reg = ggml_backend_dev_backend_reg(ggml_backend_get_device(backend.get())); + auto * set_abort_callback_fn = (ggml_backend_set_abort_callback_t) ggml_backend_reg_get_proc_address(reg, "ggml_backend_set_abort_callback"); + if (set_abort_callback_fn) { + set_abort_callback_fn(backend.get(), ctx->abort_callback, ctx->abort_callback_data); + } + } } void llama_set_embeddings(struct llama_context * ctx, bool embeddings) { @@ -22200,32 +22216,23 @@ int llama_split_prefix(char * dest, size_t maxlen, const char * split_path, int } const char * llama_print_system_info(void) { - ggml_cpu_init(); // some ARM features are detected at runtime - static std::string s; - s = ""; - s += "AVX = " + std::to_string(ggml_cpu_has_avx()) + " | "; - s += "AVX_VNNI = " + std::to_string(ggml_cpu_has_avx_vnni()) + " | "; - s += "AVX2 = " + std::to_string(ggml_cpu_has_avx2()) + " | "; - s += "AVX512 = " + std::to_string(ggml_cpu_has_avx512()) + " | "; - s += "AVX512_VBMI = " + std::to_string(ggml_cpu_has_avx512_vbmi()) + " | "; - s += "AVX512_VNNI = " + std::to_string(ggml_cpu_has_avx512_vnni()) + " | "; - s += "AVX512_BF16 = " + std::to_string(ggml_cpu_has_avx512_bf16()) + " | "; - s += "AMX_INT8 = " + std::to_string(ggml_cpu_has_amx_int8()) + " | "; - s += "FMA = " + std::to_string(ggml_cpu_has_fma()) + " | "; - s += "NEON = " + std::to_string(ggml_cpu_has_neon()) + " | "; - s += "SVE = " + std::to_string(ggml_cpu_has_sve()) + " | "; - s += "ARM_FMA = " + std::to_string(ggml_cpu_has_arm_fma()) + " | "; - s += "F16C = " + std::to_string(ggml_cpu_has_f16c()) + " | "; - s += "FP16_VA = " + std::to_string(ggml_cpu_has_fp16_va()) + " | "; - s += "RISCV_VECT = " + std::to_string(ggml_cpu_has_riscv_v()) + " | "; - s += "WASM_SIMD = " + std::to_string(ggml_cpu_has_wasm_simd()) + " | "; - s += "SSE3 = " + std::to_string(ggml_cpu_has_sse3()) + " | "; - s += "SSSE3 = " + std::to_string(ggml_cpu_has_ssse3()) + " | "; - s += "VSX = " + std::to_string(ggml_cpu_has_vsx()) + " | "; - s += "MATMUL_INT8 = " + std::to_string(ggml_cpu_has_matmul_int8()) + " | "; - s += "LLAMAFILE = " + std::to_string(ggml_cpu_has_llamafile()) + " | "; + for (size_t i = 0; i < ggml_backend_reg_count(); i++) { + auto * reg = ggml_backend_reg_get(i); + auto * get_features_fn = (ggml_backend_get_features_t) ggml_backend_reg_get_proc_address(reg, "ggml_backend_get_features"); + if (get_features_fn) { + ggml_backend_feature * features = get_features_fn(reg); + s += ggml_backend_reg_name(reg); + s += " : "; + for (; features->name; features++) { + s += features->name; + s += " = "; + s += features->value; + s += " | "; + } + } + } return s.c_str(); } diff --git a/tests/CMakeLists.txt b/tests/CMakeLists.txt index b06f122e89873..82373ff4e1862 100644 --- a/tests/CMakeLists.txt +++ b/tests/CMakeLists.txt @@ -110,23 +110,26 @@ llama_test(test-tokenizer-1-spm NAME test-tokenizer-1-llama-spm ARGS ${CMAKE_CU # llama_target_and_test(test-double-float.cpp) # SLOW llama_target_and_test(test-log.cpp) llama_target_and_test(test-arg-parser.cpp) -llama_target_and_test(test-quantize-fns.cpp) -llama_target_and_test(test-quantize-perf.cpp) llama_target_and_test(test-sampling.cpp) llama_target_and_test(test-chat-template.cpp) llama_target_and_test(test-grammar-parser.cpp) llama_target_and_test(test-grammar-integration.cpp) llama_target_and_test(test-llama-grammar.cpp) -llama_target_and_test(test-barrier.cpp) # llama_target_and_test(test-opt.cpp) # SLOW llama_target_and_test(test-backend-ops.cpp) -llama_target_and_test(test-rope.cpp) - llama_target_and_test(test-model-load-cancel.cpp LABEL "model") llama_target_and_test(test-autorelease.cpp LABEL "model") +if (NOT GGML_BACKEND_DL) + # these tests use the backends directly and cannot be built with dynamic loading + llama_target_and_test(test-barrier.cpp) + llama_target_and_test(test-quantize-fns.cpp) + llama_target_and_test(test-quantize-perf.cpp) + llama_target_and_test(test-rope.cpp) +endif() + # TODO: disabled on loongarch64 because the ggml-ci node lacks Python 3.8 if (NOT ${CMAKE_SYSTEM_PROCESSOR} MATCHES "loongarch64") llama_target_and_test(test-json-schema-to-grammar.cpp WORKING_DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR}/..) diff --git a/tests/test-backend-ops.cpp b/tests/test-backend-ops.cpp index b2b5705243ea2..6376b0e4c66cf 100644 --- a/tests/test-backend-ops.cpp +++ b/tests/test-backend-ops.cpp @@ -16,7 +16,6 @@ #include -#include #include #include @@ -26,7 +25,6 @@ #include #include #include -#include #include #include #include @@ -639,19 +637,20 @@ struct test_case { // determine number of runs int n_runs; + bool is_cpu = ggml_backend_dev_type(ggml_backend_get_device(backend)) == GGML_BACKEND_DEVICE_TYPE_CPU; if (op_flops(out) > 0) { // based on flops const uint64_t GFLOP = 1000 * 1000 * 1000; const uint64_t target_flops_cpu = 8ULL * GFLOP; const uint64_t target_flops_gpu = 100ULL * GFLOP; - uint64_t target_flops = ggml_backend_is_cpu(backend) ? target_flops_cpu : target_flops_gpu; + uint64_t target_flops = is_cpu ? target_flops_cpu : target_flops_gpu; n_runs = std::min(ggml_graph_size(gf) - ggml_graph_n_nodes(gf), target_flops / op_flops(out)) + 1; } else { // based on memory size const size_t GB = 1ULL << 30; const size_t target_size_cpu = 8 * GB; const size_t target_size_gpu = 32 * GB; - size_t target_size = ggml_backend_is_cpu(backend) ? target_size_cpu : target_size_gpu; + size_t target_size = is_cpu ? target_size_cpu : target_size_gpu; n_runs = std::min(ggml_graph_size(gf) - ggml_graph_n_nodes(gf), target_size / op_size(out)) + 1; } @@ -3873,7 +3872,11 @@ static std::vector> make_test_cases_perf() { static bool test_backend(ggml_backend_t backend, test_mode mode, const char * op_name) { if (mode == MODE_TEST) { auto test_cases = make_test_cases_eval(); - ggml_backend_t backend_cpu = ggml_backend_cpu_init(); + ggml_backend_t backend_cpu = ggml_backend_init_by_type(GGML_BACKEND_DEVICE_TYPE_CPU, NULL); + if (backend_cpu == NULL) { + printf(" Failed to initialize CPU backend\n"); + return false; + } size_t n_ok = 0; for (auto & test : test_cases) { @@ -3953,7 +3956,9 @@ int main(int argc, char ** argv) { } } - // enumerate backends + // load and enumerate backends + ggml_backend_load_all(); + printf("Testing %zu devices\n\n", ggml_backend_dev_count()); size_t n_ok = 0; @@ -3969,16 +3974,15 @@ int main(int argc, char ** argv) { continue; } - ggml_backend_t backend = ggml_backend_dev_init(dev, NULL); - GGML_ASSERT(backend != NULL); - - if (backend_filter == NULL && ggml_backend_is_cpu(backend) && mode != MODE_GRAD) { + if (backend_filter == NULL && ggml_backend_dev_type(dev) == GGML_BACKEND_DEVICE_TYPE_CPU && mode != MODE_GRAD) { printf(" Skipping CPU backend\n"); - ggml_backend_free(backend); n_ok++; continue; } + ggml_backend_t backend = ggml_backend_dev_init(dev, NULL); + GGML_ASSERT(backend != NULL); + ggml_backend_reg_t reg = ggml_backend_dev_backend_reg(dev); auto ggml_backend_set_n_threads_fn = (ggml_backend_set_n_threads_t) ggml_backend_reg_get_proc_address(reg, "ggml_backend_set_n_threads"); if (ggml_backend_set_n_threads_fn) { From 1cb813bd328ef034f0993c5f941d73f26ba42e82 Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Mon, 25 Nov 2024 16:31:38 +0200 Subject: [PATCH 043/245] server : add speculative decoding support (#10455) * server : add speculative decoding support ggml-ci * server : add helper function slot.can_speculate() ggml-ci --- examples/server/server.cpp | 441 +++++++++++++++++++++++++------------ 1 file changed, 300 insertions(+), 141 deletions(-) diff --git a/examples/server/server.cpp b/examples/server/server.cpp index 6c55d65c01330..f9d20fee5a6f6 100644 --- a/examples/server/server.cpp +++ b/examples/server/server.cpp @@ -2,10 +2,11 @@ #include "arg.h" #include "common.h" -#include "log.h" -#include "sampling.h" #include "json-schema-to-grammar.h" #include "llama.h" +#include "log.h" +#include "sampling.h" +#include "speculative.h" // Change JSON_ASSERT from assert() to GGML_ASSERT: #define JSON_ASSERT GGML_ASSERT @@ -121,12 +122,21 @@ struct slot_params { int64_t t_max_predict_ms = -1; // if positive, limit the generation phase to this time limit std::vector antiprompt; + + struct common_params_sampling sampling; + struct common_params_speculative speculative; }; struct server_slot { int id; int id_task = -1; + llama_batch batch_spec; + + llama_context * ctx_dft = nullptr; + + common_speculative * spec = nullptr; + // the index relative to completion multi-task request size_t index = 0; @@ -175,7 +185,6 @@ struct server_slot { // sampling json json_schema; - struct common_params_sampling sparams; struct common_sampler * smpl = nullptr; llama_token sampled; @@ -212,7 +221,7 @@ struct server_slot { generated_token_probs.clear(); } - bool has_budget(common_params &global_params) { + bool has_budget(const common_params & global_params) { if (params.n_predict == -1 && global_params.n_predict == -1) { return true; // limitless } @@ -232,6 +241,10 @@ struct server_slot { return state != SLOT_STATE_IDLE; } + bool can_speculate() const { + return ctx_dft && params.speculative.n_max > 0 && params.cache_prompt; + } + void add_token(const completion_token_output & token) { if (!is_processing()) { SLT_WRN(*this, "%s", "slot is not processing\n"); @@ -591,11 +604,14 @@ struct server_response { }; struct server_context { + common_params params_base; + llama_model * model = nullptr; llama_context * ctx = nullptr; std::vector loras; - common_params params; + llama_model * model_dft = nullptr; + llama_context_params cparams_dft; llama_batch batch = {}; @@ -628,27 +644,41 @@ struct server_context { model = nullptr; } + if (model_dft) { + llama_free_model(model_dft); + model_dft = nullptr; + } + // Clear any sampling context for (server_slot & slot : slots) { - if (slot.smpl != nullptr) { - common_sampler_free(slot.smpl); - } + common_sampler_free(slot.smpl); + slot.smpl = nullptr; + + llama_free(slot.ctx_dft); + slot.ctx_dft = nullptr; + + common_speculative_free(slot.spec); + slot.spec = nullptr; + + llama_batch_free(slot.batch_spec); } llama_batch_free(batch); } - bool load_model(const common_params & params_) { - params = params_; + bool load_model(const common_params & params) { + SRV_INF("loading model '%s'\n", params.model.c_str()); - common_init_result llama_init = common_init_from_params(params); + params_base = params; + + common_init_result llama_init = common_init_from_params(params_base); model = llama_init.model; ctx = llama_init.context; loras = llama_init.lora_adapters; if (model == nullptr) { - SRV_ERR("failed to load model, '%s'\n", params.model.c_str()); + SRV_ERR("failed to load model, '%s'\n", params_base.model.c_str()); return false; } @@ -657,6 +687,40 @@ struct server_context { add_bos_token = llama_add_bos_token(model); has_eos_token = !llama_add_eos_token(model); + if (!params_base.speculative.model.empty()) { + SRV_INF("loading draft model '%s'\n", params_base.speculative.model.c_str()); + + auto params_dft = params_base; + + params_dft.model = params_base.speculative.model; + params_dft.n_ctx = params_base.speculative.n_ctx; + params_dft.n_gpu_layers = params_base.speculative.n_gpu_layers; + + common_init_result llama_init_dft = common_init_from_params(params_dft); + + model_dft = llama_init_dft.model; + + if (model_dft == nullptr) { + SRV_ERR("failed to load draft model, '%s'\n", params_base.speculative.model.c_str()); + return false; + } + + if (!common_speculative_are_compatible(ctx, llama_init_dft.context)) { + SRV_ERR("the draft model '%s' is not compatible with the target model '%s'\n", params_base.speculative.model.c_str(), params_base.model.c_str()); + + llama_free (llama_init_dft.context); + llama_free_model(llama_init_dft.model); + + return false; + } + + cparams_dft = common_context_params_to_llama(params_base); + cparams_dft.n_batch = llama_n_ctx(llama_init_dft.context); + + // the context is not needed - we will create one for each slot + llama_free(llama_init_dft.context); + } + return true; } @@ -674,20 +738,36 @@ struct server_context { } void init() { - const int32_t n_ctx_slot = n_ctx / params.n_parallel; + const int32_t n_ctx_slot = n_ctx / params_base.n_parallel; - SRV_INF("initializing slots, n_slots = %d\n", params.n_parallel); + SRV_INF("initializing slots, n_slots = %d\n", params_base.n_parallel); - for (int i = 0; i < params.n_parallel; i++) { + for (int i = 0; i < params_base.n_parallel; i++) { server_slot slot; slot.id = i; slot.n_ctx = n_ctx_slot; - slot.n_predict = params.n_predict; + slot.n_predict = params_base.n_predict; + + if (model_dft) { + slot.batch_spec = llama_batch_init(params_base.speculative.n_max + 1, 0, 1); + + slot.ctx_dft = llama_new_context_with_model(model_dft, cparams_dft); + if (slot.ctx_dft == nullptr) { + SRV_ERR("%s", "failed to create draft context\n"); + return; + } + + slot.spec = common_speculative_init(slot.ctx_dft); + if (slot.spec == nullptr) { + SRV_ERR("%s", "failed to create speculator\n"); + return; + } + } SLT_INF(slot, "new slot n_ctx_slot = %d\n", slot.n_ctx); - slot.sparams = params.sampling; + slot.params.sampling = params_base.sampling; slot.callback_on_release = [this](int) { queue_tasks.pop_deferred_task(); @@ -707,7 +787,7 @@ struct server_context { const int32_t n_batch = llama_n_batch(ctx); // only a single seq_id per token is needed - batch = llama_batch_init(std::max(n_batch, params.n_parallel), 0, 1); + batch = llama_batch_init(std::max(n_batch, params_base.n_parallel), 0, 1); } metrics.init(); @@ -786,9 +866,11 @@ struct server_context { } bool launch_slot_with_task(server_slot & slot, const server_task & task) { - slot_params default_params; // Sampling parameter defaults are loaded from the global server context (but individual requests can still override them) - auto default_sparams = params.sampling; + slot_params defaults; + defaults.sampling = params_base.sampling; + defaults.speculative = params_base.speculative; + const auto & data = task.data; if (data.count("__oaicompat") != 0) { @@ -799,42 +881,48 @@ struct server_context { slot.oaicompat_model = ""; } - slot.params.stream = json_value(data, "stream", false); - slot.params.cache_prompt = json_value(data, "cache_prompt", false); - slot.params.n_predict = json_value(data, "n_predict", json_value(data, "max_tokens", default_params.n_predict)); - slot.params.n_indent = json_value(data, "n_indent", default_params.n_indent); - slot.sparams.top_k = json_value(data, "top_k", default_sparams.top_k); - slot.sparams.top_p = json_value(data, "top_p", default_sparams.top_p); - slot.sparams.min_p = json_value(data, "min_p", default_sparams.min_p); - slot.sparams.xtc_probability = json_value(data, "xtc_probability", default_sparams.xtc_probability); - slot.sparams.xtc_threshold = json_value(data, "xtc_threshold", default_sparams.xtc_threshold); - slot.sparams.typ_p = json_value(data, "typical_p", default_sparams.typ_p); - slot.sparams.temp = json_value(data, "temperature", default_sparams.temp); - slot.sparams.dynatemp_range = json_value(data, "dynatemp_range", default_sparams.dynatemp_range); - slot.sparams.dynatemp_exponent = json_value(data, "dynatemp_exponent", default_sparams.dynatemp_exponent); - slot.sparams.penalty_last_n = json_value(data, "repeat_last_n", default_sparams.penalty_last_n); - slot.sparams.penalty_repeat = json_value(data, "repeat_penalty", default_sparams.penalty_repeat); - slot.sparams.penalty_freq = json_value(data, "frequency_penalty", default_sparams.penalty_freq); - slot.sparams.penalty_present = json_value(data, "presence_penalty", default_sparams.penalty_present); - slot.sparams.dry_multiplier = json_value(data, "dry_multiplier", default_sparams.dry_multiplier); - slot.sparams.dry_base = json_value(data, "dry_base", default_sparams.dry_base); - slot.sparams.dry_allowed_length = json_value(data, "dry_allowed_length", default_sparams.dry_allowed_length); - slot.sparams.dry_penalty_last_n = json_value(data, "dry_penalty_last_n", default_sparams.dry_penalty_last_n); - slot.sparams.mirostat = json_value(data, "mirostat", default_sparams.mirostat); - slot.sparams.mirostat_tau = json_value(data, "mirostat_tau", default_sparams.mirostat_tau); - slot.sparams.mirostat_eta = json_value(data, "mirostat_eta", default_sparams.mirostat_eta); - slot.sparams.penalize_nl = json_value(data, "penalize_nl", default_sparams.penalize_nl); - slot.params.n_keep = json_value(data, "n_keep", default_params.n_keep); - slot.params.n_discard = json_value(data, "n_discard", default_params.n_discard); - slot.sparams.seed = json_value(data, "seed", default_sparams.seed); - slot.sparams.n_probs = json_value(data, "n_probs", default_sparams.n_probs); - slot.sparams.min_keep = json_value(data, "min_keep", default_sparams.min_keep); - //slot.params.t_max_prompt_ms = json_value(data, "t_max_prompt_ms", default_params.t_max_prompt_ms); // TODO: implement - slot.params.t_max_predict_ms = json_value(data, "t_max_predict_ms", default_params.t_max_predict_ms); - - if (slot.sparams.dry_base < 1.0f) - { - slot.sparams.dry_base = default_sparams.dry_base; + slot.params.stream = json_value(data, "stream", false); + slot.params.cache_prompt = json_value(data, "cache_prompt", false); + slot.params.n_predict = json_value(data, "n_predict", json_value(data, "max_tokens", defaults.n_predict)); + slot.params.n_indent = json_value(data, "n_indent", defaults.n_indent); + slot.params.n_keep = json_value(data, "n_keep", defaults.n_keep); + slot.params.n_discard = json_value(data, "n_discard", defaults.n_discard); + //slot.params.t_max_prompt_ms = json_value(data, "t_max_prompt_ms", defaults.t_max_prompt_ms); // TODO: implement + slot.params.t_max_predict_ms = json_value(data, "t_max_predict_ms", defaults.t_max_predict_ms); + + slot.params.sampling.top_k = json_value(data, "top_k", defaults.sampling.top_k); + slot.params.sampling.top_p = json_value(data, "top_p", defaults.sampling.top_p); + slot.params.sampling.min_p = json_value(data, "min_p", defaults.sampling.min_p); + slot.params.sampling.xtc_probability = json_value(data, "xtc_probability", defaults.sampling.xtc_probability); + slot.params.sampling.xtc_threshold = json_value(data, "xtc_threshold", defaults.sampling.xtc_threshold); + slot.params.sampling.typ_p = json_value(data, "typical_p", defaults.sampling.typ_p); + slot.params.sampling.temp = json_value(data, "temperature", defaults.sampling.temp); + slot.params.sampling.dynatemp_range = json_value(data, "dynatemp_range", defaults.sampling.dynatemp_range); + slot.params.sampling.dynatemp_exponent = json_value(data, "dynatemp_exponent", defaults.sampling.dynatemp_exponent); + slot.params.sampling.penalty_last_n = json_value(data, "repeat_last_n", defaults.sampling.penalty_last_n); + slot.params.sampling.penalty_repeat = json_value(data, "repeat_penalty", defaults.sampling.penalty_repeat); + slot.params.sampling.penalty_freq = json_value(data, "frequency_penalty", defaults.sampling.penalty_freq); + slot.params.sampling.penalty_present = json_value(data, "presence_penalty", defaults.sampling.penalty_present); + slot.params.sampling.dry_multiplier = json_value(data, "dry_multiplier", defaults.sampling.dry_multiplier); + slot.params.sampling.dry_base = json_value(data, "dry_base", defaults.sampling.dry_base); + slot.params.sampling.dry_allowed_length = json_value(data, "dry_allowed_length", defaults.sampling.dry_allowed_length); + slot.params.sampling.dry_penalty_last_n = json_value(data, "dry_penalty_last_n", defaults.sampling.dry_penalty_last_n); + slot.params.sampling.mirostat = json_value(data, "mirostat", defaults.sampling.mirostat); + slot.params.sampling.mirostat_tau = json_value(data, "mirostat_tau", defaults.sampling.mirostat_tau); + slot.params.sampling.mirostat_eta = json_value(data, "mirostat_eta", defaults.sampling.mirostat_eta); + slot.params.sampling.penalize_nl = json_value(data, "penalize_nl", defaults.sampling.penalize_nl); + slot.params.sampling.seed = json_value(data, "seed", defaults.sampling.seed); + slot.params.sampling.n_probs = json_value(data, "n_probs", defaults.sampling.n_probs); + slot.params.sampling.min_keep = json_value(data, "min_keep", defaults.sampling.min_keep); + + slot.params.speculative.n_min = json_value(data, "speculative.n_min", defaults.speculative.n_min); + slot.params.speculative.n_max = json_value(data, "speculative.n_max", defaults.speculative.n_max); + slot.params.speculative.p_min = json_value(data, "speculative.p_min", defaults.speculative.p_min); + + slot.params.speculative.n_min = std::min(slot.params.speculative.n_max, slot.params.speculative.n_min); + + if (slot.params.sampling.dry_base < 1.0f) { + slot.params.sampling.dry_base = defaults.sampling.dry_base; } // sequence breakers for DRY @@ -843,8 +931,8 @@ struct server_context { // Ref: https://github.com/oobabooga/text-generation-webui/blob/d1af7a41ade7bd3c3a463bfa640725edb818ebaf/extensions/openai/typing.py#L39 if (data.contains("dry_sequence_breakers")) { - slot.sparams.dry_sequence_breakers = json_value(data, "dry_sequence_breakers", std::vector()); - if (slot.sparams.dry_sequence_breakers.empty()) { + slot.params.sampling.dry_sequence_breakers = json_value(data, "dry_sequence_breakers", std::vector()); + if (slot.params.sampling.dry_sequence_breakers.empty()) { send_error(task, "Error: dry_sequence_breakers must be a non-empty array of strings", ERROR_TYPE_INVALID_REQUEST); return false; } @@ -858,14 +946,14 @@ struct server_context { } if (data.contains("json_schema") && !data.contains("grammar")) { try { - auto schema = json_value(data, "json_schema", json::object()); - slot.sparams.grammar = json_schema_to_grammar(schema); + auto schema = json_value(data, "json_schema", json::object()); + slot.params.sampling.grammar = json_schema_to_grammar(schema); } catch (const std::exception & e) { send_error(task, std::string("\"json_schema\": ") + e.what(), ERROR_TYPE_INVALID_REQUEST); return false; } } else { - slot.sparams.grammar = json_value(data, "grammar", default_sparams.grammar); + slot.params.sampling.grammar = json_value(data, "grammar", defaults.sampling.grammar); } if (slot.n_predict > 0 && slot.params.n_predict > slot.n_predict) { @@ -875,10 +963,10 @@ struct server_context { } { - slot.sparams.logit_bias.clear(); + slot.params.sampling.logit_bias.clear(); if (json_value(data, "ignore_eos", false) && has_eos_token) { - slot.sparams.logit_bias.push_back({llama_token_eos(model), -INFINITY}); + slot.params.sampling.logit_bias.push_back({llama_token_eos(model), -INFINITY}); } const auto & logit_bias = data.find("logit_bias"); @@ -899,12 +987,12 @@ struct server_context { if (el[0].is_number_integer()) { llama_token tok = el[0].get(); if (tok >= 0 && tok < n_vocab) { - slot.sparams.logit_bias.push_back({tok, bias}); + slot.params.sampling.logit_bias.push_back({tok, bias}); } } else if (el[0].is_string()) { auto toks = common_tokenize(model, el[0].get(), false); for (auto tok : toks) { - slot.sparams.logit_bias.push_back({tok, bias}); + slot.params.sampling.logit_bias.push_back({tok, bias}); } } } @@ -935,16 +1023,16 @@ struct server_context { sampler_names.emplace_back(name); } } - slot.sparams.samplers = common_sampler_types_from_names(sampler_names, false); + slot.params.sampling.samplers = common_sampler_types_from_names(sampler_names, false); } else if (samplers->is_string()){ std::string sampler_string; for (const auto & name : *samplers) { sampler_string += name; } - slot.sparams.samplers = common_sampler_types_from_chars(sampler_string); + slot.params.sampling.samplers = common_sampler_types_from_chars(sampler_string); } } else { - slot.sparams.samplers = default_sparams.samplers; + slot.params.sampling.samplers = defaults.sampling.samplers; } } @@ -953,7 +1041,7 @@ struct server_context { common_sampler_free(slot.smpl); } - slot.smpl = common_sampler_init(model, slot.sparams); + slot.smpl = common_sampler_init(model, slot.params.sampling); if (slot.smpl == nullptr) { // for now, the only error that may happen here is invalid grammar send_error(task, "Failed to parse grammar", ERROR_TYPE_INVALID_REQUEST); @@ -961,6 +1049,12 @@ struct server_context { } } + if (slot.ctx_dft) { + llama_batch_free(slot.batch_spec); + + slot.batch_spec = llama_batch_init(slot.params.speculative.n_max + 1, 0, 1); + } + slot.state = SLOT_STATE_STARTED; SLT_INF(slot, "%s", "processing task\n"); @@ -978,7 +1072,7 @@ struct server_context { bool process_token(completion_token_output & result, server_slot & slot) { // remember which tokens were sampled - used for repetition penalties during sampling - const std::string token_str = common_token_to_piece(ctx, result.tok, params.special); + const std::string token_str = common_token_to_piece(ctx, result.tok, params_base.special); slot.sampled = result.tok; // search stop word and delete it @@ -1043,7 +1137,7 @@ struct server_context { } // check the limits - if (slot.n_decoded > 0 && slot.has_next_token && !slot.has_budget(params)) { + if (slot.n_decoded > 0 && slot.has_next_token && !slot.has_budget(params_base)) { slot.stopped_limit = true; slot.has_next_token = false; @@ -1136,50 +1230,54 @@ struct server_context { json get_formated_generation(const server_slot & slot) const { std::vector samplers; - samplers.reserve(slot.sparams.samplers.size()); - for (const auto & sampler : slot.sparams.samplers) { + samplers.reserve(slot.params.sampling.samplers.size()); + for (const auto & sampler : slot.params.sampling.samplers) { samplers.emplace_back(common_sampler_type_to_str(sampler)); } return json { {"n_ctx", slot.n_ctx}, {"n_predict", slot.n_predict}, // Server configured n_predict - {"model", params.model_alias}, - {"seed", slot.sparams.seed}, + {"model", params_base.model_alias}, + {"seed", slot.params.sampling.seed}, {"seed_cur", slot.smpl ? common_sampler_get_seed(slot.smpl) : 0}, - {"temperature", slot.sparams.temp}, - {"dynatemp_range", slot.sparams.dynatemp_range}, - {"dynatemp_exponent", slot.sparams.dynatemp_exponent}, - {"top_k", slot.sparams.top_k}, - {"top_p", slot.sparams.top_p}, - {"min_p", slot.sparams.min_p}, - {"xtc_probability", slot.sparams.xtc_probability}, - {"xtc_threshold", slot.sparams.xtc_threshold}, - {"typical_p", slot.sparams.typ_p}, - {"repeat_last_n", slot.sparams.penalty_last_n}, - {"repeat_penalty", slot.sparams.penalty_repeat}, - {"presence_penalty", slot.sparams.penalty_present}, - {"frequency_penalty", slot.sparams.penalty_freq}, - {"dry_multiplier", slot.sparams.dry_multiplier}, - {"dry_base", slot.sparams.dry_base}, - {"dry_allowed_length", slot.sparams.dry_allowed_length}, - {"dry_penalty_last_n", slot.sparams.dry_penalty_last_n}, - {"dry_sequence_breakers", slot.sparams.dry_sequence_breakers}, - {"mirostat", slot.sparams.mirostat}, - {"mirostat_tau", slot.sparams.mirostat_tau}, - {"mirostat_eta", slot.sparams.mirostat_eta}, - {"penalize_nl", slot.sparams.penalize_nl}, + {"temperature", slot.params.sampling.temp}, + {"dynatemp_range", slot.params.sampling.dynatemp_range}, + {"dynatemp_exponent", slot.params.sampling.dynatemp_exponent}, + {"top_k", slot.params.sampling.top_k}, + {"top_p", slot.params.sampling.top_p}, + {"min_p", slot.params.sampling.min_p}, + {"xtc_probability", slot.params.sampling.xtc_probability}, + {"xtc_threshold", slot.params.sampling.xtc_threshold}, + {"typical_p", slot.params.sampling.typ_p}, + {"repeat_last_n", slot.params.sampling.penalty_last_n}, + {"repeat_penalty", slot.params.sampling.penalty_repeat}, + {"presence_penalty", slot.params.sampling.penalty_present}, + {"frequency_penalty", slot.params.sampling.penalty_freq}, + {"dry_multiplier", slot.params.sampling.dry_multiplier}, + {"dry_base", slot.params.sampling.dry_base}, + {"dry_allowed_length", slot.params.sampling.dry_allowed_length}, + {"dry_penalty_last_n", slot.params.sampling.dry_penalty_last_n}, + {"dry_sequence_breakers", slot.params.sampling.dry_sequence_breakers}, + {"mirostat", slot.params.sampling.mirostat}, + {"mirostat_tau", slot.params.sampling.mirostat_tau}, + {"mirostat_eta", slot.params.sampling.mirostat_eta}, + {"penalize_nl", slot.params.sampling.penalize_nl}, {"stop", slot.params.antiprompt}, {"max_tokens", slot.params.n_predict}, // User configured n_predict {"n_keep", slot.params.n_keep}, {"n_discard", slot.params.n_discard}, - {"ignore_eos", slot.sparams.ignore_eos}, + {"ignore_eos", slot.params.sampling.ignore_eos}, {"stream", slot.params.stream}, - //{"logit_bias", slot.sparams.logit_bias}, - {"n_probs", slot.sparams.n_probs}, - {"min_keep", slot.sparams.min_keep}, - {"grammar", slot.sparams.grammar}, + //{"logit_bias", slot.params.sampling.logit_bias}, + {"n_probs", slot.params.sampling.n_probs}, + {"min_keep", slot.params.sampling.min_keep}, + {"grammar", slot.params.sampling.grammar}, {"samplers", samplers}, + {"speculative", slot.can_speculate()}, + {"speculative.n_max", slot.params.speculative.n_max}, + {"speculative.n_min", slot.params.speculative.n_min}, + {"speculative.p_min", slot.params.speculative.p_min}, }; } @@ -1216,7 +1314,7 @@ struct server_context { {"index", slot.index}, }; - if (slot.sparams.n_probs > 0) { + if (slot.params.sampling.n_probs > 0) { const llama_tokens to_send_toks = common_tokenize(ctx, tkn.text_to_send, false); const size_t probs_pos = std::min(slot.n_sent_token_probs, slot.generated_token_probs.size()); const size_t probs_stop_pos = std::min(slot.n_sent_token_probs + to_send_toks.size(), slot.generated_token_probs.size()); @@ -1249,7 +1347,7 @@ struct server_context { {"content", !slot.params.stream ? slot.generated_text : ""}, {"id_slot", slot.id}, {"stop", true}, - {"model", params.model_alias}, + {"model", params_base.model_alias}, {"tokens_predicted", slot.n_decoded}, {"tokens_evaluated", slot.n_prompt_tokens}, {"generation_settings", get_formated_generation(slot)}, @@ -1265,7 +1363,7 @@ struct server_context { {"index", slot.index}, }; - if (slot.sparams.n_probs > 0) { + if (slot.params.sampling.n_probs > 0) { std::vector probs; if (!slot.params.stream && slot.stopped_word) { const llama_tokens stop_word_toks = common_tokenize(ctx, slot.stopping_word, false); @@ -1422,10 +1520,10 @@ struct server_context { data.at("input_prefix"), data.at("input_suffix"), data.at("input_extra"), - params.n_batch, - params.n_predict, + params_base.n_batch, + params_base.n_predict, slots[0].n_ctx, // TODO: there should be a better way - params.spm_infill, + params_base.spm_infill, tokenized_prompts[i] ); create_task(data, tokens); @@ -1798,7 +1896,7 @@ struct server_context { // TODO: simplify and improve for (server_slot & slot : slots) { if (slot.is_processing() && slot.n_past + 1 >= slot.n_ctx) { - if (!params.ctx_shift) { + if (!params_base.ctx_shift) { // this check is redundant (for good) // we should never get here, because generation should already stopped in process_token() slot.release(); @@ -1864,7 +1962,7 @@ struct server_context { int32_t batch_type = batch.n_tokens > 0 ? 0 : -1; // next, batch any pending prompts without exceeding n_batch - if (params.cont_batching || batch.n_tokens == 0) { + if (params_base.cont_batching || batch.n_tokens == 0) { for (auto & slot : slots) { // this slot still has a prompt to be processed if (slot.state == SLOT_STATE_PROCESSING_PROMPT || slot.state == SLOT_STATE_STARTED) { @@ -1917,7 +2015,7 @@ struct server_context { continue; } } else { - if (!params.ctx_shift) { + if (!params_base.ctx_shift) { // if context shift is disabled, we make sure prompt size is smaller than KV size // TODO: there should be a separate parameter that control prompt truncation // context shift should be applied only during the generation phase @@ -1963,11 +2061,11 @@ struct server_context { slot.n_past = common_lcp(slot.cache_tokens, prompt_tokens); // reuse chunks from the cached prompt by shifting their KV cache in the new position - if (params.n_cache_reuse > 0) { + if (params_base.n_cache_reuse > 0) { size_t head_c = slot.n_past; // cache size_t head_p = slot.n_past; // current prompt - SLT_DBG(slot, "trying to reuse chunks with size > %d, slot.n_past = %d\n", params.n_cache_reuse, slot.n_past); + SLT_DBG(slot, "trying to reuse chunks with size > %d, slot.n_past = %d\n", params_base.n_cache_reuse, slot.n_past); while (head_c < slot.cache_tokens.size() && head_p < prompt_tokens.size()) { @@ -1980,7 +2078,7 @@ struct server_context { n_match++; } - if (n_match >= (size_t) params.n_cache_reuse) { + if (n_match >= (size_t) params_base.n_cache_reuse) { SLT_INF(slot, "reusing chunk with size %zu, shifting KV cache [%zu, %zu) -> [%zu, %zu)\n", n_match, head_c, head_c + n_match, head_p, head_p + n_match); //for (size_t i = head_p; i < head_p + n_match; i++) { // SLT_DBG(slot, "cache token %3zu: %6d '%s'\n", i, prompt_tokens[i], common_token_to_piece(ctx, prompt_tokens[i]).c_str()); @@ -2168,38 +2266,99 @@ struct server_context { continue; // continue loop of slots } - completion_token_output result; - const llama_token id = common_sampler_sample(slot.smpl, ctx, slot.i_batch - i); + llama_token id; + + { + completion_token_output result; + + id = common_sampler_sample(slot.smpl, ctx, slot.i_batch - i); + + slot.i_batch = -1; + + common_sampler_accept(slot.smpl, id, true); + + slot.n_decoded += 1; + if (slot.n_decoded == 1) { + slot.t_start_generation = ggml_time_us(); + slot.t_prompt_processing = (slot.t_start_generation - slot.t_start_process_prompt) / 1e3; + metrics.on_prompt_eval(slot); + } + + result.tok = id; + + const auto * cur_p = common_sampler_get_candidates(slot.smpl); - common_sampler_accept(slot.smpl, id, true); + for (size_t i = 0; i < (size_t) slot.params.sampling.n_probs; ++i) { + result.probs.push_back({ + cur_p->data[i].id, + i >= cur_p->size ? 0.0f : cur_p->data[i].p, + }); + } + + if (!process_token(result, slot)) { + // release slot because of stop condition + slot.release(); + slot.print_timings(); + send_final_response(slot); + metrics.on_prediction(slot); + continue; + } + } - slot.n_decoded += 1; - if (slot.n_decoded == 1) { - slot.t_start_generation = ggml_time_us(); - slot.t_prompt_processing = (slot.t_start_generation - slot.t_start_process_prompt) / 1e3; - metrics.on_prompt_eval(slot); + // check if the slot supports speculative decoding + if (!slot.can_speculate()) { + continue; } - result.tok = id; + struct common_speculative_params params_spec; + params_spec.n_draft = slot.params.speculative.n_max; + params_spec.n_reuse = llama_n_ctx(slot.ctx_dft) - slot.params.speculative.n_max; + params_spec.p_min = slot.params.speculative.p_min; - const auto * cur_p = common_sampler_get_candidates(slot.smpl); + llama_tokens draft = common_speculative_gen_draft(slot.spec, params_spec, slot.cache_tokens, id); - for (size_t i = 0; i < (size_t) slot.sparams.n_probs; ++i) { - result.probs.push_back({ - cur_p->data[i].id, - i >= cur_p->size ? 0.0f : cur_p->data[i].p, - }); + // ignore small drafts + if (slot.params.speculative.n_min > (int) draft.size()) { + continue; } - if (!process_token(result, slot)) { - // release slot because of stop condition - slot.release(); - slot.print_timings(); - send_final_response(slot); - metrics.on_prediction(slot); + // construct the speculation batch + common_batch_clear(slot.batch_spec); + common_batch_add (slot.batch_spec, id, slot.n_past, { slot.id }, true); + + for (size_t i = 0; i < draft.size(); ++i) { + common_batch_add(slot.batch_spec, draft[i], slot.n_past + 1 + i, { slot.id }, true); + } + + llama_decode(ctx, slot.batch_spec); + + // the accepted tokens from the speculation + const auto ids = common_sampler_sample_and_accept_n(slot.smpl, ctx, draft); + + slot.n_past += ids.size(); + slot.n_decoded += ids.size(); + + slot.cache_tokens.push_back(id); + slot.cache_tokens.insert(slot.cache_tokens.end(), ids.begin(), ids.end() - 1); + + llama_kv_cache_seq_rm(ctx, slot.id, slot.n_past, -1); + + for (size_t i = 0; i < ids.size(); ++i) { + completion_token_output result; + + result.tok = ids[i]; + + if (!process_token(result, slot)) { + // release slot because of stop condition + slot.release(); + slot.print_timings(); + send_final_response(slot); + metrics.on_prediction(slot); + break; + } } - slot.i_batch = -1; + SRV_DBG("accepted %d/%d draft tokens\n", (int) ids.size() - 1, (int) draft.size()); } } @@ -2697,7 +2856,7 @@ int main(int argc, char ** argv) { const auto handle_props = [&ctx_server, &res_ok](const httplib::Request &, httplib::Response & res) { json data = { { "default_generation_settings", ctx_server.default_generation_settings_for_props }, - { "total_slots", ctx_server.params.n_parallel }, + { "total_slots", ctx_server.params_base.n_parallel }, { "chat_template", llama_get_chat_template(ctx_server.model) }, }; @@ -2705,7 +2864,7 @@ int main(int argc, char ** argv) { }; const auto handle_props_change = [&ctx_server, &res_error, &res_ok](const httplib::Request & req, httplib::Response & res) { - if (!ctx_server.params.endpoint_props) { + if (!ctx_server.params_base.endpoint_props) { res_error(res, format_error_response("This server does not support changing global properties. Start it with `--props`", ERROR_TYPE_NOT_SUPPORTED)); return; } @@ -2718,7 +2877,7 @@ int main(int argc, char ** argv) { }; const auto handle_completions_generic = [&ctx_server, &res_error, &res_ok](server_task_inf_type inf_type, json & data, httplib::Response & res) { - if (ctx_server.params.embedding) { + if (ctx_server.params_base.embedding) { res_error(res, format_error_response("This server does not support completions. Start it without `--embeddings`", ERROR_TYPE_NOT_SUPPORTED)); return; } @@ -2824,7 +2983,7 @@ int main(int argc, char ** argv) { // TODO: maybe merge this function with "handle_completions_generic" const auto handle_chat_completions = [&ctx_server, ¶ms, &res_error, &res_ok, verbose](const httplib::Request & req, httplib::Response & res) { - if (ctx_server.params.embedding) { + if (ctx_server.params_base.embedding) { res_error(res, format_error_response("This server does not support completions. Start it without `--embeddings`", ERROR_TYPE_NOT_SUPPORTED)); return; } @@ -3001,7 +3160,7 @@ int main(int argc, char ** argv) { }; const auto handle_rerank = [&ctx_server, &res_error, &res_ok](const httplib::Request & req, httplib::Response & res) { - if (!ctx_server.params.reranking || ctx_server.params.embedding) { + if (!ctx_server.params_base.reranking || ctx_server.params_base.embedding) { res_error(res, format_error_response("This server does not support reranking. Start it with `--reranking` and without `--embedding`", ERROR_TYPE_NOT_SUPPORTED)); return; } From 9f9ca37cc7985f50b34b1e46a4a03c07fbb56f10 Mon Sep 17 00:00:00 2001 From: brucepro Date: Mon, 25 Nov 2024 08:11:55 -0800 Subject: [PATCH 044/245] Add download chat feature to server chat (#10481) * Add download chat feature to server chat Add a download feature next to the delete chat feature in the server vue chat interface. * code style --------- Co-authored-by: Xuan Son Nguyen --- examples/server/public/index.html | 25 ++++++++++++++++++++++++- 1 file changed, 24 insertions(+), 1 deletion(-) diff --git a/examples/server/public/index.html b/examples/server/public/index.html index 6216c08410a28..c54260867d2c8 100644 --- a/examples/server/public/index.html +++ b/examples/server/public/index.html @@ -81,7 +81,13 @@

Conversations

- + - - + - - - -
-
- - {{ messages.length === 0 ? 'Send a message to start' : '' }} -
-
-
+ +
+
+ + {{ messages.length === 0 ? 'Send a message to start' : '' }} +
+
- - - - +
+ + + + +
+
+ + +
+ + + + +
- -
- - - - - + +
+
+ + +
- -
-
- - -
+ +
+ + +
- -
- - - -
+ @@ -231,7 +303,7 @@

Settings

Penalties settings
@@ -252,10 +324,11 @@

Settings

+
@@ -273,458 +346,6 @@

Settings

- - diff --git a/examples/server/server.cpp b/examples/server/server.cpp index 2ec13d7d2f536..9bca3f30e7574 100644 --- a/examples/server/server.cpp +++ b/examples/server/server.cpp @@ -16,12 +16,7 @@ // auto generated files (update with ./deps.sh) #include "index.html.hpp" -#include "completion.js.hpp" #include "loading.html.hpp" -#include "deps_daisyui.min.css.hpp" -#include "deps_markdown-it.js.hpp" -#include "deps_tailwindcss.js.hpp" -#include "deps_vue.esm-browser.js.hpp" #include #include @@ -103,12 +98,6 @@ struct server_task_result { bool error; }; -struct server_static_file { - const unsigned char * data; - unsigned int size; - const char * mime_type; -}; - struct slot_params { bool stream = true; bool cache_prompt = true; // remember the prompt to avoid reprocessing all prompt @@ -2457,16 +2446,6 @@ int main(int argc, char ** argv) { LOG_INF("%s\n", common_params_get_system_info(params).c_str()); LOG_INF("\n"); - // static files - std::map static_files = { - { "/", { index_html, index_html_len, "text/html; charset=utf-8" }}, - { "/completion.js", { completion_js, completion_js_len, "text/javascript; charset=utf-8" }}, - { "/deps_daisyui.min.css", { deps_daisyui_min_css, deps_daisyui_min_css_len, "text/css; charset=utf-8" }}, - { "/deps_markdown-it.js", { deps_markdown_it_js, deps_markdown_it_js_len, "text/javascript; charset=utf-8" }}, - { "/deps_tailwindcss.js", { deps_tailwindcss_js, deps_tailwindcss_js_len, "text/javascript; charset=utf-8" }}, - { "/deps_vue.esm-browser.js", { deps_vue_esm_browser_js, deps_vue_esm_browser_js_len, "text/javascript; charset=utf-8" }}, - }; - std::unique_ptr svr; #ifdef CPPHTTPLIB_OPENSSL_SUPPORT if (params.ssl_file_key != "" && params.ssl_file_cert != "") { @@ -2547,7 +2526,7 @@ int main(int argc, char ** argv) { // Middlewares // - auto middleware_validate_api_key = [¶ms, &res_error, &static_files](const httplib::Request & req, httplib::Response & res) { + auto middleware_validate_api_key = [¶ms, &res_error](const httplib::Request & req, httplib::Response & res) { static const std::unordered_set public_endpoints = { "/health", "/models", @@ -2560,7 +2539,7 @@ int main(int argc, char ** argv) { } // If path is public or is static file, skip validation - if (public_endpoints.find(req.path) != public_endpoints.end() || static_files.find(req.path) != static_files.end()) { + if (public_endpoints.find(req.path) != public_endpoints.end() || req.path == "/") { return true; } @@ -3317,14 +3296,11 @@ int main(int argc, char ** argv) { return 1; } } else { - // using embedded static files - for (const auto & it : static_files) { - const server_static_file & static_file = it.second; - svr->Get(it.first.c_str(), [&static_file](const httplib::Request &, httplib::Response & res) { - res.set_content(reinterpret_cast(static_file.data), static_file.size, static_file.mime_type); - return false; - }); - } + // using embedded static index.html + svr->Get("/", [](const httplib::Request &, httplib::Response & res) { + res.set_content(reinterpret_cast(index_html), index_html_len, "text/html; charset=utf-8"); + return false; + }); } // register API routes diff --git a/examples/server/webui/index.html b/examples/server/webui/index.html new file mode 100644 index 0000000000000..c7e18b45e1a51 --- /dev/null +++ b/examples/server/webui/index.html @@ -0,0 +1,268 @@ + + + + + + + 🦙 llama.cpp - chat + + + +
+
+ + + +
+ +
+
+

Conversations

+ + + +
+ + +
+ + New conversation +
+
+ {{ conv.messages[0].content }} +
+
+ Conversations are saved to browser's localStorage +
+
+
+ + +
+ +
+ + + +
llama.cpp
+ + +
+ + + + + +
+
+ + +
+
+ + {{ messages.length === 0 ? 'Send a message to start' : '' }} +
+
+
+
+ + + + +
+
+ + +
+ + + + + +
+
+ + +
+
+ + +
+
+
+ + +
+ + + +
+
+ +
+ + + + + + + +
+ + + + + + + + diff --git a/examples/server/webui/package-lock.json b/examples/server/webui/package-lock.json new file mode 100644 index 0000000000000..6b93090f06274 --- /dev/null +++ b/examples/server/webui/package-lock.json @@ -0,0 +1,2783 @@ +{ + "name": "webui", + "version": "0.0.0", + "lockfileVersion": 3, + "requires": true, + "packages": { + "": { + "name": "webui", + "version": "0.0.0", + "dependencies": { + "autoprefixer": "^10.4.20", + "daisyui": "^4.12.14", + "markdown-it": "^14.1.0", + "postcss": "^8.4.49", + "tailwindcss": "^3.4.15", + "vite-plugin-singlefile": "^2.0.3", + "vue": "^3.5.13" + }, + "devDependencies": { + "vite": "^5.4.10" + } + }, + "node_modules/@alloc/quick-lru": { + "version": "5.2.0", + "resolved": "https://registry.npmjs.org/@alloc/quick-lru/-/quick-lru-5.2.0.tgz", + "integrity": "sha512-UrcABB+4bUrFABwbluTIBErXwvbsU/V7TZWfmbgJfbkwiBuziS9gxdODUyuiecfdGQ85jglMW6juS3+z5TsKLw==", + "license": "MIT", + "engines": { + "node": ">=10" + }, + "funding": { + "url": "https://github.com/sponsors/sindresorhus" + } + }, + "node_modules/@esbuild/aix-ppc64": { + "version": "0.21.5", + "resolved": "https://registry.npmjs.org/@esbuild/aix-ppc64/-/aix-ppc64-0.21.5.tgz", + "integrity": "sha512-1SDgH6ZSPTlggy1yI6+Dbkiz8xzpHJEVAlF/AM1tHPLsf5STom9rwtjE4hKAF20FfXXNTFqEYXyJNWh1GiZedQ==", + "cpu": [ + "ppc64" + ], + "license": "MIT", + "optional": true, + "os": [ + "aix" + ], + "engines": { + "node": ">=12" + } + }, + "node_modules/@esbuild/android-arm": { + "version": "0.21.5", + "resolved": "https://registry.npmjs.org/@esbuild/android-arm/-/android-arm-0.21.5.tgz", + "integrity": "sha512-vCPvzSjpPHEi1siZdlvAlsPxXl7WbOVUBBAowWug4rJHb68Ox8KualB+1ocNvT5fjv6wpkX6o/iEpbDrf68zcg==", + "cpu": [ + "arm" + ], + "license": "MIT", + "optional": true, + "os": [ + "android" + ], + "engines": { + "node": ">=12" + } + }, + "node_modules/@esbuild/android-arm64": { + "version": "0.21.5", + "resolved": "https://registry.npmjs.org/@esbuild/android-arm64/-/android-arm64-0.21.5.tgz", + "integrity": "sha512-c0uX9VAUBQ7dTDCjq+wdyGLowMdtR/GoC2U5IYk/7D1H1JYC0qseD7+11iMP2mRLN9RcCMRcjC4YMclCzGwS/A==", + "cpu": [ + "arm64" + ], + "license": "MIT", + "optional": true, + "os": [ + "android" + ], + "engines": { + "node": ">=12" + } + }, + "node_modules/@esbuild/android-x64": { + "version": "0.21.5", + "resolved": "https://registry.npmjs.org/@esbuild/android-x64/-/android-x64-0.21.5.tgz", + "integrity": "sha512-D7aPRUUNHRBwHxzxRvp856rjUHRFW1SdQATKXH2hqA0kAZb1hKmi02OpYRacl0TxIGz/ZmXWlbZgjwWYaCakTA==", + "cpu": [ + "x64" + ], + "license": "MIT", + "optional": true, + "os": [ + "android" + ], + "engines": { + "node": ">=12" + } + }, + "node_modules/@esbuild/darwin-x64": { + "version": "0.21.5", + "resolved": "https://registry.npmjs.org/@esbuild/darwin-x64/-/darwin-x64-0.21.5.tgz", + "integrity": "sha512-se/JjF8NlmKVG4kNIuyWMV/22ZaerB+qaSi5MdrXtd6R08kvs2qCN4C09miupktDitvh8jRFflwGFBQcxZRjbw==", + "cpu": [ + "x64" + ], + "license": "MIT", + "optional": true, + "os": [ + "darwin" + ], + "engines": { + "node": ">=12" + } + }, + "node_modules/@esbuild/freebsd-arm64": { + "version": "0.21.5", + "resolved": "https://registry.npmjs.org/@esbuild/freebsd-arm64/-/freebsd-arm64-0.21.5.tgz", + "integrity": "sha512-5JcRxxRDUJLX8JXp/wcBCy3pENnCgBR9bN6JsY4OmhfUtIHe3ZW0mawA7+RDAcMLrMIZaf03NlQiX9DGyB8h4g==", + "cpu": [ + "arm64" + ], + "license": "MIT", + "optional": true, + "os": [ + "freebsd" + ], + "engines": { + "node": ">=12" + } + }, + "node_modules/@esbuild/freebsd-x64": { + "version": "0.21.5", + "resolved": "https://registry.npmjs.org/@esbuild/freebsd-x64/-/freebsd-x64-0.21.5.tgz", + "integrity": "sha512-J95kNBj1zkbMXtHVH29bBriQygMXqoVQOQYA+ISs0/2l3T9/kj42ow2mpqerRBxDJnmkUDCaQT/dfNXWX/ZZCQ==", + "cpu": [ + "x64" + ], + "license": "MIT", + "optional": true, + "os": [ + "freebsd" + ], + "engines": { + "node": ">=12" + } + }, + "node_modules/@esbuild/linux-arm": { + "version": "0.21.5", + "resolved": "https://registry.npmjs.org/@esbuild/linux-arm/-/linux-arm-0.21.5.tgz", + "integrity": "sha512-bPb5AHZtbeNGjCKVZ9UGqGwo8EUu4cLq68E95A53KlxAPRmUyYv2D6F0uUI65XisGOL1hBP5mTronbgo+0bFcA==", + "cpu": [ + "arm" + ], + "license": "MIT", + "optional": true, + "os": [ + "linux" + ], + "engines": { + "node": ">=12" + } + }, + "node_modules/@esbuild/linux-arm64": { + "version": "0.21.5", + "resolved": "https://registry.npmjs.org/@esbuild/linux-arm64/-/linux-arm64-0.21.5.tgz", + "integrity": "sha512-ibKvmyYzKsBeX8d8I7MH/TMfWDXBF3db4qM6sy+7re0YXya+K1cem3on9XgdT2EQGMu4hQyZhan7TeQ8XkGp4Q==", + "cpu": [ + "arm64" + ], + "license": "MIT", + "optional": true, + "os": [ + "linux" + ], + "engines": { + "node": ">=12" + } + }, + "node_modules/@esbuild/linux-ia32": { + "version": "0.21.5", + "resolved": "https://registry.npmjs.org/@esbuild/linux-ia32/-/linux-ia32-0.21.5.tgz", + "integrity": "sha512-YvjXDqLRqPDl2dvRODYmmhz4rPeVKYvppfGYKSNGdyZkA01046pLWyRKKI3ax8fbJoK5QbxblURkwK/MWY18Tg==", + "cpu": [ + "ia32" + ], + "license": "MIT", + "optional": true, + "os": [ + "linux" + ], + "engines": { + "node": ">=12" + } + }, + "node_modules/@esbuild/linux-loong64": { + "version": "0.21.5", + "resolved": "https://registry.npmjs.org/@esbuild/linux-loong64/-/linux-loong64-0.21.5.tgz", + "integrity": "sha512-uHf1BmMG8qEvzdrzAqg2SIG/02+4/DHB6a9Kbya0XDvwDEKCoC8ZRWI5JJvNdUjtciBGFQ5PuBlpEOXQj+JQSg==", + "cpu": [ + "loong64" + ], + "license": "MIT", + "optional": true, + "os": [ + "linux" + ], + "engines": { + "node": ">=12" + } + }, + "node_modules/@esbuild/linux-mips64el": { + "version": "0.21.5", + "resolved": "https://registry.npmjs.org/@esbuild/linux-mips64el/-/linux-mips64el-0.21.5.tgz", + "integrity": "sha512-IajOmO+KJK23bj52dFSNCMsz1QP1DqM6cwLUv3W1QwyxkyIWecfafnI555fvSGqEKwjMXVLokcV5ygHW5b3Jbg==", + "cpu": [ + "mips64el" + ], + "license": "MIT", + "optional": true, + "os": [ + "linux" + ], + "engines": { + "node": ">=12" + } + }, + "node_modules/@esbuild/linux-ppc64": { + "version": "0.21.5", + "resolved": "https://registry.npmjs.org/@esbuild/linux-ppc64/-/linux-ppc64-0.21.5.tgz", + "integrity": "sha512-1hHV/Z4OEfMwpLO8rp7CvlhBDnjsC3CttJXIhBi+5Aj5r+MBvy4egg7wCbe//hSsT+RvDAG7s81tAvpL2XAE4w==", + "cpu": [ + "ppc64" + ], + "license": "MIT", + "optional": true, + "os": [ + "linux" + ], + "engines": { + "node": ">=12" + } + }, + "node_modules/@esbuild/linux-riscv64": { + "version": "0.21.5", + "resolved": "https://registry.npmjs.org/@esbuild/linux-riscv64/-/linux-riscv64-0.21.5.tgz", + "integrity": "sha512-2HdXDMd9GMgTGrPWnJzP2ALSokE/0O5HhTUvWIbD3YdjME8JwvSCnNGBnTThKGEB91OZhzrJ4qIIxk/SBmyDDA==", + "cpu": [ + "riscv64" + ], + "license": "MIT", + "optional": true, + "os": [ + "linux" + ], + "engines": { + "node": ">=12" + } + }, + "node_modules/@esbuild/linux-s390x": { + "version": "0.21.5", + "resolved": "https://registry.npmjs.org/@esbuild/linux-s390x/-/linux-s390x-0.21.5.tgz", + "integrity": "sha512-zus5sxzqBJD3eXxwvjN1yQkRepANgxE9lgOW2qLnmr8ikMTphkjgXu1HR01K4FJg8h1kEEDAqDcZQtbrRnB41A==", + "cpu": [ + "s390x" + ], + "license": "MIT", + "optional": true, + "os": [ + "linux" + ], + "engines": { + "node": ">=12" + } + }, + "node_modules/@esbuild/linux-x64": { + "version": "0.21.5", + "resolved": "https://registry.npmjs.org/@esbuild/linux-x64/-/linux-x64-0.21.5.tgz", + "integrity": "sha512-1rYdTpyv03iycF1+BhzrzQJCdOuAOtaqHTWJZCWvijKD2N5Xu0TtVC8/+1faWqcP9iBCWOmjmhoH94dH82BxPQ==", + "cpu": [ + "x64" + ], + "license": "MIT", + "optional": true, + "os": [ + "linux" + ], + "engines": { + "node": ">=12" + } + }, + "node_modules/@esbuild/netbsd-x64": { + "version": "0.21.5", + "resolved": "https://registry.npmjs.org/@esbuild/netbsd-x64/-/netbsd-x64-0.21.5.tgz", + "integrity": "sha512-Woi2MXzXjMULccIwMnLciyZH4nCIMpWQAs049KEeMvOcNADVxo0UBIQPfSmxB3CWKedngg7sWZdLvLczpe0tLg==", + "cpu": [ + "x64" + ], + "license": "MIT", + "optional": true, + "os": [ + "netbsd" + ], + "engines": { + "node": ">=12" + } + }, + "node_modules/@esbuild/openbsd-x64": { + "version": "0.21.5", + "resolved": "https://registry.npmjs.org/@esbuild/openbsd-x64/-/openbsd-x64-0.21.5.tgz", + "integrity": "sha512-HLNNw99xsvx12lFBUwoT8EVCsSvRNDVxNpjZ7bPn947b8gJPzeHWyNVhFsaerc0n3TsbOINvRP2byTZ5LKezow==", + "cpu": [ + "x64" + ], + "license": "MIT", + "optional": true, + "os": [ + "openbsd" + ], + "engines": { + "node": ">=12" + } + }, + "node_modules/@esbuild/sunos-x64": { + "version": "0.21.5", + "resolved": "https://registry.npmjs.org/@esbuild/sunos-x64/-/sunos-x64-0.21.5.tgz", + "integrity": "sha512-6+gjmFpfy0BHU5Tpptkuh8+uw3mnrvgs+dSPQXQOv3ekbordwnzTVEb4qnIvQcYXq6gzkyTnoZ9dZG+D4garKg==", + "cpu": [ + "x64" + ], + "license": "MIT", + "optional": true, + "os": [ + "sunos" + ], + "engines": { + "node": ">=12" + } + }, + "node_modules/@esbuild/win32-arm64": { + "version": "0.21.5", + "resolved": "https://registry.npmjs.org/@esbuild/win32-arm64/-/win32-arm64-0.21.5.tgz", + "integrity": "sha512-Z0gOTd75VvXqyq7nsl93zwahcTROgqvuAcYDUr+vOv8uHhNSKROyU961kgtCD1e95IqPKSQKH7tBTslnS3tA8A==", + "cpu": [ + "arm64" + ], + "license": "MIT", + "optional": true, + "os": [ + "win32" + ], + "engines": { + "node": ">=12" + } + }, + "node_modules/@esbuild/win32-ia32": { + "version": "0.21.5", + "resolved": "https://registry.npmjs.org/@esbuild/win32-ia32/-/win32-ia32-0.21.5.tgz", + "integrity": "sha512-SWXFF1CL2RVNMaVs+BBClwtfZSvDgtL//G/smwAc5oVK/UPu2Gu9tIaRgFmYFFKrmg3SyAjSrElf0TiJ1v8fYA==", + "cpu": [ + "ia32" + ], + "license": "MIT", + "optional": true, + "os": [ + "win32" + ], + "engines": { + "node": ">=12" + } + }, + "node_modules/@esbuild/win32-x64": { + "version": "0.21.5", + "resolved": "https://registry.npmjs.org/@esbuild/win32-x64/-/win32-x64-0.21.5.tgz", + "integrity": "sha512-tQd/1efJuzPC6rCFwEvLtci/xNFcTZknmXs98FYDfGE4wP9ClFV98nyKrzJKVPMhdDnjzLhdUyMX4PsQAPjwIw==", + "cpu": [ + "x64" + ], + "license": "MIT", + "optional": true, + "os": [ + "win32" + ], + "engines": { + "node": ">=12" + } + }, + "node_modules/@rollup/rollup-android-arm-eabi": { + "version": "4.28.0", + "resolved": "https://registry.npmjs.org/@rollup/rollup-android-arm-eabi/-/rollup-android-arm-eabi-4.28.0.tgz", + "integrity": "sha512-wLJuPLT6grGZsy34g4N1yRfYeouklTgPhH1gWXCYspenKYD0s3cR99ZevOGw5BexMNywkbV3UkjADisozBmpPQ==", + "cpu": [ + "arm" + ], + "license": "MIT", + "optional": true, + "os": [ + "android" + ] + }, + "node_modules/@rollup/rollup-android-arm64": { + "version": "4.28.0", + "resolved": "https://registry.npmjs.org/@rollup/rollup-android-arm64/-/rollup-android-arm64-4.28.0.tgz", + "integrity": "sha512-eiNkznlo0dLmVG/6wf+Ifi/v78G4d4QxRhuUl+s8EWZpDewgk7PX3ZyECUXU0Zq/Ca+8nU8cQpNC4Xgn2gFNDA==", + "cpu": [ + "arm64" + ], + "license": "MIT", + "optional": true, + "os": [ + "android" + ] + }, + "node_modules/@rollup/rollup-darwin-x64": { + "version": "4.28.0", + "resolved": "https://registry.npmjs.org/@rollup/rollup-darwin-x64/-/rollup-darwin-x64-4.28.0.tgz", + "integrity": "sha512-8hxgfReVs7k9Js1uAIhS6zq3I+wKQETInnWQtgzt8JfGx51R1N6DRVy3F4o0lQwumbErRz52YqwjfvuwRxGv1w==", + "cpu": [ + "x64" + ], + "license": "MIT", + "optional": true, + "os": [ + "darwin" + ] + }, + "node_modules/@rollup/rollup-freebsd-arm64": { + "version": "4.28.0", + "resolved": "https://registry.npmjs.org/@rollup/rollup-freebsd-arm64/-/rollup-freebsd-arm64-4.28.0.tgz", + "integrity": "sha512-lA1zZB3bFx5oxu9fYud4+g1mt+lYXCoch0M0V/xhqLoGatbzVse0wlSQ1UYOWKpuSu3gyN4qEc0Dxf/DII1bhQ==", + "cpu": [ + "arm64" + ], + "license": "MIT", + "optional": true, + "os": [ + "freebsd" + ] + }, + "node_modules/@rollup/rollup-freebsd-x64": { + "version": "4.28.0", + "resolved": "https://registry.npmjs.org/@rollup/rollup-freebsd-x64/-/rollup-freebsd-x64-4.28.0.tgz", + "integrity": "sha512-aI2plavbUDjCQB/sRbeUZWX9qp12GfYkYSJOrdYTL/C5D53bsE2/nBPuoiJKoWp5SN78v2Vr8ZPnB+/VbQ2pFA==", + "cpu": [ + "x64" + ], + "license": "MIT", + "optional": true, + "os": [ + "freebsd" + ] + }, + "node_modules/@rollup/rollup-linux-arm-gnueabihf": { + "version": "4.28.0", + "resolved": "https://registry.npmjs.org/@rollup/rollup-linux-arm-gnueabihf/-/rollup-linux-arm-gnueabihf-4.28.0.tgz", + "integrity": "sha512-WXveUPKtfqtaNvpf0iOb0M6xC64GzUX/OowbqfiCSXTdi/jLlOmH0Ba94/OkiY2yTGTwteo4/dsHRfh5bDCZ+w==", + "cpu": [ + "arm" + ], + "license": "MIT", + "optional": true, + "os": [ + "linux" + ] + }, + "node_modules/@rollup/rollup-linux-arm-musleabihf": { + "version": "4.28.0", + "resolved": "https://registry.npmjs.org/@rollup/rollup-linux-arm-musleabihf/-/rollup-linux-arm-musleabihf-4.28.0.tgz", + "integrity": "sha512-yLc3O2NtOQR67lI79zsSc7lk31xjwcaocvdD1twL64PK1yNaIqCeWI9L5B4MFPAVGEVjH5k1oWSGuYX1Wutxpg==", + "cpu": [ + "arm" + ], + "license": "MIT", + "optional": true, + "os": [ + "linux" + ] + }, + "node_modules/@rollup/rollup-linux-arm64-gnu": { + "version": "4.28.0", + "resolved": "https://registry.npmjs.org/@rollup/rollup-linux-arm64-gnu/-/rollup-linux-arm64-gnu-4.28.0.tgz", + "integrity": "sha512-+P9G9hjEpHucHRXqesY+3X9hD2wh0iNnJXX/QhS/J5vTdG6VhNYMxJ2rJkQOxRUd17u5mbMLHM7yWGZdAASfcg==", + "cpu": [ + "arm64" + ], + "license": "MIT", + "optional": true, + "os": [ + "linux" + ] + }, + "node_modules/@rollup/rollup-linux-arm64-musl": { + "version": "4.28.0", + "resolved": "https://registry.npmjs.org/@rollup/rollup-linux-arm64-musl/-/rollup-linux-arm64-musl-4.28.0.tgz", + "integrity": "sha512-1xsm2rCKSTpKzi5/ypT5wfc+4bOGa/9yI/eaOLW0oMs7qpC542APWhl4A37AENGZ6St6GBMWhCCMM6tXgTIplw==", + "cpu": [ + "arm64" + ], + "license": "MIT", + "optional": true, + "os": [ + "linux" + ] + }, + "node_modules/@rollup/rollup-linux-powerpc64le-gnu": { + "version": "4.28.0", + "resolved": "https://registry.npmjs.org/@rollup/rollup-linux-powerpc64le-gnu/-/rollup-linux-powerpc64le-gnu-4.28.0.tgz", + "integrity": "sha512-zgWxMq8neVQeXL+ouSf6S7DoNeo6EPgi1eeqHXVKQxqPy1B2NvTbaOUWPn/7CfMKL7xvhV0/+fq/Z/J69g1WAQ==", + "cpu": [ + "ppc64" + ], + "license": "MIT", + "optional": true, + "os": [ + "linux" + ] + }, + "node_modules/@rollup/rollup-linux-riscv64-gnu": { + "version": "4.28.0", + "resolved": "https://registry.npmjs.org/@rollup/rollup-linux-riscv64-gnu/-/rollup-linux-riscv64-gnu-4.28.0.tgz", + "integrity": "sha512-VEdVYacLniRxbRJLNtzwGt5vwS0ycYshofI7cWAfj7Vg5asqj+pt+Q6x4n+AONSZW/kVm+5nklde0qs2EUwU2g==", + "cpu": [ + "riscv64" + ], + "license": "MIT", + "optional": true, + "os": [ + "linux" + ] + }, + "node_modules/@rollup/rollup-linux-s390x-gnu": { + "version": "4.28.0", + "resolved": "https://registry.npmjs.org/@rollup/rollup-linux-s390x-gnu/-/rollup-linux-s390x-gnu-4.28.0.tgz", + "integrity": "sha512-LQlP5t2hcDJh8HV8RELD9/xlYtEzJkm/aWGsauvdO2ulfl3QYRjqrKW+mGAIWP5kdNCBheqqqYIGElSRCaXfpw==", + "cpu": [ + "s390x" + ], + "license": "MIT", + "optional": true, + "os": [ + "linux" + ] + }, + "node_modules/@rollup/rollup-linux-x64-gnu": { + "version": "4.28.0", + "resolved": "https://registry.npmjs.org/@rollup/rollup-linux-x64-gnu/-/rollup-linux-x64-gnu-4.28.0.tgz", + "integrity": "sha512-Nl4KIzteVEKE9BdAvYoTkW19pa7LR/RBrT6F1dJCV/3pbjwDcaOq+edkP0LXuJ9kflW/xOK414X78r+K84+msw==", + "cpu": [ + "x64" + ], + "license": "MIT", + "optional": true, + "os": [ + "linux" + ] + }, + "node_modules/@rollup/rollup-linux-x64-musl": { + "version": "4.28.0", + "resolved": "https://registry.npmjs.org/@rollup/rollup-linux-x64-musl/-/rollup-linux-x64-musl-4.28.0.tgz", + "integrity": "sha512-eKpJr4vBDOi4goT75MvW+0dXcNUqisK4jvibY9vDdlgLx+yekxSm55StsHbxUsRxSTt3JEQvlr3cGDkzcSP8bw==", + "cpu": [ + "x64" + ], + "license": "MIT", + "optional": true, + "os": [ + "linux" + ] + }, + "node_modules/@rollup/rollup-win32-arm64-msvc": { + "version": "4.28.0", + "resolved": "https://registry.npmjs.org/@rollup/rollup-win32-arm64-msvc/-/rollup-win32-arm64-msvc-4.28.0.tgz", + "integrity": "sha512-Vi+WR62xWGsE/Oj+mD0FNAPY2MEox3cfyG0zLpotZdehPFXwz6lypkGs5y38Jd/NVSbOD02aVad6q6QYF7i8Bg==", + "cpu": [ + "arm64" + ], + "license": "MIT", + "optional": true, + "os": [ + "win32" + ] + }, + "node_modules/@rollup/rollup-win32-ia32-msvc": { + "version": "4.28.0", + "resolved": "https://registry.npmjs.org/@rollup/rollup-win32-ia32-msvc/-/rollup-win32-ia32-msvc-4.28.0.tgz", + "integrity": "sha512-kN/Vpip8emMLn/eOza+4JwqDZBL6MPNpkdaEsgUtW1NYN3DZvZqSQrbKzJcTL6hd8YNmFTn7XGWMwccOcJBL0A==", + "cpu": [ + "ia32" + ], + "license": "MIT", + "optional": true, + "os": [ + "win32" + ] + }, + "node_modules/@rollup/rollup-win32-x64-msvc": { + "version": "4.28.0", + "resolved": "https://registry.npmjs.org/@rollup/rollup-win32-x64-msvc/-/rollup-win32-x64-msvc-4.28.0.tgz", + "integrity": "sha512-Bvno2/aZT6usSa7lRDL2+hMjVAGjuqaymF1ApZm31JXzniR/hvr14jpU+/z4X6Gt5BPlzosscyJZGUvguXIqeQ==", + "cpu": [ + "x64" + ], + "license": "MIT", + "optional": true, + "os": [ + "win32" + ] + }, + "node_modules/@vue/compiler-dom": { + "version": "3.5.13", + "resolved": "https://registry.npmjs.org/@vue/compiler-dom/-/compiler-dom-3.5.13.tgz", + "integrity": "sha512-ZOJ46sMOKUjO3e94wPdCzQ6P1Lx/vhp2RSvfaab88Ajexs0AHeV0uasYhi99WPaogmBlRHNRuly8xV75cNTMDA==", + "license": "MIT", + "dependencies": { + "@vue/compiler-core": "3.5.13", + "@vue/shared": "3.5.13" + } + }, + "node_modules/@vue/compiler-dom/node_modules/@babel/helper-string-parser": { + "version": "7.25.9", + "resolved": "https://registry.npmjs.org/@babel/helper-string-parser/-/helper-string-parser-7.25.9.tgz", + "integrity": "sha512-4A/SCr/2KLd5jrtOMFzaKjVtAei3+2r/NChoBNoZ3EyP/+GlhoaEGoWOZUmFmoITP7zOJyHIMm+DYRd8o3PvHA==", + "license": "MIT", + "engines": { + "node": ">=6.9.0" + } + }, + "node_modules/@vue/compiler-dom/node_modules/@babel/helper-validator-identifier": { + "version": "7.25.9", + "resolved": "https://registry.npmjs.org/@babel/helper-validator-identifier/-/helper-validator-identifier-7.25.9.tgz", + "integrity": "sha512-Ed61U6XJc3CVRfkERJWDz4dJwKe7iLmmJsbOGu9wSloNSFttHV0I8g6UAgb7qnK5ly5bGLPd4oXZlxCdANBOWQ==", + "license": "MIT", + "engines": { + "node": ">=6.9.0" + } + }, + "node_modules/@vue/compiler-dom/node_modules/@babel/parser": { + "version": "7.26.2", + "resolved": "https://registry.npmjs.org/@babel/parser/-/parser-7.26.2.tgz", + "integrity": "sha512-DWMCZH9WA4Maitz2q21SRKHo9QXZxkDsbNZoVD62gusNtNBBqDg9i7uOhASfTfIGNzW+O+r7+jAlM8dwphcJKQ==", + "license": "MIT", + "dependencies": { + "@babel/types": "^7.26.0" + }, + "bin": { + "parser": "bin/babel-parser.js" + }, + "engines": { + "node": ">=6.0.0" + } + }, + "node_modules/@vue/compiler-dom/node_modules/@babel/types": { + "version": "7.26.0", + "resolved": "https://registry.npmjs.org/@babel/types/-/types-7.26.0.tgz", + "integrity": "sha512-Z/yiTPj+lDVnF7lWeKCIJzaIkI0vYO87dMpZ4bg4TDrFe4XXLFWL1TbXU27gBP3QccxV9mZICCrnjnYlJjXHOA==", + "license": "MIT", + "dependencies": { + "@babel/helper-string-parser": "^7.25.9", + "@babel/helper-validator-identifier": "^7.25.9" + }, + "engines": { + "node": ">=6.9.0" + } + }, + "node_modules/@vue/compiler-dom/node_modules/@vue/compiler-core": { + "version": "3.5.13", + "resolved": "https://registry.npmjs.org/@vue/compiler-core/-/compiler-core-3.5.13.tgz", + "integrity": "sha512-oOdAkwqUfW1WqpwSYJce06wvt6HljgY3fGeM9NcVA1HaYOij3mZG9Rkysn0OHuyUAGMbEbARIpsG+LPVlBJ5/Q==", + "license": "MIT", + "dependencies": { + "@babel/parser": "^7.25.3", + "@vue/shared": "3.5.13", + "entities": "^4.5.0", + "estree-walker": "^2.0.2", + "source-map-js": "^1.2.0" + } + }, + "node_modules/@vue/compiler-dom/node_modules/estree-walker": { + "version": "2.0.2", + "resolved": "https://registry.npmjs.org/estree-walker/-/estree-walker-2.0.2.tgz", + "integrity": "sha512-Rfkk/Mp/DL7JVje3u18FxFujQlTNR2q6QfMSMB7AvCBx91NGj/ba3kCfza0f6dVDbw7YlRf/nDrn7pQrCCyQ/w==", + "license": "MIT" + }, + "node_modules/@vue/compiler-dom/node_modules/source-map-js": { + "version": "1.2.1", + "resolved": "https://registry.npmjs.org/source-map-js/-/source-map-js-1.2.1.tgz", + "integrity": "sha512-UXWMKhLOwVKb728IUtQPXxfYU+usdybtUrK/8uGE8CQMvrhOpwvzDBwj0QhSL7MQc7vIsISBG8VQ8+IDQxpfQA==", + "license": "BSD-3-Clause", + "engines": { + "node": ">=0.10.0" + } + }, + "node_modules/@vue/compiler-sfc": { + "version": "3.5.13", + "resolved": "https://registry.npmjs.org/@vue/compiler-sfc/-/compiler-sfc-3.5.13.tgz", + "integrity": "sha512-6VdaljMpD82w6c2749Zhf5T9u5uLBWKnVue6XWxprDobftnletJ8+oel7sexFfM3qIxNmVE7LSFGTpv6obNyaQ==", + "license": "MIT", + "dependencies": { + "@babel/parser": "^7.25.3", + "@vue/compiler-core": "3.5.13", + "@vue/compiler-dom": "3.5.13", + "@vue/compiler-ssr": "3.5.13", + "@vue/shared": "3.5.13", + "estree-walker": "^2.0.2", + "magic-string": "^0.30.11", + "postcss": "^8.4.48", + "source-map-js": "^1.2.0" + } + }, + "node_modules/@vue/compiler-sfc/node_modules/@babel/helper-string-parser": { + "version": "7.25.9", + "resolved": "https://registry.npmjs.org/@babel/helper-string-parser/-/helper-string-parser-7.25.9.tgz", + "integrity": "sha512-4A/SCr/2KLd5jrtOMFzaKjVtAei3+2r/NChoBNoZ3EyP/+GlhoaEGoWOZUmFmoITP7zOJyHIMm+DYRd8o3PvHA==", + "license": "MIT", + "engines": { + "node": ">=6.9.0" + } + }, + "node_modules/@vue/compiler-sfc/node_modules/@babel/helper-validator-identifier": { + "version": "7.25.9", + "resolved": "https://registry.npmjs.org/@babel/helper-validator-identifier/-/helper-validator-identifier-7.25.9.tgz", + "integrity": "sha512-Ed61U6XJc3CVRfkERJWDz4dJwKe7iLmmJsbOGu9wSloNSFttHV0I8g6UAgb7qnK5ly5bGLPd4oXZlxCdANBOWQ==", + "license": "MIT", + "engines": { + "node": ">=6.9.0" + } + }, + "node_modules/@vue/compiler-sfc/node_modules/@babel/parser": { + "version": "7.26.2", + "resolved": "https://registry.npmjs.org/@babel/parser/-/parser-7.26.2.tgz", + "integrity": "sha512-DWMCZH9WA4Maitz2q21SRKHo9QXZxkDsbNZoVD62gusNtNBBqDg9i7uOhASfTfIGNzW+O+r7+jAlM8dwphcJKQ==", + "license": "MIT", + "dependencies": { + "@babel/types": "^7.26.0" + }, + "bin": { + "parser": "bin/babel-parser.js" + }, + "engines": { + "node": ">=6.0.0" + } + }, + "node_modules/@vue/compiler-sfc/node_modules/@babel/types": { + "version": "7.26.0", + "resolved": "https://registry.npmjs.org/@babel/types/-/types-7.26.0.tgz", + "integrity": "sha512-Z/yiTPj+lDVnF7lWeKCIJzaIkI0vYO87dMpZ4bg4TDrFe4XXLFWL1TbXU27gBP3QccxV9mZICCrnjnYlJjXHOA==", + "license": "MIT", + "dependencies": { + "@babel/helper-string-parser": "^7.25.9", + "@babel/helper-validator-identifier": "^7.25.9" + }, + "engines": { + "node": ">=6.9.0" + } + }, + "node_modules/@vue/compiler-sfc/node_modules/@jridgewell/sourcemap-codec": { + "version": "1.5.0", + "resolved": "https://registry.npmjs.org/@jridgewell/sourcemap-codec/-/sourcemap-codec-1.5.0.tgz", + "integrity": "sha512-gv3ZRaISU3fjPAgNsriBRqGWQL6quFx04YMPW/zD8XMLsU32mhCCbfbO6KZFLjvYpCZ8zyDEgqsgf+PwPaM7GQ==", + "license": "MIT" + }, + "node_modules/@vue/compiler-sfc/node_modules/@vue/compiler-core": { + "version": "3.5.13", + "resolved": "https://registry.npmjs.org/@vue/compiler-core/-/compiler-core-3.5.13.tgz", + "integrity": "sha512-oOdAkwqUfW1WqpwSYJce06wvt6HljgY3fGeM9NcVA1HaYOij3mZG9Rkysn0OHuyUAGMbEbARIpsG+LPVlBJ5/Q==", + "license": "MIT", + "dependencies": { + "@babel/parser": "^7.25.3", + "@vue/shared": "3.5.13", + "entities": "^4.5.0", + "estree-walker": "^2.0.2", + "source-map-js": "^1.2.0" + } + }, + "node_modules/@vue/compiler-sfc/node_modules/@vue/compiler-ssr": { + "version": "3.5.13", + "resolved": "https://registry.npmjs.org/@vue/compiler-ssr/-/compiler-ssr-3.5.13.tgz", + "integrity": "sha512-wMH6vrYHxQl/IybKJagqbquvxpWCuVYpoUJfCqFZwa/JY1GdATAQ+TgVtgrwwMZ0D07QhA99rs/EAAWfvG6KpA==", + "license": "MIT", + "dependencies": { + "@vue/compiler-dom": "3.5.13", + "@vue/shared": "3.5.13" + } + }, + "node_modules/@vue/compiler-sfc/node_modules/estree-walker": { + "version": "2.0.2", + "resolved": "https://registry.npmjs.org/estree-walker/-/estree-walker-2.0.2.tgz", + "integrity": "sha512-Rfkk/Mp/DL7JVje3u18FxFujQlTNR2q6QfMSMB7AvCBx91NGj/ba3kCfza0f6dVDbw7YlRf/nDrn7pQrCCyQ/w==", + "license": "MIT" + }, + "node_modules/@vue/compiler-sfc/node_modules/magic-string": { + "version": "0.30.14", + "resolved": "https://registry.npmjs.org/magic-string/-/magic-string-0.30.14.tgz", + "integrity": "sha512-5c99P1WKTed11ZC0HMJOj6CDIue6F8ySu+bJL+85q1zBEIY8IklrJ1eiKC2NDRh3Ct3FcvmJPyQHb9erXMTJNw==", + "license": "MIT", + "dependencies": { + "@jridgewell/sourcemap-codec": "^1.5.0" + } + }, + "node_modules/@vue/compiler-sfc/node_modules/source-map-js": { + "version": "1.2.1", + "resolved": "https://registry.npmjs.org/source-map-js/-/source-map-js-1.2.1.tgz", + "integrity": "sha512-UXWMKhLOwVKb728IUtQPXxfYU+usdybtUrK/8uGE8CQMvrhOpwvzDBwj0QhSL7MQc7vIsISBG8VQ8+IDQxpfQA==", + "license": "BSD-3-Clause", + "engines": { + "node": ">=0.10.0" + } + }, + "node_modules/@vue/runtime-dom": { + "version": "3.5.13", + "resolved": "https://registry.npmjs.org/@vue/runtime-dom/-/runtime-dom-3.5.13.tgz", + "integrity": "sha512-dLaj94s93NYLqjLiyFzVs9X6dWhTdAlEAciC3Moq7gzAc13VJUdCnjjRurNM6uTLFATRHexHCTu/Xp3eW6yoog==", + "license": "MIT", + "dependencies": { + "@vue/reactivity": "3.5.13", + "@vue/runtime-core": "3.5.13", + "@vue/shared": "3.5.13", + "csstype": "^3.1.3" + } + }, + "node_modules/@vue/runtime-dom/node_modules/@vue/reactivity": { + "version": "3.5.13", + "resolved": "https://registry.npmjs.org/@vue/reactivity/-/reactivity-3.5.13.tgz", + "integrity": "sha512-NaCwtw8o48B9I6L1zl2p41OHo/2Z4wqYGGIK1Khu5T7yxrn+ATOixn/Udn2m+6kZKB/J7cuT9DbWWhRxqixACg==", + "license": "MIT", + "dependencies": { + "@vue/shared": "3.5.13" + } + }, + "node_modules/@vue/runtime-dom/node_modules/@vue/runtime-core": { + "version": "3.5.13", + "resolved": "https://registry.npmjs.org/@vue/runtime-core/-/runtime-core-3.5.13.tgz", + "integrity": "sha512-Fj4YRQ3Az0WTZw1sFe+QDb0aXCerigEpw418pw1HBUKFtnQHWzwojaukAs2X/c9DQz4MQ4bsXTGlcpGxU/RCIw==", + "license": "MIT", + "dependencies": { + "@vue/reactivity": "3.5.13", + "@vue/shared": "3.5.13" + } + }, + "node_modules/@vue/runtime-dom/node_modules/csstype": { + "version": "3.1.3", + "resolved": "https://registry.npmjs.org/csstype/-/csstype-3.1.3.tgz", + "integrity": "sha512-M1uQkMl8rQK/szD0LNhtqxIPLpimGm8sOBwU7lLnCpSbTyY3yeU1Vc7l4KT5zT4s/yOxHH5O7tIuuLOCnLADRw==", + "license": "MIT" + }, + "node_modules/@vue/server-renderer": { + "version": "3.5.13", + "resolved": "https://registry.npmjs.org/@vue/server-renderer/-/server-renderer-3.5.13.tgz", + "integrity": "sha512-wAi4IRJV/2SAW3htkTlB+dHeRmpTiVIK1OGLWV1yeStVSebSQQOwGwIq0D3ZIoBj2C2qpgz5+vX9iEBkTdk5YA==", + "license": "MIT", + "dependencies": { + "@vue/compiler-ssr": "3.5.13", + "@vue/shared": "3.5.13" + }, + "peerDependencies": { + "vue": "3.5.13" + } + }, + "node_modules/@vue/server-renderer/node_modules/@vue/compiler-ssr": { + "version": "3.5.13", + "resolved": "https://registry.npmjs.org/@vue/compiler-ssr/-/compiler-ssr-3.5.13.tgz", + "integrity": "sha512-wMH6vrYHxQl/IybKJagqbquvxpWCuVYpoUJfCqFZwa/JY1GdATAQ+TgVtgrwwMZ0D07QhA99rs/EAAWfvG6KpA==", + "license": "MIT", + "dependencies": { + "@vue/compiler-dom": "3.5.13", + "@vue/shared": "3.5.13" + } + }, + "node_modules/@vue/shared": { + "version": "3.5.13", + "resolved": "https://registry.npmjs.org/@vue/shared/-/shared-3.5.13.tgz", + "integrity": "sha512-/hnE/qP5ZoGpol0a5mDi45bOd7t3tjYJBjsgCsivow7D48cJeV5l05RD82lPqi7gRiphZM37rnhW1l6ZoCNNnQ==", + "license": "MIT" + }, + "node_modules/arg": { + "version": "5.0.2", + "resolved": "https://registry.npmjs.org/arg/-/arg-5.0.2.tgz", + "integrity": "sha512-PYjyFOLKQ9y57JvQ6QLo8dAgNqswh8M1RMJYdQduT6xbWSgK36P/Z/v+p888pM69jMMfS8Xd8F6I1kQ/I9HUGg==", + "license": "MIT" + }, + "node_modules/argparse": { + "version": "2.0.1", + "resolved": "https://registry.npmjs.org/argparse/-/argparse-2.0.1.tgz", + "integrity": "sha512-8+9WqebbFzpX9OR+Wa6O29asIogeRMzcGtAINdpMHHyAg10f05aSFVBbcEqGf/PXw1EjAZ+q2/bEBg3DvurK3Q==", + "license": "Python-2.0" + }, + "node_modules/autoprefixer": { + "version": "10.4.20", + "resolved": "https://registry.npmjs.org/autoprefixer/-/autoprefixer-10.4.20.tgz", + "integrity": "sha512-XY25y5xSv/wEoqzDyXXME4AFfkZI0P23z6Fs3YgymDnKJkCGOnkL0iTxCa85UTqaSgfcqyf3UA6+c7wUvx/16g==", + "funding": [ + { + "type": "opencollective", + "url": "https://opencollective.com/postcss/" + }, + { + "type": "tidelift", + "url": "https://tidelift.com/funding/github/npm/autoprefixer" + }, + { + "type": "github", + "url": "https://github.com/sponsors/ai" + } + ], + "license": "MIT", + "dependencies": { + "browserslist": "^4.23.3", + "caniuse-lite": "^1.0.30001646", + "fraction.js": "^4.3.7", + "normalize-range": "^0.1.2", + "picocolors": "^1.0.1", + "postcss-value-parser": "^4.2.0" + }, + "bin": { + "autoprefixer": "bin/autoprefixer" + }, + "engines": { + "node": "^10 || ^12 || >=14" + }, + "peerDependencies": { + "postcss": "^8.1.0" + } + }, + "node_modules/browserslist": { + "version": "4.24.2", + "resolved": "https://registry.npmjs.org/browserslist/-/browserslist-4.24.2.tgz", + "integrity": "sha512-ZIc+Q62revdMcqC6aChtW4jz3My3klmCO1fEmINZY/8J3EpBg5/A/D0AKmBveUh6pgoeycoMkVMko84tuYS+Gg==", + "funding": [ + { + "type": "opencollective", + "url": "https://opencollective.com/browserslist" + }, + { + "type": "tidelift", + "url": "https://tidelift.com/funding/github/npm/browserslist" + }, + { + "type": "github", + "url": "https://github.com/sponsors/ai" + } + ], + "license": "MIT", + "dependencies": { + "caniuse-lite": "^1.0.30001669", + "electron-to-chromium": "^1.5.41", + "node-releases": "^2.0.18", + "update-browserslist-db": "^1.1.1" + }, + "bin": { + "browserslist": "cli.js" + }, + "engines": { + "node": "^6 || ^7 || ^8 || ^9 || ^10 || ^11 || ^12 || >=13.7" + } + }, + "node_modules/browserslist/node_modules/electron-to-chromium": { + "version": "1.5.67", + "resolved": "https://registry.npmjs.org/electron-to-chromium/-/electron-to-chromium-1.5.67.tgz", + "integrity": "sha512-nz88NNBsD7kQSAGGJyp8hS6xSPtWwqNogA0mjtc2nUYeEf3nURK9qpV18TuBdDmEDgVWotS8Wkzf+V52dSQ/LQ==", + "license": "ISC" + }, + "node_modules/browserslist/node_modules/escalade": { + "version": "3.2.0", + "resolved": "https://registry.npmjs.org/escalade/-/escalade-3.2.0.tgz", + "integrity": "sha512-WUj2qlxaQtO4g6Pq5c29GTcWGDyd8itL8zTlipgECz3JesAiiOKotd8JU6otB3PACgG6xkJUyVhboMS+bje/jA==", + "license": "MIT", + "engines": { + "node": ">=6" + } + }, + "node_modules/browserslist/node_modules/node-releases": { + "version": "2.0.18", + "resolved": "https://registry.npmjs.org/node-releases/-/node-releases-2.0.18.tgz", + "integrity": "sha512-d9VeXT4SJ7ZeOqGX6R5EM022wpL+eWPooLI+5UpWn2jCT1aosUQEhQP214x33Wkwx3JQMvIm+tIoVOdodFS40g==", + "license": "MIT" + }, + "node_modules/browserslist/node_modules/update-browserslist-db": { + "version": "1.1.1", + "resolved": "https://registry.npmjs.org/update-browserslist-db/-/update-browserslist-db-1.1.1.tgz", + "integrity": "sha512-R8UzCaa9Az+38REPiJ1tXlImTJXlVfgHZsglwBD/k6nj76ctsH1E3q4doGrukiLQd3sGQYu56r5+lo5r94l29A==", + "funding": [ + { + "type": "opencollective", + "url": "https://opencollective.com/browserslist" + }, + { + "type": "tidelift", + "url": "https://tidelift.com/funding/github/npm/browserslist" + }, + { + "type": "github", + "url": "https://github.com/sponsors/ai" + } + ], + "license": "MIT", + "dependencies": { + "escalade": "^3.2.0", + "picocolors": "^1.1.0" + }, + "bin": { + "update-browserslist-db": "cli.js" + }, + "peerDependencies": { + "browserslist": ">= 4.21.0" + } + }, + "node_modules/caniuse-lite": { + "version": "1.0.30001684", + "resolved": "https://registry.npmjs.org/caniuse-lite/-/caniuse-lite-1.0.30001684.tgz", + "integrity": "sha512-G1LRwLIQjBQoyq0ZJGqGIJUXzJ8irpbjHLpVRXDvBEScFJ9b17sgK6vlx0GAJFE21okD7zXl08rRRUfq6HdoEQ==", + "funding": [ + { + "type": "opencollective", + "url": "https://opencollective.com/browserslist" + }, + { + "type": "tidelift", + "url": "https://tidelift.com/funding/github/npm/caniuse-lite" + }, + { + "type": "github", + "url": "https://github.com/sponsors/ai" + } + ], + "license": "CC-BY-4.0" + }, + "node_modules/chokidar": { + "version": "3.6.0", + "resolved": "https://registry.npmjs.org/chokidar/-/chokidar-3.6.0.tgz", + "integrity": "sha512-7VT13fmjotKpGipCW9JEQAusEPE+Ei8nl6/g4FBAmIm0GOOLMua9NDDo/DWp0ZAxCr3cPq5ZpBqmPAQgDda2Pw==", + "license": "MIT", + "dependencies": { + "anymatch": "~3.1.2", + "braces": "~3.0.2", + "glob-parent": "~5.1.2", + "is-binary-path": "~2.1.0", + "is-glob": "~4.0.1", + "normalize-path": "~3.0.0", + "readdirp": "~3.6.0" + }, + "engines": { + "node": ">= 8.10.0" + }, + "funding": { + "url": "https://paulmillr.com/funding/" + }, + "optionalDependencies": { + "fsevents": "~2.3.2" + } + }, + "node_modules/chokidar/node_modules/anymatch": { + "version": "3.1.3", + "resolved": "https://registry.npmjs.org/anymatch/-/anymatch-3.1.3.tgz", + "integrity": "sha512-KMReFUr0B4t+D+OBkjR3KYqvocp2XaSzO55UcB6mgQMd3KbcE+mWTyvVV7D/zsdEbNnV6acZUutkiHQXvTr1Rw==", + "license": "ISC", + "dependencies": { + "normalize-path": "^3.0.0", + "picomatch": "^2.0.4" + }, + "engines": { + "node": ">= 8" + } + }, + "node_modules/chokidar/node_modules/binary-extensions": { + "version": "2.3.0", + "resolved": "https://registry.npmjs.org/binary-extensions/-/binary-extensions-2.3.0.tgz", + "integrity": "sha512-Ceh+7ox5qe7LJuLHoY0feh3pHuUDHAcRUeyL2VYghZwfpkNIy/+8Ocg0a3UuSoYzavmylwuLWQOf3hl0jjMMIw==", + "license": "MIT", + "engines": { + "node": ">=8" + }, + "funding": { + "url": "https://github.com/sponsors/sindresorhus" + } + }, + "node_modules/chokidar/node_modules/braces": { + "version": "3.0.3", + "resolved": "https://registry.npmjs.org/braces/-/braces-3.0.3.tgz", + "integrity": "sha512-yQbXgO/OSZVD2IsiLlro+7Hf6Q18EJrKSEsdoMzKePKXct3gvD8oLcOQdIzGupr5Fj+EDe8gO/lxc1BzfMpxvA==", + "license": "MIT", + "dependencies": { + "fill-range": "^7.1.1" + }, + "engines": { + "node": ">=8" + } + }, + "node_modules/chokidar/node_modules/fill-range": { + "version": "7.1.1", + "resolved": "https://registry.npmjs.org/fill-range/-/fill-range-7.1.1.tgz", + "integrity": "sha512-YsGpe3WHLK8ZYi4tWDg2Jy3ebRz2rXowDxnld4bkQB00cc/1Zw9AWnC0i9ztDJitivtQvaI9KaLyKrc+hBW0yg==", + "license": "MIT", + "dependencies": { + "to-regex-range": "^5.0.1" + }, + "engines": { + "node": ">=8" + } + }, + "node_modules/chokidar/node_modules/glob-parent": { + "version": "5.1.2", + "resolved": "https://registry.npmjs.org/glob-parent/-/glob-parent-5.1.2.tgz", + "integrity": "sha512-AOIgSQCepiJYwP3ARnGx+5VnTu2HBYdzbGP45eLw1vr3zB3vZLeyed1sC9hnbcOc9/SrMyM5RPQrkGz4aS9Zow==", + "license": "ISC", + "dependencies": { + "is-glob": "^4.0.1" + }, + "engines": { + "node": ">= 6" + } + }, + "node_modules/chokidar/node_modules/is-binary-path": { + "version": "2.1.0", + "resolved": "https://registry.npmjs.org/is-binary-path/-/is-binary-path-2.1.0.tgz", + "integrity": "sha512-ZMERYes6pDydyuGidse7OsHxtbI7WVeUEozgR/g7rd0xUimYNlvZRE/K2MgZTjWy725IfelLeVcEM97mmtRGXw==", + "license": "MIT", + "dependencies": { + "binary-extensions": "^2.0.0" + }, + "engines": { + "node": ">=8" + } + }, + "node_modules/chokidar/node_modules/is-number": { + "version": "7.0.0", + "resolved": "https://registry.npmjs.org/is-number/-/is-number-7.0.0.tgz", + "integrity": "sha512-41Cifkg6e8TylSpdtTpeLVMqvSBEVzTttHvERD741+pnZ8ANv0004MRL43QKPDlK9cGvNp6NZWZUBlbGXYxxng==", + "license": "MIT", + "engines": { + "node": ">=0.12.0" + } + }, + "node_modules/chokidar/node_modules/picomatch": { + "version": "2.3.1", + "resolved": "https://registry.npmjs.org/picomatch/-/picomatch-2.3.1.tgz", + "integrity": "sha512-JU3teHTNjmE2VCGFzuY8EXzCDVwEqB2a8fsIvwaStHhAWJEeVd1o1QD80CU6+ZdEXXSLbSsuLwJjkCBWqRQUVA==", + "license": "MIT", + "engines": { + "node": ">=8.6" + }, + "funding": { + "url": "https://github.com/sponsors/jonschlinkert" + } + }, + "node_modules/chokidar/node_modules/readdirp": { + "version": "3.6.0", + "resolved": "https://registry.npmjs.org/readdirp/-/readdirp-3.6.0.tgz", + "integrity": "sha512-hOS089on8RduqdbhvQ5Z37A0ESjsqz6qnRcffsMU3495FuTdqSm+7bhJ29JvIOsBDEEnan5DPu9t3To9VRlMzA==", + "license": "MIT", + "dependencies": { + "picomatch": "^2.2.1" + }, + "engines": { + "node": ">=8.10.0" + } + }, + "node_modules/chokidar/node_modules/to-regex-range": { + "version": "5.0.1", + "resolved": "https://registry.npmjs.org/to-regex-range/-/to-regex-range-5.0.1.tgz", + "integrity": "sha512-65P7iz6X5yEr1cwcgvQxbbIw7Uk3gOy5dIdtZ4rDveLqhrdJP+Li/Hx6tyK0NEb+2GCyneCMJiGqrADCSNk8sQ==", + "license": "MIT", + "dependencies": { + "is-number": "^7.0.0" + }, + "engines": { + "node": ">=8.0" + } + }, + "node_modules/css-selector-tokenizer": { + "version": "0.8.0", + "resolved": "https://registry.npmjs.org/css-selector-tokenizer/-/css-selector-tokenizer-0.8.0.tgz", + "integrity": "sha512-Jd6Ig3/pe62/qe5SBPTN8h8LeUg/pT4lLgtavPf7updwwHpvFzxvOQBHYj2LZDMjUnBzgvIUSjRcf6oT5HzHFg==", + "license": "MIT", + "dependencies": { + "cssesc": "^3.0.0", + "fastparse": "^1.1.2" + } + }, + "node_modules/css-selector-tokenizer/node_modules/cssesc": { + "version": "3.0.0", + "resolved": "https://registry.npmjs.org/cssesc/-/cssesc-3.0.0.tgz", + "integrity": "sha512-/Tb/JcjK111nNScGob5MNtsntNM1aCNUDipB/TkwZFhyDrrE47SOx/18wF2bbjgc3ZzCSKW1T5nt5EbFoAz/Vg==", + "license": "MIT", + "bin": { + "cssesc": "bin/cssesc" + }, + "engines": { + "node": ">=4" + } + }, + "node_modules/css-selector-tokenizer/node_modules/fastparse": { + "version": "1.1.2", + "resolved": "https://registry.npmjs.org/fastparse/-/fastparse-1.1.2.tgz", + "integrity": "sha512-483XLLxTVIwWK3QTrMGRqUfUpoOs/0hbQrl2oz4J0pAcm3A3bu84wxTFqGqkJzewCLdME38xJLJAxBABfQT8sQ==", + "license": "MIT" + }, + "node_modules/culori": { + "version": "3.3.0", + "resolved": "https://registry.npmjs.org/culori/-/culori-3.3.0.tgz", + "integrity": "sha512-pHJg+jbuFsCjz9iclQBqyL3B2HLCBF71BwVNujUYEvCeQMvV97R59MNK3R2+jgJ3a1fcZgI9B3vYgz8lzr/BFQ==", + "license": "MIT", + "engines": { + "node": "^12.20.0 || ^14.13.1 || >=16.0.0" + } + }, + "node_modules/daisyui": { + "version": "4.12.14", + "resolved": "https://registry.npmjs.org/daisyui/-/daisyui-4.12.14.tgz", + "integrity": "sha512-hA27cdBasdwd4/iEjn+aidoCrRroDuo3G5W9NDKaVCJI437Mm/3eSL/2u7MkZ0pt8a+TrYF3aT2pFVemTS3how==", + "license": "MIT", + "dependencies": { + "css-selector-tokenizer": "^0.8", + "culori": "^3", + "picocolors": "^1", + "postcss-js": "^4" + }, + "engines": { + "node": ">=16.9.0" + }, + "funding": { + "type": "opencollective", + "url": "https://opencollective.com/daisyui" + } + }, + "node_modules/didyoumean": { + "version": "1.2.2", + "resolved": "https://registry.npmjs.org/didyoumean/-/didyoumean-1.2.2.tgz", + "integrity": "sha512-gxtyfqMg7GKyhQmb056K7M3xszy/myH8w+B4RT+QXBQsvAOdc3XymqDDPHx1BgPgsdAA5SIifona89YtRATDzw==", + "license": "Apache-2.0" + }, + "node_modules/dlv": { + "version": "1.1.3", + "resolved": "https://registry.npmjs.org/dlv/-/dlv-1.1.3.tgz", + "integrity": "sha512-+HlytyjlPKnIG8XuRG8WvmBP8xs8P71y+SKKS6ZXWoEgLuePxtDoUEiH7WkdePWrQ5JBpE6aoVqfZfJUQkjXwA==", + "license": "MIT" + }, + "node_modules/entities": { + "version": "4.5.0", + "resolved": "https://registry.npmjs.org/entities/-/entities-4.5.0.tgz", + "integrity": "sha512-V0hjH4dGPh9Ao5p0MoRY6BVqtwCjhz6vI5LT8AJ55H+4g9/4vbHx1I54fS0XuclLhDHArPQCiMjDxjaL8fPxhw==", + "license": "BSD-2-Clause", + "engines": { + "node": ">=0.12" + }, + "funding": { + "url": "https://github.com/fb55/entities?sponsor=1" + } + }, + "node_modules/esbuild": { + "version": "0.21.5", + "resolved": "https://registry.npmjs.org/esbuild/-/esbuild-0.21.5.tgz", + "integrity": "sha512-mg3OPMV4hXywwpoDxu3Qda5xCKQi+vCTZq8S9J/EpkhB2HzKXq4SNFZE3+NK93JYxc8VMSep+lOUSC/RVKaBqw==", + "hasInstallScript": true, + "license": "MIT", + "bin": { + "esbuild": "bin/esbuild" + }, + "engines": { + "node": ">=12" + }, + "optionalDependencies": { + "@esbuild/aix-ppc64": "0.21.5", + "@esbuild/android-arm": "0.21.5", + "@esbuild/android-arm64": "0.21.5", + "@esbuild/android-x64": "0.21.5", + "@esbuild/darwin-arm64": "0.21.5", + "@esbuild/darwin-x64": "0.21.5", + "@esbuild/freebsd-arm64": "0.21.5", + "@esbuild/freebsd-x64": "0.21.5", + "@esbuild/linux-arm": "0.21.5", + "@esbuild/linux-arm64": "0.21.5", + "@esbuild/linux-ia32": "0.21.5", + "@esbuild/linux-loong64": "0.21.5", + "@esbuild/linux-mips64el": "0.21.5", + "@esbuild/linux-ppc64": "0.21.5", + "@esbuild/linux-riscv64": "0.21.5", + "@esbuild/linux-s390x": "0.21.5", + "@esbuild/linux-x64": "0.21.5", + "@esbuild/netbsd-x64": "0.21.5", + "@esbuild/openbsd-x64": "0.21.5", + "@esbuild/sunos-x64": "0.21.5", + "@esbuild/win32-arm64": "0.21.5", + "@esbuild/win32-ia32": "0.21.5", + "@esbuild/win32-x64": "0.21.5" + } + }, + "node_modules/esbuild/node_modules/@esbuild/darwin-arm64": { + "version": "0.21.5", + "resolved": "https://registry.npmjs.org/@esbuild/darwin-arm64/-/darwin-arm64-0.21.5.tgz", + "integrity": "sha512-DwqXqZyuk5AiWWf3UfLiRDJ5EDd49zg6O9wclZ7kUMv2WRFr4HKjXp/5t8JZ11QbQfUS6/cRCKGwYhtNAY88kQ==", + "cpu": [ + "arm64" + ], + "license": "MIT", + "optional": true, + "os": [ + "darwin" + ], + "engines": { + "node": ">=12" + } + }, + "node_modules/fast-glob": { + "version": "3.3.2", + "resolved": "https://registry.npmjs.org/fast-glob/-/fast-glob-3.3.2.tgz", + "integrity": "sha512-oX2ruAFQwf/Orj8m737Y5adxDQO0LAB7/S5MnxCdTNDd4p6BsyIVsv9JQsATbTSq8KHRpLwIHbVlUNatxd+1Ow==", + "license": "MIT", + "dependencies": { + "@nodelib/fs.stat": "^2.0.2", + "@nodelib/fs.walk": "^1.2.3", + "glob-parent": "^5.1.2", + "merge2": "^1.3.0", + "micromatch": "^4.0.4" + }, + "engines": { + "node": ">=8.6.0" + } + }, + "node_modules/fast-glob/node_modules/@nodelib/fs.scandir": { + "version": "2.1.5", + "resolved": "https://registry.npmjs.org/@nodelib/fs.scandir/-/fs.scandir-2.1.5.tgz", + "integrity": "sha512-vq24Bq3ym5HEQm2NKCr3yXDwjc7vTsEThRDnkp2DK9p1uqLR+DHurm/NOTo0KG7HYHU7eppKZj3MyqYuMBf62g==", + "license": "MIT", + "dependencies": { + "@nodelib/fs.stat": "2.0.5", + "run-parallel": "^1.1.9" + }, + "engines": { + "node": ">= 8" + } + }, + "node_modules/fast-glob/node_modules/@nodelib/fs.stat": { + "version": "2.0.5", + "resolved": "https://registry.npmjs.org/@nodelib/fs.stat/-/fs.stat-2.0.5.tgz", + "integrity": "sha512-RkhPPp2zrqDAQA/2jNhnztcPAlv64XdhIp7a7454A5ovI7Bukxgt7MX7udwAu3zg1DcpPU0rz3VV1SeaqvY4+A==", + "license": "MIT", + "engines": { + "node": ">= 8" + } + }, + "node_modules/fast-glob/node_modules/@nodelib/fs.walk": { + "version": "1.2.8", + "resolved": "https://registry.npmjs.org/@nodelib/fs.walk/-/fs.walk-1.2.8.tgz", + "integrity": "sha512-oGB+UxlgWcgQkgwo8GcEGwemoTFt3FIO9ababBmaGwXIoBKZ+GTy0pP185beGg7Llih/NSHSV2XAs1lnznocSg==", + "license": "MIT", + "dependencies": { + "@nodelib/fs.scandir": "2.1.5", + "fastq": "^1.6.0" + }, + "engines": { + "node": ">= 8" + } + }, + "node_modules/fast-glob/node_modules/fastq": { + "version": "1.17.1", + "resolved": "https://registry.npmjs.org/fastq/-/fastq-1.17.1.tgz", + "integrity": "sha512-sRVD3lWVIXWg6By68ZN7vho9a1pQcN/WBFaAAsDDFzlJjvoGx0P8z7V1t72grFJfJhu3YPZBuu25f7Kaw2jN1w==", + "license": "ISC", + "dependencies": { + "reusify": "^1.0.4" + } + }, + "node_modules/fast-glob/node_modules/glob-parent": { + "version": "5.1.2", + "resolved": "https://registry.npmjs.org/glob-parent/-/glob-parent-5.1.2.tgz", + "integrity": "sha512-AOIgSQCepiJYwP3ARnGx+5VnTu2HBYdzbGP45eLw1vr3zB3vZLeyed1sC9hnbcOc9/SrMyM5RPQrkGz4aS9Zow==", + "license": "ISC", + "dependencies": { + "is-glob": "^4.0.1" + }, + "engines": { + "node": ">= 6" + } + }, + "node_modules/fast-glob/node_modules/merge2": { + "version": "1.4.1", + "resolved": "https://registry.npmjs.org/merge2/-/merge2-1.4.1.tgz", + "integrity": "sha512-8q7VEgMJW4J8tcfVPy8g09NcQwZdbwFEqhe/WZkoIzjn/3TGDwtOCYtXGxA3O8tPzpczCCDgv+P2P5y00ZJOOg==", + "license": "MIT", + "engines": { + "node": ">= 8" + } + }, + "node_modules/fast-glob/node_modules/queue-microtask": { + "version": "1.2.3", + "resolved": "https://registry.npmjs.org/queue-microtask/-/queue-microtask-1.2.3.tgz", + "integrity": "sha512-NuaNSa6flKT5JaSYQzJok04JzTL1CA6aGhv5rfLW3PgqA+M2ChpZQnAC8h8i4ZFkBS8X5RqkDBHA7r4hej3K9A==", + "funding": [ + { + "type": "github", + "url": "https://github.com/sponsors/feross" + }, + { + "type": "patreon", + "url": "https://www.patreon.com/feross" + }, + { + "type": "consulting", + "url": "https://feross.org/support" + } + ], + "license": "MIT" + }, + "node_modules/fast-glob/node_modules/reusify": { + "version": "1.0.4", + "resolved": "https://registry.npmjs.org/reusify/-/reusify-1.0.4.tgz", + "integrity": "sha512-U9nH88a3fc/ekCF1l0/UP1IosiuIjyTh7hBvXVMHYgVcfGvt897Xguj2UOLDeI5BG2m7/uwyaLVT6fbtCwTyzw==", + "license": "MIT", + "engines": { + "iojs": ">=1.0.0", + "node": ">=0.10.0" + } + }, + "node_modules/fast-glob/node_modules/run-parallel": { + "version": "1.2.0", + "resolved": "https://registry.npmjs.org/run-parallel/-/run-parallel-1.2.0.tgz", + "integrity": "sha512-5l4VyZR86LZ/lDxZTR6jqL8AFE2S0IFLMP26AbjsLVADxHdhB/c0GUsH+y39UfCi3dzz8OlQuPmnaJOMoDHQBA==", + "funding": [ + { + "type": "github", + "url": "https://github.com/sponsors/feross" + }, + { + "type": "patreon", + "url": "https://www.patreon.com/feross" + }, + { + "type": "consulting", + "url": "https://feross.org/support" + } + ], + "license": "MIT", + "dependencies": { + "queue-microtask": "^1.2.2" + } + }, + "node_modules/fraction.js": { + "version": "4.3.7", + "resolved": "https://registry.npmjs.org/fraction.js/-/fraction.js-4.3.7.tgz", + "integrity": "sha512-ZsDfxO51wGAXREY55a7la9LScWpwv9RxIrYABrlvOFBlH/ShPnrtsXeuUIfXKKOVicNxQ+o8JTbJvjS4M89yew==", + "license": "MIT", + "engines": { + "node": "*" + }, + "funding": { + "type": "patreon", + "url": "https://github.com/sponsors/rawify" + } + }, + "node_modules/fsevents": { + "version": "2.3.3", + "resolved": "https://registry.npmjs.org/fsevents/-/fsevents-2.3.3.tgz", + "integrity": "sha512-5xoDfX+fL7faATnagmWPpbFtwh/R77WmMMqqHGS65C3vvB0YHrgF+B1YmZ3441tMj5n63k0212XNoJwzlhffQw==", + "hasInstallScript": true, + "license": "MIT", + "optional": true, + "os": [ + "darwin" + ], + "engines": { + "node": "^8.16.0 || ^10.6.0 || >=11.0.0" + } + }, + "node_modules/glob-parent": { + "version": "6.0.2", + "resolved": "https://registry.npmjs.org/glob-parent/-/glob-parent-6.0.2.tgz", + "integrity": "sha512-XxwI8EOhVQgWp6iDL+3b0r86f4d6AX6zSU55HfB4ydCEuXLXc5FcYeOu+nnGftS4TEju/11rt4KJPTMgbfmv4A==", + "license": "ISC", + "dependencies": { + "is-glob": "^4.0.3" + }, + "engines": { + "node": ">=10.13.0" + } + }, + "node_modules/is-glob": { + "version": "4.0.3", + "resolved": "https://registry.npmjs.org/is-glob/-/is-glob-4.0.3.tgz", + "integrity": "sha512-xelSayHH36ZgE7ZWhli7pW34hNbNl8Ojv5KVmkJD4hBdD3th8Tfk9vYasLM+mXWOZhFkgZfxhLSnrwRr4elSSg==", + "license": "MIT", + "dependencies": { + "is-extglob": "^2.1.1" + }, + "engines": { + "node": ">=0.10.0" + } + }, + "node_modules/is-glob/node_modules/is-extglob": { + "version": "2.1.1", + "resolved": "https://registry.npmjs.org/is-extglob/-/is-extglob-2.1.1.tgz", + "integrity": "sha512-SbKbANkN603Vi4jEZv49LeVJMn4yGwsbzZworEoyEiutsN3nJYdbO36zfhGJ6QEDpOZIFkDtnq5JRxmvl3jsoQ==", + "license": "MIT", + "engines": { + "node": ">=0.10.0" + } + }, + "node_modules/jiti": { + "version": "1.21.6", + "resolved": "https://registry.npmjs.org/jiti/-/jiti-1.21.6.tgz", + "integrity": "sha512-2yTgeWTWzMWkHu6Jp9NKgePDaYHbntiwvYuuJLbbN9vl7DC9DvXKOB2BC3ZZ92D3cvV/aflH0osDfwpHepQ53w==", + "license": "MIT", + "bin": { + "jiti": "bin/jiti.js" + } + }, + "node_modules/lilconfig": { + "version": "2.1.0", + "resolved": "https://registry.npmjs.org/lilconfig/-/lilconfig-2.1.0.tgz", + "integrity": "sha512-utWOt/GHzuUxnLKxB6dk81RoOeoNeHgbrXiuGk4yyF5qlRz+iIVWu56E2fqGHFrXz0QNUhLB/8nKqvRH66JKGQ==", + "license": "MIT", + "engines": { + "node": ">=10" + } + }, + "node_modules/linkify-it": { + "version": "5.0.0", + "resolved": "https://registry.npmjs.org/linkify-it/-/linkify-it-5.0.0.tgz", + "integrity": "sha512-5aHCbzQRADcdP+ATqnDuhhJ/MRIqDkZX5pyjFHRRysS8vZ5AbqGEoFIb6pYHPZ+L/OC2Lc+xT8uHVVR5CAK/wQ==", + "license": "MIT", + "dependencies": { + "uc.micro": "^2.0.0" + } + }, + "node_modules/markdown-it": { + "version": "14.1.0", + "resolved": "https://registry.npmjs.org/markdown-it/-/markdown-it-14.1.0.tgz", + "integrity": "sha512-a54IwgWPaeBCAAsv13YgmALOF1elABB08FxO9i+r4VFk5Vl4pKokRPeX8u5TCgSsPi6ec1otfLjdOpVcgbpshg==", + "license": "MIT", + "dependencies": { + "argparse": "^2.0.1", + "entities": "^4.4.0", + "linkify-it": "^5.0.0", + "mdurl": "^2.0.0", + "punycode.js": "^2.3.1", + "uc.micro": "^2.1.0" + }, + "bin": { + "markdown-it": "bin/markdown-it.mjs" + } + }, + "node_modules/mdurl": { + "version": "2.0.0", + "resolved": "https://registry.npmjs.org/mdurl/-/mdurl-2.0.0.tgz", + "integrity": "sha512-Lf+9+2r+Tdp5wXDXC4PcIBjTDtq4UKjCPMQhKIuzpJNW0b96kVqSwW0bT7FhRSfmAiFYgP+SCRvdrDozfh0U5w==", + "license": "MIT" + }, + "node_modules/micromatch": { + "version": "4.0.8", + "resolved": "https://registry.npmjs.org/micromatch/-/micromatch-4.0.8.tgz", + "integrity": "sha512-PXwfBhYu0hBCPw8Dn0E+WDYb7af3dSLVWKi3HGv84IdF4TyFoC0ysxFd0Goxw7nSv4T/PzEJQxsYsEiFCKo2BA==", + "license": "MIT", + "dependencies": { + "braces": "^3.0.3", + "picomatch": "^2.3.1" + }, + "engines": { + "node": ">=8.6" + } + }, + "node_modules/micromatch/node_modules/braces": { + "version": "3.0.3", + "resolved": "https://registry.npmjs.org/braces/-/braces-3.0.3.tgz", + "integrity": "sha512-yQbXgO/OSZVD2IsiLlro+7Hf6Q18EJrKSEsdoMzKePKXct3gvD8oLcOQdIzGupr5Fj+EDe8gO/lxc1BzfMpxvA==", + "license": "MIT", + "dependencies": { + "fill-range": "^7.1.1" + }, + "engines": { + "node": ">=8" + } + }, + "node_modules/micromatch/node_modules/fill-range": { + "version": "7.1.1", + "resolved": "https://registry.npmjs.org/fill-range/-/fill-range-7.1.1.tgz", + "integrity": "sha512-YsGpe3WHLK8ZYi4tWDg2Jy3ebRz2rXowDxnld4bkQB00cc/1Zw9AWnC0i9ztDJitivtQvaI9KaLyKrc+hBW0yg==", + "license": "MIT", + "dependencies": { + "to-regex-range": "^5.0.1" + }, + "engines": { + "node": ">=8" + } + }, + "node_modules/micromatch/node_modules/is-number": { + "version": "7.0.0", + "resolved": "https://registry.npmjs.org/is-number/-/is-number-7.0.0.tgz", + "integrity": "sha512-41Cifkg6e8TylSpdtTpeLVMqvSBEVzTttHvERD741+pnZ8ANv0004MRL43QKPDlK9cGvNp6NZWZUBlbGXYxxng==", + "license": "MIT", + "engines": { + "node": ">=0.12.0" + } + }, + "node_modules/micromatch/node_modules/picomatch": { + "version": "2.3.1", + "resolved": "https://registry.npmjs.org/picomatch/-/picomatch-2.3.1.tgz", + "integrity": "sha512-JU3teHTNjmE2VCGFzuY8EXzCDVwEqB2a8fsIvwaStHhAWJEeVd1o1QD80CU6+ZdEXXSLbSsuLwJjkCBWqRQUVA==", + "license": "MIT", + "engines": { + "node": ">=8.6" + }, + "funding": { + "url": "https://github.com/sponsors/jonschlinkert" + } + }, + "node_modules/micromatch/node_modules/to-regex-range": { + "version": "5.0.1", + "resolved": "https://registry.npmjs.org/to-regex-range/-/to-regex-range-5.0.1.tgz", + "integrity": "sha512-65P7iz6X5yEr1cwcgvQxbbIw7Uk3gOy5dIdtZ4rDveLqhrdJP+Li/Hx6tyK0NEb+2GCyneCMJiGqrADCSNk8sQ==", + "license": "MIT", + "dependencies": { + "is-number": "^7.0.0" + }, + "engines": { + "node": ">=8.0" + } + }, + "node_modules/normalize-path": { + "version": "3.0.0", + "resolved": "https://registry.npmjs.org/normalize-path/-/normalize-path-3.0.0.tgz", + "integrity": "sha512-6eZs5Ls3WtCisHWp9S2GUy8dqkpGi4BVSz3GaqiE6ezub0512ESztXUwUB6C6IKbQkY2Pnb/mD4WYojCRwcwLA==", + "license": "MIT", + "engines": { + "node": ">=0.10.0" + } + }, + "node_modules/normalize-range": { + "version": "0.1.2", + "resolved": "https://registry.npmjs.org/normalize-range/-/normalize-range-0.1.2.tgz", + "integrity": "sha512-bdok/XvKII3nUpklnV6P2hxtMNrCboOjAcyBuQnWEhO665FwrSNRxU+AqpsyvO6LgGYPspN+lu5CLtw4jPRKNA==", + "license": "MIT", + "engines": { + "node": ">=0.10.0" + } + }, + "node_modules/object-hash": { + "version": "3.0.0", + "resolved": "https://registry.npmjs.org/object-hash/-/object-hash-3.0.0.tgz", + "integrity": "sha512-RSn9F68PjH9HqtltsSnqYC1XXoWe9Bju5+213R98cNGttag9q9yAOTzdbsqvIa7aNm5WffBZFpWYr2aWrklWAw==", + "license": "MIT", + "engines": { + "node": ">= 6" + } + }, + "node_modules/picocolors": { + "version": "1.1.1", + "resolved": "https://registry.npmjs.org/picocolors/-/picocolors-1.1.1.tgz", + "integrity": "sha512-xceH2snhtb5M9liqDsmEw56le376mTZkEX/jEb/RxNFyegNul7eNslCXP9FDj/Lcu0X8KEyMceP2ntpaHrDEVA==", + "license": "ISC" + }, + "node_modules/postcss": { + "version": "8.4.49", + "resolved": "https://registry.npmjs.org/postcss/-/postcss-8.4.49.tgz", + "integrity": "sha512-OCVPnIObs4N29kxTjzLfUryOkvZEq+pf8jTF0lg8E7uETuWHA+v7j3c/xJmiqpX450191LlmZfUKkXxkTry7nA==", + "funding": [ + { + "type": "opencollective", + "url": "https://opencollective.com/postcss/" + }, + { + "type": "tidelift", + "url": "https://tidelift.com/funding/github/npm/postcss" + }, + { + "type": "github", + "url": "https://github.com/sponsors/ai" + } + ], + "license": "MIT", + "dependencies": { + "nanoid": "^3.3.7", + "picocolors": "^1.1.1", + "source-map-js": "^1.2.1" + }, + "engines": { + "node": "^10 || ^12 || >=14" + } + }, + "node_modules/postcss-import": { + "version": "15.1.0", + "resolved": "https://registry.npmjs.org/postcss-import/-/postcss-import-15.1.0.tgz", + "integrity": "sha512-hpr+J05B2FVYUAXHeK1YyI267J/dDDhMU6B6civm8hSY1jYJnBXxzKDKDswzJmtLHryrjhnDjqqp/49t8FALew==", + "license": "MIT", + "dependencies": { + "postcss-value-parser": "^4.0.0", + "read-cache": "^1.0.0", + "resolve": "^1.1.7" + }, + "engines": { + "node": ">=14.0.0" + }, + "peerDependencies": { + "postcss": "^8.0.0" + } + }, + "node_modules/postcss-import/node_modules/pify": { + "version": "2.3.0", + "resolved": "https://registry.npmjs.org/pify/-/pify-2.3.0.tgz", + "integrity": "sha512-udgsAY+fTnvv7kI7aaxbqwWNb0AHiB0qBO89PZKPkoTmGOgdbrHDKD+0B2X4uTfJ/FT1R09r9gTsjUjNJotuog==", + "license": "MIT", + "engines": { + "node": ">=0.10.0" + } + }, + "node_modules/postcss-import/node_modules/read-cache": { + "version": "1.0.0", + "resolved": "https://registry.npmjs.org/read-cache/-/read-cache-1.0.0.tgz", + "integrity": "sha512-Owdv/Ft7IjOgm/i0xvNDZ1LrRANRfew4b2prF3OWMQLxLfu3bS8FVhCsrSCMK4lR56Y9ya+AThoTpDCTxCmpRA==", + "license": "MIT", + "dependencies": { + "pify": "^2.3.0" + } + }, + "node_modules/postcss-js": { + "version": "4.0.1", + "resolved": "https://registry.npmjs.org/postcss-js/-/postcss-js-4.0.1.tgz", + "integrity": "sha512-dDLF8pEO191hJMtlHFPRa8xsizHaM82MLfNkUHdUtVEV3tgTp5oj+8qbEqYM57SLfc74KSbw//4SeJma2LRVIw==", + "license": "MIT", + "dependencies": { + "camelcase-css": "^2.0.1" + }, + "engines": { + "node": "^12 || ^14 || >= 16" + }, + "funding": { + "type": "opencollective", + "url": "https://opencollective.com/postcss/" + }, + "peerDependencies": { + "postcss": "^8.4.21" + } + }, + "node_modules/postcss-js/node_modules/camelcase-css": { + "version": "2.0.1", + "resolved": "https://registry.npmjs.org/camelcase-css/-/camelcase-css-2.0.1.tgz", + "integrity": "sha512-QOSvevhslijgYwRx6Rv7zKdMF8lbRmx+uQGx2+vDc+KI/eBnsy9kit5aj23AgGu3pa4t9AgwbnXWqS+iOY+2aA==", + "license": "MIT", + "engines": { + "node": ">= 6" + } + }, + "node_modules/postcss-load-config": { + "version": "4.0.2", + "resolved": "https://registry.npmjs.org/postcss-load-config/-/postcss-load-config-4.0.2.tgz", + "integrity": "sha512-bSVhyJGL00wMVoPUzAVAnbEoWyqRxkjv64tUl427SKnPrENtq6hJwUojroMz2VB+Q1edmi4IfrAPpami5VVgMQ==", + "funding": [ + { + "type": "opencollective", + "url": "https://opencollective.com/postcss/" + }, + { + "type": "github", + "url": "https://github.com/sponsors/ai" + } + ], + "license": "MIT", + "dependencies": { + "lilconfig": "^3.0.0", + "yaml": "^2.3.4" + }, + "engines": { + "node": ">= 14" + }, + "peerDependencies": { + "postcss": ">=8.0.9", + "ts-node": ">=9.0.0" + }, + "peerDependenciesMeta": { + "postcss": { + "optional": true + }, + "ts-node": { + "optional": true + } + } + }, + "node_modules/postcss-load-config/node_modules/lilconfig": { + "version": "3.1.2", + "resolved": "https://registry.npmjs.org/lilconfig/-/lilconfig-3.1.2.tgz", + "integrity": "sha512-eop+wDAvpItUys0FWkHIKeC9ybYrTGbU41U5K7+bttZZeohvnY7M9dZ5kB21GNWiFT2q1OoPTvncPCgSOVO5ow==", + "license": "MIT", + "engines": { + "node": ">=14" + }, + "funding": { + "url": "https://github.com/sponsors/antonk52" + } + }, + "node_modules/postcss-load-config/node_modules/yaml": { + "version": "2.6.1", + "resolved": "https://registry.npmjs.org/yaml/-/yaml-2.6.1.tgz", + "integrity": "sha512-7r0XPzioN/Q9kXBro/XPnA6kznR73DHq+GXh5ON7ZozRO6aMjbmiBuKste2wslTFkC5d1dw0GooOCepZXJ2SAg==", + "license": "ISC", + "bin": { + "yaml": "bin.mjs" + }, + "engines": { + "node": ">= 14" + } + }, + "node_modules/postcss-nested": { + "version": "6.2.0", + "resolved": "https://registry.npmjs.org/postcss-nested/-/postcss-nested-6.2.0.tgz", + "integrity": "sha512-HQbt28KulC5AJzG+cZtj9kvKB93CFCdLvog1WFLf1D+xmMvPGlBstkpTEZfK5+AN9hfJocyBFCNiqyS48bpgzQ==", + "funding": [ + { + "type": "opencollective", + "url": "https://opencollective.com/postcss/" + }, + { + "type": "github", + "url": "https://github.com/sponsors/ai" + } + ], + "license": "MIT", + "dependencies": { + "postcss-selector-parser": "^6.1.1" + }, + "engines": { + "node": ">=12.0" + }, + "peerDependencies": { + "postcss": "^8.2.14" + } + }, + "node_modules/postcss-selector-parser": { + "version": "6.1.2", + "resolved": "https://registry.npmjs.org/postcss-selector-parser/-/postcss-selector-parser-6.1.2.tgz", + "integrity": "sha512-Q8qQfPiZ+THO/3ZrOrO0cJJKfpYCagtMUkXbnEfmgUjwXg6z/WBeOyS9APBBPCTSiDV+s4SwQGu8yFsiMRIudg==", + "license": "MIT", + "dependencies": { + "cssesc": "^3.0.0", + "util-deprecate": "^1.0.2" + }, + "engines": { + "node": ">=4" + } + }, + "node_modules/postcss-selector-parser/node_modules/cssesc": { + "version": "3.0.0", + "resolved": "https://registry.npmjs.org/cssesc/-/cssesc-3.0.0.tgz", + "integrity": "sha512-/Tb/JcjK111nNScGob5MNtsntNM1aCNUDipB/TkwZFhyDrrE47SOx/18wF2bbjgc3ZzCSKW1T5nt5EbFoAz/Vg==", + "license": "MIT", + "bin": { + "cssesc": "bin/cssesc" + }, + "engines": { + "node": ">=4" + } + }, + "node_modules/postcss-selector-parser/node_modules/util-deprecate": { + "version": "1.0.2", + "resolved": "https://registry.npmjs.org/util-deprecate/-/util-deprecate-1.0.2.tgz", + "integrity": "sha512-EPD5q1uXyFxJpCrLnCc1nHnq3gOa6DZBocAIiI2TaSCA7VCJ1UJDMagCzIkXNsUYfD1daK//LTEQ8xiIbrHtcw==", + "license": "MIT" + }, + "node_modules/postcss-value-parser": { + "version": "4.2.0", + "resolved": "https://registry.npmjs.org/postcss-value-parser/-/postcss-value-parser-4.2.0.tgz", + "integrity": "sha512-1NNCs6uurfkVbeXG4S8JFT9t19m45ICnif8zWLd5oPSZ50QnwMfK+H3jv408d4jw/7Bttv5axS5IiHoLaVNHeQ==", + "license": "MIT" + }, + "node_modules/postcss/node_modules/nanoid": { + "version": "3.3.8", + "resolved": "https://registry.npmjs.org/nanoid/-/nanoid-3.3.8.tgz", + "integrity": "sha512-WNLf5Sd8oZxOm+TzppcYk8gVOgP+l58xNy58D0nbUnOxOWRWvlcCV4kUF7ltmI6PsrLl/BgKEyS4mqsGChFN0w==", + "funding": [ + { + "type": "github", + "url": "https://github.com/sponsors/ai" + } + ], + "license": "MIT", + "bin": { + "nanoid": "bin/nanoid.cjs" + }, + "engines": { + "node": "^10 || ^12 || ^13.7 || ^14 || >=15.0.1" + } + }, + "node_modules/postcss/node_modules/source-map-js": { + "version": "1.2.1", + "resolved": "https://registry.npmjs.org/source-map-js/-/source-map-js-1.2.1.tgz", + "integrity": "sha512-UXWMKhLOwVKb728IUtQPXxfYU+usdybtUrK/8uGE8CQMvrhOpwvzDBwj0QhSL7MQc7vIsISBG8VQ8+IDQxpfQA==", + "license": "BSD-3-Clause", + "engines": { + "node": ">=0.10.0" + } + }, + "node_modules/punycode.js": { + "version": "2.3.1", + "resolved": "https://registry.npmjs.org/punycode.js/-/punycode.js-2.3.1.tgz", + "integrity": "sha512-uxFIHU0YlHYhDQtV4R9J6a52SLx28BCjT+4ieh7IGbgwVJWO+km431c4yRlREUAsAmt/uMjQUyQHNEPf0M39CA==", + "license": "MIT", + "engines": { + "node": ">=6" + } + }, + "node_modules/resolve": { + "version": "1.22.8", + "resolved": "https://registry.npmjs.org/resolve/-/resolve-1.22.8.tgz", + "integrity": "sha512-oKWePCxqpd6FlLvGV1VU0x7bkPmmCNolxzjMf4NczoDnQcIWrAF+cPtZn5i6n+RfD2d9i0tzpKnG6Yk168yIyw==", + "license": "MIT", + "dependencies": { + "is-core-module": "^2.13.0", + "path-parse": "^1.0.7", + "supports-preserve-symlinks-flag": "^1.0.0" + }, + "bin": { + "resolve": "bin/resolve" + }, + "funding": { + "url": "https://github.com/sponsors/ljharb" + } + }, + "node_modules/resolve/node_modules/function-bind": { + "version": "1.1.2", + "resolved": "https://registry.npmjs.org/function-bind/-/function-bind-1.1.2.tgz", + "integrity": "sha512-7XHNxH7qX9xG5mIwxkhumTox/MIRNcOgDrxWsMt2pAr23WHp6MrRlN7FBSFpCpr+oVO0F744iUgR82nJMfG2SA==", + "license": "MIT", + "funding": { + "url": "https://github.com/sponsors/ljharb" + } + }, + "node_modules/resolve/node_modules/hasown": { + "version": "2.0.2", + "resolved": "https://registry.npmjs.org/hasown/-/hasown-2.0.2.tgz", + "integrity": "sha512-0hJU9SCPvmMzIBdZFqNPXWa6dqh7WdH0cII9y+CyS8rG3nL48Bclra9HmKhVVUHyPWNH5Y7xDwAB7bfgSjkUMQ==", + "license": "MIT", + "dependencies": { + "function-bind": "^1.1.2" + }, + "engines": { + "node": ">= 0.4" + } + }, + "node_modules/resolve/node_modules/is-core-module": { + "version": "2.15.1", + "resolved": "https://registry.npmjs.org/is-core-module/-/is-core-module-2.15.1.tgz", + "integrity": "sha512-z0vtXSwucUJtANQWldhbtbt7BnL0vxiFjIdDLAatwhDYty2bad6s+rijD6Ri4YuYJubLzIJLUidCh09e1djEVQ==", + "license": "MIT", + "dependencies": { + "hasown": "^2.0.2" + }, + "engines": { + "node": ">= 0.4" + }, + "funding": { + "url": "https://github.com/sponsors/ljharb" + } + }, + "node_modules/resolve/node_modules/path-parse": { + "version": "1.0.7", + "resolved": "https://registry.npmjs.org/path-parse/-/path-parse-1.0.7.tgz", + "integrity": "sha512-LDJzPVEEEPR+y48z93A0Ed0yXb8pAByGWo/k5YYdYgpY2/2EsOsksJrq7lOHxryrVOn1ejG6oAp8ahvOIQD8sw==", + "license": "MIT" + }, + "node_modules/resolve/node_modules/supports-preserve-symlinks-flag": { + "version": "1.0.0", + "resolved": "https://registry.npmjs.org/supports-preserve-symlinks-flag/-/supports-preserve-symlinks-flag-1.0.0.tgz", + "integrity": "sha512-ot0WnXS9fgdkgIcePe6RHNk1WA8+muPa6cSjeR3V8K27q9BB1rTE3R1p7Hv0z1ZyAc8s6Vvv8DIyWf681MAt0w==", + "license": "MIT", + "engines": { + "node": ">= 0.4" + }, + "funding": { + "url": "https://github.com/sponsors/ljharb" + } + }, + "node_modules/rollup": { + "version": "4.28.0", + "resolved": "https://registry.npmjs.org/rollup/-/rollup-4.28.0.tgz", + "integrity": "sha512-G9GOrmgWHBma4YfCcX8PjH0qhXSdH8B4HDE2o4/jaxj93S4DPCIDoLcXz99eWMji4hB29UFCEd7B2gwGJDR9cQ==", + "license": "MIT", + "dependencies": { + "@types/estree": "1.0.6" + }, + "bin": { + "rollup": "dist/bin/rollup" + }, + "engines": { + "node": ">=18.0.0", + "npm": ">=8.0.0" + }, + "optionalDependencies": { + "@rollup/rollup-android-arm-eabi": "4.28.0", + "@rollup/rollup-android-arm64": "4.28.0", + "@rollup/rollup-darwin-arm64": "4.28.0", + "@rollup/rollup-darwin-x64": "4.28.0", + "@rollup/rollup-freebsd-arm64": "4.28.0", + "@rollup/rollup-freebsd-x64": "4.28.0", + "@rollup/rollup-linux-arm-gnueabihf": "4.28.0", + "@rollup/rollup-linux-arm-musleabihf": "4.28.0", + "@rollup/rollup-linux-arm64-gnu": "4.28.0", + "@rollup/rollup-linux-arm64-musl": "4.28.0", + "@rollup/rollup-linux-powerpc64le-gnu": "4.28.0", + "@rollup/rollup-linux-riscv64-gnu": "4.28.0", + "@rollup/rollup-linux-s390x-gnu": "4.28.0", + "@rollup/rollup-linux-x64-gnu": "4.28.0", + "@rollup/rollup-linux-x64-musl": "4.28.0", + "@rollup/rollup-win32-arm64-msvc": "4.28.0", + "@rollup/rollup-win32-ia32-msvc": "4.28.0", + "@rollup/rollup-win32-x64-msvc": "4.28.0", + "fsevents": "~2.3.2" + } + }, + "node_modules/rollup/node_modules/@rollup/rollup-darwin-arm64": { + "version": "4.28.0", + "resolved": "https://registry.npmjs.org/@rollup/rollup-darwin-arm64/-/rollup-darwin-arm64-4.28.0.tgz", + "integrity": "sha512-lmKx9yHsppblnLQZOGxdO66gT77bvdBtr/0P+TPOseowE7D9AJoBw8ZDULRasXRWf1Z86/gcOdpBrV6VDUY36Q==", + "cpu": [ + "arm64" + ], + "license": "MIT", + "optional": true, + "os": [ + "darwin" + ] + }, + "node_modules/rollup/node_modules/@types/estree": { + "version": "1.0.6", + "resolved": "https://registry.npmjs.org/@types/estree/-/estree-1.0.6.tgz", + "integrity": "sha512-AYnb1nQyY49te+VRAVgmzfcgjYS91mY5P0TKUDCLEM+gNnA+3T6rWITXRLYCpahpqSQbN5cE+gHpnPyXjHWxcw==", + "license": "MIT" + }, + "node_modules/sucrase": { + "version": "3.35.0", + "resolved": "https://registry.npmjs.org/sucrase/-/sucrase-3.35.0.tgz", + "integrity": "sha512-8EbVDiu9iN/nESwxeSxDKe0dunta1GOlHufmSSXxMD2z2/tMZpDMpvXQGsc+ajGo8y2uYUmixaSRUc/QPoQ0GA==", + "license": "MIT", + "dependencies": { + "@jridgewell/gen-mapping": "^0.3.2", + "commander": "^4.0.0", + "glob": "^10.3.10", + "lines-and-columns": "^1.1.6", + "mz": "^2.7.0", + "pirates": "^4.0.1", + "ts-interface-checker": "^0.1.9" + }, + "bin": { + "sucrase": "bin/sucrase", + "sucrase-node": "bin/sucrase-node" + }, + "engines": { + "node": ">=16 || 14 >=14.17" + } + }, + "node_modules/sucrase/node_modules/@isaacs/cliui": { + "version": "8.0.2", + "resolved": "https://registry.npmjs.org/@isaacs/cliui/-/cliui-8.0.2.tgz", + "integrity": "sha512-O8jcjabXaleOG9DQ0+ARXWZBTfnP4WNAqzuiJK7ll44AmxGKv/J2M4TPjxjY3znBCfvBXFzucm1twdyFybFqEA==", + "license": "ISC", + "dependencies": { + "string-width": "^5.1.2", + "string-width-cjs": "npm:string-width@^4.2.0", + "strip-ansi": "^7.0.1", + "strip-ansi-cjs": "npm:strip-ansi@^6.0.1", + "wrap-ansi": "^8.1.0", + "wrap-ansi-cjs": "npm:wrap-ansi@^7.0.0" + }, + "engines": { + "node": ">=12" + } + }, + "node_modules/sucrase/node_modules/@jridgewell/gen-mapping": { + "version": "0.3.5", + "resolved": "https://registry.npmjs.org/@jridgewell/gen-mapping/-/gen-mapping-0.3.5.tgz", + "integrity": "sha512-IzL8ZoEDIBRWEzlCcRhOaCupYyN5gdIK+Q6fbFdPDg6HqX6jpkItn7DFIpW9LQzXG6Df9sA7+OKnq0qlz/GaQg==", + "license": "MIT", + "dependencies": { + "@jridgewell/set-array": "^1.2.1", + "@jridgewell/sourcemap-codec": "^1.4.10", + "@jridgewell/trace-mapping": "^0.3.24" + }, + "engines": { + "node": ">=6.0.0" + } + }, + "node_modules/sucrase/node_modules/@jridgewell/resolve-uri": { + "version": "3.1.2", + "resolved": "https://registry.npmjs.org/@jridgewell/resolve-uri/-/resolve-uri-3.1.2.tgz", + "integrity": "sha512-bRISgCIjP20/tbWSPWMEi54QVPRZExkuD9lJL+UIxUKtwVJA8wW1Trb1jMs1RFXo1CBTNZ/5hpC9QvmKWdopKw==", + "license": "MIT", + "engines": { + "node": ">=6.0.0" + } + }, + "node_modules/sucrase/node_modules/@jridgewell/set-array": { + "version": "1.2.1", + "resolved": "https://registry.npmjs.org/@jridgewell/set-array/-/set-array-1.2.1.tgz", + "integrity": "sha512-R8gLRTZeyp03ymzP/6Lil/28tGeGEzhx1q2k703KGWRAI1VdvPIXdG70VJc2pAMw3NA6JKL5hhFu1sJX0Mnn/A==", + "license": "MIT", + "engines": { + "node": ">=6.0.0" + } + }, + "node_modules/sucrase/node_modules/@jridgewell/sourcemap-codec": { + "version": "1.5.0", + "resolved": "https://registry.npmjs.org/@jridgewell/sourcemap-codec/-/sourcemap-codec-1.5.0.tgz", + "integrity": "sha512-gv3ZRaISU3fjPAgNsriBRqGWQL6quFx04YMPW/zD8XMLsU32mhCCbfbO6KZFLjvYpCZ8zyDEgqsgf+PwPaM7GQ==", + "license": "MIT" + }, + "node_modules/sucrase/node_modules/@jridgewell/trace-mapping": { + "version": "0.3.25", + "resolved": "https://registry.npmjs.org/@jridgewell/trace-mapping/-/trace-mapping-0.3.25.tgz", + "integrity": "sha512-vNk6aEwybGtawWmy/PzwnGDOjCkLWSD2wqvjGGAgOAwCGWySYXfYoxt00IJkTF+8Lb57DwOb3Aa0o9CApepiYQ==", + "license": "MIT", + "dependencies": { + "@jridgewell/resolve-uri": "^3.1.0", + "@jridgewell/sourcemap-codec": "^1.4.14" + } + }, + "node_modules/sucrase/node_modules/@pkgjs/parseargs": { + "version": "0.11.0", + "resolved": "https://registry.npmjs.org/@pkgjs/parseargs/-/parseargs-0.11.0.tgz", + "integrity": "sha512-+1VkjdD0QBLPodGrJUeqarH8VAIvQODIbwh9XpP5Syisf7YoQgsJKPNFoqqLQlu+VQ/tVSshMR6loPMn8U+dPg==", + "license": "MIT", + "optional": true, + "engines": { + "node": ">=14" + } + }, + "node_modules/sucrase/node_modules/ansi-regex": { + "version": "6.1.0", + "resolved": "https://registry.npmjs.org/ansi-regex/-/ansi-regex-6.1.0.tgz", + "integrity": "sha512-7HSX4QQb4CspciLpVFwyRe79O3xsIZDDLER21kERQ71oaPodF8jL725AgJMFAYbooIqolJoRLuM81SpeUkpkvA==", + "license": "MIT", + "engines": { + "node": ">=12" + }, + "funding": { + "url": "https://github.com/chalk/ansi-regex?sponsor=1" + } + }, + "node_modules/sucrase/node_modules/ansi-styles": { + "version": "6.2.1", + "resolved": "https://registry.npmjs.org/ansi-styles/-/ansi-styles-6.2.1.tgz", + "integrity": "sha512-bN798gFfQX+viw3R7yrGWRqnrN2oRkEkUjjl4JNn4E8GxxbjtG3FbrEIIY3l8/hrwUwIeCZvi4QuOTP4MErVug==", + "license": "MIT", + "engines": { + "node": ">=12" + }, + "funding": { + "url": "https://github.com/chalk/ansi-styles?sponsor=1" + } + }, + "node_modules/sucrase/node_modules/any-promise": { + "version": "1.3.0", + "resolved": "https://registry.npmjs.org/any-promise/-/any-promise-1.3.0.tgz", + "integrity": "sha512-7UvmKalWRt1wgjL1RrGxoSJW/0QZFIegpeGvZG9kjp8vrRu55XTHbwnqq2GpXm9uLbcuhxm3IqX9OB4MZR1b2A==", + "license": "MIT" + }, + "node_modules/sucrase/node_modules/balanced-match": { + "version": "1.0.2", + "resolved": "https://registry.npmjs.org/balanced-match/-/balanced-match-1.0.2.tgz", + "integrity": "sha512-3oSeUO0TMV67hN1AmbXsK4yaqU7tjiHlbxRDZOpH0KW9+CeX4bRAaX0Anxt0tx2MrpRpWwQaPwIlISEJhYU5Pw==", + "license": "MIT" + }, + "node_modules/sucrase/node_modules/brace-expansion": { + "version": "2.0.1", + "resolved": "https://registry.npmjs.org/brace-expansion/-/brace-expansion-2.0.1.tgz", + "integrity": "sha512-XnAIvQ8eM+kC6aULx6wuQiwVsnzsi9d3WxzV3FpWTGA19F621kwdbsAcFKXgKUHZWsy+mY6iL1sHTxWEFCytDA==", + "license": "MIT", + "dependencies": { + "balanced-match": "^1.0.0" + } + }, + "node_modules/sucrase/node_modules/color-convert": { + "version": "2.0.1", + "resolved": "https://registry.npmjs.org/color-convert/-/color-convert-2.0.1.tgz", + "integrity": "sha512-RRECPsj7iu/xb5oKYcsFHSppFNnsj/52OVTRKb4zP5onXwVF3zVmmToNcOfGC+CRDpfK/U584fMg38ZHCaElKQ==", + "license": "MIT", + "dependencies": { + "color-name": "~1.1.4" + }, + "engines": { + "node": ">=7.0.0" + } + }, + "node_modules/sucrase/node_modules/color-name": { + "version": "1.1.4", + "resolved": "https://registry.npmjs.org/color-name/-/color-name-1.1.4.tgz", + "integrity": "sha512-dOy+3AuW3a2wNbZHIuMZpTcgjGuLU/uBL/ubcZF9OXbDo8ff4O8yVp5Bf0efS8uEoYo5q4Fx7dY9OgQGXgAsQA==", + "license": "MIT" + }, + "node_modules/sucrase/node_modules/commander": { + "version": "4.1.1", + "resolved": "https://registry.npmjs.org/commander/-/commander-4.1.1.tgz", + "integrity": "sha512-NOKm8xhkzAjzFx8B2v5OAHT+u5pRQc2UCa2Vq9jYL/31o2wi9mxBA7LIFs3sV5VSC49z6pEhfbMULvShKj26WA==", + "license": "MIT", + "engines": { + "node": ">= 6" + } + }, + "node_modules/sucrase/node_modules/cross-spawn": { + "version": "7.0.6", + "resolved": "https://registry.npmjs.org/cross-spawn/-/cross-spawn-7.0.6.tgz", + "integrity": "sha512-uV2QOWP2nWzsy2aMp8aRibhi9dlzF5Hgh5SHaB9OiTGEyDTiJJyx0uy51QXdyWbtAHNua4XJzUKca3OzKUd3vA==", + "license": "MIT", + "dependencies": { + "path-key": "^3.1.0", + "shebang-command": "^2.0.0", + "which": "^2.0.1" + }, + "engines": { + "node": ">= 8" + } + }, + "node_modules/sucrase/node_modules/eastasianwidth": { + "version": "0.2.0", + "resolved": "https://registry.npmjs.org/eastasianwidth/-/eastasianwidth-0.2.0.tgz", + "integrity": "sha512-I88TYZWc9XiYHRQ4/3c5rjjfgkjhLyW2luGIheGERbNQ6OY7yTybanSpDXZa8y7VUP9YmDcYa+eyq4ca7iLqWA==", + "license": "MIT" + }, + "node_modules/sucrase/node_modules/emoji-regex": { + "version": "9.2.2", + "resolved": "https://registry.npmjs.org/emoji-regex/-/emoji-regex-9.2.2.tgz", + "integrity": "sha512-L18DaJsXSUk2+42pv8mLs5jJT2hqFkFE4j21wOmgbUqsZ2hL72NsUU785g9RXgo3s0ZNgVl42TiHp3ZtOv/Vyg==", + "license": "MIT" + }, + "node_modules/sucrase/node_modules/foreground-child": { + "version": "3.3.0", + "resolved": "https://registry.npmjs.org/foreground-child/-/foreground-child-3.3.0.tgz", + "integrity": "sha512-Ld2g8rrAyMYFXBhEqMz8ZAHBi4J4uS1i/CxGMDnjyFWddMXLVcDp051DZfu+t7+ab7Wv6SMqpWmyFIj5UbfFvg==", + "license": "ISC", + "dependencies": { + "cross-spawn": "^7.0.0", + "signal-exit": "^4.0.1" + }, + "engines": { + "node": ">=14" + }, + "funding": { + "url": "https://github.com/sponsors/isaacs" + } + }, + "node_modules/sucrase/node_modules/glob": { + "version": "10.4.5", + "resolved": "https://registry.npmjs.org/glob/-/glob-10.4.5.tgz", + "integrity": "sha512-7Bv8RF0k6xjo7d4A/PxYLbUCfb6c+Vpd2/mB2yRDlew7Jb5hEXiCD9ibfO7wpk8i4sevK6DFny9h7EYbM3/sHg==", + "license": "ISC", + "dependencies": { + "foreground-child": "^3.1.0", + "jackspeak": "^3.1.2", + "minimatch": "^9.0.4", + "minipass": "^7.1.2", + "package-json-from-dist": "^1.0.0", + "path-scurry": "^1.11.1" + }, + "bin": { + "glob": "dist/esm/bin.mjs" + }, + "funding": { + "url": "https://github.com/sponsors/isaacs" + } + }, + "node_modules/sucrase/node_modules/is-fullwidth-code-point": { + "version": "3.0.0", + "resolved": "https://registry.npmjs.org/is-fullwidth-code-point/-/is-fullwidth-code-point-3.0.0.tgz", + "integrity": "sha512-zymm5+u+sCsSWyD9qNaejV3DFvhCKclKdizYaJUuHA83RLjb7nSuGnddCHGv0hk+KY7BMAlsWeK4Ueg6EV6XQg==", + "license": "MIT", + "engines": { + "node": ">=8" + } + }, + "node_modules/sucrase/node_modules/isexe": { + "version": "2.0.0", + "resolved": "https://registry.npmjs.org/isexe/-/isexe-2.0.0.tgz", + "integrity": "sha512-RHxMLp9lnKHGHRng9QFhRCMbYAcVpn69smSGcq3f36xjgVVWThj4qqLbTLlq7Ssj8B+fIQ1EuCEGI2lKsyQeIw==", + "license": "ISC" + }, + "node_modules/sucrase/node_modules/jackspeak": { + "version": "3.4.3", + "resolved": "https://registry.npmjs.org/jackspeak/-/jackspeak-3.4.3.tgz", + "integrity": "sha512-OGlZQpz2yfahA/Rd1Y8Cd9SIEsqvXkLVoSw/cgwhnhFMDbsQFeZYoJJ7bIZBS9BcamUW96asq/npPWugM+RQBw==", + "license": "BlueOak-1.0.0", + "dependencies": { + "@isaacs/cliui": "^8.0.2" + }, + "funding": { + "url": "https://github.com/sponsors/isaacs" + }, + "optionalDependencies": { + "@pkgjs/parseargs": "^0.11.0" + } + }, + "node_modules/sucrase/node_modules/lines-and-columns": { + "version": "1.2.4", + "resolved": "https://registry.npmjs.org/lines-and-columns/-/lines-and-columns-1.2.4.tgz", + "integrity": "sha512-7ylylesZQ/PV29jhEDl3Ufjo6ZX7gCqJr5F7PKrqc93v7fzSymt1BpwEU8nAUXs8qzzvqhbjhK5QZg6Mt/HkBg==", + "license": "MIT" + }, + "node_modules/sucrase/node_modules/lru-cache": { + "version": "10.4.3", + "resolved": "https://registry.npmjs.org/lru-cache/-/lru-cache-10.4.3.tgz", + "integrity": "sha512-JNAzZcXrCt42VGLuYz0zfAzDfAvJWW6AfYlDBQyDV5DClI2m5sAmK+OIO7s59XfsRsWHp02jAJrRadPRGTt6SQ==", + "license": "ISC" + }, + "node_modules/sucrase/node_modules/minimatch": { + "version": "9.0.5", + "resolved": "https://registry.npmjs.org/minimatch/-/minimatch-9.0.5.tgz", + "integrity": "sha512-G6T0ZX48xgozx7587koeX9Ys2NYy6Gmv//P89sEte9V9whIapMNF4idKxnW2QtCcLiTWlb/wfCabAtAFWhhBow==", + "license": "ISC", + "dependencies": { + "brace-expansion": "^2.0.1" + }, + "engines": { + "node": ">=16 || 14 >=14.17" + }, + "funding": { + "url": "https://github.com/sponsors/isaacs" + } + }, + "node_modules/sucrase/node_modules/minipass": { + "version": "7.1.2", + "resolved": "https://registry.npmjs.org/minipass/-/minipass-7.1.2.tgz", + "integrity": "sha512-qOOzS1cBTWYF4BH8fVePDBOO9iptMnGUEZwNc/cMWnTV2nVLZ7VoNWEPHkYczZA0pdoA7dl6e7FL659nX9S2aw==", + "license": "ISC", + "engines": { + "node": ">=16 || 14 >=14.17" + } + }, + "node_modules/sucrase/node_modules/mz": { + "version": "2.7.0", + "resolved": "https://registry.npmjs.org/mz/-/mz-2.7.0.tgz", + "integrity": "sha512-z81GNO7nnYMEhrGh9LeymoE4+Yr0Wn5McHIZMK5cfQCl+NDX08sCZgUc9/6MHni9IWuFLm1Z3HTCXu2z9fN62Q==", + "license": "MIT", + "dependencies": { + "any-promise": "^1.0.0", + "object-assign": "^4.0.1", + "thenify-all": "^1.0.0" + } + }, + "node_modules/sucrase/node_modules/object-assign": { + "version": "4.1.1", + "resolved": "https://registry.npmjs.org/object-assign/-/object-assign-4.1.1.tgz", + "integrity": "sha512-rJgTQnkUnH1sFw8yT6VSU3zD3sWmu6sZhIseY8VX+GRu3P6F7Fu+JNDoXfklElbLJSnc3FUQHVe4cU5hj+BcUg==", + "license": "MIT", + "engines": { + "node": ">=0.10.0" + } + }, + "node_modules/sucrase/node_modules/package-json-from-dist": { + "version": "1.0.1", + "resolved": "https://registry.npmjs.org/package-json-from-dist/-/package-json-from-dist-1.0.1.tgz", + "integrity": "sha512-UEZIS3/by4OC8vL3P2dTXRETpebLI2NiI5vIrjaD/5UtrkFX/tNbwjTSRAGC/+7CAo2pIcBaRgWmcBBHcsaCIw==", + "license": "BlueOak-1.0.0" + }, + "node_modules/sucrase/node_modules/path-key": { + "version": "3.1.1", + "resolved": "https://registry.npmjs.org/path-key/-/path-key-3.1.1.tgz", + "integrity": "sha512-ojmeN0qd+y0jszEtoY48r0Peq5dwMEkIlCOu6Q5f41lfkswXuKtYrhgoTpLnyIcHm24Uhqx+5Tqm2InSwLhE6Q==", + "license": "MIT", + "engines": { + "node": ">=8" + } + }, + "node_modules/sucrase/node_modules/path-scurry": { + "version": "1.11.1", + "resolved": "https://registry.npmjs.org/path-scurry/-/path-scurry-1.11.1.tgz", + "integrity": "sha512-Xa4Nw17FS9ApQFJ9umLiJS4orGjm7ZzwUrwamcGQuHSzDyth9boKDaycYdDcZDuqYATXw4HFXgaqWTctW/v1HA==", + "license": "BlueOak-1.0.0", + "dependencies": { + "lru-cache": "^10.2.0", + "minipass": "^5.0.0 || ^6.0.2 || ^7.0.0" + }, + "engines": { + "node": ">=16 || 14 >=14.18" + }, + "funding": { + "url": "https://github.com/sponsors/isaacs" + } + }, + "node_modules/sucrase/node_modules/pirates": { + "version": "4.0.6", + "resolved": "https://registry.npmjs.org/pirates/-/pirates-4.0.6.tgz", + "integrity": "sha512-saLsH7WeYYPiD25LDuLRRY/i+6HaPYr6G1OUlN39otzkSTxKnubR9RTxS3/Kk50s1g2JTgFwWQDQyplC5/SHZg==", + "license": "MIT", + "engines": { + "node": ">= 6" + } + }, + "node_modules/sucrase/node_modules/shebang-command": { + "version": "2.0.0", + "resolved": "https://registry.npmjs.org/shebang-command/-/shebang-command-2.0.0.tgz", + "integrity": "sha512-kHxr2zZpYtdmrN1qDjrrX/Z1rR1kG8Dx+gkpK1G4eXmvXswmcE1hTWBWYUzlraYw1/yZp6YuDY77YtvbN0dmDA==", + "license": "MIT", + "dependencies": { + "shebang-regex": "^3.0.0" + }, + "engines": { + "node": ">=8" + } + }, + "node_modules/sucrase/node_modules/shebang-regex": { + "version": "3.0.0", + "resolved": "https://registry.npmjs.org/shebang-regex/-/shebang-regex-3.0.0.tgz", + "integrity": "sha512-7++dFhtcx3353uBaq8DDR4NuxBetBzC7ZQOhmTQInHEd6bSrXdiEyzCvG07Z44UYdLShWUyXt5M/yhz8ekcb1A==", + "license": "MIT", + "engines": { + "node": ">=8" + } + }, + "node_modules/sucrase/node_modules/signal-exit": { + "version": "4.1.0", + "resolved": "https://registry.npmjs.org/signal-exit/-/signal-exit-4.1.0.tgz", + "integrity": "sha512-bzyZ1e88w9O1iNJbKnOlvYTrWPDl46O1bG0D3XInv+9tkPrxrN8jUUTiFlDkkmKWgn1M6CfIA13SuGqOa9Korw==", + "license": "ISC", + "engines": { + "node": ">=14" + }, + "funding": { + "url": "https://github.com/sponsors/isaacs" + } + }, + "node_modules/sucrase/node_modules/string-width": { + "version": "5.1.2", + "resolved": "https://registry.npmjs.org/string-width/-/string-width-5.1.2.tgz", + "integrity": "sha512-HnLOCR3vjcY8beoNLtcjZ5/nxn2afmME6lhrDrebokqMap+XbeW8n9TXpPDOqdGK5qcI3oT0GKTW6wC7EMiVqA==", + "license": "MIT", + "dependencies": { + "eastasianwidth": "^0.2.0", + "emoji-regex": "^9.2.2", + "strip-ansi": "^7.0.1" + }, + "engines": { + "node": ">=12" + }, + "funding": { + "url": "https://github.com/sponsors/sindresorhus" + } + }, + "node_modules/sucrase/node_modules/string-width-cjs": { + "name": "string-width", + "version": "4.2.3", + "resolved": "https://registry.npmjs.org/string-width/-/string-width-4.2.3.tgz", + "integrity": "sha512-wKyQRQpjJ0sIp62ErSZdGsjMJWsap5oRNihHhu6G7JVO/9jIB6UyevL+tXuOqrng8j/cxKTWyWUwvSTriiZz/g==", + "license": "MIT", + "dependencies": { + "emoji-regex": "^8.0.0", + "is-fullwidth-code-point": "^3.0.0", + "strip-ansi": "^6.0.1" + }, + "engines": { + "node": ">=8" + } + }, + "node_modules/sucrase/node_modules/string-width-cjs/node_modules/ansi-regex": { + "version": "5.0.1", + "resolved": "https://registry.npmjs.org/ansi-regex/-/ansi-regex-5.0.1.tgz", + "integrity": "sha512-quJQXlTSUGL2LH9SUXo8VwsY4soanhgo6LNSm84E1LBcE8s3O0wpdiRzyR9z/ZZJMlMWv37qOOb9pdJlMUEKFQ==", + "license": "MIT", + "engines": { + "node": ">=8" + } + }, + "node_modules/sucrase/node_modules/string-width-cjs/node_modules/emoji-regex": { + "version": "8.0.0", + "resolved": "https://registry.npmjs.org/emoji-regex/-/emoji-regex-8.0.0.tgz", + "integrity": "sha512-MSjYzcWNOA0ewAHpz0MxpYFvwg6yjy1NG3xteoqz644VCo/RPgnr1/GGt+ic3iJTzQ8Eu3TdM14SawnVUmGE6A==", + "license": "MIT" + }, + "node_modules/sucrase/node_modules/string-width-cjs/node_modules/strip-ansi": { + "version": "6.0.1", + "resolved": "https://registry.npmjs.org/strip-ansi/-/strip-ansi-6.0.1.tgz", + "integrity": "sha512-Y38VPSHcqkFrCpFnQ9vuSXmquuv5oXOKpGeT6aGrr3o3Gc9AlVa6JBfUSOCnbxGGZF+/0ooI7KrPuUSztUdU5A==", + "license": "MIT", + "dependencies": { + "ansi-regex": "^5.0.1" + }, + "engines": { + "node": ">=8" + } + }, + "node_modules/sucrase/node_modules/strip-ansi": { + "version": "7.1.0", + "resolved": "https://registry.npmjs.org/strip-ansi/-/strip-ansi-7.1.0.tgz", + "integrity": "sha512-iq6eVVI64nQQTRYq2KtEg2d2uU7LElhTJwsH4YzIHZshxlgZms/wIc4VoDQTlG/IvVIrBKG06CrZnp0qv7hkcQ==", + "license": "MIT", + "dependencies": { + "ansi-regex": "^6.0.1" + }, + "engines": { + "node": ">=12" + }, + "funding": { + "url": "https://github.com/chalk/strip-ansi?sponsor=1" + } + }, + "node_modules/sucrase/node_modules/strip-ansi-cjs": { + "name": "strip-ansi", + "version": "6.0.1", + "resolved": "https://registry.npmjs.org/strip-ansi/-/strip-ansi-6.0.1.tgz", + "integrity": "sha512-Y38VPSHcqkFrCpFnQ9vuSXmquuv5oXOKpGeT6aGrr3o3Gc9AlVa6JBfUSOCnbxGGZF+/0ooI7KrPuUSztUdU5A==", + "license": "MIT", + "dependencies": { + "ansi-regex": "^5.0.1" + }, + "engines": { + "node": ">=8" + } + }, + "node_modules/sucrase/node_modules/strip-ansi-cjs/node_modules/ansi-regex": { + "version": "5.0.1", + "resolved": "https://registry.npmjs.org/ansi-regex/-/ansi-regex-5.0.1.tgz", + "integrity": "sha512-quJQXlTSUGL2LH9SUXo8VwsY4soanhgo6LNSm84E1LBcE8s3O0wpdiRzyR9z/ZZJMlMWv37qOOb9pdJlMUEKFQ==", + "license": "MIT", + "engines": { + "node": ">=8" + } + }, + "node_modules/sucrase/node_modules/thenify": { + "version": "3.3.1", + "resolved": "https://registry.npmjs.org/thenify/-/thenify-3.3.1.tgz", + "integrity": "sha512-RVZSIV5IG10Hk3enotrhvz0T9em6cyHBLkH/YAZuKqd8hRkKhSfCGIcP2KUY0EPxndzANBmNllzWPwak+bheSw==", + "license": "MIT", + "dependencies": { + "any-promise": "^1.0.0" + } + }, + "node_modules/sucrase/node_modules/thenify-all": { + "version": "1.6.0", + "resolved": "https://registry.npmjs.org/thenify-all/-/thenify-all-1.6.0.tgz", + "integrity": "sha512-RNxQH/qI8/t3thXJDwcstUO4zeqo64+Uy/+sNVRBx4Xn2OX+OZ9oP+iJnNFqplFra2ZUVeKCSa2oVWi3T4uVmA==", + "license": "MIT", + "dependencies": { + "thenify": ">= 3.1.0 < 4" + }, + "engines": { + "node": ">=0.8" + } + }, + "node_modules/sucrase/node_modules/ts-interface-checker": { + "version": "0.1.13", + "resolved": "https://registry.npmjs.org/ts-interface-checker/-/ts-interface-checker-0.1.13.tgz", + "integrity": "sha512-Y/arvbn+rrz3JCKl9C4kVNfTfSm2/mEp5FSz5EsZSANGPSlQrpRI5M4PKF+mJnE52jOO90PnPSc3Ur3bTQw0gA==", + "license": "Apache-2.0" + }, + "node_modules/sucrase/node_modules/which": { + "version": "2.0.2", + "resolved": "https://registry.npmjs.org/which/-/which-2.0.2.tgz", + "integrity": "sha512-BLI3Tl1TW3Pvl70l3yq3Y64i+awpwXqsGBYWkkqMtnbXgrMD+yj7rhW0kuEDxzJaYXGjEW5ogapKNMEKNMjibA==", + "license": "ISC", + "dependencies": { + "isexe": "^2.0.0" + }, + "bin": { + "node-which": "bin/node-which" + }, + "engines": { + "node": ">= 8" + } + }, + "node_modules/sucrase/node_modules/wrap-ansi": { + "version": "8.1.0", + "resolved": "https://registry.npmjs.org/wrap-ansi/-/wrap-ansi-8.1.0.tgz", + "integrity": "sha512-si7QWI6zUMq56bESFvagtmzMdGOtoxfR+Sez11Mobfc7tm+VkUckk9bW2UeffTGVUbOksxmSw0AA2gs8g71NCQ==", + "license": "MIT", + "dependencies": { + "ansi-styles": "^6.1.0", + "string-width": "^5.0.1", + "strip-ansi": "^7.0.1" + }, + "engines": { + "node": ">=12" + }, + "funding": { + "url": "https://github.com/chalk/wrap-ansi?sponsor=1" + } + }, + "node_modules/sucrase/node_modules/wrap-ansi-cjs": { + "name": "wrap-ansi", + "version": "7.0.0", + "resolved": "https://registry.npmjs.org/wrap-ansi/-/wrap-ansi-7.0.0.tgz", + "integrity": "sha512-YVGIj2kamLSTxw6NsZjoBxfSwsn0ycdesmc4p+Q21c5zPuZ1pl+NfxVdxPtdHvmNVOQ6XSYG4AUtyt/Fi7D16Q==", + "license": "MIT", + "dependencies": { + "ansi-styles": "^4.0.0", + "string-width": "^4.1.0", + "strip-ansi": "^6.0.0" + }, + "engines": { + "node": ">=10" + }, + "funding": { + "url": "https://github.com/chalk/wrap-ansi?sponsor=1" + } + }, + "node_modules/sucrase/node_modules/wrap-ansi-cjs/node_modules/ansi-regex": { + "version": "5.0.1", + "resolved": "https://registry.npmjs.org/ansi-regex/-/ansi-regex-5.0.1.tgz", + "integrity": "sha512-quJQXlTSUGL2LH9SUXo8VwsY4soanhgo6LNSm84E1LBcE8s3O0wpdiRzyR9z/ZZJMlMWv37qOOb9pdJlMUEKFQ==", + "license": "MIT", + "engines": { + "node": ">=8" + } + }, + "node_modules/sucrase/node_modules/wrap-ansi-cjs/node_modules/ansi-styles": { + "version": "4.3.0", + "resolved": "https://registry.npmjs.org/ansi-styles/-/ansi-styles-4.3.0.tgz", + "integrity": "sha512-zbB9rCJAT1rbjiVDb2hqKFHNYLxgtk8NURxZ3IZwD3F6NtxbXZQCnnSi1Lkx+IDohdPlFp222wVALIheZJQSEg==", + "license": "MIT", + "dependencies": { + "color-convert": "^2.0.1" + }, + "engines": { + "node": ">=8" + }, + "funding": { + "url": "https://github.com/chalk/ansi-styles?sponsor=1" + } + }, + "node_modules/sucrase/node_modules/wrap-ansi-cjs/node_modules/emoji-regex": { + "version": "8.0.0", + "resolved": "https://registry.npmjs.org/emoji-regex/-/emoji-regex-8.0.0.tgz", + "integrity": "sha512-MSjYzcWNOA0ewAHpz0MxpYFvwg6yjy1NG3xteoqz644VCo/RPgnr1/GGt+ic3iJTzQ8Eu3TdM14SawnVUmGE6A==", + "license": "MIT" + }, + "node_modules/sucrase/node_modules/wrap-ansi-cjs/node_modules/string-width": { + "version": "4.2.3", + "resolved": "https://registry.npmjs.org/string-width/-/string-width-4.2.3.tgz", + "integrity": "sha512-wKyQRQpjJ0sIp62ErSZdGsjMJWsap5oRNihHhu6G7JVO/9jIB6UyevL+tXuOqrng8j/cxKTWyWUwvSTriiZz/g==", + "license": "MIT", + "dependencies": { + "emoji-regex": "^8.0.0", + "is-fullwidth-code-point": "^3.0.0", + "strip-ansi": "^6.0.1" + }, + "engines": { + "node": ">=8" + } + }, + "node_modules/sucrase/node_modules/wrap-ansi-cjs/node_modules/strip-ansi": { + "version": "6.0.1", + "resolved": "https://registry.npmjs.org/strip-ansi/-/strip-ansi-6.0.1.tgz", + "integrity": "sha512-Y38VPSHcqkFrCpFnQ9vuSXmquuv5oXOKpGeT6aGrr3o3Gc9AlVa6JBfUSOCnbxGGZF+/0ooI7KrPuUSztUdU5A==", + "license": "MIT", + "dependencies": { + "ansi-regex": "^5.0.1" + }, + "engines": { + "node": ">=8" + } + }, + "node_modules/tailwindcss": { + "version": "3.4.15", + "resolved": "https://registry.npmjs.org/tailwindcss/-/tailwindcss-3.4.15.tgz", + "integrity": "sha512-r4MeXnfBmSOuKUWmXe6h2CcyfzJCEk4F0pptO5jlnYSIViUkVmsawj80N5h2lO3gwcmSb4n3PuN+e+GC1Guylw==", + "license": "MIT", + "dependencies": { + "@alloc/quick-lru": "^5.2.0", + "arg": "^5.0.2", + "chokidar": "^3.6.0", + "didyoumean": "^1.2.2", + "dlv": "^1.1.3", + "fast-glob": "^3.3.2", + "glob-parent": "^6.0.2", + "is-glob": "^4.0.3", + "jiti": "^1.21.6", + "lilconfig": "^2.1.0", + "micromatch": "^4.0.8", + "normalize-path": "^3.0.0", + "object-hash": "^3.0.0", + "picocolors": "^1.1.1", + "postcss": "^8.4.47", + "postcss-import": "^15.1.0", + "postcss-js": "^4.0.1", + "postcss-load-config": "^4.0.2", + "postcss-nested": "^6.2.0", + "postcss-selector-parser": "^6.1.2", + "resolve": "^1.22.8", + "sucrase": "^3.35.0" + }, + "bin": { + "tailwind": "lib/cli.js", + "tailwindcss": "lib/cli.js" + }, + "engines": { + "node": ">=14.0.0" + } + }, + "node_modules/uc.micro": { + "version": "2.1.0", + "resolved": "https://registry.npmjs.org/uc.micro/-/uc.micro-2.1.0.tgz", + "integrity": "sha512-ARDJmphmdvUk6Glw7y9DQ2bFkKBHwQHLi2lsaH6PPmz/Ka9sFOBsBluozhDltWmnv9u/cF6Rt87znRTPV+yp/A==", + "license": "MIT" + }, + "node_modules/vite": { + "version": "5.4.11", + "resolved": "https://registry.npmjs.org/vite/-/vite-5.4.11.tgz", + "integrity": "sha512-c7jFQRklXua0mTzneGW9QVyxFjUgwcihC4bXEtujIo2ouWCe1Ajt/amn2PCxYnhYfd5k09JX3SB7OYWFKYqj8Q==", + "license": "MIT", + "dependencies": { + "esbuild": "^0.21.3", + "postcss": "^8.4.43", + "rollup": "^4.20.0" + }, + "bin": { + "vite": "bin/vite.js" + }, + "engines": { + "node": "^18.0.0 || >=20.0.0" + }, + "funding": { + "url": "https://github.com/vitejs/vite?sponsor=1" + }, + "optionalDependencies": { + "fsevents": "~2.3.3" + }, + "peerDependencies": { + "@types/node": "^18.0.0 || >=20.0.0", + "less": "*", + "lightningcss": "^1.21.0", + "sass": "*", + "sass-embedded": "*", + "stylus": "*", + "sugarss": "*", + "terser": "^5.4.0" + }, + "peerDependenciesMeta": { + "@types/node": { + "optional": true + }, + "less": { + "optional": true + }, + "lightningcss": { + "optional": true + }, + "sass": { + "optional": true + }, + "sass-embedded": { + "optional": true + }, + "stylus": { + "optional": true + }, + "sugarss": { + "optional": true + }, + "terser": { + "optional": true + } + } + }, + "node_modules/vite-plugin-singlefile": { + "version": "2.0.3", + "resolved": "https://registry.npmjs.org/vite-plugin-singlefile/-/vite-plugin-singlefile-2.0.3.tgz", + "integrity": "sha512-OEBEwdX8nCGPSdtaB1D7rryYnT+YfPTS8ojL1TDyeUF+bWDCTfRriQqw6T0vl9EbKI/KMg7szN3awst6cLrKkA==", + "license": "MIT", + "dependencies": { + "micromatch": "^4.0.8" + }, + "engines": { + "node": ">18.0.0" + }, + "peerDependencies": { + "rollup": "^4.24.3", + "vite": "^5.4.10" + } + }, + "node_modules/vue": { + "version": "3.5.13", + "resolved": "https://registry.npmjs.org/vue/-/vue-3.5.13.tgz", + "integrity": "sha512-wmeiSMxkZCSc+PM2w2VRsOYAZC8GdipNFRTsLSfodVqI9mbejKeXEGr8SckuLnrQPGe3oJN5c3K0vpoU9q/wCQ==", + "license": "MIT", + "dependencies": { + "@vue/compiler-dom": "3.5.13", + "@vue/compiler-sfc": "3.5.13", + "@vue/runtime-dom": "3.5.13", + "@vue/server-renderer": "3.5.13", + "@vue/shared": "3.5.13" + }, + "peerDependencies": { + "typescript": "*" + }, + "peerDependenciesMeta": { + "typescript": { + "optional": true + } + } + } + } +} diff --git a/examples/server/webui/package.json b/examples/server/webui/package.json new file mode 100644 index 0000000000000..2a45ece14be85 --- /dev/null +++ b/examples/server/webui/package.json @@ -0,0 +1,23 @@ +{ + "name": "webui", + "private": true, + "version": "0.0.0", + "type": "module", + "scripts": { + "dev": "vite", + "build": "vite build", + "preview": "vite preview" + }, + "devDependencies": { + "vite": "^5.4.10" + }, + "dependencies": { + "autoprefixer": "^10.4.20", + "daisyui": "^4.12.14", + "markdown-it": "^14.1.0", + "postcss": "^8.4.49", + "tailwindcss": "^3.4.15", + "vite-plugin-singlefile": "^2.0.3", + "vue": "^3.5.13" + } +} diff --git a/examples/server/webui/postcss.config.js b/examples/server/webui/postcss.config.js new file mode 100644 index 0000000000000..2e7af2b7f1a6f --- /dev/null +++ b/examples/server/webui/postcss.config.js @@ -0,0 +1,6 @@ +export default { + plugins: { + tailwindcss: {}, + autoprefixer: {}, + }, +} diff --git a/examples/server/public/completion.js b/examples/server/webui/src/completion.js similarity index 100% rename from examples/server/public/completion.js rename to examples/server/webui/src/completion.js diff --git a/examples/server/webui/src/main.js b/examples/server/webui/src/main.js new file mode 100644 index 0000000000000..9b5b123293362 --- /dev/null +++ b/examples/server/webui/src/main.js @@ -0,0 +1,456 @@ +import './styles.css'; +import { createApp, defineComponent, shallowRef, computed, h } from 'vue/dist/vue.esm-bundler.js'; +import { llama } from './completion.js'; +import MarkdownIt from 'markdown-it'; + +// utility functions +const isString = (x) => !!x.toLowerCase; +const isNumeric = (n) => !isString(n) && !isNaN(n); +const escapeAttr = (str) => str.replace(/>/g, '>').replace(/"/g, '"'); +const copyStr = (str) => navigator.clipboard.writeText(str); + +// constants +const BASE_URL = localStorage.getItem('base') // for debugging + || (new URL('.', document.baseURI).href).toString(); // for production +const CONFIG_DEFAULT = { + // Note: in order not to introduce breaking changes, please keep the same data type (number, string, etc) if you want to change the default value. Do not use null or undefined for default value. + apiKey: '', + systemMessage: 'You are a helpful assistant.', + // make sure these default values are in sync with `common.h` + samplers: 'dkypmxt', + temperature: 0.8, + dynatemp_range: 0.0, + dynatemp_exponent: 1.0, + top_k: 40, + top_p: 0.95, + min_p: 0.05, + xtc_probability: 0.0, + xtc_threshold: 0.1, + typical_p: 1.0, + repeat_last_n: 64, + repeat_penalty: 1.0, + presence_penalty: 0.0, + frequency_penalty: 0.0, + dry_multiplier: 0.0, + dry_base: 1.75, + dry_allowed_length: 2, + dry_penalty_last_n: -1, + max_tokens: -1, + custom: '', // custom json-stringified object +}; +const CONFIG_INFO = { + apiKey: 'Set the API Key if you are using --api-key option for the server.', + systemMessage: 'The starting message that defines how model should behave.', + samplers: 'The order at which samplers are applied, in simplified way. Default is "dkypmxt": dry->top_k->typ_p->top_p->min_p->xtc->temperature', + temperature: 'Controls the randomness of the generated text by affecting the probability distribution of the output tokens. Higher = more random, lower = more focused.', + dynatemp_range: 'Addon for the temperature sampler. The added value to the range of dynamic temperature, which adjusts probabilities by entropy of tokens.', + dynatemp_exponent: 'Addon for the temperature sampler. Smoothes out the probability redistribution based on the most probable token.', + top_k: 'Keeps only k top tokens.', + top_p: 'Limits tokens to those that together have a cumulative probability of at least p', + min_p: 'Limits tokens based on the minimum probability for a token to be considered, relative to the probability of the most likely token.', + xtc_probability: 'XTC sampler cuts out top tokens; this parameter controls the chance of cutting tokens at all. 0 disables XTC.', + xtc_threshold: 'XTC sampler cuts out top tokens; this parameter controls the token probability that is required to cut that token.', + typical_p: 'Sorts and limits tokens based on the difference between log-probability and entropy.', + repeat_last_n: 'Last n tokens to consider for penalizing repetition', + repeat_penalty: 'Controls the repetition of token sequences in the generated text', + presence_penalty: 'Limits tokens based on whether they appear in the output or not.', + frequency_penalty: 'Limits tokens based on how often they appear in the output.', + dry_multiplier: 'DRY sampling reduces repetition in generated text even across long contexts. This parameter sets the DRY sampling multiplier.', + dry_base: 'DRY sampling reduces repetition in generated text even across long contexts. This parameter sets the DRY sampling base value.', + dry_allowed_length: 'DRY sampling reduces repetition in generated text even across long contexts. This parameter sets the allowed length for DRY sampling.', + dry_penalty_last_n: 'DRY sampling reduces repetition in generated text even across long contexts. This parameter sets DRY penalty for the last n tokens.', + max_tokens: 'The maximum number of token per output.', + custom: '', // custom json-stringified object +}; +// config keys having numeric value (i.e. temperature, top_k, top_p, etc) +const CONFIG_NUMERIC_KEYS = Object.entries(CONFIG_DEFAULT).filter(e => isNumeric(e[1])).map(e => e[0]); +// list of themes supported by daisyui +const THEMES = ['light', 'dark', 'cupcake', 'bumblebee', 'emerald', 'corporate', 'synthwave', 'retro', 'cyberpunk', 'valentine', 'halloween', 'garden', 'forest', 'aqua', 'lofi', 'pastel', 'fantasy', 'wireframe', 'black', 'luxury', 'dracula', 'cmyk', 'autumn', 'business', 'acid', 'lemonade', 'night', 'coffee', 'winter', 'dim', 'nord', 'sunset']; + +// markdown support +const VueMarkdown = defineComponent( + (props) => { + const md = shallowRef(new MarkdownIt({ breaks: true })); + const origFenchRenderer = md.value.renderer.rules.fence; + md.value.renderer.rules.fence = (tokens, idx, ...args) => { + const content = tokens[idx].content; + const origRendered = origFenchRenderer(tokens, idx, ...args); + return `
+ + ${origRendered} +
`; + }; + window.copyStr = copyStr; + const content = computed(() => md.value.render(props.source)); + return () => h("div", { innerHTML: content.value }); + }, + { props: ["source"] } +); + +// input field to be used by settings modal +const SettingsModalShortInput = defineComponent({ + template: document.getElementById('settings-modal-short-input').innerHTML, + props: { + label: { type: String, required: false }, + configKey: String, + configDefault: Object, + configInfo: Object, + modelValue: [Object, String, Number], + }, +}); + +// coversations is stored in localStorage +// format: { [convId]: { id: string, lastModified: number, messages: [...] } } +// convId is a string prefixed with 'conv-' +const StorageUtils = { + // manage conversations + getAllConversations() { + const res = []; + for (const key in localStorage) { + if (key.startsWith('conv-')) { + res.push(JSON.parse(localStorage.getItem(key))); + } + } + res.sort((a, b) => b.lastModified - a.lastModified); + return res; + }, + // can return null if convId does not exist + getOneConversation(convId) { + return JSON.parse(localStorage.getItem(convId) || 'null'); + }, + // if convId does not exist, create one + appendMsg(convId, msg) { + if (msg.content === null) return; + const conv = StorageUtils.getOneConversation(convId) || { + id: convId, + lastModified: Date.now(), + messages: [], + }; + conv.messages.push(msg); + conv.lastModified = Date.now(); + localStorage.setItem(convId, JSON.stringify(conv)); + }, + getNewConvId() { + return `conv-${Date.now()}`; + }, + remove(convId) { + localStorage.removeItem(convId); + }, + filterAndKeepMsgs(convId, predicate) { + const conv = StorageUtils.getOneConversation(convId); + if (!conv) return; + conv.messages = conv.messages.filter(predicate); + conv.lastModified = Date.now(); + localStorage.setItem(convId, JSON.stringify(conv)); + }, + popMsg(convId) { + const conv = StorageUtils.getOneConversation(convId); + if (!conv) return; + const msg = conv.messages.pop(); + conv.lastModified = Date.now(); + if (conv.messages.length === 0) { + StorageUtils.remove(convId); + } else { + localStorage.setItem(convId, JSON.stringify(conv)); + } + return msg; + }, + + // manage config + getConfig() { + const savedVal = JSON.parse(localStorage.getItem('config') || '{}'); + // to prevent breaking changes in the future, we always provide default value for missing keys + return { + ...CONFIG_DEFAULT, + ...savedVal, + }; + }, + setConfig(config) { + localStorage.setItem('config', JSON.stringify(config)); + }, + getTheme() { + return localStorage.getItem('theme') || 'auto'; + }, + setTheme(theme) { + if (theme === 'auto') { + localStorage.removeItem('theme'); + } else { + localStorage.setItem('theme', theme); + } + }, +}; + +// scroll to bottom of chat messages +// if requiresNearBottom is true, only auto-scroll if user is near bottom +const chatScrollToBottom = (requiresNearBottom) => { + const msgListElem = document.getElementById('messages-list'); + const spaceToBottom = msgListElem.scrollHeight - msgListElem.scrollTop - msgListElem.clientHeight; + if (!requiresNearBottom || (spaceToBottom < 100)) { + setTimeout(() => msgListElem.scrollTo({ top: msgListElem.scrollHeight }), 1); + } +}; + +const mainApp = createApp({ + components: { + VueMarkdown, + SettingsModalShortInput, + }, + data() { + return { + conversations: StorageUtils.getAllConversations(), + messages: [], // { id: number, role: 'user' | 'assistant', content: string } + viewingConvId: StorageUtils.getNewConvId(), + inputMsg: '', + isGenerating: false, + pendingMsg: null, // the on-going message from assistant + stopGeneration: () => {}, + selectedTheme: StorageUtils.getTheme(), + config: StorageUtils.getConfig(), + showConfigDialog: false, + editingMsg: null, + // const + themes: THEMES, + configDefault: {...CONFIG_DEFAULT}, + configInfo: {...CONFIG_INFO}, + } + }, + computed: {}, + mounted() { + document.getElementById('app').classList.remove('opacity-0'); // show app + // scroll to the bottom when the pending message height is updated + const pendingMsgElem = document.getElementById('pending-msg'); + const resizeObserver = new ResizeObserver(() => { + if (this.isGenerating) chatScrollToBottom(true); + }); + resizeObserver.observe(pendingMsgElem); + }, + methods: { + hideSidebar() { + document.getElementById('toggle-drawer').checked = false; + }, + setSelectedTheme(theme) { + this.selectedTheme = theme; + StorageUtils.setTheme(theme); + }, + newConversation() { + if (this.isGenerating) return; + this.viewingConvId = StorageUtils.getNewConvId(); + this.editingMsg = null; + this.fetchMessages(); + chatScrollToBottom(); + this.hideSidebar(); + }, + setViewingConv(convId) { + if (this.isGenerating) return; + this.viewingConvId = convId; + this.editingMsg = null; + this.fetchMessages(); + chatScrollToBottom(); + this.hideSidebar(); + }, + deleteConv(convId) { + if (this.isGenerating) return; + if (window.confirm('Are you sure to delete this conversation?')) { + StorageUtils.remove(convId); + if (this.viewingConvId === convId) { + this.viewingConvId = StorageUtils.getNewConvId(); + this.editingMsg = null; + } + this.fetchConversation(); + this.fetchMessages(); + } + }, + downloadConv(convId) { + const conversation = StorageUtils.getOneConversation(convId); + if (!conversation) { + alert('Conversation not found.'); + return; + } + const conversationJson = JSON.stringify(conversation, null, 2); + const blob = new Blob([conversationJson], { type: 'application/json' }); + const url = URL.createObjectURL(blob); + const a = document.createElement('a'); + a.href = url; + a.download = `conversation_${convId}.json`; + document.body.appendChild(a); + a.click(); + document.body.removeChild(a); + URL.revokeObjectURL(url); + }, + async sendMessage() { + if (!this.inputMsg) return; + const currConvId = this.viewingConvId; + + StorageUtils.appendMsg(currConvId, { + id: Date.now(), + role: 'user', + content: this.inputMsg, + }); + this.fetchConversation(); + this.fetchMessages(); + this.inputMsg = ''; + this.editingMsg = null; + this.generateMessage(currConvId); + chatScrollToBottom(); + }, + async generateMessage(currConvId) { + if (this.isGenerating) return; + this.pendingMsg = { id: Date.now()+1, role: 'assistant', content: null }; + this.isGenerating = true; + this.editingMsg = null; + + try { + const abortController = new AbortController(); + this.stopGeneration = () => abortController.abort(); + const params = { + messages: [ + { role: 'system', content: this.config.systemMessage }, + ...this.messages, + ], + stream: true, + cache_prompt: true, + samplers: this.config.samplers, + temperature: this.config.temperature, + dynatemp_range: this.config.dynatemp_range, + dynatemp_exponent: this.config.dynatemp_exponent, + top_k: this.config.top_k, + top_p: this.config.top_p, + min_p: this.config.min_p, + typical_p: this.config.typical_p, + xtc_probability: this.config.xtc_probability, + xtc_threshold: this.config.xtc_threshold, + repeat_last_n: this.config.repeat_last_n, + repeat_penalty: this.config.repeat_penalty, + presence_penalty: this.config.presence_penalty, + frequency_penalty: this.config.frequency_penalty, + dry_multiplier: this.config.dry_multiplier, + dry_base: this.config.dry_base, + dry_allowed_length: this.config.dry_allowed_length, + dry_penalty_last_n: this.config.dry_penalty_last_n, + max_tokens: this.config.max_tokens, + ...(this.config.custom.length ? JSON.parse(this.config.custom) : {}), + ...(this.config.apiKey ? { api_key: this.config.apiKey } : {}), + }; + const config = { + controller: abortController, + api_url: BASE_URL, + endpoint: '/chat/completions', + }; + for await (const chunk of llama(prompt, params, config)) { + const stop = chunk.data.stop; + const addedContent = chunk.data.choices[0].delta.content; + const lastContent = this.pendingMsg.content || ''; + if (addedContent) { + this.pendingMsg = { + id: this.pendingMsg.id, + role: 'assistant', + content: lastContent + addedContent, + }; + } + } + + StorageUtils.appendMsg(currConvId, this.pendingMsg); + this.fetchConversation(); + this.fetchMessages(); + setTimeout(() => document.getElementById('msg-input').focus(), 1); + } catch (error) { + if (error.name === 'AbortError') { + // user stopped the generation via stopGeneration() function + StorageUtils.appendMsg(currConvId, this.pendingMsg); + this.fetchConversation(); + this.fetchMessages(); + } else { + console.error(error); + alert(error); + // pop last user message + const lastUserMsg = StorageUtils.popMsg(currConvId); + this.inputMsg = lastUserMsg ? lastUserMsg.content : ''; + } + } + + this.pendingMsg = null; + this.isGenerating = false; + this.stopGeneration = () => {}; + this.fetchMessages(); + chatScrollToBottom(); + }, + + // message actions + regenerateMsg(msg) { + if (this.isGenerating) return; + // TODO: somehow keep old history (like how ChatGPT has different "tree"). This can be done by adding "sub-conversations" with "subconv-" prefix, and new message will have a list of subconvIds + const currConvId = this.viewingConvId; + StorageUtils.filterAndKeepMsgs(currConvId, (m) => m.id < msg.id); + this.fetchConversation(); + this.fetchMessages(); + this.generateMessage(currConvId); + }, + copyMsg(msg) { + copyStr(msg.content); + }, + editUserMsgAndRegenerate(msg) { + if (this.isGenerating) return; + const currConvId = this.viewingConvId; + const newContent = msg.content; + this.editingMsg = null; + StorageUtils.filterAndKeepMsgs(currConvId, (m) => m.id < msg.id); + StorageUtils.appendMsg(currConvId, { + id: Date.now(), + role: 'user', + content: newContent, + }); + this.fetchConversation(); + this.fetchMessages(); + this.generateMessage(currConvId); + }, + + // settings dialog methods + closeAndSaveConfigDialog() { + try { + if (this.config.custom.length) JSON.parse(this.config.custom); + } catch (error) { + alert('Invalid JSON for custom config. Please either fix it or leave it empty.'); + return; + } + for (const key of CONFIG_NUMERIC_KEYS) { + if (isNaN(this.config[key]) || this.config[key].toString().trim().length === 0) { + alert(`Invalid number for ${key} (expected an integer or a float)`); + return; + } + this.config[key] = parseFloat(this.config[key]); + } + this.showConfigDialog = false; + StorageUtils.setConfig(this.config); + }, + closeAndDiscardConfigDialog() { + this.showConfigDialog = false; + this.config = StorageUtils.getConfig(); + }, + resetConfigDialog() { + if (window.confirm('Are you sure to reset all settings?')) { + this.config = {...CONFIG_DEFAULT}; + } + }, + + // sync state functions + fetchConversation() { + this.conversations = StorageUtils.getAllConversations(); + }, + fetchMessages() { + this.messages = StorageUtils.getOneConversation(this.viewingConvId)?.messages ?? []; + }, + }, +}); +mainApp.config.errorHandler = alert; +try { + mainApp.mount('#app'); +} catch (err) { + console.error(err); + document.getElementById('app').innerHTML = `
+ Failed to start app. Please try clearing localStorage and try again.
+
+ +
`; +} diff --git a/examples/server/webui/src/styles.css b/examples/server/webui/src/styles.css new file mode 100644 index 0000000000000..67d35b99e49fc --- /dev/null +++ b/examples/server/webui/src/styles.css @@ -0,0 +1,26 @@ +@tailwind base; +@tailwind components; +@tailwind utilities; + +.markdown { + h1, h2, h3, h4, h5, h6, ul, ol, li { all: revert; } + pre { + @apply whitespace-pre-wrap rounded-lg p-2; + border: 1px solid currentColor; + } + /* TODO: fix markdown table */ +} + +.show-on-hover { + @apply md:opacity-0 md:group-hover:opacity-100; +} +.btn-mini { + @apply cursor-pointer hover:shadow-md; +} +.chat-screen { max-width: 900px; } + +.chat-bubble-base-300 { + --tw-bg-opacity: 1; + --tw-text-opacity: 1; + @apply bg-base-300 text-base-content; +} diff --git a/examples/server/webui/tailwind.config.js b/examples/server/webui/tailwind.config.js new file mode 100644 index 0000000000000..c43066a19e7ad --- /dev/null +++ b/examples/server/webui/tailwind.config.js @@ -0,0 +1,16 @@ +/** @type {import('tailwindcss').Config} */ +export default { + content: [ + "./index.html", + "./src/**/*.{js,ts,jsx,tsx}", + ], + theme: { + extend: {}, + }, + plugins: [ + require('daisyui'), + ], + daisyui: { + themes: ['light', 'dark', 'cupcake', 'bumblebee', 'emerald', 'corporate', 'synthwave', 'retro', 'cyberpunk', 'valentine', 'halloween', 'garden', 'forest', 'aqua', 'lofi', 'pastel', 'fantasy', 'wireframe', 'black', 'luxury', 'dracula', 'cmyk', 'autumn', 'business', 'acid', 'lemonade', 'night', 'coffee', 'winter', 'dim', 'nord', 'sunset'], + } +} diff --git a/examples/server/webui/vite.config.js b/examples/server/webui/vite.config.js new file mode 100644 index 0000000000000..789bf9cbbf197 --- /dev/null +++ b/examples/server/webui/vite.config.js @@ -0,0 +1,36 @@ + +import { viteSingleFile } from 'vite-plugin-singlefile'; +import path from 'path'; +import fs from 'fs'; + +const GUIDE_FOR_FRONTEND = ` + +`.trim(); + +export default { + plugins: [ + viteSingleFile(), + (function llamaCppPlugin() { + let config; + return { + name: 'llamacpp:build', + apply: 'build', + async configResolved(_config) { + config = _config; + }, + writeBundle() { + const outputIndexHtml = path.join(config.build.outDir, 'index.html'); + const content = fs.readFileSync(outputIndexHtml, 'utf-8'); + + const targetOutputFile = path.join(config.build.outDir, '../../public/index.html'); + fs.writeFileSync(targetOutputFile, GUIDE_FOR_FRONTEND + '\n' + content); + } + } + })(), + ], +}; From 746a4e0ff4befe7962c9e55b1c870bd9fe23a9f1 Mon Sep 17 00:00:00 2001 From: Jeff Bolz Date: Tue, 3 Dec 2024 13:29:54 -0600 Subject: [PATCH 133/245] vulkan: optimize and reenable split_k (#10637) Use vector loads when possible in mul_mat_split_k_reduce. Use split_k when there aren't enough workgroups to fill the shaders. --- ggml/src/ggml-vulkan/ggml-vulkan.cpp | 51 +++++++++++++++---- .../mul_mat_split_k_reduce.comp | 31 ++++++++--- 2 files changed, 65 insertions(+), 17 deletions(-) diff --git a/ggml/src/ggml-vulkan/ggml-vulkan.cpp b/ggml/src/ggml-vulkan/ggml-vulkan.cpp index df6a659f44ff5..17e1be105f8b1 100644 --- a/ggml/src/ggml-vulkan/ggml-vulkan.cpp +++ b/ggml/src/ggml-vulkan/ggml-vulkan.cpp @@ -165,6 +165,7 @@ struct vk_device_struct { vk_queue transfer_queue; bool single_queue; uint32_t subgroup_size; + uint32_t shader_core_count; bool uma; size_t idx; @@ -1498,7 +1499,7 @@ static void ggml_vk_load_shaders(vk_device& device) { ggml_vk_create_pipeline(device, device->pipeline_get_rows_f32[GGML_TYPE_Q8_0], "get_rows_q8_0_f32", get_rows_q8_0_f32_len, get_rows_q8_0_f32_data, "main", 3, sizeof(vk_op_binary_push_constants), {1024, 1, 1}, {}, 1); ggml_vk_create_pipeline(device, device->pipeline_get_rows_f32[GGML_TYPE_IQ4_NL], "get_rows_iq4_nl_f32", get_rows_iq4_nl_f32_len, get_rows_iq4_nl_f32_data, "main", 3, sizeof(vk_op_binary_push_constants), {1024, 1, 1}, {}, 1); - ggml_vk_create_pipeline(device, device->pipeline_matmul_split_k_reduce, "split_k_reduce", split_k_reduce_len, split_k_reduce_data, "main", 2, 2 * sizeof(uint32_t), {256, 1, 1}, {}, 1); + ggml_vk_create_pipeline(device, device->pipeline_matmul_split_k_reduce, "split_k_reduce", split_k_reduce_len, split_k_reduce_data, "main", 2, 2 * sizeof(uint32_t), {256 * 4, 1, 1}, {}, 1); ggml_vk_create_pipeline(device, device->pipeline_mul_mat_vec_p021_f16_f32, "mul_mat_vec_p021_f16_f32", mul_mat_vec_p021_f16_f32_len, mul_mat_vec_p021_f16_f32_data, "main", 3, 6 * sizeof(uint32_t), {1, 1, 1}, {}, 1); ggml_vk_create_pipeline(device, device->pipeline_mul_mat_vec_nc_f16_f32, "mul_mat_vec_nc_f16_f32", mul_mat_vec_nc_f16_f32_len, mul_mat_vec_nc_f16_f32_data, "main", 3, 7 * sizeof(uint32_t), {1, 1, 1}, {}, 1); @@ -1610,11 +1611,14 @@ static vk_device ggml_vk_get_device(size_t idx) { const std::vector ext_props = device->physical_device.enumerateDeviceExtensionProperties(); bool maintenance4_support = false; + bool sm_builtins = false; // Check if maintenance4 is supported for (const auto& properties : ext_props) { if (strcmp("VK_KHR_maintenance4", properties.extensionName) == 0) { maintenance4_support = true; + } else if (strcmp("VK_NV_shader_sm_builtins", properties.extensionName) == 0) { + sm_builtins = true; } } @@ -1622,11 +1626,21 @@ static vk_device ggml_vk_get_device(size_t idx) { vk::PhysicalDeviceMaintenance3Properties props3; vk::PhysicalDeviceMaintenance4Properties props4; vk::PhysicalDeviceSubgroupProperties subgroup_props; + vk::PhysicalDeviceShaderSMBuiltinsPropertiesNV sm_props; props2.pNext = &props3; props3.pNext = &subgroup_props; + + VkBaseOutStructure * last_struct = (VkBaseOutStructure *)&subgroup_props; + if (maintenance4_support) { - subgroup_props.pNext = &props4; + last_struct->pNext = (VkBaseOutStructure *)&props4; + last_struct = (VkBaseOutStructure *)&props4; + } + if (sm_builtins) { + last_struct->pNext = (VkBaseOutStructure *)&sm_props; + last_struct = (VkBaseOutStructure *)&sm_props; } + device->physical_device.getProperties2(&props2); device->properties = props2.properties; @@ -1643,6 +1657,11 @@ static vk_device ggml_vk_get_device(size_t idx) { device->vendor_id = device->properties.vendorID; device->subgroup_size = subgroup_props.subgroupSize; device->uma = device->properties.deviceType == vk::PhysicalDeviceType::eIntegratedGpu; + if (sm_builtins) { + device->shader_core_count = sm_props.shaderSMCount; + } else { + device->shader_core_count = 0; + } bool fp16_storage = false; bool fp16_compute = false; @@ -2732,15 +2751,25 @@ static void ggml_vk_buffer_memset(vk_buffer& dst, size_t offset, uint32_t c, siz dst->device->device.resetFences({ dst->device->fence }); } -static uint32_t ggml_vk_guess_split_k(int m, int n, int k) { +static uint32_t ggml_vk_guess_split_k(ggml_backend_vk_context * ctx, int m, int n, int k, const vk_pipeline& pipeline) { VK_LOG_DEBUG("ggml_vk_guess_split_k(" << m << ", " << n << ", " << k << ")"); - // if (k > 128 && (m < 128 || n < 128) && m > 2 && n > 2) { - // return 4; - // } - return 1; + uint32_t split_k = 1; + if (ctx->device->shader_core_count != 0 && m >= (int)pipeline->wg_denoms[0] && n >= (int)pipeline->wg_denoms[1]) { + // If k is 'large' and the SMs will fill less than halfway, use split_k. + uint32_t m_tiles = CEIL_DIV(m, pipeline->wg_denoms[0]); + uint32_t n_tiles = CEIL_DIV(n, pipeline->wg_denoms[1]); + if (k >= 2048 && m_tiles * n_tiles < ctx->device->shader_core_count / 2) { + split_k = ctx->device->shader_core_count / (m_tiles * n_tiles); + // Clamp to 2 or 4 + split_k = std::min(split_k, 4u); + if (split_k == 3) { + split_k = 2; + } + } + } - GGML_UNUSED(m); GGML_UNUSED(n); GGML_UNUSED(k); + return split_k; } static vk_pipeline ggml_vk_guess_matmul_pipeline_amd(ggml_backend_vk_context * ctx, vk_matmul_pipeline& mmp, int m, int n, bool aligned) { @@ -2964,10 +2993,10 @@ static void ggml_vk_mul_mat_q_f16(ggml_backend_vk_context * ctx, vk_context& sub const uint32_t kpad = ggml_vk_align_size(ne10, ggml_vk_guess_matmul_pipeline_align(ctx, mmp, ne01, ne11)); const bool aligned = ne10 == kpad && ne01 > 8 && ne11 > 8; - const uint32_t split_k = ggml_vk_guess_split_k(ne01, ne11, ne10); - vk_pipeline pipeline = ggml_vk_guess_matmul_pipeline(ctx, mmp, ne01, ne11, aligned); + const uint32_t split_k = ggml_vk_guess_split_k(ctx, ne01, ne11, ne10, pipeline); + const uint64_t qx_sz = ggml_type_size(src0->type) * x_ne / ggml_blck_size(src0->type); const uint64_t qy_sz = ggml_type_size(src1->type) * y_ne / ggml_blck_size(src1->type); const uint64_t x_sz = !qx_needs_dequant ? qx_sz : sizeof(ggml_fp16_t) * x_ne; @@ -2993,7 +3022,7 @@ static void ggml_vk_mul_mat_q_f16(ggml_backend_vk_context * ctx, vk_context& sub if (dryrun) { const uint64_t x_sz_upd = x_sz * ne02 * ne03; const uint64_t y_sz_upd = y_sz * ne12 * ne13; - const uint64_t split_k_size = split_k > 1 ? d_sz * ne12 * ne13 * 4 : 0; + const uint64_t split_k_size = split_k > 1 ? d_sz * ne12 * ne13 * split_k : 0; if ( (qx_needs_dequant && x_sz_upd > ctx->device->max_memory_allocation_size) || (qy_needs_dequant && y_sz_upd > ctx->device->max_memory_allocation_size) || diff --git a/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_split_k_reduce.comp b/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_split_k_reduce.comp index 825b91031f569..4c64fd47af718 100644 --- a/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_split_k_reduce.comp +++ b/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_split_k_reduce.comp @@ -5,7 +5,9 @@ layout(local_size_x = 256, local_size_y = 1, local_size_z = 1) in; layout (binding = 0) readonly buffer A {float data_a[];}; +layout (binding = 0) readonly buffer A4 {vec4 data_a4[];}; layout (binding = 1) writeonly buffer D {float data_d[];}; +layout (binding = 1) writeonly buffer D4 {vec4 data_d4[];}; layout (push_constant) uniform parameter { uint ne; @@ -13,17 +15,34 @@ layout (push_constant) uniform parameter { } p; void main() { - const uint idx = gl_GlobalInvocationID.x; + // Each invocation handles four consecutive components + const uint idx = gl_GlobalInvocationID.x * 4; if (idx >= p.ne) { return; } - float result = 0.0f; + // Check if all four components are in bounds and aligned, + // then use vector loads + if (idx + 3 < p.ne && (p.ne % 4) == 0) { + vec4 result = vec4(0.0f); - [[unroll]] for (uint i = 0; i < p.k_num; i++) { - result += data_a[i * p.ne + idx]; - } + [[unroll]] for (uint i = 0; i < p.k_num; i++) { + result += data_a4[(i * p.ne + idx) / 4]; + } + + data_d4[idx / 4] = result; + } else { + [[unroll]] for (uint j = 0; j < 4; ++j) { + if (idx + j < p.ne) { + float result = 0.0f; - data_d[idx] = result; + [[unroll]] for (uint i = 0; i < p.k_num; i++) { + result += data_a[i * p.ne + idx + j]; + } + + data_d[idx + j] = result; + } + } + } } From e480feb741c1944fb9505e7176a82e2e0a4cf143 Mon Sep 17 00:00:00 2001 From: piDack <104877312+piDack@users.noreply.github.com> Date: Wed, 4 Dec 2024 08:26:37 +0800 Subject: [PATCH 134/245] clip : add sycl support (#10574) Co-authored-by: piDack --- examples/llava/clip.cpp | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/examples/llava/clip.cpp b/examples/llava/clip.cpp index 7ba4cea58e80b..d7c94352b568b 100644 --- a/examples/llava/clip.cpp +++ b/examples/llava/clip.cpp @@ -12,6 +12,10 @@ #include "ggml-cuda.h" #endif +#ifdef GGML_USE_SYCL +#include "ggml-sycl.h" +#endif + #ifdef GGML_USE_METAL #include "ggml-metal.h" #endif @@ -1169,6 +1173,11 @@ struct clip_ctx * clip_model_load(const char * fname, const int verbosity = 1) { LOG_INF("%s: CLIP using Vulkan backend\n", __func__); #endif +#ifdef GGML_USE_SYCL + new_clip->backend = ggml_backend_sycl_init(0); + LOG_INF("%s: CLIP using SYCL backend\n", __func__); +#endif + if (!new_clip->backend) { new_clip->backend = ggml_backend_cpu_init(); LOG_INF("%s: CLIP using CPU backend\n", __func__); From 507aae4e7821031e0fa675786d99bde602859497 Mon Sep 17 00:00:00 2001 From: Benson Wong Date: Tue, 3 Dec 2024 16:40:36 -0800 Subject: [PATCH 135/245] Add docs for creating a static build (#10268) (#10630) * Add notes for a static build * Update docs/build.md --------- Co-authored-by: Diego Devesa --- docs/build.md | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/docs/build.md b/docs/build.md index 97e340ab62acc..a4964cbd14909 100644 --- a/docs/build.md +++ b/docs/build.md @@ -39,6 +39,11 @@ cmake --build build --config Release ``` For more details and a list of supported generators, see the [CMake documentation](https://cmake.org/cmake/help/latest/manual/cmake-generators.7.html). +- For static builds, add `-DBUILD_SHARED_LIBS=OFF`: + ``` + cmake -B build -DBUILD_SHARED_LIBS=OFF + cmake --build build --config Release + ``` - Building for Windows (x86, x64 and arm64) with MSVC or clang as compilers: - Install Visual Studio 2022, e.g. via the [Community Edition](https://visualstudio.microsoft.com/de/vs/community/). In the installer, select at least the following options (this also automatically installs the required additional tools like CMake,...): From 2d029b150fe49dd631027c8998bca4fe3957fab9 Mon Sep 17 00:00:00 2001 From: Frankie Robertson Date: Wed, 4 Dec 2024 02:41:37 +0200 Subject: [PATCH 136/245] Avoid using __fp16 on ARM with old nvcc (#10616) --- ggml/src/ggml-impl.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/ggml/src/ggml-impl.h b/ggml/src/ggml-impl.h index 78e3af8f2971c..00a1546a7d5d0 100644 --- a/ggml/src/ggml-impl.h +++ b/ggml/src/ggml-impl.h @@ -310,14 +310,14 @@ void ggml_aligned_free(void * ptr, size_t size); // FP16 to FP32 conversion #if defined(__ARM_NEON) - #ifdef _MSC_VER + #if defined(_MSC_VER) || (defined(__CUDACC__) && __CUDACC_VER_MAJOR__ <= 11) typedef uint16_t ggml_fp16_internal_t; #else typedef __fp16 ggml_fp16_internal_t; #endif #endif -#if defined(__ARM_NEON) && !defined(_MSC_VER) +#if defined(__ARM_NEON) && !defined(_MSC_VER) && !(defined(__CUDACC__) && __CUDACC_VER_MAJOR__ <= 11) #define GGML_COMPUTE_FP16_TO_FP32(x) ggml_compute_fp16_to_fp32(x) #define GGML_COMPUTE_FP32_TO_FP16(x) ggml_compute_fp32_to_fp16(x) From 7fd12a62dae1c9d79adccc59880f63ce1208b76f Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Wang=20Ran=20=28=E6=B1=AA=E7=84=B6=29?= Date: Wed, 4 Dec 2024 09:22:50 +0800 Subject: [PATCH 137/245] fix typo of README.md (#10605) --- grammars/README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/grammars/README.md b/grammars/README.md index 4e57bca5f300b..9769540919f98 100644 --- a/grammars/README.md +++ b/grammars/README.md @@ -46,7 +46,7 @@ Terminals support the full range of Unicode. Unicode characters can be specified Character ranges can be negated with `^`: ``` -single-line ::= [^\n]+ "\n"` +single-line ::= [^\n]+ "\n" ``` ## Sequences and Alternatives From db892e11b4de94157682ce06f8dd2bf66d629a57 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Nicol=C3=B2=20Scipione?= Date: Wed, 4 Dec 2024 02:29:20 +0100 Subject: [PATCH 138/245] SYCL : Move to compile time oneMKL interface backend selection for NVIDIA backend (#10584) * [SYCL] Move to Compile Time backend selection on oneMKL Interface for NVIDIA backend Move to compile time selection to backend to avoid latency at run time. Add it to all mkl gemm calls and only for NVIDIA backend. Signed-off-by: nscipione * Formatting * Address PR comments to increase readibility --------- Signed-off-by: nscipione --- ggml/src/ggml-sycl/CMakeLists.txt | 3 ++- ggml/src/ggml-sycl/dpct.hpp | 43 ++++++++++++++++++++++--------- ggml/src/ggml-sycl/ggml-sycl.cpp | 13 +++++++--- 3 files changed, 42 insertions(+), 17 deletions(-) diff --git a/ggml/src/ggml-sycl/CMakeLists.txt b/ggml/src/ggml-sycl/CMakeLists.txt index 83f223fd7b6fc..3579a311aac07 100644 --- a/ggml/src/ggml-sycl/CMakeLists.txt +++ b/ggml/src/ggml-sycl/CMakeLists.txt @@ -68,7 +68,8 @@ else() target_link_libraries(ggml-sycl PRIVATE sycl OpenCL mkl_core pthread m dl mkl_sycl_blas mkl_intel_ilp64 mkl_tbb_thread) elseif (GGML_SYCL_TARGET STREQUAL "NVIDIA") set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -fsycl-targets=nvptx64-nvidia-cuda") - target_link_libraries(ggml-sycl PRIVATE sycl pthread m dl onemkl) + add_compile_definitions(GGML_SYCL_NVIDIA) + target_link_libraries(ggml-sycl PRIVATE sycl pthread m dl onemkl_blas_cublas) elseif (GGML_SYCL_TARGET STREQUAL "AMD") if (NOT GGML_SYCL_DEVICE_ARCH) message(ERROR "Can't enable SYCL hip backend, GGML_SYCL_DEVICE_ARCH has not been set.") diff --git a/ggml/src/ggml-sycl/dpct.hpp b/ggml/src/ggml-sycl/dpct.hpp index 0b04ad99f1d8b..c611bcf4fb7c6 100644 --- a/ggml/src/ggml-sycl/dpct.hpp +++ b/ggml/src/ggml-sycl/dpct.hpp @@ -1699,9 +1699,14 @@ namespace dpct auto data_a = get_memory(a); auto data_b = get_memory(b); auto data_c = get_memory(c); - oneapi::mkl::blas::column_major::gemm( - q, a_trans, b_trans, m, n, k, alpha_value, data_a, lda, - data_b, ldb, beta_value, data_c, ldc); +#ifdef GGML_SYCL_NVIDIA + oneapi::mkl::blas::column_major::gemm(oneapi::mkl::backend_selector{ q }, + a_trans, b_trans, m, n, k, alpha_value, data_a, lda, data_b, ldb, + beta_value, data_c, ldc); +#else + oneapi::mkl::blas::column_major::gemm(q, a_trans, b_trans, m, n, k, alpha_value, data_a, lda, data_b, ldb, + beta_value, data_c, ldc); +#endif } template @@ -1764,14 +1769,22 @@ namespace dpct matrix_info->ld_info[2] = ldc; matrix_info->groupsize_info = batch_size; +#ifdef GGML_SYCL_NVIDIA + sycl::event e = oneapi::mkl::blas::column_major::gemm_batch( + oneapi::mkl::backend_selector{ q }, matrix_info->transpose_info, + matrix_info->transpose_info + 1, matrix_info->size_info, matrix_info->size_info + 1, + matrix_info->size_info + 2, matrix_info->value_info, reinterpret_cast(a), + matrix_info->ld_info, reinterpret_cast(b), matrix_info->ld_info + 1, + matrix_info->value_info + 1, reinterpret_cast(c), matrix_info->ld_info + 2, 1, + &(matrix_info->groupsize_info)); +#else sycl::event e = oneapi::mkl::blas::column_major::gemm_batch( - q, matrix_info->transpose_info, matrix_info->transpose_info + 1, - matrix_info->size_info, matrix_info->size_info + 1, - matrix_info->size_info + 2, matrix_info->value_info, - reinterpret_cast(a), matrix_info->ld_info, - reinterpret_cast(b), matrix_info->ld_info + 1, - matrix_info->value_info + 1, reinterpret_cast(c), + q, matrix_info->transpose_info, matrix_info->transpose_info + 1, matrix_info->size_info, + matrix_info->size_info + 1, matrix_info->size_info + 2, matrix_info->value_info, + reinterpret_cast(a), matrix_info->ld_info, reinterpret_cast(b), + matrix_info->ld_info + 1, matrix_info->value_info + 1, reinterpret_cast(c), matrix_info->ld_info + 2, 1, &(matrix_info->groupsize_info)); +#endif q.submit([&](sycl::handler &cgh) { @@ -1793,10 +1806,16 @@ namespace dpct auto data_a = get_memory(a); auto data_b = get_memory(b); auto data_c = get_memory(c); +#ifdef GGML_SYCL_NVIDIA oneapi::mkl::blas::column_major::gemm_batch( - q, a_trans, b_trans, m, n, k, alpha_value, data_a, lda, - stride_a, data_b, ldb, stride_b, beta_value, - data_c, ldc, stride_c, batch_size); + oneapi::mkl::backend_selector{ q }, a_trans, b_trans, m, n, k, + alpha_value, data_a, lda, stride_a, data_b, ldb, stride_b, beta_value, data_c, ldc, stride_c, + batch_size); +#else + oneapi::mkl::blas::column_major::gemm_batch(q, a_trans, b_trans, m, n, k, alpha_value, data_a, lda, + stride_a, data_b, ldb, stride_b, beta_value, data_c, ldc, + stride_c, batch_size); +#endif } } // namespace detail diff --git a/ggml/src/ggml-sycl/ggml-sycl.cpp b/ggml/src/ggml-sycl/ggml-sycl.cpp index fd39db0327093..46be0181c179f 100644 --- a/ggml/src/ggml-sycl/ggml-sycl.cpp +++ b/ggml/src/ggml-sycl/ggml-sycl.cpp @@ -2434,12 +2434,17 @@ inline void ggml_sycl_op_mul_mat_sycl( const float alpha = 1.0f; const float beta = 0.0f; #if !GGML_SYCL_DNNL +# ifdef GGML_SYCL_NVIDIA SYCL_CHECK(CHECK_TRY_ERROR(oneapi::mkl::blas::column_major::gemm( - *stream, oneapi::mkl::transpose::trans, - oneapi::mkl::transpose::nontrans, row_diff, src1_ncols, ne10, - dpct::get_value(&alpha, *stream), src0_ddf_i, ne00, - src1_ddf1_i, ne10, dpct::get_value(&beta, *stream), + oneapi::mkl::backend_selector{ *stream }, oneapi::mkl::transpose::trans, + oneapi::mkl::transpose::nontrans, row_diff, src1_ncols, ne10, dpct::get_value(&alpha, *stream), src0_ddf_i, + ne00, src1_ddf1_i, ne10, dpct::get_value(&beta, *stream), dst_dd_i, ldc))); +# else + SYCL_CHECK(CHECK_TRY_ERROR(oneapi::mkl::blas::column_major::gemm( + *stream, oneapi::mkl::transpose::trans, oneapi::mkl::transpose::nontrans, row_diff, src1_ncols, ne10, + dpct::get_value(&alpha, *stream), src0_ddf_i, ne00, src1_ddf1_i, ne10, dpct::get_value(&beta, *stream), dst_dd_i, ldc))); +# endif #else auto dnnl_stream = ctx.stream_dnnl(stream); DnnlGemmWrapper::row_gemm(dnnl_stream, false, true, src1_ncols, row_diff, ne10, src1_ddf1_i, DnnlGemmWrapper::to_dt(), From 3c1539ac337fc3d8b2d98a545b31efac8670d2c6 Mon Sep 17 00:00:00 2001 From: Jeff Bolz Date: Wed, 4 Dec 2024 01:28:59 -0600 Subject: [PATCH 139/245] vulkan: Implement "fast divide" (mul+shift) for unary ops like copy (#10642) --- ggml/src/ggml-vulkan/ggml-vulkan.cpp | 45 ++++++++++++++++++- .../vulkan-shaders/generic_unary_head.comp | 27 ++++++++--- tests/test-backend-ops.cpp | 2 + 3 files changed, 66 insertions(+), 8 deletions(-) diff --git a/ggml/src/ggml-vulkan/ggml-vulkan.cpp b/ggml/src/ggml-vulkan/ggml-vulkan.cpp index 17e1be105f8b1..07b45d6b99838 100644 --- a/ggml/src/ggml-vulkan/ggml-vulkan.cpp +++ b/ggml/src/ggml-vulkan/ggml-vulkan.cpp @@ -353,7 +353,45 @@ struct vk_op_unary_push_constants { uint32_t ne10; uint32_t ne11; uint32_t ne12; uint32_t ne13; uint32_t nb10; uint32_t nb11; uint32_t nb12; uint32_t nb13; uint32_t d_offset; float param1; float param2; + uint32_t ne0_012mp; uint32_t ne0_012L; + uint32_t ne0_01mp; uint32_t ne0_01L; + uint32_t ne0_0mp; uint32_t ne0_0L; + uint32_t ne1_012mp; uint32_t ne1_012L; + uint32_t ne1_01mp; uint32_t ne1_01L; + uint32_t ne1_0mp; uint32_t ne1_0L; }; +static_assert(sizeof(vk_op_unary_push_constants) <= 128, "sizeof(vk_op_unary_push_constants) must be <= 128"); + +// See https://gmplib.org/~tege/divcnst-pldi94.pdf figure 4.1. +// Precompute mp (m' in the paper) and L such that division +// can be computed using a multiply (high 32b of 64b result) +// and a shift: +// +// n/d = (mulhi(n, mp) + n) >> L; +void init_fastdiv_values(uint32_t d, uint32_t &mp, uint32_t &L) +{ + // compute L = ceil(log2(d)); + L = 0; + while (L < 32 && (uint32_t{1} << L) < d) { + L++; + } + + mp = (uint32_t)((uint64_t{1} << 32) * ((uint64_t{1} << L) - d) / d + 1); +} + +template void init_pushconst_fastdiv(T &p) { + static_assert(!std::is_const::value, "unexpected type"); +} + +template <> void init_pushconst_fastdiv(vk_op_unary_push_constants &p) { + // Compute magic values to divide by these six numbers. + init_fastdiv_values(p.ne02*p.ne01*p.ne00, p.ne0_012mp, p.ne0_012L); + init_fastdiv_values(p.ne01*p.ne00, p.ne0_01mp, p.ne0_01L); + init_fastdiv_values(p.ne00, p.ne0_0mp, p.ne0_0L); + init_fastdiv_values(p.ne12*p.ne11*p.ne10, p.ne1_012mp, p.ne1_012L); + init_fastdiv_values(p.ne11*p.ne10, p.ne1_01mp, p.ne1_01L); + init_fastdiv_values(p.ne10, p.ne1_0mp, p.ne1_0L); +} struct vk_op_binary_push_constants { uint32_t ne; @@ -2914,13 +2952,14 @@ static void ggml_vk_cpy_to_contiguous(ggml_backend_vk_context * ctx, vk_context& elements = { ne, 1, 1 }; } - const vk_op_unary_push_constants pc = { + vk_op_unary_push_constants pc = { (uint32_t)ne, (uint32_t)tensor->ne[0], (uint32_t)tensor->ne[1], (uint32_t)tensor->ne[2], (uint32_t)tensor->ne[3], (uint32_t)tensor->nb[0] / tensor_type_size, (uint32_t)tensor->nb[1] / tensor_type_size, (uint32_t)tensor->nb[2] / tensor_type_size, (uint32_t)tensor->nb[3] / tensor_type_size, (uint32_t)tensor->ne[0], (uint32_t)tensor->ne[1], (uint32_t)tensor->ne[2], (uint32_t)tensor->ne[3], 1 , (uint32_t)tensor->ne[0] , (uint32_t)(tensor->ne[0] * tensor->ne[1]) , (uint32_t)(tensor->ne[0] * tensor->ne[1] * tensor->ne[2]), 0, 0.0f, 0.0f, }; + init_pushconst_fastdiv(pc); ggml_vk_sync_buffers(subctx); ggml_vk_dispatch_pipeline(ctx, subctx, pipeline, { in, out }, sizeof(vk_op_unary_push_constants), &pc, elements); } @@ -4125,7 +4164,7 @@ static bool ggml_vk_op_supports_incontiguous(ggml_op op) { } template -static void ggml_vk_op_f32(ggml_backend_vk_context * ctx, vk_context& subctx, const ggml_tensor * src0, const ggml_tensor * src1, const ggml_tensor * src2, ggml_tensor * dst, ggml_op op, const PC&& pc, bool dryrun = false) { +static void ggml_vk_op_f32(ggml_backend_vk_context * ctx, vk_context& subctx, const ggml_tensor * src0, const ggml_tensor * src1, const ggml_tensor * src2, ggml_tensor * dst, ggml_op op, PC&& pc, bool dryrun = false) { VK_LOG_DEBUG("ggml_vk_op_f32((" << src0 << ", name=" << src0->name << ", type=" << src0->type << ", ne0=" << src0->ne[0] << ", ne1=" << src0->ne[1] << ", ne2=" << src0->ne[2] << ", ne3=" << src0->ne[3] << ", nb0=" << src0->nb[0] << ", nb1=" << src0->nb[1] << ", nb2=" << src0->nb[2] << ", nb3=" << src0->nb[3]; if (src1 != nullptr) { std::cerr << "), (" << src1 << ", name=" << src1->name << ", type=" << src1->type << ", ne0=" << src1->ne[0] << ", ne1=" << src1->ne[1] << ", ne2=" << src1->ne[2] << ", ne3=" << src1->ne[3] << ", nb0=" << src1->nb[0] << ", nb1=" << src1->nb[1] << ", nb2=" << src1->nb[2] << ", nb3=" << src1->nb[3]; @@ -4165,6 +4204,8 @@ static void ggml_vk_op_f32(ggml_backend_vk_context * ctx, vk_context& subctx, co const uint64_t ned3 = dst->ne[3]; const uint64_t ned = ned0 * ned1; + init_pushconst_fastdiv(pc); + vk_pipeline pipeline = ggml_vk_op_get_pipeline(ctx, src0, src1, src2, dst, op); if (pipeline == nullptr) { diff --git a/ggml/src/ggml-vulkan/vulkan-shaders/generic_unary_head.comp b/ggml/src/ggml-vulkan/vulkan-shaders/generic_unary_head.comp index 4e1fa3af3ad62..ab7c9d7eb1b09 100644 --- a/ggml/src/ggml-vulkan/vulkan-shaders/generic_unary_head.comp +++ b/ggml/src/ggml-vulkan/vulkan-shaders/generic_unary_head.comp @@ -8,6 +8,13 @@ layout (push_constant) uniform parameter uint ne10; uint ne11; uint ne12; uint ne13; uint nb10; uint nb11; uint nb12; uint nb13; uint d_offset; float param1; float param2; + + uint ne0_012mp; uint ne0_012L; + uint ne0_01mp; uint ne0_01L; + uint ne0_0mp; uint ne0_0L; + uint ne1_012mp; uint ne1_012L; + uint ne1_01mp; uint ne1_01L; + uint ne1_0mp; uint ne1_0L; } p; layout (binding = 0) readonly buffer A {A_TYPE data_a[];}; @@ -17,22 +24,30 @@ uint get_idx() { return gl_GlobalInvocationID.z * 262144 + gl_GlobalInvocationID.y * 512 + gl_GlobalInvocationID.x; } +// see init_fastdiv_values in ggml-vulkan.cpp +uint fastdiv(uint n, uint mp, uint L) { + uint msbs, lsbs; + // msbs = mulhi(n, mp) + umulExtended(n, mp, msbs, lsbs); + return (msbs + n) >> L; +} + uint src0_idx(uint idx) { - const uint i03 = idx / (p.ne02*p.ne01*p.ne00); + const uint i03 = fastdiv(idx, p.ne0_012mp, p.ne0_012L); const uint i03_offset = i03 * p.ne02*p.ne01*p.ne00; - const uint i02 = (idx - i03_offset) / (p.ne01*p.ne00); + const uint i02 = fastdiv(idx - i03_offset, p.ne0_01mp, p.ne0_01L); const uint i02_offset = i02*p.ne01*p.ne00; - const uint i01 = (idx - i03_offset - i02_offset) / p.ne00; + const uint i01 = fastdiv(idx - i03_offset - i02_offset, p.ne0_0mp, p.ne0_0L); const uint i00 = idx - i03_offset - i02_offset - i01*p.ne00; return i03*p.nb03 + i02*p.nb02 + i01*p.nb01 + i00*p.nb00; } uint dst_idx(uint idx) { - const uint i13 = idx / (p.ne12*p.ne11*p.ne10); + const uint i13 = fastdiv(idx, p.ne1_012mp, p.ne1_012L); const uint i13_offset = i13 * p.ne12*p.ne11*p.ne10; - const uint i12 = (idx - i13_offset) / (p.ne11*p.ne10); + const uint i12 = fastdiv(idx - i13_offset, p.ne1_01mp, p.ne1_01L); const uint i12_offset = i12*p.ne11*p.ne10; - const uint i11 = (idx - i13_offset - i12_offset) / p.ne10; + const uint i11 = fastdiv(idx - i13_offset - i12_offset, p.ne1_0mp, p.ne1_0L); const uint i10 = idx - i13_offset - i12_offset - i11*p.ne10; return i13*p.nb13 + i12*p.nb12 + i11*p.nb11 + i10*p.nb10; } diff --git a/tests/test-backend-ops.cpp b/tests/test-backend-ops.cpp index 87c92dadd9bcc..807d271c61efd 100644 --- a/tests/test-backend-ops.cpp +++ b/tests/test-backend-ops.cpp @@ -3862,6 +3862,8 @@ static std::vector> make_test_cases_perf() { test_cases.emplace_back(new test_bin_bcast(ggml_add, GGML_TYPE_F32, {4096, 1, 1, 1}, {1, 512, 1, 1})); test_cases.emplace_back(new test_cpy(GGML_TYPE_F32, GGML_TYPE_F16, {512, 3072, 1, 1})); + test_cases.emplace_back(new test_cpy(GGML_TYPE_F32, GGML_TYPE_F32, {8192, 512, 2, 1}, {0, 2, 1, 3})); + test_cases.emplace_back(new test_cpy(GGML_TYPE_F32, GGML_TYPE_F32, {3072, 512, 2, 1}, {0, 2, 1, 3})); test_cases.emplace_back(new test_soft_max(GGML_TYPE_F32, {4096, 4096, 5, 1}, false, 1.0f, 0.0f)); test_cases.emplace_back(new test_soft_max(GGML_TYPE_F32, {77, 4096, 5, 1}, false, 1.0f, 0.0f)); From f0fd6a85928e03c91b7df46555be0c4c0d202fd4 Mon Sep 17 00:00:00 2001 From: JFLFY2255 Date: Wed, 4 Dec 2024 17:42:50 +0800 Subject: [PATCH 140/245] llama: Support MiniCPM-1B (with & w/o longrope) (#10559) --- convert_hf_to_gguf.py | 55 +++++++----- gguf-py/gguf/constants.py | 9 +- include/llama.h | 3 +- src/llama.cpp | 175 +++++--------------------------------- 4 files changed, 60 insertions(+), 182 deletions(-) diff --git a/convert_hf_to_gguf.py b/convert_hf_to_gguf.py index b931049d11e2d..d8df5cc001951 100755 --- a/convert_hf_to_gguf.py +++ b/convert_hf_to_gguf.py @@ -1831,29 +1831,40 @@ class MiniCPMModel(Model): model_arch = gguf.MODEL_ARCH.MINICPM def set_gguf_parameters(self): - block_count = self.hparams["num_hidden_layers"] - self.gguf_writer.add_context_length(self.hparams["max_position_embeddings"]) - self.gguf_writer.add_embedding_length(self.hparams["hidden_size"]) - self.gguf_writer.add_block_count(block_count) - self.gguf_writer.add_feed_forward_length(self.hparams["intermediate_size"]) - self.gguf_writer.add_rope_dimension_count(self.hparams["hidden_size"] // self.hparams["num_attention_heads"]) - self.gguf_writer.add_head_count(self.hparams["num_attention_heads"]) - self.gguf_writer.add_head_count_kv(self.hparams["num_key_value_heads"]) - self.gguf_writer.add_layer_norm_rms_eps(self.hparams["rms_norm_eps"]) - self.gguf_writer.add_file_type(self.ftype) + super().set_gguf_parameters() + embedding_scale = float(self.hparams["scale_emb"]) + self.gguf_writer.add_embedding_scale(embedding_scale) + logger.info(f"gguf: (minicpm) embedding_scale = {embedding_scale}") + residual_scale = self.hparams["scale_depth"] / self.hparams["num_hidden_layers"] ** 0.5 + self.gguf_writer.add_residual_scale(residual_scale) + logger.info(f"gguf: (minicpm) residual_scale = {residual_scale}") + logit_scale = self.hparams["hidden_size"] / self.hparams["dim_model_base"] + self.gguf_writer.add_logit_scale(logit_scale) + logger.info(f"gguf: (minicpm) logit_scale = {logit_scale}") + if self.hparams.get("rope_scaling") is not None: + if self.hparams["rope_scaling"].get("type") == "longrope": + self.gguf_writer.add_rope_scaling_type(gguf.RopeScalingType.LONGROPE) + logger.info(f"gguf: (minicpm) rope_scaling_type = {gguf.RopeScalingType.LONGROPE}") - def set_vocab(self): - self._set_vocab_llama_hf() + def generate_extra_tensors(self) -> Iterable[tuple[str, Tensor]]: + rope_dims = self.hparams["hidden_size"] // self.hparams["num_attention_heads"] - def _reverse_hf_permute(self, weights: Tensor, n_head: int, n_kv_head: int | None = None) -> Tensor: - if n_kv_head is not None and n_head != n_kv_head: - n_head //= n_kv_head + rope_scaling = self.find_hparam(['rope_scaling'], True) + if rope_scaling is not None: + long_factors = rope_scaling.get('long_factor', None) + short_factors = rope_scaling.get('short_factor', None) - return ( - weights.reshape(n_head, 2, weights.shape[0] // n_head // 2, *weights.shape[1:]) - .swapaxes(1, 2) - .reshape(weights.shape) - ) + if long_factors is None or short_factors is None: + raise KeyError('Missing the required key rope_scaling.long_factor or rope_scaling_short_factor') + + if len(long_factors) != len(short_factors) or len(long_factors) != rope_dims / 2: + raise ValueError(f'The length of rope long and short factors must be {rope_dims / 2}') + + yield (self.format_tensor_name(gguf.MODEL_TENSOR.ROPE_FACTORS_LONG), torch.tensor(long_factors, dtype=torch.float32)) + yield (self.format_tensor_name(gguf.MODEL_TENSOR.ROPE_FACTORS_SHORT), torch.tensor(short_factors, dtype=torch.float32)) + + def set_vocab(self): + self._set_vocab_sentencepiece() def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]: del bid # unused @@ -1863,9 +1874,9 @@ def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iter # HF models permute some of the tensors, so we need to undo that if name.endswith(("q_proj.weight")): - data_torch = self._reverse_hf_permute(data_torch, n_head, n_head) + data_torch = LlamaModel.permute(data_torch, n_head, n_head) if name.endswith(("k_proj.weight")): - data_torch = self._reverse_hf_permute(data_torch, n_head, n_kv_head) + data_torch = LlamaModel.permute(data_torch, n_head, n_kv_head) return [(self.map_tensor_name(name), data_torch)] diff --git a/gguf-py/gguf/constants.py b/gguf-py/gguf/constants.py index 7df23371cc100..703199fcb3f68 100644 --- a/gguf-py/gguf/constants.py +++ b/gguf-py/gguf/constants.py @@ -896,6 +896,8 @@ class MODEL_TENSOR(IntEnum): MODEL_TENSOR.OUTPUT, MODEL_TENSOR.OUTPUT_NORM, MODEL_TENSOR.ROPE_FREQS, + MODEL_TENSOR.ROPE_FACTORS_LONG, + MODEL_TENSOR.ROPE_FACTORS_SHORT, MODEL_TENSOR.ATTN_NORM, MODEL_TENSOR.ATTN_Q, MODEL_TENSOR.ATTN_K, @@ -1388,9 +1390,10 @@ class TokenType(IntEnum): class RopeScalingType(Enum): - NONE = 'none' - LINEAR = 'linear' - YARN = 'yarn' + NONE = 'none' + LINEAR = 'linear' + YARN = 'yarn' + LONGROPE = 'longrope' class PoolingType(IntEnum): diff --git a/include/llama.h b/include/llama.h index e85f459fc460f..168c3fa1f6e3b 100644 --- a/include/llama.h +++ b/include/llama.h @@ -185,7 +185,8 @@ extern "C" { LLAMA_ROPE_SCALING_TYPE_NONE = 0, LLAMA_ROPE_SCALING_TYPE_LINEAR = 1, LLAMA_ROPE_SCALING_TYPE_YARN = 2, - LLAMA_ROPE_SCALING_TYPE_MAX_VALUE = LLAMA_ROPE_SCALING_TYPE_YARN, + LLAMA_ROPE_SCALING_TYPE_LONGROPE = 3, + LLAMA_ROPE_SCALING_TYPE_MAX_VALUE = LLAMA_ROPE_SCALING_TYPE_LONGROPE, }; enum llama_pooling_type { diff --git a/src/llama.cpp b/src/llama.cpp index 5681aea2ad762..f2dd1f47772d9 100644 --- a/src/llama.cpp +++ b/src/llama.cpp @@ -1040,6 +1040,8 @@ static const std::map> LLM_TENSOR_N { LLM_TENSOR_OUTPUT_NORM, "output_norm" }, { LLM_TENSOR_OUTPUT, "output" }, { LLM_TENSOR_ROPE_FREQS, "rope_freqs" }, + { LLM_TENSOR_ROPE_FACTORS_LONG, "rope_factors_long" }, + { LLM_TENSOR_ROPE_FACTORS_SHORT, "rope_factors_short" }, { LLM_TENSOR_ATTN_NORM, "blk.%d.attn_norm" }, { LLM_TENSOR_ATTN_Q, "blk.%d.attn_q" }, { LLM_TENSOR_ATTN_K, "blk.%d.attn_k" }, @@ -1687,9 +1689,10 @@ struct LLM_TN { // static const std::map LLAMA_ROPE_SCALING_TYPES = { - { LLAMA_ROPE_SCALING_TYPE_NONE, "none" }, - { LLAMA_ROPE_SCALING_TYPE_LINEAR, "linear" }, - { LLAMA_ROPE_SCALING_TYPE_YARN, "yarn" }, + { LLAMA_ROPE_SCALING_TYPE_NONE, "none" }, + { LLAMA_ROPE_SCALING_TYPE_LINEAR, "linear" }, + { LLAMA_ROPE_SCALING_TYPE_YARN, "yarn" }, + { LLAMA_ROPE_SCALING_TYPE_LONGROPE, "longrope" }, }; static llama_rope_scaling_type llama_rope_scaling_type_from_string(const std::string & name) { @@ -5584,8 +5587,12 @@ static void llm_load_hparams( case LLM_ARCH_MINICPM: { ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps); + ml.get_key(LLM_KV_EMBEDDING_SCALE, hparams.f_embedding_scale); + ml.get_key(LLM_KV_RESIDUAL_SCALE, hparams.f_residual_scale); + ml.get_key(LLM_KV_LOGIT_SCALE, hparams.f_logit_scale); switch (hparams.n_layer) { + case 52: model.type = e_model::MODEL_1B; break; case 40: model.type = e_model::MODEL_2B; break; default: model.type = e_model::MODEL_UNKNOWN; } @@ -7069,7 +7076,7 @@ static void llm_load_print_meta(llama_model_loader & ml, llama_model & model) { LLAMA_LOG_INFO("%s: n_ff_shexp = %d\n", __func__, hparams.n_ff_shexp); } - if (model.arch == LLM_ARCH_GRANITE || model.arch == LLM_ARCH_GRANITE_MOE) { + if (model.arch == LLM_ARCH_MINICPM || model.arch == LLM_ARCH_GRANITE || model.arch == LLM_ARCH_GRANITE_MOE) { LLAMA_LOG_INFO("%s: f_embedding_scale = %f\n", __func__, hparams.f_embedding_scale); LLAMA_LOG_INFO("%s: f_residual_scale = %f\n", __func__, hparams.f_residual_scale); LLAMA_LOG_INFO("%s: f_attention_scale = %f\n", __func__, hparams.f_attention_scale); @@ -7699,7 +7706,13 @@ static bool llm_load_tensors( layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0); - layer.rope_freqs = create_tensor(tn(LLM_TENSOR_ROPE_FREQS, "weight", i), {n_rot/2}, llama_model_loader::TENSOR_NOT_REQUIRED | (i != 0 ? llama_model_loader::TENSOR_DUPLICATED : 0)); + if (hparams.rope_scaling_type_train == LLAMA_ROPE_SCALING_TYPE_LONGROPE) { + layer.rope_long = create_tensor(tn(LLM_TENSOR_ROPE_FACTORS_LONG, "weight", i), {n_rot/2}, llama_model_loader::TENSOR_NOT_REQUIRED | (i != 0 ? llama_model_loader::TENSOR_DUPLICATED : 0)); + layer.rope_short = create_tensor(tn(LLM_TENSOR_ROPE_FACTORS_SHORT, "weight", i), {n_rot/2}, llama_model_loader::TENSOR_NOT_REQUIRED | (i != 0 ? llama_model_loader::TENSOR_DUPLICATED : 0)); + } + else { + layer.rope_freqs = create_tensor(tn(LLM_TENSOR_ROPE_FREQS, "weight", i), {n_rot/2}, llama_model_loader::TENSOR_NOT_REQUIRED | (i != 0 ? llama_model_loader::TENSOR_DUPLICATED : 0)); + } if (n_expert == 0) { layer.ffn_gate = create_tensor(tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff}, 0); @@ -13506,153 +13519,6 @@ struct llm_build_context { return gf; } - // ref: https://arxiv.org/abs/2203.03466 - // https://github.com/ggerganov/llama.cpp/issues/5276#issuecomment-1925774738 - // based on the original build_llama() function - struct ggml_cgraph * build_minicpm() { - struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, llama_model_max_nodes(model), false); - - const int64_t n_embd_head = hparams.n_embd_head_v; - GGML_ASSERT(n_embd_head == hparams.n_embd_head_k); - GGML_ASSERT(n_embd_head == hparams.n_rot); - - const int64_t n_embd = hparams.n_embd; - //TODO: if the model varies, these parameters need to be read from the model - const int64_t n_embd_base = 256; - const float scale_embd = 12.0f; - const float scale_depth = 1.4f; - - struct ggml_tensor * cur; - struct ggml_tensor * inpL; - - inpL = llm_build_inp_embd(ctx0, lctx, hparams, ubatch, model.tok_embd, cb); - - // scale the input embeddings - inpL = ggml_scale(ctx0, inpL, scale_embd); - cb(inpL, "inp_scaled", -1); - - // inp_pos - contains the positions - struct ggml_tensor * inp_pos = build_inp_pos(); - - // KQ_mask (mask for 1 head, it will be broadcasted to all heads) - struct ggml_tensor * KQ_mask = build_inp_KQ_mask(); - - for (int il = 0; il < n_layer; ++il) { - struct ggml_tensor * inpSA = inpL; - - // norm - cur = llm_build_norm(ctx0, inpL, hparams, - model.layers[il].attn_norm, NULL, - LLM_NORM_RMS, cb, il); - cb(cur, "attn_norm", il); - - // self-attention - { - // compute Q and K and RoPE them - struct ggml_tensor * Qcur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wq, cur); - cb(Qcur, "Qcur", il); - if (model.layers[il].bq) { - Qcur = ggml_add(ctx0, Qcur, model.layers[il].bq); - cb(Qcur, "Qcur", il); - } - - struct ggml_tensor * Kcur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wk, cur); - cb(Kcur, "Kcur", il); - if (model.layers[il].bk) { - Kcur = ggml_add(ctx0, Kcur, model.layers[il].bk); - cb(Kcur, "Kcur", il); - } - - struct ggml_tensor * Vcur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wv, cur); - cb(Vcur, "Vcur", il); - if (model.layers[il].bv) { - Vcur = ggml_add(ctx0, Vcur, model.layers[il].bv); - cb(Vcur, "Vcur", il); - } - - Qcur = ggml_rope_ext( - ctx0, ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens), inp_pos, nullptr, - n_rot, rope_type, n_ctx_orig, freq_base, freq_scale, - ext_factor, attn_factor, beta_fast, beta_slow - ); - cb(Qcur, "Qcur", il); - - Kcur = ggml_rope_ext( - ctx0, ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens), inp_pos, nullptr, - n_rot, rope_type, n_ctx_orig, freq_base, freq_scale, - ext_factor, attn_factor, beta_fast, beta_slow - ); - cb(Kcur, "Kcur", il); - - cur = llm_build_kv(ctx0, lctx, kv_self, gf, - model.layers[il].wo, model.layers[il].bo, - Kcur, Vcur, Qcur, KQ_mask, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il); - } - - if (il == n_layer - 1) { - // skip computing output for unused tokens - struct ggml_tensor * inp_out_ids = build_inp_out_ids(); - cur = ggml_get_rows(ctx0, cur, inp_out_ids); - inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids); - } - - // scale_res - scale the hidden states for residual connection - const float scale_res = scale_depth/sqrtf(float(n_layer)); - cur = ggml_scale(ctx0, cur, scale_res); - cb(cur, "hidden_scaled", -1); - - struct ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpSA); - cb(ffn_inp, "ffn_inp", il); - - // feed-forward network - { - cur = llm_build_norm(ctx0, ffn_inp, hparams, - model.layers[il].ffn_norm, NULL, - LLM_NORM_RMS, cb, il); - cb(cur, "ffn_norm", il); - - cur = llm_build_ffn(ctx0, lctx, cur, - model.layers[il].ffn_up, NULL, NULL, - model.layers[il].ffn_gate, NULL, NULL, - model.layers[il].ffn_down, NULL, NULL, - NULL, - LLM_FFN_SILU, LLM_FFN_PAR, cb, il); - cb(cur, "ffn_out", il); - } - - // scale the hidden states for residual connection - cur = ggml_scale(ctx0, cur, scale_res); - cb(cur, "hidden_scaled_ffn", -1); - - cur = ggml_add(ctx0, cur, ffn_inp); - cur = lctx.cvec.apply_to(ctx0, cur, il); - cb(cur, "l_out", il); - - // input for next layer - inpL = cur; - } - - cur = inpL; - - cur = llm_build_norm(ctx0, cur, hparams, - model.output_norm, NULL, - LLM_NORM_RMS, cb, -1); - cb(cur, "result_norm", -1); - - // lm_head scaling - const float scale_lmhead = float(n_embd_base)/float(n_embd); - cur = ggml_scale(ctx0, cur, scale_lmhead); - cb(cur, "lmhead_scaling", -1); - - // lm_head - cur = llm_build_lora_mm(lctx, ctx0, model.output, cur); - cb(cur, "result_output", -1); - - ggml_build_forward_expand(gf, cur); - - return gf; - } - struct ggml_cgraph * build_minicpm3() { struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, llama_model_max_nodes(model), false); @@ -16751,6 +16617,7 @@ static struct ggml_cgraph * llama_build_graph( switch (model.arch) { case LLM_ARCH_LLAMA: + case LLM_ARCH_MINICPM: case LLM_ARCH_GRANITE: case LLM_ARCH_GRANITE_MOE: { @@ -16834,10 +16701,6 @@ static struct ggml_cgraph * llama_build_graph( { result = llm.build_internlm2(); } break; - case LLM_ARCH_MINICPM: - { - result = llm.build_minicpm(); - } break; case LLM_ARCH_MINICPM3: { result = llm.build_minicpm3(); From a9bb5b856191a103f84b301c614f8006c040175b Mon Sep 17 00:00:00 2001 From: ltoniazzi <61414566+ltoniazzi@users.noreply.github.com> Date: Wed, 4 Dec 2024 09:45:48 +0000 Subject: [PATCH 141/245] Fix HF repo commit to clone lora test models (#10649) --- tests/test-lora-conversion-inference.sh | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/tests/test-lora-conversion-inference.sh b/tests/test-lora-conversion-inference.sh index fe90ce0d1b801..fb308a9ffb76f 100755 --- a/tests/test-lora-conversion-inference.sh +++ b/tests/test-lora-conversion-inference.sh @@ -10,11 +10,16 @@ declare -a params=( MODELS_REPO=lora-tests MODELS_REPO_URL=https://huggingface.co/ggml-org/$MODELS_REPO +COMMIT=c26d5fb85b4070a9e9c4e65d132c783b98086890 # Clone the Hugging Face repository if the directory does not exist if [ ! -d "$MODELS_REPO" ]; then echo "Cloning the Hugging Face repository..." git clone $MODELS_REPO_URL --depth 1 + cd $MODELS_REPO + git fetch --depth=1 origin $COMMIT + git reset --hard $COMMIT + cd - else echo "Repository already exists. Skipping clone." fi From 9a840f863aa95b916401aa45671a3bc189be9623 Mon Sep 17 00:00:00 2001 From: Diego Devesa Date: Wed, 4 Dec 2024 14:40:44 +0100 Subject: [PATCH 142/245] ggml-cpu : fix HWCAP2_I8MM value (#10646) --- ggml/src/ggml-cpu/ggml-cpu.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/ggml/src/ggml-cpu/ggml-cpu.c b/ggml/src/ggml-cpu/ggml-cpu.c index 23ae2e10cd520..e4a9ca013eb0e 100644 --- a/ggml/src/ggml-cpu/ggml-cpu.c +++ b/ggml/src/ggml-cpu/ggml-cpu.c @@ -2425,7 +2425,7 @@ bool ggml_is_numa(void) { #endif #if !defined(HWCAP2_I8MM) -#define HWCAP2_I8MM 0 +#define HWCAP2_I8MM (1 << 13) #endif static void ggml_init_arm_arch_features(void) { From 545316b6bad3f5bc9c1048881e3392fc3a869993 Mon Sep 17 00:00:00 2001 From: Diego Devesa Date: Wed, 4 Dec 2024 14:45:40 +0100 Subject: [PATCH 143/245] ggml : add predefined list of CPU backend variants to build (#10626) * ggml : add predefined list of CPU backend variants to build * update CPU dockerfiles --- .devops/full.Dockerfile | 31 +- .devops/llama-cli.Dockerfile | 16 +- .devops/llama-server.Dockerfile | 22 +- ggml/CMakeLists.txt | 49 ++- ggml/src/CMakeLists.txt | 35 ++ ggml/src/ggml-backend-reg.cpp | 32 +- ggml/src/ggml-cpu/CMakeLists.txt | 563 +++++++++++++++------------- ggml/src/ggml-cpu/cpu-feats-x86.cpp | 83 ++-- ggml/src/ggml-cpu/ggml-cpu.c | 2 +- ggml/src/ggml-cpu/ggml-cpu.cpp | 10 +- scripts/build-cpu.sh | 12 - 11 files changed, 483 insertions(+), 372 deletions(-) delete mode 100755 scripts/build-cpu.sh diff --git a/.devops/full.Dockerfile b/.devops/full.Dockerfile index 2a06f82b738ae..d93c0be6a70c0 100644 --- a/.devops/full.Dockerfile +++ b/.devops/full.Dockerfile @@ -3,23 +3,36 @@ ARG UBUNTU_VERSION=22.04 FROM ubuntu:$UBUNTU_VERSION AS build RUN apt-get update && \ - apt-get install -y build-essential python3 python3-pip git libcurl4-openssl-dev libgomp1 + apt-get install -y build-essential git cmake libcurl4-openssl-dev + +WORKDIR /app + +COPY . . -COPY requirements.txt requirements.txt -COPY requirements requirements +RUN cmake -S . -B build -DGGML_BACKEND_DL=ON -DGGML_NATIVE=OFF -DGGML_CPU_ALL_VARIANTS=ON -DLLAMA_CURL=ON -DCMAKE_BUILD_TYPE=Release && \ + cmake --build build -j $(nproc) && \ + mkdir -p /app/lib && \ + find build -name "*.so" -exec cp {} /app/lib/ \; -RUN pip install --upgrade pip setuptools wheel \ - && pip install -r requirements.txt +FROM ubuntu:$UBUNTU_VERSION as runtime WORKDIR /app -COPY . . +RUN apt-get update && \ + apt-get install -y build-essential python3 python3-pip git libcurl4-openssl-dev libgomp1 -ENV LLAMA_CURL=1 +COPY requirements.txt /app/requirements.txt +COPY requirements /app/requirements +COPY .devops/tools.sh /app/tools.sh +RUN pip install --upgrade pip setuptools wheel && \ + pip install -r /app/requirements.txt -RUN make -j$(nproc) +COPY --from=build /app/build/bin/ /app/ +COPY --from=build /app/lib/ /app/ +COPY --from=build /app/convert_hf_to_gguf.py /app/ +COPY --from=build /app/gguf-py /app/gguf-py ENV LC_ALL=C.utf8 -ENTRYPOINT ["/app/.devops/tools.sh"] +ENTRYPOINT ["/app/tools.sh"] diff --git a/.devops/llama-cli.Dockerfile b/.devops/llama-cli.Dockerfile index 7f741aa46ecf0..be234d55dce5c 100644 --- a/.devops/llama-cli.Dockerfile +++ b/.devops/llama-cli.Dockerfile @@ -3,21 +3,27 @@ ARG UBUNTU_VERSION=22.04 FROM ubuntu:$UBUNTU_VERSION AS build RUN apt-get update && \ - apt-get install -y build-essential git + apt-get install -y build-essential git cmake libcurl4-openssl-dev WORKDIR /app COPY . . -RUN make -j$(nproc) llama-cli +RUN cmake -S . -B build -DGGML_BACKEND_DL=ON -DGGML_NATIVE=OFF -DGGML_CPU_ALL_VARIANTS=ON -DLLAMA_CURL=ON -DCMAKE_BUILD_TYPE=Release && \ + cmake --build build -j $(nproc) && \ + mkdir -p /app/lib && \ + find build -name "*.so" -exec cp {} /app/lib/ \; FROM ubuntu:$UBUNTU_VERSION AS runtime +WORKDIR /app + RUN apt-get update && \ - apt-get install -y libgomp1 + apt-get install -y libcurl4-openssl-dev libgomp1 curl -COPY --from=build /app/llama-cli /llama-cli +COPY --from=build /app/build/bin/llama-cli /app/ +COPY --from=build /app/lib/ /app/ ENV LC_ALL=C.utf8 -ENTRYPOINT [ "/llama-cli" ] +ENTRYPOINT [ "/app/llama-cli" ] diff --git a/.devops/llama-server.Dockerfile b/.devops/llama-server.Dockerfile index 7110dda9e225c..72ccde2feaeb9 100644 --- a/.devops/llama-server.Dockerfile +++ b/.devops/llama-server.Dockerfile @@ -9,28 +9,20 @@ WORKDIR /app COPY . . - -RUN \ - # Build multiple versions of the CPU backend - scripts/build-cpu.sh avx -DGGML_AVX=ON -DGGML_AVX2=OFF && \ - scripts/build-cpu.sh avx2 -DGGML_AVX=ON -DGGML_AVX2=ON && \ - scripts/build-cpu.sh avx512 -DGGML_AVX=ON -DGGML_AVX2=ON -DGGML_AVX512=ON && \ - scripts/build-cpu.sh amx -DGGML_AVX=ON -DGGML_AVX2=ON -DGGML_AVX512=ON -DGGML_AVX_VNNI=ON -DGGML_AVX512_VNNI=ON -DGGML_AMX_TILE=ON -DGGML_AMX_INT8=ON && \ - # Build llama-server - cmake -S . -B build -DGGML_BACKEND_DL=ON -DGGML_NATIVE=OFF -DLLAMA_CURL=ON -DCMAKE_BUILD_TYPE=Release && \ - cmake --build build --target llama-server -j $(nproc) && \ - # Copy the built libraries to /app/lib +RUN cmake -S . -B build -DGGML_BACKEND_DL=ON -DGGML_NATIVE=OFF -DGGML_CPU_ALL_VARIANTS=ON -DLLAMA_CURL=ON -DCMAKE_BUILD_TYPE=Release && \ + cmake --build build -j $(nproc) && \ mkdir -p /app/lib && \ - mv libggml-cpu* /app/lib/ && \ find build -name "*.so" -exec cp {} /app/lib/ \; FROM ubuntu:$UBUNTU_VERSION AS runtime +WORKDIR /app + RUN apt-get update && \ apt-get install -y libcurl4-openssl-dev libgomp1 curl -COPY --from=build /app/build/bin/llama-server /llama-server -COPY --from=build /app/lib/ / +COPY --from=build /app/build/bin/llama-server /app/ +COPY --from=build /app/lib/ /app/ ENV LC_ALL=C.utf8 # Must be set to 0.0.0.0 so it can listen to requests from host machine @@ -38,4 +30,4 @@ ENV LLAMA_ARG_HOST=0.0.0.0 HEALTHCHECK CMD [ "curl", "-f", "http://localhost:8080/health" ] -ENTRYPOINT [ "/llama-server" ] +ENTRYPOINT [ "/app/llama-server" ] diff --git a/ggml/CMakeLists.txt b/ggml/CMakeLists.txt index 06d371e097f24..1b3d9896781f1 100644 --- a/ggml/CMakeLists.txt +++ b/ggml/CMakeLists.txt @@ -92,30 +92,33 @@ else() set(INS_ENB ON) endif() -option(GGML_CPU_HBM "ggml: use memkind for CPU HBM" OFF) -option(GGML_CPU_AARCH64 "ggml: use runtime weight conversion of Q4_0 to Q4_X_X" ON) - -option(GGML_AVX "ggml: enable AVX" ${INS_ENB}) -option(GGML_AVX_VNNI "ggml: enable AVX-VNNI" OFF) -option(GGML_AVX2 "ggml: enable AVX2" ${INS_ENB}) -option(GGML_AVX512 "ggml: enable AVX512" OFF) -option(GGML_AVX512_VBMI "ggml: enable AVX512-VBMI" OFF) -option(GGML_AVX512_VNNI "ggml: enable AVX512-VNNI" OFF) -option(GGML_AVX512_BF16 "ggml: enable AVX512-BF16" OFF) -option(GGML_AMX_TILE "ggml: enable AMX-TILE" OFF) -option(GGML_AMX_INT8 "ggml: enable AMX-INT8" OFF) -option(GGML_AMX_BF16 "ggml: enable AMX-BF16" OFF) -option(GGML_FMA "ggml: enable FMA" ${INS_ENB}) +option(GGML_CPU_HBM "ggml: use memkind for CPU HBM" OFF) +option(GGML_CPU_AARCH64 "ggml: use runtime weight conversion of Q4_0 to Q4_X_X" ON) +option(GGML_AVX "ggml: enable AVX" ${INS_ENB}) +option(GGML_AVX_VNNI "ggml: enable AVX-VNNI" OFF) +option(GGML_AVX2 "ggml: enable AVX2" ${INS_ENB}) +option(GGML_AVX512 "ggml: enable AVX512F" OFF) +option(GGML_AVX512_VBMI "ggml: enable AVX512-VBMI" OFF) +option(GGML_AVX512_VNNI "ggml: enable AVX512-VNNI" OFF) +option(GGML_AVX512_BF16 "ggml: enable AVX512-BF16" OFF) if (NOT MSVC) - option(GGML_F16C "ggml: enable F16C" ${INS_ENB}) # in MSVC F16C is implied with AVX2/AVX512 + # in MSVC F16C and FMA is implied with AVX2/AVX512 + option(GGML_FMA "ggml: enable FMA" ${INS_ENB}) + option(GGML_F16C "ggml: enable F16C" ${INS_ENB}) + # MSVC does not seem to support AMX + option(GGML_AMX_TILE "ggml: enable AMX-TILE" OFF) + option(GGML_AMX_INT8 "ggml: enable AMX-INT8" OFF) + option(GGML_AMX_BF16 "ggml: enable AMX-BF16" OFF) endif() -option(GGML_LASX "ggml: enable lasx" ON) -option(GGML_LSX "ggml: enable lsx" ON) -option(GGML_RVV "ggml: enable rvv" ON) -option(GGML_SVE "ggml: enable SVE" OFF) +option(GGML_LASX "ggml: enable lasx" ON) +option(GGML_LSX "ggml: enable lsx" ON) +option(GGML_RVV "ggml: enable rvv" ON) +option(GGML_SVE "ggml: enable SVE" OFF) +option(GGML_CPU_ALL_VARIANTS "ggml: build all variants of the CPU backend (requires GGML_BACKEND_DL)" OFF) + if (WIN32) - set(GGML_WIN_VER "0x602" CACHE STRING "ggml: Windows Version") + set(GGML_WIN_VER "0x602" CACHE STRING "ggml: Windows version") endif() # ggml core @@ -180,11 +183,7 @@ option(GGML_BUILD_EXAMPLES "ggml: build examples" ${GGML_STANDALONE}) set(CMAKE_C_STANDARD 11) set(CMAKE_C_STANDARD_REQUIRED true) -if (GGML_SYCL) - set(CMAKE_CXX_STANDARD 17) -else() - set(CMAKE_CXX_STANDARD 11) -endif() +set(CMAKE_CXX_STANDARD 17) set(CMAKE_CXX_STANDARD_REQUIRED true) set(THREADS_PREFER_PTHREAD_FLAG ON) diff --git a/ggml/src/CMakeLists.txt b/ggml/src/CMakeLists.txt index 19289f32beaac..f07533fdb316a 100644 --- a/ggml/src/CMakeLists.txt +++ b/ggml/src/CMakeLists.txt @@ -269,7 +269,42 @@ function(ggml_add_backend backend) endif() endfunction() +function(ggml_add_cpu_backend_variant tag_name) + set(GGML_CPU_TAG_NAME ${tag_name}) + # other: OPENMP LLAMAFILE CPU_HBM + foreach (feat NATIVE + AVX AVX2 AVX_VNNI FMA F16C + AVX512 AVX512_VBMI AVX512_VNNI AVX512_BF16 + AMX_TILE AMX_INT8 AMX_BF16) + set(GGML_${feat} OFF) + endforeach() + + foreach (feat ${ARGN}) + set(GGML_${feat} ON) + endforeach() + + ggml_add_cpu_backend_variant_impl(${tag_name}) +endfunction() + ggml_add_backend(CPU) + +if (GGML_CPU_ALL_VARIANTS) + if (NOT GGML_BACKEND_DL) + message(FATAL_ERROR "GGML_CPU_ALL_VARIANTS requires GGML_BACKEND_DL") + endif() + ggml_add_cpu_backend_variant(sandybridge AVX) + ggml_add_cpu_backend_variant(haswell AVX F16C AVX2 FMA) + ggml_add_cpu_backend_variant(skylakex AVX F16C AVX2 FMA AVX512) + ggml_add_cpu_backend_variant(icelake AVX F16C AVX2 FMA AVX512 AVX512_VBMI AVX512_VNNI) + if (NOT MSVC) + # MSVC doesn't support AVX-VNNI or AMX + ggml_add_cpu_backend_variant(alderlake AVX F16C AVX2 FMA AVX_VNNI) + ggml_add_cpu_backend_variant(sapphirerapids AVX F16C AVX2 FMA AVX512 AVX512_VBMI AVX512_VNNI AVX512_BF16 AMX_TILE AMX_INT8) + endif() +else () + ggml_add_cpu_backend_variant_impl("") +endif() + ggml_add_backend(BLAS) ggml_add_backend(CANN) ggml_add_backend(CUDA) diff --git a/ggml/src/ggml-backend-reg.cpp b/ggml/src/ggml-backend-reg.cpp index 2c4bf11b0233e..5cb0fb9d159db 100644 --- a/ggml/src/ggml-backend-reg.cpp +++ b/ggml/src/ggml-backend-reg.cpp @@ -483,6 +483,10 @@ static ggml_backend_reg_t ggml_backend_load_best(const char * name, bool silent) best_score = s; best_path = entry.path().string(); } + } else { + if (!silent) { + GGML_LOG_INFO("%s: failed to find ggml_backend_score in %s\n", __func__, entry.path().string().c_str()); + } } } } @@ -505,15 +509,21 @@ static ggml_backend_reg_t ggml_backend_load_best(const char * name, bool silent) } void ggml_backend_load_all() { - ggml_backend_load_best("blas", true); - ggml_backend_load_best("cann", true); - ggml_backend_load_best("cuda", true); - ggml_backend_load_best("hip", true); - ggml_backend_load_best("kompute", true); - ggml_backend_load_best("metal", true); - ggml_backend_load_best("rpc", true); - ggml_backend_load_best("sycl", true); - ggml_backend_load_best("vulkan", true); - ggml_backend_load_best("musa", true); - ggml_backend_load_best("cpu", true); +#ifdef NDEBUG + bool silent = true; +#else + bool silent = false; +#endif + + ggml_backend_load_best("blas", silent); + ggml_backend_load_best("cann", silent); + ggml_backend_load_best("cuda", silent); + ggml_backend_load_best("hip", silent); + ggml_backend_load_best("kompute", silent); + ggml_backend_load_best("metal", silent); + ggml_backend_load_best("rpc", silent); + ggml_backend_load_best("sycl", silent); + ggml_backend_load_best("vulkan", silent); + ggml_backend_load_best("musa", silent); + ggml_backend_load_best("cpu", silent); } diff --git a/ggml/src/ggml-cpu/CMakeLists.txt b/ggml/src/ggml-cpu/CMakeLists.txt index 5df63884cf84c..bc326c0593024 100644 --- a/ggml/src/ggml-cpu/CMakeLists.txt +++ b/ggml/src/ggml-cpu/CMakeLists.txt @@ -1,319 +1,354 @@ -ggml_add_backend_library(ggml-cpu) - -list (APPEND GGML_CPU_SOURCES - ggml-cpu.c - ggml-cpu.cpp - ggml-cpu-aarch64.c - ggml-cpu-aarch64.h - ggml-cpu-quants.c - ggml-cpu-quants.h - amx/amx.cpp - amx/amx.h - amx/mmq.cpp - amx/mmq.h - ggml-cpu-impl.h - ) - -target_compile_features(ggml-cpu PRIVATE c_std_11 cxx_std_17) -target_include_directories(ggml-cpu PRIVATE .) - -if (APPLE AND GGML_ACCELERATE) - find_library(ACCELERATE_FRAMEWORK Accelerate) - if (ACCELERATE_FRAMEWORK) - message(STATUS "Accelerate framework found") - - target_compile_definitions(ggml-cpu PRIVATE GGML_USE_ACCELERATE) - target_compile_definitions(ggml-cpu PRIVATE ACCELERATE_NEW_LAPACK) - target_compile_definitions(ggml-cpu PRIVATE ACCELERATE_LAPACK_ILP64) - - target_link_libraries(ggml-cpu PRIVATE ${ACCELERATE_FRAMEWORK}) +function(ggml_add_cpu_backend_variant_impl tag_name) + if (tag_name) + set(GGML_CPU_NAME ggml-cpu-${tag_name}) else() - message(WARNING "Accelerate framework not found") + set(GGML_CPU_NAME ggml-cpu) endif() -endif() -if (GGML_OPENMP) - find_package(OpenMP) - if (OpenMP_FOUND) - message(STATUS "OpenMP found") + ggml_add_backend_library(${GGML_CPU_NAME}) + + list (APPEND GGML_CPU_SOURCES + ggml-cpu/ggml-cpu.c + ggml-cpu/ggml-cpu.cpp + ggml-cpu/ggml-cpu-aarch64.c + ggml-cpu/ggml-cpu-aarch64.h + ggml-cpu/ggml-cpu-quants.c + ggml-cpu/ggml-cpu-quants.h + ggml-cpu/amx/amx.cpp + ggml-cpu/amx/amx.h + ggml-cpu/amx/mmq.cpp + ggml-cpu/amx/mmq.h + ggml-cpu/ggml-cpu-impl.h + ) + + target_compile_features(${GGML_CPU_NAME} PRIVATE c_std_11 cxx_std_17) + target_include_directories(${GGML_CPU_NAME} PRIVATE . ggml-cpu) + + if (APPLE AND GGML_ACCELERATE) + find_library(ACCELERATE_FRAMEWORK Accelerate) + if (ACCELERATE_FRAMEWORK) + message(STATUS "Accelerate framework found") + + target_compile_definitions(${GGML_CPU_NAME} PRIVATE GGML_USE_ACCELERATE) + target_compile_definitions(${GGML_CPU_NAME} PRIVATE ACCELERATE_NEW_LAPACK) + target_compile_definitions(${GGML_CPU_NAME} PRIVATE ACCELERATE_LAPACK_ILP64) + + target_link_libraries(${GGML_CPU_NAME} PRIVATE ${ACCELERATE_FRAMEWORK}) + else() + message(WARNING "Accelerate framework not found") + endif() + endif() - target_compile_definitions(ggml-cpu PRIVATE GGML_USE_OPENMP) + if (GGML_OPENMP) + find_package(OpenMP) + if (OpenMP_FOUND) + target_compile_definitions(${GGML_CPU_NAME} PRIVATE GGML_USE_OPENMP) - target_link_libraries(ggml-cpu PRIVATE OpenMP::OpenMP_C OpenMP::OpenMP_CXX) - else() - message(WARNING "OpenMP not found") + target_link_libraries(${GGML_CPU_NAME} PRIVATE OpenMP::OpenMP_C OpenMP::OpenMP_CXX) + else() + message(WARNING "OpenMP not found") + endif() endif() -endif() - -if (GGML_LLAMAFILE) - message(STATUS "Using llamafile") - target_compile_definitions(ggml-cpu PRIVATE GGML_USE_LLAMAFILE) + if (GGML_LLAMAFILE) + target_compile_definitions(${GGML_CPU_NAME} PRIVATE GGML_USE_LLAMAFILE) - list(APPEND GGML_CPU_SOURCES - llamafile/sgemm.cpp - llamafile/sgemm.h) -endif() + list(APPEND GGML_CPU_SOURCES + ggml-cpu/llamafile/sgemm.cpp + ggml-cpu/llamafile/sgemm.h) + endif() -if (GGML_CPU_HBM) - find_library(memkind memkind REQUIRED) + if (GGML_CPU_HBM) + find_library(memkind memkind REQUIRED) - message(STATUS "Using memkind for CPU HBM") + message(STATUS "Using memkind for CPU HBM") - target_compile_definitions(ggml-cpu PRIVATE GGML_USE_CPU_HBM) + target_compile_definitions(${GGML_CPU_NAME} PRIVATE GGML_USE_CPU_HBM) - target_link_libraries(ggml-cpu PUBLIC memkind) -endif() + target_link_libraries(${GGML_CPU_NAME} PUBLIC memkind) + endif() -if (CMAKE_OSX_ARCHITECTURES STREQUAL "arm64" OR - CMAKE_GENERATOR_PLATFORM_LWR STREQUAL "arm64" OR - (NOT CMAKE_OSX_ARCHITECTURES AND - NOT CMAKE_GENERATOR_PLATFORM_LWR AND - CMAKE_SYSTEM_PROCESSOR MATCHES "^(aarch64|arm.*|ARM64)$")) + if (CMAKE_OSX_ARCHITECTURES STREQUAL "arm64" OR + CMAKE_GENERATOR_PLATFORM_LWR STREQUAL "arm64" OR + (NOT CMAKE_OSX_ARCHITECTURES AND + NOT CMAKE_GENERATOR_PLATFORM_LWR AND + CMAKE_SYSTEM_PROCESSOR MATCHES "^(aarch64|arm.*|ARM64)$")) - message(STATUS "ARM detected") + message(STATUS "ARM detected") - if (MSVC) - list(APPEND ARCH_DEFINITIONS __aarch64__) # MSVC defines _M_ARM64 instead - list(APPEND ARCH_DEFINITIONS __ARM_NEON) - list(APPEND ARCH_DEFINITIONS __ARM_FEATURE_FMA) + if (MSVC) + list(APPEND ARCH_DEFINITIONS __aarch64__) # MSVC defines _M_ARM64 instead + list(APPEND ARCH_DEFINITIONS __ARM_NEON) + list(APPEND ARCH_DEFINITIONS __ARM_FEATURE_FMA) - set(CMAKE_REQUIRED_FLAGS_PREV ${CMAKE_REQUIRED_FLAGS}) - string(JOIN " " CMAKE_REQUIRED_FLAGS ${CMAKE_REQUIRED_FLAGS} "/arch:armv8.2") + set(CMAKE_REQUIRED_FLAGS_PREV ${CMAKE_REQUIRED_FLAGS}) + string(JOIN " " CMAKE_REQUIRED_FLAGS ${CMAKE_REQUIRED_FLAGS} "/arch:armv8.2") - check_cxx_source_compiles("#include \nint main() { int8x16_t _a, _b; int32x4_t _s = vdotq_s32(_s, _a, _b); return 0; }" GGML_COMPILER_SUPPORT_DOTPROD) - if (GGML_COMPILER_SUPPORT_DOTPROD) - list(APPEND ARCH_DEFINITIONS __ARM_FEATURE_DOTPROD) + check_cxx_source_compiles("#include \nint main() { int8x16_t _a, _b; int32x4_t _s = vdotq_s32(_s, _a, _b); return 0; }" GGML_COMPILER_SUPPORT_DOTPROD) + if (GGML_COMPILER_SUPPORT_DOTPROD) + list(APPEND ARCH_DEFINITIONS __ARM_FEATURE_DOTPROD) - message(STATUS "ARM feature DOTPROD enabled") - endif () + message(STATUS "ARM feature DOTPROD enabled") + endif () - check_cxx_source_compiles("#include \nint main() { int8x16_t _a, _b; int32x4_t _s = vmmlaq_f32(_s, _a, _b); return 0; }" GGML_COMPILER_SUPPORT_MATMUL_INT8) + check_cxx_source_compiles("#include \nint main() { int8x16_t _a, _b; int32x4_t _s = vmmlaq_f32(_s, _a, _b); return 0; }" GGML_COMPILER_SUPPORT_MATMUL_INT8) - if (GGML_COMPILER_SUPPORT_MATMUL_INT8) - list(APPEND ARCH_DEFINITIONS __ARM_FEATURE_MATMUL_INT8) + if (GGML_COMPILER_SUPPORT_MATMUL_INT8) + list(APPEND ARCH_DEFINITIONS __ARM_FEATURE_MATMUL_INT8) - message(STATUS "ARM feature MATMUL_INT8 enabled") - endif () + message(STATUS "ARM feature MATMUL_INT8 enabled") + endif () - check_cxx_source_compiles("#include \nint main() { float16_t _a; float16x8_t _s = vdupq_n_f16(_a); return 0; }" GGML_COMPILER_SUPPORT_FP16_VECTOR_ARITHMETIC) - if (GGML_COMPILER_SUPPORT_FP16_VECTOR_ARITHMETIC) - list(APPEND ARCH_DEFINITIONS __ARM_FEATURE_FP16_VECTOR_ARITHMETIC) + check_cxx_source_compiles("#include \nint main() { float16_t _a; float16x8_t _s = vdupq_n_f16(_a); return 0; }" GGML_COMPILER_SUPPORT_FP16_VECTOR_ARITHMETIC) + if (GGML_COMPILER_SUPPORT_FP16_VECTOR_ARITHMETIC) + list(APPEND ARCH_DEFINITIONS __ARM_FEATURE_FP16_VECTOR_ARITHMETIC) - message(STATUS "ARM feature FP16_VECTOR_ARITHMETIC enabled") - endif () + message(STATUS "ARM feature FP16_VECTOR_ARITHMETIC enabled") + endif () - set(CMAKE_REQUIRED_FLAGS ${CMAKE_REQUIRED_FLAGS_PREV}) - elseif (APPLE) - if (GGML_NATIVE) - set(USER_PROVIDED_MARCH FALSE) - foreach(flag_var IN ITEMS CMAKE_C_FLAGS CMAKE_CXX_FLAGS CMAKE_REQUIRED_FLAGS) - if ("${${flag_var}}" MATCHES "-march=[a-zA-Z0-9+._-]+") - set(USER_PROVIDED_MARCH TRUE) - break() - endif() - endforeach() + set(CMAKE_REQUIRED_FLAGS ${CMAKE_REQUIRED_FLAGS_PREV}) + elseif (APPLE) + if (GGML_NATIVE) + set(USER_PROVIDED_MARCH FALSE) + foreach(flag_var IN ITEMS CMAKE_C_FLAGS CMAKE_CXX_FLAGS CMAKE_REQUIRED_FLAGS) + if ("${${flag_var}}" MATCHES "-march=[a-zA-Z0-9+._-]+") + set(USER_PROVIDED_MARCH TRUE) + break() + endif() + endforeach() - if (NOT USER_PROVIDED_MARCH) - set(MARCH_FLAGS "-march=armv8.2a") + if (NOT USER_PROVIDED_MARCH) + set(MARCH_FLAGS "-march=armv8.2a") - check_cxx_source_compiles("#include \nint main() { int8x16_t _a, _b; int32x4_t _s = vdotq_s32(_s, _a, _b); return 0; }" GGML_COMPILER_SUPPORT_DOTPROD) - if (GGML_COMPILER_SUPPORT_DOTPROD) - set(MARCH_FLAGS "${MARCH_FLAGS}+dotprod") - list(APPEND ARCH_DEFINITIONS __ARM_FEATURE_DOTPROD) + check_cxx_source_compiles("#include \nint main() { int8x16_t _a, _b; int32x4_t _s = vdotq_s32(_s, _a, _b); return 0; }" GGML_COMPILER_SUPPORT_DOTPROD) + if (GGML_COMPILER_SUPPORT_DOTPROD) + set(MARCH_FLAGS "${MARCH_FLAGS}+dotprod") + list(APPEND ARCH_DEFINITIONS __ARM_FEATURE_DOTPROD) - message(STATUS "ARM feature DOTPROD enabled") - endif () + message(STATUS "ARM feature DOTPROD enabled") + endif () - set(TEST_I8MM_FLAGS "-march=armv8.2a+i8mm") + set(TEST_I8MM_FLAGS "-march=armv8.2a+i8mm") - set(CMAKE_REQUIRED_FLAGS_SAVE ${CMAKE_REQUIRED_FLAGS}) - set(CMAKE_REQUIRED_FLAGS "${CMAKE_REQUIRED_FLAGS} ${TEST_I8MM_FLAGS}") + set(CMAKE_REQUIRED_FLAGS_SAVE ${CMAKE_REQUIRED_FLAGS}) + set(CMAKE_REQUIRED_FLAGS "${CMAKE_REQUIRED_FLAGS} ${TEST_I8MM_FLAGS}") - check_cxx_source_compiles("#include \nint main() { int8x16_t _a, _b; int32x4_t _s = vmmlaq_s32(_s, _a, _b); return 0; }" GGML_COMPILER_SUPPORT_MATMUL_INT8) - if (GGML_COMPILER_SUPPORT_MATMUL_INT8) - set(MARCH_FLAGS "${MARCH_FLAGS}+i8mm") - list(APPEND ARCH_DEFINITIONS __ARM_FEATURE_MATMUL_INT8) + check_cxx_source_compiles("#include \nint main() { int8x16_t _a, _b; int32x4_t _s = vmmlaq_s32(_s, _a, _b); return 0; }" GGML_COMPILER_SUPPORT_MATMUL_INT8) + if (GGML_COMPILER_SUPPORT_MATMUL_INT8) + set(MARCH_FLAGS "${MARCH_FLAGS}+i8mm") + list(APPEND ARCH_DEFINITIONS __ARM_FEATURE_MATMUL_INT8) - message(STATUS "ARM feature MATMUL_INT8 enabled") - endif () + message(STATUS "ARM feature MATMUL_INT8 enabled") + endif () - set(CMAKE_REQUIRED_FLAGS ${CMAKE_REQUIRED_FLAGS_SAVE}) + set(CMAKE_REQUIRED_FLAGS ${CMAKE_REQUIRED_FLAGS_SAVE}) - list(APPEND ARCH_FLAGS "${MARCH_FLAGS}") + list(APPEND ARCH_FLAGS "${MARCH_FLAGS}") + endif () endif () - endif () - else() - check_cxx_compiler_flag(-mfp16-format=ieee COMPILER_SUPPORTS_FP16_FORMAT_I3E) - if (NOT "${COMPILER_SUPPORTS_FP16_FORMAT_I3E}" STREQUAL "") - list(APPEND ARCH_FLAGS -mfp16-format=ieee) - endif() - if (${CMAKE_SYSTEM_PROCESSOR} MATCHES "armv6") - # Raspberry Pi 1, Zero - list(APPEND ARCH_FLAGS -mfpu=neon-fp-armv8 -mno-unaligned-access) - endif() - if (${CMAKE_SYSTEM_PROCESSOR} MATCHES "armv7") - if ("${CMAKE_SYSTEM_NAME}" STREQUAL "Android") - # Android armeabi-v7a - list(APPEND ARCH_FLAGS -mfpu=neon-vfpv4 -mno-unaligned-access -funsafe-math-optimizations) - else() - # Raspberry Pi 2 - list(APPEND ARCH_FLAGS -mfpu=neon-fp-armv8 -mno-unaligned-access -funsafe-math-optimizations) + else() + check_cxx_compiler_flag(-mfp16-format=ieee COMPILER_SUPPORTS_FP16_FORMAT_I3E) + if (NOT "${COMPILER_SUPPORTS_FP16_FORMAT_I3E}" STREQUAL "") + list(APPEND ARCH_FLAGS -mfp16-format=ieee) endif() - endif() - if (${CMAKE_SYSTEM_PROCESSOR} MATCHES "armv8") - # Android arm64-v8a - # Raspberry Pi 3, 4, Zero 2 (32-bit) - list(APPEND ARCH_FLAGS -mno-unaligned-access) - endif() - if (GGML_SVE) - list(APPEND ARCH_FLAGS -march=armv8.6-a+sve) - endif() - endif() -elseif (CMAKE_OSX_ARCHITECTURES STREQUAL "x86_64" OR CMAKE_GENERATOR_PLATFORM_LWR MATCHES "^(x86_64|i686|amd64|x64|win32)$" OR - (NOT CMAKE_OSX_ARCHITECTURES AND NOT CMAKE_GENERATOR_PLATFORM_LWR AND - CMAKE_SYSTEM_PROCESSOR MATCHES "^(x86_64|i686|AMD64)$")) - message(STATUS "x86 detected") - if (MSVC) - # instruction set detection for MSVC only - if (GGML_NATIVE) - include(cmake/FindSIMD.cmake) - endif () - if (GGML_AVX512) - list(APPEND ARCH_FLAGS /arch:AVX512) - # MSVC has no compile-time flags enabling specific - # AVX512 extensions, neither it defines the - # macros corresponding to the extensions. - # Do it manually. - if (GGML_AVX512_VBMI) - list(APPEND ARCH_DEFINITIONS __AVX512VBMI__) - if (CMAKE_C_COMPILER_ID STREQUAL "Clang") - list(APPEND ARCH_FLAGS -mavx512vbmi) - endif() + if (${CMAKE_SYSTEM_PROCESSOR} MATCHES "armv6") + # Raspberry Pi 1, Zero + list(APPEND ARCH_FLAGS -mfpu=neon-fp-armv8 -mno-unaligned-access) endif() - if (GGML_AVX512_VNNI) - list(APPEND ARCH_DEFINITIONS __AVX512VNNI__) - if (CMAKE_C_COMPILER_ID STREQUAL "Clang") - list(APPEND ARCH_FLAGS -mavx512vnni) + if (${CMAKE_SYSTEM_PROCESSOR} MATCHES "armv7") + if ("${CMAKE_SYSTEM_NAME}" STREQUAL "Android") + # Android armeabi-v7a + list(APPEND ARCH_FLAGS -mfpu=neon-vfpv4 -mno-unaligned-access -funsafe-math-optimizations) + else() + # Raspberry Pi 2 + list(APPEND ARCH_FLAGS -mfpu=neon-fp-armv8 -mno-unaligned-access -funsafe-math-optimizations) endif() endif() - if (GGML_AVX512_BF16) - list(APPEND ARCH_DEFINITIONS __AVX512BF16__) - if (CMAKE_C_COMPILER_ID STREQUAL "Clang") - list(APPEND ARCH_FLAGS -mavx512bf16) - endif() + if (${CMAKE_SYSTEM_PROCESSOR} MATCHES "armv8") + # Android arm64-v8a + # Raspberry Pi 3, 4, Zero 2 (32-bit) + list(APPEND ARCH_FLAGS -mno-unaligned-access) endif() - if (GGML_AMX_TILE) - list(APPEND ARCH_DEFINITIONS __AMX_TILE__) + if (GGML_SVE) + list(APPEND ARCH_FLAGS -march=armv8.6-a+sve) endif() - if (GGML_AMX_INT8) - list(APPEND ARCH_DEFINITIONS __AMX_INT8__) + endif() + elseif (CMAKE_OSX_ARCHITECTURES STREQUAL "x86_64" OR CMAKE_GENERATOR_PLATFORM_LWR MATCHES "^(x86_64|i686|amd64|x64|win32)$" OR + (NOT CMAKE_OSX_ARCHITECTURES AND NOT CMAKE_GENERATOR_PLATFORM_LWR AND + CMAKE_SYSTEM_PROCESSOR MATCHES "^(x86_64|i686|AMD64)$")) + if (MSVC) + # instruction set detection for MSVC only + if (GGML_NATIVE) + include(ggml-cpu/cmake/FindSIMD.cmake) + endif () + if (GGML_AVX512) + list(APPEND ARCH_FLAGS /arch:AVX512) + # /arch:AVX512 includes: __AVX512F__, __AVX512CD__, __AVX512BW__, __AVX512DQ__, and __AVX512VL__ + # MSVC has no compile-time flags enabling specific + # AVX512 extensions, neither it defines the + # macros corresponding to the extensions. + # Do it manually. + list(APPEND ARCH_DEFINITIONS GGML_AVX512) + if (GGML_AVX512_VBMI) + list(APPEND ARCH_DEFINITIONS __AVX512VBMI__) + if (CMAKE_C_COMPILER_ID STREQUAL "Clang") + list(APPEND ARCH_FLAGS -mavx512vbmi) + endif() + endif() + if (GGML_AVX512_VNNI) + list(APPEND ARCH_DEFINITIONS __AVX512VNNI__ GGML_AVX512_VNNI) + if (CMAKE_C_COMPILER_ID STREQUAL "Clang") + list(APPEND ARCH_FLAGS -mavx512vnni) + endif() + endif() + if (GGML_AVX512_BF16) + list(APPEND ARCH_DEFINITIONS __AVX512BF16__ GGML_AVX512_BF16) + if (CMAKE_C_COMPILER_ID STREQUAL "Clang") + list(APPEND ARCH_FLAGS -mavx512bf16) + endif() + endif() + if (GGML_AMX_TILE) + list(APPEND ARCH_DEFINITIONS __AMX_TILE__ GGML_AMX_TILE) + endif() + if (GGML_AMX_INT8) + list(APPEND ARCH_DEFINITIONS __AMX_INT8__ GGML_AMX_INT8) + endif() + if (GGML_AMX_BF16) + list(APPEND ARCH_DEFINITIONS __AMX_BF16__ GGML_AMX_BF16) + endif() + elseif (GGML_AVX2) + list(APPEND ARCH_FLAGS /arch:AVX2) + list(APPEND ARCH_DEFINITIONS GGML_AVX2 GGML_FMA GGML_F16C) + elseif (GGML_AVX) + list(APPEND ARCH_FLAGS /arch:AVX) + list(APPEND ARCH_DEFINITIONS GGML_AVX) + else () + list(APPEND ARCH_FLAGS /arch:SSE4.2) + list(APPEND ARCH_DEFINITIONS GGML_SSE42) endif() - if (GGML_AMX_BF16) - list(APPEND ARCH_DEFINITIONS __AMX_BF16__) + if (GGML_AVX_VNNI) + # MSVC generates AVX512 with AVX-VNNI intrinsics even with /arch:AVX2 + #list(APPEND ARCH_DEFINITIONS __AVXVNNI__ GGML_AVX_VNNI) endif() - elseif (GGML_AVX2) - list(APPEND ARCH_FLAGS /arch:AVX2) - elseif (GGML_AVX) - list(APPEND ARCH_FLAGS /arch:AVX) - endif() - if (GGML_AVX_VNNI) - list(APPEND ARCH_DEFINITIONS __AVXVNNI__) - if (CMAKE_C_COMPILER_ID STREQUAL "Clang") - list(APPEND ARCH_FLAGS -mavxvnni) + else () + if (GGML_NATIVE) + list(APPEND ARCH_FLAGS -march=native) + else () + list(APPEND ARCH_FLAGS -msse4.2) + list(APPEND ARCH_DEFINITIONS GGML_SSE42) + if (GGML_F16C) + list(APPEND ARCH_FLAGS -mf16c) + list(APPEND ARCH_DEFINITIONS GGML_F16C) + endif() + if (GGML_FMA) + list(APPEND ARCH_FLAGS -mfma) + list(APPEND ARCH_DEFINITIONS GGML_FMA) + endif() + if (GGML_AVX) + list(APPEND ARCH_FLAGS -mavx) + list(APPEND ARCH_DEFINITIONS GGML_AVX) + endif() + if (GGML_AVX2) + list(APPEND ARCH_FLAGS -mavx2) + list(APPEND ARCH_DEFINITIONS GGML_AVX2) + endif() + if (GGML_AVX_VNNI) + list(APPEND ARCH_FLAGS -mavxvnni) + list(APPEND ARCH_DEFINITIONS GGML_AVX_VNNI) + endif() + if (GGML_AVX512) + list(APPEND ARCH_FLAGS -mavx512f) + list(APPEND ARCH_FLAGS -mavx512cd) + list(APPEND ARCH_FLAGS -mavx512vl) + list(APPEND ARCH_FLAGS -mavx512dq) + list(APPEND ARCH_FLAGS -mavx512bw) + list(APPEND ARCH_DEFINITIONS GGML_AVX512) + endif() + if (GGML_AVX512_VBMI) + list(APPEND ARCH_FLAGS -mavx512vbmi) + list(APPEND ARCH_DEFINITIONS GGML_AVX512_VBMI) + endif() + if (GGML_AVX512_VNNI) + list(APPEND ARCH_FLAGS -mavx512vnni) + list(APPEND ARCH_DEFINITIONS GGML_AVX512_VNNI) + endif() + if (GGML_AVX512_BF16) + list(APPEND ARCH_FLAGS -mavx512bf16) + list(APPEND ARCH_DEFINITIONS GGML_AVX512_BF16) + endif() + if (GGML_AMX_TILE) + list(APPEND ARCH_FLAGS -mamx-tile) + list(APPEND ARCH_DEFINITIONS GGML_AMX_TILE) + endif() + if (GGML_AMX_INT8) + list(APPEND ARCH_FLAGS -mamx-int8) + list(APPEND ARCH_DEFINITIONS GGML_AMX_INT8) + endif() + if (GGML_AMX_BF16) + list(APPEND ARCH_FLAGS -mamx-bf16) + list(APPEND ARCH_DEFINITIONS GGML_AMX_BF16) + endif() endif() endif() - else() - if (GGML_NATIVE) - list(APPEND ARCH_FLAGS -march=native) - endif() - if (GGML_F16C) - list(APPEND ARCH_FLAGS -mf16c) - endif() - if (GGML_FMA) - list(APPEND ARCH_FLAGS -mfma) - endif() - if (GGML_AVX) - list(APPEND ARCH_FLAGS -mavx) - endif() - if (GGML_AVX2) - list(APPEND ARCH_FLAGS -mavx2) - endif() - if (GGML_AVX_VNNI) - list(APPEND ARCH_FLAGS -mavxvnni) - endif() - if (GGML_AVX512) - list(APPEND ARCH_FLAGS -mavx512f) - list(APPEND ARCH_FLAGS -mavx512dq) - list(APPEND ARCH_FLAGS -mavx512bw) - endif() - if (GGML_AVX512_VBMI) - list(APPEND ARCH_FLAGS -mavx512vbmi) + elseif (${CMAKE_SYSTEM_PROCESSOR} MATCHES "ppc64") + message(STATUS "PowerPC detected") + execute_process(COMMAND bash -c "grep POWER10 /proc/cpuinfo | head -n 1" OUTPUT_VARIABLE POWER10_M) + string(FIND "${POWER10_M}" "POWER10" substring_index) + if (NOT DEFINED substring_index OR "${substring_index}" STREQUAL "") + set(substring_index -1) endif() - if (GGML_AVX512_VNNI) - list(APPEND ARCH_FLAGS -mavx512vnni) - endif() - if (GGML_AVX512_BF16) - list(APPEND ARCH_FLAGS -mavx512bf16) + + if (${substring_index} GREATER_EQUAL 0) + list(APPEND ARCH_FLAGS -mcpu=power10) + elseif (${CMAKE_SYSTEM_PROCESSOR} MATCHES "ppc64le") + list(APPEND ARCH_FLAGS -mcpu=powerpc64le) + else() + list(APPEND ARCH_FLAGS -mcpu=native -mtune=native) + # TODO: Add targets for Power8/Power9 (Altivec/VSX) and Power10(MMA) and query for big endian systems (ppc64/le/be) endif() - if (GGML_AMX_TILE) - list(APPEND ARCH_FLAGS -mamx-tile) + elseif (${CMAKE_SYSTEM_PROCESSOR} MATCHES "loongarch64") + message(STATUS "loongarch64 detected") + + list(APPEND ARCH_FLAGS -march=loongarch64) + if (GGML_LASX) + list(APPEND ARCH_FLAGS -mlasx) endif() - if (GGML_AMX_INT8) - list(APPEND ARCH_FLAGS -mamx-int8) + if (GGML_LSX) + list(APPEND ARCH_FLAGS -mlsx) endif() - if (GGML_AMX_BF16) - list(APPEND ARCH_FLAGS -mamx-bf16) + elseif (${CMAKE_SYSTEM_PROCESSOR} MATCHES "riscv64") + message(STATUS "RISC-V detected") + if (GGML_RVV) + list(APPEND ARCH_FLAGS -march=rv64gcv -mabi=lp64d) endif() - endif() -elseif (${CMAKE_SYSTEM_PROCESSOR} MATCHES "ppc64") - message(STATUS "PowerPC detected") - execute_process(COMMAND bash -c "grep POWER10 /proc/cpuinfo | head -n 1" OUTPUT_VARIABLE POWER10_M) - string(FIND "${POWER10_M}" "POWER10" substring_index) - if (NOT DEFINED substring_index OR "${substring_index}" STREQUAL "") - set(substring_index -1) - endif() - - if (${substring_index} GREATER_EQUAL 0) - list(APPEND ARCH_FLAGS -mcpu=power10) - elseif (${CMAKE_SYSTEM_PROCESSOR} MATCHES "ppc64le") - list(APPEND ARCH_FLAGS -mcpu=powerpc64le) else() - list(APPEND ARCH_FLAGS -mcpu=native -mtune=native) - # TODO: Add targets for Power8/Power9 (Altivec/VSX) and Power10(MMA) and query for big endian systems (ppc64/le/be) + message(STATUS "Unknown architecture") endif() -elseif (${CMAKE_SYSTEM_PROCESSOR} MATCHES "loongarch64") - message(STATUS "loongarch64 detected") - list(APPEND ARCH_FLAGS -march=loongarch64) - if (GGML_LASX) - list(APPEND ARCH_FLAGS -mlasx) + if (GGML_CPU_AARCH64) + target_compile_definitions(${GGML_CPU_NAME} PRIVATE GGML_USE_CPU_AARCH64) endif() - if (GGML_LSX) - list(APPEND ARCH_FLAGS -mlsx) + + message(STATUS "Adding CPU backend variant ${GGML_CPU_NAME}: ${ARCH_FLAGS} ${ARCH_DEFINITIONS}") + target_sources(${GGML_CPU_NAME} PRIVATE ${GGML_CPU_SOURCES}) + target_compile_options(${GGML_CPU_NAME} PRIVATE ${ARCH_FLAGS}) + target_compile_definitions(${GGML_CPU_NAME} PRIVATE ${ARCH_DEFINITIONS}) + + if (GGML_BACKEND_DL) + # The feature detection code is compiled as a separate target so that + # it can be built without the architecture flags + # Since multiple variants of the CPU backend may be included in the same + # build, using set_source_files_properties() to set the arch flags is not possible + set(GGML_CPU_FEATS_NAME ${GGML_CPU_NAME}-feats) + add_library(${GGML_CPU_FEATS_NAME} OBJECT ggml-cpu/cpu-feats-x86.cpp) + target_include_directories(${GGML_CPU_FEATS_NAME} PRIVATE . .. ../include) + target_compile_definitions(${GGML_CPU_FEATS_NAME} PRIVATE ${ARCH_DEFINITIONS}) + target_compile_definitions(${GGML_CPU_FEATS_NAME} PRIVATE GGML_BACKEND_DL GGML_BACKEND_BUILD GGML_BACKEND_SHARED) + set_target_properties(${GGML_CPU_FEATS_NAME} PROPERTIES POSITION_INDEPENDENT_CODE ON) + target_link_libraries(${GGML_CPU_NAME} PRIVATE ${GGML_CPU_FEATS_NAME}) endif() -elseif (${CMAKE_SYSTEM_PROCESSOR} MATCHES "riscv64") - message(STATUS "RISC-V detected") - if (GGML_RVV) - list(APPEND ARCH_FLAGS -march=rv64gcv -mabi=lp64d) + + if (EMSCRIPTEN) + set_target_properties(${GGML_CPU_NAME} PROPERTIES COMPILE_FLAGS "-msimd128") endif() -else() - message(STATUS "Unknown architecture") -endif() - -if (GGML_CPU_AARCH64) - message(STATUS "Using runtime weight conversion of Q4_0 to Q4_0_x_x to enable optimized GEMM/GEMV kernels") - target_compile_definitions(ggml-cpu PRIVATE GGML_USE_CPU_AARCH64) -endif() - -target_sources(ggml-cpu PRIVATE ${GGML_CPU_SOURCES}) -set_source_files_properties(${GGML_CPU_SOURCES} PROPERTIES COMPILE_OPTIONS "${ARCH_FLAGS}") -set_source_files_properties(${GGML_CPU_SOURCES} PROPERTIES COMPILE_DEFINITIONS "${ARCH_DEFINITIONS}") - -# the feature detection code must be compiled without any architecture flags -target_sources(ggml-cpu PRIVATE cpu-feats-x86.cpp) -# target_sources(ggml-cpu PRIVATE cpu-feats-arm.cpp) # TODO: ARM feature detection - -if (EMSCRIPTEN) - set_target_properties(ggml-cpu PROPERTIES COMPILE_FLAGS "-msimd128") -endif() +endfunction() diff --git a/ggml/src/ggml-cpu/cpu-feats-x86.cpp b/ggml/src/ggml-cpu/cpu-feats-x86.cpp index 514701ffe8024..e8133d411fd14 100644 --- a/ggml/src/ggml-cpu/cpu-feats-x86.cpp +++ b/ggml/src/ggml-cpu/cpu-feats-x86.cpp @@ -1,4 +1,3 @@ -#include "ggml-cpu.h" #include "ggml-backend-impl.h" #if defined(__x86_64__) || (defined(_MSC_VER) && defined(_M_AMD64)) @@ -13,6 +12,7 @@ #include #include +// ref: https://cdrdv2-public.intel.com/782156/325383-sdm-vol-2abcd.pdf struct cpuid_x86 { bool SSE3(void) { return f_1_ecx[0]; } bool PCLMULQDQ(void) { return f_1_ecx[1]; } @@ -50,11 +50,15 @@ struct cpuid_x86 { bool INVPCID(void) { return f_7_ebx[10]; } bool RTM(void) { return is_intel && f_7_ebx[11]; } bool AVX512F(void) { return f_7_ebx[16]; } + bool AVX512DQ(void) { return f_7_ebx[17]; } bool RDSEED(void) { return f_7_ebx[18]; } bool ADX(void) { return f_7_ebx[19]; } bool AVX512PF(void) { return f_7_ebx[26]; } bool AVX512ER(void) { return f_7_ebx[27]; } bool AVX512CD(void) { return f_7_ebx[28]; } + bool AVX512BW(void) { return f_7_ebx[30]; } + bool AVX512VL(void) { return f_7_ebx[31]; } + bool SHA(void) { return f_7_ebx[29]; } bool PREFETCHWT1(void) { return f_7_ecx[0]; } @@ -259,36 +263,57 @@ void test_x86_is() { static int ggml_backend_cpu_x86_score() { // FIXME: this does not check for OS support + int score = 0; cpuid_x86 is; - // if the CPU backend was built with any features not supported by the current CPU, it cannot be used - if (ggml_cpu_has_fma() && !is.FMA()) { return 0; } - if (ggml_cpu_has_f16c() && !is.F16C()) { return 0; } - if (ggml_cpu_has_ssse3() && !is.SSSE3()) { return 0; } - if (ggml_cpu_has_sse3() && !is.SSE3()) { return 0; } - if (ggml_cpu_has_avx() && !is.AVX()) { return 0; } - if (ggml_cpu_has_avx_vnni() && !is.AVX_VNNI()) { return 0; } - if (ggml_cpu_has_avx2() && !is.AVX2()) { return 0; } - if (ggml_cpu_has_avx512() && !is.AVX512F()) { return 0; } - if (ggml_cpu_has_avx512_vbmi() && !is.AVX512_VBMI()) { return 0; } - if (ggml_cpu_has_avx512_bf16() && !is.AVX512_BF16()) { return 0; } - if (ggml_cpu_has_avx512_vnni() && !is.AVX512_VNNI()) { return 0; } - if (ggml_cpu_has_amx_int8() && !is.AMX_INT8()) { return 0; } - // calculate a backend score based on the supported features - // more important features have a higher weight - int score = 0; - score += ggml_cpu_has_fma () * 1; - score += ggml_cpu_has_f16c () * 1<<1; - score += ggml_cpu_has_ssse3 () * 1<<2; - score += ggml_cpu_has_sse3 () * 1<<3; - score += ggml_cpu_has_avx_vnni () * 1<<4; - score += ggml_cpu_has_avx () * 1<<5; - score += ggml_cpu_has_avx2 () * 1<<6; - score += ggml_cpu_has_avx512 () * 1<<7; - // score += ggml_cpu_has_avx512_vbmi() * 1<<8; // not used - score += ggml_cpu_has_avx512_bf16() * 1<<9; - score += ggml_cpu_has_avx512_vnni() * 1<<10; - score += ggml_cpu_has_amx_int8 () * 1<<11; +#ifdef GGML_FMA + if (!is.FMA()) { return 0; } + score += 1; +#endif +#ifdef GGML_F16C + if (!is.F16C()) { return 0; } + score += 1<<1; +#endif +#ifdef GGML_SSE42 + if (!is.SSE42()) { return 0; } + score += 1<<2; +#endif +#ifdef GGML_AVX + if (!is.AVX()) { return 0; } + score += 1<<4; +#endif +#ifdef GGML_AVX2 + if (!is.AVX2()) { return 0; } + score += 1<<5; +#endif +#ifdef GGML_AVX_VNNI + if (!is.AVX_VNNI()) { return 0; } + score += 1<<6; +#endif +#ifdef GGML_AVX512 + if (!is.AVX512F()) { return 0; } + if (!is.AVX512CD()) { return 0; } + if (!is.AVX512VL()) { return 0; } + if (!is.AVX512DQ()) { return 0; } + if (!is.AVX512BW()) { return 0; } + score += 1<<7; +#endif +#ifdef GGML_AVX512_VBMI + if (!is.AVX512_VBMI()) { return 0; } + score += 1<<8; +#endif +#ifdef GGML_AVX512_BF16 + if (!is.AVX512_BF16()) { return 0; } + score += 1<<9; +#endif +#ifdef GGML_AVX512_VNNI + if (!is.AVX512_VNNI()) { return 0; } + score += 1<<10; +#endif +#ifdef GGML_AMX_INT8 + if (!is.AMX_INT8()) { return 0; } + score += 1<<11; +#endif return score; } diff --git a/ggml/src/ggml-cpu/ggml-cpu.c b/ggml/src/ggml-cpu/ggml-cpu.c index e4a9ca013eb0e..40ca7bb6873ef 100644 --- a/ggml/src/ggml-cpu/ggml-cpu.c +++ b/ggml/src/ggml-cpu/ggml-cpu.c @@ -756,7 +756,7 @@ do { \ #define GGML_F32Cx8_LOAD(x) _mm256_cvtph_ps(_mm_loadu_si128((const __m128i *)(x))) #define GGML_F32Cx8_STORE(x, y) _mm_storeu_si128((__m128i *)(x), _mm256_cvtps_ph(y, 0)) #else -static inline __m256 __avx_f32cx8_load(ggml_fp16_t *x) { +static inline __m256 __avx_f32cx8_load(const ggml_fp16_t * x) { float tmp[8]; for (int i = 0; i < 8; i++) { diff --git a/ggml/src/ggml-cpu/ggml-cpu.cpp b/ggml/src/ggml-cpu/ggml-cpu.cpp index 77e5d87a887ec..d3b4bdb965092 100644 --- a/ggml/src/ggml-cpu/ggml-cpu.cpp +++ b/ggml/src/ggml-cpu/ggml-cpu.cpp @@ -641,7 +641,15 @@ static ggml_backend_feature * ggml_backend_cpu_get_features(ggml_backend_reg_t r if (ggml_cpu_has_llamafile()) { features.push_back({ "LLAMAFILE", "1" }); } - // TODO: rename this + #ifdef GGML_USE_ACCELERATE + features.push_back({ "ACCELERATE", "1" }); + #endif + #ifdef GGML_USE_CPU_HBM + features.push_back({ "CPU_HBM", "1" }); + #endif + #ifdef GGML_USE_OPENMP + features.push_back({ "OPENMP", "1" }); + #endif #ifdef GGML_USE_CPU_AARCH64 features.push_back({ "AARCH64_REPACK", "1" }); #endif diff --git a/scripts/build-cpu.sh b/scripts/build-cpu.sh deleted file mode 100755 index 4b2ad816e4cf6..0000000000000 --- a/scripts/build-cpu.sh +++ /dev/null @@ -1,12 +0,0 @@ -#!/bin/bash - -name="$1" -args="${@:2}" - -echo "Building $name with args: $args" - -rm -fr build-cpu-$1 -cmake -S . -B build-cpu-$1 -DGGML_BACKEND_DL=ON -DGGML_NATIVE=OFF $args -cmake --build build-cpu-$1 --config Release -t ggml-cpu -j $(nproc) -cp build-cpu-$1/bin/libggml-cpu.so ./libggml-cpu-$1.so -rm -fr build-cpu-$1 From b8b403f10d23bf74a4c9bffee02ef2fab289d93e Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Wed, 4 Dec 2024 22:38:20 +0200 Subject: [PATCH 144/245] server : fix speculative decoding with context shift (#10641) * server : fix speculative decoding with context shift ggml-ci * server : take into account speculative limits ggml-ci * server : add tests --- examples/server/server.cpp | 29 +++++++++++++++-- .../server/tests/unit/test_speculative.py | 31 +++++++++++++++++++ 2 files changed, 58 insertions(+), 2 deletions(-) diff --git a/examples/server/server.cpp b/examples/server/server.cpp index 9bca3f30e7574..31dfd62408047 100644 --- a/examples/server/server.cpp +++ b/examples/server/server.cpp @@ -921,6 +921,8 @@ struct server_context { slot.params.speculative.p_min = json_value(data, "speculative.p_min", defaults.speculative.p_min); slot.params.speculative.n_min = std::min(slot.params.speculative.n_max, slot.params.speculative.n_min); + slot.params.speculative.n_min = std::max(slot.params.speculative.n_min, 2); + slot.params.speculative.n_max = std::max(slot.params.speculative.n_max, 0); if (slot.params.sampling.dry_base < 1.0f) { slot.params.sampling.dry_base = defaults.sampling.dry_base; @@ -2322,10 +2324,29 @@ struct server_context { continue; } + // determine the max draft that fits the current slot state + int n_draft_max = slot.params.speculative.n_max; + + // note: n_past is not yet increased for the `id` token sampled above + // also, need to leave space for 1 extra token to allow context shifts + n_draft_max = std::min(n_draft_max, slot.n_ctx - slot.n_past - 2); + + if (slot.n_remaining > 0) { + n_draft_max = std::min(n_draft_max, slot.n_remaining - 1); + } + + SLT_DBG(slot, "max possible draft: %d\n", n_draft_max); + + if (n_draft_max < slot.params.speculative.n_min) { + SLT_DBG(slot, "the max possible draft is too small: %d < %d - skipping speculative decoding\n", n_draft_max, slot.params.speculative.n_min); + + continue; + } + llama_token id = slot.sampled; struct common_speculative_params params_spec; - params_spec.n_draft = slot.params.speculative.n_max; + params_spec.n_draft = n_draft_max; params_spec.n_reuse = llama_n_ctx(slot.ctx_dft) - slot.params.speculative.n_max; params_spec.p_min = slot.params.speculative.p_min; @@ -2333,6 +2354,8 @@ struct server_context { // ignore small drafts if (slot.params.speculative.n_min > (int) draft.size()) { + SLT_DBG(slot, "ignoring small draft: %d < %d\n", (int) draft.size(), slot.params.speculative.n_min); + continue; } @@ -2344,6 +2367,8 @@ struct server_context { common_batch_add(slot.batch_spec, draft[i], slot.n_past + 1 + i, { slot.id }, true); } + SLT_DBG(slot, "decoding speculative batch, size = %d\n", slot.batch_spec.n_tokens); + llama_decode(ctx, slot.batch_spec); // the accepted tokens from the speculation @@ -2372,7 +2397,7 @@ struct server_context { } } - SRV_DBG("accepted %d/%d draft tokens\n", (int) ids.size() - 1, (int) draft.size()); + SLT_DBG(slot, "accepted %d/%d draft tokens, new n_past = %d\n", (int) ids.size() - 1, (int) draft.size(), slot.n_past); } } diff --git a/examples/server/tests/unit/test_speculative.py b/examples/server/tests/unit/test_speculative.py index 982d6abb45f5f..3bb5733cbdf48 100644 --- a/examples/server/tests/unit/test_speculative.py +++ b/examples/server/tests/unit/test_speculative.py @@ -82,6 +82,37 @@ def test_different_draft_min_draft_max(): last_content = res.body["content"] +def test_slot_ctx_not_exceeded(): + global server + server.n_ctx = 64 + server.start() + res = server.make_request("POST", "/completion", data={ + "prompt": "Hello " * 56, + "temperature": 0.0, + "top_k": 1, + "speculative.p_min": 0.0, + }) + assert res.status_code == 200 + assert len(res.body["content"]) > 0 + + +def test_with_ctx_shift(): + global server + server.n_ctx = 64 + server.start() + res = server.make_request("POST", "/completion", data={ + "prompt": "Hello " * 56, + "temperature": 0.0, + "top_k": 1, + "n_predict": 64, + "speculative.p_min": 0.0, + }) + assert res.status_code == 200 + assert len(res.body["content"]) > 0 + assert res.body["tokens_predicted"] == 64 + assert res.body["truncated"] == True + + @pytest.mark.parametrize("n_slots,n_requests", [ (1, 2), (2, 2), From 94bb316714e67c75a1cbe4b9ba76f6d3b9ddfe8b Mon Sep 17 00:00:00 2001 From: aryantandon01 <80969509+aryantandon01@users.noreply.github.com> Date: Thu, 5 Dec 2024 03:49:20 +0530 Subject: [PATCH 145/245] Update deprecation-warning.cpp (#10619) Fixed Path Separator Handling for Cross-Platform Support (Windows File Systems) --- examples/deprecation-warning/deprecation-warning.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/examples/deprecation-warning/deprecation-warning.cpp b/examples/deprecation-warning/deprecation-warning.cpp index 11b35d2c22500..c2958ea12d92d 100644 --- a/examples/deprecation-warning/deprecation-warning.cpp +++ b/examples/deprecation-warning/deprecation-warning.cpp @@ -12,7 +12,7 @@ int main(int argc, char** argv) { } // Get only the program name from the full path - auto pos = filename.find_last_of('/'); + auto pos = filename.find_last_of("/\\"); if (pos != std::string::npos) { filename = filename.substr(pos+1); } From 73d3d8cb1aa5ec4a722dfbbaff92834ac703d8f7 Mon Sep 17 00:00:00 2001 From: Daniel Bevenius Date: Thu, 5 Dec 2024 08:47:55 +0100 Subject: [PATCH 146/245] py : update outdated copy-paste instructions [no ci] (#10667) This commit updates the copy-paste instruction in convert_hf_to_gguf_update.py to reflect that convert_hf_to_gguf.py will have already been updated with the new get_vocab_base_pre() function when this script completes. --- convert_hf_to_gguf_update.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/convert_hf_to_gguf_update.py b/convert_hf_to_gguf_update.py index 28cd02e5a7f66..2a51fce2d8db0 100755 --- a/convert_hf_to_gguf_update.py +++ b/convert_hf_to_gguf_update.py @@ -17,7 +17,7 @@ # # python3 convert_hf_to_gguf_update.py # -# - Copy-paste the generated get_vocab_base_pre() function into convert_hf_to_gguf.py +# - The convert_hf_to_gguf.py script will have had its get_vocab_base_pre() function updated # - Update llama.cpp with the new pre-tokenizer if necessary # # TODO: generate tokenizer tests for llama.cpp From 63df14abc07000e34d16ec65539b4455bf14eb9c Mon Sep 17 00:00:00 2001 From: PAB Date: Tue, 3 Dec 2024 20:20:04 +0100 Subject: [PATCH 147/245] ggml : add `GGML_PAD_REFLECT_1D` operation (ggml/1034) * ggml_pad_reflect_1d defined in header * implemented on CPU * called the forward pass * impl Metal kernel * added Metal kernel * added OP_PAD_REFLECT_1D in test-backend-ops.cpp * add test-pad-reflect-1d test case * test case support multiple backend --- ggml/include/ggml.h | 8 +++++ ggml/src/ggml-cpu/ggml-cpu.c | 39 +++++++++++++++++++++++ ggml/src/ggml-metal/ggml-metal.m | 35 +++++++++++++++++++++ ggml/src/ggml-metal/ggml-metal.metal | 47 ++++++++++++++++++++++++++++ ggml/src/ggml.c | 37 ++++++++++++++++++++-- tests/test-backend-ops.cpp | 28 +++++++++++++++++ 6 files changed, 192 insertions(+), 2 deletions(-) diff --git a/ggml/include/ggml.h b/ggml/include/ggml.h index 65cb92c444bb7..1c8cc11b6b24d 100644 --- a/ggml/include/ggml.h +++ b/ggml/include/ggml.h @@ -499,6 +499,7 @@ extern "C" { GGML_OP_POOL_2D_BACK, GGML_OP_UPSCALE, // nearest interpolate GGML_OP_PAD, + GGML_OP_PAD_REFLECT_1D, GGML_OP_ARANGE, GGML_OP_TIMESTEP_EMBEDDING, GGML_OP_ARGSORT, @@ -1695,6 +1696,13 @@ extern "C" { int p2, int p3); + // pad each dimension with reflection: [a, b, c, d] -> [b, a, b, c, d, c] + GGML_API struct ggml_tensor * ggml_pad_reflect_1d( + struct ggml_context * ctx, + struct ggml_tensor * a, + int p0, + int p1); + // Ref: https://github.com/CompVis/stable-diffusion/blob/main/ldm/modules/diffusionmodules/util.py#L151 // timesteps: [N,] // return: [N, dim] diff --git a/ggml/src/ggml-cpu/ggml-cpu.c b/ggml/src/ggml-cpu/ggml-cpu.c index 40ca7bb6873ef..f82e877f05506 100644 --- a/ggml/src/ggml-cpu/ggml-cpu.c +++ b/ggml/src/ggml-cpu/ggml-cpu.c @@ -10439,6 +10439,40 @@ static void ggml_compute_forward_pad( } } +// ggml_compute_forward_pad_reflect_1d + +static void ggml_compute_forward_pad_reflect_1d( + const struct ggml_compute_params * params, + struct ggml_tensor * dst) { + + const struct ggml_tensor * src0 = dst->src[0]; + + GGML_ASSERT(src0->type == GGML_TYPE_F32); + GGML_ASSERT( dst->type == GGML_TYPE_F32); + + const int ith = params->ith; + const int nth = params->nth; + + const int32_t * opts = (const int32_t *) dst->op_params; + const int p0 = opts[0]; + const int p1 = opts[1]; + + GGML_TENSOR_UNARY_OP_LOCALS + + for (int64_t i3 = 0; i3 < ne3; i3++) { + for (int64_t i2 = 0; i2 < ne2; i2++) { + for (int64_t i1 = ith; i1 < ne1; i1 += nth) { + float * left = (float *) ((char *) dst->data + i3*nb3 + i2*nb2 + i1*nb1 + p0*nb0); + float * right = (float *) ((char *) dst->data + i3*nb3 + i2*nb2 + i1*nb1 + (ne0-p1-1)*nb0); + + ggml_vec_cpy_f32(ne00, left, (float *) ((char *) src0->data + i3*nb03 + i2*nb02 + i1*nb01)); + + for (int i0 = 1; i0 <= p0; i0++) { left[-i0] = left[i0]; } + for (int i0 = 1; i0 <= p1; i0++) { right[i0] = right[-i0]; } + } + } + } +} // ggml_compute_forward_arange @@ -12535,6 +12569,10 @@ static void ggml_compute_forward(struct ggml_compute_params * params, struct ggm { ggml_compute_forward_pad(params, tensor); } break; + case GGML_OP_PAD_REFLECT_1D: + { + ggml_compute_forward_pad_reflect_1d(params, tensor); + } break; case GGML_OP_ARANGE: { ggml_compute_forward_arange(params, tensor); @@ -12877,6 +12915,7 @@ static int ggml_get_n_tasks(struct ggml_tensor * node, int n_threads) { } break; case GGML_OP_UPSCALE: case GGML_OP_PAD: + case GGML_OP_PAD_REFLECT_1D: case GGML_OP_ARANGE: case GGML_OP_TIMESTEP_EMBEDDING: case GGML_OP_ARGSORT: diff --git a/ggml/src/ggml-metal/ggml-metal.m b/ggml/src/ggml-metal/ggml-metal.m index 093ae9000ab37..71f5525f38c36 100644 --- a/ggml/src/ggml-metal/ggml-metal.m +++ b/ggml/src/ggml-metal/ggml-metal.m @@ -310,6 +310,7 @@ static void ggml_backend_metal_device_rel(struct ggml_backend_metal_device_conte GGML_METAL_KERNEL_TYPE_CONV_TRANSPOSE_1D_F16_F32, GGML_METAL_KERNEL_TYPE_UPSCALE_F32, GGML_METAL_KERNEL_TYPE_PAD_F32, + GGML_METAL_KERNEL_TYPE_PAD_REFLECT_1D_F32, GGML_METAL_KERNEL_TYPE_ARANGE_F32, GGML_METAL_KERNEL_TYPE_TIMESTEP_EMBEDDING_F32, GGML_METAL_KERNEL_TYPE_ARGSORT_F32_I32_ASC, @@ -877,6 +878,7 @@ @implementation GGMLMetalClass GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_CONV_TRANSPOSE_1D_F16_F32, conv_transpose_1d_f16_f32, true); GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_UPSCALE_F32, upscale_f32, true); GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_PAD_F32, pad_f32, true); + GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_PAD_REFLECT_1D_F32, pad_reflect_1d_f32, true); GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_TIMESTEP_EMBEDDING_F32, timestep_embedding_f32, true); GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_ARANGE_F32, arange_f32, true); GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_ARGSORT_F32_I32_ASC, argsort_f32_i32_asc, true); @@ -1099,6 +1101,7 @@ static bool ggml_metal_supports_op(const struct ggml_backend_metal_device_contex case GGML_OP_POOL_2D: case GGML_OP_UPSCALE: case GGML_OP_PAD: + case GGML_OP_PAD_REFLECT_1D: case GGML_OP_ARANGE: case GGML_OP_TIMESTEP_EMBEDDING: case GGML_OP_ARGSORT: @@ -3258,6 +3261,38 @@ static void ggml_metal_encode_node( const int nth = MIN(1024, ne0); + [encoder dispatchThreadgroups:MTLSizeMake(ne1, ne2, ne3) threadsPerThreadgroup:MTLSizeMake(nth, 1, 1)]; + } break; + case GGML_OP_PAD_REFLECT_1D: + { + GGML_ASSERT(src0->type == GGML_TYPE_F32); + + const int32_t p0 = ((const int32_t *)(dst->op_params))[0]; + const int32_t p1 = ((const int32_t *)(dst->op_params))[1]; + + id pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_PAD_REFLECT_1D_F32].pipeline; + + [encoder setComputePipelineState:pipeline]; + [encoder setBuffer:id_src0 offset:offs_src0 atIndex:0]; + [encoder setBuffer:id_dst offset:offs_dst atIndex:1]; + [encoder setBytes:&ne00 length:sizeof(ne00) atIndex:2]; + [encoder setBytes:&ne01 length:sizeof(ne01) atIndex:3]; + [encoder setBytes:&ne02 length:sizeof(ne02) atIndex:4]; + [encoder setBytes:&ne03 length:sizeof(ne03) atIndex:5]; + [encoder setBytes:&ne0 length:sizeof(ne0) atIndex:6]; + [encoder setBytes:&nb00 length:sizeof(nb00) atIndex:7]; + [encoder setBytes:&nb01 length:sizeof(nb01) atIndex:8]; + [encoder setBytes:&nb02 length:sizeof(nb02) atIndex:9]; + [encoder setBytes:&nb03 length:sizeof(nb03) atIndex:10]; + [encoder setBytes:&nb0 length:sizeof(nb0) atIndex:11]; + [encoder setBytes:&nb1 length:sizeof(nb1) atIndex:12]; + [encoder setBytes:&nb2 length:sizeof(nb2) atIndex:13]; + [encoder setBytes:&nb3 length:sizeof(nb3) atIndex:14]; + [encoder setBytes:&p0 length:sizeof(p0) atIndex:15]; + [encoder setBytes:&p1 length:sizeof(p1) atIndex:16]; + + const int nth = MIN(1024, ne0); + [encoder dispatchThreadgroups:MTLSizeMake(ne1, ne2, ne3) threadsPerThreadgroup:MTLSizeMake(nth, 1, 1)]; } break; case GGML_OP_ARANGE: diff --git a/ggml/src/ggml-metal/ggml-metal.metal b/ggml/src/ggml-metal/ggml-metal.metal index 5caa0846a8b53..ee3e353ab0d79 100644 --- a/ggml/src/ggml-metal/ggml-metal.metal +++ b/ggml/src/ggml-metal/ggml-metal.metal @@ -2897,6 +2897,53 @@ kernel void kernel_pad_f32( } } +kernel void kernel_pad_reflect_1d_f32( + device const char * src0, + device char * dst, + constant int64_t & ne00, + constant int64_t & ne01, + constant int64_t & ne02, + constant int64_t & ne03, + constant int64_t & ne0, + constant uint64_t & nb00, + constant uint64_t & nb01, + constant uint64_t & nb02, + constant uint64_t & nb03, + constant uint64_t & nb0, + constant uint64_t & nb1, + constant uint64_t & nb2, + constant uint64_t & nb3, + constant int32_t & p0, + constant int32_t & p1, + uint3 tgpig[[threadgroup_position_in_grid]], + uint3 tgpg[[threadgroups_per_grid]], + uint3 tpitg[[thread_position_in_threadgroup]], + uint3 ntg[[threads_per_threadgroup]]) { + + const int64_t i3 = tgpig.z; + const int64_t i2 = tgpig.y; + const int64_t i1 = tgpig.x; + + const int64_t i03 = i3; + const int64_t i02 = i2; + const int64_t i01 = i1; + + device const float * src0_ptr = (device const float *) (src0 + i03*nb03 + i02*nb02 + i01*nb01); + device float * dst_ptr = (device float *) (dst + i3*nb3 + i2*nb2 + i1*nb1); + + if (i1 < ne01 && i2 < ne02 && i3 < ne03) { + for (int i0 = tpitg.x; i0 < ne0; i0 += ntg.x) { + if (i0 < p0) { + dst_ptr[i0] = src0_ptr[p0 - i0]; + } else if (i0 < ne0 - p1) { + dst_ptr[i0] = src0_ptr[i0 - p0]; + } else { + dst_ptr[i0] = src0_ptr[(ne0 - p1 - p0) - (p1 + 1 - (ne0 - i0)) - 1]; + } + } + } +} + kernel void kernel_arange_f32( device char * dst, constant int64_t & ne0, diff --git a/ggml/src/ggml.c b/ggml/src/ggml.c index 1abd2d963f4de..ba65160c0eec3 100644 --- a/ggml/src/ggml.c +++ b/ggml/src/ggml.c @@ -950,6 +950,7 @@ static const char * GGML_OP_NAME[GGML_OP_COUNT] = { "POOL_2D_BACK", "UPSCALE", "PAD", + "PAD_REFLECT_1D", "ARANGE", "TIMESTEP_EMBEDDING", "ARGSORT", @@ -983,7 +984,7 @@ static const char * GGML_OP_NAME[GGML_OP_COUNT] = { "OPT_STEP_ADAMW", }; -static_assert(GGML_OP_COUNT == 81, "GGML_OP_COUNT != 81"); +static_assert(GGML_OP_COUNT == 82, "GGML_OP_COUNT != 82"); static const char * GGML_OP_SYMBOL[GGML_OP_COUNT] = { "none", @@ -1045,6 +1046,7 @@ static const char * GGML_OP_SYMBOL[GGML_OP_COUNT] = { "pool_2d_back(x)", "upscale(x)", "pad(x)", + "pad_reflect_1d(x)", "arange(start, stop, step)", "timestep_embedding(timesteps, dim, max_period)", "argsort(x)", @@ -1078,7 +1080,7 @@ static const char * GGML_OP_SYMBOL[GGML_OP_COUNT] = { "adamw(x)", }; -static_assert(GGML_OP_COUNT == 81, "GGML_OP_COUNT != 81"); +static_assert(GGML_OP_COUNT == 82, "GGML_OP_COUNT != 82"); static_assert(GGML_OP_POOL_COUNT == 2, "GGML_OP_POOL_COUNT != 2"); @@ -4097,6 +4099,37 @@ struct ggml_tensor * ggml_pad( return result; } +// ggml_pad_reflect_1d + +struct ggml_tensor * ggml_pad_reflect_1d( + struct ggml_context * ctx, + struct ggml_tensor * a, + int p0, + int p1) { + GGML_ASSERT(p0 >= 0); + GGML_ASSERT(p1 >= 0); + + GGML_ASSERT(p0 < a->ne[0]); // padding length on each size must be less than the + GGML_ASSERT(p1 < a->ne[0]); // existing length of the dimension being padded + + GGML_ASSERT(ggml_is_contiguous(a)); + GGML_ASSERT(a->type == GGML_TYPE_F32); + + struct ggml_tensor * result = ggml_new_tensor_4d(ctx, a->type, + a->ne[0] + p0 + p1, + a->ne[1], + a->ne[2], + a->ne[3]); + + int32_t params[] = { p0, p1 }; + ggml_set_op_params(result, params, sizeof(params)); + + result->op = GGML_OP_PAD_REFLECT_1D; + result->src[0] = a; + + return result; +} + // ggml_arange struct ggml_tensor * ggml_arange( diff --git a/tests/test-backend-ops.cpp b/tests/test-backend-ops.cpp index 807d271c61efd..042eeaaaba5c7 100644 --- a/tests/test-backend-ops.cpp +++ b/tests/test-backend-ops.cpp @@ -2697,6 +2697,33 @@ struct test_pad : public test_case { } }; +// GGML_OP_PAD_REFLECT_1D +struct test_pad_reflect_1d : public test_case { + const ggml_type type; + const std::array ne_a; + const int pad_0; + const int pad_1; + + std::string vars() override { + return VARS_TO_STR4(type, ne_a, pad_0, pad_1); + } + + test_pad_reflect_1d(ggml_type type = GGML_TYPE_F32, + std::array ne_a = {512, 34, 2, 1}, + int pad_0 = 10, int pad_1 = 9) + : type(type), ne_a(ne_a), pad_0(pad_0), pad_1(pad_1) {} + + ggml_tensor * build_graph(ggml_context * ctx) override { + ggml_tensor * a = ggml_new_tensor(ctx, type, 2, ne_a.data()); + ggml_set_name(a, "a"); + + ggml_tensor * out = ggml_pad_reflect_1d(ctx, a, pad_0, pad_1); + ggml_set_name(out, "out"); + + return out; + } +}; + // GGML_OP_ARANGE struct test_arange : public test_case { const ggml_type type; @@ -3816,6 +3843,7 @@ static std::vector> make_test_cases_eval() { test_cases.emplace_back(new test_group_norm(GGML_TYPE_F32, {9, 9, 1280, 1})); test_cases.emplace_back(new test_acc()); test_cases.emplace_back(new test_pad()); + test_cases.emplace_back(new test_pad_reflect_1d()); test_cases.emplace_back(new test_arange()); test_cases.emplace_back(new test_timestep_embedding()); test_cases.emplace_back(new test_leaky_relu()); From 1daf35b05f77b267bc8bf8ea7dfef7d5c8b72ded Mon Sep 17 00:00:00 2001 From: PAB Date: Wed, 4 Dec 2024 09:19:30 +0100 Subject: [PATCH 148/245] ggml: add `GGML_SET` Metal kernel + i32 CPU kernel (ggml/1037) * implemented cpu kernel * add i32 test cases in test-backend-ops * typedef `ggml_metal_kargs_set` * implemented `kernel_set` * memcpy --- ggml/src/ggml-cpu/ggml-cpu.c | 80 ++++++++++++++++++++++++++- ggml/src/ggml-metal/ggml-metal-impl.h | 15 +++++ ggml/src/ggml-metal/ggml-metal.m | 76 +++++++++++++++++++++++++ ggml/src/ggml-metal/ggml-metal.metal | 32 +++++++++++ tests/test-backend-ops.cpp | 4 ++ 5 files changed, 206 insertions(+), 1 deletion(-) diff --git a/ggml/src/ggml-cpu/ggml-cpu.c b/ggml/src/ggml-cpu/ggml-cpu.c index f82e877f05506..f12b62e3bd8b9 100644 --- a/ggml/src/ggml-cpu/ggml-cpu.c +++ b/ggml/src/ggml-cpu/ggml-cpu.c @@ -1374,7 +1374,10 @@ struct ggml_compute_state { inline static void ggml_vec_set_i8(const int n, int8_t * x, const int8_t v) { for (int i = 0; i < n; ++i) x[i] = v; } inline static void ggml_vec_set_i16(const int n, int16_t * x, const int16_t v) { for (int i = 0; i < n; ++i) x[i] = v; } -inline static void ggml_vec_set_i32(const int n, int32_t * x, const int32_t v) { for (int i = 0; i < n; ++i) x[i] = v; } + +inline static void ggml_vec_set_i32(const int n, int32_t * x, const int32_t v) { for (int i = 0; i < n; ++i) x[i] = v; } +inline static void ggml_vec_cpy_i32(const int n, int32_t * y, const int32_t * x) { for (int i = 0; i < n; ++i) y[i] = x[i]; } + inline static void ggml_vec_set_f16(const int n, ggml_fp16_t * x, const int32_t v) { for (int i = 0; i < n; ++i) x[i] = v; } inline static void ggml_vec_set_bf16(const int n, ggml_bf16_t * x, const ggml_bf16_t v) { for (int i = 0; i < n; ++i) x[i] = v; } inline static void ggml_vec_add_f32 (const int n, float * z, const float * x, const float * y) { for (int i = 0; i < n; ++i) z[i] = x[i] + y[i]; } @@ -8248,6 +8251,77 @@ static void ggml_compute_forward_set_f32( } } +static void ggml_compute_forward_set_i32( + const struct ggml_compute_params * params, + struct ggml_tensor * dst) { + + const struct ggml_tensor * src0 = dst->src[0]; + const struct ggml_tensor * src1 = dst->src[1]; + + GGML_ASSERT(ggml_are_same_shape(src0, dst)); + GGML_ASSERT(ggml_is_contiguous(dst) && ggml_is_contiguous(src0)); + + // view src0 and dst with these strides and data offset inbytes during set + // nb0 is implicitly element_size because src0 and dst are contiguous + size_t nb1 = ((int32_t *) dst->op_params)[0]; + size_t nb2 = ((int32_t *) dst->op_params)[1]; + size_t nb3 = ((int32_t *) dst->op_params)[2]; + size_t offset = ((int32_t *) dst->op_params)[3]; + bool inplace = (bool) ((int32_t *) dst->op_params)[4]; + + if (!inplace) { + if (params->ith == 0) { + // memcpy needs to be synchronized across threads to avoid race conditions. + // => do it in INIT phase + memcpy( + ((char *) dst->data), + ((char *) src0->data), + ggml_nbytes(dst)); + } + ggml_barrier(params->threadpool); + } + + const int ith = params->ith; + const int nth = params->nth; + + const int nr = ggml_nrows(src1); + const int nc = src1->ne[0]; + + GGML_TENSOR_LOCALS(int64_t, ne1, src1, ne) + GGML_TENSOR_LOCALS(size_t, nb1, src1, nb) + + // src0 and dst as viewed during set + const size_t nb0 = ggml_element_size(src0); + + const int im0 = (ne10 == 0 ? 0 : ne10-1); + const int im1 = (ne11 == 0 ? 0 : ne11-1); + const int im2 = (ne12 == 0 ? 0 : ne12-1); + const int im3 = (ne13 == 0 ? 0 : ne13-1); + + GGML_ASSERT(offset + im0*nb0 + im1*nb1 + im2*nb2 + im3*nb3 <= ggml_nbytes(dst)); + + GGML_ASSERT(nb10 == sizeof(int32_t)); + + // rows per thread + const int dr = (nr + nth - 1)/nth; + + // row range for this thread + const int ir0 = dr*ith; + const int ir1 = MIN(ir0 + dr, nr); + + for (int ir = ir0; ir < ir1; ++ir) { + // src0 and dst are viewed with shape of src1 and offset + // => same indices + const int i3 = ir/(ne12*ne11); + const int i2 = (ir - i3*ne12*ne11)/ne11; + const int i1 = (ir - i3*ne12*ne11 - i2*ne11); + + ggml_vec_cpy_i32(nc, + (int32_t *) ((char *) dst->data + i3*nb3 + i2*nb2 + i1*nb1 + offset), + (int32_t *) ((char *) src1->data + i3*nb13 + i2*nb12 + i1*nb11)); + } +} + static void ggml_compute_forward_set( const struct ggml_compute_params * params, struct ggml_tensor * dst) { @@ -8259,6 +8333,10 @@ static void ggml_compute_forward_set( { ggml_compute_forward_set_f32(params, dst); } break; + case GGML_TYPE_I32: + { + ggml_compute_forward_set_i32(params, dst); + } break; case GGML_TYPE_F16: case GGML_TYPE_BF16: case GGML_TYPE_Q4_0: diff --git a/ggml/src/ggml-metal/ggml-metal-impl.h b/ggml/src/ggml-metal/ggml-metal-impl.h index e195867192712..e3dc25f1686fb 100644 --- a/ggml/src/ggml-metal/ggml-metal-impl.h +++ b/ggml/src/ggml-metal/ggml-metal-impl.h @@ -102,6 +102,21 @@ typedef struct { uint64_t nb3; } ggml_metal_kargs_cpy; +typedef struct { + int64_t ne10; + int64_t ne11; + int64_t ne12; + uint64_t nb10; + uint64_t nb11; + uint64_t nb12; + uint64_t nb13; + uint64_t nb1; + uint64_t nb2; + uint64_t nb3; + uint64_t offs; + bool inplace; +} ggml_metal_kargs_set; + typedef struct { int32_t ne00; int32_t ne01; diff --git a/ggml/src/ggml-metal/ggml-metal.m b/ggml/src/ggml-metal/ggml-metal.m index 71f5525f38c36..f80fda7a4ca38 100644 --- a/ggml/src/ggml-metal/ggml-metal.m +++ b/ggml/src/ggml-metal/ggml-metal.m @@ -372,6 +372,8 @@ static void ggml_backend_metal_device_rel(struct ggml_backend_metal_device_conte GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_VEC_Q5_0_H256, GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_VEC_Q5_1_H256, GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_VEC_Q8_0_H256, + GGML_METAL_KERNEL_TYPE_SET_I32, + GGML_METAL_KERNEL_TYPE_SET_F32, GGML_METAL_KERNEL_TYPE_CPY_F32_F32, GGML_METAL_KERNEL_TYPE_CPY_F32_F16, GGML_METAL_KERNEL_TYPE_CPY_F32_BF16, @@ -940,6 +942,8 @@ @implementation GGMLMetalClass GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_VEC_Q5_0_H256, flash_attn_ext_vec_q5_0_h256, has_simdgroup_reduction); GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_VEC_Q5_1_H256, flash_attn_ext_vec_q5_1_h256, has_simdgroup_reduction); GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_VEC_Q8_0_H256, flash_attn_ext_vec_q8_0_h256, has_simdgroup_reduction); + GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_SET_F32, set_f32, true); + GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_SET_I32, set_i32, true); GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_CPY_F32_F32, cpy_f32_f32, true); GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_CPY_F32_F16, cpy_f32_f16, true); GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_CPY_F32_BF16, cpy_f32_bf16, use_bfloat); @@ -1159,6 +1163,16 @@ static bool ggml_metal_supports_op(const struct ggml_backend_metal_device_contex return false; }; } + case GGML_OP_SET: + { + switch (op->src[0]->type) { + case GGML_TYPE_F32: + case GGML_TYPE_I32: + return true; + default: + return false; + }; + } case GGML_OP_DIAG_MASK_INF: case GGML_OP_GET_ROWS: { @@ -3824,6 +3838,68 @@ static void ggml_metal_encode_node( [encoder dispatchThreadgroups:MTLSizeMake(ne01, ne02, ne03) threadsPerThreadgroup:MTLSizeMake(nth, 1, 1)]; } break; + case GGML_OP_SET: + { + GGML_ASSERT(ggml_are_same_shape(src0, dst)); + GGML_ASSERT(ggml_is_contiguous(dst) && ggml_is_contiguous(src0)); + + // src0 and dst as viewed during set + const size_t dst_nb0 = ggml_element_size(src0); + + const size_t dst_nb1 = ((int32_t *) dst->op_params)[0]; + const size_t dst_nb2 = ((int32_t *) dst->op_params)[1]; + const size_t dst_nb3 = ((int32_t *) dst->op_params)[2]; + const size_t offset = ((int32_t *) dst->op_params)[3]; + const bool inplace = (bool) ((int32_t *) dst->op_params)[4]; + + if (!inplace) { + memcpy(((char *) dst->data), ((char *) src0->data), ggml_nbytes(dst)); + } + + const int im0 = (ne10 == 0 ? 0 : ne10-1); + const int im1 = (ne11 == 0 ? 0 : ne11-1); + const int im2 = (ne12 == 0 ? 0 : ne12-1); + const int im3 = (ne13 == 0 ? 0 : ne13-1); + + GGML_ASSERT(offset + im0*dst_nb0 + im1*dst_nb1 + im2*dst_nb2 + im3*dst_nb3 <= ggml_nbytes(dst)); + + id pipeline = nil; + + switch (src0t) { + case GGML_TYPE_F32: + GGML_ASSERT(nb10 == sizeof(float)); + pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_SET_F32].pipeline; break; + case GGML_TYPE_I32: + GGML_ASSERT(nb10 == sizeof(int32_t)); + pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_SET_I32].pipeline; break; + default: GGML_ABORT("fatal error"); + } + + ggml_metal_kargs_set args = { + /*.ne10 =*/ ne10, + /*.ne11 =*/ ne11, + /*.ne12 =*/ ne12, + /*.nb10 =*/ nb10, + /*.nb11 =*/ nb11, + /*.nb12 =*/ nb12, + /*.nb13 =*/ nb13, + /*.nb1 =*/ dst_nb1, + /*.nb2 =*/ dst_nb2, + /*.nb3 =*/ dst_nb3, + /*.offs =*/ offset, + /*.inplace =*/ inplace, + }; + + const int nth = MIN((int) pipeline.maxTotalThreadsPerThreadgroup, ne10); + + [encoder setComputePipelineState:pipeline]; + [encoder setBytes:&args length:sizeof(args) atIndex:0]; + [encoder setBuffer:id_src0 offset:offs_src0 atIndex:1]; + [encoder setBuffer:id_src1 offset:offs_src1 atIndex:2]; + [encoder setBuffer:id_dst offset:offs_dst atIndex:3]; + + [encoder dispatchThreadgroups:MTLSizeMake(ne11, ne12, ne13) threadsPerThreadgroup:MTLSizeMake(nth, 1, 1)]; + } break; case GGML_OP_POOL_2D: { GGML_ASSERT(ggml_is_contiguous(src0)); diff --git a/ggml/src/ggml-metal/ggml-metal.metal b/ggml/src/ggml-metal/ggml-metal.metal index ee3e353ab0d79..8ba43904d0c1b 100644 --- a/ggml/src/ggml-metal/ggml-metal.metal +++ b/ggml/src/ggml-metal/ggml-metal.metal @@ -3927,6 +3927,38 @@ template [[host_name("kernel_flash_attn_ext_vec_q8_0_h256")]] kernel flash_attn_ #undef FA_TYPES +template +kernel void kernel_set( + constant ggml_metal_kargs_set & args, + device const char * src0, + device const char * src1, + device char * dst, + uint3 tgpig[[threadgroup_position_in_grid]], + ushort3 tpitg[[thread_position_in_threadgroup]], + ushort3 ntg[[threads_per_threadgroup]]) { + const int i13 = tgpig[2]; + const int i12 = tgpig[1]; + const int i11 = tgpig[0]; + + const int64_t n = i13*args.ne12*args.ne11*args.ne10 + i12*args.ne11*args.ne10 + i11*args.ne10; + + const int64_t i3 = n / (args.ne12*args.ne11*args.ne10); + const int64_t i2 = (n - i3*args.ne12*args.ne11*args.ne10) / (args.ne11*args.ne10); + const int64_t i1 = (n - i3*args.ne12*args.ne11*args.ne10 - i2*args.ne11*args.ne10) / args.ne10; + + device T * dst_data = (device T *) (dst + i3*args.nb3 + i2*args.nb2 + i1*args.nb1 + args.offs); + + for (int64_t i10 = tpitg.x; i10 < args.ne10; i10 += ntg.x) { + device const T * src = (device T *) (src1 + i13*args.nb13 + i12*args.nb12 + i11*args.nb11 + i10*args.nb10); + dst_data[i10] = (T) src[0]; + } +} + +typedef decltype(kernel_set) kernel_set_t; + +template [[host_name("kernel_set_f32")]] kernel kernel_set_t kernel_set; +template [[host_name("kernel_set_i32")]] kernel kernel_set_t kernel_set; + template kernel void kernel_cpy( constant ggml_metal_kargs_cpy & args, diff --git a/tests/test-backend-ops.cpp b/tests/test-backend-ops.cpp index 042eeaaaba5c7..9dd41260a1f19 100644 --- a/tests/test-backend-ops.cpp +++ b/tests/test-backend-ops.cpp @@ -3521,6 +3521,10 @@ static std::vector> make_test_cases_eval() { test_cases.emplace_back(new test_set(GGML_TYPE_F32, GGML_TYPE_F32, {6, 5, 4, 3}, dim)); } + for (int dim = 1; dim < GGML_MAX_DIMS; ++dim) { + test_cases.emplace_back(new test_set(GGML_TYPE_I32, GGML_TYPE_I32, {6, 5, 4, 3}, dim)); + } + for (ggml_type type_src : {GGML_TYPE_F16, GGML_TYPE_F32}) { for (ggml_type type_dst : all_types) { test_cases.emplace_back(new test_cpy(type_src, type_dst, {256, 4, 4, 4})); From 36447f91124169dd565b2cf9ece452eb6e6bf071 Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Thu, 5 Dec 2024 13:27:42 +0200 Subject: [PATCH 149/245] sync : ggml --- scripts/sync-ggml.last | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/scripts/sync-ggml.last b/scripts/sync-ggml.last index 27769c93b512d..47eae44f7d7b6 100644 --- a/scripts/sync-ggml.last +++ b/scripts/sync-ggml.last @@ -1 +1 @@ -b903ffe79daf18c0aaacbebe44a7b93a6b8d0982 +74d66b63eaf207a24f3e93bb922aba131cbf2906 From d80a283667053f1b9501c9193960958ac76a6a1e Mon Sep 17 00:00:00 2001 From: Riccardo Orlando Date: Thu, 5 Dec 2024 19:30:59 +0100 Subject: [PATCH 150/245] llama : add Minerva 7B model support (#10673) * Support for Minerva 7B * Update convert_hf_to_gguf_update.py --- convert_hf_to_gguf.py | 3 +++ convert_hf_to_gguf_update.py | 1 + include/llama.h | 1 + src/llama-vocab.cpp | 1 + src/llama.cpp | 3 +++ 5 files changed, 9 insertions(+) diff --git a/convert_hf_to_gguf.py b/convert_hf_to_gguf.py index d8df5cc001951..9f1419e29eb4e 100755 --- a/convert_hf_to_gguf.py +++ b/convert_hf_to_gguf.py @@ -658,6 +658,9 @@ def get_vocab_base_pre(self, tokenizer) -> str: if chkhsh == "60824e3c0d9401f89943cbb2fff727f0e2d4c545ba4df2d6e4f09a6db0f5b450": # ref: https://huggingface.co/facebook/chameleon-7b res = "chameleon" + if chkhsh == "1431a23e583c97432bc230bff598d103ddb5a1f89960c8f1d1051aaa944d0b35": + # ref: https://huggingface.co/sapienzanlp/Minerva-7B-base-v1.0 + res = "minerva-7b" if res is None: logger.warning("\n") diff --git a/convert_hf_to_gguf_update.py b/convert_hf_to_gguf_update.py index 2a51fce2d8db0..ce3c571df3453 100755 --- a/convert_hf_to_gguf_update.py +++ b/convert_hf_to_gguf_update.py @@ -102,6 +102,7 @@ class TOKENIZER_TYPE(IntEnum): {"name": "exaone", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/LGAI-EXAONE/EXAONE-3.0-7.8B-Instruct", }, {"name": "phi-2", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/microsoft/phi-2", }, {"name": "chameleon", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/facebook/chameleon-7b", }, + {"name": "minerva-7b", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/sapienzanlp/Minerva-7B-base-v1.0", }, ] diff --git a/include/llama.h b/include/llama.h index 168c3fa1f6e3b..d121354c19d56 100644 --- a/include/llama.h +++ b/include/llama.h @@ -104,6 +104,7 @@ extern "C" { LLAMA_VOCAB_PRE_TYPE_GPT3_FINNISH = 24, LLAMA_VOCAB_PRE_TYPE_EXAONE = 25, LLAMA_VOCAB_PRE_TYPE_CHAMELEON = 26, + LLAMA_VOCAB_PRE_TYPE_MINERVA = 27, }; enum llama_rope_type { diff --git a/src/llama-vocab.cpp b/src/llama-vocab.cpp index d1dc96276c2a2..8c9aaf5a0c85d 100644 --- a/src/llama-vocab.cpp +++ b/src/llama-vocab.cpp @@ -418,6 +418,7 @@ struct llm_tokenizer_bpe : llm_tokenizer { case LLAMA_VOCAB_PRE_TYPE_SMOLLM: case LLAMA_VOCAB_PRE_TYPE_CODESHELL: case LLAMA_VOCAB_PRE_TYPE_EXAONE: + case LLAMA_VOCAB_PRE_TYPE_MINERVA: regex_exprs = { "\\p{N}", "'s|'t|'re|'ve|'m|'ll|'d| ?\\p{L}+| ?\\p{N}+| ?[^\\s\\p{L}\\p{N}]+|\\s+(?!\\S)", diff --git a/src/llama.cpp b/src/llama.cpp index f2dd1f47772d9..de38a2b4ac43a 100644 --- a/src/llama.cpp +++ b/src/llama.cpp @@ -6483,6 +6483,9 @@ static void llm_load_vocab( vocab.type_pre = LLAMA_VOCAB_PRE_TYPE_CHAMELEON; vocab.tokenizer_add_bos = true; vocab.tokenizer_clean_spaces = false; + } else if ( + tokenizer_pre == "minerva-7b") { + vocab.type_pre = LLAMA_VOCAB_PRE_TYPE_MINERVA; } else { throw std::runtime_error(format("unknown pre-tokenizer type: '%s'", tokenizer_pre.c_str())); } From 10512c37e6829598f08a22f860c78ef70752ad40 Mon Sep 17 00:00:00 2001 From: Jeff Bolz Date: Thu, 5 Dec 2024 13:15:05 -0600 Subject: [PATCH 151/245] vulkan: Add VK_NV_cooperative_matrix2 support for mul_mat and flash attention (#10206) --- ggml/src/ggml-vulkan/ggml-vulkan.cpp | 751 ++++++++++++++++-- .../ggml-vulkan/vulkan-shaders/CMakeLists.txt | 2 + .../vulkan-shaders/dequant_funcs_cm2.comp | 305 +++++++ .../vulkan-shaders/flash_attn_cm2.comp | 289 +++++++ .../vulkan-shaders/mul_mm_cm2.comp | 328 ++++++++ .../vulkan-shaders/vulkan-shaders-gen.cpp | 95 ++- 6 files changed, 1669 insertions(+), 101 deletions(-) create mode 100644 ggml/src/ggml-vulkan/vulkan-shaders/dequant_funcs_cm2.comp create mode 100644 ggml/src/ggml-vulkan/vulkan-shaders/flash_attn_cm2.comp create mode 100644 ggml/src/ggml-vulkan/vulkan-shaders/mul_mm_cm2.comp diff --git a/ggml/src/ggml-vulkan/ggml-vulkan.cpp b/ggml/src/ggml-vulkan/ggml-vulkan.cpp index 07b45d6b99838..c7ac0e8f7bd70 100644 --- a/ggml/src/ggml-vulkan/ggml-vulkan.cpp +++ b/ggml/src/ggml-vulkan/ggml-vulkan.cpp @@ -167,6 +167,7 @@ struct vk_device_struct { uint32_t subgroup_size; uint32_t shader_core_count; bool uma; + bool coopmat2; size_t idx; @@ -176,6 +177,7 @@ struct vk_device_struct { vk_matmul_pipeline2 pipeline_matmul_f16_f32; vk_pipeline pipeline_matmul_split_k_reduce; + vk_matmul_pipeline2 pipeline_dequant_mul_mat_mat_f16[GGML_TYPE_COUNT]; vk_matmul_pipeline2 pipeline_dequant_mul_mat_mat[GGML_TYPE_COUNT]; vk_matmul_pipeline pipeline_matmul_id_f32; @@ -229,6 +231,14 @@ struct vk_device_struct { vk_pipeline pipeline_timestep_embedding_f32; vk_pipeline pipeline_pool2d_f32; + // [2][2][2] is for {f16acc,f32acc}x{large,small_rows}x{unaligned, aligned} + vk_pipeline pipeline_flash_attn_f32_f16_D64[GGML_TYPE_COUNT][2][2][2]; + vk_pipeline pipeline_flash_attn_f32_f16_D80[GGML_TYPE_COUNT][2][2][2]; + vk_pipeline pipeline_flash_attn_f32_f16_D96[GGML_TYPE_COUNT][2][2][2]; + vk_pipeline pipeline_flash_attn_f32_f16_D112[GGML_TYPE_COUNT][2][2][2]; + vk_pipeline pipeline_flash_attn_f32_f16_D128[GGML_TYPE_COUNT][2][2][2]; + vk_pipeline pipeline_flash_attn_f32_f16_D256[GGML_TYPE_COUNT][2][2][2]; + std::unordered_map pipelines; std::unordered_map pipeline_descriptor_set_requirements; @@ -340,6 +350,40 @@ struct vk_mat_vec_id_push_constants { uint32_t nei0; uint32_t ne11; }; +struct vk_flash_attn_push_constants { + uint32_t N; + uint32_t KV; + + uint32_t ne1; + uint32_t ne2; + uint32_t ne3; + + uint32_t neq2; + uint32_t neq3; + uint32_t nek2; + uint32_t nek3; + uint32_t nev2; + uint32_t nev3; + uint32_t nem1; + + uint32_t nb02; + uint32_t nb03; + uint32_t nb12; + uint32_t nb13; + uint32_t nb22; + uint32_t nb23; + uint32_t nb31; + + float scale; + float max_bias; + float logit_softcap; + + uint32_t mask; + uint32_t n_head_log2; + float m0; + float m1; +}; + struct vk_op_push_constants { uint32_t KX; uint32_t KY; @@ -1265,6 +1309,23 @@ static void ggml_vk_wait_events(vk_context& ctx, std::vector&& events ); } +// number of rows/cols for flash attention shader +static constexpr uint32_t flash_attention_num_small_rows = 32; +static std::array fa_rows_cols(uint32_t D, uint32_t clamp, ggml_type type, bool small_rows) { + GGML_UNUSED(clamp); + + // small rows, large cols + if (small_rows) { + return {flash_attention_num_small_rows, 128}; + } + // small cols to reduce register count + if (ggml_is_quantized(type) || D == 256) { + return {64, 32}; + } + return {64, 64}; +}; + + static void ggml_vk_load_shaders(vk_device& device) { VK_LOG_DEBUG("ggml_vk_load_shaders(" << device->name << ")"); @@ -1275,59 +1336,98 @@ static void ggml_vk_load_shaders(vk_device& device) { // mulmat std::vector l_warptile, m_warptile, s_warptile, - l_warptile_mmq, m_warptile_mmq, s_warptile_mmq; + l_warptile_mmq, m_warptile_mmq, s_warptile_mmq, + l_warptile_mmq_k, m_warptile_mmq_k, s_warptile_mmq_k, + l_warptile_mmqid, m_warptile_mmqid, s_warptile_mmqid; std::array l_wg_denoms, m_wg_denoms, s_wg_denoms, - l_mmq_wg_denoms, m_mmq_wg_denoms, s_mmq_wg_denoms; - uint32_t l_align, m_align, s_align; + l_mmq_wg_denoms, m_mmq_wg_denoms, s_mmq_wg_denoms, + l_mmq_wg_denoms_k, m_mmq_wg_denoms_k, s_mmq_wg_denoms_k, + l_mmqid_wg_denoms, m_mmqid_wg_denoms, s_mmqid_wg_denoms; - l_warptile = { 128, 128, 128, 16, device->subgroup_size * 2, 64, 2, 4, 4, device->subgroup_size }; - m_warptile = { 128, 64, 64, 16, device->subgroup_size, 32, 2, 4, 2, device->subgroup_size }; - s_warptile = { subgroup_size_16, 32, 32, 16, 32, 32, 2, 2, 2, device->subgroup_size }; - - l_warptile_mmq = { 128, 128, 128, 32, device->subgroup_size * 2, 64, 2, 4, 4, device->subgroup_size }; - m_warptile_mmq = { 128, 64, 64, 32, device->subgroup_size, 32, 2, 4, 2, device->subgroup_size }; - s_warptile_mmq = { subgroup_size_16, 32, 32, 32, 32, 32, 2, 2, 2, device->subgroup_size }; - - l_mmq_wg_denoms = l_wg_denoms = {128, 128, 1 }; - m_mmq_wg_denoms = m_wg_denoms = { 64, 64, 1 }; - s_mmq_wg_denoms = s_wg_denoms = { 32, 32, 1 }; - - l_align = 128; - m_align = 64; - s_align = 32; - - // Fallback to smaller sizes if there's not enough shared memory. Given the current shaders - // and tile sizes, this should handle 16KB, 32KB, and 48KB+. - // This logic doesn't explicitly account for the 12KB row_ids in the mul_mat_mat_id shaders. - // But the numbers happen to work out for 32KB shared memory size that when using the medium - // size there's enough room for everything, and we assert for this. - uint32_t shmem_needed = (l_warptile[1] + l_warptile[2]) * (l_warptile[3] + 1) * sizeof(float); - if (shmem_needed > device->properties.limits.maxComputeSharedMemorySize) { - l_warptile = m_warptile; - l_wg_denoms = m_wg_denoms; - shmem_needed = (l_warptile[1] + l_warptile[2]) * (l_warptile[3] + 1) * sizeof(float); - GGML_ASSERT(shmem_needed <= device->properties.limits.maxComputeSharedMemorySize); - } - if (device->properties.limits.maxComputeSharedMemorySize >= 32768) { - // assert mul_mat_mat_id shaders will fit. - GGML_ASSERT(shmem_needed + 3072*4 <= device->properties.limits.maxComputeSharedMemorySize); - } - - shmem_needed = (l_warptile_mmq[1] + l_warptile_mmq[2]) * (l_warptile_mmq[3] + 1) * sizeof(float); - if (shmem_needed > device->properties.limits.maxComputeSharedMemorySize) { - if (device->properties.limits.maxComputeSharedMemorySize == 32768) { - l_warptile_mmq = m_warptile_mmq; - l_mmq_wg_denoms = m_mmq_wg_denoms; - } else { - l_warptile_mmq = s_warptile_mmq; - l_mmq_wg_denoms = s_mmq_wg_denoms; + uint32_t l_align, m_align, s_align; + if (device->coopmat2) { + // spec constants and tile sizes for non-quant matmul/matmul_id + l_warptile = { 256, 128, 256, 64 }; + m_warptile = { 256, 128, 128, 64 }; + s_warptile = { 128, 32, 16, 64 }; + l_wg_denoms = {128, 256, 1 }; + m_wg_denoms = {128, 128, 1 }; + s_wg_denoms = { 32, 16, 1 }; + + // spec constants and tile sizes for quant matmul (non-Qi_K) + l_warptile_mmq = { 256, 128, 256, 64 }; + m_warptile_mmq = { 256, 128, 128, 64 }; + s_warptile_mmq = { 256, 128, 128, 64 }; + l_mmq_wg_denoms = { 128, 256, 1 }; + m_mmq_wg_denoms = { 128, 128, 1 }; + s_mmq_wg_denoms = { 128, 128, 1 }; + + // spec constants and tile sizes for quant matmul (Qi_K) + l_warptile_mmq_k = { 256, 128, 512, 16 }; + m_warptile_mmq_k = { 256, 128, 256, 16 }; + s_warptile_mmq_k = { 256, 32, 128, 64 }; + l_mmq_wg_denoms_k = { 128, 512, 1 }; + m_mmq_wg_denoms_k = { 128, 256, 1 }; + s_mmq_wg_denoms_k = { 32, 128, 1 }; + + // spec constants and tile sizes for quant matmul_id + l_warptile_mmqid = { 256, 128, 128, 16 }; + m_warptile_mmqid = { 256, 128, 64, 16 }; + s_warptile_mmqid = { 256, 64, 64, 16 }; + l_mmqid_wg_denoms = { 128, 128, 1 }; + m_mmqid_wg_denoms = { 128, 64, 1 }; + s_mmqid_wg_denoms = { 64, 64, 1 }; + + l_align = 128; + m_align = 64; + s_align = 32; + } else { + l_warptile = { 128, 128, 128, 16, device->subgroup_size * 2, 64, 2, 4, 4, device->subgroup_size }; + m_warptile = { 128, 64, 64, 16, device->subgroup_size, 32, 2, 4, 2, device->subgroup_size }; + s_warptile = { subgroup_size_16, 32, 32, 16, 32, 32, 2, 2, 2, device->subgroup_size }; + l_warptile_mmq = { 128, 128, 128, 32, device->subgroup_size * 2, 64, 2, 4, 4, device->subgroup_size }; + m_warptile_mmq = { 128, 64, 64, 32, device->subgroup_size, 32, 2, 4, 2, device->subgroup_size }; + s_warptile_mmq = { subgroup_size_16, 32, 32, 32, 32, 32, 2, 2, 2, device->subgroup_size }; + l_mmq_wg_denoms = l_wg_denoms = {128, 128, 1 }; + m_mmq_wg_denoms = m_wg_denoms = { 64, 64, 1 }; + s_mmq_wg_denoms = s_wg_denoms = { 32, 32, 1 }; + l_align = 128; + m_align = 64; + s_align = 32; + + // Fallback to smaller sizes if there's not enough shared memory. Given the current shaders + // and tile sizes, this should handle 16KB, 32KB, and 48KB+. + // This logic doesn't explicitly account for the 12KB row_ids in the mul_mat_mat_id shaders. + // But the numbers happen to work out for 32KB shared memory size that when using the medium + // size there's enough room for everything, and we assert for this. + uint32_t shmem_needed = (l_warptile[1] + l_warptile[2]) * (l_warptile[3] + 1) * sizeof(float); + if (shmem_needed > device->properties.limits.maxComputeSharedMemorySize) { + l_warptile = m_warptile; + l_wg_denoms = m_wg_denoms; + shmem_needed = (l_warptile[1] + l_warptile[2]) * (l_warptile[3] + 1) * sizeof(float); + GGML_ASSERT(shmem_needed <= device->properties.limits.maxComputeSharedMemorySize); + } + if (device->properties.limits.maxComputeSharedMemorySize >= 32768) { + // assert mul_mat_mat_id shaders will fit. + GGML_ASSERT(shmem_needed + 3072*4 <= device->properties.limits.maxComputeSharedMemorySize); } + shmem_needed = (l_warptile_mmq[1] + l_warptile_mmq[2]) * (l_warptile_mmq[3] + 1) * sizeof(float); - GGML_ASSERT(shmem_needed <= device->properties.limits.maxComputeSharedMemorySize); - } - if (device->properties.limits.maxComputeSharedMemorySize >= 32768) { - // assert mul_mat_mat_id shaders will fit. - GGML_ASSERT(shmem_needed + 3072*4 <= device->properties.limits.maxComputeSharedMemorySize); + if (shmem_needed > device->properties.limits.maxComputeSharedMemorySize) { + if (device->properties.limits.maxComputeSharedMemorySize == 32768) { + l_warptile_mmq = m_warptile_mmq; + l_mmq_wg_denoms = m_mmq_wg_denoms; + } else { + l_warptile_mmq = s_warptile_mmq; + l_mmq_wg_denoms = s_mmq_wg_denoms; + } + shmem_needed = (l_warptile_mmq[1] + l_warptile_mmq[2]) * (l_warptile_mmq[3] + 1) * sizeof(float); + GGML_ASSERT(shmem_needed <= device->properties.limits.maxComputeSharedMemorySize); + } + if (device->properties.limits.maxComputeSharedMemorySize >= 32768) { + // assert mul_mat_mat_id shaders will fit. + GGML_ASSERT(shmem_needed + 3072*4 <= device->properties.limits.maxComputeSharedMemorySize); + } } device->pipeline_matmul_f32 = std::make_shared(); @@ -1362,6 +1462,105 @@ static void ggml_vk_load_shaders(vk_device& device) { compiles.push_back(std::async(ggml_vk_create_pipeline_func, std::ref(device), std::ref(pipeline), name, spv_size, spv_data, entrypoint, parameter_count, push_constant_size, wg_denoms, specialization_constants, align, disable_robustness)); }; +#if defined(VK_NV_cooperative_matrix2) + if (device->coopmat2) { + + auto const &fa_wg_denoms = [&](uint32_t D, uint32_t clamp, ggml_type type, bool small_rows) -> std::array { + return {fa_rows_cols(D, clamp, type, small_rows)[0], 1, 1}; + }; + + auto const &fa_spec_constants = [&](uint32_t D, uint32_t clamp, ggml_type type, bool small_rows) -> std::vector { + // For large number of rows, 128 invocations seems to work best. + // For small number of rows (e.g. N==1), 256 works better. But matrix granularity for 256 is 32, so we + // can't use 256 for D==80. + uint32_t wg_size = (small_rows && (D % 32) == 0) ? 256 : 128; + auto rows_cols = fa_rows_cols(D, clamp, type, small_rows); + return {wg_size, rows_cols[0], rows_cols[1], (D), clamp}; + }; + +#define CREATE_FA2(TYPE, NAMELC, D) \ + ggml_vk_create_pipeline(device, device->pipeline_flash_attn_f32_f16_D ## D[TYPE][0][0][0], "flash_attn_f32_f16_D" #D "_f16acc" #NAMELC, flash_attn_f32_f16_ ## NAMELC ## _f16acc_cm2_len, flash_attn_f32_f16_ ## NAMELC ## _f16acc_cm2_data, "main", 5, sizeof(vk_flash_attn_push_constants), fa_wg_denoms(D,1,TYPE,false), fa_spec_constants(D,1,TYPE,false), 1); \ + ggml_vk_create_pipeline(device, device->pipeline_flash_attn_f32_f16_D ## D[TYPE][0][0][1], "flash_attn_f32_f16_D" #D "_aligned_f16acc" #NAMELC, flash_attn_f32_f16_ ## NAMELC ## _f16acc_cm2_len, flash_attn_f32_f16_ ## NAMELC ## _f16acc_cm2_data, "main", 5, sizeof(vk_flash_attn_push_constants), fa_wg_denoms(D,0,TYPE,false), fa_spec_constants(D,0,TYPE,false), fa_rows_cols(D,0,TYPE,false)[1]); \ + ggml_vk_create_pipeline(device, device->pipeline_flash_attn_f32_f16_D ## D[TYPE][1][0][0], "flash_attn_f32_f16_D" #D "_f32acc" #NAMELC, flash_attn_f32_f16_ ## NAMELC ## _cm2_len, flash_attn_f32_f16_ ## NAMELC ## _cm2_data, "main", 5, sizeof(vk_flash_attn_push_constants), fa_wg_denoms(D,1,TYPE,false), fa_spec_constants(D,1,TYPE,false), 1); \ + ggml_vk_create_pipeline(device, device->pipeline_flash_attn_f32_f16_D ## D[TYPE][1][0][1], "flash_attn_f32_f16_D" #D "_aligned_f32acc" #NAMELC, flash_attn_f32_f16_ ## NAMELC ## _cm2_len, flash_attn_f32_f16_ ## NAMELC ## _cm2_data, "main", 5, sizeof(vk_flash_attn_push_constants), fa_wg_denoms(D,0,TYPE,false), fa_spec_constants(D,0,TYPE,false), fa_rows_cols(D,0,TYPE,false)[1]); \ + ggml_vk_create_pipeline(device, device->pipeline_flash_attn_f32_f16_D ## D[TYPE][0][1][0], "flash_attn_f32_f16_D" #D "_f16acc_smallrows" #NAMELC, flash_attn_f32_f16_ ## NAMELC ## _f16acc_cm2_len, flash_attn_f32_f16_ ## NAMELC ## _f16acc_cm2_data, "main", 5, sizeof(vk_flash_attn_push_constants), fa_wg_denoms(D,1,TYPE,true), fa_spec_constants(D,1,TYPE,true), 1); \ + ggml_vk_create_pipeline(device, device->pipeline_flash_attn_f32_f16_D ## D[TYPE][0][1][1], "flash_attn_f32_f16_D" #D "_aligned_f16acc_smallrows" #NAMELC, flash_attn_f32_f16_ ## NAMELC ## _f16acc_cm2_len, flash_attn_f32_f16_ ## NAMELC ## _f16acc_cm2_data, "main", 5, sizeof(vk_flash_attn_push_constants), fa_wg_denoms(D,0,TYPE,true), fa_spec_constants(D,0,TYPE,true), fa_rows_cols(D,0,TYPE,true)[1]); \ + ggml_vk_create_pipeline(device, device->pipeline_flash_attn_f32_f16_D ## D[TYPE][1][1][0], "flash_attn_f32_f16_D" #D "_f32acc_smallrows" #NAMELC, flash_attn_f32_f16_ ## NAMELC ## _cm2_len, flash_attn_f32_f16_ ## NAMELC ## _cm2_data, "main", 5, sizeof(vk_flash_attn_push_constants), fa_wg_denoms(D,1,TYPE,true), fa_spec_constants(D,1,TYPE,true), 1); \ + ggml_vk_create_pipeline(device, device->pipeline_flash_attn_f32_f16_D ## D[TYPE][1][1][1], "flash_attn_f32_f16_D" #D "_aligned_f32acc_smallrows" #NAMELC, flash_attn_f32_f16_ ## NAMELC ## _cm2_len, flash_attn_f32_f16_ ## NAMELC ## _cm2_data, "main", 5, sizeof(vk_flash_attn_push_constants), fa_wg_denoms(D,0,TYPE,true), fa_spec_constants(D,0,TYPE,true), fa_rows_cols(D,0,TYPE,true)[1]); \ + +#define CREATE_FA(TYPE, NAMELC) \ + CREATE_FA2(TYPE, NAMELC, 64) \ + CREATE_FA2(TYPE, NAMELC, 80) \ + CREATE_FA2(TYPE, NAMELC, 96) \ + CREATE_FA2(TYPE, NAMELC, 112) \ + CREATE_FA2(TYPE, NAMELC, 128) \ + CREATE_FA2(TYPE, NAMELC, 256) + + CREATE_FA(GGML_TYPE_F16, f16) + CREATE_FA(GGML_TYPE_Q4_0, q4_0) + CREATE_FA(GGML_TYPE_Q4_1, q4_1) + CREATE_FA(GGML_TYPE_Q5_0, q5_0) + CREATE_FA(GGML_TYPE_Q5_1, q5_1) + CREATE_FA(GGML_TYPE_Q8_0, q8_0) + // K dequants currently disabled because D dimension is rounded up to 256 and runs inefficiently + //CREATE_FA(GGML_TYPE_Q2_K, q2_k) + //CREATE_FA(GGML_TYPE_Q3_K, q3_k) + //CREATE_FA(GGML_TYPE_Q4_K, q4_k) + //CREATE_FA(GGML_TYPE_Q5_K, q5_k) + //CREATE_FA(GGML_TYPE_Q6_K, q6_k) + CREATE_FA(GGML_TYPE_IQ4_NL, iq4_nl) +#undef CREATE_FA + + // Create 6 variants, {s,m,l}x{unaligned,aligned} +#define CREATE_MM(PIPELINE_NAME, NAMELC, F16ACC, WG_DENOMS, WARPTILE, PUSHCONST, PARAMCOUNT) \ + ggml_vk_create_pipeline(device, device-> PIPELINE_NAME ->l, #NAMELC #F16ACC "_l", NAMELC ## F16ACC ## _cm2_len, NAMELC ## F16ACC ## _cm2_data, "main", PARAMCOUNT, sizeof(PUSHCONST), l_ ## WG_DENOMS, l_ ## WARPTILE, 1); \ + ggml_vk_create_pipeline(device, device-> PIPELINE_NAME ->m, #NAMELC #F16ACC "_m", NAMELC ## F16ACC ## _cm2_len, NAMELC ## F16ACC ## _cm2_data, "main", PARAMCOUNT, sizeof(PUSHCONST), m_ ## WG_DENOMS, m_ ## WARPTILE, 1); \ + ggml_vk_create_pipeline(device, device-> PIPELINE_NAME ->s, #NAMELC #F16ACC "_s", NAMELC ## F16ACC ## _cm2_len, NAMELC ## F16ACC ## _cm2_data, "main", PARAMCOUNT, sizeof(PUSHCONST), s_ ## WG_DENOMS, s_ ## WARPTILE, 1); \ + ggml_vk_create_pipeline(device, device-> PIPELINE_NAME ->a_l, #NAMELC #F16ACC "_aligned_l", NAMELC ## _aligned ## F16ACC ## _cm2_len, NAMELC ## _aligned ## F16ACC ## _cm2_data, "main", PARAMCOUNT, sizeof(PUSHCONST), l_ ## WG_DENOMS, l_ ## WARPTILE, l_align); \ + ggml_vk_create_pipeline(device, device-> PIPELINE_NAME ->a_m, #NAMELC #F16ACC "_aligned_m", NAMELC ## _aligned ## F16ACC ## _cm2_len, NAMELC ## _aligned ## F16ACC ## _cm2_data, "main", PARAMCOUNT, sizeof(PUSHCONST), m_ ## WG_DENOMS, m_ ## WARPTILE, m_align); \ + ggml_vk_create_pipeline(device, device-> PIPELINE_NAME ->a_s, #NAMELC #F16ACC "_aligned_s", NAMELC ## _aligned ## F16ACC ## _cm2_len, NAMELC ## _aligned ## F16ACC ## _cm2_data, "main", PARAMCOUNT, sizeof(PUSHCONST), s_ ## WG_DENOMS, s_ ## WARPTILE, s_align); \ + + // Create 2 variants, {f16,f32} accumulator +#define CREATE_MM2(PIPELINE_NAME, NAMELC, WG_DENOMS, WARPTILE, PUSHCONST, PARAMCOUNT) \ + CREATE_MM(PIPELINE_NAME . f16acc, NAMELC, _f16acc, WG_DENOMS, WARPTILE, PUSHCONST, PARAMCOUNT) \ + CREATE_MM(PIPELINE_NAME . f32acc, NAMELC, , WG_DENOMS, WARPTILE, PUSHCONST, PARAMCOUNT) \ + + CREATE_MM(pipeline_matmul_f32, matmul_f32_f32, , wg_denoms, warptile, vk_mat_mat_push_constants, 3) + CREATE_MM(pipeline_matmul_f32_f16, matmul_f32_f16, , wg_denoms, warptile, vk_mat_mat_push_constants, 3) + + CREATE_MM2(pipeline_matmul_f16, matmul_f16, wg_denoms, warptile, vk_mat_mat_push_constants, 3) + CREATE_MM2(pipeline_matmul_f16_f32, matmul_f16_f32, wg_denoms, warptile, vk_mat_mat_push_constants, 3) + CREATE_MM(pipeline_dequant_mul_mat_mat_f16[GGML_TYPE_Q4_0].f16acc, matmul_q4_0_f16, _f16acc, mmq_wg_denoms, warptile_mmq, vk_mat_mat_push_constants, 3) + CREATE_MM(pipeline_dequant_mul_mat_mat_f16[GGML_TYPE_Q4_1].f16acc, matmul_q4_1_f16, _f16acc, mmq_wg_denoms, warptile_mmq, vk_mat_mat_push_constants, 3) + CREATE_MM(pipeline_dequant_mul_mat_mat_f16[GGML_TYPE_Q5_0].f16acc, matmul_q5_0_f16, _f16acc, mmq_wg_denoms, warptile_mmq, vk_mat_mat_push_constants, 3) + CREATE_MM(pipeline_dequant_mul_mat_mat_f16[GGML_TYPE_Q5_1].f16acc, matmul_q5_1_f16, _f16acc, mmq_wg_denoms, warptile_mmq, vk_mat_mat_push_constants, 3) + CREATE_MM(pipeline_dequant_mul_mat_mat_f16[GGML_TYPE_Q8_0].f16acc, matmul_q8_0_f16, _f16acc, mmq_wg_denoms, warptile_mmq, vk_mat_mat_push_constants, 3) + CREATE_MM(pipeline_dequant_mul_mat_mat_f16[GGML_TYPE_Q2_K].f16acc, matmul_q2_k_f16, _f16acc, mmq_wg_denoms_k, warptile_mmq_k, vk_mat_mat_push_constants, 3) + CREATE_MM(pipeline_dequant_mul_mat_mat_f16[GGML_TYPE_Q3_K].f16acc, matmul_q3_k_f16, _f16acc, mmq_wg_denoms_k, warptile_mmq_k, vk_mat_mat_push_constants, 3) + CREATE_MM(pipeline_dequant_mul_mat_mat_f16[GGML_TYPE_Q4_K].f16acc, matmul_q4_k_f16, _f16acc, mmq_wg_denoms_k, warptile_mmq_k, vk_mat_mat_push_constants, 3) + CREATE_MM(pipeline_dequant_mul_mat_mat_f16[GGML_TYPE_Q5_K].f16acc, matmul_q5_k_f16, _f16acc, mmq_wg_denoms_k, warptile_mmq_k, vk_mat_mat_push_constants, 3) + CREATE_MM(pipeline_dequant_mul_mat_mat_f16[GGML_TYPE_Q6_K].f16acc, matmul_q6_k_f16, _f16acc, mmq_wg_denoms_k, warptile_mmq_k, vk_mat_mat_push_constants, 3) + CREATE_MM(pipeline_dequant_mul_mat_mat_f16[GGML_TYPE_IQ4_NL].f16acc, matmul_iq4_nl_f16, _f16acc, mmq_wg_denoms, warptile_mmq, vk_mat_mat_push_constants, 3) + + CREATE_MM(pipeline_matmul_id_f32, matmul_id_f32_f32, , wg_denoms, warptile, vk_mat_mat_id_push_constants, 4) + CREATE_MM(pipeline_matmul_id_f16, matmul_id_f16, , wg_denoms, warptile, vk_mat_mat_id_push_constants, 4) + CREATE_MM(pipeline_matmul_id_f16_f32, matmul_id_f16_f32, , wg_denoms, warptile, vk_mat_mat_id_push_constants, 4) + + CREATE_MM(pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q4_0], matmul_id_q4_0_f32, , mmqid_wg_denoms, warptile_mmqid, vk_mat_mat_id_push_constants, 4) + CREATE_MM(pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q4_1], matmul_id_q4_1_f32, , mmqid_wg_denoms, warptile_mmqid, vk_mat_mat_id_push_constants, 4) + CREATE_MM(pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q5_0], matmul_id_q5_0_f32, , mmqid_wg_denoms, warptile_mmqid, vk_mat_mat_id_push_constants, 4) + CREATE_MM(pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q5_1], matmul_id_q5_1_f32, , mmqid_wg_denoms, warptile_mmqid, vk_mat_mat_id_push_constants, 4) + CREATE_MM(pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q8_0], matmul_id_q8_0_f32, , mmqid_wg_denoms, warptile_mmqid, vk_mat_mat_id_push_constants, 4) + CREATE_MM(pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q2_K], matmul_id_q2_k_f32, , mmqid_wg_denoms, warptile_mmqid, vk_mat_mat_id_push_constants, 4) + CREATE_MM(pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q3_K], matmul_id_q3_k_f32, , mmqid_wg_denoms, warptile_mmqid, vk_mat_mat_id_push_constants, 4) + CREATE_MM(pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q4_K], matmul_id_q4_k_f32, , mmqid_wg_denoms, warptile_mmqid, vk_mat_mat_id_push_constants, 4) + CREATE_MM(pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q5_K], matmul_id_q5_k_f32, , mmqid_wg_denoms, warptile_mmqid, vk_mat_mat_id_push_constants, 4) + CREATE_MM(pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q6_K], matmul_id_q6_k_f32, , mmqid_wg_denoms, warptile_mmqid, vk_mat_mat_id_push_constants, 4) + CREATE_MM(pipeline_dequant_mul_mat_mat_id[GGML_TYPE_IQ4_NL], matmul_id_iq4_nl_f32, , mmqid_wg_denoms, warptile_mmqid, vk_mat_mat_id_push_constants, 4) +#undef CREATE_MM +#undef CREATE_MM2 + } else +#endif if (device->fp16) { // Create 6 variants, {s,m,l}x{unaligned,aligned} #define CREATE_MM(PIPELINE_NAME, NAMELC, F16ACC, WG_DENOMS, WARPTILE, PUSHCONST, PARAMCOUNT) \ @@ -1648,15 +1847,28 @@ static vk_device ggml_vk_get_device(size_t idx) { device->physical_device = physical_devices[dev_num]; const std::vector ext_props = device->physical_device.enumerateDeviceExtensionProperties(); + bool fp16_storage = false; + bool fp16_compute = false; bool maintenance4_support = false; bool sm_builtins = false; + bool pipeline_robustness = false; + bool coopmat2_support = false; // Check if maintenance4 is supported for (const auto& properties : ext_props) { if (strcmp("VK_KHR_maintenance4", properties.extensionName) == 0) { maintenance4_support = true; + } else if (strcmp("VK_KHR_16bit_storage", properties.extensionName) == 0) { + fp16_storage = true; + } else if (strcmp("VK_KHR_shader_float16_int8", properties.extensionName) == 0) { + fp16_compute = true; } else if (strcmp("VK_NV_shader_sm_builtins", properties.extensionName) == 0) { sm_builtins = true; + } else if (strcmp("VK_EXT_pipeline_robustness", properties.extensionName) == 0) { + pipeline_robustness = true; + } else if (strcmp("VK_NV_cooperative_matrix2", properties.extensionName) == 0 && + !getenv("GGML_VULKAN_DISABLE_COOPMAT2")) { + coopmat2_support = true; } } @@ -1679,6 +1891,14 @@ static vk_device ggml_vk_get_device(size_t idx) { last_struct = (VkBaseOutStructure *)&sm_props; } +#if defined(VK_NV_cooperative_matrix2) + vk::PhysicalDeviceCooperativeMatrix2PropertiesNV coopmat2_props; + if (coopmat2_support) { + last_struct->pNext = (VkBaseOutStructure *)&coopmat2_props; + last_struct = (VkBaseOutStructure *)&coopmat2_props; + } +#endif + device->physical_device.getProperties2(&props2); device->properties = props2.properties; @@ -1701,20 +1921,6 @@ static vk_device ggml_vk_get_device(size_t idx) { device->shader_core_count = 0; } - bool fp16_storage = false; - bool fp16_compute = false; - bool pipeline_robustness = false; - - for (const auto& properties : ext_props) { - if (strcmp("VK_KHR_16bit_storage", properties.extensionName) == 0) { - fp16_storage = true; - } else if (strcmp("VK_KHR_shader_float16_int8", properties.extensionName) == 0) { - fp16_compute = true; - } else if (strcmp("VK_EXT_pipeline_robustness", properties.extensionName) == 0) { - pipeline_robustness = true; - } - } - const char* GGML_VK_DISABLE_F16 = getenv("GGML_VK_DISABLE_F16"); const bool force_disable_f16 = GGML_VK_DISABLE_F16 != nullptr; @@ -1757,22 +1963,112 @@ static vk_device ggml_vk_get_device(size_t idx) { vk12_features.sType = VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_VULKAN_1_2_FEATURES; vk11_features.pNext = &vk12_features; + last_struct = (VkBaseOutStructure *)&vk12_features; + VkPhysicalDevicePipelineRobustnessFeaturesEXT pl_robustness_features; pl_robustness_features.pNext = nullptr; pl_robustness_features.sType = VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_PIPELINE_ROBUSTNESS_FEATURES_EXT; pl_robustness_features.pipelineRobustness = VK_FALSE; if (pipeline_robustness) { - vk12_features.pNext = &pl_robustness_features; + last_struct->pNext = (VkBaseOutStructure *)&pl_robustness_features; + last_struct = (VkBaseOutStructure *)&pl_robustness_features; device_extensions.push_back("VK_EXT_pipeline_robustness"); } +#if defined(VK_NV_cooperative_matrix2) + VkPhysicalDeviceCooperativeMatrix2FeaturesNV coopmat2_features {}; + coopmat2_features.pNext = nullptr; + coopmat2_features.sType = VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_COOPERATIVE_MATRIX_2_FEATURES_NV; + if (coopmat2_support) { + last_struct->pNext = (VkBaseOutStructure *)&coopmat2_features; + last_struct = (VkBaseOutStructure *)&coopmat2_features; + device_extensions.push_back("VK_NV_cooperative_matrix2"); + } +#endif + vkGetPhysicalDeviceFeatures2(device->physical_device, &device_features2); device->fp16 = device->fp16 && vk12_features.shaderFloat16; device->pipeline_robustness = pl_robustness_features.pipelineRobustness; + if (coopmat2_support) { +#if defined(VK_NV_cooperative_matrix2) + if (coopmat2_features.cooperativeMatrixWorkgroupScope && + coopmat2_features.cooperativeMatrixFlexibleDimensions && + coopmat2_features.cooperativeMatrixReductions && + coopmat2_features.cooperativeMatrixConversions && + coopmat2_features.cooperativeMatrixPerElementOperations && + coopmat2_features.cooperativeMatrixTensorAddressing && + coopmat2_features.cooperativeMatrixBlockLoads && + vk12_features.bufferDeviceAddress) { + + std::vector flexible_dimensions; + uint32_t count = 0; + + PFN_vkGetPhysicalDeviceCooperativeMatrixFlexibleDimensionsPropertiesNV + _vkGetPhysicalDeviceCooperativeMatrixFlexibleDimensionsPropertiesNV = + (PFN_vkGetPhysicalDeviceCooperativeMatrixFlexibleDimensionsPropertiesNV) + vk_instance.instance.getProcAddr("vkGetPhysicalDeviceCooperativeMatrixFlexibleDimensionsPropertiesNV"); + + _vkGetPhysicalDeviceCooperativeMatrixFlexibleDimensionsPropertiesNV(device->physical_device, &count, nullptr); + + VkCooperativeMatrixFlexibleDimensionsPropertiesNV empty_prop {}; + empty_prop.sType = VK_STRUCTURE_TYPE_COOPERATIVE_MATRIX_FLEXIBLE_DIMENSIONS_PROPERTIES_NV; + flexible_dimensions.resize(count, empty_prop); + + _vkGetPhysicalDeviceCooperativeMatrixFlexibleDimensionsPropertiesNV(device->physical_device, &count, flexible_dimensions.data()); + + bool found_fp16_128 = false, + found_fp16_256 = false, + found_fp32_128 = false, + found_fp32_256 = false; + // need to support fp16*fp16 with fp16/fp32 accumulator, for workgroupsize 128 + // with 32x16x16 and 256 with 32x32x16. + for (auto &prop : flexible_dimensions) { + if (prop.saturatingAccumulation == VK_FALSE && + prop.scope == VK_SCOPE_WORKGROUP_KHR && + prop.AType == VK_COMPONENT_TYPE_FLOAT16_KHR && + prop.BType == VK_COMPONENT_TYPE_FLOAT16_KHR) { + + if (prop.workgroupInvocations == 128 && + prop.MGranularity <= 32 && + prop.NGranularity <= 16 && + prop.KGranularity <= 16) { + if (prop.CType == VK_COMPONENT_TYPE_FLOAT16_KHR && + prop.ResultType == VK_COMPONENT_TYPE_FLOAT16_KHR) { + found_fp16_128 = true; + } + if (prop.CType == VK_COMPONENT_TYPE_FLOAT32_KHR && + prop.ResultType == VK_COMPONENT_TYPE_FLOAT32_KHR) { + found_fp32_128 = true; + } + } + if (prop.workgroupInvocations == 256 && + prop.MGranularity <= 32 && + prop.NGranularity <= 32 && + prop.KGranularity <= 16) { + if (prop.CType == VK_COMPONENT_TYPE_FLOAT16_KHR && + prop.ResultType == VK_COMPONENT_TYPE_FLOAT16_KHR) { + found_fp16_256 = true; + } + if (prop.CType == VK_COMPONENT_TYPE_FLOAT32_KHR && + prop.ResultType == VK_COMPONENT_TYPE_FLOAT32_KHR) { + found_fp32_256 = true; + } + } + } + } + if (found_fp16_128 && found_fp16_256 && + found_fp32_128 && found_fp32_256 && + coopmat2_props.cooperativeMatrixFlexibleDimensionsMaxDimension >= 512) { + device->coopmat2 = true; + } + } +#endif + } + if (!vk11_features.storageBuffer16BitAccess) { std::cerr << "ggml_vulkan: device " << GGML_VK_NAME << idx << " does not support 16-bit storage." << std::endl; throw std::runtime_error("Unsupported device"); @@ -2124,7 +2420,7 @@ static vk_pipeline ggml_vk_get_to_fp16(ggml_backend_vk_context * ctx, ggml_type return ctx->device->pipeline_dequant[type]; } -static vk_matmul_pipeline ggml_vk_get_mul_mat_mat_pipeline(ggml_backend_vk_context * ctx, ggml_type src0_type, ggml_type src1_type) { +static vk_matmul_pipeline ggml_vk_get_mul_mat_mat_pipeline(ggml_backend_vk_context * ctx, ggml_type src0_type, ggml_type src1_type, ggml_prec prec) { VK_LOG_DEBUG("ggml_vk_get_mul_mat_mat_pipeline(" << ggml_type_name(src0_type) << ", " << ggml_type_name(src1_type) << ")"); if (src0_type == GGML_TYPE_F32 && src1_type == GGML_TYPE_F32) { return ctx->device->pipeline_matmul_f32; @@ -2132,14 +2428,23 @@ static vk_matmul_pipeline ggml_vk_get_mul_mat_mat_pipeline(ggml_backend_vk_conte if (src0_type == GGML_TYPE_F32 && src1_type == GGML_TYPE_F16) { return ctx->device->pipeline_matmul_f32_f16; } - if (src0_type == GGML_TYPE_F16 && src1_type == GGML_TYPE_F32) { - return ctx->device->pipeline_matmul_f16_f32.f32acc; - } - if (src0_type == GGML_TYPE_F16 && src1_type == GGML_TYPE_F16) { - return ctx->device->pipeline_matmul_f16.f32acc; + if (prec == GGML_PREC_DEFAULT && ctx->device->coopmat2) { + if (src0_type == GGML_TYPE_F16 && src1_type == GGML_TYPE_F32) { + return ctx->device->pipeline_matmul_f16_f32.f16acc; + } + if (src0_type == GGML_TYPE_F16 && src1_type == GGML_TYPE_F16) { + return ctx->device->pipeline_matmul_f16.f16acc; + } + } else { + if (src0_type == GGML_TYPE_F16 && src1_type == GGML_TYPE_F32) { + return ctx->device->pipeline_matmul_f16_f32.f32acc; + } + if (src0_type == GGML_TYPE_F16 && src1_type == GGML_TYPE_F16) { + return ctx->device->pipeline_matmul_f16.f32acc; + } } - if (src1_type != GGML_TYPE_F32) { + if (src1_type != GGML_TYPE_F32 && !ctx->device->coopmat2) { return nullptr; } @@ -2160,6 +2465,10 @@ static vk_matmul_pipeline ggml_vk_get_mul_mat_mat_pipeline(ggml_backend_vk_conte return nullptr; } + if (ctx->device->coopmat2) { + assert(src1_type == GGML_TYPE_F16); + return ctx->device->pipeline_dequant_mul_mat_mat_f16[src0_type].f16acc; + } return ctx->device->pipeline_dequant_mul_mat_mat[src0_type].f32acc; } @@ -2844,6 +3153,16 @@ static vk_pipeline ggml_vk_guess_matmul_pipeline(ggml_backend_vk_context * ctx, break; } + if (ctx->device->coopmat2) { + if ((m % mmp->l->wg_denoms[0]) == 0 && (n % mmp->l->wg_denoms[1]) == 0) { + return aligned ? mmp->a_l : mmp->l; + } + if ((m % mmp->m->wg_denoms[0]) == 0 && (n % mmp->m->wg_denoms[1]) == 0) { + return aligned ? mmp->a_m : mmp->m; + } + return aligned ? mmp->a_s : mmp->s; + } + if (m <= 32 || n <= 32) { return aligned ? mmp->a_s : mmp->s; } @@ -3008,18 +3327,20 @@ static void ggml_vk_mul_mat_q_f16(ggml_backend_vk_context * ctx, vk_context& sub } const bool x_non_contig = !ggml_vk_dim01_contiguous(src0); - const bool y_non_contig = !ggml_vk_dim01_contiguous(src1); + // Reformat and convert to fp16 if src1 is non-contiguous, or for coopmat2 for better perf + const bool y_non_contig = (ctx->device->coopmat2 && src1->type == GGML_TYPE_F32) || + !ggml_vk_dim01_contiguous(src1); const bool y_f32_kernel = src1->type == GGML_TYPE_F32 && !y_non_contig; - vk_matmul_pipeline mmp = ggml_vk_get_mul_mat_mat_pipeline(ctx, src0->type, y_non_contig ? GGML_TYPE_F16 : src1->type); + vk_matmul_pipeline mmp = ggml_vk_get_mul_mat_mat_pipeline(ctx, src0->type, y_non_contig ? GGML_TYPE_F16 : src1->type, (ggml_prec)dst->op_params[0]); const bool qx_needs_dequant = mmp == nullptr || x_non_contig; const bool qy_needs_dequant = (src1->type != GGML_TYPE_F16 && !y_f32_kernel) || y_non_contig; if (qx_needs_dequant) { // Fall back to dequant + f16 mulmat - mmp = ggml_vk_get_mul_mat_mat_pipeline(ctx, GGML_TYPE_F16, y_f32_kernel ? GGML_TYPE_F32 : GGML_TYPE_F16); + mmp = ggml_vk_get_mul_mat_mat_pipeline(ctx, GGML_TYPE_F16, y_f32_kernel ? GGML_TYPE_F32 : GGML_TYPE_F16, (ggml_prec)dst->op_params[0]); } // Not implemented @@ -3930,6 +4251,167 @@ static void ggml_vk_mul_mat_id(ggml_backend_vk_context * ctx, vk_context& subctx } } +static void ggml_vk_flash_attn(ggml_backend_vk_context * ctx, vk_context& subctx, const ggml_tensor * q, const ggml_tensor * k, const ggml_tensor * v, const ggml_tensor * mask, ggml_tensor * dst, bool dryrun = false) { + VK_LOG_DEBUG("ggml_vk_flash_attn((" << q << ", name=" << q->name << ", type=" << q->type << ", ne0=" << q->ne[0] << ", ne1=" << q->ne[1] << ", ne2=" << q->ne[2] << ", ne3=" << q->ne[3] << ", nb0=" << q->nb[0] << ", nb1=" << q->nb[1] << ", nb2=" << q->nb[2] << ", nb3=" << q->nb[3]; + std::cerr << "), (" << k << ", name=" << k->name << ", type=" << k->type << ", ne0=" << k->ne[0] << ", ne1=" << k->ne[1] << ", ne2=" << k->ne[2] << ", ne3=" << k->ne[3] << ", nb0=" << k->nb[0] << ", nb1=" << k->nb[1] << ", nb2=" << k->nb[2] << ", nb3=" << k->nb[3]; + std::cerr << "), (" << v << ", name=" << v->name << ", type=" << v->type << ", ne0=" << v->ne[0] << ", ne1=" << v->ne[1] << ", ne2=" << v->ne[2] << ", ne3=" << v->ne[3] << ", nb0=" << v->nb[0] << ", nb1=" << v->nb[1] << ", nb2=" << v->nb[2] << ", nb3=" << v->nb[3]; + std::cerr << "), (" << dst << ", name=" << dst->name << ", type=" << dst->type << ", ne0=" << dst->ne[0] << ", ne1=" << dst->ne[1] << ", ne2=" << dst->ne[2] << ", ne3=" << dst->ne[3] << ", nb0=" << dst->nb[0] << ", nb1=" << dst->nb[1] << ", nb2=" << dst->nb[2] << ", nb3=" << dst->nb[3]; + std::cerr << "), " << (dryrun ? "dryrun" : "") << ")"); + + GGML_TENSOR_LOCALS(int64_t, neq, q, ne) + GGML_TENSOR_LOCALS(size_t, nbq, q, nb) + GGML_TENSOR_LOCALS(int64_t, nek, k, ne) + GGML_TENSOR_LOCALS(size_t, nbk, k, nb) + GGML_TENSOR_LOCALS(int64_t, nev, v, ne) + GGML_TENSOR_LOCALS(size_t, nbv, v, nb) + GGML_TENSOR_LOCALS(int64_t, ne, dst, ne) + GGML_TENSOR_LOCALS(size_t, nb, dst, nb) + + const uint32_t nem1 = mask ? mask->ne[1] : 0; + const uint32_t nbm1 = mask ? mask->nb[1] : 0; + + const uint32_t D = neq0; + const uint32_t N = neq1; + const uint32_t KV = nek1; + + GGML_ASSERT(ne0 == D); + GGML_ASSERT(ne2 == N); + + // input tensor rows must be contiguous + GGML_ASSERT(nbq0 == ggml_type_size(q->type)); + GGML_ASSERT(nbk0 == ggml_type_size(k->type)); + GGML_ASSERT(nbv0 == ggml_type_size(v->type)); + + GGML_ASSERT(neq0 == D); + GGML_ASSERT(nek0 == D); + GGML_ASSERT(nev0 == D); + + GGML_ASSERT(neq1 == N); + GGML_ASSERT(nev0 == D); + + GGML_ASSERT(nev1 == nek1); + + // dst cannot be transposed or permuted + GGML_ASSERT(nb0 == sizeof(float)); + GGML_ASSERT(nb0 <= nb1); + GGML_ASSERT(nb1 <= nb2); + GGML_ASSERT(nb2 <= nb3); + + assert(dst->type == GGML_TYPE_F32); + assert(q->type == GGML_TYPE_F32); + assert(k->type == v->type); + + vk_pipeline *pipelines; + // XXX TODO other backends may be changing accumulator precision to default to f32 soon + bool f32acc = dst->op_params[3] == GGML_PREC_F32; + bool small_rows = N <= flash_attention_num_small_rows; + switch (D) { + case 64: pipelines = &ctx->device->pipeline_flash_attn_f32_f16_D64[k->type][f32acc][small_rows][0]; break; + case 80: pipelines = &ctx->device->pipeline_flash_attn_f32_f16_D80[k->type][f32acc][small_rows][0]; break; + case 96: pipelines = &ctx->device->pipeline_flash_attn_f32_f16_D96[k->type][f32acc][small_rows][0]; break; + case 112: pipelines = &ctx->device->pipeline_flash_attn_f32_f16_D112[k->type][f32acc][small_rows][0]; break; + case 128: pipelines = &ctx->device->pipeline_flash_attn_f32_f16_D128[k->type][f32acc][small_rows][0]; break; + case 256: pipelines = &ctx->device->pipeline_flash_attn_f32_f16_D256[k->type][f32acc][small_rows][0]; break; + default: + assert(!"unsupported D value"); + return; + } + assert(pipelines); + + bool aligned = (KV % pipelines[1]->align) == 0; + vk_pipeline pipeline = pipelines[aligned]; + assert(pipeline); + + if (dryrun) { + // Request descriptor sets + ggml_pipeline_request_descriptor_sets(ctx->device, pipeline, 1); + return; + } + + float scale = 1.0f; + float max_bias = 0.0f; + float logit_softcap = 0.0f; + + memcpy(&scale, (const float *) dst->op_params + 0, sizeof(float)); + memcpy(&max_bias, (const float *) dst->op_params + 1, sizeof(float)); + memcpy(&logit_softcap, (const float *) dst->op_params + 2, sizeof(float)); + + if (logit_softcap != 0) { + scale /= logit_softcap; + } + + const uint32_t n_head_kv = neq2; + const uint32_t n_head_log2 = 1u << (uint32_t) floorf(log2f((float) n_head_kv)); + const float m0 = powf(2.0f, -(max_bias ) / n_head_log2); + const float m1 = powf(2.0f, -(max_bias / 2.0f) / n_head_log2); + + ggml_vk_sync_buffers(subctx); + + vk_buffer d_Q, d_K, d_V, d_D, d_M; + uint64_t q_buf_offset, k_buf_offset, v_buf_offset, d_buf_offset, m_buf_offset; + + bool Q_uma = false, K_uma = false, V_uma = false, D_uma = false, M_uma = false; + + if (ctx->device->uma) { + ggml_vk_host_get(ctx->device, q->data, d_Q, q_buf_offset); + ggml_vk_host_get(ctx->device, k->data, d_K, q_buf_offset); + ggml_vk_host_get(ctx->device, v->data, d_V, q_buf_offset); + ggml_vk_host_get(ctx->device, dst->data, d_D, q_buf_offset); + Q_uma = d_Q != nullptr; + K_uma = d_K != nullptr; + V_uma = d_V != nullptr; + D_uma = d_D != nullptr; + if (mask) { + ggml_vk_host_get(ctx->device, mask->data, d_M, q_buf_offset); + M_uma = d_M != nullptr; + } + } + + + ggml_backend_vk_buffer_context * d_buf_ctx = (ggml_backend_vk_buffer_context *)dst->buffer->context; + ggml_backend_vk_buffer_context * q_buf_ctx = (ggml_backend_vk_buffer_context *)q->buffer->context; + ggml_backend_vk_buffer_context * k_buf_ctx = (ggml_backend_vk_buffer_context *)k->buffer->context; + ggml_backend_vk_buffer_context * v_buf_ctx = (ggml_backend_vk_buffer_context *)v->buffer->context; + + if (!Q_uma) { + d_Q = q_buf_ctx->dev_buffer; + q_buf_offset = vk_tensor_offset(q) + q->view_offs; + } + if (!K_uma) { + d_K = k_buf_ctx->dev_buffer; + k_buf_offset = vk_tensor_offset(k) + k->view_offs; + } + if (!V_uma) { + d_V = v_buf_ctx->dev_buffer; + v_buf_offset = vk_tensor_offset(v) + v->view_offs; + } + if (!D_uma) { + d_D = d_buf_ctx->dev_buffer; + d_buf_offset = vk_tensor_offset(dst) + dst->view_offs; + } + + if (!M_uma) { + d_M = d_Q; + m_buf_offset = q_buf_offset; + if (mask) { + ggml_backend_vk_buffer_context * m_buf_ctx = (ggml_backend_vk_buffer_context*)mask->buffer->context; + d_M = m_buf_ctx->dev_buffer; + m_buf_offset = vk_tensor_offset(mask) + mask->view_offs; + } + } + + const vk_flash_attn_push_constants pc = { N, KV, (uint32_t)ne1, (uint32_t)ne2, (uint32_t)ne3, (uint32_t)neq2, (uint32_t)neq3, (uint32_t)nek2, (uint32_t)nek3, (uint32_t)nev2, (uint32_t)nev3, nem1, (uint32_t)nbq2, (uint32_t)nbq3, (uint32_t)nbk2, (uint32_t)nbk3, (uint32_t)nbv2, (uint32_t)nbv3, nbm1, scale, max_bias, logit_softcap, mask != nullptr, n_head_log2, m0, m1 }; + ggml_vk_dispatch_pipeline(ctx, subctx, pipeline, + { + vk_subbuffer{d_Q, q_buf_offset, VK_WHOLE_SIZE}, + vk_subbuffer{d_K, k_buf_offset, VK_WHOLE_SIZE}, + vk_subbuffer{d_V, v_buf_offset, VK_WHOLE_SIZE}, + vk_subbuffer{d_M, m_buf_offset, VK_WHOLE_SIZE}, + vk_subbuffer{d_D, d_buf_offset, VK_WHOLE_SIZE}, + }, + sizeof(vk_flash_attn_push_constants), &pc, { (uint32_t)neq1, (uint32_t)neq2, (uint32_t)neq3 }); +} + static vk_pipeline ggml_vk_op_get_pipeline(ggml_backend_vk_context * ctx, const ggml_tensor * src0, const ggml_tensor * src1, const ggml_tensor * src2, ggml_tensor * dst, ggml_op op) { switch (op) { case GGML_OP_GET_ROWS: @@ -5044,16 +5526,16 @@ static void ggml_vk_test_matmul(ggml_backend_vk_context * ctx, size_t m, size_t ggml_vk_buffer_write(d_Y, 0, y, sizeof(Y_TYPE) * k * n * batch); vk_context subctx = ggml_vk_create_context(ctx, ctx->device->compute_queue); + ggml_vk_ctx_begin(ctx->device, subctx); for (size_t i = 0; i < num_it; i++) { - ggml_vk_ctx_begin(ctx->device, subctx); ggml_vk_matmul( ctx, subctx, p, ggml_vk_subbuffer(d_X), ggml_vk_subbuffer(d_Y), ggml_vk_subbuffer(d_D), ggml_vk_subbuffer(ctx->prealloc_split_k), m, n, k, k, k, m, k*m, k*n, m*n, split_k, batch, batch, batch, 1, 1 ); - ggml_vk_ctx_end(subctx); } + ggml_vk_ctx_end(subctx); auto begin = std::chrono::high_resolution_clock::now(); ggml_vk_submit(subctx, ctx->fence); @@ -5391,16 +5873,16 @@ static void ggml_vk_test_dequant_matmul(ggml_backend_vk_context * ctx, size_t m, ggml_vk_buffer_write(y_buf, 0, y, y_sz); vk_context subctx = ggml_vk_create_context(ctx, ctx->device->compute_queue); + ggml_vk_ctx_begin(ctx->device, subctx); for (size_t i = 0; i < num_it; i++) { - ggml_vk_ctx_begin(ctx->device, subctx); ggml_vk_matmul( ctx, subctx, p, ggml_vk_subbuffer(qx_buf), ggml_vk_subbuffer(y_buf), ggml_vk_subbuffer(d_buf), ggml_vk_subbuffer(ctx->prealloc_split_k), m, n, k, k, k, m, k*m, k*n, m*n, split_k, batch, batch, batch, 1, 1 ); - ggml_vk_ctx_end(subctx); } + ggml_vk_ctx_end(subctx); auto begin = std::chrono::high_resolution_clock::now(); @@ -5621,7 +6103,8 @@ static void ggml_vk_preallocate_buffers(ggml_backend_vk_context * ctx) { 4096, 512, 11008, 32000, 512, 4096, }; - const size_t num_it = 1; + const size_t num_it = 100; + for (size_t i = 0; i < vals.size(); i += 3) { ggml_vk_test_matmul(ctx, vals[i], vals[i + 1], vals[i + 2], 2, num_it, 1, 0); ggml_vk_test_matmul(ctx, vals[i], vals[i + 1], vals[i + 2], 2, num_it, 1, 1); @@ -5676,6 +6159,7 @@ static bool ggml_vk_build_graph(ggml_backend_vk_context * ctx, ggml_tensor * nod const ggml_tensor * src0 = node->src[0]; const ggml_tensor * src1 = node->src[1]; const ggml_tensor * src2 = node->src[2]; + const ggml_tensor * src3 = node->src[3]; switch (node->op) { // Return on empty ops to avoid generating a compute_ctx and setting exit_tensor @@ -5728,6 +6212,7 @@ static bool ggml_vk_build_graph(ggml_backend_vk_context * ctx, ggml_tensor * nod case GGML_OP_TIMESTEP_EMBEDDING: case GGML_OP_POOL_2D: case GGML_OP_LEAKY_RELU: + case GGML_OP_FLASH_ATTN_EXT: break; default: std::cerr << "ggml_vulkan: Error: Missing op: " << ggml_op_name(node->op) << std::endl; @@ -5920,6 +6405,11 @@ static bool ggml_vk_build_graph(ggml_backend_vk_context * ctx, ggml_tensor * nod case GGML_OP_MUL_MAT_ID: ggml_vk_mul_mat_id(ctx, compute_ctx, src0, src1, src2, node, dryrun); + break; + + case GGML_OP_FLASH_ATTN_EXT: + ggml_vk_flash_attn(ctx, compute_ctx, src0, src1, src2, src3, node, dryrun); + break; default: return false; @@ -6020,6 +6510,7 @@ static bool ggml_vk_compute_forward(ggml_backend_vk_context * ctx, ggml_tensor * break; case GGML_OP_MUL_MAT: case GGML_OP_MUL_MAT_ID: + case GGML_OP_FLASH_ATTN_EXT: buf = tensor->buffer; break; @@ -6751,6 +7242,57 @@ static bool ggml_backend_vk_device_supports_op(ggml_backend_dev_t dev, const ggm return true; } break; + case GGML_OP_FLASH_ATTN_EXT: + { + ggml_backend_vk_device_context * ctx = (ggml_backend_vk_device_context *)dev->context; + if (!ggml_vk_get_device(ctx->device)->coopmat2) { + return false; + } + switch (op->src[0]->ne[0]) { + case 64: + case 80: + case 96: + case 112: + case 128: + case 256: + break; + default: + return false; + } + if (op->src[0]->type != GGML_TYPE_F32) { + return false; + } + if (op->type != GGML_TYPE_F32) { + return false; + } + if (op->src[3] && op->src[3]->type != GGML_TYPE_F16) { + return false; + } + // It's straightforward to support different K/V dequant, but would + // significantly increase the number of pipelines + if (op->src[1]->type != op->src[2]->type) { + return false; + } + switch (op->src[1]->type) { + case GGML_TYPE_F16: + case GGML_TYPE_Q4_0: + case GGML_TYPE_Q4_1: + case GGML_TYPE_Q5_0: + case GGML_TYPE_Q5_1: + case GGML_TYPE_Q8_0: + // K dequants currently disabled because D dimension is rounded up to 256 and runs inefficiently + //case GGML_TYPE_Q2_K: + //case GGML_TYPE_Q3_K: + //case GGML_TYPE_Q4_K: + //case GGML_TYPE_Q5_K: + //case GGML_TYPE_Q6_K: + case GGML_TYPE_IQ4_NL: + break; + default: + return false; + } + return true; + } case GGML_OP_GET_ROWS: { switch (op->src[0]->type) { @@ -7065,6 +7607,7 @@ static void ggml_vk_check_results_0(ggml_tensor * tensor) { ggml_tensor * src0 = tensor->src[0]; ggml_tensor * src1 = tensor->src[1]; ggml_tensor * src2 = tensor->src[2]; + ggml_tensor * src3 = tensor->src[3]; struct ggml_init_params iparams = { /*.mem_size =*/ 2ul*1024ul*1024ul*1024ul, @@ -7077,15 +7620,18 @@ static void ggml_vk_check_results_0(ggml_tensor * tensor) { struct ggml_tensor * src0_clone = nullptr; struct ggml_tensor * src1_clone = nullptr; struct ggml_tensor * src2_clone = nullptr; + struct ggml_tensor * src3_clone = nullptr; struct ggml_tensor * tensor_clone = nullptr; size_t src0_size; size_t src1_size; size_t src2_size; + size_t src3_size; void * src0_buffer = nullptr; void * src1_buffer = nullptr; void * src2_buffer = nullptr; + void * src3_buffer = nullptr; if (src0 != nullptr) { src0_clone = ggml_dup_tensor(ggml_ctx, src0); @@ -7213,8 +7759,53 @@ static void ggml_vk_check_results_0(ggml_tensor * tensor) { ggml_vk_print_tensor(src2, "src2"); } } + if (src3 != nullptr) { + src3_clone = ggml_dup_tensor(ggml_ctx, src3); + + src3_size = ggml_nbytes(src3); + + src3_buffer = malloc(src3_size); + src3_clone->data = src3_buffer; + if (ggml_backend_buffer_is_host(src3->buffer)) { + memcpy(src3_clone->data, src3->data, src3_size); + memcpy(src3_clone->nb, src3->nb, sizeof(size_t) * GGML_MAX_DIMS); + } else if (ggml_backend_buffer_is_vk(src3->buffer)) { + ggml_backend_vk_buffer_context * buf_ctx = (ggml_backend_vk_buffer_context *)src3->buffer->context; + vk_buffer& buffer_gpu = buf_ctx->dev_buffer; + uint64_t offset = vk_tensor_offset(src3) + src3->view_offs; + if (!ggml_is_contiguous(src3) && ggml_vk_dim01_contiguous(src3)) { + for (int i3 = 0; i3 < src3->ne[3]; i3++) { + for (int i2 = 0; i2 < src3->ne[2]; i2++) { + const int idx = i3*src3->ne[2] + i2; + ggml_vk_buffer_read(buffer_gpu, offset + idx * src3->nb[2], ((char *)src3_clone->data + idx * src3_clone->nb[2]), src3->ne[1] * src3->nb[1]); + } + } + + src3_clone->nb[0] = src3->nb[0]; + src3_clone->nb[1] = src3->nb[1]; + for (int i = 2; i < GGML_MAX_DIMS; i++) { + src3_clone->nb[i] = src3_clone->nb[i - 1]*src3_clone->ne[i - 1]; + } + } else { + if (offset + src3_size >= buffer_gpu->size) { + src3_size = buffer_gpu->size - offset; + } + ggml_vk_buffer_read(buffer_gpu, offset, src3_clone->data, src3_size); + memcpy(src3_clone->nb, src3->nb, sizeof(size_t) * GGML_MAX_DIMS); + } + } else { + GGML_ABORT("fatal error"); + } + + if (vk_output_tensor > 0 && vk_output_tensor == check_counter) { + ggml_vk_print_tensor(src3, "src3"); + } + } - if (tensor->op == GGML_OP_MUL_MAT) { + if (tensor->op == GGML_OP_FLASH_ATTN_EXT) { + const float *params = (const float *)tensor->op_params; + tensor_clone = ggml_flash_attn_ext(ggml_ctx, src0_clone, src1_clone, src2_clone, src3_clone, params[0], params[1], params[2]); + } else if (tensor->op == GGML_OP_MUL_MAT) { tensor_clone = ggml_mul_mat(ggml_ctx, src0_clone, src1_clone); } else if (tensor->op == GGML_OP_MUL_MAT_ID) { tensor_clone = ggml_mul_mat_id(ggml_ctx, src0_clone, src1_clone, src2_clone); diff --git a/ggml/src/ggml-vulkan/vulkan-shaders/CMakeLists.txt b/ggml/src/ggml-vulkan/vulkan-shaders/CMakeLists.txt index 51c78b7d2293a..bd0c74cb1c770 100644 --- a/ggml/src/ggml-vulkan/vulkan-shaders/CMakeLists.txt +++ b/ggml/src/ggml-vulkan/vulkan-shaders/CMakeLists.txt @@ -1,7 +1,9 @@ find_package (Threads REQUIRED) +find_package(Vulkan COMPONENTS glslc REQUIRED) set(TARGET vulkan-shaders-gen) add_executable(${TARGET} vulkan-shaders-gen.cpp) install(TARGETS ${TARGET} RUNTIME) target_compile_features(${TARGET} PRIVATE cxx_std_17) target_link_libraries(vulkan-shaders-gen PUBLIC Threads::Threads) +target_link_libraries(vulkan-shaders-gen PRIVATE Vulkan::Vulkan) diff --git a/ggml/src/ggml-vulkan/vulkan-shaders/dequant_funcs_cm2.comp b/ggml/src/ggml-vulkan/vulkan-shaders/dequant_funcs_cm2.comp new file mode 100644 index 0000000000000..a8707b621e7e2 --- /dev/null +++ b/ggml/src/ggml-vulkan/vulkan-shaders/dequant_funcs_cm2.comp @@ -0,0 +1,305 @@ + +#include "types.comp" + +layout(buffer_reference, std430, buffer_reference_align = 2) buffer decodeBufQ4_0 { + block_q4_0_packed16 block; +}; + +float16_t dequantFuncQ4_0(const in decodeBufQ4_0 bl, const in uint blockCoords[2], const in uint coordInBlock[2]) +{ + const float16_t d = bl.block.d; + const uint idx = coordInBlock[1]; + const uint shift = (idx & 0x10) >> 2; + uint32_t qs = unpack8(uint32_t(bl.block.qs[(idx & 0xE) >> 1]))[idx & 1]; + qs >>= shift; + qs &= 0xF; + float16_t ret = (float16_t(qs) - float16_t(8)) * d; + return ret; +} + +layout(buffer_reference, std430, buffer_reference_align = 4) buffer decodeBufQ4_1 { + block_q4_1 block; +}; + +float16_t dequantFuncQ4_1(const in decodeBufQ4_1 bl, const in uint blockCoords[2], const in uint coordInBlock[2]) +{ + const float16_t d = bl.block.d; + const float16_t m = bl.block.m; + const uint idx = coordInBlock[1]; + const uint iqs = idx & 0xF; + const uint shift = (idx & 0x10) >> 2; + uint32_t qs = bl.block.qs[iqs]; + qs >>= shift; + qs &= 0xF; + float16_t ret = float16_t(qs) * d + m; + return ret; +} + +layout(buffer_reference, std430, buffer_reference_align = 2) buffer decodeBufQ5_0 { + block_q5_0 block; +}; + +float16_t dequantFuncQ5_0(const in decodeBufQ5_0 bl, const in uint blockCoords[2], const in uint coordInBlock[2]) +{ + const float16_t d = bl.block.d; + const uint idx = coordInBlock[1]; + const uint iqs = idx & 0xF; + + const uint uint_qh = uint(bl.block.qh[1]) << 16 | bl.block.qh[0]; + const uint qh = ((uint_qh >> idx) << 4) & 0x10; + + const uint shift = (idx & 0x10) >> 2; + uint32_t qs = bl.block.qs[iqs]; + qs >>= shift; + qs &= 0xF; + + float16_t ret = (float16_t(qs | qh) - float16_t(16)) * d; + return ret; +} + +layout(buffer_reference, std430, buffer_reference_align = 8) buffer decodeBufQ5_1 { + block_q5_1 block; +}; + +float16_t dequantFuncQ5_1(const in decodeBufQ5_1 bl, const in uint blockCoords[2], const in uint coordInBlock[2]) +{ + const float16_t d = bl.block.d; + const float16_t m = bl.block.m; + const uint idx = coordInBlock[1]; + const uint iqs = idx & 0xF; + + const uint uint_qh = bl.block.qh; + const uint qh = ((uint_qh >> idx) << 4) & 0x10; + + const uint shift = (idx & 0x10) >> 2; + uint32_t qs = bl.block.qs[iqs]; + qs >>= shift; + qs &= 0xF; + + float16_t ret = float16_t(qs | qh) * d + m; + return ret; +} + +layout(buffer_reference, std430, buffer_reference_align = 2) buffer decodeBufQ8_0 { + block_q8_0_packed16 block; +}; + +float16_t dequantFuncQ8_0(const in decodeBufQ8_0 bl, const in uint blockCoords[2], const in uint coordInBlock[2]) +{ + const float16_t d = bl.block.d; + const uint idx = coordInBlock[1]; + const uint iqs = idx; + + // Load 16b and select the byte for this element + int32_t qs = unpack8(int32_t(bl.block.qs[(iqs & 0x1E) >> 1]))[iqs & 1]; + float16_t ret = float16_t(qs) * d; + return ret; +} + +layout(buffer_reference, std430, buffer_reference_align = 4) buffer decodeBufQ2_K { + block_q2_K block; +}; + +float16_t dequantFuncQ2_K(const in decodeBufQ2_K bl, const in uint blockCoords[2], const in uint coordInBlock[2]) +{ + const f16vec2 d = bl.block.d; + const uint idx = coordInBlock[1]; + const uint iqs = idx; + + const uint qsi = (iqs / 128) * 32 + (iqs % 32); // 0..31 + const uint scalesi = iqs / 16; // 0..15 + const uint qsshift = ((iqs % 128) / 32) * 2; // 0,2,4,6 + + uint32_t qs = bl.block.qs[qsi]; + const uint scales = bl.block.scales[scalesi]; + float16_t ret = d.x * float16_t(scales & 0xF) * float16_t((qs >> qsshift) & 3) - d.y * float16_t(scales >> 4); + return ret; +} + +layout(buffer_reference, std430, buffer_reference_align = 2) buffer decodeBufQ3_K { + block_q3_K block; +}; + +float16_t dequantFuncQ3_K(const in decodeBufQ3_K bl, const in uint blockCoords[2], const in uint coordInBlock[2]) +{ + const uint idx = coordInBlock[1]; + const uint iqs = idx; + + const uint n = iqs / 128; // 0,1 + const uint qsi = n * 32 + (iqs % 32); // 0..63 + const uint hmi = (iqs % 32); // 0..31 + const uint j = (iqs % 128) / 8; // 0..15 + const uint is = iqs / 16; // 0..15 + const uint halfsplit = ((iqs % 128) / 32); // 0,1,2,3 + const uint qsshift = halfsplit * 2; // 0,2,4,6 + const uint m = 1 << (4 * n + halfsplit); // 1,2,4,8,16,32,64,128 + + uint32_t scaleidx0 = (is < 8) ? is : (is-8); + uint32_t scaleidx0shift = (is < 8) ? 0 : 4; + uint32_t scaleidx1 = is + 8 - (is/4)*4; + uint32_t scaleidx1shift = (is/4)*2; + + const int8_t us = int8_t(((bl.block.scales[scaleidx0] >> scaleidx0shift) & 0xF) | (((bl.block.scales[scaleidx1] >> scaleidx1shift) & 3) << 4)); + + const float16_t dl = bl.block.d * float16_t(us - 32); + + float16_t ret = dl * float16_t(int8_t((bl.block.qs[qsi ] >> qsshift) & 3) - (((bl.block.hmask[hmi ] & m) != 0) ? 0 : 4)); + + return ret; +} + +layout(buffer_reference, std430, buffer_reference_align = 16) buffer decodeBufQ4_K { + block_q4_K block; +}; + +float16_t dequantFuncQ4_K(const in decodeBufQ4_K bl, const in uint blockCoords[2], const in uint coordInBlock[2]) +{ + const uint idx = coordInBlock[1]; + const uint iqs = idx; + + const uint n = iqs / 64; // 0,1,2,3 + const uint b = (iqs % 64) / 32; // 0,1 + const uint is = (idx & 0xE0) >> 5; // 0..7 + const uint qsi = n * 32 + (iqs % 32); // 0..127 + + const f16vec2 loadd = bl.block.d; + + uint32_t sc; + uint32_t mbyte; + + uint32_t scidx0 = (is < 4) ? is : (is + 4); + uint32_t scidx1 = (is < 4) ? is : (is - 4); + uint32_t scidxmask1 = (is < 4) ? 0x30 : 0xC0; + uint32_t scidxshift1 = (is < 4) ? 0 : 2; + uint32_t mbidx0 = is + 4; + uint32_t mbidx1 = (is < 4) ? is + 4 : is; + uint32_t mbidxmask0 = (is < 4) ? 0xF : 0xF0; + uint32_t mbidxshift0 = (is < 4) ? 0 : 4; + uint32_t mbidxmask1 = (is < 4) ? 0x30 : 0xC0; + uint32_t mbidxshift1 = (is < 4) ? 0 : 2; + + sc = uint8_t((bl.block.scales[scidx0] & 0xF) | ((bl.block.scales[scidx1] & scidxmask1) >> scidxshift1)); + mbyte = uint8_t(((bl.block.scales[mbidx0] & mbidxmask0) >> mbidxshift0) | ((bl.block.scales[mbidx1] & mbidxmask1) >> mbidxshift1)); + + const float16_t d = loadd.x * float16_t(sc); + const float16_t m = loadd.y * float16_t(mbyte); + + uint32_t dmask = 0xF << (b * 4); + + float16_t ret = d * float16_t((bl.block.qs[qsi ] & dmask) >> (b * 4)) - m; + + return ret; +} + +layout(buffer_reference, std430, buffer_reference_align = 16) buffer decodeBufQ5_K { + block_q5_K block; +}; + +float16_t dequantFuncQ5_K(const in decodeBufQ5_K bl, const in uint blockCoords[2], const in uint coordInBlock[2]) +{ + const uint idx = coordInBlock[1]; + const uint iqs = idx; + + const uint n = iqs / 64; // 0,1,2,3 + const uint b = (iqs % 64) / 32; // 0,1 + const uint is = (idx & 0xE0) >> 5; // 0..7 + const uint qsi = n * 32 + (iqs % 32); // 0..127 + const uint qhi = (iqs % 32); // 0..31 + + const uint8_t hm = uint8_t(1 << (iqs / 32)); + + const f16vec2 loadd = bl.block.d; + + uint32_t sc; + uint32_t mbyte; + + uint32_t scidx0 = (is < 4) ? is : (is + 4); + uint32_t scidx1 = (is < 4) ? is : (is - 4); + uint32_t scidxmask1 = (is < 4) ? 0x30 : 0xC0; + uint32_t scidxshift1 = (is < 4) ? 0 : 2; + uint32_t mbidx0 = is + 4; + uint32_t mbidx1 = (is < 4) ? is + 4 : is; + uint32_t mbidxmask0 = (is < 4) ? 0xF : 0xF0; + uint32_t mbidxshift0 = (is < 4) ? 0 : 4; + uint32_t mbidxmask1 = (is < 4) ? 0x30 : 0xC0; + uint32_t mbidxshift1 = (is < 4) ? 0 : 2; + + sc = uint8_t((bl.block.scales[scidx0] & 0xF) | ((bl.block.scales[scidx1] & scidxmask1) >> scidxshift1)); + mbyte = uint8_t(((bl.block.scales[mbidx0] & mbidxmask0) >> mbidxshift0) | ((bl.block.scales[mbidx1] & mbidxmask1) >> mbidxshift1)); + + const float16_t d = loadd.x * float16_t(sc); + const float16_t m = loadd.y * float16_t(mbyte); + + uint32_t dmask = 0xF << (b * 4); + + float16_t ret = d * (float16_t((bl.block.qs[qsi ] & dmask) >> (b * 4)) + float16_t((bl.block.qh[qhi ] & hm) != 0 ? 16 : 0)) - m; + + return ret; +} + +layout(buffer_reference, std430, buffer_reference_align = 2) buffer decodeBufQ6_K { + block_q6_K block; +}; + +float16_t dequantFuncQ6_K(const in decodeBufQ6_K bl, const in uint blockCoords[2], const in uint coordInBlock[2]) +{ + const uint idx = coordInBlock[1]; + const uint iqs = idx; + + const uint n = iqs / 128; // 0,1 + const uint b = (iqs % 128) / 64; // 0,1 + const uint is_b = (iqs % 32) / 16; // 0,1 + const uint qhshift = ((iqs % 128) / 32) * 2;// 0,2,4,6 + const uint is = 8 * n + qhshift + is_b; // 0..15 + const uint qsi = n * 64 + (iqs % 64); // 0..127 + const uint qhi = n * 32 + (iqs % 32); // 0..63 + + const float16_t dscale = bl.block.d * float16_t(bl.block.scales[is]); + + float16_t ret = dscale * float16_t(int8_t(((bl.block.ql[qsi ] >> (b * 4)) & 0xF) | (((bl.block.qh[qhi ] >> qhshift) & 3) << 4)) - 32); + + return ret; +} + +#if defined(DATA_A_IQ4_NL) +layout(buffer_reference, std430, buffer_reference_align = 2) buffer decodeBufIQ4_NL { + block_iq4_nl block; +}; + +float16_t dequantFuncIQ4_NL(const in decodeBufIQ4_NL bl, const in uint blockCoords[2], const in uint coordInBlock[2]) +{ + const float16_t d = bl.block.d; + const uint idx = coordInBlock[1]; + const uint iqs = idx & 0xF; + const uint shift = (idx & 0x10) >> 2; + uint32_t qs = bl.block.qs[iqs]; + qs >>= shift; + qs &= 0xF; + float16_t ret = float16_t(kvalues_iq4nl[qs]) * d; + return ret; +} +#endif + +#if defined(DATA_A_Q4_0) +#define dequantFuncA dequantFuncQ4_0 +#elif defined(DATA_A_Q4_1) +#define dequantFuncA dequantFuncQ4_1 +#elif defined(DATA_A_Q5_0) +#define dequantFuncA dequantFuncQ5_0 +#elif defined(DATA_A_Q5_1) +#define dequantFuncA dequantFuncQ5_1 +#elif defined(DATA_A_Q8_0) +#define dequantFuncA dequantFuncQ8_0 +#elif defined(DATA_A_Q2_K) +#define dequantFuncA dequantFuncQ2_K +#elif defined(DATA_A_Q3_K) +#define dequantFuncA dequantFuncQ3_K +#elif defined(DATA_A_Q4_K) +#define dequantFuncA dequantFuncQ4_K +#elif defined(DATA_A_Q5_K) +#define dequantFuncA dequantFuncQ5_K +#elif defined(DATA_A_Q6_K) +#define dequantFuncA dequantFuncQ6_K +#elif defined(DATA_A_IQ4_NL) +#define dequantFuncA dequantFuncIQ4_NL +#endif diff --git a/ggml/src/ggml-vulkan/vulkan-shaders/flash_attn_cm2.comp b/ggml/src/ggml-vulkan/vulkan-shaders/flash_attn_cm2.comp new file mode 100644 index 0000000000000..c5be8131b30db --- /dev/null +++ b/ggml/src/ggml-vulkan/vulkan-shaders/flash_attn_cm2.comp @@ -0,0 +1,289 @@ +#version 450 + +#extension GL_EXT_control_flow_attributes : enable +#extension GL_EXT_shader_16bit_storage : require + +#extension GL_EXT_shader_explicit_arithmetic_types_float16 : require +#extension GL_EXT_shader_explicit_arithmetic_types_int8 : require +#extension GL_EXT_shader_explicit_arithmetic_types_int32 : require +#extension GL_EXT_shader_explicit_arithmetic_types_int16 : require + +#extension GL_KHR_memory_scope_semantics : enable +#extension GL_KHR_cooperative_matrix : enable +#extension GL_NV_cooperative_matrix2 : enable +#extension GL_EXT_buffer_reference : enable +#extension GL_KHR_shader_subgroup_ballot : enable +#extension GL_KHR_shader_subgroup_vote : enable +#extension GL_EXT_null_initializer : enable + +#include "types.comp" +#include "dequant_funcs_cm2.comp" + +layout(local_size_x_id = 0, local_size_y = 1, local_size_z = 1) in; + +layout (constant_id = 1) const uint32_t Br = 32; +layout (constant_id = 2) const uint32_t Bc = 32; +layout (constant_id = 3) const uint32_t D = 32; +layout (constant_id = 4) const uint32_t Clamp = gl_CooperativeMatrixClampModeConstantNV; + +layout (push_constant) uniform parameter { + uint32_t N; + uint32_t KV; + + uint32_t ne1; + uint32_t ne2; + uint32_t ne3; + + uint32_t neq2; + uint32_t neq3; + uint32_t nek2; + uint32_t nek3; + uint32_t nev2; + uint32_t nev3; + uint32_t nem1; + + uint32_t nb02; + uint32_t nb03; + uint32_t nb12; + uint32_t nb13; + uint32_t nb22; + uint32_t nb23; + uint32_t nb31; + + float scale; + float max_bias; + float logit_softcap; + + uint32_t mask; + uint32_t n_head_log2; + float m0; + float m1; +} p; + +layout (binding = 0) readonly buffer Q {uint8_t data_q[];}; +layout (binding = 1) readonly buffer K {uint8_t data_k[];}; +layout (binding = 2) readonly buffer V {uint8_t data_v[];}; +layout (binding = 3) readonly buffer M {uint8_t data_m[];}; +layout (binding = 4) writeonly buffer O {D_TYPE data_o[];}; + +#define CEIL_DIV(a, b) (((a) + (b) - 1) / (b)) + +ACC_TYPE maxReduce(const in ACC_TYPE x, const in ACC_TYPE y) { + return max(x, y); +} + +ACC_TYPE smearReduce(const in ACC_TYPE x, const in ACC_TYPE y) { + return x; +} + +// Replace matrix elements >= numRows or numCols with 'replace' +ACC_TYPE replacePadding(const in uint32_t row, const in uint32_t col, const in ACC_TYPE elem, const in ACC_TYPE replace, const in uint32_t numRows, const in uint32_t numCols) { + if (row >= numRows || col >= numCols) { + return replace; + } + return elem; +} + +ACC_TYPE Exp(const in uint32_t row, const in uint32_t col, const in ACC_TYPE elem) +{ + return exp(elem); +} + +ACC_TYPE Max(const in uint32_t row, const in uint32_t col, const in ACC_TYPE elem0, const in ACC_TYPE elem1) +{ + return max(elem0, elem1); +} + +#if defined(BLOCK_SIZE) +#define DECODEFUNC , DEQUANTFUNC +#else +#define DECODEFUNC +#endif + +void main() { +#if defined(DATA_A_IQ4_NL) + init_iq4nl_shmem(); +#endif + + const uint32_t N = p.N; + const uint32_t KV = p.KV; + + const uint32_t Tr = CEIL_DIV(N, Br); + const uint32_t Tc = CEIL_DIV(KV, Bc); + + const uint32_t i = gl_WorkGroupID.x; + + const uint32_t iq2 = gl_WorkGroupID.y; + const uint32_t iq3 = gl_WorkGroupID.z; + + // broadcast factors + const uint32_t rk2 = p.neq2/p.nek2; + const uint32_t rk3 = p.neq3/p.nek3; + + const uint32_t rv2 = p.neq2/p.nev2; + const uint32_t rv3 = p.neq3/p.nev3; + + // k indices + const uint32_t ik3 = iq3 / rk3; + const uint32_t ik2 = iq2 / rk2; + + // v indices + const uint32_t iv3 = iq3 / rv3; + const uint32_t iv2 = iq2 / rv2; + + tensorLayoutNV<2, gl_CooperativeMatrixClampModeConstantNV> tensorLayoutQ = createTensorLayoutNV(2, gl_CooperativeMatrixClampModeConstantNV); + tensorLayoutNV<2, Clamp> tensorLayoutK = createTensorLayoutNV(2, Clamp); + tensorLayoutNV<2, Clamp> tensorLayoutV = createTensorLayoutNV(2, Clamp); + + tensorViewNV<2, false, 1, 0> tensorViewTranspose = createTensorViewNV(2, false, 1, 0); + +#if defined(BLOCK_SIZE) + tensorLayoutK = setTensorLayoutBlockSizeNV(tensorLayoutK, 1, BLOCK_SIZE); + tensorLayoutV = setTensorLayoutBlockSizeNV(tensorLayoutV, 1, BLOCK_SIZE); +#endif + + tensorLayoutQ = setTensorLayoutDimensionNV(tensorLayoutQ, N, D); + tensorLayoutK = setTensorLayoutDimensionNV(tensorLayoutK, KV, D); + tensorLayoutV = setTensorLayoutDimensionNV(tensorLayoutV, KV, D); + + coopmat Q; + coopmat Qf16; + + uint32_t q_offset = iq2*p.nb02+iq3*p.nb03; + coopMatLoadTensorNV(Q, data_q, q_offset, sliceTensorLayoutNV(tensorLayoutQ, i * Br, Br, 0, D)); + + Qf16 = coopmat(Q); + Qf16 *= float16_t(p.scale); + + coopmat O = coopmat(0); + + coopmat L, M; + + L = coopmat(0); + M = coopmat(-1.0/0.0); + + ACC_TYPE slope = ACC_TYPE(1.0); + + // ALiBi + if (p.max_bias > 0.0f) { + const uint32_t h = iq2; + + const ACC_TYPE base = ACC_TYPE(h < p.n_head_log2 ? p.m0 : p.m1); + const int exph = int(h < p.n_head_log2 ? h + 1 : 2*(h - p.n_head_log2) + 1); + + slope = pow(base, ACC_TYPE(exph)); + } + + [[dont_unroll]] + for (uint32_t j = 0; j < Tc; ++j) { + + coopmat S = coopmat(0); + + coopmat K_T; + + uint32_t k_offset = ik2*p.nb12 + ik3*p.nb13; + coopMatLoadTensorNV(K_T, data_k, k_offset, sliceTensorLayoutNV(tensorLayoutK, j * Bc, Bc, 0, D), tensorViewTranspose DECODEFUNC); + S = coopMatMulAdd(Qf16, K_T, S); + + if (p.logit_softcap != 0.0f) { + [[unroll]] + for (int k = 0; k < S.length(); ++k) { + S[k] = ACC_TYPE(p.logit_softcap)*tanh(S[k]); + } + } + + if (p.mask != 0) { + tensorLayoutNV<2, gl_CooperativeMatrixClampModeConstantNV> tensorLayoutM = createTensorLayoutNV(2, gl_CooperativeMatrixClampModeConstantNV); + tensorLayoutM = setTensorLayoutDimensionNV(tensorLayoutM, p.nem1, KV); + + coopmat mv; + + coopMatLoadTensorNV(mv, data_m, 0, sliceTensorLayoutNV(tensorLayoutM, i * Br, Br, j * Bc, Bc)); + + S += slope*coopmat(mv); + } + + // Clear padding elements to -inf, so they don't contribute to rowmax + if (Clamp != 0 && + ((j + 1) * Bc > KV || + (i + 1) * Br > N)) { + + uint R = ((i + 1) * Br > N) ? (N % Br) : Br; + uint C = ((j + 1) * Bc > KV) ? (KV % Bc) : Bc; + + coopMatPerElementNV(S, S, replacePadding, ACC_TYPE(-1.0/0.0), R, C); + } + + coopmat rowmax, P, rowsum, eM; + + coopMatReduceNV(rowmax, S, gl_CooperativeMatrixReduceRowNV, maxReduce); + + coopmat Mold = M; + + // M = max(rowmax, Mold) + // P = e^(S - M) + // eM = e^(Mold - M) + coopMatPerElementNV(M, rowmax, Max, Mold); + coopMatPerElementNV(P, S - M, Exp); + coopMatPerElementNV(eM, Mold - M, Exp); + + // Clear padding elements to 0, so they don't contribute to rowsum + if (Clamp != 0 && + ((j + 1) * Bc > KV || + (i + 1) * Br > N)) { + + uint R = ((i + 1) * Br > N) ? (N % Br) : Br; + uint C = ((j + 1) * Bc > KV) ? (KV % Bc) : Bc; + + coopMatPerElementNV(P, P, replacePadding, ACC_TYPE(0.0), R, C); + } + + coopmat P_A = coopmat(P); + + // compute rowsum by multiplying by matrix of all ones. + coopmat One = coopmat(1.0); + + rowsum = coopmat(0.0); + rowsum = coopMatMulAdd(P_A, One, rowsum); + + coopmat V; + uint32_t v_offset = iv2*p.nb22 + iv3*p.nb23; + coopMatLoadTensorNV(V, data_v, v_offset, sliceTensorLayoutNV(tensorLayoutV, j * Bc, Bc, 0, D) DECODEFUNC); + + L = eM*L + rowsum; + + // This is the "diagonal" matrix in the paper, but since we do componentwise + // multiply rather than matrix multiply it has the diagonal element smeared + // across the row + coopmat eMdiag; + + // resize eM by using smear/reduce + coopMatReduceNV(eMdiag, eM, gl_CooperativeMatrixReduceRowNV, smearReduce); + + O = eMdiag * O; + + O = coopMatMulAdd(P_A, V, O); + } + + coopmat Ldiag; + + // resize L by using smear/reduce + coopMatReduceNV(Ldiag, L, gl_CooperativeMatrixReduceRowNV, smearReduce); + + [[unroll]] + for (int k = 0; k < Ldiag.length(); ++k) { + Ldiag[k] = ACC_TYPE(1.0) / Ldiag[k]; + } + + O = Ldiag*O; + + tensorLayoutNV<3, gl_CooperativeMatrixClampModeConstantNV> tensorLayoutD = createTensorLayoutNV(3, gl_CooperativeMatrixClampModeConstantNV); + tensorLayoutD = setTensorLayoutDimensionNV(tensorLayoutD, p.ne2, p.ne1, D); + + // permute dimensions + tensorViewNV<3, false, 1, 0, 2> tensorViewPermute = createTensorViewNV(3, false, 1, 0, 2); + uint32_t o_offset = iq3*p.ne2*p.ne1; + + coopmat O_D = coopmat(O); + coopMatStoreTensorNV(O_D, data_o, o_offset, sliceTensorLayoutNV(tensorLayoutD, i * Br, Br, iq2, 1, 0, D), tensorViewPermute); +} diff --git a/ggml/src/ggml-vulkan/vulkan-shaders/mul_mm_cm2.comp b/ggml/src/ggml-vulkan/vulkan-shaders/mul_mm_cm2.comp new file mode 100644 index 0000000000000..cbfa5dce1b091 --- /dev/null +++ b/ggml/src/ggml-vulkan/vulkan-shaders/mul_mm_cm2.comp @@ -0,0 +1,328 @@ +#version 450 + +#extension GL_EXT_control_flow_attributes : enable +#extension GL_EXT_shader_16bit_storage : require + +#extension GL_EXT_shader_explicit_arithmetic_types_float16 : require +#extension GL_EXT_shader_explicit_arithmetic_types_int8 : require +#extension GL_EXT_shader_explicit_arithmetic_types_int32 : require +#extension GL_EXT_shader_explicit_arithmetic_types_int16 : require + +#extension GL_KHR_memory_scope_semantics : enable +#extension GL_KHR_cooperative_matrix : enable +#extension GL_NV_cooperative_matrix2 : enable +#extension GL_EXT_buffer_reference : enable +#extension GL_KHR_shader_subgroup_ballot : enable +#extension GL_KHR_shader_subgroup_vote : enable + +#include "types.comp" + +layout(local_size_x_id = 0, local_size_y = 1, local_size_z = 1) in; + +layout (constant_id = 1) const uint BM = 64; +layout (constant_id = 2) const uint BN = 64; +layout (constant_id = 3) const uint BK = 16; // Assumed to be 32 if working with a quant + +layout (push_constant) uniform parameter +{ + uint M; + uint N; + uint K; + uint stride_a; + uint stride_b; + uint stride_d; + + uint batch_stride_a; + uint batch_stride_b; + uint batch_stride_d; + +#ifdef MUL_MAT_ID + uint nei0; + uint nei1; + uint nbi1; + uint ne11; +#else + uint k_split; + uint ne02; + uint ne12; + uint broadcast2; + uint broadcast3; +#endif +} p; + + +layout (binding = 0) readonly buffer A {A_TYPE data_a[];}; +layout (binding = 1) readonly buffer B {B_TYPE data_b[];}; +layout (binding = 2) writeonly buffer D {D_TYPE data_d[];}; + +#if QUANT_K > 1 +#define DECODEFUNCA , dequantFuncA +#define MAT_A_TYPE float16_t + +#include "dequant_funcs_cm2.comp" + +#else +#define DECODEFUNCA +#define MAT_A_TYPE A_TYPE +#endif + +#define MAT_B_TYPE B_TYPE + +#ifdef MUL_MAT_ID +layout (binding = 3) readonly buffer IDS {int data_ids[];}; + +shared u16vec4 row_ids[3072]; + +layout(buffer_reference, std430, buffer_reference_align = 2) buffer decodeBufB { + B_TYPE b[]; +}; + +uint _ne1; +shared uint _ne1_sh; + +B_TYPE decodeFuncB(const in decodeBufB bl, const in uint blockCoords[2], const in uint coordInBlock[2]) +{ + const uint row_i = blockCoords[0]; + + if (row_i >= _ne1) { + return B_TYPE(0.0); + } + + const u16vec4 row_idx = row_ids[row_i]; + B_TYPE ret = data_b[row_idx.y * p.batch_stride_b + row_idx.x * p.stride_b + blockCoords[1]]; + + return ret; +} + +D_TYPE perElemOpD(const in uint32_t r, const in uint32_t c, const in D_TYPE elem, const in uint32_t ir, const in uint32_t ic) +{ + uint dr = ir * BM + r; + uint dc = ic * BN + c; + + if (dr < p.M && dc < _ne1) { + uint row_i = dc; + const u16vec4 row_idx = row_ids[row_i]; + data_d[row_idx.y * p.batch_stride_d + row_idx.z * p.stride_d + dr] = elem; + } + return elem; +} + +#endif + +void main() { +#if defined(DATA_A_IQ4_NL) + init_iq4nl_shmem(); +#endif + +#ifdef MUL_MAT_ID + const uint expert_idx = gl_GlobalInvocationID.z; +#else + const uint batch_idx = gl_GlobalInvocationID.z; + + const uint i13 = batch_idx / p.ne12; + const uint i12 = batch_idx % p.ne12; + + const uint i03 = i13 / p.broadcast3; + const uint i02 = i12 / p.broadcast2; + + const uint batch_idx_a = i03 * p.ne02 + i02; +#endif + + const uint blocks_m = (p.M + BM - 1) / BM; + const uint ir = gl_WorkGroupID.x % blocks_m; + const uint ik = gl_WorkGroupID.x / blocks_m; + const uint ic = gl_WorkGroupID.y; + +#ifdef MUL_MAT_ID + // Spread the search across all elements in the first subgroup + if (gl_SubgroupID == 0) { + _ne1 = 0; + uint num_elements = p.nei1 * p.nei0; + + for (uint i = gl_SubgroupInvocationID; subgroupAny(i < num_elements); i += gl_SubgroupSize) { + bool in_range = i < num_elements; + uint ii0 = i % p.nei0; + uint ii1 = i / p.nei0; + uint id = in_range ? data_ids[ii1*p.nbi1 + ii0] : 0; + uvec4 ballot = subgroupBallot(in_range && id == expert_idx); + uint idx = subgroupBallotExclusiveBitCount(ballot); + if (in_range && id == expert_idx) { + row_ids[_ne1 + idx] = u16vec4(ii0 % p.ne11, ii1, ii0, 0); + } + _ne1 += subgroupBallotBitCount(ballot); + } + _ne1_sh = _ne1; + } + + barrier(); + + _ne1 = _ne1_sh; + + // Workgroup has no work + if (ic * BN >= _ne1) return; +#endif + +#ifdef MUL_MAT_ID + uint start_k = 0; + const uint end_k = p.K; +#else + uint start_k = ik * p.k_split; + const uint end_k = min(p.K, (ik + 1) * p.k_split); +#endif + + coopmat sum; + sum = coopmat(0.0); + +#ifdef MUL_MAT_ID + uint pos_a = (expert_idx * p.batch_stride_a) / QUANT_K; + uint pos_b = 0; +#else + uint pos_a = (batch_idx_a * p.batch_stride_a) / QUANT_K; + uint pos_b = batch_idx * p.batch_stride_b; +#endif + + uint stride_a = p.stride_a / QUANT_K; + uint stride_b = p.stride_b; + + // Hint to the compiler that values are aligned (want 16B alignment). + // Quants are always block-aligned, no alignment needed. +#if ALIGNED +#if QUANT_K == 1 + stride_a &= ~7; +#endif + stride_b &= ~7; +#endif + + // Create layouts for both clamped and unclamped accesses + tensorLayoutNV<2> tensorLayoutA = createTensorLayoutNV(2); + tensorLayoutNV<2, gl_CooperativeMatrixClampModeConstantNV> tensorLayoutAClamp = createTensorLayoutNV(2, gl_CooperativeMatrixClampModeConstantNV); + tensorLayoutNV<2> tensorLayoutB = createTensorLayoutNV(2); + tensorLayoutNV<2, gl_CooperativeMatrixClampModeConstantNV> tensorLayoutBClamp = createTensorLayoutNV(2, gl_CooperativeMatrixClampModeConstantNV); + tensorLayoutNV<2, gl_CooperativeMatrixClampModeConstantNV> tensorLayoutD = createTensorLayoutNV(2, gl_CooperativeMatrixClampModeConstantNV); + +#if QUANT_K > 1 + tensorLayoutA = setTensorLayoutBlockSizeNV(tensorLayoutA, 1, QUANT_K); + tensorLayoutAClamp = setTensorLayoutBlockSizeNV(tensorLayoutAClamp, 1, QUANT_K); +#endif + + // Use end_k rather than p.K as the dimension because that's what + // we need to bound check against when using split_k + tensorLayoutA = setTensorLayoutDimensionNV(tensorLayoutA, p.M, end_k); + tensorLayoutB = setTensorLayoutDimensionNV(tensorLayoutB, p.N, end_k); + tensorLayoutD = setTensorLayoutDimensionNV(tensorLayoutD, p.N, p.M); + tensorLayoutAClamp = setTensorLayoutDimensionNV(tensorLayoutAClamp, p.M, end_k); + tensorLayoutBClamp = setTensorLayoutDimensionNV(tensorLayoutBClamp, p.N, end_k); + + tensorViewNV<2, false, 1, 0> tensorViewTranspose = createTensorViewNV(2, false, 1, 0); + +#if !defined(MUL_MAT_ID) + // Detect a fast path where all loads are entirely in bounds and no clamping is required + if ((ir + 1) * BM <= p.M && (ic + 1) * BN <= p.N && (start_k % BK) == 0 && (end_k % BK) == 0 && +#if QUANT_K == 1 + (stride_a % 8) == 0 && +#endif + (stride_b % 8) == 0 && (start_k % 8) == 0) { + // Hint to the compiler that values are aligned (want 16B alignment) + start_k &= ~7; + stride_b &= ~7; +#if QUANT_K == 1 + stride_a &= ~7; +#endif + + tensorLayoutA = setTensorLayoutStrideNV(tensorLayoutA, stride_a, 1); + tensorLayoutB = setTensorLayoutStrideNV(tensorLayoutB, stride_b, 1); + + uint k_iters = (end_k - start_k + BK - 1) / BK; + + for (uint block_k = start_k, i = 0; i < k_iters; block_k += BK, ++i) { + + coopmat mat_a; + coopmat mat_b; + + coopMatLoadTensorNV(mat_a, data_a, pos_a, sliceTensorLayoutNV(tensorLayoutA, ir * BM, BM, block_k, BK) DECODEFUNCA); + coopmat mat_a_ft = coopmat(mat_a); + + coopMatLoadTensorNV(mat_b, data_b, pos_b, sliceTensorLayoutNV(tensorLayoutB, ic * BN, BN, block_k, BK), tensorViewTranspose); + coopmat mat_b_ft = coopmat(mat_b); + + sum = coopMatMulAdd(mat_a_ft, mat_b_ft, sum); + } + } else +#endif // !defined(MUL_MAT_ID) + { + tensorLayoutA = setTensorLayoutStrideNV(tensorLayoutA, stride_a, 1); + + tensorLayoutAClamp = setTensorLayoutStrideNV(tensorLayoutAClamp, stride_a, 1); + + tensorLayoutB = setTensorLayoutStrideNV(tensorLayoutB, stride_b, 1); + + tensorLayoutBClamp = setTensorLayoutStrideNV(tensorLayoutBClamp, stride_b, 1); + + [[dont_unroll]] + for (uint block_k = start_k; block_k < end_k; block_k += BK) { + + coopmat mat_a; + coopmat mat_b; + coopmat mat_a_ft; + coopmat mat_b_ft; + + // Clamping is expensive, so detect different code paths for each combination + // of A and B needing clamping. + bool unclampedA = (ir + 1) * BM <= p.M && block_k + BK <= end_k && (block_k % 8) == 0; +#ifdef MUL_MAT_ID + bool unclampedB = true; +#else + bool unclampedB = (ic + 1) * BN <= p.N && block_k + BK <= end_k && (block_k % 8) == 0; +#endif + if (unclampedA && unclampedB) { + coopMatLoadTensorNV(mat_a, data_a, pos_a, sliceTensorLayoutNV(tensorLayoutA, ir * BM, BM, (block_k & ~7), BK) DECODEFUNCA); +#ifdef MUL_MAT_ID + coopMatLoadTensorNV(mat_b, data_b, pos_b, sliceTensorLayoutNV(tensorLayoutB, ic * BN, BN, block_k, BK), tensorViewTranspose, decodeFuncB); +#else + coopMatLoadTensorNV(mat_b, data_b, pos_b, sliceTensorLayoutNV(tensorLayoutB, ic * BN, BN, (block_k & ~7), BK), tensorViewTranspose); +#endif + mat_a_ft = coopmat(mat_a); + mat_b_ft = coopmat(mat_b); + sum = coopMatMulAdd(mat_a_ft, mat_b_ft, sum); + } else if (unclampedA && !unclampedB) { + coopMatLoadTensorNV(mat_a, data_a, pos_a, sliceTensorLayoutNV(tensorLayoutA, ir * BM, BM, (block_k & ~7), BK) DECODEFUNCA); + coopMatLoadTensorNV(mat_b, data_b, pos_b, sliceTensorLayoutNV(tensorLayoutBClamp, ic * BN, BN, block_k, BK), tensorViewTranspose); + + mat_a_ft = coopmat(mat_a); + mat_b_ft = coopmat(mat_b); + sum = coopMatMulAdd(mat_a_ft, mat_b_ft, sum); + } else if (!unclampedA && unclampedB) { + coopMatLoadTensorNV(mat_a, data_a, pos_a, sliceTensorLayoutNV(tensorLayoutAClamp, ir * BM, BM, block_k, BK) DECODEFUNCA); +#ifdef MUL_MAT_ID + coopMatLoadTensorNV(mat_b, data_b, pos_b, sliceTensorLayoutNV(tensorLayoutB, ic * BN, BN, block_k, BK), tensorViewTranspose, decodeFuncB); +#else + coopMatLoadTensorNV(mat_b, data_b, pos_b, sliceTensorLayoutNV(tensorLayoutB, ic * BN, BN, (block_k & ~7), BK), tensorViewTranspose); +#endif + mat_a_ft = coopmat(mat_a); + mat_b_ft = coopmat(mat_b); + sum = coopMatMulAdd(mat_a_ft, mat_b_ft, sum); + } else if (!unclampedA && !unclampedB) { + coopMatLoadTensorNV(mat_a, data_a, pos_a, sliceTensorLayoutNV(tensorLayoutAClamp, ir * BM, BM, block_k, BK) DECODEFUNCA); + coopMatLoadTensorNV(mat_b, data_b, pos_b, sliceTensorLayoutNV(tensorLayoutBClamp, ic * BN, BN, block_k, BK), tensorViewTranspose); + + mat_a_ft = coopmat(mat_a); + mat_b_ft = coopmat(mat_b); + sum = coopMatMulAdd(mat_a_ft, mat_b_ft, sum); + } + } + } + + // Convert from ACC_TYPE to D_TYPE + coopmat mat_d; + mat_d = coopmat(sum); + +#ifdef MUL_MAT_ID + // Call callback to store each element, remapping row through shared memory + coopMatPerElementNV(mat_d, mat_d, perElemOpD, ir, ic); +#else + tensorLayoutD = setTensorLayoutStrideNV(tensorLayoutD, p.stride_d, 1); + + uint pos_d = batch_idx * p.batch_stride_d + ik * p.batch_stride_d * gl_NumWorkGroups.z; + coopMatStoreTensorNV(mat_d, data_d, pos_d, sliceTensorLayoutNV(tensorLayoutD, ic * BN, BN, ir * BM, BM), tensorViewTranspose); +#endif +} diff --git a/ggml/src/ggml-vulkan/vulkan-shaders/vulkan-shaders-gen.cpp b/ggml/src/ggml-vulkan/vulkan-shaders/vulkan-shaders-gen.cpp index 5c317b68b7ea0..4716e2c80869b 100644 --- a/ggml/src/ggml-vulkan/vulkan-shaders/vulkan-shaders-gen.cpp +++ b/ggml/src/ggml-vulkan/vulkan-shaders/vulkan-shaders-gen.cpp @@ -30,6 +30,8 @@ #include #endif +#include + #define ASYNCIO_CONCURRENCY 64 std::mutex lock; @@ -196,15 +198,17 @@ static uint32_t compile_count = 0; static std::mutex compile_count_mutex; static std::condition_variable compile_count_cond; -void string_to_spv_func(const std::string& _name, const std::string& in_fname, const std::map& defines, bool fp16 = true) { - std::string name = _name + (fp16 ? "" : "_fp32"); +void string_to_spv_func(const std::string& _name, const std::string& in_fname, const std::map& defines, bool fp16 = true, bool coopmat2 = false, bool f16acc = false) { + std::string name = _name + (f16acc ? "_f16acc" : "") + (coopmat2 ? "_cm2" : (fp16 ? "" : "_fp32")); std::string out_fname = join_paths(output_dir, name + ".spv"); std::string in_path = join_paths(input_dir, in_fname); + std::string target_env = (name.find("_cm2") != std::string::npos) ? "--target-env=vulkan1.3" : "--target-env=vulkan1.2"; + #ifdef _WIN32 - std::vector cmd = {GLSLC, "-fshader-stage=compute", "--target-env=vulkan1.2", "-O", "\"" + in_path + "\"", "-o", "\"" + out_fname + "\""}; + std::vector cmd = {GLSLC, "-fshader-stage=compute", target_env, "-O", "\"" + in_path + "\"", "-o", "\"" + out_fname + "\""}; #else - std::vector cmd = {GLSLC, "-fshader-stage=compute", "--target-env=vulkan1.2", "-O", in_path, "-o", out_fname}; + std::vector cmd = {GLSLC, "-fshader-stage=compute", target_env, "-O", in_path, "-o", out_fname}; #endif #ifdef GGML_VULKAN_SHADER_DEBUG_INFO @@ -254,7 +258,7 @@ std::map merge_maps(const std::map> compiles; -void string_to_spv(const std::string& _name, const std::string& in_fname, const std::map& defines, bool fp16 = true) { +void string_to_spv(const std::string& _name, const std::string& in_fname, const std::map& defines, bool fp16 = true, bool coopmat2 = false, bool f16acc = false) { { // wait until fewer than N compiles are in progress. // 16 is an arbitrary limit, the goal is to avoid "failed to create pipe" errors. @@ -265,15 +269,15 @@ void string_to_spv(const std::string& _name, const std::string& in_fname, const } compile_count++; } - compiles.push_back(std::async(string_to_spv_func, _name, in_fname, defines, fp16)); + compiles.push_back(std::async(string_to_spv_func, _name, in_fname, defines, fp16, coopmat2, f16acc)); } -void matmul_shaders(bool fp16, bool matmul_id) { - std::string load_vec = fp16 ? "8" : "4"; - std::string aligned_b_type_f32 = fp16 ? "mat2x4" : "vec4"; - std::string aligned_b_type_f16 = fp16 ? "f16mat2x4" : "f16vec4"; +void matmul_shaders(bool fp16, bool matmul_id, bool coopmat2, bool f16acc) { + std::string load_vec = coopmat2 ? "1" : fp16 ? "8" : "4"; + std::string aligned_b_type_f32 = coopmat2 ? "float" : fp16 ? "mat2x4" : "vec4"; + std::string aligned_b_type_f16 = coopmat2 ? "float16_t" : fp16 ? "f16mat2x4" : "f16vec4"; - std::map base_dict = {{"FLOAT_TYPE", fp16 ? "float16_t" : "float"}}; + std::map base_dict = {{"FLOAT_TYPE", (coopmat2 || fp16) ? "float16_t" : "float"}}; std::string shader_name = "matmul"; if (matmul_id) { @@ -285,21 +289,31 @@ void matmul_shaders(bool fp16, bool matmul_id) { base_dict["FLOAT16"] = "1"; } + base_dict["ACC_TYPE"] = f16acc ? "float16_t" : "float"; + + std::string source_name = coopmat2 ? "mul_mm_cm2.comp" : "mul_mm.comp"; + // Shaders with f16 B_TYPE - string_to_spv(shader_name + "_f32_f16", "mul_mm.comp", merge_maps(base_dict, {{"DATA_A_F32", "1"}, {"B_TYPE", "float16_t"}, {"D_TYPE", "float"}}), fp16); - string_to_spv(shader_name + "_f32_f16_aligned", "mul_mm.comp", merge_maps(base_dict, {{"DATA_A_F32", "1"}, {"LOAD_VEC_A", load_vec}, {"LOAD_VEC_B", load_vec}, {"B_TYPE", aligned_b_type_f16}, {"D_TYPE", "float"}}), fp16); + string_to_spv(shader_name + "_f32_f16", source_name, merge_maps(base_dict, {{"DATA_A_F32", "1"}, {"B_TYPE", "float16_t"}, {"D_TYPE", "float"}, }), fp16, coopmat2, f16acc); + string_to_spv(shader_name + "_f32_f16_aligned", source_name, merge_maps(base_dict, {{"DATA_A_F32", "1"}, {"LOAD_VEC_A", load_vec}, {"LOAD_VEC_B", load_vec}, {"B_TYPE", aligned_b_type_f16}, {"D_TYPE", "float"}, {"ALIGNED", "1"}}), fp16, coopmat2, f16acc); - string_to_spv(shader_name + "_f16", "mul_mm.comp", merge_maps(base_dict, {{"DATA_A_F16", "1"}, {"B_TYPE", "float16_t"}, {"D_TYPE", "float"}}), fp16); - string_to_spv(shader_name + "_f16_aligned", "mul_mm.comp", merge_maps(base_dict, {{"DATA_A_F16", "1"}, {"LOAD_VEC_A", load_vec}, {"LOAD_VEC_B", load_vec}, {"B_TYPE", aligned_b_type_f16}, {"D_TYPE", "float"}}), fp16); + string_to_spv(shader_name + "_f16_aligned", source_name, merge_maps(base_dict, {{"DATA_A_F16", "1"}, {"LOAD_VEC_A", load_vec}, {"LOAD_VEC_B", load_vec}, {"B_TYPE", aligned_b_type_f16}, {"D_TYPE", "float"}, {"ALIGNED", "1"}}), fp16, coopmat2, f16acc); + string_to_spv(shader_name + "_f16", source_name, merge_maps(base_dict, {{"DATA_A_F16", "1"}, {"B_TYPE", "float16_t"}, {"D_TYPE", "float"}}), fp16, coopmat2, f16acc); for (const auto& tname : type_names) { std::string data_a_key = "DATA_A_" + to_uppercase(tname); // For unaligned, load one at a time for f32/f16, or two at a time for quants - std::string load_vec_a_unaligned = (tname == "f32" || tname == "f16") ? "1" : "2"; + std::string load_vec_a_unaligned = (coopmat2 || tname == "f32" || tname == "f16") ? "1" : "2"; // For aligned matmul loads - std::string load_vec_a = (tname == "f32" || tname == "f16") ? load_vec : "2"; - string_to_spv(shader_name + "_" + tname + "_f32", "mul_mm.comp", merge_maps(base_dict, {{data_a_key, "1"}, {"LOAD_VEC_A", load_vec_a_unaligned}, {"B_TYPE", "float"}, {"D_TYPE", "float"}}), fp16); - string_to_spv(shader_name + "_" + tname + "_f32_aligned", "mul_mm.comp", merge_maps(base_dict, {{data_a_key, "1"}, {"LOAD_VEC_A", load_vec_a}, {"LOAD_VEC_B", load_vec}, {"B_TYPE", aligned_b_type_f32}, {"D_TYPE", "float"}}), fp16); + std::string load_vec_a = (coopmat2 || tname == "f32" || tname == "f16") ? load_vec : "2"; + + string_to_spv(shader_name + "_" + tname + "_f32", source_name, merge_maps(base_dict, {{data_a_key, "1"}, {"LOAD_VEC_A", load_vec_a_unaligned}, {"B_TYPE", "float"}, {"D_TYPE", "float"}, {"B_IS_FLOAT", "1"}}), fp16, coopmat2, f16acc); + string_to_spv(shader_name + "_" + tname + "_f32_aligned", source_name, merge_maps(base_dict, {{data_a_key, "1"}, {"LOAD_VEC_A", load_vec_a}, {"LOAD_VEC_B", load_vec}, {"B_TYPE", aligned_b_type_f32}, {"D_TYPE", "float"}, {"B_IS_FLOAT", "1"}, {"ALIGNED", "1"}}), fp16, coopmat2, f16acc); + + if (tname != "f16" && tname != "f32") { + string_to_spv(shader_name + "_" + tname + "_f16", source_name, merge_maps(base_dict, {{data_a_key, "1"}, {"LOAD_VEC_A", load_vec_a_unaligned}, {"B_TYPE", "float16_t"}, {"D_TYPE", "float"}, {"B_IS_FLOAT", "1"}}), fp16, coopmat2, f16acc); + string_to_spv(shader_name + "_" + tname + "_f16_aligned", source_name, merge_maps(base_dict, {{data_a_key, "1"}, {"LOAD_VEC_A", load_vec_a}, {"LOAD_VEC_B", load_vec}, {"B_TYPE", aligned_b_type_f16}, {"D_TYPE", "float"}, {"B_IS_FLOAT", "1"}, {"ALIGNED", "1"}}), fp16, coopmat2, f16acc); + } } } @@ -307,11 +321,50 @@ void process_shaders() { std::cout << "ggml_vulkan: Generating and compiling shaders to SPIR-V" << std::endl; std::map base_dict = {{"FLOAT_TYPE", "float"}}; + // matmul for (const auto& fp16 : {false, true}) { - matmul_shaders(fp16, false); - matmul_shaders(fp16, true); + for (const auto& matmul_id : {false, true}) { + for (const auto& coopmat2 : {false, true}) { + for (const auto& f16acc : {false, true}) { +#if !defined(VK_NV_cooperative_matrix2) + if (coopmat2) { + continue; + } +#endif + if (coopmat2 && !fp16) { + continue; + } + if (!coopmat2 && f16acc) { + continue; + } + matmul_shaders(fp16, matmul_id, coopmat2, f16acc); + } + } + } } +#if defined(VK_NV_cooperative_matrix2) + // flash attention + for (const auto& f16acc : {false, true}) { + std::string acctype = f16acc ? "float16_t" : "float"; + + for (const auto& tname : type_names) { + if (tname == "f32") { + continue; + } + + if (tname == "f16") { + string_to_spv("flash_attn_f32_f16_" + tname, "flash_attn_cm2.comp", + merge_maps(base_dict, {{"Q_TYPE", "float"}, {"D_TYPE", "float"}, {"ACC_TYPE", acctype}}), true, true, f16acc); + } else { + std::string data_a_key = "DATA_A_" + to_uppercase(tname); + string_to_spv("flash_attn_f32_f16_" + tname, "flash_attn_cm2.comp", + merge_maps(base_dict, {{data_a_key, "1"}, {"Q_TYPE", "float"}, {"D_TYPE", "float"}, {"ACC_TYPE", acctype}, {"DEQUANTFUNC", "dequantFunc"+to_uppercase(tname) }, {"BLOCK_SIZE", "QUANT_K_"+to_uppercase(tname) }}), true, true, f16acc); + } + } + } +#endif + for (const auto& tname : type_names) { // mul mat vec std::string data_a_key = "DATA_A_" + to_uppercase(tname); From c28a202f1c11b2fb76e50226e3f1a658f000ce23 Mon Sep 17 00:00:00 2001 From: Plamen Minev Date: Thu, 5 Dec 2024 23:36:41 +0200 Subject: [PATCH 152/245] fix(server) : not show alert when DONE is received (#10674) --- examples/server/public_simplechat/simplechat.js | 3 +++ 1 file changed, 3 insertions(+) diff --git a/examples/server/public_simplechat/simplechat.js b/examples/server/public_simplechat/simplechat.js index 8e0df3b61df2b..2fcd24a860bd4 100644 --- a/examples/server/public_simplechat/simplechat.js +++ b/examples/server/public_simplechat/simplechat.js @@ -407,6 +407,9 @@ class SimpleChat { if (curLine.startsWith("data:")) { curLine = curLine.substring(5); } + if (curLine.trim() === "[DONE]") { + break; + } let curJson = JSON.parse(curLine); console.debug("DBUG:SC:PART:Json:", curJson); this.append_response(this.response_extract_stream(curJson, apiEP)); From 2070378cc6884650939cb2d6c349a6665d729be7 Mon Sep 17 00:00:00 2001 From: Xuan Son Nguyen Date: Fri, 6 Dec 2024 11:14:32 +0100 Subject: [PATCH 153/245] server : (refactoring) do not rely on JSON internally (#10643) * server : (refactoring) reduce usage of json internally * move all response types to struct * wip [no ci] * many fixes * add virtual function * fix index * minor style fix * add std::move * refactor handle_completions_generic * add virtual functions * remove server.hpp * clarify server_sent_event RFC specs * apply review comments * fix model_alias and completion_probabilities * small clean up * remove virtual for to_json_oai_compat() * naming oai_compat --> oaicompat * fix unwanted recursive call * update docs --- common/common.h | 2 +- examples/server/README.md | 8 +- examples/server/server.cpp | 1337 +++++++++++------ examples/server/tests/README.md | 6 + examples/server/tests/tests.sh | 4 + .../server/tests/unit/test_chat_completion.py | 33 +- examples/server/tests/unit/test_completion.py | 39 + examples/server/utils.hpp | 249 +-- 8 files changed, 983 insertions(+), 695 deletions(-) diff --git a/common/common.h b/common/common.h index 0373fd3ead49e..95d20401d2a9a 100644 --- a/common/common.h +++ b/common/common.h @@ -215,7 +215,7 @@ struct common_params { struct common_params_speculative speculative; std::string model = ""; // model path // NOLINT - std::string model_alias = "unknown"; // model alias // NOLINT + std::string model_alias = ""; // model alias // NOLINT std::string model_url = ""; // model url to download // NOLINT std::string hf_token = ""; // HF token // NOLINT std::string hf_repo = ""; // HF repo // NOLINT diff --git a/examples/server/README.md b/examples/server/README.md index b2dd7b65a990c..8dbed2626a444 100644 --- a/examples/server/README.md +++ b/examples/server/README.md @@ -473,9 +473,11 @@ Notice that each `probs` is an array of length `n_probs`. - `generation_settings`: The provided options above excluding `prompt` but including `n_ctx`, `model`. These options may differ from the original ones in some way (e.g. bad values filtered out, strings converted to tokens, etc.). - `model`: The path to the model loaded with `-m` - `prompt`: The provided `prompt` -- `stopped_eos`: Indicating whether the completion has stopped because it encountered the EOS token -- `stopped_limit`: Indicating whether the completion stopped because `n_predict` tokens were generated before stop words or EOS was encountered -- `stopped_word`: Indicating whether the completion stopped due to encountering a stopping word from `stop` JSON array provided +- `stop_type`: Indicating whether the completion has stopped. Possible values are: + - `none`: Generating (not stopped) + - `eos`: Stopped because it encountered the EOS token + - `limit`: Stopped because `n_predict` tokens were generated before stop words or EOS was encountered + - `word`: Stopped due to encountering a stopping word from `stop` JSON array provided - `stopping_word`: The stopping word encountered which stopped the generation (or "" if not stopped due to a stopping word) - `timings`: Hash of timing information about the completion such as the number of tokens `predicted_per_second` - `tokens_cached`: Number of tokens from the prompt which could be re-used from previous completion (`n_past`) diff --git a/examples/server/server.cpp b/examples/server/server.cpp index 31dfd62408047..809fafa187add 100644 --- a/examples/server/server.cpp +++ b/examples/server/server.cpp @@ -33,8 +33,10 @@ using json = nlohmann::ordered_json; enum stop_type { - STOP_TYPE_FULL, - STOP_TYPE_PARTIAL, + STOP_TYPE_NONE, + STOP_TYPE_EOS, + STOP_TYPE_WORD, + STOP_TYPE_LIMIT, }; // state diagram: https://github.com/ggerganov/llama.cpp/pull/9283 @@ -69,12 +71,25 @@ enum server_task_inf_type { SERVER_TASK_INF_TYPE_INFILL, }; +// https://community.openai.com/t/openai-chat-list-of-error-codes-and-types/357791/11 +enum error_type { + ERROR_TYPE_INVALID_REQUEST, + ERROR_TYPE_AUTHENTICATION, + ERROR_TYPE_SERVER, + ERROR_TYPE_NOT_FOUND, + ERROR_TYPE_PERMISSION, + ERROR_TYPE_UNAVAILABLE, // custom error + ERROR_TYPE_NOT_SUPPORTED, // custom error +}; + struct server_task { int id = -1; // to be filled by server_queue int id_target = -1; // used by SERVER_TASK_TYPE_CANCEL llama_tokens prompt_tokens; server_task_type type; + + // TODO @ngxson : we should get rid of json type here json data; server_task_inf_type inf_type = SERVER_TASK_INF_TYPE_COMPLETION; @@ -89,15 +104,6 @@ struct server_task { } }; -struct server_task_result { - int id = -1; - - json data; - - bool stop; - bool error; -}; - struct slot_params { bool stream = true; bool cache_prompt = true; // remember the prompt to avoid reprocessing all prompt @@ -111,9 +117,603 @@ struct slot_params { int64_t t_max_predict_ms = -1; // if positive, limit the generation phase to this time limit std::vector antiprompt; + bool timings_per_token = false; struct common_params_sampling sampling; struct common_params_speculative speculative; + + // params only used in to_json() + int32_t n_ctx; + uint32_t seed_cur; + bool can_speculative; + + // OAI-compat fields + bool verbose = false; + bool oaicompat = false; + bool oaicompat_chat = true; + std::string oaicompat_model; + std::string oaicompat_cmpl_id; + + json to_json() { + std::vector samplers; + samplers.reserve(sampling.samplers.size()); + for (const auto & sampler : sampling.samplers) { + samplers.emplace_back(common_sampler_type_to_str(sampler)); + } + + return json { + {"n_ctx", n_ctx}, + {"n_predict", n_predict}, // Server configured n_predict + {"temperature", sampling.temp}, + {"dynatemp_range", sampling.dynatemp_range}, + {"dynatemp_exponent", sampling.dynatemp_exponent}, + {"top_k", sampling.top_k}, + {"top_p", sampling.top_p}, + {"min_p", sampling.min_p}, + {"xtc_probability", sampling.xtc_probability}, + {"xtc_threshold", sampling.xtc_threshold}, + {"typical_p", sampling.typ_p}, + {"repeat_last_n", sampling.penalty_last_n}, + {"repeat_penalty", sampling.penalty_repeat}, + {"presence_penalty", sampling.penalty_present}, + {"frequency_penalty", sampling.penalty_freq}, + {"dry_multiplier", sampling.dry_multiplier}, + {"dry_base", sampling.dry_base}, + {"dry_allowed_length", sampling.dry_allowed_length}, + {"dry_penalty_last_n", sampling.dry_penalty_last_n}, + {"dry_sequence_breakers", sampling.dry_sequence_breakers}, + {"mirostat", sampling.mirostat}, + {"mirostat_tau", sampling.mirostat_tau}, + {"mirostat_eta", sampling.mirostat_eta}, + {"penalize_nl", sampling.penalize_nl}, + {"stop", antiprompt}, + {"max_tokens", n_predict}, // User configured n_predict + {"n_keep", n_keep}, + {"n_discard", n_discard}, + {"ignore_eos", sampling.ignore_eos}, + {"stream", stream}, + //{"logit_bias", sampling.logit_bias}, + {"n_probs", sampling.n_probs}, + {"min_keep", sampling.min_keep}, + {"grammar", sampling.grammar}, + {"samplers", samplers}, + {"speculative", can_speculative}, + {"speculative.n_max", speculative.n_max}, + {"speculative.n_min", speculative.n_min}, + {"speculative.p_min", speculative.p_min}, + {"timings_per_token", timings_per_token}, + }; + } +}; + +struct result_timings { + int32_t prompt_n = -1; + double prompt_ms; + double prompt_per_token_ms; + double prompt_per_second; + + int32_t predicted_n = -1; + double predicted_ms; + double predicted_per_token_ms; + double predicted_per_second; + + json to_json() { + return { + {"prompt_n", prompt_n}, + {"prompt_ms", prompt_ms}, + {"prompt_per_token_ms", prompt_per_token_ms}, + {"prompt_per_second", prompt_per_second}, + + {"predicted_n", predicted_n}, + {"predicted_ms", predicted_ms}, + {"predicted_per_token_ms", predicted_per_token_ms}, + {"predicted_per_second", predicted_per_second}, + }; + } +}; + +struct server_task_result { + int id = -1; + int id_slot = -1; + virtual bool is_error() { + // only used by server_task_result_error + return false; + } + virtual bool is_stop() { + // only used by server_task_result_cmpl_partial + return false; + } + virtual int get_index() { + return -1; + } + virtual json to_json() = 0; + virtual ~server_task_result() = default; +}; + +// using shared_ptr for polymorphism of server_task_result +using server_task_result_ptr = std::unique_ptr; + +inline std::string stop_type_to_str(stop_type type) { + switch (type) { + case STOP_TYPE_EOS: return "eos"; + case STOP_TYPE_WORD: return "word"; + case STOP_TYPE_LIMIT: return "limit"; + default: return "none"; + } +} + +struct completion_token_output { + llama_token tok; + std::string text_to_send; + struct token_prob { + llama_token tok; + std::string tok_str; + float prob; + }; + std::vector probs; + + json to_json() const { + json probs_for_token = json::array(); + for (const auto & p : probs) { + probs_for_token.push_back(json { + {"tok_str", p.tok_str}, + {"prob", p.prob}, + }); + } + return probs_for_token; + } + + static json probs_vector_to_json(const std::vector & probs) { + json out = json::array(); + for (const auto & prob : probs) { + const std::string tok_str = prob.text_to_send; + out.push_back(json { + {"content", tok_str}, + {"probs", prob.to_json()}, + }); + } + return out; + } +}; + +struct server_task_result_cmpl_final : server_task_result { + int index = 0; + std::string content; + bool stream; + result_timings timings; + std::string prompt; + + bool truncated; + int32_t n_decoded; + int32_t n_prompt_tokens; + int32_t n_tokens_cached; + int32_t has_new_line; + std::string stopping_word; + stop_type stop = STOP_TYPE_NONE; + + std::vector probs_output; + + slot_params generation_params; + + // OAI-compat fields + bool verbose = false; + bool oaicompat = false; + bool oaicompat_chat = true; // TODO: support oaicompat for non-chat + std::string oaicompat_model; + std::string oaicompat_cmpl_id; + + virtual int get_index() override { + return index; + } + + virtual json to_json() override { + return oaicompat ? to_json_oaicompat_chat() : to_json_non_oaicompat(); + } + + json to_json_non_oaicompat() { + json res = json { + {"index", index}, + {"content", content}, + {"id_slot", id_slot}, + {"stop", true}, + {"model", oaicompat_model}, + {"tokens_predicted", n_decoded}, + {"tokens_evaluated", n_prompt_tokens}, + {"generation_settings", generation_params.to_json()}, + {"prompt", prompt}, + {"has_new_line", has_new_line}, + {"truncated", truncated}, + {"stop_type", stop_type_to_str(stop)}, + {"stopping_word", stopping_word}, + {"tokens_cached", n_tokens_cached}, + {"timings", timings.to_json()}, + }; + if (!probs_output.empty()) { + res["completion_probabilities"] = completion_token_output::probs_vector_to_json(probs_output); + } + return res; + } + + json to_json_oaicompat_chat() { + std::string finish_reason = "length"; + if (stop == STOP_TYPE_WORD || stop == STOP_TYPE_EOS) { + finish_reason = "stop"; + } + + json choices = json::array({json{ + {"finish_reason", finish_reason}, + {"index", 0}, + {"message", json{ + {"content", content}, + {"role", "assistant"} + } + }}}); + + std::time_t t = std::time(0); + + json res = json { + {"choices", choices}, + {"created", t}, + {"model", oaicompat_model}, + {"object", "chat.completion"}, + {"usage", json { + {"completion_tokens", n_decoded}, + {"prompt_tokens", n_prompt_tokens}, + {"total_tokens", n_decoded + n_prompt_tokens} + }}, + {"id", oaicompat_cmpl_id} + }; + + // extra fields for debugging purposes + if (verbose) { + res["__verbose"] = to_json_non_oaicompat(); + } + if (timings.prompt_n >= 0) { + res.push_back({"timings", timings.to_json()}); + } + + return res; + } +}; + +struct server_task_result_cmpl_partial : server_task_result { + int index = 0; + std::string content; + + bool truncated; + int32_t n_decoded; + int32_t n_prompt_tokens; + + stop_type stop = STOP_TYPE_NONE; + + std::vector probs_output; + result_timings timings; + + // OAI-compat fields + bool verbose = false; + bool oaicompat = false; + bool oaicompat_chat = true; // TODO: support oaicompat for non-chat + std::string oaicompat_model; + std::string oaicompat_cmpl_id; + + virtual int get_index() override { + return index; + } + + virtual bool is_stop() override { + return stop != STOP_TYPE_NONE; + } + + virtual json to_json() override { + if (oaicompat) { + return to_json_oaicompat(); + } + bool is_stop = stop != STOP_TYPE_NONE; + // non-OAI-compat JSON + json res = json { + {"index", index}, + {"content", content}, + {"stop_type", stop_type_to_str(stop)}, + {"stop", is_stop}, + {"id_slot", id_slot}, + {"tokens_predicted", n_decoded}, + {"tokens_evaluated", n_prompt_tokens}, + }; + // populate the timings object when needed (usually for the last response or with timings_per_token enabled) + if (timings.prompt_n > 0) { + res.push_back({"timings", timings.to_json()}); + } + if (!probs_output.empty()) { + res["completion_probabilities"] = completion_token_output::probs_vector_to_json(probs_output); + } + if (is_stop) { + res.push_back({"truncated", truncated}); + } + return res; + } + + json to_json_oaicompat() { + bool first = n_decoded == 0; + + std::string finish_reason; + if (stop == STOP_TYPE_WORD || stop == STOP_TYPE_EOS) { + finish_reason = "stop"; + } else if (stop == STOP_TYPE_LIMIT) { + finish_reason = "length"; + } + + std::time_t t = std::time(0); + + json choices; + + if (!finish_reason.empty()) { + choices = json::array({json{{"finish_reason", finish_reason}, + {"index", 0}, + {"delta", json::object()}}}); + } else { + if (first) { + if (content.empty()) { + choices = json::array({json{{"finish_reason", nullptr}, + {"index", 0}, + {"delta", json{{"role", "assistant"}}}}}); + } else { + // We have to send this as two updates to conform to openai behavior + json initial_ret = json{{"choices", json::array({json{ + {"finish_reason", nullptr}, + {"index", 0}, + {"delta", json{ + {"role", "assistant"} + }}}})}, + {"created", t}, + {"id", oaicompat_cmpl_id}, + {"model", oaicompat_model}, + {"object", "chat.completion.chunk"}}; + + json second_ret = json{ + {"choices", json::array({json{{"finish_reason", nullptr}, + {"index", 0}, + {"delta", json{ + {"content", content}}} + }})}, + {"created", t}, + {"id", oaicompat_cmpl_id}, + {"model", oaicompat_model}, + {"object", "chat.completion.chunk"}}; + + return std::vector({initial_ret, second_ret}); + } + } else { + // Some idiosyncrasy in task processing logic makes several trailing calls + // with empty content, we ignore these at the calee site. + if (content.empty()) { + return std::vector({json::object()}); + } + + choices = json::array({json{ + {"finish_reason", nullptr}, + {"index", 0}, + {"delta", + json{ + {"content", content}, + }}, + }}); + } + } + + json ret = json { + {"choices", choices}, + {"created", t}, + {"id", oaicompat_cmpl_id}, + {"model", oaicompat_model}, + {"object", "chat.completion.chunk"} + }; + + if (timings.prompt_n >= 0) { + ret.push_back({"timings", timings.to_json()}); + } + + if (!finish_reason.empty()) { + ret.push_back({"usage", json { + {"completion_tokens", n_decoded}, + {"prompt_tokens", n_prompt_tokens}, + {"total_tokens", n_decoded + n_prompt_tokens}, + }}); + } + + return std::vector({ret}); + } +}; + +struct server_task_result_embd : server_task_result { + int index = 0; + std::vector embedding; + + virtual int get_index() override { + return index; + } + + virtual json to_json() override { + return json { + {"index", index}, + {"embedding", embedding}, + }; + } +}; + +struct server_task_result_rerank : server_task_result { + int index = 0; + float score = -1e6; + + virtual int get_index() override { + return index; + } + + virtual json to_json() override { + return json { + {"index", index}, + {"score", score}, + }; + } +}; + +// this function maybe used outside of server_task_result_error +static json format_error_response(const std::string & message, const enum error_type type) { + std::string type_str; + int code = 500; + switch (type) { + case ERROR_TYPE_INVALID_REQUEST: + type_str = "invalid_request_error"; + code = 400; + break; + case ERROR_TYPE_AUTHENTICATION: + type_str = "authentication_error"; + code = 401; + break; + case ERROR_TYPE_NOT_FOUND: + type_str = "not_found_error"; + code = 404; + break; + case ERROR_TYPE_SERVER: + type_str = "server_error"; + code = 500; + break; + case ERROR_TYPE_PERMISSION: + type_str = "permission_error"; + code = 403; + break; + case ERROR_TYPE_NOT_SUPPORTED: + type_str = "not_supported_error"; + code = 501; + break; + case ERROR_TYPE_UNAVAILABLE: + type_str = "unavailable_error"; + code = 503; + break; + } + return json { + {"code", code}, + {"message", message}, + {"type", type_str}, + }; +} + +struct server_task_result_error : server_task_result { + int index = 0; + error_type err_type = ERROR_TYPE_SERVER; + std::string err_msg; + + virtual bool is_error() override { + return true; + } + + virtual json to_json() override { + return format_error_response(err_msg, err_type); + } +}; + +struct server_task_result_metrics : server_task_result { + int n_idle_slots; + int n_processing_slots; + int n_tasks_deferred; + int64_t t_start; + + int32_t kv_cache_tokens_count; + int32_t kv_cache_used_cells; + + // TODO: somehow reuse server_metrics in the future, instead of duplicating the fields + uint64_t n_prompt_tokens_processed_total = 0; + uint64_t t_prompt_processing_total = 0; + uint64_t n_tokens_predicted_total = 0; + uint64_t t_tokens_generation_total = 0; + + uint64_t n_prompt_tokens_processed = 0; + uint64_t t_prompt_processing = 0; + + uint64_t n_tokens_predicted = 0; + uint64_t t_tokens_generation = 0; + + uint64_t n_decode_total = 0; + uint64_t n_busy_slots_total = 0; + + // TODO: get rid of this json object and use to_json() instead + json slots_data = json::array(); + + virtual json to_json() override { + return json { + { "idle", n_idle_slots }, + { "processing", n_processing_slots }, + { "deferred", n_tasks_deferred }, + { "t_start", t_start }, + + { "n_prompt_tokens_processed_total", n_prompt_tokens_processed_total }, + { "t_tokens_generation_total", t_tokens_generation_total }, + { "n_tokens_predicted_total", n_tokens_predicted_total }, + { "t_prompt_processing_total", t_prompt_processing_total }, + + { "n_prompt_tokens_processed", n_prompt_tokens_processed }, + { "t_prompt_processing", t_prompt_processing }, + { "n_tokens_predicted", n_tokens_predicted }, + { "t_tokens_generation", t_tokens_generation }, + + { "n_decode_total", n_decode_total }, + { "n_busy_slots_total", n_busy_slots_total }, + + { "kv_cache_tokens_count", kv_cache_tokens_count }, + { "kv_cache_used_cells", kv_cache_used_cells }, + + { "slots", slots_data }, + }; + } +}; + +struct server_task_result_slot_save_load : server_task_result { + std::string filename; + bool is_save; // true = save, false = load + + size_t n_tokens; + size_t n_bytes; + double t_ms; + + virtual json to_json() override { + if (is_save) { + return json { + { "id_slot", id_slot }, + { "filename", filename }, + { "n_saved", n_tokens }, + { "n_written", n_bytes }, + { "timings", { + { "save_ms", t_ms } + }}, + }; + } else { + return json { + { "id_slot", id_slot }, + { "filename", filename }, + { "n_restored", n_tokens }, + { "n_read", n_bytes }, + { "timings", { + { "restore_ms", t_ms } + }}, + }; + } + } +}; + +struct server_task_result_slot_erase : server_task_result { + size_t n_erased; + + virtual json to_json() override { + return json { + { "id_slot", id_slot }, + { "n_erased", n_erased }, + }; + } +}; + +struct server_task_result_apply_lora : server_task_result { + virtual json to_json() override { + return json {{ "success", true }}; + } }; struct server_slot { @@ -162,15 +762,8 @@ struct server_slot { bool has_next_token = true; bool has_new_line = false; bool truncated = false; - bool stopped_eos = false; - bool stopped_word = false; - bool stopped_limit = false; - - bool timings_per_token = false; + stop_type stop; - bool oaicompat = false; - - std::string oaicompat_model; std::string stopping_word; // sampling @@ -200,9 +793,7 @@ struct server_slot { generated_text = ""; has_new_line = false; truncated = false; - stopped_eos = false; - stopped_word = false; - stopped_limit = false; + stop = STOP_TYPE_NONE; stopping_word = ""; n_past = 0; n_sent_text = 0; @@ -255,38 +846,40 @@ struct server_slot { } } - json get_formated_timings() const { - return json { - {"prompt_n", n_prompt_tokens_processed}, - {"prompt_ms", t_prompt_processing}, - {"prompt_per_token_ms", t_prompt_processing / n_prompt_tokens_processed}, - {"prompt_per_second", 1e3 / t_prompt_processing * n_prompt_tokens_processed}, - - {"predicted_n", n_decoded}, - {"predicted_ms", t_token_generation}, - {"predicted_per_token_ms", t_token_generation / n_decoded}, - {"predicted_per_second", 1e3 / t_token_generation * n_decoded}, - }; + result_timings get_timings() const { + result_timings timings; + timings.prompt_n = n_prompt_tokens_processed; + timings.prompt_ms = t_prompt_processing; + timings.prompt_per_token_ms = t_prompt_processing / n_prompt_tokens_processed; + timings.prompt_per_second = 1e3 / t_prompt_processing * n_prompt_tokens_processed; + + timings.predicted_n = n_decoded; + timings.predicted_ms = t_token_generation; + timings.predicted_per_token_ms = t_token_generation / n_decoded; + timings.predicted_per_second = 1e3 / t_token_generation * n_decoded; + + return timings; } - size_t find_stopping_strings(const std::string & text, const size_t last_token_size, const stop_type type) { + size_t find_stopping_strings(const std::string & text, const size_t last_token_size, bool is_full_stop) { size_t stop_pos = std::string::npos; for (const std::string & word : params.antiprompt) { size_t pos; - if (type == STOP_TYPE_FULL) { + if (is_full_stop) { const size_t tmp = word.size() + last_token_size; const size_t from_pos = text.size() > tmp ? text.size() - tmp : 0; pos = text.find(word, from_pos); } else { + // otherwise, partial stop pos = find_partial_stop_string(word, text); } if (pos != std::string::npos && (stop_pos == std::string::npos || pos < stop_pos)) { - if (type == STOP_TYPE_FULL) { - stopped_word = true; + if (is_full_stop) { + stop = STOP_TYPE_WORD; stopping_word = word; has_next_token = false; } @@ -511,8 +1104,8 @@ struct server_response { // for keeping track of all tasks waiting for the result std::unordered_set waiting_task_ids; - // the main result queue - std::vector queue_results; + // the main result queue (using ptr for polymorphism) + std::vector queue_results; std::mutex mutex_results; std::condition_variable condition_results; @@ -552,7 +1145,7 @@ struct server_response { } // This function blocks the thread until there is a response for one of the id_tasks - server_task_result recv(const std::unordered_set & id_tasks) { + server_task_result_ptr recv(const std::unordered_set & id_tasks) { while (true) { std::unique_lock lock(mutex_results); condition_results.wait(lock, [&]{ @@ -560,8 +1153,8 @@ struct server_response { }); for (int i = 0; i < (int) queue_results.size(); i++) { - if (id_tasks.find(queue_results[i].id) != id_tasks.end()) { - server_task_result res = queue_results[i]; + if (id_tasks.find(queue_results[i]->id) != id_tasks.end()) { + server_task_result_ptr res = std::move(queue_results[i]); queue_results.erase(queue_results.begin() + i); return res; } @@ -572,21 +1165,21 @@ struct server_response { } // single-task version of recv() - server_task_result recv(int id_task) { + server_task_result_ptr recv(int id_task) { std::unordered_set id_tasks = {id_task}; return recv(id_tasks); } // Send a new result to a waiting id_task - void send(server_task_result & result) { - SRV_DBG("sending result for task id = %d\n", result.id); + void send(server_task_result_ptr && result) { + SRV_DBG("sending result for task id = %d\n", result->id); std::unique_lock lock(mutex_results); for (const auto & id_task : waiting_task_ids) { - if (result.id == id_task) { - SRV_DBG("task id = %d moved to result queue\n", result.id); + if (result->id == id_task) { + SRV_DBG("task id = %d pushed to result queue\n", result->id); - queue_results.push_back(std::move(result)); + queue_results.emplace_back(std::move(result)); condition_results.notify_all(); return; } @@ -777,7 +1370,7 @@ struct server_context { slots.push_back(slot); } - default_generation_settings_for_props = get_formated_generation(slots.front()); + default_generation_settings_for_props = slots[0].params.to_json(); default_generation_settings_for_props["seed"] = -1; // the update_slots() logic will always submit a maximum of n_batch or n_parallel tokens @@ -873,14 +1466,19 @@ struct server_context { const auto & data = task.data; if (data.count("__oaicompat") != 0) { - slot.oaicompat = true; - slot.oaicompat_model = json_value(data, "model", std::string(DEFAULT_OAICOMPAT_MODEL)); + std::string model_name = params_base.model_alias.empty() ? DEFAULT_OAICOMPAT_MODEL : params_base.model_alias; + slot.params.oaicompat = true; + slot.params.oaicompat_chat = json_value(data, "__oaicompat_chat", false); + slot.params.oaicompat_model = json_value(data, "model", model_name); + slot.params.oaicompat_cmpl_id = json_value(data, "completion_id", std::string()); } else { - slot.oaicompat = false; - slot.oaicompat_model = ""; + slot.params.oaicompat = false; } - slot.timings_per_token = json_value(data, "timings_per_token", false); + + // enabling this will output extra debug information in the HTTP responses from the server + slot.params.verbose = params_base.verbosity > 9; + slot.params.timings_per_token = json_value(data, "timings_per_token", false); slot.params.stream = json_value(data, "stream", false); slot.params.cache_prompt = json_value(data, "cache_prompt", true); @@ -1110,14 +1708,14 @@ struct server_context { const std::string str_test = slot.generated_text.substr(pos); bool send_text = true; - size_t stop_pos = slot.find_stopping_strings(str_test, token_str.size(), STOP_TYPE_FULL); + size_t stop_pos = slot.find_stopping_strings(str_test, token_str.size(), true); if (stop_pos != std::string::npos) { slot.generated_text.erase( slot.generated_text.begin() + pos + stop_pos, slot.generated_text.end()); pos = std::min(slot.n_sent_text, slot.generated_text.size()); } else if (slot.has_next_token) { - stop_pos = slot.find_stopping_strings(str_test, token_str.size(), STOP_TYPE_PARTIAL); + stop_pos = slot.find_stopping_strings(str_test, token_str.size(), false); send_text = stop_pos == std::string::npos; } @@ -1141,7 +1739,7 @@ struct server_context { // check the limits if (slot.n_decoded > 0 && slot.has_next_token && !slot.has_budget(params_base)) { - slot.stopped_limit = true; + slot.stop = STOP_TYPE_LIMIT; slot.has_next_token = false; SLT_DBG(slot, "stopped by limit, n_decoded = %d, n_predict = %d\n", slot.n_decoded, slot.params.n_predict); @@ -1150,7 +1748,7 @@ struct server_context { if (slot.has_new_line) { // if we have already seen a new line, we stop after a certain time limit if (slot.params.t_max_predict_ms > 0 && (ggml_time_us() - slot.t_start_generation > 1000.0f*slot.params.t_max_predict_ms)) { - slot.stopped_limit = true; + slot.stop = STOP_TYPE_LIMIT; slot.has_next_token = false; SLT_DBG(slot, "stopped by time limit, n_decoded = %d, t_max_predict_ms = %d ms\n", slot.n_decoded, (int) slot.params.t_max_predict_ms); @@ -1170,7 +1768,7 @@ struct server_context { } if (pos < slot.generated_text.size() && n_indent < slot.params.n_indent) { - slot.stopped_limit = true; + slot.stop = STOP_TYPE_LIMIT; slot.has_next_token = false; // cut the last line @@ -1199,7 +1797,7 @@ struct server_context { // if context shift is disabled, we stop when it reaches the context limit if (slot.n_past >= slot.n_ctx) { slot.truncated = true; - slot.stopped_limit = true; + slot.stop = STOP_TYPE_LIMIT; slot.has_next_token = false; SLT_DBG(slot, "stopped due to running out of context capacity, n_past = %d, n_prompt_tokens = %d, n_decoded = %d, n_ctx = %d\n", @@ -1207,7 +1805,7 @@ struct server_context { } if (llama_token_is_eog(model, result.tok)) { - slot.stopped_eos = true; + slot.stop = STOP_TYPE_EOS; slot.has_next_token = false; SLT_DBG(slot, "%s", "stopped by EOS\n"); @@ -1217,7 +1815,7 @@ struct server_context { if (slot.params.n_predict < 1 && slot.n_predict < 1 && slot.n_prompt_tokens + slot.n_decoded >= n_ctx_train) { slot.truncated = true; - slot.stopped_limit = true; + slot.stop = STOP_TYPE_LIMIT; slot.has_next_token = false; // stop prediction SLT_WRN(slot, @@ -1231,60 +1829,6 @@ struct server_context { return slot.has_next_token; // continue } - json get_formated_generation(const server_slot & slot) const { - std::vector samplers; - samplers.reserve(slot.params.sampling.samplers.size()); - for (const auto & sampler : slot.params.sampling.samplers) { - samplers.emplace_back(common_sampler_type_to_str(sampler)); - } - - return json { - {"n_ctx", slot.n_ctx}, - {"n_predict", slot.n_predict}, // Server configured n_predict - {"model", params_base.model_alias}, - {"seed", slot.params.sampling.seed}, - {"seed_cur", slot.smpl ? common_sampler_get_seed(slot.smpl) : 0}, - {"temperature", slot.params.sampling.temp}, - {"dynatemp_range", slot.params.sampling.dynatemp_range}, - {"dynatemp_exponent", slot.params.sampling.dynatemp_exponent}, - {"top_k", slot.params.sampling.top_k}, - {"top_p", slot.params.sampling.top_p}, - {"min_p", slot.params.sampling.min_p}, - {"xtc_probability", slot.params.sampling.xtc_probability}, - {"xtc_threshold", slot.params.sampling.xtc_threshold}, - {"typical_p", slot.params.sampling.typ_p}, - {"repeat_last_n", slot.params.sampling.penalty_last_n}, - {"repeat_penalty", slot.params.sampling.penalty_repeat}, - {"presence_penalty", slot.params.sampling.penalty_present}, - {"frequency_penalty", slot.params.sampling.penalty_freq}, - {"dry_multiplier", slot.params.sampling.dry_multiplier}, - {"dry_base", slot.params.sampling.dry_base}, - {"dry_allowed_length", slot.params.sampling.dry_allowed_length}, - {"dry_penalty_last_n", slot.params.sampling.dry_penalty_last_n}, - {"dry_sequence_breakers", slot.params.sampling.dry_sequence_breakers}, - {"mirostat", slot.params.sampling.mirostat}, - {"mirostat_tau", slot.params.sampling.mirostat_tau}, - {"mirostat_eta", slot.params.sampling.mirostat_eta}, - {"penalize_nl", slot.params.sampling.penalize_nl}, - {"stop", slot.params.antiprompt}, - {"max_tokens", slot.params.n_predict}, // User configured n_predict - {"n_keep", slot.params.n_keep}, - {"n_discard", slot.params.n_discard}, - {"ignore_eos", slot.params.sampling.ignore_eos}, - {"stream", slot.params.stream}, - //{"logit_bias", slot.params.sampling.logit_bias}, - {"n_probs", slot.params.sampling.n_probs}, - {"min_keep", slot.params.sampling.min_keep}, - {"grammar", slot.params.sampling.grammar}, - {"samplers", samplers}, - {"speculative", slot.can_speculate()}, - {"speculative.n_max", slot.params.speculative.n_max}, - {"speculative.n_min", slot.params.speculative.n_min}, - {"speculative.p_min", slot.params.speculative.p_min}, - {"timings_per_token", slot.timings_per_token}, - }; - } - void send_error(const server_task & task, const std::string & error, const enum error_type type = ERROR_TYPE_SERVER) { send_error(task.id, error, type); } @@ -1296,28 +1840,33 @@ struct server_context { void send_error(const int id_task, const std::string & error, const enum error_type type = ERROR_TYPE_SERVER) { SRV_ERR("task id = %d, error: %s\n", id_task, error.c_str()); - server_task_result res; - res.id = id_task; - res.stop = false; - res.error = true; - res.data = format_error_response(error, type); + auto res = std::make_unique(); + res->id = id_task; + res->err_type = type; + res->err_msg = error; - queue_results.send(res); + queue_results.send(std::move(res)); } void send_partial_response(server_slot & slot, completion_token_output tkn) { - server_task_result res; - res.id = slot.id_task; - res.error = false; - res.stop = false; - res.data = json { - {"content", tkn.text_to_send}, - {"stop", false}, - {"id_slot", slot.id}, - {"multimodal", false}, - {"index", slot.index}, - }; + auto res = std::make_unique(); + res->id = slot.id_task; + res->index = slot.index; + res->content = tkn.text_to_send; + + res->truncated = slot.truncated; + res->n_decoded = slot.n_decoded; + res->n_prompt_tokens = slot.n_prompt_tokens; + + res->stop = slot.stop; + + res->verbose = slot.params.verbose; + res->oaicompat = slot.params.oaicompat; + res->oaicompat_chat = slot.params.oaicompat_chat; + res->oaicompat_model = slot.params.oaicompat_model; + res->oaicompat_cmpl_id = slot.params.oaicompat_cmpl_id; + // populate res.probs_output if (slot.params.sampling.n_probs > 0) { const llama_tokens to_send_toks = common_tokenize(ctx, tkn.text_to_send, false); const size_t probs_pos = std::min(slot.n_sent_token_probs, slot.generated_token_probs.size()); @@ -1325,83 +1874,74 @@ struct server_context { std::vector probs_output; if (probs_pos < probs_stop_pos) { - probs_output = std::vector( + res->probs_output = std::vector( slot.generated_token_probs.begin() + probs_pos, slot.generated_token_probs.begin() + probs_stop_pos); } - slot.n_sent_token_probs = probs_stop_pos; - - res.data["completion_probabilities"] = probs_vector_to_json(ctx, probs_output); } - if (slot.oaicompat) { - res.data["oaicompat_token_ctr"] = slot.n_decoded; - res.data["model"] = slot.oaicompat_model; + // populate timings if this is final response or timings_per_token is enabled + if (slot.stop != STOP_TYPE_NONE || slot.params.timings_per_token) { + res->timings = slot.get_timings(); } - if (slot.timings_per_token) { - res.data["timings"] = slot.get_formated_timings(); + queue_results.send(std::move(res)); + } + + void send_final_response(server_slot & slot) { + if (slot.params.stream) { + // if in stream mode, send the last partial response + return send_partial_response(slot, {0, "", {}}); } - queue_results.send(res); - } + auto res = std::make_unique(); + res->id = slot.id_task; + res->id_slot = slot.id; - void send_final_response(const server_slot & slot) { - server_task_result res; - res.id = slot.id_task; - res.error = false; - res.stop = true; - res.data = json { - {"content", !slot.params.stream ? slot.generated_text : ""}, - {"id_slot", slot.id}, - {"stop", true}, - {"model", params_base.model_alias}, - {"tokens_predicted", slot.n_decoded}, - {"tokens_evaluated", slot.n_prompt_tokens}, - {"generation_settings", get_formated_generation(slot)}, - {"prompt", common_detokenize(ctx, slot.prompt_tokens)}, - {"has_new_line", slot.has_new_line}, - {"truncated", slot.truncated}, - {"stopped_eos", slot.stopped_eos}, - {"stopped_word", slot.stopped_word}, - {"stopped_limit", slot.stopped_limit}, - {"stopping_word", slot.stopping_word}, - {"tokens_cached", slot.n_past}, - {"timings", slot.get_formated_timings()}, - {"index", slot.index}, - }; + res->index = slot.index; + res->content = slot.generated_text; + res->timings = slot.get_timings(); + res->prompt = common_detokenize(ctx, slot.prompt_tokens, true); + + res->truncated = slot.truncated; + res->n_decoded = slot.n_decoded; + res->n_prompt_tokens = slot.n_prompt_tokens; + res->n_tokens_cached = slot.n_past; + res->has_new_line = slot.has_new_line; + res->stopping_word = slot.stopping_word; + res->stop = slot.stop; + res->verbose = slot.params.verbose; + res->oaicompat = slot.params.oaicompat; + res->oaicompat_chat = slot.params.oaicompat_chat; + res->oaicompat_model = slot.params.oaicompat_model; + res->oaicompat_cmpl_id = slot.params.oaicompat_cmpl_id; + + // populate res.probs_output if (slot.params.sampling.n_probs > 0) { - std::vector probs; - if (!slot.params.stream && slot.stopped_word) { + if (!slot.params.stream && slot.stop == STOP_TYPE_WORD) { const llama_tokens stop_word_toks = common_tokenize(ctx, slot.stopping_word, false); size_t safe_offset = std::min(slot.generated_token_probs.size(), stop_word_toks.size()); - probs = std::vector( + res->probs_output = std::vector( slot.generated_token_probs.begin(), slot.generated_token_probs.end() - safe_offset); } else { - probs = std::vector( + res->probs_output = std::vector( slot.generated_token_probs.begin(), slot.generated_token_probs.end()); } - - res.data["completion_probabilities"] = probs_vector_to_json(ctx, probs); } - if (slot.oaicompat) { - res.data["oaicompat_token_ctr"] = slot.n_decoded; - res.data["model"] = slot.oaicompat_model; - } + res->generation_params = slot.params; // copy the parameters - queue_results.send(res); + queue_results.send(std::move(res)); } void send_embedding(const server_slot & slot, const llama_batch & batch) { - server_task_result res; - res.id = slot.id_task; - res.error = false; - res.stop = true; + auto res = std::make_unique(); + res->id = slot.id_task; + res->index = slot.index; const int n_embd = llama_n_embd(model); @@ -1420,32 +1960,23 @@ struct server_context { if (embd == NULL) { SLT_ERR(slot, "failed to get embeddings, token = %d, seq_id = %d\n", batch.token[i], batch.seq_id[i][0]); - res.data = json { - {"embedding", std::vector(n_embd, 0.0f)}, - {"index", slot.index}, - }; - + res->embedding = std::vector(n_embd, 0.0f); continue; } common_embd_normalize(embd, embd_res.data(), n_embd); - - res.data = json { - {"embedding", embd_res}, - {"index", slot.index}, - }; + res->embedding = embd_res; } SLT_DBG(slot, "%s", "sending embeddings\n"); - queue_results.send(res); + queue_results.send(std::move(res)); } void send_rerank(const server_slot & slot, const llama_batch & batch) { - server_task_result res; - res.id = slot.id_task; - res.error = false; - res.stop = true; + auto res = std::make_unique(); + res->id = slot.id_task; + res->index = slot.index; for (int i = 0; i < batch.n_tokens; ++i) { if (!batch.logits[i] || batch.seq_id[i][0] != slot.id) { @@ -1460,23 +1991,16 @@ struct server_context { if (embd == NULL) { SLT_ERR(slot, "failed to get embeddings, token = %d, seq_id = %d\n", batch.token[i], batch.seq_id[i][0]); - res.data = json { - {"index", slot.index}, - {"score", -1e6}, - }; - + res->score = -1e6; continue; } - res.data = json { - {"index", slot.index}, - {"score", embd[0]}, - }; + res->score = embd[0]; } - SLT_DBG(slot, "sending rerank result, res = '%s'\n", res.data.dump().c_str()); + SLT_DBG(slot, "sending rerank result, res.score = %f\n", res->score); - queue_results.send(res); + queue_results.send(std::move(res)); } // @@ -1567,49 +2091,54 @@ struct server_context { } // receive the results from task(s) created by create_tasks_inference - void receive_cmpl_results( + void receive_multi_results( const std::unordered_set & id_tasks, - const std::function&)> & result_handler, + const std::function&)> & result_handler, const std::function & error_handler) { - // TODO: currently, there is no way to detect the client has cancelled the request - std::vector results(id_tasks.size()); + std::vector results(id_tasks.size()); for (size_t i = 0; i < id_tasks.size(); i++) { - server_task_result result = queue_results.recv(id_tasks); + server_task_result_ptr result = queue_results.recv(id_tasks); - if (result.error) { - error_handler(result.data); + if (result->is_error()) { + error_handler(result->to_json()); cancel_tasks(id_tasks); return; } - const size_t idx = result.data["index"]; + GGML_ASSERT( + dynamic_cast(result.get()) != nullptr + || dynamic_cast(result.get()) != nullptr + || dynamic_cast(result.get()) != nullptr + ); + const size_t idx = result->get_index(); GGML_ASSERT(idx < results.size() && "index out of range"); - - results[idx] = result; + results[idx] = std::move(result); } result_handler(results); } // receive the results from task(s) created by create_tasks_inference, in stream mode void receive_cmpl_results_stream( - const std::unordered_set & id_tasks, const - std::function & result_handler, const - std::function & error_handler) { + const std::unordered_set & id_tasks, + const std::function & result_handler, + const std::function & error_handler) { size_t n_finished = 0; while (true) { - server_task_result result = queue_results.recv(id_tasks); - if (!result_handler(result)) { + server_task_result_ptr result = queue_results.recv(id_tasks); + + if (result->is_error()) { + error_handler(result->to_json()); cancel_tasks(id_tasks); - break; + return; } - if (result.error) { - error_handler(result.data); + GGML_ASSERT(dynamic_cast(result.get()) != nullptr); + if (!result_handler(result)) { cancel_tasks(id_tasks); break; } - if (result.stop) { + if (result->is_stop()) { if (++n_finished == id_tasks.size()) { break; } @@ -1676,7 +2205,7 @@ struct server_context { int n_processing_slots = 0; for (server_slot & slot : slots) { - json slot_data = get_formated_generation(slot); + json slot_data = slot.params.to_json(); slot_data["id"] = slot.id; slot_data["id_task"] = slot.id_task; slot_data["is_processing"] = slot.is_processing(); @@ -1686,9 +2215,6 @@ struct server_context { {"has_new_line", slot.has_new_line}, {"n_remain", slot.n_remaining}, {"n_decoded", slot.n_decoded}, - {"stopped_eos", slot.stopped_eos}, - {"stopped_word", slot.stopped_word}, - {"stopped_limit", slot.stopped_limit}, {"stopping_word", slot.stopping_word}, }; @@ -1702,39 +2228,33 @@ struct server_context { } SRV_DBG("n_idle_slots = %d, n_processing_slots = %d\n", n_idle_slots, n_processing_slots); - server_task_result res; - res.id = task.id; - res.stop = true; - res.error = false; - res.data = { - { "idle", n_idle_slots }, - { "processing", n_processing_slots }, - { "deferred", queue_tasks.queue_tasks_deferred.size() }, - { "t_start", metrics.t_start}, - - { "n_prompt_tokens_processed_total", metrics.n_prompt_tokens_processed_total}, - { "t_tokens_generation_total", metrics.t_tokens_generation_total}, - { "n_tokens_predicted_total", metrics.n_tokens_predicted_total}, - { "t_prompt_processing_total", metrics.t_prompt_processing_total}, + auto res = std::make_unique(); + res->id = task.id; + res->n_idle_slots = n_idle_slots; + res->n_processing_slots = n_processing_slots; + res->n_tasks_deferred = queue_tasks.queue_tasks_deferred.size(); + res->t_start = metrics.t_start; - { "n_prompt_tokens_processed", metrics.n_prompt_tokens_processed}, - { "t_prompt_processing", metrics.t_prompt_processing}, - { "n_tokens_predicted", metrics.n_tokens_predicted}, - { "t_tokens_generation", metrics.t_tokens_generation}, + res->kv_cache_tokens_count = llama_get_kv_cache_token_count(ctx); + res->kv_cache_used_cells = llama_get_kv_cache_used_cells(ctx); - { "n_decode_total", metrics.n_decode_total}, - { "n_busy_slots_total", metrics.n_busy_slots_total}, + res->n_prompt_tokens_processed_total = metrics.n_prompt_tokens_processed_total; + res->t_prompt_processing_total = metrics.t_prompt_processing_total; + res->n_tokens_predicted_total = metrics.n_tokens_predicted_total; + res->t_tokens_generation_total = metrics.t_tokens_generation_total; - { "kv_cache_tokens_count", llama_get_kv_cache_token_count(ctx)}, - { "kv_cache_used_cells", llama_get_kv_cache_used_cells(ctx)}, + res->n_prompt_tokens_processed = metrics.n_prompt_tokens_processed; + res->t_prompt_processing = metrics.t_prompt_processing; + res->n_tokens_predicted = metrics.n_tokens_predicted; + res->t_tokens_generation = metrics.t_tokens_generation; - { "slots", slots_data }, - }; + res->n_decode_total = metrics.n_decode_total; + res->n_busy_slots_total = metrics.n_busy_slots_total; if (json_value(task.data, "reset_bucket", false)) { metrics.reset_bucket(); } - queue_results.send(res); + queue_results.send(std::move(res)); } break; case SERVER_TASK_TYPE_SLOT_SAVE: { @@ -1762,20 +2282,15 @@ struct server_context { const int64_t t_end = ggml_time_us(); const double t_save_ms = (t_end - t_start) / 1000.0; - server_task_result result; - result.id = task.id; - result.stop = true; - result.error = false; - result.data = json { - { "id_slot", id_slot }, - { "filename", filename }, - { "n_saved", token_count }, // tokens saved - { "n_written", nwrite }, // bytes written - { "timings", { - { "save_ms", t_save_ms } - } } - }; - queue_results.send(result); + auto res = std::make_unique(); + res->id = task.id; + res->id_slot = id_slot; + res->filename = filename; + res->is_save = true; + res->n_tokens = token_count; + res->n_bytes = nwrite; + res->t_ms = t_save_ms; + queue_results.send(std::move(res)); } break; case SERVER_TASK_TYPE_SLOT_RESTORE: { @@ -1810,20 +2325,15 @@ struct server_context { const int64_t t_end = ggml_time_us(); const double t_restore_ms = (t_end - t_start) / 1000.0; - server_task_result result; - result.id = task.id; - result.stop = true; - result.error = false; - result.data = json { - { "id_slot", id_slot }, - { "filename", filename }, - { "n_restored", token_count }, // tokens restored - { "n_read", nread }, // bytes read - { "timings", { - { "restore_ms", t_restore_ms } - } } - }; - queue_results.send(result); + auto res = std::make_unique(); + res->id = task.id; + res->id_slot = id_slot; + res->filename = filename; + res->is_save = false; + res->n_tokens = token_count; + res->n_bytes = nread; + res->t_ms = t_restore_ms; + queue_results.send(std::move(res)); } break; case SERVER_TASK_TYPE_SLOT_ERASE: { @@ -1845,25 +2355,18 @@ struct server_context { llama_kv_cache_seq_rm(ctx, slot->id, -1, -1); slot->cache_tokens.clear(); - server_task_result result; - result.id = task.id; - result.stop = true; - result.error = false; - result.data = json { - { "id_slot", id_slot }, - { "n_erased", n_erased } - }; - queue_results.send(result); + auto res = std::make_unique(); + res->id = task.id; + res->id_slot = id_slot; + res->n_erased = n_erased; + queue_results.send(std::move(res)); } break; case SERVER_TASK_TYPE_SET_LORA: { common_lora_adapters_apply(ctx, loras); - server_task_result result; - result.id = task.id; - result.stop = true; - result.error = false; - result.data = json{{ "success", true }}; - queue_results.send(result); + auto res = std::make_unique(); + res->id = task.id; + queue_results.send(std::move(res)); } break; } } @@ -2298,9 +2801,11 @@ struct server_context { const auto * cur_p = common_sampler_get_candidates(slot.smpl); for (size_t i = 0; i < (size_t) slot.params.sampling.n_probs; ++i) { + auto tok_id = cur_p->data[i].id; result.probs.push_back({ - cur_p->data[i].id, - i >= cur_p->size ? 0.0f : cur_p->data[i].p, + tok_id, + tokens_to_output_formatted_string(ctx, tok_id), + i >= cur_p->size ? 0.0f : cur_p->data[i].p, }); } @@ -2452,17 +2957,9 @@ int main(int argc, char ** argv) { common_init(); - // enabling this will output extra debug information in the HTTP responses from the server - // see format_final_response_oaicompat() - const bool verbose = params.verbosity > 9; - // struct that contains llama context and inference server_context ctx_server; - if (params.model_alias == "unknown") { - params.model_alias = params.model; - } - llama_backend_init(); llama_numa_init(params.numa); @@ -2647,19 +3144,27 @@ int main(int argc, char ** argv) { ctx_server.queue_tasks.post(task, true); // high-priority task // get the result - server_task_result result = ctx_server.queue_results.recv(task.id); + server_task_result_ptr result = ctx_server.queue_results.recv(task.id); ctx_server.queue_results.remove_waiting_task_id(task.id); + if (result->is_error()) { + res_error(res, result->to_json()); + return; + } + + // TODO: get rid of this dynamic_cast + auto res_metrics = dynamic_cast(result.get()); + GGML_ASSERT(res_metrics != nullptr); + // optionally return "fail_on_no_slot" error - const int n_idle_slots = result.data.at("idle"); if (req.has_param("fail_on_no_slot")) { - if (n_idle_slots == 0) { + if (res_metrics->n_idle_slots == 0) { res_error(res, format_error_response("no slot available", ERROR_TYPE_UNAVAILABLE)); return; } } - res_ok(res, result.data.at("slots")); + res_ok(res, res_metrics->slots_data); }; const auto handle_metrics = [&](const httplib::Request &, httplib::Response & res) { @@ -2679,73 +3184,69 @@ int main(int argc, char ** argv) { ctx_server.queue_tasks.post(task, true); // high-priority task // get the result - server_task_result result = ctx_server.queue_results.recv(task.id); + server_task_result_ptr result = ctx_server.queue_results.recv(task.id); ctx_server.queue_results.remove_waiting_task_id(task.id); - json data = result.data; - - const uint64_t n_prompt_tokens_processed = data.at("n_prompt_tokens_processed"); - const uint64_t t_prompt_processing = data.at("t_prompt_processing"); - - const uint64_t n_tokens_predicted = data.at("n_tokens_predicted"); - const uint64_t t_tokens_generation = data.at("t_tokens_generation"); - - const uint64_t n_decode_total = data.at("n_decode_total"); - const uint64_t n_busy_slots_total = data.at("n_busy_slots_total"); + if (result->is_error()) { + res_error(res, result->to_json()); + return; + } - const int32_t kv_cache_used_cells = data.at("kv_cache_used_cells"); + // TODO: get rid of this dynamic_cast + auto res_metrics = dynamic_cast(result.get()); + GGML_ASSERT(res_metrics != nullptr); // metrics definition: https://prometheus.io/docs/practices/naming/#metric-names json all_metrics_def = json { {"counter", {{ {"name", "prompt_tokens_total"}, {"help", "Number of prompt tokens processed."}, - {"value", (uint64_t) data.at("n_prompt_tokens_processed_total")} + {"value", (uint64_t) res_metrics->n_prompt_tokens_processed_total} }, { {"name", "prompt_seconds_total"}, {"help", "Prompt process time"}, - {"value", (uint64_t) data.at("t_prompt_processing_total") / 1.e3} + {"value", (uint64_t) res_metrics->t_prompt_processing_total / 1.e3} }, { {"name", "tokens_predicted_total"}, {"help", "Number of generation tokens processed."}, - {"value", (uint64_t) data.at("n_tokens_predicted_total")} + {"value", (uint64_t) res_metrics->n_tokens_predicted_total} }, { {"name", "tokens_predicted_seconds_total"}, {"help", "Predict process time"}, - {"value", (uint64_t) data.at("t_tokens_generation_total") / 1.e3} + {"value", (uint64_t) res_metrics->t_tokens_generation_total / 1.e3} }, { {"name", "n_decode_total"}, {"help", "Total number of llama_decode() calls"}, - {"value", n_decode_total} + {"value", res_metrics->n_decode_total} }, { {"name", "n_busy_slots_per_decode"}, {"help", "Average number of busy slots per llama_decode() call"}, - {"value", (float) n_busy_slots_total / (float) n_decode_total} + {"value", (float) res_metrics->n_busy_slots_total / (float) res_metrics->n_decode_total} }}}, {"gauge", {{ {"name", "prompt_tokens_seconds"}, {"help", "Average prompt throughput in tokens/s."}, - {"value", n_prompt_tokens_processed ? 1.e3 / t_prompt_processing * n_prompt_tokens_processed : 0.} + {"value", res_metrics->n_prompt_tokens_processed ? 1.e3 / res_metrics->t_prompt_processing * res_metrics->n_prompt_tokens_processed : 0.} },{ {"name", "predicted_tokens_seconds"}, {"help", "Average generation throughput in tokens/s."}, - {"value", n_tokens_predicted ? 1.e3 / t_tokens_generation * n_tokens_predicted : 0.} + {"value", res_metrics->n_tokens_predicted ? 1.e3 / res_metrics->t_tokens_generation * res_metrics->n_tokens_predicted : 0.} },{ {"name", "kv_cache_usage_ratio"}, {"help", "KV-cache usage. 1 means 100 percent usage."}, - {"value", 1. * kv_cache_used_cells / params.n_ctx} + {"value", 1. * res_metrics->kv_cache_used_cells / params.n_ctx} },{ {"name", "kv_cache_tokens"}, {"help", "KV-cache tokens."}, - {"value", (uint64_t) data.at("kv_cache_tokens_count")} + {"value", (uint64_t) res_metrics->kv_cache_tokens_count} },{ {"name", "requests_processing"}, {"help", "Number of request processing."}, - {"value", (uint64_t) data.at("processing")} + {"value", (uint64_t) res_metrics->n_processing_slots} },{ {"name", "requests_deferred"}, {"help", "Number of request deferred."}, - {"value", (uint64_t) data.at("deferred")} + {"value", (uint64_t) res_metrics->n_tasks_deferred} }}} }; @@ -2766,8 +3267,7 @@ int main(int argc, char ** argv) { } } - const int64_t t_start = data.at("t_start"); - res.set_header("Process-Start-Time-Unix", std::to_string(t_start)); + res.set_header("Process-Start-Time-Unix", std::to_string(res_metrics->t_start)); res.set_content(prometheus.str(), "text/plain; version=0.0.4"); res.status = 200; // HTTP OK @@ -2793,14 +3293,15 @@ int main(int argc, char ** argv) { const int id_task = ctx_server.queue_tasks.post(task); ctx_server.queue_results.add_waiting_task_id(id_task); - server_task_result result = ctx_server.queue_results.recv(id_task); + server_task_result_ptr result = ctx_server.queue_results.recv(id_task); ctx_server.queue_results.remove_waiting_task_id(id_task); - if (result.error) { - res_error(res, result.data); - } else { - res_ok(res, result.data); + if (result->is_error()) { + res_error(res, result->to_json()); + return; } + + res_ok(res, result->to_json()); }; const auto handle_slots_restore = [&ctx_server, &res_error, &res_ok, ¶ms](const httplib::Request & req, httplib::Response & res, int id_slot) { @@ -2823,14 +3324,16 @@ int main(int argc, char ** argv) { const int id_task = ctx_server.queue_tasks.post(task); ctx_server.queue_results.add_waiting_task_id(id_task); - server_task_result result = ctx_server.queue_results.recv(id_task); + server_task_result_ptr result = ctx_server.queue_results.recv(id_task); ctx_server.queue_results.remove_waiting_task_id(id_task); - if (result.error) { - res_error(res, result.data); - } else { - res_ok(res, result.data); + if (result->is_error()) { + res_error(res, result->to_json()); + return; } + + GGML_ASSERT(dynamic_cast(result.get()) != nullptr); + res_ok(res, result->to_json()); }; const auto handle_slots_erase = [&ctx_server, &res_error, &res_ok](const httplib::Request & /* req */, httplib::Response & res, int id_slot) { @@ -2843,14 +3346,16 @@ int main(int argc, char ** argv) { const int id_task = ctx_server.queue_tasks.post(task); ctx_server.queue_results.add_waiting_task_id(id_task); - server_task_result result = ctx_server.queue_results.recv(id_task); + server_task_result_ptr result = ctx_server.queue_results.recv(id_task); ctx_server.queue_results.remove_waiting_task_id(id_task); - if (result.error) { - res_error(res, result.data); - } else { - res_ok(res, result.data); + if (result->is_error()) { + res_error(res, result->to_json()); + return; } + + GGML_ASSERT(dynamic_cast(result.get()) != nullptr); + res_ok(res, result->to_json()); }; const auto handle_slots_action = [¶ms, &res_error, &handle_slots_save, &handle_slots_restore, &handle_slots_erase](const httplib::Request & req, httplib::Response & res) { @@ -2905,12 +3410,19 @@ int main(int argc, char ** argv) { res_ok(res, {{ "success", true }}); }; - const auto handle_completions_generic = [&ctx_server, &res_error, &res_ok](server_task_inf_type inf_type, json & data, httplib::Response & res) { + // handle completion-like requests (completion, chat, infill) + // we can optionally provide a custom format for partial results and final results + const auto handle_completions_generic = [&ctx_server, &res_error, &res_ok]( + server_task_inf_type inf_type, + json & data, + httplib::Response & res, + bool oai_compat = false) { if (ctx_server.params_base.embedding) { res_error(res, format_error_response("This server does not support completions. Start it without `--embeddings`", ERROR_TYPE_NOT_SUPPORTED)); return; } + data["completion_id"] = gen_chatcmplid(); std::vector tasks = ctx_server.create_tasks_inference(data, inf_type); ctx_server.queue_results.add_waiting_tasks(tasks); ctx_server.queue_tasks.post(tasks); @@ -2919,15 +3431,15 @@ int main(int argc, char ** argv) { const auto task_ids = server_task::get_list_id(tasks); if (!stream) { - ctx_server.receive_cmpl_results(task_ids, [&](std::vector & results) { + ctx_server.receive_multi_results(task_ids, [&](std::vector & results) { if (results.size() == 1) { // single result - res_ok(res, results[0].data); + res_ok(res, results[0]->to_json()); } else { // multiple results (multitask) json arr = json::array(); - for (const auto & res : results) { - arr.push_back(res.data); + for (auto & res : results) { + arr.push_back(res->to_json()); } res_ok(res, arr); } @@ -2937,12 +3449,26 @@ int main(int argc, char ** argv) { ctx_server.queue_results.remove_waiting_task_ids(task_ids); } else { - const auto chunked_content_provider = [task_ids, &ctx_server](size_t, httplib::DataSink & sink) { - ctx_server.receive_cmpl_results_stream(task_ids, [&](const server_task_result & result) -> bool { - return server_sent_event(sink, "data", result.data); + const auto chunked_content_provider = [task_ids, &ctx_server, oai_compat](size_t, httplib::DataSink & sink) { + ctx_server.receive_cmpl_results_stream(task_ids, [&](server_task_result_ptr & result) -> bool { + json res_json = result->to_json(); + if (res_json.is_array()) { + for (const auto & res : res_json) { + if (!server_sent_event(sink, "data", res)) { + return false; + } + } + return true; + } else { + return server_sent_event(sink, "data", res_json); + } }, [&](const json & error_data) { server_sent_event(sink, "error", error_data); }); + if (oai_compat) { + static const std::string ev_done = "data: [DONE]\n\n"; + sink.write(ev_done.data(), ev_done.size()); + } sink.done(); return false; }; @@ -3010,61 +3536,15 @@ int main(int argc, char ** argv) { return handle_completions_generic(SERVER_TASK_INF_TYPE_INFILL, data, res); }; - // TODO: maybe merge this function with "handle_completions_generic" - const auto handle_chat_completions = [&ctx_server, ¶ms, &res_error, &res_ok, verbose](const httplib::Request & req, httplib::Response & res) { + const auto handle_chat_completions = [&ctx_server, ¶ms, &res_error, &handle_completions_generic](const httplib::Request & req, httplib::Response & res) { if (ctx_server.params_base.embedding) { res_error(res, format_error_response("This server does not support completions. Start it without `--embeddings`", ERROR_TYPE_NOT_SUPPORTED)); return; } json data = oaicompat_completion_params_parse(ctx_server.model, json::parse(req.body), params.chat_template); - - std::vector tasks = ctx_server.create_tasks_inference(data, SERVER_TASK_INF_TYPE_COMPLETION); - ctx_server.queue_results.add_waiting_tasks(tasks); - ctx_server.queue_tasks.post(tasks); - - bool stream = json_value(data, "stream", false); - const auto task_ids = server_task::get_list_id(tasks); - const auto completion_id = gen_chatcmplid(); - - if (!stream) { - ctx_server.receive_cmpl_results(task_ids, [&](const std::vector & results) { - // multitask is never support in chat completion, there is only one result - json result_oai = format_final_response_oaicompat(data, results[0].data, completion_id, /*.streaming =*/ false, verbose); - res_ok(res, result_oai); - }, [&](const json & error_data) { - res_error(res, error_data); - }); - - ctx_server.queue_results.remove_waiting_task_ids(task_ids); - } else { - const auto chunked_content_provider = [task_ids, &ctx_server, completion_id](size_t, httplib::DataSink & sink) { - ctx_server.receive_cmpl_results_stream(task_ids, [&](const server_task_result & result) -> bool { - std::vector result_array = format_partial_response_oaicompat(result.data, completion_id); - for (auto & event_data : result_array) { - if (event_data.empty()) { - continue; // skip the stop token - } - if (!server_sent_event(sink, "data", event_data)) { - return false; // connection is closed - } - } - return true; // ok - }, [&](const json & error_data) { - server_sent_event(sink, "error", error_data); - }); - static const std::string ev_done = "data: [DONE]\n\n"; - sink.write(ev_done.data(), ev_done.size()); - sink.done(); - return true; - }; - - auto on_complete = [task_ids, &ctx_server] (bool) { - ctx_server.queue_results.remove_waiting_task_ids(task_ids); - }; - - res.set_chunked_content_provider("text/event-stream", chunked_content_provider, on_complete); - } + data["__oaicompat_chat"] = true; + return handle_completions_generic(SERVER_TASK_INF_TYPE_COMPLETION, data, res, true); }; const auto handle_models = [¶ms, &ctx_server](const httplib::Request &, httplib::Response & res) { @@ -3139,12 +3619,12 @@ int main(int argc, char ** argv) { const auto handle_embeddings = [&ctx_server, &res_error, &res_ok](const httplib::Request & req, httplib::Response & res) { const json body = json::parse(req.body); - bool is_openai = false; + bool oaicompat = false; // an input prompt can be a string or a list of tokens (integer) json prompt; if (body.count("input") != 0) { - is_openai = true; + oaicompat = true; prompt = body.at("input"); } else if (body.count("content") != 0) { // with "content", we only support single prompt @@ -3165,9 +3645,10 @@ int main(int argc, char ** argv) { // get the result std::unordered_set task_ids = server_task::get_list_id(tasks); - ctx_server.receive_cmpl_results(task_ids, [&](std::vector & results) { - for (const auto & res : results) { - responses.push_back(res.data); + ctx_server.receive_multi_results(task_ids, [&](std::vector & results) { + for (auto & res : results) { + GGML_ASSERT(dynamic_cast(res.get()) != nullptr); + responses.push_back(res->to_json()); } }, [&](const json & error_data) { res_error(res, error_data); @@ -3182,7 +3663,7 @@ int main(int argc, char ** argv) { } // write JSON response - json root = is_openai + json root = oaicompat ? format_embeddings_response_oaicompat(body, responses) : responses[0]; res_ok(res, root); @@ -3243,9 +3724,10 @@ int main(int argc, char ** argv) { // get the result std::unordered_set task_ids = server_task::get_list_id(tasks); - ctx_server.receive_cmpl_results(task_ids, [&](std::vector & results) { - for (const auto & res : results) { - responses.push_back(res.data); + ctx_server.receive_multi_results(task_ids, [&](std::vector & results) { + for (auto & res : results) { + GGML_ASSERT(dynamic_cast(res.get()) != nullptr); + responses.push_back(res->to_json()); } }, [&](const json & error_data) { res_error(res, error_data); @@ -3301,11 +3783,16 @@ int main(int argc, char ** argv) { const int id_task = ctx_server.queue_tasks.post(task); ctx_server.queue_results.add_waiting_task_id(id_task); - server_task_result result = ctx_server.queue_results.recv(id_task); + server_task_result_ptr result = ctx_server.queue_results.recv(id_task); ctx_server.queue_results.remove_waiting_task_id(id_task); - res_ok(res, result.data); - res.status = 200; // HTTP OK + if (result->is_error()) { + res_error(res, result->to_json()); + return; + } + + GGML_ASSERT(dynamic_cast(result.get()) != nullptr); + res_ok(res, result->to_json()); }; // diff --git a/examples/server/tests/README.md b/examples/server/tests/README.md index 2930a2e0dea0f..fa3d0a2f5ff66 100644 --- a/examples/server/tests/README.md +++ b/examples/server/tests/README.md @@ -44,4 +44,10 @@ To run with stdout/stderr display in real time (verbose output, but useful for d DEBUG=1 ./tests.sh -s -v -x ``` +Hint: You can compile and run test in single command, useful for local developement: + +```shell +cmake --build build -j --target llama-server && ./examples/server/tests/tests.sh +``` + To see all available arguments, please refer to [pytest documentation](https://docs.pytest.org/en/stable/how-to/usage.html) diff --git a/examples/server/tests/tests.sh b/examples/server/tests/tests.sh index 1e285dcdac14b..1e0777de367fc 100755 --- a/examples/server/tests/tests.sh +++ b/examples/server/tests/tests.sh @@ -1,5 +1,9 @@ #!/bin/bash +# make sure we are in the right directory +SCRIPT_DIR=$( cd -- "$( dirname -- "${BASH_SOURCE[0]}" )" &> /dev/null && pwd ) +cd $SCRIPT_DIR + set -eu if [ $# -lt 1 ] diff --git a/examples/server/tests/unit/test_chat_completion.py b/examples/server/tests/unit/test_chat_completion.py index 8a439f9ef0f29..f13c6c4ca4bd3 100644 --- a/examples/server/tests/unit/test_chat_completion.py +++ b/examples/server/tests/unit/test_chat_completion.py @@ -12,13 +12,13 @@ def create_server(): @pytest.mark.parametrize( - "model,system_prompt,user_prompt,max_tokens,re_content,n_prompt,n_predicted,truncated", + "model,system_prompt,user_prompt,max_tokens,re_content,n_prompt,n_predicted,finish_reason", [ - ("llama-2", "Book", "What is the best book", 8, "(Suddenly)+", 77, 8, False), - ("codellama70b", "You are a coding assistant.", "Write the fibonacci function in c++.", 128, "(Aside|she|felter|alonger)+", 104, 64, False), + (None, "Book", "What is the best book", 8, "(Suddenly)+", 77, 8, "length"), + ("codellama70b", "You are a coding assistant.", "Write the fibonacci function in c++.", 128, "(Aside|she|felter|alonger)+", 104, 64, "length"), ] ) -def test_chat_completion(model, system_prompt, user_prompt, max_tokens, re_content, n_prompt, n_predicted, truncated): +def test_chat_completion(model, system_prompt, user_prompt, max_tokens, re_content, n_prompt, n_predicted, finish_reason): global server server.start() res = server.make_request("POST", "/chat/completions", data={ @@ -30,29 +30,27 @@ def test_chat_completion(model, system_prompt, user_prompt, max_tokens, re_conte ], }) assert res.status_code == 200 + assert res.body["model"] == model if model is not None else server.model_alias assert res.body["usage"]["prompt_tokens"] == n_prompt assert res.body["usage"]["completion_tokens"] == n_predicted choice = res.body["choices"][0] assert "assistant" == choice["message"]["role"] assert match_regex(re_content, choice["message"]["content"]) - if truncated: - assert choice["finish_reason"] == "length" - else: - assert choice["finish_reason"] == "stop" + assert choice["finish_reason"] == finish_reason @pytest.mark.parametrize( - "model,system_prompt,user_prompt,max_tokens,re_content,n_prompt,n_predicted,truncated", + "system_prompt,user_prompt,max_tokens,re_content,n_prompt,n_predicted,finish_reason", [ - ("llama-2", "Book", "What is the best book", 8, "(Suddenly)+", 77, 8, False), - ("codellama70b", "You are a coding assistant.", "Write the fibonacci function in c++.", 128, "(Aside|she|felter|alonger)+", 104, 64, False), + ("Book", "What is the best book", 8, "(Suddenly)+", 77, 8, "length"), + ("You are a coding assistant.", "Write the fibonacci function in c++.", 128, "(Aside|she|felter|alonger)+", 104, 64, "length"), ] ) -def test_chat_completion_stream(model, system_prompt, user_prompt, max_tokens, re_content, n_prompt, n_predicted, truncated): +def test_chat_completion_stream(system_prompt, user_prompt, max_tokens, re_content, n_prompt, n_predicted, finish_reason): global server + server.model_alias = None # try using DEFAULT_OAICOMPAT_MODEL server.start() res = server.make_stream_request("POST", "/chat/completions", data={ - "model": model, "max_tokens": max_tokens, "messages": [ {"role": "system", "content": system_prompt}, @@ -63,16 +61,13 @@ def test_chat_completion_stream(model, system_prompt, user_prompt, max_tokens, r content = "" for data in res: choice = data["choices"][0] + assert "gpt-3.5" in data["model"] # DEFAULT_OAICOMPAT_MODEL, maybe changed in the future if choice["finish_reason"] in ["stop", "length"]: assert data["usage"]["prompt_tokens"] == n_prompt assert data["usage"]["completion_tokens"] == n_predicted assert "content" not in choice["delta"] assert match_regex(re_content, content) - # FIXME: not sure why this is incorrect in stream mode - # if truncated: - # assert choice["finish_reason"] == "length" - # else: - # assert choice["finish_reason"] == "stop" + assert choice["finish_reason"] == finish_reason else: assert choice["finish_reason"] is None content += choice["delta"]["content"] @@ -93,7 +88,7 @@ def test_chat_completion_with_openai_library(): temperature=0.8, ) print(res) - assert res.choices[0].finish_reason == "stop" + assert res.choices[0].finish_reason == "length" assert res.choices[0].message.content is not None assert match_regex("(Suddenly)+", res.choices[0].message.content) diff --git a/examples/server/tests/unit/test_completion.py b/examples/server/tests/unit/test_completion.py index 2fa30dd033431..1c3aa77de5bba 100644 --- a/examples/server/tests/unit/test_completion.py +++ b/examples/server/tests/unit/test_completion.py @@ -51,6 +51,24 @@ def test_completion_stream(prompt: str, n_predict: int, re_content: str, n_promp content += data["content"] +def test_completion_stream_vs_non_stream(): + global server + server.start() + res_stream = server.make_stream_request("POST", "/completion", data={ + "n_predict": 8, + "prompt": "I believe the meaning of life is", + "stream": True, + }) + res_non_stream = server.make_request("POST", "/completion", data={ + "n_predict": 8, + "prompt": "I believe the meaning of life is", + }) + content_stream = "" + for data in res_stream: + content_stream += data["content"] + assert content_stream == res_non_stream.body["content"] + + @pytest.mark.parametrize("n_slots", [1, 2]) def test_consistent_result_same_seed(n_slots: int): global server @@ -221,3 +239,24 @@ def check_slots_status(): assert len(res.body["content"]) > 10 # FIXME: the result is not deterministic when using other slot than slot 0 # assert match_regex(re_content, res.body["content"]) + + +def test_n_probs(): + global server + server.start() + res = server.make_request("POST", "/completion", data={ + "prompt": "I believe the meaning of life is", + "n_probs": 10, + "temperature": 0.0, + "n_predict": 5, + }) + assert res.status_code == 200 + assert "completion_probabilities" in res.body + assert len(res.body["completion_probabilities"]) == 5 + for tok in res.body["completion_probabilities"]: + assert "probs" in tok + assert len(tok["probs"]) == 10 + for prob in tok["probs"]: + assert "prob" in prob + assert "tok_str" in prob + assert 0.0 <= prob["prob"] <= 1.0 diff --git a/examples/server/utils.hpp b/examples/server/utils.hpp index e4451532c9d0c..a96116ac36caa 100644 --- a/examples/server/utils.hpp +++ b/examples/server/utils.hpp @@ -20,6 +20,7 @@ #include #include #include +#include #define DEFAULT_OAICOMPAT_MODEL "gpt-3.5-turbo-0613" @@ -40,17 +41,6 @@ using json = nlohmann::ordered_json; #define QUE_ERR(fmt, ...) LOG_ERR("que %12.*s: " fmt, 12, __func__, __VA_ARGS__) #define QUE_DBG(fmt, ...) LOG_DBG("que %12.*s: " fmt, 12, __func__, __VA_ARGS__) -// https://community.openai.com/t/openai-chat-list-of-error-codes-and-types/357791/11 -enum error_type { - ERROR_TYPE_INVALID_REQUEST, - ERROR_TYPE_AUTHENTICATION, - ERROR_TYPE_SERVER, - ERROR_TYPE_NOT_FOUND, - ERROR_TYPE_PERMISSION, - ERROR_TYPE_UNAVAILABLE, // custom error - ERROR_TYPE_NOT_SUPPORTED, // custom error -}; - template static T json_value(const json & body, const std::string & key, const T & default_value) { // Fallback null to default value @@ -485,48 +475,11 @@ static std::string tokens_to_output_formatted_string(const llama_context * ctx, return out; } -struct completion_token_output { - llama_token tok; - std::string text_to_send; - - struct token_prob { - llama_token tok; - float prob; - }; - - std::vector probs; -}; - -// convert a vector of completion_token_output to json -static json probs_vector_to_json(const llama_context * ctx, const std::vector & probs) { - json out = json::array(); - - for (const auto & prob : probs) { - json probs_for_token = json::array(); - - for (const auto & p : prob.probs) { - const std::string tok_str = tokens_to_output_formatted_string(ctx, p.tok); - probs_for_token.push_back(json { - {"tok_str", tok_str}, - {"prob", p.prob}, - }); - } - - const std::string tok_str = tokens_to_output_formatted_string(ctx, prob.tok); - out.push_back(json { - {"content", tok_str}, - {"probs", probs_for_token}, - }); - } - - return out; -} - static bool server_sent_event(httplib::DataSink & sink, const char * event, const json & data) { const std::string str = std::string(event) + ": " + data.dump(-1, ' ', false, json::error_handler_t::replace) + - "\n\n"; // note: these newlines are important (not sure why though, if you know, add a comment to explain) + "\n\n"; // required by RFC 8895 - A message is terminated by a blank line (two line terminators in a row). LOG_DBG("data stream, to_send: %s", str.c_str()); @@ -604,164 +557,6 @@ static json oaicompat_completion_params_parse( return llama_params; } -static json format_final_response_oaicompat(const json & request, const json & result, const std::string & completion_id, bool streaming = false, bool verbose = false) { - bool stopped_word = result.count("stopped_word") != 0; - bool stopped_eos = json_value(result, "stopped_eos", false); - int num_tokens_predicted = json_value(result, "tokens_predicted", 0); - int num_prompt_tokens = json_value(result, "tokens_evaluated", 0); - std::string content = json_value(result, "content", std::string("")); - - std::string finish_reason = "length"; - if (stopped_word || stopped_eos) { - finish_reason = "stop"; - } - - json choices = - streaming ? json::array({json{{"finish_reason", finish_reason}, - {"index", 0}, - {"delta", json::object()}}}) - : json::array({json{{"finish_reason", finish_reason}, - {"index", 0}, - {"message", json{{"content", content}, - {"role", "assistant"}}}}}); - - std::time_t t = std::time(0); - - json res = json { - {"choices", choices}, - {"created", t}, - {"model", - json_value(request, "model", std::string(DEFAULT_OAICOMPAT_MODEL))}, - {"object", streaming ? "chat.completion.chunk" : "chat.completion"}, - {"usage", json { - {"completion_tokens", num_tokens_predicted}, - {"prompt_tokens", num_prompt_tokens}, - {"total_tokens", num_tokens_predicted + num_prompt_tokens} - }}, - {"id", completion_id} - }; - - // extra fields for debugging purposes - if (verbose) { - res["__verbose"] = result; - } - - if (result.contains("completion_probabilities")) { - res["completion_probabilities"] = json_value(result, "completion_probabilities", json::array()); - } - - if (result.contains("timings")) { - res.push_back({"timings", json_value(result, "timings", json::object())}); - } - - return res; -} - -// return value is vector as there is one case where we might need to generate two responses -static std::vector format_partial_response_oaicompat(const json & result, const std::string & completion_id) { - if (!result.contains("model") || !result.contains("oaicompat_token_ctr")) { - return std::vector({result}); - } - - bool first = json_value(result, "oaicompat_token_ctr", 0) == 0; - std::string modelname = json_value(result, "model", std::string(DEFAULT_OAICOMPAT_MODEL)); - - bool stopped_word = json_value(result, "stopped_word", false); - bool stopped_eos = json_value(result, "stopped_eos", false); - bool stopped_limit = json_value(result, "stopped_limit", false); - std::string content = json_value(result, "content", std::string("")); - - std::string finish_reason; - if (stopped_word || stopped_eos) { - finish_reason = "stop"; - } - if (stopped_limit) { - finish_reason = "length"; - } - - std::time_t t = std::time(0); - - json choices; - - if (!finish_reason.empty()) { - choices = json::array({json{{"finish_reason", finish_reason}, - {"index", 0}, - {"delta", json::object()}}}); - } else { - if (first) { - if (content.empty()) { - choices = json::array({json{{"finish_reason", nullptr}, - {"index", 0}, - {"delta", json{{"role", "assistant"}}}}}); - } else { - // We have to send this as two updates to conform to openai behavior - json initial_ret = json{{"choices", json::array({json{ - {"finish_reason", nullptr}, - {"index", 0}, - {"delta", json{ - {"role", "assistant"} - }}}})}, - {"created", t}, - {"id", completion_id}, - {"model", modelname}, - {"object", "chat.completion.chunk"}}; - - json second_ret = json{ - {"choices", json::array({json{{"finish_reason", nullptr}, - {"index", 0}, - {"delta", json{ - {"content", content}}} - }})}, - {"created", t}, - {"id", completion_id}, - {"model", modelname}, - {"object", "chat.completion.chunk"}}; - - return std::vector({initial_ret, second_ret}); - } - } else { - // Some idiosyncrasy in task processing logic makes several trailing calls - // with empty content, we ignore these at the calee site. - if (content.empty()) { - return std::vector({json::object()}); - } - - choices = json::array({json{ - {"finish_reason", nullptr}, - {"index", 0}, - {"delta", - json{ - {"content", content}, - }}, - }}); - } - } - - json ret = json { - {"choices", choices}, - {"created", t}, - {"id", completion_id}, - {"model", modelname}, - {"object", "chat.completion.chunk"} - }; - - if (result.contains("timings")) { - ret.push_back({"timings", json_value(result, "timings", json::object())}); - } - - if (!finish_reason.empty()) { - int num_tokens_predicted = json_value(result, "tokens_predicted", 0); - int num_prompt_tokens = json_value(result, "tokens_evaluated", 0); - ret.push_back({"usage", json { - {"completion_tokens", num_tokens_predicted}, - {"prompt_tokens", num_prompt_tokens}, - {"total_tokens", num_tokens_predicted + num_prompt_tokens} - }}); - } - - return std::vector({ret}); -} - static json format_embeddings_response_oaicompat(const json & request, const json & embeddings) { json data = json::array(); int i = 0; @@ -853,43 +648,3 @@ static json format_detokenized_response(const std::string & content) { {"content", content} }; } - -static json format_error_response(const std::string & message, const enum error_type type) { - std::string type_str; - int code = 500; - switch (type) { - case ERROR_TYPE_INVALID_REQUEST: - type_str = "invalid_request_error"; - code = 400; - break; - case ERROR_TYPE_AUTHENTICATION: - type_str = "authentication_error"; - code = 401; - break; - case ERROR_TYPE_NOT_FOUND: - type_str = "not_found_error"; - code = 404; - break; - case ERROR_TYPE_SERVER: - type_str = "server_error"; - code = 500; - break; - case ERROR_TYPE_PERMISSION: - type_str = "permission_error"; - code = 403; - break; - case ERROR_TYPE_NOT_SUPPORTED: - type_str = "not_supported_error"; - code = 501; - break; - case ERROR_TYPE_UNAVAILABLE: - type_str = "unavailable_error"; - code = 503; - break; - } - return json { - {"code", code}, - {"message", message}, - {"type", type_str}, - }; -} From 064e58a78a79f0aaf65a48c30fefe83dba3239e2 Mon Sep 17 00:00:00 2001 From: Xuan Son Nguyen Date: Fri, 6 Dec 2024 13:29:05 +0100 Subject: [PATCH 154/245] common : bring back --no-warmup to server (#10686) --- common/arg.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/common/arg.cpp b/common/arg.cpp index 078c7538490c4..0db59f70187e8 100644 --- a/common/arg.cpp +++ b/common/arg.cpp @@ -786,7 +786,7 @@ common_params_context common_params_parser_init(common_params & params, llama_ex [](common_params & params) { params.warmup = false; } - ).set_examples({LLAMA_EXAMPLE_MAIN})); + ).set_examples({LLAMA_EXAMPLE_MAIN, LLAMA_EXAMPLE_SERVER})); add_opt(common_arg( {"--spm-infill"}, string_format( From 905344ebc151fafd4b16502153b4695e6b5bd24d Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Fri, 6 Dec 2024 21:33:15 +0200 Subject: [PATCH 155/245] convert : add custom attention mapping --- gguf-py/gguf/tensor_mapping.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/gguf-py/gguf/tensor_mapping.py b/gguf-py/gguf/tensor_mapping.py index 1b6a3f4add875..f0a7b6478508e 100644 --- a/gguf-py/gguf/tensor_mapping.py +++ b/gguf-py/gguf/tensor_mapping.py @@ -146,6 +146,7 @@ class TensorNameMap: # Attention query MODEL_TENSOR.ATTN_Q: ( "model.layers.{bid}.self_attn.q_proj", # llama-hf nemotron olmoe olmo2 + "model.layers.{bid}.self_attn.q_proj_no_perm", # llama-custom "layers.{bid}.attention.wq", # llama-pth "encoder.layer.{bid}.attention.self.query", # bert "transformer.h.{bid}.attn.q_proj", # gpt-j @@ -158,6 +159,7 @@ class TensorNameMap: # Attention key MODEL_TENSOR.ATTN_K: ( "model.layers.{bid}.self_attn.k_proj", # llama-hf nemotron olmoe olmo2 + "model.layers.{bid}.self_attn.k_proj_no_perm", # llama-custom "layers.{bid}.attention.wk", # llama-pth "encoder.layer.{bid}.attention.self.key", # bert "transformer.h.{bid}.attn.k_proj", # gpt-j From a1b52c1a3748f24b0629e4609af1627215963b2c Mon Sep 17 00:00:00 2001 From: Sukriti Sharma Date: Sat, 7 Dec 2024 00:02:14 -0700 Subject: [PATCH 156/245] convert : add support for Roberta embeddings (#10695) --- convert_hf_to_gguf.py | 8 +- convert_hf_to_gguf_update.py | 1 + models/ggml-vocab-roberta-bpe.gguf.inp | 112 +++++++++++++++++++++++++ models/ggml-vocab-roberta-bpe.gguf.out | 46 ++++++++++ 4 files changed, 165 insertions(+), 2 deletions(-) create mode 100644 models/ggml-vocab-roberta-bpe.gguf.inp create mode 100644 models/ggml-vocab-roberta-bpe.gguf.out diff --git a/convert_hf_to_gguf.py b/convert_hf_to_gguf.py index 9f1419e29eb4e..a4eece934021f 100755 --- a/convert_hf_to_gguf.py +++ b/convert_hf_to_gguf.py @@ -661,6 +661,9 @@ def get_vocab_base_pre(self, tokenizer) -> str: if chkhsh == "1431a23e583c97432bc230bff598d103ddb5a1f89960c8f1d1051aaa944d0b35": # ref: https://huggingface.co/sapienzanlp/Minerva-7B-base-v1.0 res = "minerva-7b" + if chkhsh == "8b5a93ed704057481f240da0be7e7dca721d7f8f4755263b6807227a2cbeae65": + # ref: https://huggingface.co/sentence-transformers/stsb-roberta-base + res = "roberta-bpe" if res is None: logger.warning("\n") @@ -2533,7 +2536,7 @@ def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iter return [(self.map_tensor_name(name), data_torch)] -@Model.register("BertModel", "CamembertModel") +@Model.register("BertModel", "CamembertModel", "RobertaModel") class BertModel(Model): model_arch = gguf.MODEL_ARCH.BERT @@ -2574,7 +2577,8 @@ def set_vocab(self): # we need this to validate the size of the token_type embeddings # though currently we are passing all zeros to the token_type embeddings - self.gguf_writer.add_token_type_count(2) # "Sequence A" or "Sequence B" + # "Sequence A" or "Sequence B" + self.gguf_writer.add_token_type_count(self.hparams.get("type_vocab_size", 1)) # convert to phantom space vocab def phantom(tok): diff --git a/convert_hf_to_gguf_update.py b/convert_hf_to_gguf_update.py index ce3c571df3453..aa10e5db796a0 100755 --- a/convert_hf_to_gguf_update.py +++ b/convert_hf_to_gguf_update.py @@ -103,6 +103,7 @@ class TOKENIZER_TYPE(IntEnum): {"name": "phi-2", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/microsoft/phi-2", }, {"name": "chameleon", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/facebook/chameleon-7b", }, {"name": "minerva-7b", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/sapienzanlp/Minerva-7B-base-v1.0", }, + {"name": "roberta-bpe", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/sentence-transformers/stsb-roberta-base"}, ] diff --git a/models/ggml-vocab-roberta-bpe.gguf.inp b/models/ggml-vocab-roberta-bpe.gguf.inp new file mode 100644 index 0000000000000..9baf7d77ae6b5 --- /dev/null +++ b/models/ggml-vocab-roberta-bpe.gguf.inp @@ -0,0 +1,112 @@ +ied 4 ½ months +__ggml_vocab_test__ +Führer +__ggml_vocab_test__ + +__ggml_vocab_test__ + +__ggml_vocab_test__ + +__ggml_vocab_test__ + +__ggml_vocab_test__ + +__ggml_vocab_test__ + + +__ggml_vocab_test__ + + + +__ggml_vocab_test__ + + + + +__ggml_vocab_test__ + + +__ggml_vocab_test__ +Hello world +__ggml_vocab_test__ + Hello world +__ggml_vocab_test__ +Hello World +__ggml_vocab_test__ + Hello World +__ggml_vocab_test__ + Hello World! +__ggml_vocab_test__ +Hello, world! +__ggml_vocab_test__ + Hello, world! +__ggml_vocab_test__ + this is 🦙.cpp +__ggml_vocab_test__ +w048 7tuijk dsdfhu +__ggml_vocab_test__ +нещо на Български +__ggml_vocab_test__ +កាន់តែពិសេសអាចខលចេញ +__ggml_vocab_test__ +🚀 (normal) 😶‍🌫️ (multiple emojis concatenated) ✅ (only emoji that has its own token) +__ggml_vocab_test__ +Hello +__ggml_vocab_test__ + Hello +__ggml_vocab_test__ + Hello +__ggml_vocab_test__ + Hello +__ggml_vocab_test__ + Hello +__ggml_vocab_test__ + Hello + Hello +__ggml_vocab_test__ + ( +__ggml_vocab_test__ + + = +__ggml_vocab_test__ +' era +__ggml_vocab_test__ +Hello, y'all! How are you 😁 ?我想在apple工作1314151天~ +__ggml_vocab_test__ +!!!!!! +__ggml_vocab_test__ +3 +__ggml_vocab_test__ +33 +__ggml_vocab_test__ +333 +__ggml_vocab_test__ +3333 +__ggml_vocab_test__ +33333 +__ggml_vocab_test__ +333333 +__ggml_vocab_test__ +3333333 +__ggml_vocab_test__ +33333333 +__ggml_vocab_test__ +333333333 +__ggml_vocab_test__ +Cửa Việt +__ggml_vocab_test__ + discards +__ggml_vocab_test__ + + + + + + + + + + + +🚀 (normal) 😶‍🌫️ (multiple emojis concatenated) ✅ 🦙🦙 3 33 333 3333 33333 333333 3333333 33333333 3.3 3..3 3...3 កាន់តែពិសេសអាច😁 ?我想在apple工作1314151天~ ------======= нещо на Български ''''''```````""""......!!!!!!?????? I've been 'told he's there, 'RE you sure? 'M not sure I'll make it, 'D you like some tea? We'Ve a'lL +__ggml_vocab_test__ diff --git a/models/ggml-vocab-roberta-bpe.gguf.out b/models/ggml-vocab-roberta-bpe.gguf.out new file mode 100644 index 0000000000000..f181ac3dcc5bc --- /dev/null +++ b/models/ggml-vocab-roberta-bpe.gguf.out @@ -0,0 +1,46 @@ + 2550 204 18430 377 + 597 2768 298 8564 + + 1437 + 1437 1437 + 1437 1437 1437 + 50117 + 50118 + 50140 + 50140 50118 + 50117 50118 + 31414 232 + 20920 232 + 31414 623 + 20920 623 + 20920 623 328 + 31414 6 232 328 + 20920 6 232 328 + 42 16 8103 18164 27 4 49317 + 605 40976 262 10109 18474 385 29 36807 6455 + 36765 25482 22063 23171 34251 18697 10809 26161 18697 3602 22063 27969 40966 25417 15264 26161 24269 36709 41171 35328 + 1376 17772 7471 1376 17772 19002 1376 17772 9085 1376 4333 13859 1376 17772 9357 1376 4333 9264 1376 17772 25448 1376 17772 18400 1376 17772 4333 1376 4333 10172 1376 17772 4333 1376 17772 7258 1376 17772 19002 1376 17772 5782 1376 17772 10172 1376 17772 3726 1376 17772 5782 1376 4333 10172 1376 17772 23171 + 6569 15113 7471 36 21113 43 17841 19002 17 8384 6569 14285 4958 12605 36 34654 2841 4203 354 10146 26511 1070 43 36174 5782 36 8338 21554 14 34 63 308 19233 43 + 31414 + 20920 + 1437 20920 + 1437 1437 20920 + 1437 1437 1437 20920 + 1437 1437 1437 20920 50118 1437 1437 1437 20920 + 36 + 50118 5457 + 108 3567 + 31414 6 1423 108 1250 328 1336 32 47 17841 10172 17487 47876 3602 48617 15264 46537 11423 27326 48494 8210 49233 1558 1570 27761 49429 43251 10809 17772 + 32376 12846 + 246 + 3103 + 25631 + 46152 + 3103 25631 + 46152 3103 + 46152 25631 + 46152 46152 + 46152 3103 25631 + 347 1376 2023 12410 102 16376 1376 2023 6382 90 + 9553 5954 + 50118 1437 50140 1437 50140 50118 1437 50117 1437 50117 50117 1437 50117 50118 1437 1437 50118 1437 1437 1437 50118 1437 1437 1437 1437 50118 1437 1437 1437 1437 1437 50118 6569 15113 7471 36 21113 43 17841 19002 17 8384 6569 14285 4958 12605 36 34654 2841 4203 354 10146 26511 1070 43 36174 5782 8103 18164 27 6569 18164 27 155 2357 30242 155 25631 30242 3103 30242 25631 30242 46152 30242 3103 25631 155 4 246 155 7586 246 155 734 246 25974 17772 7471 1376 17772 19002 1376 17772 9085 1376 4333 13859 1376 17772 9357 1376 4333 9264 1376 17772 25448 1376 17772 18400 1376 17772 4333 1376 4333 10172 1376 17772 4333 1376 17772 7258 1376 17772 19002 1376 17772 5782 18636 10172 17487 47876 3602 48617 15264 46537 11423 27326 48494 8210 49233 1558 1570 27761 49429 43251 10809 17772 36738 48332 47463 18697 10809 25482 22063 23171 34251 18697 10809 26161 18697 3602 22063 27969 40966 25417 15264 26161 24269 36709 41171 35328 128 49690 108 49972 49519 12905 48149 48149 43796 32376 12846 27282 28749 38 348 57 128 41042 37 18 89 6 128 4629 47 686 116 128 448 45 686 38 581 146 24 6 128 495 47 101 103 6845 116 166 108 30660 10 108 462 574 From ebc1fad682c08d9652301a9e4d3fbec3164a4625 Mon Sep 17 00:00:00 2001 From: Robert Ormandi <52251610+ormandi@users.noreply.github.com> Date: Sat, 7 Dec 2024 01:55:01 -0600 Subject: [PATCH 157/245] metal : Extend how Llama.cpp locates metal resources (#10676) * metal : Extend how Llama.cpp locates metal resources (#10675) * It searches the resource file in the directory where the current binary is located as well. * Resolves symbolic links. Rationale: When we plug this dependency into a Bazel build and run it in the context of Bazel (e.g. testing): * the execution directory is often very different from where the files are located and no direct control over this (Bazel sandboxing), * the Bazel sandbox often use symbolic links to make files available. With this patch, we can have the resource file added to the target, can build and run tests in the context of Bazel. * Update ggml/src/ggml-metal/ggml-metal.m Co-authored-by: Georgi Gerganov * Update ggml/src/ggml-metal/ggml-metal.m Co-authored-by: Georgi Gerganov --------- Co-authored-by: Georgi Gerganov --- ggml/src/ggml-metal/ggml-metal.m | 29 +++++++++++++++++++++++++++++ 1 file changed, 29 insertions(+) diff --git a/ggml/src/ggml-metal/ggml-metal.m b/ggml/src/ggml-metal/ggml-metal.m index f80fda7a4ca38..34fe5778e4f09 100644 --- a/ggml/src/ggml-metal/ggml-metal.m +++ b/ggml/src/ggml-metal/ggml-metal.m @@ -510,6 +510,35 @@ @implementation GGMLMetalClass #endif NSString * path_lib = [bundle pathForResource:@"default" ofType:@"metallib"]; + if (path_lib == nil) { + // Try to find the resource in the directory where the current binary located. + NSString * current_binary = [[NSProcessInfo processInfo] arguments][0]; + NSString * bin_dir = [current_binary stringByDeletingLastPathComponent]; + NSString * default_metallib_path = [NSString pathWithComponents:@[bin_dir, @"default.metallib"]]; + if ([[NSFileManager defaultManager] isReadableFileAtPath:default_metallib_path]) { + GGML_LOG_INFO("%s: found '%s'\n", __func__, [default_metallib_path UTF8String]); + NSDictionary * atts = [[NSFileManager defaultManager] attributesOfItemAtPath:default_metallib_path error:&error]; + if (atts && atts[NSFileType] == NSFileTypeSymbolicLink) { + // Optionally, if this is a symlink, try to resolve it. + default_metallib_path = [[NSFileManager defaultManager] destinationOfSymbolicLinkAtPath:default_metallib_path error:&error]; + if (default_metallib_path && [default_metallib_path length] > 0 && ![[default_metallib_path substringToIndex:1] isEqualToString:@"/"]) { + // It is a relative path, adding the binary directory as directory prefix. + default_metallib_path = [NSString pathWithComponents:@[bin_dir, default_metallib_path]]; + } + if (!default_metallib_path || ![[NSFileManager defaultManager] isReadableFileAtPath:default_metallib_path]) { + // Link to the resource could not be resolved. + default_metallib_path = nil; + } else { + GGML_LOG_INFO("%s: symlink resolved '%s'\n", __func__, [default_metallib_path UTF8String]); + } + } + } else { + // The resource couldn't be found in the binary's directory. + default_metallib_path = nil; + } + path_lib = default_metallib_path; + } + if (try_metallib && path_lib != nil) { // pre-compiled library found NSURL * libURL = [NSURL fileURLWithPath:path_lib]; From a507932c73e17a4754c607ff19314f68303632b1 Mon Sep 17 00:00:00 2001 From: 0cc4m Date: Sat, 7 Dec 2024 10:24:15 +0100 Subject: [PATCH 158/245] Vulkan: VK_KHR_cooperative_matrix support to speed up prompt processing (#10597) * Vulkan: Implement VK_KHR_cooperative_matrix support in the matrix matrix multiplication shader * Improve performance with better q4_k and q5_k dequant and store unrolling * Add Vulkan MUL_MAT and MUL_MAT_ID accumulator precision selection * Rework mulmat shader selection and compilation logic, avoid compiling shaders that won't get used by device * Vulkan: Implement accumulator switch for specific mul mat mat shaders * Vulkan: Unroll more loops for more mul mat mat performance * Vulkan: Add VK_AMD_shader_core_properties2 support to read Compute Unit count for split_k logic * Disable coopmat support on AMD proprietary driver * Remove redundant checks * Add environment variable GGML_VK_DISABLE_COOPMAT to disable VK_KHR_cooperative_matrix support * Fix rebase typo * Fix coopmat2 MUL_MAT_ID pipeline selection --- ggml/src/ggml-vulkan/ggml-vulkan.cpp | 845 +++++++++++------- .../ggml-vulkan/vulkan-shaders/mul_mm.comp | 229 +++-- .../vulkan-shaders/vulkan-shaders-gen.cpp | 73 +- 3 files changed, 750 insertions(+), 397 deletions(-) diff --git a/ggml/src/ggml-vulkan/ggml-vulkan.cpp b/ggml/src/ggml-vulkan/ggml-vulkan.cpp index c7ac0e8f7bd70..4b2e449a58b40 100644 --- a/ggml/src/ggml-vulkan/ggml-vulkan.cpp +++ b/ggml/src/ggml-vulkan/ggml-vulkan.cpp @@ -1,7 +1,8 @@ #include "ggml-vulkan.h" #include -#if defined(GGML_VULKAN_RUN_TESTS) || defined(GGML_VULKAN_PERF) +#if defined(GGML_VULKAN_RUN_TESTS) || defined(GGML_VULKAN_PERF) || defined(GGML_VULKAN_CHECK_RESULTS) #include +#include "ggml-cpu.h" #endif #include @@ -169,8 +170,22 @@ struct vk_device_struct { bool uma; bool coopmat2; + bool coopmat_support; + bool coopmat_acc_f32_support; + bool coopmat_acc_f16_support; + uint32_t coopmat_m; + uint32_t coopmat_n; + uint32_t coopmat_k; + size_t idx; + bool mul_mat_l; + bool mul_mat_m; + bool mul_mat_s; + bool mul_mat_id_l; + bool mul_mat_id_m; + bool mul_mat_id_s; + vk_matmul_pipeline pipeline_matmul_f32; vk_matmul_pipeline pipeline_matmul_f32_f16; vk_matmul_pipeline2 pipeline_matmul_f16; @@ -181,10 +196,10 @@ struct vk_device_struct { vk_matmul_pipeline2 pipeline_dequant_mul_mat_mat[GGML_TYPE_COUNT]; vk_matmul_pipeline pipeline_matmul_id_f32; - vk_matmul_pipeline pipeline_matmul_id_f16; - vk_matmul_pipeline pipeline_matmul_id_f16_f32; + vk_matmul_pipeline2 pipeline_matmul_id_f16; + vk_matmul_pipeline2 pipeline_matmul_id_f16_f32; - vk_matmul_pipeline pipeline_dequant_mul_mat_mat_id[GGML_TYPE_COUNT]; + vk_matmul_pipeline2 pipeline_dequant_mul_mat_mat_id[GGML_TYPE_COUNT]; vk_pipeline pipeline_dequant[GGML_TYPE_COUNT]; vk_pipeline pipeline_dequant_mul_mat_vec_f32_f32[GGML_TYPE_COUNT]; @@ -1325,6 +1340,18 @@ static std::array fa_rows_cols(uint32_t D, uint32_t clamp, ggml_typ return {64, 64}; }; +static bool ggml_vk_matmul_shmem_support(const vk_device& device, const std::vector& warptile, bool mul_mat_id) { + // Needs to be kept up to date on shader changes + const uint32_t bank_conflict_offset = device->coopmat_support ? 8 : 1; + const uint32_t type_size = device->fp16 ? sizeof(ggml_fp16_t) : sizeof(float); + const uint32_t warps = warptile[0] / device->subgroup_size; + + const uint32_t load_bufs = (warptile[1] + warptile[2]) * (warptile[3] + bank_conflict_offset) * type_size; + const uint32_t mmid_row_ids = mul_mat_id ? 3072 * sizeof(uint32_t) : 0; + const uint32_t coopmat_stage = device->coopmat_support ? warptile[7] * warptile[8] / warps * sizeof(float) : 0; + + return (load_bufs + mmid_row_ids + coopmat_stage) <= device->properties.limits.maxComputeSharedMemorySize; +} static void ggml_vk_load_shaders(vk_device& device) { VK_LOG_DEBUG("ggml_vk_load_shaders(" << device->name << ")"); @@ -1382,12 +1409,25 @@ static void ggml_vk_load_shaders(vk_device& device) { m_align = 64; s_align = 32; } else { - l_warptile = { 128, 128, 128, 16, device->subgroup_size * 2, 64, 2, 4, 4, device->subgroup_size }; - m_warptile = { 128, 64, 64, 16, device->subgroup_size, 32, 2, 4, 2, device->subgroup_size }; - s_warptile = { subgroup_size_16, 32, 32, 16, 32, 32, 2, 2, 2, device->subgroup_size }; - l_warptile_mmq = { 128, 128, 128, 32, device->subgroup_size * 2, 64, 2, 4, 4, device->subgroup_size }; - m_warptile_mmq = { 128, 64, 64, 32, device->subgroup_size, 32, 2, 4, 2, device->subgroup_size }; - s_warptile_mmq = { subgroup_size_16, 32, 32, 32, 32, 32, 2, 2, 2, device->subgroup_size }; + // Matrix cores require different warp group sizes + const uint32_t tm_l = device->coopmat_support ? device->coopmat_m : 4; + const uint32_t tm_m = device->coopmat_support ? device->coopmat_m : 4; + const uint32_t tm_s = device->coopmat_support ? device->coopmat_m : 2; + const uint32_t tn_l = device->coopmat_support ? device->coopmat_n : 4; + const uint32_t tn_m = device->coopmat_support ? device->coopmat_n : 2; + const uint32_t tn_s = device->coopmat_support ? device->coopmat_n : 2; + const uint32_t tk_l = device->coopmat_support ? device->coopmat_k : 1; + const uint32_t tk_m = device->coopmat_support ? device->coopmat_k : 1; + const uint32_t tk_s = device->coopmat_support ? device->coopmat_k : 1; + + l_warptile = { 128, 128, 128, 16, device->subgroup_size * 2, 64, 2, tm_l, tn_l, tk_l, device->subgroup_size }; + m_warptile = { 128, 64, 64, 16, device->subgroup_size, 32, 2, tm_m, tn_m, tk_m, device->subgroup_size }; + s_warptile = { subgroup_size_16, 32, 32, 16, 32, 32, 2, tm_s, tn_s, tk_s, device->subgroup_size }; + + l_warptile_mmq = { 128, 128, 128, 32, device->subgroup_size * 2, 64, 2, tm_l, tn_l, tk_l, device->subgroup_size }; + m_warptile_mmq = { 128, 64, 64, 32, device->subgroup_size, 32, 2, tm_m, tn_m, tk_m, device->subgroup_size }; + s_warptile_mmq = { subgroup_size_16, 32, 32, 32, 32, 32, 2, tm_s, tn_s, tk_s, device->subgroup_size }; + l_mmq_wg_denoms = l_wg_denoms = {128, 128, 1 }; m_mmq_wg_denoms = m_wg_denoms = { 64, 64, 1 }; s_mmq_wg_denoms = s_wg_denoms = { 32, 32, 1 }; @@ -1428,25 +1468,36 @@ static void ggml_vk_load_shaders(vk_device& device) { // assert mul_mat_mat_id shaders will fit. GGML_ASSERT(shmem_needed + 3072*4 <= device->properties.limits.maxComputeSharedMemorySize); } + // Disable medium and large matrix multiplication if not enough shared memory is available + // Check mmq warptiles as the largest configuration + // Throw an error if not enough for any matrix multiplication is available + if (!ggml_vk_matmul_shmem_support(device, s_warptile_mmq, false)) { + std::cerr << "ggml_vulkan: Error: Shared memory size too small for matrix multiplication." << std::endl; + throw std::runtime_error("Shared memory size too small for matrix multiplication."); + } else if (!ggml_vk_matmul_shmem_support(device, m_warptile_mmq, false)) { + device->mul_mat_m = false; + device->mul_mat_l = false; + } else if (!ggml_vk_matmul_shmem_support(device, l_warptile_mmq, false)) { + device->mul_mat_l = false; + } + + // Disable mul_mat_id if not enough shared memory is available + if (!ggml_vk_matmul_shmem_support(device, s_warptile_mmq, true)) { + device->mul_mat_id_s = false; + device->mul_mat_id_m = false; + device->mul_mat_id_l = false; + } else if (!ggml_vk_matmul_shmem_support(device, m_warptile_mmq, true)) { + device->mul_mat_id_m = false; + device->mul_mat_id_l = false; + } else if (!ggml_vk_matmul_shmem_support(device, l_warptile_mmq, true)) { + device->mul_mat_id_l = false; + } } device->pipeline_matmul_f32 = std::make_shared(); device->pipeline_matmul_f32_f16 = std::make_shared(); device->pipeline_matmul_id_f32 = std::make_shared(); - device->pipeline_matmul_id_f16_f32 = std::make_shared(); - device->pipeline_matmul_id_f16 = std::make_shared(); - device->pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q4_0] = std::make_shared(); - device->pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q4_1] = std::make_shared(); - device->pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q5_0] = std::make_shared(); - device->pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q5_1] = std::make_shared(); - device->pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q8_0] = std::make_shared(); - device->pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q2_K] = std::make_shared(); - device->pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q3_K] = std::make_shared(); - device->pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q4_K] = std::make_shared(); - device->pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q5_K] = std::make_shared(); - device->pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q6_K] = std::make_shared(); - device->pipeline_dequant_mul_mat_mat_id[GGML_TYPE_IQ4_NL] = std::make_shared(); std::vector> compiles; auto const &ggml_vk_create_pipeline = [&](vk_device& device, vk_pipeline& pipeline, const std::string &name, size_t spv_size, const void* spv_data, const std::string &entrypoint, uint32_t parameter_count, uint32_t push_constant_size, std::array wg_denoms, const std::vector& specialization_constants, uint32_t align, bool disable_robustness = false) { @@ -1543,119 +1594,191 @@ static void ggml_vk_load_shaders(vk_device& device) { CREATE_MM(pipeline_dequant_mul_mat_mat_f16[GGML_TYPE_IQ4_NL].f16acc, matmul_iq4_nl_f16, _f16acc, mmq_wg_denoms, warptile_mmq, vk_mat_mat_push_constants, 3) CREATE_MM(pipeline_matmul_id_f32, matmul_id_f32_f32, , wg_denoms, warptile, vk_mat_mat_id_push_constants, 4) - CREATE_MM(pipeline_matmul_id_f16, matmul_id_f16, , wg_denoms, warptile, vk_mat_mat_id_push_constants, 4) - CREATE_MM(pipeline_matmul_id_f16_f32, matmul_id_f16_f32, , wg_denoms, warptile, vk_mat_mat_id_push_constants, 4) - - CREATE_MM(pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q4_0], matmul_id_q4_0_f32, , mmqid_wg_denoms, warptile_mmqid, vk_mat_mat_id_push_constants, 4) - CREATE_MM(pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q4_1], matmul_id_q4_1_f32, , mmqid_wg_denoms, warptile_mmqid, vk_mat_mat_id_push_constants, 4) - CREATE_MM(pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q5_0], matmul_id_q5_0_f32, , mmqid_wg_denoms, warptile_mmqid, vk_mat_mat_id_push_constants, 4) - CREATE_MM(pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q5_1], matmul_id_q5_1_f32, , mmqid_wg_denoms, warptile_mmqid, vk_mat_mat_id_push_constants, 4) - CREATE_MM(pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q8_0], matmul_id_q8_0_f32, , mmqid_wg_denoms, warptile_mmqid, vk_mat_mat_id_push_constants, 4) - CREATE_MM(pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q2_K], matmul_id_q2_k_f32, , mmqid_wg_denoms, warptile_mmqid, vk_mat_mat_id_push_constants, 4) - CREATE_MM(pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q3_K], matmul_id_q3_k_f32, , mmqid_wg_denoms, warptile_mmqid, vk_mat_mat_id_push_constants, 4) - CREATE_MM(pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q4_K], matmul_id_q4_k_f32, , mmqid_wg_denoms, warptile_mmqid, vk_mat_mat_id_push_constants, 4) - CREATE_MM(pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q5_K], matmul_id_q5_k_f32, , mmqid_wg_denoms, warptile_mmqid, vk_mat_mat_id_push_constants, 4) - CREATE_MM(pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q6_K], matmul_id_q6_k_f32, , mmqid_wg_denoms, warptile_mmqid, vk_mat_mat_id_push_constants, 4) - CREATE_MM(pipeline_dequant_mul_mat_mat_id[GGML_TYPE_IQ4_NL], matmul_id_iq4_nl_f32, , mmqid_wg_denoms, warptile_mmqid, vk_mat_mat_id_push_constants, 4) + CREATE_MM2(pipeline_matmul_id_f16, matmul_id_f16, wg_denoms, warptile, vk_mat_mat_id_push_constants, 4) + CREATE_MM2(pipeline_matmul_id_f16_f32, matmul_id_f16_f32, wg_denoms, warptile, vk_mat_mat_id_push_constants, 4) + + CREATE_MM(pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q4_0].f16acc, matmul_id_q4_0_f32, , mmqid_wg_denoms, warptile_mmqid, vk_mat_mat_id_push_constants, 4) + CREATE_MM(pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q4_1].f16acc, matmul_id_q4_1_f32, , mmqid_wg_denoms, warptile_mmqid, vk_mat_mat_id_push_constants, 4) + CREATE_MM(pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q5_0].f16acc, matmul_id_q5_0_f32, , mmqid_wg_denoms, warptile_mmqid, vk_mat_mat_id_push_constants, 4) + CREATE_MM(pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q5_1].f16acc, matmul_id_q5_1_f32, , mmqid_wg_denoms, warptile_mmqid, vk_mat_mat_id_push_constants, 4) + CREATE_MM(pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q8_0].f16acc, matmul_id_q8_0_f32, , mmqid_wg_denoms, warptile_mmqid, vk_mat_mat_id_push_constants, 4) + CREATE_MM(pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q2_K].f16acc, matmul_id_q2_k_f32, , mmqid_wg_denoms, warptile_mmqid, vk_mat_mat_id_push_constants, 4) + CREATE_MM(pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q3_K].f16acc, matmul_id_q3_k_f32, , mmqid_wg_denoms, warptile_mmqid, vk_mat_mat_id_push_constants, 4) + CREATE_MM(pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q4_K].f16acc, matmul_id_q4_k_f32, , mmqid_wg_denoms, warptile_mmqid, vk_mat_mat_id_push_constants, 4) + CREATE_MM(pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q5_K].f16acc, matmul_id_q5_k_f32, , mmqid_wg_denoms, warptile_mmqid, vk_mat_mat_id_push_constants, 4) + CREATE_MM(pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q6_K].f16acc, matmul_id_q6_k_f32, , mmqid_wg_denoms, warptile_mmqid, vk_mat_mat_id_push_constants, 4) + CREATE_MM(pipeline_dequant_mul_mat_mat_id[GGML_TYPE_IQ4_NL].f16acc, matmul_id_iq4_nl_f32, , mmqid_wg_denoms, warptile_mmqid, vk_mat_mat_id_push_constants, 4) #undef CREATE_MM #undef CREATE_MM2 } else -#endif - if (device->fp16) { +#endif // defined(VK_NV_cooperative_matrix2) + if (device->coopmat_support) { // Create 6 variants, {s,m,l}x{unaligned,aligned} -#define CREATE_MM(PIPELINE_NAME, NAMELC, F16ACC, WG_DENOMS, WARPTILE, PUSHCONST, PARAMCOUNT) \ - ggml_vk_create_pipeline(device, device-> PIPELINE_NAME ->l, #NAMELC #F16ACC "_l", NAMELC ## F16ACC ## _len, NAMELC ## F16ACC ## _data, "main", PARAMCOUNT, sizeof(PUSHCONST), l_ ## WG_DENOMS, l_ ## WARPTILE, 1); \ - ggml_vk_create_pipeline(device, device-> PIPELINE_NAME ->m, #NAMELC #F16ACC "_m", NAMELC ## F16ACC ## _len, NAMELC ## F16ACC ## _data, "main", PARAMCOUNT, sizeof(PUSHCONST), m_ ## WG_DENOMS, m_ ## WARPTILE, 1); \ - ggml_vk_create_pipeline(device, device-> PIPELINE_NAME ->s, #NAMELC #F16ACC "_s", NAMELC ## F16ACC ## _len, NAMELC ## F16ACC ## _data, "main", PARAMCOUNT, sizeof(PUSHCONST), s_ ## WG_DENOMS, s_ ## WARPTILE, 1); \ - ggml_vk_create_pipeline(device, device-> PIPELINE_NAME ->a_l, #NAMELC #F16ACC "_aligned_l", NAMELC ## _aligned ## F16ACC ## _len, NAMELC ## _aligned ## F16ACC ## _data, "main", PARAMCOUNT, sizeof(PUSHCONST), l_ ## WG_DENOMS, l_ ## WARPTILE, l_align); \ - ggml_vk_create_pipeline(device, device-> PIPELINE_NAME ->a_m, #NAMELC #F16ACC "_aligned_m", NAMELC ## _aligned ## F16ACC ## _len, NAMELC ## _aligned ## F16ACC ## _data, "main", PARAMCOUNT, sizeof(PUSHCONST), m_ ## WG_DENOMS, m_ ## WARPTILE, m_align); \ - ggml_vk_create_pipeline(device, device-> PIPELINE_NAME ->a_s, #NAMELC #F16ACC "_aligned_s", NAMELC ## _aligned ## F16ACC ## _len, NAMELC ## _aligned ## F16ACC ## _data, "main", PARAMCOUNT, sizeof(PUSHCONST), s_ ## WG_DENOMS, s_ ## WARPTILE, s_align); \ - - CREATE_MM(pipeline_matmul_f32, matmul_f32_f32, , wg_denoms, warptile, vk_mat_mat_push_constants, 3); - CREATE_MM(pipeline_matmul_f32_f16, matmul_f32_f16, , wg_denoms, warptile, vk_mat_mat_push_constants, 3); - CREATE_MM(pipeline_matmul_f16.f32acc, matmul_f16, , wg_denoms, warptile, vk_mat_mat_push_constants, 3); - CREATE_MM(pipeline_matmul_f16_f32.f32acc, matmul_f16_f32, , wg_denoms, warptile, vk_mat_mat_push_constants, 3); - - CREATE_MM(pipeline_dequant_mul_mat_mat[GGML_TYPE_Q4_0].f32acc, matmul_q4_0_f32, , mmq_wg_denoms, warptile_mmq, vk_mat_mat_push_constants, 3); - CREATE_MM(pipeline_dequant_mul_mat_mat[GGML_TYPE_Q4_1].f32acc, matmul_q4_1_f32, , mmq_wg_denoms, warptile_mmq, vk_mat_mat_push_constants, 3); - CREATE_MM(pipeline_dequant_mul_mat_mat[GGML_TYPE_Q5_0].f32acc, matmul_q5_0_f32, , mmq_wg_denoms, warptile_mmq, vk_mat_mat_push_constants, 3); - CREATE_MM(pipeline_dequant_mul_mat_mat[GGML_TYPE_Q5_1].f32acc, matmul_q5_1_f32, , mmq_wg_denoms, warptile_mmq, vk_mat_mat_push_constants, 3); - CREATE_MM(pipeline_dequant_mul_mat_mat[GGML_TYPE_Q8_0].f32acc, matmul_q8_0_f32, , mmq_wg_denoms, warptile_mmq, vk_mat_mat_push_constants, 3); - - CREATE_MM(pipeline_dequant_mul_mat_mat[GGML_TYPE_Q2_K].f32acc, matmul_q2_k_f32, , mmq_wg_denoms, warptile_mmq, vk_mat_mat_push_constants, 3); - CREATE_MM(pipeline_dequant_mul_mat_mat[GGML_TYPE_Q3_K].f32acc, matmul_q3_k_f32, , mmq_wg_denoms, warptile_mmq, vk_mat_mat_push_constants, 3); - CREATE_MM(pipeline_dequant_mul_mat_mat[GGML_TYPE_Q4_K].f32acc, matmul_q4_k_f32, , mmq_wg_denoms, warptile_mmq, vk_mat_mat_push_constants, 3); - CREATE_MM(pipeline_dequant_mul_mat_mat[GGML_TYPE_Q5_K].f32acc, matmul_q5_k_f32, , mmq_wg_denoms, warptile_mmq, vk_mat_mat_push_constants, 3); - CREATE_MM(pipeline_dequant_mul_mat_mat[GGML_TYPE_Q6_K].f32acc, matmul_q6_k_f32, , mmq_wg_denoms, warptile_mmq, vk_mat_mat_push_constants, 3); - CREATE_MM(pipeline_dequant_mul_mat_mat[GGML_TYPE_IQ4_NL].f32acc, matmul_iq4_nl_f32, , mmq_wg_denoms, warptile_mmq, vk_mat_mat_push_constants, 3); +#define CREATE_MM(PIPELINE_NAME, NAMELC, F16ACC, WG_DENOMS, WARPTILE, PUSHCONST, PARAMCOUNT, ID) \ + if (device->mul_mat ## ID ## _l) \ + ggml_vk_create_pipeline(device, device-> PIPELINE_NAME ->l, #NAMELC #F16ACC "_l", NAMELC ## F16ACC ## _coopmat_len, NAMELC ## F16ACC ## _coopmat_data, "main", PARAMCOUNT, sizeof(PUSHCONST), l_ ## WG_DENOMS, l_ ## WARPTILE, 1); \ + if (device->mul_mat ## ID ## _m) \ + ggml_vk_create_pipeline(device, device-> PIPELINE_NAME ->m, #NAMELC #F16ACC "_m", NAMELC ## F16ACC ## _coopmat_len, NAMELC ## F16ACC ## _coopmat_data, "main", PARAMCOUNT, sizeof(PUSHCONST), m_ ## WG_DENOMS, m_ ## WARPTILE, 1); \ + if (device->mul_mat ## ID ## _s) \ + ggml_vk_create_pipeline(device, device-> PIPELINE_NAME ->s, #NAMELC #F16ACC "_s", NAMELC ## F16ACC ## _coopmat_len, NAMELC ## F16ACC ## _coopmat_data, "main", PARAMCOUNT, sizeof(PUSHCONST), s_ ## WG_DENOMS, s_ ## WARPTILE, 1); \ + if (device->mul_mat ## ID ## _l) \ + ggml_vk_create_pipeline(device, device-> PIPELINE_NAME ->a_l, #NAMELC #F16ACC "_aligned_l", NAMELC ## _aligned ## F16ACC ## _coopmat_len, NAMELC ## _aligned ## F16ACC ## _coopmat_data, "main", PARAMCOUNT, sizeof(PUSHCONST), l_ ## WG_DENOMS, l_ ## WARPTILE, l_align); \ + if (device->mul_mat ## ID ## _m) \ + ggml_vk_create_pipeline(device, device-> PIPELINE_NAME ->a_m, #NAMELC #F16ACC "_aligned_m", NAMELC ## _aligned ## F16ACC ## _coopmat_len, NAMELC ## _aligned ## F16ACC ## _coopmat_data, "main", PARAMCOUNT, sizeof(PUSHCONST), m_ ## WG_DENOMS, m_ ## WARPTILE, m_align); \ + if (device->mul_mat ## ID ## _s) \ + ggml_vk_create_pipeline(device, device-> PIPELINE_NAME ->a_s, #NAMELC #F16ACC "_aligned_s", NAMELC ## _aligned ## F16ACC ## _coopmat_len, NAMELC ## _aligned ## F16ACC ## _coopmat_data, "main", PARAMCOUNT, sizeof(PUSHCONST), s_ ## WG_DENOMS, s_ ## WARPTILE, s_align); \ + + // Create 2 variants, {f16,f32} accumulator +#define CREATE_MM2(PIPELINE_NAME, NAMELC, WG_DENOMS, WARPTILE, PUSHCONST, PARAMCOUNT, ID) \ + CREATE_MM(PIPELINE_NAME . f16acc, NAMELC, _f16acc, WG_DENOMS, WARPTILE, PUSHCONST, PARAMCOUNT, ID) \ + CREATE_MM(PIPELINE_NAME . f32acc, NAMELC, , WG_DENOMS, WARPTILE, PUSHCONST, PARAMCOUNT, ID) \ + + CREATE_MM(pipeline_matmul_f32, matmul_f32_f32, , wg_denoms, warptile, vk_mat_mat_push_constants, 3, ); + CREATE_MM(pipeline_matmul_f32_f16, matmul_f32_f16, , wg_denoms, warptile, vk_mat_mat_push_constants, 3, ); + CREATE_MM2(pipeline_matmul_f16, matmul_f16, wg_denoms, warptile, vk_mat_mat_push_constants, 3, ); + CREATE_MM2(pipeline_matmul_f16_f32, matmul_f16_f32, wg_denoms, warptile, vk_mat_mat_push_constants, 3, ); + + CREATE_MM(pipeline_dequant_mul_mat_mat[GGML_TYPE_Q4_0].f16acc, matmul_q4_0_f32, _f16acc, wg_denoms, warptile_mmq, vk_mat_mat_push_constants, 3, ); + CREATE_MM(pipeline_dequant_mul_mat_mat[GGML_TYPE_Q4_1].f16acc, matmul_q4_1_f32, _f16acc, wg_denoms, warptile_mmq, vk_mat_mat_push_constants, 3, ); + CREATE_MM(pipeline_dequant_mul_mat_mat[GGML_TYPE_Q5_0].f16acc, matmul_q5_0_f32, _f16acc, wg_denoms, warptile_mmq, vk_mat_mat_push_constants, 3, ); + CREATE_MM(pipeline_dequant_mul_mat_mat[GGML_TYPE_Q5_1].f16acc, matmul_q5_1_f32, _f16acc, wg_denoms, warptile_mmq, vk_mat_mat_push_constants, 3, ); + CREATE_MM(pipeline_dequant_mul_mat_mat[GGML_TYPE_Q8_0].f16acc, matmul_q8_0_f32, _f16acc, wg_denoms, warptile_mmq, vk_mat_mat_push_constants, 3, ); + + CREATE_MM(pipeline_dequant_mul_mat_mat[GGML_TYPE_Q2_K].f16acc, matmul_q2_k_f32, _f16acc, wg_denoms, warptile_mmq, vk_mat_mat_push_constants, 3, ); + CREATE_MM(pipeline_dequant_mul_mat_mat[GGML_TYPE_Q3_K].f16acc, matmul_q3_k_f32, _f16acc, wg_denoms, warptile_mmq, vk_mat_mat_push_constants, 3, ); + CREATE_MM(pipeline_dequant_mul_mat_mat[GGML_TYPE_Q4_K].f16acc, matmul_q4_k_f32, _f16acc, wg_denoms, warptile_mmq, vk_mat_mat_push_constants, 3, ); + CREATE_MM(pipeline_dequant_mul_mat_mat[GGML_TYPE_Q5_K].f16acc, matmul_q5_k_f32, _f16acc, wg_denoms, warptile_mmq, vk_mat_mat_push_constants, 3, ); + CREATE_MM(pipeline_dequant_mul_mat_mat[GGML_TYPE_Q6_K].f16acc, matmul_q6_k_f32, _f16acc, wg_denoms, warptile_mmq, vk_mat_mat_push_constants, 3, ); + CREATE_MM(pipeline_dequant_mul_mat_mat[GGML_TYPE_IQ4_NL].f16acc, matmul_iq4_nl_f32, _f16acc, wg_denoms, warptile_mmq, vk_mat_mat_push_constants, 3, ); // If there's not enough shared memory for row_ids and the result tile, don't create these pipelines. - if (device->properties.limits.maxComputeSharedMemorySize >= 32768) { - CREATE_MM(pipeline_matmul_id_f32, matmul_id_f32_f32, , wg_denoms, warptile, vk_mat_mat_push_constants, 4); - CREATE_MM(pipeline_matmul_id_f16, matmul_id_f16, , wg_denoms, warptile, vk_mat_mat_push_constants, 4); - CREATE_MM(pipeline_matmul_id_f16_f32, matmul_id_f16_f32, , wg_denoms, warptile, vk_mat_mat_push_constants, 4); - - CREATE_MM(pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q4_0], matmul_id_q4_0_f32, , mmq_wg_denoms, warptile_mmq, vk_mat_mat_id_push_constants, 4); - CREATE_MM(pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q4_1], matmul_id_q4_1_f32, , mmq_wg_denoms, warptile_mmq, vk_mat_mat_id_push_constants, 4); - CREATE_MM(pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q5_0], matmul_id_q5_0_f32, , mmq_wg_denoms, warptile_mmq, vk_mat_mat_id_push_constants, 4); - CREATE_MM(pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q5_1], matmul_id_q5_1_f32, , mmq_wg_denoms, warptile_mmq, vk_mat_mat_id_push_constants, 4); - CREATE_MM(pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q8_0], matmul_id_q8_0_f32, , mmq_wg_denoms, warptile_mmq, vk_mat_mat_id_push_constants, 4); - - CREATE_MM(pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q2_K], matmul_id_q2_k_f32, , mmq_wg_denoms, warptile_mmq, vk_mat_mat_id_push_constants, 4); - CREATE_MM(pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q3_K], matmul_id_q3_k_f32, , mmq_wg_denoms, warptile_mmq, vk_mat_mat_id_push_constants, 4); - CREATE_MM(pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q4_K], matmul_id_q4_k_f32, , mmq_wg_denoms, warptile_mmq, vk_mat_mat_id_push_constants, 4); - CREATE_MM(pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q5_K], matmul_id_q5_k_f32, , mmq_wg_denoms, warptile_mmq, vk_mat_mat_id_push_constants, 4); - CREATE_MM(pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q6_K], matmul_id_q6_k_f32, , mmq_wg_denoms, warptile_mmq, vk_mat_mat_id_push_constants, 4); - CREATE_MM(pipeline_dequant_mul_mat_mat_id[GGML_TYPE_IQ4_NL], matmul_id_iq4_nl_f32, , mmq_wg_denoms, warptile_mmq, vk_mat_mat_id_push_constants, 4); + if (device->mul_mat_id_s || device->mul_mat_id_m || device->mul_mat_id_l) { + CREATE_MM(pipeline_matmul_id_f32, matmul_id_f32_f32, , wg_denoms, warptile, vk_mat_mat_push_constants, 4, _id); + CREATE_MM2(pipeline_matmul_id_f16, matmul_id_f16, wg_denoms, warptile, vk_mat_mat_push_constants, 4, _id); + CREATE_MM2(pipeline_matmul_id_f16_f32, matmul_id_f16_f32, wg_denoms, warptile, vk_mat_mat_push_constants, 4, _id); + + CREATE_MM(pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q4_0].f16acc, matmul_id_q4_0_f32, _f16acc, wg_denoms, warptile_mmq, vk_mat_mat_id_push_constants, 4, _id); + CREATE_MM(pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q4_1].f16acc, matmul_id_q4_1_f32, _f16acc, wg_denoms, warptile_mmq, vk_mat_mat_id_push_constants, 4, _id); + CREATE_MM(pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q5_0].f16acc, matmul_id_q5_0_f32, _f16acc, wg_denoms, warptile_mmq, vk_mat_mat_id_push_constants, 4, _id); + CREATE_MM(pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q5_1].f16acc, matmul_id_q5_1_f32, _f16acc, wg_denoms, warptile_mmq, vk_mat_mat_id_push_constants, 4, _id); + CREATE_MM(pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q8_0].f16acc, matmul_id_q8_0_f32, _f16acc, wg_denoms, warptile_mmq, vk_mat_mat_id_push_constants, 4, _id); + + CREATE_MM(pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q2_K].f16acc, matmul_id_q2_k_f32, _f16acc, wg_denoms, warptile_mmq, vk_mat_mat_id_push_constants, 4, _id); + CREATE_MM(pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q3_K].f16acc, matmul_id_q3_k_f32, _f16acc, wg_denoms, warptile_mmq, vk_mat_mat_id_push_constants, 4, _id); + CREATE_MM(pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q4_K].f16acc, matmul_id_q4_k_f32, _f16acc, wg_denoms, warptile_mmq, vk_mat_mat_id_push_constants, 4, _id); + CREATE_MM(pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q5_K].f16acc, matmul_id_q5_k_f32, _f16acc, wg_denoms, warptile_mmq, vk_mat_mat_id_push_constants, 4, _id); + CREATE_MM(pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q6_K].f16acc, matmul_id_q6_k_f32, _f16acc, wg_denoms, warptile_mmq, vk_mat_mat_id_push_constants, 4, _id); + CREATE_MM(pipeline_dequant_mul_mat_mat_id[GGML_TYPE_IQ4_NL].f16acc, matmul_id_iq4_nl_f32, _f16acc, wg_denoms, warptile_mmq, vk_mat_mat_id_push_constants, 4, _id); + } +#undef CREATE_MM + } else if (device->fp16) { + // Create 6 variants, {s,m,l}x{unaligned,aligned} +#define CREATE_MM(PIPELINE_NAME, NAMELC, F16ACC, WG_DENOMS, WARPTILE, PUSHCONST, PARAMCOUNT, ID) \ + if (device->mul_mat ## ID ## _l) \ + ggml_vk_create_pipeline(device, device-> PIPELINE_NAME ->l, #NAMELC #F16ACC "_l", NAMELC ## F16ACC ## _len, NAMELC ## F16ACC ## _data, "main", PARAMCOUNT, sizeof(PUSHCONST), l_ ## WG_DENOMS, l_ ## WARPTILE, 1); \ + if (device->mul_mat ## ID ## _m) \ + ggml_vk_create_pipeline(device, device-> PIPELINE_NAME ->m, #NAMELC #F16ACC "_m", NAMELC ## F16ACC ## _len, NAMELC ## F16ACC ## _data, "main", PARAMCOUNT, sizeof(PUSHCONST), m_ ## WG_DENOMS, m_ ## WARPTILE, 1); \ + if (device->mul_mat ## ID ## _s) \ + ggml_vk_create_pipeline(device, device-> PIPELINE_NAME ->s, #NAMELC #F16ACC "_s", NAMELC ## F16ACC ## _len, NAMELC ## F16ACC ## _data, "main", PARAMCOUNT, sizeof(PUSHCONST), s_ ## WG_DENOMS, s_ ## WARPTILE, 1); \ + if (device->mul_mat ## ID ## _l) \ + ggml_vk_create_pipeline(device, device-> PIPELINE_NAME ->a_l, #NAMELC #F16ACC "_aligned_l", NAMELC ## _aligned ## F16ACC ## _len, NAMELC ## _aligned ## F16ACC ## _data, "main", PARAMCOUNT, sizeof(PUSHCONST), l_ ## WG_DENOMS, l_ ## WARPTILE, l_align); \ + if (device->mul_mat ## ID ## _m) \ + ggml_vk_create_pipeline(device, device-> PIPELINE_NAME ->a_m, #NAMELC #F16ACC "_aligned_m", NAMELC ## _aligned ## F16ACC ## _len, NAMELC ## _aligned ## F16ACC ## _data, "main", PARAMCOUNT, sizeof(PUSHCONST), m_ ## WG_DENOMS, m_ ## WARPTILE, m_align); \ + if (device->mul_mat ## ID ## _s) \ + ggml_vk_create_pipeline(device, device-> PIPELINE_NAME ->a_s, #NAMELC #F16ACC "_aligned_s", NAMELC ## _aligned ## F16ACC ## _len, NAMELC ## _aligned ## F16ACC ## _data, "main", PARAMCOUNT, sizeof(PUSHCONST), s_ ## WG_DENOMS, s_ ## WARPTILE, s_align); \ + + CREATE_MM(pipeline_matmul_f32, matmul_f32_f32, , wg_denoms, warptile, vk_mat_mat_push_constants, 3, ); + CREATE_MM(pipeline_matmul_f32_f16, matmul_f32_f16, , wg_denoms, warptile, vk_mat_mat_push_constants, 3, ); + CREATE_MM2(pipeline_matmul_f16, matmul_f16, wg_denoms, warptile, vk_mat_mat_push_constants, 3, ); + CREATE_MM2(pipeline_matmul_f16_f32, matmul_f16_f32, wg_denoms, warptile, vk_mat_mat_push_constants, 3, ); + + CREATE_MM(pipeline_dequant_mul_mat_mat[GGML_TYPE_Q4_0].f16acc, matmul_q4_0_f32, _f16acc, mmq_wg_denoms, warptile_mmq, vk_mat_mat_push_constants, 3, ); + CREATE_MM(pipeline_dequant_mul_mat_mat[GGML_TYPE_Q4_1].f16acc, matmul_q4_1_f32, _f16acc, mmq_wg_denoms, warptile_mmq, vk_mat_mat_push_constants, 3, ); + CREATE_MM(pipeline_dequant_mul_mat_mat[GGML_TYPE_Q5_0].f16acc, matmul_q5_0_f32, _f16acc, mmq_wg_denoms, warptile_mmq, vk_mat_mat_push_constants, 3, ); + CREATE_MM(pipeline_dequant_mul_mat_mat[GGML_TYPE_Q5_1].f16acc, matmul_q5_1_f32, _f16acc, mmq_wg_denoms, warptile_mmq, vk_mat_mat_push_constants, 3, ); + CREATE_MM(pipeline_dequant_mul_mat_mat[GGML_TYPE_Q8_0].f16acc, matmul_q8_0_f32, _f16acc, mmq_wg_denoms, warptile_mmq, vk_mat_mat_push_constants, 3, ); + + CREATE_MM(pipeline_dequant_mul_mat_mat[GGML_TYPE_Q2_K].f16acc, matmul_q2_k_f32, _f16acc, mmq_wg_denoms, warptile_mmq, vk_mat_mat_push_constants, 3, ); + CREATE_MM(pipeline_dequant_mul_mat_mat[GGML_TYPE_Q3_K].f16acc, matmul_q3_k_f32, _f16acc, mmq_wg_denoms, warptile_mmq, vk_mat_mat_push_constants, 3, ); + CREATE_MM(pipeline_dequant_mul_mat_mat[GGML_TYPE_Q4_K].f16acc, matmul_q4_k_f32, _f16acc, mmq_wg_denoms, warptile_mmq, vk_mat_mat_push_constants, 3, ); + CREATE_MM(pipeline_dequant_mul_mat_mat[GGML_TYPE_Q5_K].f16acc, matmul_q5_k_f32, _f16acc, mmq_wg_denoms, warptile_mmq, vk_mat_mat_push_constants, 3, ); + CREATE_MM(pipeline_dequant_mul_mat_mat[GGML_TYPE_Q6_K].f16acc, matmul_q6_k_f32, _f16acc, mmq_wg_denoms, warptile_mmq, vk_mat_mat_push_constants, 3, ); + CREATE_MM(pipeline_dequant_mul_mat_mat[GGML_TYPE_IQ4_NL].f16acc, matmul_iq4_nl_f32, _f16acc, mmq_wg_denoms, warptile_mmq, vk_mat_mat_push_constants, 3, ); + + // If there's not enough shared memory for row_ids and the result tile, don't create these pipelines. + if (device->mul_mat_id_s || device->mul_mat_id_m || device->mul_mat_id_l) { + CREATE_MM(pipeline_matmul_id_f32, matmul_id_f32_f32, , wg_denoms, warptile, vk_mat_mat_push_constants, 4, _id); + CREATE_MM2(pipeline_matmul_id_f16, matmul_id_f16, wg_denoms, warptile, vk_mat_mat_push_constants, 4, _id); + CREATE_MM2(pipeline_matmul_id_f16_f32, matmul_id_f16_f32, wg_denoms, warptile, vk_mat_mat_push_constants, 4, _id); + + CREATE_MM(pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q4_0].f16acc, matmul_id_q4_0_f32, _f16acc, mmq_wg_denoms, warptile_mmq, vk_mat_mat_id_push_constants, 4, _id); + CREATE_MM(pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q4_1].f16acc, matmul_id_q4_1_f32, _f16acc, mmq_wg_denoms, warptile_mmq, vk_mat_mat_id_push_constants, 4, _id); + CREATE_MM(pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q5_0].f16acc, matmul_id_q5_0_f32, _f16acc, mmq_wg_denoms, warptile_mmq, vk_mat_mat_id_push_constants, 4, _id); + CREATE_MM(pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q5_1].f16acc, matmul_id_q5_1_f32, _f16acc, mmq_wg_denoms, warptile_mmq, vk_mat_mat_id_push_constants, 4, _id); + CREATE_MM(pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q8_0].f16acc, matmul_id_q8_0_f32, _f16acc, mmq_wg_denoms, warptile_mmq, vk_mat_mat_id_push_constants, 4, _id); + + CREATE_MM(pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q2_K].f16acc, matmul_id_q2_k_f32, _f16acc, mmq_wg_denoms, warptile_mmq, vk_mat_mat_id_push_constants, 4, _id); + CREATE_MM(pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q3_K].f16acc, matmul_id_q3_k_f32, _f16acc, mmq_wg_denoms, warptile_mmq, vk_mat_mat_id_push_constants, 4, _id); + CREATE_MM(pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q4_K].f16acc, matmul_id_q4_k_f32, _f16acc, mmq_wg_denoms, warptile_mmq, vk_mat_mat_id_push_constants, 4, _id); + CREATE_MM(pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q5_K].f16acc, matmul_id_q5_k_f32, _f16acc, mmq_wg_denoms, warptile_mmq, vk_mat_mat_id_push_constants, 4, _id); + CREATE_MM(pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q6_K].f16acc, matmul_id_q6_k_f32, _f16acc, mmq_wg_denoms, warptile_mmq, vk_mat_mat_id_push_constants, 4, _id); + CREATE_MM(pipeline_dequant_mul_mat_mat_id[GGML_TYPE_IQ4_NL].f16acc, matmul_id_iq4_nl_f32, _f16acc, mmq_wg_denoms, warptile_mmq, vk_mat_mat_id_push_constants, 4, _id); } #undef CREATE_MM } else { // Create 6 variants, {s,m,l}x{unaligned,aligned} -#define CREATE_MM(PIPELINE_NAME, NAMELC, F16ACC, WG_DENOMS, WARPTILE, PUSHCONST, PARAMCOUNT) \ - ggml_vk_create_pipeline(device, device-> PIPELINE_NAME ->l, #NAMELC #F16ACC "_l", NAMELC ## F16ACC ## _fp32_len, NAMELC ## F16ACC ## _fp32_data, "main", PARAMCOUNT, sizeof(PUSHCONST), l_ ## WG_DENOMS, l_ ## WARPTILE, 1); \ - ggml_vk_create_pipeline(device, device-> PIPELINE_NAME ->m, #NAMELC #F16ACC "_m", NAMELC ## F16ACC ## _fp32_len, NAMELC ## F16ACC ## _fp32_data, "main", PARAMCOUNT, sizeof(PUSHCONST), m_ ## WG_DENOMS, m_ ## WARPTILE, 1); \ - ggml_vk_create_pipeline(device, device-> PIPELINE_NAME ->s, #NAMELC #F16ACC "_s", NAMELC ## F16ACC ## _fp32_len, NAMELC ## F16ACC ## _fp32_data, "main", PARAMCOUNT, sizeof(PUSHCONST), s_ ## WG_DENOMS, s_ ## WARPTILE, 1); \ - ggml_vk_create_pipeline(device, device-> PIPELINE_NAME ->a_l, #NAMELC #F16ACC "_aligned_l", NAMELC ## _aligned ## F16ACC ## _fp32_len, NAMELC ## _aligned ## F16ACC ## _fp32_data, "main", PARAMCOUNT, sizeof(PUSHCONST), l_ ## WG_DENOMS, l_ ## WARPTILE, l_align); \ - ggml_vk_create_pipeline(device, device-> PIPELINE_NAME ->a_m, #NAMELC #F16ACC "_aligned_m", NAMELC ## _aligned ## F16ACC ## _fp32_len, NAMELC ## _aligned ## F16ACC ## _fp32_data, "main", PARAMCOUNT, sizeof(PUSHCONST), m_ ## WG_DENOMS, m_ ## WARPTILE, m_align); \ - ggml_vk_create_pipeline(device, device-> PIPELINE_NAME ->a_s, #NAMELC #F16ACC "_aligned_s", NAMELC ## _aligned ## F16ACC ## _fp32_len, NAMELC ## _aligned ## F16ACC ## _fp32_data, "main", PARAMCOUNT, sizeof(PUSHCONST), s_ ## WG_DENOMS, s_ ## WARPTILE, s_align); \ - - CREATE_MM(pipeline_matmul_f32, matmul_f32_f32, , wg_denoms, warptile, vk_mat_mat_push_constants, 3); - CREATE_MM(pipeline_matmul_f32_f16, matmul_f32_f16, , wg_denoms, warptile, vk_mat_mat_push_constants, 3); - CREATE_MM(pipeline_matmul_f16.f32acc, matmul_f16, , wg_denoms, warptile, vk_mat_mat_push_constants, 3); - CREATE_MM(pipeline_matmul_f16_f32.f32acc, matmul_f16_f32, , wg_denoms, warptile, vk_mat_mat_push_constants, 3); - - CREATE_MM(pipeline_dequant_mul_mat_mat[GGML_TYPE_Q4_0].f32acc, matmul_q4_0_f32, , mmq_wg_denoms, warptile_mmq, vk_mat_mat_push_constants, 3); - CREATE_MM(pipeline_dequant_mul_mat_mat[GGML_TYPE_Q4_1].f32acc, matmul_q4_1_f32, , mmq_wg_denoms, warptile_mmq, vk_mat_mat_push_constants, 3); - CREATE_MM(pipeline_dequant_mul_mat_mat[GGML_TYPE_Q5_0].f32acc, matmul_q5_0_f32, , mmq_wg_denoms, warptile_mmq, vk_mat_mat_push_constants, 3); - CREATE_MM(pipeline_dequant_mul_mat_mat[GGML_TYPE_Q5_1].f32acc, matmul_q5_1_f32, , mmq_wg_denoms, warptile_mmq, vk_mat_mat_push_constants, 3); - CREATE_MM(pipeline_dequant_mul_mat_mat[GGML_TYPE_Q8_0].f32acc, matmul_q8_0_f32, , mmq_wg_denoms, warptile_mmq, vk_mat_mat_push_constants, 3); - - CREATE_MM(pipeline_dequant_mul_mat_mat[GGML_TYPE_Q2_K].f32acc, matmul_q2_k_f32, , mmq_wg_denoms, warptile_mmq, vk_mat_mat_push_constants, 3); - CREATE_MM(pipeline_dequant_mul_mat_mat[GGML_TYPE_Q3_K].f32acc, matmul_q3_k_f32, , mmq_wg_denoms, warptile_mmq, vk_mat_mat_push_constants, 3); - CREATE_MM(pipeline_dequant_mul_mat_mat[GGML_TYPE_Q4_K].f32acc, matmul_q4_k_f32, , mmq_wg_denoms, warptile_mmq, vk_mat_mat_push_constants, 3); - CREATE_MM(pipeline_dequant_mul_mat_mat[GGML_TYPE_Q5_K].f32acc, matmul_q5_k_f32, , mmq_wg_denoms, warptile_mmq, vk_mat_mat_push_constants, 3); - CREATE_MM(pipeline_dequant_mul_mat_mat[GGML_TYPE_Q6_K].f32acc, matmul_q6_k_f32, , mmq_wg_denoms, warptile_mmq, vk_mat_mat_push_constants, 3); - CREATE_MM(pipeline_dequant_mul_mat_mat[GGML_TYPE_IQ4_NL].f32acc, matmul_iq4_nl_f32, , mmq_wg_denoms, warptile_mmq, vk_mat_mat_push_constants, 3); +#define CREATE_MM(PIPELINE_NAME, NAMELC, F16ACC, WG_DENOMS, WARPTILE, PUSHCONST, PARAMCOUNT, ID) \ + if (device->mul_mat ## ID ## _l) \ + ggml_vk_create_pipeline(device, device-> PIPELINE_NAME ->l, #NAMELC #F16ACC "_l", NAMELC ## F16ACC ## _fp32_len, NAMELC ## F16ACC ## _fp32_data, "main", PARAMCOUNT, sizeof(PUSHCONST), l_ ## WG_DENOMS, l_ ## WARPTILE, 1); \ + if (device->mul_mat ## ID ## _m) \ + ggml_vk_create_pipeline(device, device-> PIPELINE_NAME ->m, #NAMELC #F16ACC "_m", NAMELC ## F16ACC ## _fp32_len, NAMELC ## F16ACC ## _fp32_data, "main", PARAMCOUNT, sizeof(PUSHCONST), m_ ## WG_DENOMS, m_ ## WARPTILE, 1); \ + if (device->mul_mat ## ID ## _s) \ + ggml_vk_create_pipeline(device, device-> PIPELINE_NAME ->s, #NAMELC #F16ACC "_s", NAMELC ## F16ACC ## _fp32_len, NAMELC ## F16ACC ## _fp32_data, "main", PARAMCOUNT, sizeof(PUSHCONST), s_ ## WG_DENOMS, s_ ## WARPTILE, 1); \ + if (device->mul_mat ## ID ## _l) \ + ggml_vk_create_pipeline(device, device-> PIPELINE_NAME ->a_l, #NAMELC #F16ACC "_aligned_l", NAMELC ## _aligned ## F16ACC ## _fp32_len, NAMELC ## _aligned ## F16ACC ## _fp32_data, "main", PARAMCOUNT, sizeof(PUSHCONST), l_ ## WG_DENOMS, l_ ## WARPTILE, l_align); \ + if (device->mul_mat ## ID ## _m) \ + ggml_vk_create_pipeline(device, device-> PIPELINE_NAME ->a_m, #NAMELC #F16ACC "_aligned_m", NAMELC ## _aligned ## F16ACC ## _fp32_len, NAMELC ## _aligned ## F16ACC ## _fp32_data, "main", PARAMCOUNT, sizeof(PUSHCONST), m_ ## WG_DENOMS, m_ ## WARPTILE, m_align); \ + if (device->mul_mat ## ID ## _s) \ + ggml_vk_create_pipeline(device, device-> PIPELINE_NAME ->a_s, #NAMELC #F16ACC "_aligned_s", NAMELC ## _aligned ## F16ACC ## _fp32_len, NAMELC ## _aligned ## F16ACC ## _fp32_data, "main", PARAMCOUNT, sizeof(PUSHCONST), s_ ## WG_DENOMS, s_ ## WARPTILE, s_align); \ + + CREATE_MM(pipeline_matmul_f32, matmul_f32_f32, , wg_denoms, warptile, vk_mat_mat_push_constants, 3, ); + CREATE_MM(pipeline_matmul_f32_f16, matmul_f32_f16, , wg_denoms, warptile, vk_mat_mat_push_constants, 3, ); + CREATE_MM(pipeline_matmul_f16.f32acc, matmul_f16, , wg_denoms, warptile, vk_mat_mat_push_constants, 3, ); + CREATE_MM(pipeline_matmul_f16_f32.f32acc, matmul_f16_f32, , wg_denoms, warptile, vk_mat_mat_push_constants, 3, ); + + CREATE_MM(pipeline_dequant_mul_mat_mat[GGML_TYPE_Q4_0].f32acc, matmul_q4_0_f32, , mmq_wg_denoms, warptile_mmq, vk_mat_mat_push_constants, 3, ); + CREATE_MM(pipeline_dequant_mul_mat_mat[GGML_TYPE_Q4_1].f32acc, matmul_q4_1_f32, , mmq_wg_denoms, warptile_mmq, vk_mat_mat_push_constants, 3, ); + CREATE_MM(pipeline_dequant_mul_mat_mat[GGML_TYPE_Q5_0].f32acc, matmul_q5_0_f32, , mmq_wg_denoms, warptile_mmq, vk_mat_mat_push_constants, 3, ); + CREATE_MM(pipeline_dequant_mul_mat_mat[GGML_TYPE_Q5_1].f32acc, matmul_q5_1_f32, , mmq_wg_denoms, warptile_mmq, vk_mat_mat_push_constants, 3, ); + CREATE_MM(pipeline_dequant_mul_mat_mat[GGML_TYPE_Q8_0].f32acc, matmul_q8_0_f32, , mmq_wg_denoms, warptile_mmq, vk_mat_mat_push_constants, 3, ); + + CREATE_MM(pipeline_dequant_mul_mat_mat[GGML_TYPE_Q2_K].f32acc, matmul_q2_k_f32, , mmq_wg_denoms, warptile_mmq, vk_mat_mat_push_constants, 3, ); + CREATE_MM(pipeline_dequant_mul_mat_mat[GGML_TYPE_Q3_K].f32acc, matmul_q3_k_f32, , mmq_wg_denoms, warptile_mmq, vk_mat_mat_push_constants, 3, ); + CREATE_MM(pipeline_dequant_mul_mat_mat[GGML_TYPE_Q4_K].f32acc, matmul_q4_k_f32, , mmq_wg_denoms, warptile_mmq, vk_mat_mat_push_constants, 3, ); + CREATE_MM(pipeline_dequant_mul_mat_mat[GGML_TYPE_Q5_K].f32acc, matmul_q5_k_f32, , mmq_wg_denoms, warptile_mmq, vk_mat_mat_push_constants, 3, ); + CREATE_MM(pipeline_dequant_mul_mat_mat[GGML_TYPE_Q6_K].f32acc, matmul_q6_k_f32, , mmq_wg_denoms, warptile_mmq, vk_mat_mat_push_constants, 3, ); + CREATE_MM(pipeline_dequant_mul_mat_mat[GGML_TYPE_IQ4_NL].f32acc, matmul_iq4_nl_f32, , mmq_wg_denoms, warptile_mmq, vk_mat_mat_push_constants, 3, ); // If there's not enough shared memory for row_ids and the result tile, don't create these pipelines. - if (device->properties.limits.maxComputeSharedMemorySize >= 32768) { - CREATE_MM(pipeline_matmul_id_f32, matmul_id_f32_f32, , wg_denoms, warptile, vk_mat_mat_push_constants, 4); - CREATE_MM(pipeline_matmul_id_f16, matmul_id_f16, , wg_denoms, warptile, vk_mat_mat_push_constants, 4); - CREATE_MM(pipeline_matmul_id_f16_f32, matmul_id_f16_f32, , wg_denoms, warptile, vk_mat_mat_push_constants, 4); - - CREATE_MM(pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q4_0], matmul_id_q4_0_f32, , mmq_wg_denoms, warptile_mmq, vk_mat_mat_id_push_constants, 4); - CREATE_MM(pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q4_1], matmul_id_q4_1_f32, , mmq_wg_denoms, warptile_mmq, vk_mat_mat_id_push_constants, 4); - CREATE_MM(pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q5_0], matmul_id_q5_0_f32, , mmq_wg_denoms, warptile_mmq, vk_mat_mat_id_push_constants, 4); - CREATE_MM(pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q5_1], matmul_id_q5_1_f32, , mmq_wg_denoms, warptile_mmq, vk_mat_mat_id_push_constants, 4); - CREATE_MM(pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q8_0], matmul_id_q8_0_f32, , mmq_wg_denoms, warptile_mmq, vk_mat_mat_id_push_constants, 4); - - CREATE_MM(pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q2_K], matmul_id_q2_k_f32, , mmq_wg_denoms, warptile_mmq, vk_mat_mat_id_push_constants, 4); - CREATE_MM(pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q3_K], matmul_id_q3_k_f32, , mmq_wg_denoms, warptile_mmq, vk_mat_mat_id_push_constants, 4); - CREATE_MM(pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q4_K], matmul_id_q4_k_f32, , mmq_wg_denoms, warptile_mmq, vk_mat_mat_id_push_constants, 4); - CREATE_MM(pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q5_K], matmul_id_q5_k_f32, , mmq_wg_denoms, warptile_mmq, vk_mat_mat_id_push_constants, 4); - CREATE_MM(pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q6_K], matmul_id_q6_k_f32, , mmq_wg_denoms, warptile_mmq, vk_mat_mat_id_push_constants, 4); - CREATE_MM(pipeline_dequant_mul_mat_mat_id[GGML_TYPE_IQ4_NL], matmul_id_iq4_nl_f32, , mmq_wg_denoms, warptile_mmq, vk_mat_mat_id_push_constants, 4); + if (device->mul_mat_id_s || device->mul_mat_id_m || device->mul_mat_id_l) { + CREATE_MM(pipeline_matmul_id_f32, matmul_id_f32_f32, , wg_denoms, warptile, vk_mat_mat_push_constants, 4, _id); + CREATE_MM(pipeline_matmul_id_f16.f32acc, matmul_id_f16, , wg_denoms, warptile, vk_mat_mat_push_constants, 4, _id); + CREATE_MM(pipeline_matmul_id_f16_f32.f32acc, matmul_id_f16_f32, , wg_denoms, warptile, vk_mat_mat_push_constants, 4, _id); + + CREATE_MM(pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q4_0].f32acc, matmul_id_q4_0_f32, , mmq_wg_denoms, warptile_mmq, vk_mat_mat_id_push_constants, 4, _id); + CREATE_MM(pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q4_1].f32acc, matmul_id_q4_1_f32, , mmq_wg_denoms, warptile_mmq, vk_mat_mat_id_push_constants, 4, _id); + CREATE_MM(pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q5_0].f32acc, matmul_id_q5_0_f32, , mmq_wg_denoms, warptile_mmq, vk_mat_mat_id_push_constants, 4, _id); + CREATE_MM(pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q5_1].f32acc, matmul_id_q5_1_f32, , mmq_wg_denoms, warptile_mmq, vk_mat_mat_id_push_constants, 4, _id); + CREATE_MM(pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q8_0].f32acc, matmul_id_q8_0_f32, , mmq_wg_denoms, warptile_mmq, vk_mat_mat_id_push_constants, 4, _id); + + CREATE_MM(pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q2_K].f32acc, matmul_id_q2_k_f32, , mmq_wg_denoms, warptile_mmq, vk_mat_mat_id_push_constants, 4, _id); + CREATE_MM(pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q3_K].f32acc, matmul_id_q3_k_f32, , mmq_wg_denoms, warptile_mmq, vk_mat_mat_id_push_constants, 4, _id); + CREATE_MM(pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q4_K].f32acc, matmul_id_q4_k_f32, , mmq_wg_denoms, warptile_mmq, vk_mat_mat_id_push_constants, 4, _id); + CREATE_MM(pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q5_K].f32acc, matmul_id_q5_k_f32, , mmq_wg_denoms, warptile_mmq, vk_mat_mat_id_push_constants, 4, _id); + CREATE_MM(pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q6_K].f32acc, matmul_id_q6_k_f32, , mmq_wg_denoms, warptile_mmq, vk_mat_mat_id_push_constants, 4, _id); + CREATE_MM(pipeline_dequant_mul_mat_mat_id[GGML_TYPE_IQ4_NL].f32acc, matmul_id_iq4_nl_f32, , mmq_wg_denoms, warptile_mmq, vk_mat_mat_id_push_constants, 4, _id); } +#undef CREATE_MM2 #undef CREATE_MM } @@ -1851,8 +1974,10 @@ static vk_device ggml_vk_get_device(size_t idx) { bool fp16_compute = false; bool maintenance4_support = false; bool sm_builtins = false; + bool amd_shader_core_properties2 = false; bool pipeline_robustness = false; bool coopmat2_support = false; + device->coopmat_support = false; // Check if maintenance4 is supported for (const auto& properties : ext_props) { @@ -1864,10 +1989,18 @@ static vk_device ggml_vk_get_device(size_t idx) { fp16_compute = true; } else if (strcmp("VK_NV_shader_sm_builtins", properties.extensionName) == 0) { sm_builtins = true; + } else if (strcmp("VK_AMD_shader_core_properties2", properties.extensionName) == 0) { + amd_shader_core_properties2 = true; } else if (strcmp("VK_EXT_pipeline_robustness", properties.extensionName) == 0) { pipeline_robustness = true; + } else if (strcmp("VK_KHR_cooperative_matrix", properties.extensionName) == 0 && + !getenv("GGML_VK_DISABLE_COOPMAT")) { + device->coopmat_support = true; + device->coopmat_m = 0; + device->coopmat_n = 0; + device->coopmat_k = 0; } else if (strcmp("VK_NV_cooperative_matrix2", properties.extensionName) == 0 && - !getenv("GGML_VULKAN_DISABLE_COOPMAT2")) { + !getenv("GGML_VK_DISABLE_COOPMAT2")) { coopmat2_support = true; } } @@ -1876,11 +2009,14 @@ static vk_device ggml_vk_get_device(size_t idx) { vk::PhysicalDeviceMaintenance3Properties props3; vk::PhysicalDeviceMaintenance4Properties props4; vk::PhysicalDeviceSubgroupProperties subgroup_props; + vk::PhysicalDeviceDriverProperties driver_props; vk::PhysicalDeviceShaderSMBuiltinsPropertiesNV sm_props; + vk::PhysicalDeviceShaderCoreProperties2AMD amd_shader_core_properties2_props; props2.pNext = &props3; props3.pNext = &subgroup_props; + subgroup_props.pNext = &driver_props; - VkBaseOutStructure * last_struct = (VkBaseOutStructure *)&subgroup_props; + VkBaseOutStructure * last_struct = (VkBaseOutStructure *)&driver_props; if (maintenance4_support) { last_struct->pNext = (VkBaseOutStructure *)&props4; @@ -1890,6 +2026,10 @@ static vk_device ggml_vk_get_device(size_t idx) { last_struct->pNext = (VkBaseOutStructure *)&sm_props; last_struct = (VkBaseOutStructure *)&sm_props; } + if (amd_shader_core_properties2) { + last_struct->pNext = (VkBaseOutStructure *)&amd_shader_core_properties2_props; + last_struct = (VkBaseOutStructure *)&amd_shader_core_properties2_props; + } #if defined(VK_NV_cooperative_matrix2) vk::PhysicalDeviceCooperativeMatrix2PropertiesNV coopmat2_props; @@ -1905,7 +2045,7 @@ static vk_device ggml_vk_get_device(size_t idx) { const char* GGML_VK_FORCE_MAX_ALLOCATION_SIZE = getenv("GGML_VK_FORCE_MAX_ALLOCATION_SIZE"); if (GGML_VK_FORCE_MAX_ALLOCATION_SIZE != nullptr) { - device->max_memory_allocation_size = std::stoi(GGML_VK_FORCE_MAX_ALLOCATION_SIZE); + device->max_memory_allocation_size = std::stoul(GGML_VK_FORCE_MAX_ALLOCATION_SIZE); } else if (maintenance4_support) { device->max_memory_allocation_size = std::min(props3.maxMemoryAllocationSize, props4.maxBufferSize); } else { @@ -1917,15 +2057,22 @@ static vk_device ggml_vk_get_device(size_t idx) { device->uma = device->properties.deviceType == vk::PhysicalDeviceType::eIntegratedGpu; if (sm_builtins) { device->shader_core_count = sm_props.shaderSMCount; + } else if (amd_shader_core_properties2) { + device->shader_core_count = amd_shader_core_properties2_props.activeComputeUnitCount; } else { device->shader_core_count = 0; } - const char* GGML_VK_DISABLE_F16 = getenv("GGML_VK_DISABLE_F16"); - const bool force_disable_f16 = GGML_VK_DISABLE_F16 != nullptr; + const bool force_disable_f16 = getenv("GGML_VK_DISABLE_F16") != nullptr; device->fp16 = !force_disable_f16 && fp16_storage && fp16_compute; + if (device->vendor_id == VK_VENDOR_ID_INTEL || (props2.properties.vendorID == VK_VENDOR_ID_AMD && driver_props.driverID == vk::DriverId::eAmdProprietary)) { + // Intel drivers don't support coopmat properly yet + // Only RADV supports coopmat properly on AMD + device->coopmat_support = false; + } + std::vector queue_family_props = device->physical_device.getQueueFamilyProperties(); // Try to find a non-graphics compute queue and transfer-focused queues @@ -1976,6 +2123,16 @@ static vk_device ggml_vk_get_device(size_t idx) { device_extensions.push_back("VK_EXT_pipeline_robustness"); } + VkPhysicalDeviceCooperativeMatrixFeaturesKHR coopmat_features; + coopmat_features.pNext = nullptr; + coopmat_features.sType = VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_COOPERATIVE_MATRIX_FEATURES_KHR; + coopmat_features.cooperativeMatrix = VK_FALSE; + + if (device->coopmat_support) { + last_struct->pNext = (VkBaseOutStructure *)&coopmat_features; + last_struct = (VkBaseOutStructure *)&coopmat_features; + } + #if defined(VK_NV_cooperative_matrix2) VkPhysicalDeviceCooperativeMatrix2FeaturesNV coopmat2_features {}; coopmat2_features.pNext = nullptr; @@ -1993,6 +2150,8 @@ static vk_device ggml_vk_get_device(size_t idx) { device->pipeline_robustness = pl_robustness_features.pipelineRobustness; + device->coopmat_support = device->coopmat_support && coopmat_features.cooperativeMatrix; + if (coopmat2_support) { #if defined(VK_NV_cooperative_matrix2) if (coopmat2_features.cooperativeMatrixWorkgroupScope && @@ -2083,6 +2242,74 @@ static vk_device ggml_vk_get_device(size_t idx) { if (device->fp16) { device_extensions.push_back("VK_KHR_shader_float16_int8"); } + + if (device->coopmat_support) { + // Query supported shapes + std::vector cm_props; + + PFN_vkGetPhysicalDeviceCooperativeMatrixPropertiesKHR pfn_vkGetPhysicalDeviceCooperativeMatrixPropertiesKHR = + (PFN_vkGetPhysicalDeviceCooperativeMatrixPropertiesKHR)vkGetInstanceProcAddr(vk_instance.instance, "vkGetPhysicalDeviceCooperativeMatrixPropertiesKHR"); + + uint32_t cm_props_num; + + pfn_vkGetPhysicalDeviceCooperativeMatrixPropertiesKHR(device->physical_device, &cm_props_num, nullptr); + + cm_props.resize(cm_props_num); + + for (auto& prop : cm_props) { + prop.sType = VK_STRUCTURE_TYPE_COOPERATIVE_MATRIX_PROPERTIES_KHR; + } + + pfn_vkGetPhysicalDeviceCooperativeMatrixPropertiesKHR(device->physical_device, &cm_props_num, cm_props.data()); + + VK_LOG_DEBUG("ggml_vulkan: Cooperative Matrix Shapes: " << cm_props.size()); + + for (auto& prop : cm_props) { + VK_LOG_DEBUG("ggml_vulkan: M: " << prop.MSize << " N: " << prop.NSize << " K: " << prop.KSize << " A: " << vk::to_string((vk::ComponentTypeKHR)prop.AType) << " B: " << vk::to_string((vk::ComponentTypeKHR)prop.BType) << " C: " << vk::to_string((vk::ComponentTypeKHR)prop.CType) << " Result: " << vk::to_string((vk::ComponentTypeKHR)prop.ResultType) << " saturatingAccumulation: " << prop.saturatingAccumulation << " scope: " << vk::to_string((vk::ScopeKHR)prop.scope)); + + if ((vk::ComponentTypeKHR)prop.AType == vk::ComponentTypeKHR::eFloat16 && + (vk::ComponentTypeKHR)prop.BType == vk::ComponentTypeKHR::eFloat16 && + (vk::ScopeKHR)prop.scope == vk::ScopeKHR::eSubgroup + ) { + if ((vk::ComponentTypeKHR)prop.CType == vk::ComponentTypeKHR::eFloat32 && + (vk::ComponentTypeKHR)prop.ResultType == vk::ComponentTypeKHR::eFloat32) { + // coopmat sizes not set yet + if (device->coopmat_m == 0) { + device->coopmat_acc_f32_support = true; + device->coopmat_m = prop.MSize; + device->coopmat_n = prop.NSize; + device->coopmat_k = prop.KSize; + } else if (device->coopmat_m == prop.MSize && device->coopmat_n == prop.NSize && device->coopmat_k == prop.KSize) { + // Only enable if shape is identical + device->coopmat_acc_f32_support = true; + } + } else if ((vk::ComponentTypeKHR)prop.CType == vk::ComponentTypeKHR::eFloat16 && + (vk::ComponentTypeKHR)prop.ResultType == vk::ComponentTypeKHR::eFloat16) { + // coopmat sizes not set yet + if (device->coopmat_m == 0) { + device->coopmat_acc_f16_support = true; + device->coopmat_m = prop.MSize; + device->coopmat_n = prop.NSize; + device->coopmat_k = prop.KSize; + } else if (device->coopmat_m == prop.MSize && device->coopmat_n == prop.NSize && device->coopmat_k == prop.KSize) { + // Only enable if shape is identical + device->coopmat_acc_f16_support = true; + } + } + } + } + + if (device->coopmat_m == 0) { + // No suitable matmul mode found + GGML_LOG_DEBUG("ggml_vulkan: WARNING: No suitable matrix core mode found. Disabling matrix cores.\n"); + device->coopmat_support = false; + } + } + + if (device->coopmat_support) { + device_extensions.push_back("VK_KHR_cooperative_matrix"); + } + device->name = GGML_VK_NAME + std::to_string(idx); device_create_info = { @@ -2098,6 +2325,37 @@ static vk_device ggml_vk_get_device(size_t idx) { ggml_vk_create_queue(device, device->compute_queue, compute_queue_family_index, 0, { vk::PipelineStageFlagBits::eComputeShader | vk::PipelineStageFlagBits::eTransfer }, false); // Shaders + // Disable matmul tile sizes early if performance low or not supported + switch (device->vendor_id) { +#ifndef GGML_VULKAN_RUN_TESTS + case VK_VENDOR_ID_AMD: + case VK_VENDOR_ID_INTEL: + device->mul_mat_l = false; + device->mul_mat_m = true; + device->mul_mat_s = true; + device->mul_mat_id_l = false; + device->mul_mat_id_m = true; + device->mul_mat_id_s = true; + break; + case VK_VENDOR_ID_APPLE: + device->mul_mat_l = false; + device->mul_mat_m = true; + device->mul_mat_s = false; + device->mul_mat_id_l = false; + device->mul_mat_id_m = true; + device->mul_mat_id_s = false; + break; +#endif + default: + device->mul_mat_l = true; + device->mul_mat_m = true; + device->mul_mat_s = true; + device->mul_mat_id_l = true; + device->mul_mat_id_m = true; + device->mul_mat_id_s = true; + break; + } + ggml_vk_load_shaders(device); if (!device->single_queue) { @@ -2155,15 +2413,24 @@ static void ggml_vk_print_gpu_info(size_t idx) { bool fp16_storage = false; bool fp16_compute = false; + bool coopmat_support = false; for (auto properties : ext_props) { if (strcmp("VK_KHR_16bit_storage", properties.extensionName) == 0) { fp16_storage = true; } else if (strcmp("VK_KHR_shader_float16_int8", properties.extensionName) == 0) { fp16_compute = true; + } else if (strcmp("VK_KHR_cooperative_matrix", properties.extensionName) == 0) { + coopmat_support = true; } } + if (props2.properties.vendorID == VK_VENDOR_ID_INTEL || (props2.properties.vendorID == VK_VENDOR_ID_AMD && driver_props.driverID == vk::DriverId::eAmdProprietary)) { + // Intel drivers don't support coopmat properly yet + // Only RADV supports coopmat properly on AMD + coopmat_support = false; + } + const char* GGML_VK_DISABLE_F16 = getenv("GGML_VK_DISABLE_F16"); bool force_disable_f16 = GGML_VK_DISABLE_F16 != nullptr; @@ -2186,13 +2453,28 @@ static void ggml_vk_print_gpu_info(size_t idx) { vk12_features.sType = VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_VULKAN_1_2_FEATURES; vk11_features.pNext = &vk12_features; + // Pointer to the last chain element + VkBaseOutStructure * last_struct = (VkBaseOutStructure *)&vk12_features; + + VkPhysicalDeviceCooperativeMatrixFeaturesKHR coopmat_features; + coopmat_features.pNext = nullptr; + coopmat_features.sType = VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_COOPERATIVE_MATRIX_FEATURES_KHR; + coopmat_features.cooperativeMatrix = VK_FALSE; + + if (coopmat_support) { + last_struct->pNext = (VkBaseOutStructure *)&coopmat_features; + last_struct = (VkBaseOutStructure *)&coopmat_features; + } + vkGetPhysicalDeviceFeatures2(physical_device, &device_features2); fp16 = fp16 && vk12_features.shaderFloat16; + coopmat_support = coopmat_support && coopmat_features.cooperativeMatrix; + std::string device_name = props2.properties.deviceName.data(); - GGML_LOG_DEBUG("ggml_vulkan: %zu = %s (%s) | uma: %d | fp16: %d | warp size: %zu\n", - idx, device_name.c_str(), driver_props.driverName.data(), uma, fp16, subgroup_size); + GGML_LOG_DEBUG("ggml_vulkan: %zu = %s (%s) | uma: %d | fp16: %d | warp size: %zu | matrix cores: %d\n", + idx, device_name.c_str(), driver_props.driverName.data(), uma, fp16, subgroup_size, coopmat_support); if (props2.properties.deviceType == vk::PhysicalDeviceType::eCpu) { GGML_LOG_DEBUG("ggml_vulkan: Warning: Device type is CPU. This is probably not the device you want.\n"); @@ -2428,7 +2710,7 @@ static vk_matmul_pipeline ggml_vk_get_mul_mat_mat_pipeline(ggml_backend_vk_conte if (src0_type == GGML_TYPE_F32 && src1_type == GGML_TYPE_F16) { return ctx->device->pipeline_matmul_f32_f16; } - if (prec == GGML_PREC_DEFAULT && ctx->device->coopmat2) { + if (prec == GGML_PREC_DEFAULT && ctx->device->fp16) { if (src0_type == GGML_TYPE_F16 && src1_type == GGML_TYPE_F32) { return ctx->device->pipeline_matmul_f16_f32.f16acc; } @@ -2469,7 +2751,7 @@ static vk_matmul_pipeline ggml_vk_get_mul_mat_mat_pipeline(ggml_backend_vk_conte assert(src1_type == GGML_TYPE_F16); return ctx->device->pipeline_dequant_mul_mat_mat_f16[src0_type].f16acc; } - return ctx->device->pipeline_dequant_mul_mat_mat[src0_type].f32acc; + return ctx->device->fp16 ? ctx->device->pipeline_dequant_mul_mat_mat[src0_type].f16acc : ctx->device->pipeline_dequant_mul_mat_mat[src0_type].f32acc; } static vk_pipeline ggml_vk_get_dequantize_mul_mat_vec(ggml_backend_vk_context * ctx, ggml_type a_type, ggml_type b_type) { @@ -2498,16 +2780,25 @@ static vk_pipeline ggml_vk_get_dequantize_mul_mat_vec(ggml_backend_vk_context * return b_type == GGML_TYPE_F32 ? ctx->device->pipeline_dequant_mul_mat_vec_f32_f32[a_type] : ctx->device->pipeline_dequant_mul_mat_vec_f16_f32[a_type]; } -static vk_matmul_pipeline ggml_vk_get_mul_mat_mat_id_pipeline(ggml_backend_vk_context * ctx, ggml_type src0_type, ggml_type src1_type) { +static vk_matmul_pipeline ggml_vk_get_mul_mat_mat_id_pipeline(ggml_backend_vk_context * ctx, ggml_type src0_type, ggml_type src1_type, ggml_prec prec) { VK_LOG_DEBUG("ggml_vk_get_mul_mat_mat_id_pipeline()"); if (src0_type == GGML_TYPE_F32 && src1_type == GGML_TYPE_F32) { return ctx->device->pipeline_matmul_id_f32; } - if (src0_type == GGML_TYPE_F16 && src1_type == GGML_TYPE_F32) { - return ctx->device->pipeline_matmul_id_f16_f32; - } - if (src0_type == GGML_TYPE_F16 && src1_type == GGML_TYPE_F16) { - return ctx->device->pipeline_matmul_id_f16; + if (prec == GGML_PREC_DEFAULT && ctx->device->fp16) { + if (src0_type == GGML_TYPE_F16 && src1_type == GGML_TYPE_F32) { + return ctx->device->pipeline_matmul_id_f16_f32.f16acc; + } + if (src0_type == GGML_TYPE_F16 && src1_type == GGML_TYPE_F16) { + return ctx->device->pipeline_matmul_id_f16.f16acc; + } + } else { + if (src0_type == GGML_TYPE_F16 && src1_type == GGML_TYPE_F32) { + return ctx->device->pipeline_matmul_id_f16_f32.f32acc; + } + if (src0_type == GGML_TYPE_F16 && src1_type == GGML_TYPE_F16) { + return ctx->device->pipeline_matmul_id_f16.f32acc; + } } GGML_ASSERT(src1_type == GGML_TYPE_F32); @@ -2529,7 +2820,7 @@ static vk_matmul_pipeline ggml_vk_get_mul_mat_mat_id_pipeline(ggml_backend_vk_co return nullptr; } - return ctx->device->pipeline_dequant_mul_mat_mat_id[src0_type]; + return ctx->device->fp16 ? ctx->device->pipeline_dequant_mul_mat_mat_id[src0_type].f16acc : ctx->device->pipeline_dequant_mul_mat_mat_id[src0_type].f32acc; } static vk_pipeline ggml_vk_get_dequantize_mul_mat_vec_id(ggml_backend_vk_context * ctx, ggml_type a_type, ggml_type b_type) { @@ -3119,62 +3410,31 @@ static uint32_t ggml_vk_guess_split_k(ggml_backend_vk_context * ctx, int m, int return split_k; } -static vk_pipeline ggml_vk_guess_matmul_pipeline_amd(ggml_backend_vk_context * ctx, vk_matmul_pipeline& mmp, int m, int n, bool aligned) { - if (m <= 32 || n <= 32) { - return aligned ? mmp->a_s : mmp->s; - } - return aligned ? mmp->a_m : mmp->m; - - GGML_UNUSED(ctx); -} - -static vk_pipeline ggml_vk_guess_matmul_pipeline_apple(ggml_backend_vk_context * ctx, vk_matmul_pipeline& mmp, bool aligned) { - return aligned ? mmp->a_m : mmp->m; - - GGML_UNUSED(ctx); -} - -static vk_pipeline ggml_vk_guess_matmul_pipeline_intel(ggml_backend_vk_context * ctx, vk_matmul_pipeline& mmp, bool aligned) { - return aligned ? mmp->a_s : mmp->s; - - GGML_UNUSED(ctx); -} - -static vk_pipeline ggml_vk_guess_matmul_pipeline(ggml_backend_vk_context * ctx, vk_matmul_pipeline& mmp, int m, int n, bool aligned) { +static vk_pipeline ggml_vk_guess_matmul_pipeline(ggml_backend_vk_context * ctx, vk_matmul_pipeline& mmp, int m, int n, bool aligned, ggml_type type_a) { VK_LOG_DEBUG("ggml_vk_guess_matmul_pipeline(" << m << ", " << n << ", " << aligned << ")"); - switch (ctx->device->vendor_id) { - case VK_VENDOR_ID_AMD: - return ggml_vk_guess_matmul_pipeline_amd(ctx, mmp, m, n, aligned); - case VK_VENDOR_ID_APPLE: - return ggml_vk_guess_matmul_pipeline_apple(ctx, mmp, aligned); - case VK_VENDOR_ID_INTEL: - return ggml_vk_guess_matmul_pipeline_intel(ctx, mmp, aligned); - default: - break; - } if (ctx->device->coopmat2) { - if ((m % mmp->l->wg_denoms[0]) == 0 && (n % mmp->l->wg_denoms[1]) == 0) { + if ((ctx->device->mul_mat_l && (m % mmp->l->wg_denoms[0]) == 0 && (n % mmp->l->wg_denoms[1]) == 0) || (!ctx->device->mul_mat_m && !ctx->device->mul_mat_s)) { return aligned ? mmp->a_l : mmp->l; } - if ((m % mmp->m->wg_denoms[0]) == 0 && (n % mmp->m->wg_denoms[1]) == 0) { + if ((ctx->device->mul_mat_m && (m % mmp->m->wg_denoms[0]) == 0 && (n % mmp->m->wg_denoms[1]) == 0) || !ctx->device->mul_mat_s) { return aligned ? mmp->a_m : mmp->m; } return aligned ? mmp->a_s : mmp->s; } - if (m <= 32 || n <= 32) { + if ((ctx->device->mul_mat_s && (m <= 32 || n <= 32)) || (!ctx->device->mul_mat_m && !ctx->device->mul_mat_l)) { return aligned ? mmp->a_s : mmp->s; } - if (m <= 64 || n <= 64) { + if ((ctx->device->mul_mat_m && (m <= 64 || n <= 64)) || !ctx->device->mul_mat_l) { return aligned ? mmp->a_m : mmp->m; } return aligned ? mmp->a_l : mmp->l; } -static uint32_t ggml_vk_guess_matmul_pipeline_align(ggml_backend_vk_context * ctx, vk_matmul_pipeline& mmp, int m, int n) { +static uint32_t ggml_vk_guess_matmul_pipeline_align(ggml_backend_vk_context * ctx, vk_matmul_pipeline& mmp, int m, int n, ggml_type type_a) { VK_LOG_DEBUG("ggml_vk_guess_matmul_pipeline_align(" << m << ", " << n << ")"); - return ggml_vk_guess_matmul_pipeline(ctx, mmp, m, n, true)->align; + return ggml_vk_guess_matmul_pipeline(ctx, mmp, m, n, true, type_a)->align; } static void ggml_vk_matmul( @@ -3201,6 +3461,33 @@ static void ggml_vk_matmul( ggml_vk_dispatch_pipeline(ctx, subctx, ctx->device->pipeline_matmul_split_k_reduce, { split_k_buffer, d }, pc2.size() * sizeof(uint32_t), pc2.data(), { m * n * batch, 1, 1 }); } +static vk_pipeline ggml_vk_guess_matmul_id_pipeline(ggml_backend_vk_context * ctx, vk_matmul_pipeline& mmp, int m, int n, bool aligned) { + VK_LOG_DEBUG("ggml_vk_guess_matmul_pipeline(" << m << ", " << n << ", " << aligned << ")"); + + if (ctx->device->coopmat2) { + if ((ctx->device->mul_mat_id_l && (m % mmp->l->wg_denoms[0]) == 0 && (n % mmp->l->wg_denoms[1]) == 0) || (!ctx->device->mul_mat_id_m && !ctx->device->mul_mat_id_s)) { + return aligned ? mmp->a_l : mmp->l; + } + if ((ctx->device->mul_mat_id_m && (m % mmp->m->wg_denoms[0]) == 0 && (n % mmp->m->wg_denoms[1]) == 0) || !ctx->device->mul_mat_id_s) { + return aligned ? mmp->a_m : mmp->m; + } + return aligned ? mmp->a_s : mmp->s; + } + + if ((ctx->device->mul_mat_id_s && (m <= 32 || n <= 32)) || (!ctx->device->mul_mat_id_m && !ctx->device->mul_mat_id_l)) { + return aligned ? mmp->a_s : mmp->s; + } + if ((ctx->device->mul_mat_id_m && (m <= 64 || n <= 64)) || !ctx->device->mul_mat_id_l) { + return aligned ? mmp->a_m : mmp->m; + } + return aligned ? mmp->a_l : mmp->l; +} + +static uint32_t ggml_vk_guess_matmul_id_pipeline_align(ggml_backend_vk_context * ctx, vk_matmul_pipeline& mmp, int m, int n) { + VK_LOG_DEBUG("ggml_vk_guess_matmul_pipeline_align(" << m << ", " << n << ")"); + return ggml_vk_guess_matmul_id_pipeline(ctx, mmp, m, n, true)->align; +} + static void ggml_vk_matmul_id( ggml_backend_vk_context * ctx, vk_context& subctx, vk_pipeline& pipeline, vk_subbuffer&& a, vk_subbuffer&& b, vk_subbuffer&& d, vk_subbuffer&& ids, @@ -3350,10 +3637,10 @@ static void ggml_vk_mul_mat_q_f16(ggml_backend_vk_context * ctx, vk_context& sub const int y_ne = ne11 * ne10; const int d_ne = ne11 * ne01; - const uint32_t kpad = ggml_vk_align_size(ne10, ggml_vk_guess_matmul_pipeline_align(ctx, mmp, ne01, ne11)); + const uint32_t kpad = ggml_vk_align_size(ne10, ggml_vk_guess_matmul_pipeline_align(ctx, mmp, ne01, ne11, src0->type)); const bool aligned = ne10 == kpad && ne01 > 8 && ne11 > 8; - vk_pipeline pipeline = ggml_vk_guess_matmul_pipeline(ctx, mmp, ne01, ne11, aligned); + vk_pipeline pipeline = ggml_vk_guess_matmul_pipeline(ctx, mmp, ne01, ne11, aligned, src0->type); const uint32_t split_k = ggml_vk_guess_split_k(ctx, ne01, ne11, ne10, pipeline); @@ -3904,7 +4191,7 @@ static void ggml_vk_mul_mat_id_q_f16(ggml_backend_vk_context * ctx, vk_context& const bool y_f32_kernel = src1->type == GGML_TYPE_F32 && !y_non_contig; - vk_matmul_pipeline mmp = ggml_vk_get_mul_mat_mat_id_pipeline(ctx, src0->type, y_non_contig ? GGML_TYPE_F16 : src1->type); + vk_matmul_pipeline mmp = ggml_vk_get_mul_mat_mat_id_pipeline(ctx, src0->type, y_non_contig ? GGML_TYPE_F16 : src1->type, (ggml_prec)dst->op_params[0]); const bool qx_needs_dequant = mmp == nullptr || x_non_contig; const bool qy_needs_dequant = (src1->type != GGML_TYPE_F16 && !y_f32_kernel) || y_non_contig; @@ -3920,10 +4207,10 @@ static void ggml_vk_mul_mat_id_q_f16(ggml_backend_vk_context * ctx, vk_context& const uint64_t y_ne = ne11 * ne10; const uint64_t d_ne = ne21 * ne20; - const uint32_t kpad = ggml_vk_align_size(ne10, ggml_vk_guess_matmul_pipeline_align(ctx, mmp, ne01, nei1)); + const uint32_t kpad = ggml_vk_align_size(ne10, ggml_vk_guess_matmul_id_pipeline_align(ctx, mmp, ne01, nei1)); const bool aligned = ne10 == kpad && ne01 > 8 && nei1 > 8; - vk_pipeline pipeline = ggml_vk_guess_matmul_pipeline(ctx, mmp, ne01, nei1, aligned); + vk_pipeline pipeline = ggml_vk_guess_matmul_id_pipeline(ctx, mmp, ne01, nei1, aligned); const uint64_t qx_sz = ggml_type_size(src0->type) * x_ne / ggml_blck_size(src0->type); const uint64_t qy_sz = ggml_type_size(src1->type) * y_ne / ggml_blck_size(src1->type); @@ -5504,19 +5791,27 @@ static void ggml_vk_test_matmul(ggml_backend_vk_context * ctx, size_t m, size_t for (size_t i = 0; i < x_ne; i++) { if (std::is_same()) { x[i] = (rand() / (float)RAND_MAX) * 2.0f - 1.0f; + // x[i] = 1.0f; + // x[i] = i + 1; + // x[i] = (i % k == i / k) ? 1.0f : 0.0f; } else if (std::is_same()) { x[i] = ggml_fp32_to_fp16((rand() / (float)RAND_MAX) * 2.0f - 1.0f); + // x[i] = ggml_fp32_to_fp16(1.0f); + // x[i] = ggml_fp32_to_fp16(i + 1); + // x[i] = ggml_fp32_to_fp16((i % k == i / k) ? 1.0f : 0.0f); } else { GGML_ABORT("fatal error"); } } for (size_t i = 0; i < y_ne; i++) { if (std::is_same()) { - // y[i] = (rand() / (float)RAND_MAX) * 2.0f - 1.0f; - y[i] = (i % k == i / k) ? 1.0f : 0.0f; + y[i] = (rand() / (float)RAND_MAX) * 2.0f - 1.0f; + // y[i] = (i % k == i / k) ? 1.0f : 0.0f; + // y[i] = i + 1; } else if (std::is_same()) { - // y[i] = ggml_fp32_to_fp16((rand() / (float)RAND_MAX) * 2.0f - 1.0f); - y[i] = ggml_fp32_to_fp16((i % k == i / k) ? 1.0f : 0.0f); + y[i] = ggml_fp32_to_fp16((rand() / (float)RAND_MAX) * 2.0f - 1.0f); + // y[i] = ggml_fp32_to_fp16((i % k == i / k) ? 1.0f : 0.0f); + // y[i] = ggml_fp32_to_fp16(i + 1); } else { GGML_ABORT("fatal error"); } @@ -5600,7 +5895,7 @@ static void ggml_vk_test_matmul(ggml_backend_vk_context * ctx, size_t m, size_t double err = std::fabs(d[i] - d_chk[i]); avg_err += err; - if (err > 0.05f && first_err_n == -1) { + if ((err > 0.05f || std::isnan(err)) && first_err_n == -1) { first_err_b = i / (m * n); first_err_n = (i % (m * n)) / m; first_err_m = (i % (m * n)) % m; @@ -5613,12 +5908,10 @@ static void ggml_vk_test_matmul(ggml_backend_vk_context * ctx, size_t m, size_t std::cerr << "TEST " << shname << " m=" << m << " n=" << n << " k=" << k << " batch=" << batch << " split_k=" << split_k << " matmul " << time / num_it << "ms " << tflops << " TFLOPS avg_err=" << avg_err << std::endl; - if (avg_err > 0.1) { + if (avg_err > 0.1 || std::isnan(avg_err)) { std::cerr << "m = " << first_err_m << " n = " << first_err_n << " b = " << first_err_b << std::endl; std::cerr << "Actual result: " << std::endl << std::endl; ggml_vk_print_matrix_area(d, GGML_TYPE_F32, m, n, first_err_m, first_err_n, first_err_b); - std::cerr << std::endl; - ggml_vk_print_matrix_area(d, GGML_TYPE_F32, m, n, first_err_m, first_err_n + 15, first_err_b); std::cerr << "Expected result: " << std::endl << std::endl; ggml_vk_print_matrix_area(d_chk, GGML_TYPE_F32, m, n, first_err_m, first_err_n, first_err_b); @@ -5801,13 +6094,13 @@ static void ggml_vk_test_dequant_matmul(ggml_backend_vk_context * ctx, size_t m, vk_pipeline p; std::string shname; if (shader_size == 0) { - p = ctx->device->pipeline_dequant_mul_mat_mat[quant].f32acc->a_s; + p = ctx->device->fp16 ? ctx->device->pipeline_dequant_mul_mat_mat[quant].f16acc->a_s : ctx->device->pipeline_dequant_mul_mat_mat[quant].f32acc->a_s; shname = std::string(ggml_type_name(quant)) + "_ALIGNED_S"; } else if (shader_size == 1) { - p = ctx->device->pipeline_dequant_mul_mat_mat[quant].f32acc->a_m; + p = ctx->device->fp16 ? ctx->device->pipeline_dequant_mul_mat_mat[quant].f16acc->a_m : ctx->device->pipeline_dequant_mul_mat_mat[quant].f32acc->a_m; shname = std::string(ggml_type_name(quant)) + "_ALIGNED_M"; } else if (shader_size == 2) { - p = ctx->device->pipeline_dequant_mul_mat_mat[quant].f32acc->a_l; + p = ctx->device->fp16 ? ctx->device->pipeline_dequant_mul_mat_mat[quant].f16acc->a_l : ctx->device->pipeline_dequant_mul_mat_mat[quant].f32acc->a_l; shname = std::string(ggml_type_name(quant)) + "_ALIGNED_L"; } else { GGML_ASSERT(0); @@ -5817,13 +6110,13 @@ static void ggml_vk_test_dequant_matmul(ggml_backend_vk_context * ctx, size_t m, if (k != kpad) { if (shader_size == 0) { - p = ctx->device->pipeline_dequant_mul_mat_mat[quant].f32acc->s; + p = ctx->device->fp16 ? ctx->device->pipeline_dequant_mul_mat_mat[quant].f16acc->s : ctx->device->pipeline_dequant_mul_mat_mat[quant].f32acc->s; shname = std::string(ggml_type_name(quant)) + "_S"; } else if (shader_size == 1) { - p = ctx->device->pipeline_dequant_mul_mat_mat[quant].f32acc->m; + p = ctx->device->fp16 ? ctx->device->pipeline_dequant_mul_mat_mat[quant].f16acc->m : ctx->device->pipeline_dequant_mul_mat_mat[quant].f32acc->m; shname = std::string(ggml_type_name(quant)) + "_M"; } else if (shader_size == 2) { - p = ctx->device->pipeline_dequant_mul_mat_mat[quant].f32acc->l; + p = ctx->device->fp16 ? ctx->device->pipeline_dequant_mul_mat_mat[quant].f16acc->l : ctx->device->pipeline_dequant_mul_mat_mat[quant].f32acc->l; shname = std::string(ggml_type_name(quant)) + "_L"; } else { GGML_ASSERT(0); @@ -5982,105 +6275,13 @@ static void ggml_vk_test_dequant_matmul(ggml_backend_vk_context * ctx, size_t m, static void ggml_vk_preallocate_buffers(ggml_backend_vk_context * ctx) { #if defined(GGML_VULKAN_RUN_TESTS) - ggml_vk_test_dequant(ctx, 7680, GGML_TYPE_F32); - ggml_vk_test_dequant(ctx, 7680, GGML_TYPE_Q4_0); - ggml_vk_test_dequant(ctx, 7680, GGML_TYPE_Q4_1); - ggml_vk_test_dequant(ctx, 7680, GGML_TYPE_Q5_0); - ggml_vk_test_dequant(ctx, 7680, GGML_TYPE_Q5_1); - ggml_vk_test_dequant(ctx, 7680, GGML_TYPE_Q8_0); - ggml_vk_test_dequant(ctx, 7680, GGML_TYPE_Q2_K); - ggml_vk_test_dequant(ctx, 7680, GGML_TYPE_Q3_K); - ggml_vk_test_dequant(ctx, 7680, GGML_TYPE_Q4_K); - ggml_vk_test_dequant(ctx, 7680, GGML_TYPE_Q5_K); - ggml_vk_test_dequant(ctx, 7680, GGML_TYPE_Q6_K); - ggml_vk_test_dequant(ctx, 7680, GGML_TYPE_IQ4_NL); - - ggml_vk_test_matmul(ctx, 512, 512, 100, 32, 100, 1, 2); - - ggml_vk_test_matmul(ctx, 128, 512, 512, 2, 100, 1, 0); - ggml_vk_test_matmul(ctx, 128, 512, 512, 2, 100, 1, 1); - ggml_vk_test_matmul(ctx, 128, 512, 512, 2, 100, 1, 2); - // ggml_vk_test_matmul(ctx, 128, 512, 512, 2, 100, 4, 0); - // ggml_vk_test_matmul(ctx, 128, 512, 512, 2, 100, 4, 1); - // ggml_vk_test_matmul(ctx, 128, 512, 512, 2, 100, 4, 2); - - ggml_vk_test_dequant_matmul(ctx, 128, 512, 512, 2, 100, 1, 0, GGML_TYPE_Q4_0); - ggml_vk_test_dequant_matmul(ctx, 128, 512, 512, 2, 100, 1, 1, GGML_TYPE_Q4_0); - ggml_vk_test_dequant_matmul(ctx, 128, 512, 512, 2, 100, 1, 2, GGML_TYPE_Q4_0); - // ggml_vk_test_dequant_matmul(ctx, 128, 512, 512, 2, 100, 4, 0, GGML_TYPE_Q4_0); - // ggml_vk_test_dequant_matmul(ctx, 128, 512, 512, 2, 100, 4, 1, GGML_TYPE_Q4_0); - // ggml_vk_test_dequant_matmul(ctx, 128, 512, 512, 2, 100, 4, 2, GGML_TYPE_Q4_0); - - ggml_vk_test_dequant_matmul(ctx, 128, 512, 512, 2, 100, 1, 0, GGML_TYPE_Q4_1); - ggml_vk_test_dequant_matmul(ctx, 128, 512, 512, 2, 100, 1, 1, GGML_TYPE_Q4_1); - ggml_vk_test_dequant_matmul(ctx, 128, 512, 512, 2, 100, 1, 2, GGML_TYPE_Q4_1); - // ggml_vk_test_dequant_matmul(ctx, 128, 512, 512, 2, 100, 4, 0, GGML_TYPE_Q4_1); - // ggml_vk_test_dequant_matmul(ctx, 128, 512, 512, 2, 100, 4, 1, GGML_TYPE_Q4_1); - // ggml_vk_test_dequant_matmul(ctx, 128, 512, 512, 2, 100, 4, 2, GGML_TYPE_Q4_1); - - ggml_vk_test_dequant_matmul(ctx, 128, 512, 512, 2, 100, 1, 0, GGML_TYPE_Q5_0); - ggml_vk_test_dequant_matmul(ctx, 128, 512, 512, 2, 100, 1, 1, GGML_TYPE_Q5_0); - ggml_vk_test_dequant_matmul(ctx, 128, 512, 512, 2, 100, 1, 2, GGML_TYPE_Q5_0); - // ggml_vk_test_dequant_matmul(ctx, 128, 512, 512, 2, 100, 4, 0, GGML_TYPE_Q5_0); - // ggml_vk_test_dequant_matmul(ctx, 128, 512, 512, 2, 100, 4, 1, GGML_TYPE_Q5_0); - // ggml_vk_test_dequant_matmul(ctx, 128, 512, 512, 2, 100, 4, 2, GGML_TYPE_Q5_0); - - ggml_vk_test_dequant_matmul(ctx, 128, 512, 512, 2, 100, 1, 0, GGML_TYPE_Q5_1); - ggml_vk_test_dequant_matmul(ctx, 128, 512, 512, 2, 100, 1, 1, GGML_TYPE_Q5_1); - ggml_vk_test_dequant_matmul(ctx, 128, 512, 512, 2, 100, 1, 2, GGML_TYPE_Q5_1); - // ggml_vk_test_dequant_matmul(ctx, 128, 512, 512, 2, 100, 4, 0, GGML_TYPE_Q5_1); - // ggml_vk_test_dequant_matmul(ctx, 128, 512, 512, 2, 100, 4, 1, GGML_TYPE_Q5_1); - // ggml_vk_test_dequant_matmul(ctx, 128, 512, 512, 2, 100, 4, 2, GGML_TYPE_Q5_1); - - ggml_vk_test_dequant_matmul(ctx, 128, 512, 512, 2, 100, 1, 0, GGML_TYPE_Q8_0); - ggml_vk_test_dequant_matmul(ctx, 128, 512, 512, 2, 100, 1, 1, GGML_TYPE_Q8_0); - ggml_vk_test_dequant_matmul(ctx, 128, 512, 512, 2, 100, 1, 2, GGML_TYPE_Q8_0); - // ggml_vk_test_dequant_matmul(ctx, 128, 512, 512, 2, 100, 4, 0, GGML_TYPE_Q8_0); - // ggml_vk_test_dequant_matmul(ctx, 128, 512, 512, 2, 100, 4, 1, GGML_TYPE_Q8_0); - // ggml_vk_test_dequant_matmul(ctx, 128, 512, 512, 2, 100, 4, 2, GGML_TYPE_Q8_0); - - ggml_vk_test_dequant_matmul(ctx, 128, 512, 512, 2, 100, 1, 0, GGML_TYPE_Q2_K); - ggml_vk_test_dequant_matmul(ctx, 128, 512, 512, 2, 100, 1, 1, GGML_TYPE_Q2_K); - ggml_vk_test_dequant_matmul(ctx, 128, 512, 512, 2, 100, 1, 2, GGML_TYPE_Q2_K); - // ggml_vk_test_dequant_matmul(ctx, 128, 512, 512, 2, 100, 4, 0, GGML_TYPE_Q2_K); - // ggml_vk_test_dequant_matmul(ctx, 128, 512, 512, 2, 100, 4, 1, GGML_TYPE_Q2_K); - // ggml_vk_test_dequant_matmul(ctx, 128, 512, 512, 2, 100, 4, 2, GGML_TYPE_Q2_K); - - ggml_vk_test_dequant_matmul(ctx, 128, 512, 512, 2, 100, 1, 0, GGML_TYPE_Q3_K); - ggml_vk_test_dequant_matmul(ctx, 128, 512, 512, 2, 100, 1, 1, GGML_TYPE_Q3_K); - ggml_vk_test_dequant_matmul(ctx, 128, 512, 512, 2, 100, 1, 2, GGML_TYPE_Q3_K); - // ggml_vk_test_dequant_matmul(ctx, 128, 512, 512, 2, 100, 4, 0, GGML_TYPE_Q3_K); - // ggml_vk_test_dequant_matmul(ctx, 128, 512, 512, 2, 100, 4, 1, GGML_TYPE_Q3_K); - // ggml_vk_test_dequant_matmul(ctx, 128, 512, 512, 2, 100, 4, 2, GGML_TYPE_Q3_K); - - ggml_vk_test_dequant_matmul(ctx, 128, 512, 512, 2, 100, 1, 0, GGML_TYPE_Q4_K); - ggml_vk_test_dequant_matmul(ctx, 128, 512, 512, 2, 100, 1, 1, GGML_TYPE_Q4_K); - ggml_vk_test_dequant_matmul(ctx, 128, 512, 512, 2, 100, 1, 2, GGML_TYPE_Q4_K); - // ggml_vk_test_dequant_matmul(ctx, 128, 512, 512, 2, 100, 4, 0, GGML_TYPE_Q4_K); - // ggml_vk_test_dequant_matmul(ctx, 128, 512, 512, 2, 100, 4, 1, GGML_TYPE_Q4_K); - // ggml_vk_test_dequant_matmul(ctx, 128, 512, 512, 2, 100, 4, 2, GGML_TYPE_Q4_K); - - ggml_vk_test_dequant_matmul(ctx, 128, 512, 512, 2, 100, 1, 0, GGML_TYPE_Q5_K); - ggml_vk_test_dequant_matmul(ctx, 128, 512, 512, 2, 100, 1, 1, GGML_TYPE_Q5_K); - ggml_vk_test_dequant_matmul(ctx, 128, 512, 512, 2, 100, 1, 2, GGML_TYPE_Q5_K); - // ggml_vk_test_dequant_matmul(ctx, 128, 512, 512, 2, 100, 4, 0, GGML_TYPE_Q5_K); - // ggml_vk_test_dequant_matmul(ctx, 128, 512, 512, 2, 100, 4, 1, GGML_TYPE_Q5_K); - // ggml_vk_test_dequant_matmul(ctx, 128, 512, 512, 2, 100, 4, 2, GGML_TYPE_Q5_K); - - ggml_vk_test_dequant_matmul(ctx, 128, 512, 512, 2, 100, 1, 0, GGML_TYPE_Q6_K); - ggml_vk_test_dequant_matmul(ctx, 128, 512, 512, 2, 100, 1, 1, GGML_TYPE_Q6_K); - ggml_vk_test_dequant_matmul(ctx, 128, 512, 512, 2, 100, 1, 2, GGML_TYPE_Q6_K); - // ggml_vk_test_dequant_matmul(ctx, 128, 512, 512, 2, 100, 4, 0, GGML_TYPE_Q6_K); - // ggml_vk_test_dequant_matmul(ctx, 128, 512, 512, 2, 100, 4, 1, GGML_TYPE_Q6_K); - // ggml_vk_test_dequant_matmul(ctx, 128, 512, 512, 2, 100, 4, 2, GGML_TYPE_Q6_K); - - ggml_vk_test_dequant_matmul(ctx, 128, 512, 512, 2, 100, 1, 0, GGML_TYPE_IQ4_NL); - ggml_vk_test_dequant_matmul(ctx, 128, 512, 512, 2, 100, 1, 1, GGML_TYPE_IQ4_NL); - ggml_vk_test_dequant_matmul(ctx, 128, 512, 512, 2, 100, 1, 2, GGML_TYPE_IQ4_NL); - - std::cerr << std::endl; - const std::vector vals { + 512, 512, 128, + 128, 512, 512, + 4096, 512, 4096, + 11008, 512, 4096, + 4096, 512, 11008, + 32000, 512, 4096, 8, 8, 8, 100, 46, 576, 623, 111, 128, @@ -6093,15 +6294,6 @@ static void ggml_vk_preallocate_buffers(ggml_backend_vk_context * ctx) { 49, 49, 128, 128, 49, 49, 4096, 49, 4096, - 11008, 49, 4096, - 4096, 49, 11008, - 32000, 49, 4096, - 512, 512, 128, - 128, 512, 512, - 4096, 512, 4096, - 11008, 512, 4096, - 4096, 512, 11008, - 32000, 512, 4096, }; const size_t num_it = 100; @@ -6109,10 +6301,45 @@ static void ggml_vk_preallocate_buffers(ggml_backend_vk_context * ctx) { ggml_vk_test_matmul(ctx, vals[i], vals[i + 1], vals[i + 2], 2, num_it, 1, 0); ggml_vk_test_matmul(ctx, vals[i], vals[i + 1], vals[i + 2], 2, num_it, 1, 1); ggml_vk_test_matmul(ctx, vals[i], vals[i + 1], vals[i + 2], 2, num_it, 1, 2); - // ggml_vk_test_matmul(ctx, vals[i], vals[i + 1], vals[i + 2], 2, num_it, 4, 0); - // ggml_vk_test_matmul(ctx, vals[i], vals[i + 1], vals[i + 2], 2, num_it, 4, 1); - // ggml_vk_test_matmul(ctx, vals[i], vals[i + 1], vals[i + 2], 2, num_it, 4, 2); - std::cerr << std::endl; + std::cerr << '\n'; + ggml_vk_test_matmul(ctx, vals[i], vals[i + 1], vals[i + 2], 2, num_it, 2, 0); + ggml_vk_test_matmul(ctx, vals[i], vals[i + 1], vals[i + 2], 2, num_it, 2, 1); + ggml_vk_test_matmul(ctx, vals[i], vals[i + 1], vals[i + 2], 2, num_it, 2, 2); + std::cerr << '\n'; + ggml_vk_test_matmul(ctx, vals[i], vals[i + 1], vals[i + 2], 2, num_it, 4, 0); + ggml_vk_test_matmul(ctx, vals[i], vals[i + 1], vals[i + 2], 2, num_it, 4, 1); + ggml_vk_test_matmul(ctx, vals[i], vals[i + 1], vals[i + 2], 2, num_it, 4, 2); + std::cerr << '\n' << std::endl; + + if (vals[i + 2] % 32 == 0) { + ggml_vk_test_dequant_matmul(ctx, vals[i], vals[i + 1], vals[i + 2], 2, num_it, 1, 0, GGML_TYPE_Q4_0); + ggml_vk_test_dequant_matmul(ctx, vals[i], vals[i + 1], vals[i + 2], 2, num_it, 1, 1, GGML_TYPE_Q4_0); + ggml_vk_test_dequant_matmul(ctx, vals[i], vals[i + 1], vals[i + 2], 2, num_it, 1, 2, GGML_TYPE_Q4_0); + std::cerr << '\n'; + ggml_vk_test_dequant_matmul(ctx, vals[i], vals[i + 1], vals[i + 2], 2, num_it, 2, 0, GGML_TYPE_Q4_0); + ggml_vk_test_dequant_matmul(ctx, vals[i], vals[i + 1], vals[i + 2], 2, num_it, 2, 1, GGML_TYPE_Q4_0); + ggml_vk_test_dequant_matmul(ctx, vals[i], vals[i + 1], vals[i + 2], 2, num_it, 2, 2, GGML_TYPE_Q4_0); + std::cerr << '\n'; + ggml_vk_test_dequant_matmul(ctx, vals[i], vals[i + 1], vals[i + 2], 2, num_it, 4, 0, GGML_TYPE_Q4_0); + ggml_vk_test_dequant_matmul(ctx, vals[i], vals[i + 1], vals[i + 2], 2, num_it, 4, 1, GGML_TYPE_Q4_0); + ggml_vk_test_dequant_matmul(ctx, vals[i], vals[i + 1], vals[i + 2], 2, num_it, 4, 2, GGML_TYPE_Q4_0); + std::cerr << '\n' << std::endl; + } + + if (vals[i + 2] % 256 == 0) { + ggml_vk_test_dequant_matmul(ctx, vals[i], vals[i + 1], vals[i + 2], 2, num_it, 1, 0, GGML_TYPE_Q4_K); + ggml_vk_test_dequant_matmul(ctx, vals[i], vals[i + 1], vals[i + 2], 2, num_it, 1, 1, GGML_TYPE_Q4_K); + ggml_vk_test_dequant_matmul(ctx, vals[i], vals[i + 1], vals[i + 2], 2, num_it, 1, 2, GGML_TYPE_Q4_K); + std::cerr << '\n'; + ggml_vk_test_dequant_matmul(ctx, vals[i], vals[i + 1], vals[i + 2], 2, num_it, 2, 0, GGML_TYPE_Q4_K); + ggml_vk_test_dequant_matmul(ctx, vals[i], vals[i + 1], vals[i + 2], 2, num_it, 2, 1, GGML_TYPE_Q4_K); + ggml_vk_test_dequant_matmul(ctx, vals[i], vals[i + 1], vals[i + 2], 2, num_it, 2, 2, GGML_TYPE_Q4_K); + std::cerr << '\n'; + ggml_vk_test_dequant_matmul(ctx, vals[i], vals[i + 1], vals[i + 2], 2, num_it, 4, 0, GGML_TYPE_Q4_K); + ggml_vk_test_dequant_matmul(ctx, vals[i], vals[i + 1], vals[i + 2], 2, num_it, 4, 1, GGML_TYPE_Q4_K); + ggml_vk_test_dequant_matmul(ctx, vals[i], vals[i + 1], vals[i + 2], 2, num_it, 4, 2, GGML_TYPE_Q4_K); + std::cerr << '\n' << std::endl; + } } GGML_ABORT("fatal error"); @@ -7200,8 +7427,8 @@ static bool ggml_backend_vk_device_supports_op(ggml_backend_dev_t dev, const ggm case GGML_OP_MUL_MAT_ID: { ggml_backend_vk_device_context * ctx = (ggml_backend_vk_device_context *)dev->context; - if (op->op == GGML_OP_MUL_MAT_ID && - ggml_vk_get_device(ctx->device)->properties.limits.maxComputeSharedMemorySize < 32768) { + const vk_device& device = ggml_vk_get_device(ctx->device); + if (op->op == GGML_OP_MUL_MAT_ID && !device->mul_mat_id_s && !device->mul_mat_id_m && !device->mul_mat_id_l) { // If there's not enough shared memory for row_ids and the result tile, fallback to CPU return false; } diff --git a/ggml/src/ggml-vulkan/vulkan-shaders/mul_mm.comp b/ggml/src/ggml-vulkan/vulkan-shaders/mul_mm.comp index 2ff5c430519bc..48122cbef906e 100644 --- a/ggml/src/ggml-vulkan/vulkan-shaders/mul_mm.comp +++ b/ggml/src/ggml-vulkan/vulkan-shaders/mul_mm.comp @@ -7,6 +7,12 @@ #extension GL_EXT_shader_explicit_arithmetic_types_float16 : require #endif +#ifdef COOPMAT +#extension GL_KHR_cooperative_matrix : enable +#extension GL_KHR_memory_scope_semantics : enable +#extension GL_KHR_shader_subgroup_basic : enable +#endif + #ifdef MUL_MAT_ID #extension GL_EXT_shader_explicit_arithmetic_types_int16 : require #endif @@ -57,6 +63,7 @@ layout (push_constant) uniform parameter #endif } p; +layout (constant_id = 0) const uint BLOCK_SIZE = 64; layout (constant_id = 1) const uint BM = 64; layout (constant_id = 2) const uint BN = 64; layout (constant_id = 3) const uint BK = 16; // Assumed to be 32 if working with a quant @@ -65,13 +72,26 @@ layout (constant_id = 5) const uint WN = 32; layout (constant_id = 6) const uint WMITER = 2; layout (constant_id = 7) const uint TM = 4; layout (constant_id = 8) const uint TN = 2; -layout (constant_id = 9) const uint WARP = 32; +layout (constant_id = 9) const uint TK = 1; // Only needed for coopmat +layout (constant_id = 10) const uint WARP = 32; + +#ifdef COOPMAT +#define SHMEM_STRIDE (BK + 8) +#else +#define SHMEM_STRIDE (BK + 1) +#endif -shared FLOAT_TYPE buf_a[BM * (BK+1)]; -shared FLOAT_TYPE buf_b[BN * (BK+1)]; +shared FLOAT_TYPE buf_a[BM * SHMEM_STRIDE]; +shared FLOAT_TYPE buf_b[BN * SHMEM_STRIDE]; #ifdef MUL_MAT_ID shared u16vec2 row_ids[3072]; +#endif // MUL_MAT_ID + +#define NUM_WARPS (BLOCK_SIZE / WARP) + +#ifdef COOPMAT +shared ACC_TYPE coopmat_stage[TM * TN * NUM_WARPS]; #endif void main() { @@ -98,17 +118,32 @@ void main() { const uint ik = gl_WorkGroupID.x / blocks_m; const uint ic = gl_WorkGroupID.y; - const uint warp_i = gl_LocalInvocationID.x / WARP; - const uint warp_r = warp_i % (BM / WM); - const uint warp_c = warp_i / (BM / WM); - const uint WNITER = (WM * WN) / (WARP * TM * TN * WMITER); const uint WSUBM = WM / WMITER; const uint WSUBN = WN / WNITER; +#ifdef COOPMAT + const uint warp_i = gl_SubgroupID; + + const uint tiw = gl_SubgroupInvocationID; + + const uint cms_per_row = WM / TM; + const uint cms_per_col = WN / TN; + + const uint storestride = WARP / TM; + const uint store_r = tiw % TM; + const uint store_c = tiw / TM; +#else + const uint warp_i = gl_LocalInvocationID.x / WARP; + const uint tiw = gl_LocalInvocationID.x % WARP; + const uint tiwr = tiw % (WSUBM / TM); const uint tiwc = tiw / (WSUBM / TM); +#endif + + const uint warp_r = warp_i % (BM / WM); + const uint warp_c = warp_i / (BM / WM); const uint loadr_a = gl_LocalInvocationID.x % (BK / LOAD_VEC_A); const uint loadc_a = gl_LocalInvocationID.x / (BK / LOAD_VEC_A); @@ -156,21 +191,31 @@ void main() { uint pos_b = (batch_idx * p.batch_stride_b + ic * BN * p.stride_b + start_k) / LOAD_VEC_B; #endif - float sums[WMITER * TM * WNITER * TN]; +#ifdef COOPMAT + coopmat cache_a; + coopmat cache_b; + coopmat sums[cms_per_row * cms_per_col]; + + [[unroll]] for (uint i = 0; i < cms_per_row * cms_per_col; i++) { + sums[i] = coopmat(0.0f); + } +#else + ACC_TYPE sums[WMITER * TM * WNITER * TN]; FLOAT_TYPE cache_a[WMITER * TM]; FLOAT_TYPE cache_b[WNITER * TN]; [[unroll]] for (uint i = 0; i < WMITER*TM*WNITER*TN; i++) { - sums[i] = 0.0f; + sums[i] = ACC_TYPE(0.0f); } +#endif - [[unroll]] for (uint block = start_k; block < end_k; block += BK) { + for (uint block = start_k; block < end_k; block += BK) { [[unroll]] for (uint l = 0; l < BM; l += loadstride_a) { #if defined(DATA_A_F32) || defined(DATA_A_F16) #if LOAD_VEC_A == 8 const uint idx = pos_a + (loadc_a + l) * p.stride_a / LOAD_VEC_A + loadr_a; - const uint buf_idx = (loadc_a + l) * (BK+1) + loadr_a * LOAD_VEC_A; + const uint buf_idx = (loadc_a + l) * SHMEM_STRIDE + loadr_a * LOAD_VEC_A; buf_a[buf_idx ] = FLOAT_TYPE(data_a[idx][0].x); buf_a[buf_idx + 1] = FLOAT_TYPE(data_a[idx][0].y); buf_a[buf_idx + 2] = FLOAT_TYPE(data_a[idx][0].z); @@ -181,21 +226,21 @@ void main() { buf_a[buf_idx + 7] = FLOAT_TYPE(data_a[idx][1].w); #elif LOAD_VEC_A == 4 const uint idx = pos_a + (loadc_a + l) * p.stride_a / LOAD_VEC_A + loadr_a; - const uint buf_idx = (loadc_a + l) * (BK+1) + loadr_a * LOAD_VEC_A; + const uint buf_idx = (loadc_a + l) * SHMEM_STRIDE + loadr_a * LOAD_VEC_A; buf_a[buf_idx ] = FLOAT_TYPE(data_a[idx].x); buf_a[buf_idx + 1] = FLOAT_TYPE(data_a[idx].y); buf_a[buf_idx + 2] = FLOAT_TYPE(data_a[idx].z); buf_a[buf_idx + 3] = FLOAT_TYPE(data_a[idx].w); #else if (ir * BM + loadc_a + l < p.M && block + loadr_a < end_k) { - buf_a[(loadc_a + l) * (BK+1) + loadr_a] = FLOAT_TYPE(data_a[pos_a + (loadc_a + l) * p.stride_a + loadr_a]); + buf_a[(loadc_a + l) * SHMEM_STRIDE + loadr_a] = FLOAT_TYPE(data_a[pos_a + (loadc_a + l) * p.stride_a + loadr_a]); } else { - buf_a[(loadc_a + l) * (BK+1) + loadr_a] = FLOAT_TYPE(0.0f); + buf_a[(loadc_a + l) * SHMEM_STRIDE + loadr_a] = FLOAT_TYPE(0.0f); } #endif #elif defined(DATA_A_Q4_0) const uint idx = pos_a + (loadc_a + l) * p.stride_a / LOAD_VEC_A + loadr_a; - const uint buf_idx = (loadc_a + l) * (BK+1) + loadr_a; + const uint buf_idx = (loadc_a + l) * SHMEM_STRIDE + loadr_a; const uint ib = idx / 16; const uint iqs = idx & 0xF; @@ -208,7 +253,7 @@ void main() { buf_a[buf_idx + 16] = FLOAT_TYPE(v.y); #elif defined(DATA_A_Q4_1) const uint idx = pos_a + (loadc_a + l) * p.stride_a / LOAD_VEC_A + loadr_a; - const uint buf_idx = (loadc_a + l) * (BK+1) + loadr_a; + const uint buf_idx = (loadc_a + l) * SHMEM_STRIDE + loadr_a; const uint ib = idx / 16; const uint iqs = idx & 0xF; @@ -222,7 +267,7 @@ void main() { buf_a[buf_idx + 16] = FLOAT_TYPE(v.y); #elif defined(DATA_A_Q5_0) const uint idx = pos_a + (loadc_a + l) * p.stride_a / LOAD_VEC_A + loadr_a; - const uint buf_idx = (loadc_a + l) * (BK+1) + loadr_a; + const uint buf_idx = (loadc_a + l) * SHMEM_STRIDE + loadr_a; const uint ib = idx / 16; const uint iqs = idx & 0xF; @@ -237,7 +282,7 @@ void main() { buf_a[buf_idx + 16] = FLOAT_TYPE(v.y); #elif defined(DATA_A_Q5_1) const uint idx = pos_a + (loadc_a + l) * p.stride_a / LOAD_VEC_A + loadr_a; - const uint buf_idx = (loadc_a + l) * (BK+1) + loadr_a; + const uint buf_idx = (loadc_a + l) * SHMEM_STRIDE + loadr_a; const uint ib = idx / 16; const uint iqs = idx & 0xF; @@ -253,7 +298,7 @@ void main() { buf_a[buf_idx + 16] = FLOAT_TYPE(v.y); #elif defined(DATA_A_Q8_0) const uint idx = pos_a + (loadc_a + l) * p.stride_a / LOAD_VEC_A + loadr_a; - const uint buf_idx = (loadc_a + l) * (BK+1) + loadr_a * LOAD_VEC_A; + const uint buf_idx = (loadc_a + l) * SHMEM_STRIDE + loadr_a * LOAD_VEC_A; const uint ib = idx / 16; const uint iqs = (idx & 0xF) * 2; @@ -265,7 +310,7 @@ void main() { buf_a[buf_idx + 1] = FLOAT_TYPE(v.y); #elif defined(DATA_A_Q2_K) const uint idx = pos_a + (loadc_a + l) * p.stride_a / LOAD_VEC_A + loadr_a; - const uint buf_idx = (loadc_a + l) * (BK+1) + loadr_a * LOAD_VEC_A; + const uint buf_idx = (loadc_a + l) * SHMEM_STRIDE + loadr_a * LOAD_VEC_A; const uint ib = idx / 128; // 2 values per idx const uint iqs = idx % 128; // 0..127 @@ -284,7 +329,7 @@ void main() { buf_a[buf_idx + 1] = FLOAT_TYPE(v.y); #elif defined(DATA_A_Q3_K) const uint idx = pos_a + (loadc_a + l) * p.stride_a / LOAD_VEC_A + loadr_a; - const uint buf_idx = (loadc_a + l) * (BK+1) + loadr_a * LOAD_VEC_A; + const uint buf_idx = (loadc_a + l) * SHMEM_STRIDE + loadr_a * LOAD_VEC_A; const uint ib = idx / 128; // 2 values per idx const uint iqs = idx % 128; // 0..127 @@ -308,7 +353,7 @@ void main() { buf_a[buf_idx + 1] = FLOAT_TYPE(dl * float(int8_t((data_a[ib].qs[qsi + 1] >> qsshift) & 3) - (((data_a[ib].hmask[hmi + 1] & m) != 0) ? 0 : 4))); #elif defined(DATA_A_Q4_K) const uint idx = pos_a + (loadc_a + l) * p.stride_a / LOAD_VEC_A + loadr_a; - const uint buf_idx = (loadc_a + l) * (BK+1) + loadr_a * LOAD_VEC_A; + const uint buf_idx = (loadc_a + l) * SHMEM_STRIDE + loadr_a * LOAD_VEC_A; const uint ib = idx / 128; // 2 values per idx const uint iqs = idx % 128; // 0..127 @@ -320,15 +365,20 @@ void main() { const vec2 loadd = vec2(data_a[ib].d); - uint8_t sc; - uint8_t mbyte; - if (is < 4) { - sc = uint8_t(data_a[ib].scales[is ] & 63); - mbyte = uint8_t(data_a[ib].scales[is + 4] & 63); - } else { - sc = uint8_t((data_a[ib].scales[is + 4] & 0xF) | ((data_a[ib].scales[is - 4] >> 6) << 4)); - mbyte = uint8_t((data_a[ib].scales[is + 4] >> 4) | ((data_a[ib].scales[is ] >> 6) << 4)); - } + const uint scidx0 = (is < 4) ? is : (is + 4); + const uint scidx1 = (is < 4) ? is : (is - 4); + const uint scidxmask1 = (is < 4) ? 0x30 : 0xC0; + const uint scidxshift1 = (is < 4) ? 0 : 2; + const uint mbidx0 = is + 4; + const uint mbidx1 = (is < 4) ? is + 4 : is; + const uint mbidxmask0 = (is < 4) ? 0xF : 0xF0; + const uint mbidxshift0 = (is < 4) ? 0 : 4; + const uint mbidxmask1 = (is < 4) ? 0x30 : 0xC0; + const uint mbidxshift1 = (is < 4) ? 0 : 2; + + const uint8_t sc = uint8_t((data_a[ib].scales[scidx0] & 0xF) | ((data_a[ib].scales[scidx1] & scidxmask1) >> scidxshift1)); + const uint8_t mbyte = uint8_t((data_a[ib].scales[mbidx0] & mbidxmask0) >> mbidxshift0 | ((data_a[ib].scales[mbidx1] & mbidxmask1) >> mbidxshift1)); + const float d = loadd.x * sc; const float m = -loadd.y * mbyte; @@ -336,7 +386,7 @@ void main() { buf_a[buf_idx + 1] = FLOAT_TYPE(fma(d, float((data_a[ib].qs[qsi + 1] >> (b * 4)) & 0xF), m)); #elif defined(DATA_A_Q5_K) const uint idx = pos_a + (loadc_a + l) * p.stride_a / LOAD_VEC_A + loadr_a; - const uint buf_idx = (loadc_a + l) * (BK+1) + loadr_a * LOAD_VEC_A; + const uint buf_idx = (loadc_a + l) * SHMEM_STRIDE + loadr_a * LOAD_VEC_A; const uint ib = idx / 128; // 2 values per idx const uint iqs = idx % 128; // 0..127 @@ -351,15 +401,20 @@ void main() { const vec2 loadd = vec2(data_a[ib].d); - uint8_t sc; - uint8_t mbyte; - if (is < 4) { - sc = uint8_t(data_a[ib].scales[is ] & 63); - mbyte = uint8_t(data_a[ib].scales[is + 4] & 63); - } else { - sc = uint8_t((data_a[ib].scales[is + 4] & 0xF) | ((data_a[ib].scales[is - 4] >> 6) << 4)); - mbyte = uint8_t((data_a[ib].scales[is + 4] >> 4) | ((data_a[ib].scales[is ] >> 6) << 4)); - } + const uint scidx0 = (is < 4) ? is : (is + 4); + const uint scidx1 = (is < 4) ? is : (is - 4); + const uint scidxmask1 = (is < 4) ? 0x30 : 0xC0; + const uint scidxshift1 = (is < 4) ? 0 : 2; + const uint mbidx0 = is + 4; + const uint mbidx1 = (is < 4) ? is + 4 : is; + const uint mbidxmask0 = (is < 4) ? 0xF : 0xF0; + const uint mbidxshift0 = (is < 4) ? 0 : 4; + const uint mbidxmask1 = (is < 4) ? 0x30 : 0xC0; + const uint mbidxshift1 = (is < 4) ? 0 : 2; + + const uint8_t sc = uint8_t((data_a[ib].scales[scidx0] & 0xF) | ((data_a[ib].scales[scidx1] & scidxmask1) >> scidxshift1)); + const uint8_t mbyte = uint8_t(((data_a[ib].scales[mbidx0] & mbidxmask0) >> mbidxshift0) | ((data_a[ib].scales[mbidx1] & mbidxmask1) >> mbidxshift1)); + const float d = loadd.x * sc; const float m = -loadd.y * mbyte; @@ -367,7 +422,7 @@ void main() { buf_a[buf_idx + 1] = FLOAT_TYPE(fma(d, float((data_a[ib].qs[qsi + 1] >> (b * 4)) & 0xF) + float((data_a[ib].qh[qhi + 1] & hm) != 0 ? 16 : 0), m)); #elif defined(DATA_A_Q6_K) const uint idx = pos_a + (loadc_a + l) * p.stride_a / LOAD_VEC_A + loadr_a; - const uint buf_idx = (loadc_a + l) * (BK+1) + loadr_a * LOAD_VEC_A; + const uint buf_idx = (loadc_a + l) * SHMEM_STRIDE + loadr_a * LOAD_VEC_A; const uint ib = idx / 128; // 2 values per idx const uint iqs = idx % 128; // 0..127 @@ -386,7 +441,7 @@ void main() { buf_a[buf_idx + 1] = FLOAT_TYPE(dscale * float(int8_t(((data_a[ib].ql[qsi + 1] >> (b * 4)) & 0xF) | (((data_a[ib].qh[qhi + 1] >> qhshift) & 3) << 4)) - 32)); #elif defined(DATA_A_IQ4_NL) const uint idx = pos_a + (loadc_a + l) * p.stride_a / LOAD_VEC_A + loadr_a; - const uint buf_idx = (loadc_a + l) * (BK+1) + loadr_a; + const uint buf_idx = (loadc_a + l) * SHMEM_STRIDE + loadr_a; const uint ib = idx / 16; const uint iqs = idx & 0xF; @@ -407,7 +462,7 @@ void main() { #else const uint idx = pos_b + (loadc_b + l) * p.stride_b / LOAD_VEC_B + loadr_b; #endif - const uint buf_idx = (loadc_b + l) * (BK+1) + loadr_b * LOAD_VEC_B; + const uint buf_idx = (loadc_b + l) * SHMEM_STRIDE + loadr_b * LOAD_VEC_B; buf_b[buf_idx + 0] = FLOAT_TYPE(data_b[idx][0].x); buf_b[buf_idx + 1] = FLOAT_TYPE(data_b[idx][0].y); buf_b[buf_idx + 2] = FLOAT_TYPE(data_b[idx][0].z); @@ -423,24 +478,24 @@ void main() { #else const uint idx = pos_b + (loadc_b + l) * p.stride_b / LOAD_VEC_B + loadr_b; #endif - const uint buf_idx = (loadc_b + l) * (BK+1) + loadr_b * LOAD_VEC_B; + const uint buf_idx = (loadc_b + l) * SHMEM_STRIDE + loadr_b * LOAD_VEC_B; buf_b[buf_idx + 0] = FLOAT_TYPE(data_b[idx].x); buf_b[buf_idx + 1] = FLOAT_TYPE(data_b[idx].y); buf_b[buf_idx + 2] = FLOAT_TYPE(data_b[idx].z); buf_b[buf_idx + 3] = FLOAT_TYPE(data_b[idx].w); #elif !MUL_MAT_ID if (ic * BN + loadc_b + l < p.N && block + loadr_b < end_k) { - buf_b[(loadc_b + l) * (BK+1) + loadr_b] = FLOAT_TYPE(data_b[pos_b + (loadc_b + l) * p.stride_b + loadr_b]); + buf_b[(loadc_b + l) * SHMEM_STRIDE + loadr_b] = FLOAT_TYPE(data_b[pos_b + (loadc_b + l) * p.stride_b + loadr_b]); } else { - buf_b[(loadc_b + l) * (BK+1) + loadr_b] = FLOAT_TYPE(0.0f); + buf_b[(loadc_b + l) * SHMEM_STRIDE + loadr_b] = FLOAT_TYPE(0.0f); } #else const uint row_i = ic * BN + loadc_b + l; if (row_i < _ne1) { const u16vec2 row_idx = row_ids[row_i]; - buf_b[(loadc_b + l) * (BK+1) + loadr_b] = FLOAT_TYPE(data_b[pos_b + row_idx.y * p.batch_stride_b + (row_idx.x % p.ne11) * p.stride_b + loadr_b]); + buf_b[(loadc_b + l) * SHMEM_STRIDE + loadr_b] = FLOAT_TYPE(data_b[pos_b + row_idx.y * p.batch_stride_b + (row_idx.x % p.ne11) * p.stride_b + loadr_b]); } else { - buf_b[(loadc_b + l) * (BK+1) + loadr_b] = FLOAT_TYPE(0.0f); + buf_b[(loadc_b + l) * SHMEM_STRIDE + loadr_b] = FLOAT_TYPE(0.0f); } #endif } @@ -450,16 +505,30 @@ void main() { pos_a += BK / LOAD_VEC_A; pos_b += BK / LOAD_VEC_B; - for (uint i = 0; i < BK; i++) { +#ifdef COOPMAT + [[unroll]] for (uint i = 0; i < BK; i += TK) { + [[unroll]] for (uint cm_row = 0; cm_row < cms_per_row; cm_row++) { + // Load from shared into cache + coopMatLoad(cache_a, buf_a, (warp_r * WM + cm_row * TM) * SHMEM_STRIDE + i, SHMEM_STRIDE, gl_CooperativeMatrixLayoutRowMajor); + + [[unroll]] for (uint cm_col = 0; cm_col < cms_per_col; cm_col++) { + coopMatLoad(cache_b, buf_b, (warp_c * WN + cm_col * TN) * SHMEM_STRIDE + i, SHMEM_STRIDE, gl_CooperativeMatrixLayoutColumnMajor); + + sums[cm_col * cms_per_row + cm_row] = coopMatMulAdd(cache_a, cache_b, sums[cm_col * cms_per_row + cm_row]); + } + } + } +#else + [[unroll]] for (uint i = 0; i < BK; i++) { // Load from shared into cache [[unroll]] for (uint wsir = 0; wsir < WMITER; wsir++) { [[unroll]] for (uint j = 0; j < TM; j++) { - cache_a[wsir * TM + j] = buf_a[(warp_r * WM + wsir * WSUBM + tiwr * TM + j) * (BK+1) + i]; + cache_a[wsir * TM + j] = buf_a[(warp_r * WM + wsir * WSUBM + tiwr * TM + j) * SHMEM_STRIDE + i]; } } [[unroll]] for (uint wsic = 0; wsic < WNITER; wsic++) { [[unroll]] for (uint j = 0; j < TN; j++) { - cache_b[wsic * TN + j] = buf_b[(warp_c * WN + wsic * WSUBN + tiwc * TN + j) * (BK+1) + i]; + cache_b[wsic * TN + j] = buf_b[(warp_c * WN + wsic * WSUBN + tiwc * TN + j) * SHMEM_STRIDE + i]; } } @@ -468,12 +537,13 @@ void main() { [[unroll]] for (uint cc = 0; cc < TN; cc++) { [[unroll]] for (uint cr = 0; cr < TM; cr++) { const uint sums_idx = (wsic * TN + cc) * (WMITER * TM) + wsir * TM + cr; - sums[sums_idx] = fma(float(cache_a[wsir * TM + cr]), float(cache_b[wsic * TN + cc]), sums[sums_idx]); + sums[sums_idx] = fma(ACC_TYPE(cache_a[wsir * TM + cr]), ACC_TYPE(cache_b[wsic * TN + cc]), sums[sums_idx]); } } } } } +#endif barrier(); } @@ -485,6 +555,54 @@ void main() { const uint offsets = batch_idx * p.batch_stride_d + ik * p.batch_stride_d * gl_NumWorkGroups.z; #endif +#ifdef COOPMAT +#ifdef MUL_MAT_ID + [[unroll]] for (uint cm_row = 0; cm_row < cms_per_row; cm_row++) { + [[unroll]] for (uint cm_col = 0; cm_col < cms_per_col; cm_col++) { + coopMatStore(sums[cm_col * cms_per_row + cm_row], coopmat_stage, warp_i * TM * TN, TM, gl_CooperativeMatrixLayoutColumnMajor); + + [[unroll]] for (uint col = 0; col < BN; col += storestride) { + const uint row_i = dc + cm_col * TN + col + store_c; + if (row_i >= _ne1) break; + + const u16vec2 row_idx = row_ids[row_i]; + + data_d[row_idx.y * p.batch_stride_d + row_idx.x * p.stride_d + dr + cm_row * TM + store_r] = D_TYPE(coopmat_stage[warp_i * TM * TN + (col + store_c) * TM + store_r]); + } + } + } +#else + const bool is_aligned = p.stride_d % 4 == 0; // Assumption: D_TYPE == float + + [[unroll]] for (uint cm_row = 0; cm_row < cms_per_row; cm_row++) { + [[unroll]] for (uint cm_col = 0; cm_col < cms_per_col; cm_col++) { + const bool is_in_bounds = dr + (cm_row + 1) * TM <= p.M && dc + (cm_col + 1) * TN <= p.N; + + if (is_aligned && is_in_bounds) { + // Full coopMat is within bounds and stride_d is aligned with 16B + coopmat cm_dtype = coopmat(sums[cm_col * cms_per_row + cm_row]); + coopMatStore(cm_dtype, data_d, offsets + (dc + cm_col * TN) * p.stride_d + dr + cm_row * TM, p.stride_d, gl_CooperativeMatrixLayoutColumnMajor); + } else if (is_in_bounds) { + // Full coopMat is within bounds, but stride_d is not aligned + coopMatStore(sums[cm_col * cms_per_row + cm_row], coopmat_stage, warp_i * TM * TN, TM, gl_CooperativeMatrixLayoutColumnMajor); + + [[unroll]] for (uint col = 0; col < TN; col += storestride) { + data_d[offsets + (dc + cm_col * TN + col + store_c) * p.stride_d + dr + cm_row * TM + store_r] = D_TYPE(coopmat_stage[warp_i * TM * TN + (col + store_c) * TM + store_r]); + } + } else if (dr + cm_row * TM < p.M && dc + cm_col * TN < p.N) { + // Partial coopMat is within bounds + coopMatStore(sums[cm_col * cms_per_row + cm_row], coopmat_stage, warp_i * TM * TN, TM, gl_CooperativeMatrixLayoutColumnMajor); + + [[unroll]] for (uint col = 0; col < TN; col += storestride) { + if (dr + cm_row * TM + store_r < p.M && dc + cm_col * TN + col + store_c < p.N) { + data_d[offsets + (dc + cm_col * TN + col + store_c) * p.stride_d + dr + cm_row * TM + store_r] = D_TYPE(coopmat_stage[warp_i * TM * TN + (col + store_c) * TM + store_r]); + } + } + } + } + } +#endif // MUL_MAT_ID +#else [[unroll]] for (uint wsic = 0; wsic < WNITER; wsic++) { [[unroll]] for (uint wsir = 0; wsir < WMITER; wsir++) { @@ -496,7 +614,7 @@ void main() { if (row_i >= _ne1) break; const u16vec2 row_idx = row_ids[row_i]; -#endif +#endif // MUL_MAT_ID [[unroll]] for (uint cr = 0; cr < TM; cr++) { #ifdef MUL_MAT_ID data_d[row_idx.y * p.batch_stride_d + row_idx.x * p.stride_d + dr_warp + cr] = D_TYPE(sums[(wsic * TN + cc) * (WMITER * TM) + wsir * TM + cr]); @@ -504,9 +622,10 @@ void main() { if (dr_warp + cr < p.M && dc_warp + cc < p.N) { data_d[offsets + (dc_warp + cc) * p.stride_d + dr_warp + cr] = D_TYPE(sums[(wsic * TN + cc) * (WMITER * TM) + wsir * TM + cr]); } -#endif +#endif // MUL_MAT_ID } } } } +#endif // COOPMAT } diff --git a/ggml/src/ggml-vulkan/vulkan-shaders/vulkan-shaders-gen.cpp b/ggml/src/ggml-vulkan/vulkan-shaders/vulkan-shaders-gen.cpp index 4716e2c80869b..c4967ed658eb2 100644 --- a/ggml/src/ggml-vulkan/vulkan-shaders/vulkan-shaders-gen.cpp +++ b/ggml/src/ggml-vulkan/vulkan-shaders/vulkan-shaders-gen.cpp @@ -60,6 +60,7 @@ const std::vector type_names = { "iq4_nl" }; +namespace { void execute_command(const std::string& command, std::string& stdout_str, std::string& stderr_str) { #ifdef _WIN32 HANDLE stdout_read, stdout_write; @@ -198,8 +199,8 @@ static uint32_t compile_count = 0; static std::mutex compile_count_mutex; static std::condition_variable compile_count_cond; -void string_to_spv_func(const std::string& _name, const std::string& in_fname, const std::map& defines, bool fp16 = true, bool coopmat2 = false, bool f16acc = false) { - std::string name = _name + (f16acc ? "_f16acc" : "") + (coopmat2 ? "_cm2" : (fp16 ? "" : "_fp32")); +void string_to_spv_func(const std::string& _name, const std::string& in_fname, const std::map& defines, bool fp16 = true, bool coopmat = false, bool coopmat2 = false, bool f16acc = false) { + std::string name = _name + (f16acc ? "_f16acc" : "") + (coopmat ? "_coopmat" : "") + (coopmat2 ? "_cm2" : (fp16 ? "" : "_fp32")); std::string out_fname = join_paths(output_dir, name + ".spv"); std::string in_path = join_paths(input_dir, in_fname); @@ -258,7 +259,7 @@ std::map merge_maps(const std::map> compiles; -void string_to_spv(const std::string& _name, const std::string& in_fname, const std::map& defines, bool fp16 = true, bool coopmat2 = false, bool f16acc = false) { +void string_to_spv(const std::string& _name, const std::string& in_fname, const std::map& defines, bool fp16 = true, bool coopmat = false, bool coopmat2 = false, bool f16acc = false) { { // wait until fewer than N compiles are in progress. // 16 is an arbitrary limit, the goal is to avoid "failed to create pipe" errors. @@ -269,10 +270,10 @@ void string_to_spv(const std::string& _name, const std::string& in_fname, const } compile_count++; } - compiles.push_back(std::async(string_to_spv_func, _name, in_fname, defines, fp16, coopmat2, f16acc)); + compiles.push_back(std::async(string_to_spv_func, _name, in_fname, defines, fp16, coopmat, coopmat2, f16acc)); } -void matmul_shaders(bool fp16, bool matmul_id, bool coopmat2, bool f16acc) { +void matmul_shaders(bool fp16, bool matmul_id, bool coopmat, bool coopmat2, bool f16acc) { std::string load_vec = coopmat2 ? "1" : fp16 ? "8" : "4"; std::string aligned_b_type_f32 = coopmat2 ? "float" : fp16 ? "mat2x4" : "vec4"; std::string aligned_b_type_f16 = coopmat2 ? "float16_t" : fp16 ? "f16mat2x4" : "f16vec4"; @@ -291,14 +292,20 @@ void matmul_shaders(bool fp16, bool matmul_id, bool coopmat2, bool f16acc) { base_dict["ACC_TYPE"] = f16acc ? "float16_t" : "float"; + if (coopmat) { + base_dict["COOPMAT"] = "1"; + } + + base_dict["ACC_TYPE"] = f16acc ? "float16_t" : "float"; + std::string source_name = coopmat2 ? "mul_mm_cm2.comp" : "mul_mm.comp"; // Shaders with f16 B_TYPE - string_to_spv(shader_name + "_f32_f16", source_name, merge_maps(base_dict, {{"DATA_A_F32", "1"}, {"B_TYPE", "float16_t"}, {"D_TYPE", "float"}, }), fp16, coopmat2, f16acc); - string_to_spv(shader_name + "_f32_f16_aligned", source_name, merge_maps(base_dict, {{"DATA_A_F32", "1"}, {"LOAD_VEC_A", load_vec}, {"LOAD_VEC_B", load_vec}, {"B_TYPE", aligned_b_type_f16}, {"D_TYPE", "float"}, {"ALIGNED", "1"}}), fp16, coopmat2, f16acc); + string_to_spv(shader_name + "_f32_f16", source_name, merge_maps(base_dict, {{"DATA_A_F32", "1"}, {"B_TYPE", "float16_t"}, {"D_TYPE", "float"}, }), fp16, coopmat, coopmat2, f16acc); + string_to_spv(shader_name + "_f32_f16_aligned", source_name, merge_maps(base_dict, {{"DATA_A_F32", "1"}, {"LOAD_VEC_A", load_vec}, {"LOAD_VEC_B", load_vec}, {"B_TYPE", aligned_b_type_f16}, {"D_TYPE", "float"}, {"ALIGNED", "1"}}), fp16, coopmat, coopmat2, f16acc); - string_to_spv(shader_name + "_f16_aligned", source_name, merge_maps(base_dict, {{"DATA_A_F16", "1"}, {"LOAD_VEC_A", load_vec}, {"LOAD_VEC_B", load_vec}, {"B_TYPE", aligned_b_type_f16}, {"D_TYPE", "float"}, {"ALIGNED", "1"}}), fp16, coopmat2, f16acc); - string_to_spv(shader_name + "_f16", source_name, merge_maps(base_dict, {{"DATA_A_F16", "1"}, {"B_TYPE", "float16_t"}, {"D_TYPE", "float"}}), fp16, coopmat2, f16acc); + string_to_spv(shader_name + "_f16_aligned", source_name, merge_maps(base_dict, {{"DATA_A_F16", "1"}, {"LOAD_VEC_A", load_vec}, {"LOAD_VEC_B", load_vec}, {"B_TYPE", aligned_b_type_f16}, {"D_TYPE", "float"}, {"ALIGNED", "1"}}), fp16, coopmat, coopmat2, f16acc); + string_to_spv(shader_name + "_f16", source_name, merge_maps(base_dict, {{"DATA_A_F16", "1"}, {"B_TYPE", "float16_t"}, {"D_TYPE", "float"}}), fp16, coopmat, coopmat2, f16acc); for (const auto& tname : type_names) { std::string data_a_key = "DATA_A_" + to_uppercase(tname); @@ -307,12 +314,12 @@ void matmul_shaders(bool fp16, bool matmul_id, bool coopmat2, bool f16acc) { // For aligned matmul loads std::string load_vec_a = (coopmat2 || tname == "f32" || tname == "f16") ? load_vec : "2"; - string_to_spv(shader_name + "_" + tname + "_f32", source_name, merge_maps(base_dict, {{data_a_key, "1"}, {"LOAD_VEC_A", load_vec_a_unaligned}, {"B_TYPE", "float"}, {"D_TYPE", "float"}, {"B_IS_FLOAT", "1"}}), fp16, coopmat2, f16acc); - string_to_spv(shader_name + "_" + tname + "_f32_aligned", source_name, merge_maps(base_dict, {{data_a_key, "1"}, {"LOAD_VEC_A", load_vec_a}, {"LOAD_VEC_B", load_vec}, {"B_TYPE", aligned_b_type_f32}, {"D_TYPE", "float"}, {"B_IS_FLOAT", "1"}, {"ALIGNED", "1"}}), fp16, coopmat2, f16acc); + string_to_spv(shader_name + "_" + tname + "_f32", source_name, merge_maps(base_dict, {{data_a_key, "1"}, {"LOAD_VEC_A", load_vec_a_unaligned}, {"B_TYPE", "float"}, {"D_TYPE", "float"}, {"B_IS_FLOAT", "1"}}), fp16, coopmat, coopmat2, f16acc); + string_to_spv(shader_name + "_" + tname + "_f32_aligned", source_name, merge_maps(base_dict, {{data_a_key, "1"}, {"LOAD_VEC_A", load_vec_a}, {"LOAD_VEC_B", load_vec}, {"B_TYPE", aligned_b_type_f32}, {"D_TYPE", "float"}, {"B_IS_FLOAT", "1"}, {"ALIGNED", "1"}}), fp16, coopmat, coopmat2, f16acc); if (tname != "f16" && tname != "f32") { - string_to_spv(shader_name + "_" + tname + "_f16", source_name, merge_maps(base_dict, {{data_a_key, "1"}, {"LOAD_VEC_A", load_vec_a_unaligned}, {"B_TYPE", "float16_t"}, {"D_TYPE", "float"}, {"B_IS_FLOAT", "1"}}), fp16, coopmat2, f16acc); - string_to_spv(shader_name + "_" + tname + "_f16_aligned", source_name, merge_maps(base_dict, {{data_a_key, "1"}, {"LOAD_VEC_A", load_vec_a}, {"LOAD_VEC_B", load_vec}, {"B_TYPE", aligned_b_type_f16}, {"D_TYPE", "float"}, {"B_IS_FLOAT", "1"}, {"ALIGNED", "1"}}), fp16, coopmat2, f16acc); + string_to_spv(shader_name + "_" + tname + "_f16", source_name, merge_maps(base_dict, {{data_a_key, "1"}, {"LOAD_VEC_A", load_vec_a_unaligned}, {"B_TYPE", "float16_t"}, {"D_TYPE", "float"}, {"B_IS_FLOAT", "1"}}), fp16, coopmat, coopmat2, f16acc); + string_to_spv(shader_name + "_" + tname + "_f16_aligned", source_name, merge_maps(base_dict, {{data_a_key, "1"}, {"LOAD_VEC_A", load_vec_a}, {"LOAD_VEC_B", load_vec}, {"B_TYPE", aligned_b_type_f16}, {"D_TYPE", "float"}, {"B_IS_FLOAT", "1"}, {"ALIGNED", "1"}}), fp16, coopmat, coopmat2, f16acc); } } } @@ -322,25 +329,24 @@ void process_shaders() { std::map base_dict = {{"FLOAT_TYPE", "float"}}; // matmul - for (const auto& fp16 : {false, true}) { - for (const auto& matmul_id : {false, true}) { - for (const auto& coopmat2 : {false, true}) { - for (const auto& f16acc : {false, true}) { -#if !defined(VK_NV_cooperative_matrix2) - if (coopmat2) { - continue; - } + for (const auto& matmul_id : {false, true}) { + // No coopmats + // fp32 + matmul_shaders(false, matmul_id, false, false, false); + + // fp16, fp32acc and fp16acc + matmul_shaders(true, matmul_id, false, false, false); + matmul_shaders(true, matmul_id, false, false, true); + + // Coopmat, fp32acc and fp16acc + matmul_shaders(true, matmul_id, true, false, false); + matmul_shaders(true, matmul_id, true, false, true); + +#if defined(VK_NV_cooperative_matrix2) + // Coopmat2, fp32acc and fp16acc + matmul_shaders(true, matmul_id, false, true, false); + matmul_shaders(true, matmul_id, false, true, true); #endif - if (coopmat2 && !fp16) { - continue; - } - if (!coopmat2 && f16acc) { - continue; - } - matmul_shaders(fp16, matmul_id, coopmat2, f16acc); - } - } - } } #if defined(VK_NV_cooperative_matrix2) @@ -355,11 +361,11 @@ void process_shaders() { if (tname == "f16") { string_to_spv("flash_attn_f32_f16_" + tname, "flash_attn_cm2.comp", - merge_maps(base_dict, {{"Q_TYPE", "float"}, {"D_TYPE", "float"}, {"ACC_TYPE", acctype}}), true, true, f16acc); + merge_maps(base_dict, {{"Q_TYPE", "float"}, {"D_TYPE", "float"}, {"ACC_TYPE", acctype}}), true, false, true, f16acc); } else { std::string data_a_key = "DATA_A_" + to_uppercase(tname); string_to_spv("flash_attn_f32_f16_" + tname, "flash_attn_cm2.comp", - merge_maps(base_dict, {{data_a_key, "1"}, {"Q_TYPE", "float"}, {"D_TYPE", "float"}, {"ACC_TYPE", acctype}, {"DEQUANTFUNC", "dequantFunc"+to_uppercase(tname) }, {"BLOCK_SIZE", "QUANT_K_"+to_uppercase(tname) }}), true, true, f16acc); + merge_maps(base_dict, {{data_a_key, "1"}, {"Q_TYPE", "float"}, {"D_TYPE", "float"}, {"ACC_TYPE", acctype}, {"DEQUANTFUNC", "dequantFunc"+to_uppercase(tname) }, {"BLOCK_SIZE", "QUANT_K_"+to_uppercase(tname) }}), true, false, true, f16acc); } } } @@ -524,6 +530,7 @@ void write_output_files() { fclose(hdr); fclose(src); } +} int main(int argc, char** argv) { std::map args; From 2911db5ea0abd940af30e6a57a33ba54f5ee1df0 Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Sat, 7 Dec 2024 11:52:44 +0200 Subject: [PATCH 159/245] server : fix free of spec context and batch (#10651) ggml-ci --- common/speculative.cpp | 4 ++++ examples/server/server.cpp | 2 +- 2 files changed, 5 insertions(+), 1 deletion(-) diff --git a/common/speculative.cpp b/common/speculative.cpp index e559675c436ef..3fcbb0020b6af 100644 --- a/common/speculative.cpp +++ b/common/speculative.cpp @@ -62,6 +62,10 @@ struct common_speculative * common_speculative_init( } void common_speculative_free(struct common_speculative * spec) { + if (spec == nullptr) { + return; + } + common_sampler_free(spec->smpl); llama_batch_free(spec->batch); diff --git a/examples/server/server.cpp b/examples/server/server.cpp index 809fafa187add..d57a296a2de07 100644 --- a/examples/server/server.cpp +++ b/examples/server/server.cpp @@ -720,7 +720,7 @@ struct server_slot { int id; int id_task = -1; - llama_batch batch_spec; + llama_batch batch_spec = {}; llama_context * ctx_dft = nullptr; From d90c615041e3bdda82412b03cf0d61977a509bf4 Mon Sep 17 00:00:00 2001 From: Djip007 <3705339+Djip007@users.noreply.github.com> Date: Sat, 7 Dec 2024 13:37:50 +0100 Subject: [PATCH 160/245] ggml : refactor online repacking (#10446) * rename ggml-cpu-aarch64.c to .cpp * reformat extra cpu backend. - clean Q4_0_N_M and IQ4_0_N_M - remove from "file" tensor type - allow only with dynamic repack - extract cpu extra bufts and convert to C++ - hbm - "aarch64" - more generic use of extra buffer - generalise extra_supports_op - new API for "cpu-accel": - amx - aarch64 * clang-format * Clean Q4_0_N_M ref Enable restrict on C++ * add op GGML_OP_MUL_MAT_ID for Q4_0_N_M with runtime repack * added/corrected control on tensor size for Q4 repacking. * Update ggml/src/ggml-cpu/ggml-cpu-aarch64.cpp Co-authored-by: Georgi Gerganov * Update ggml/src/ggml-cpu/ggml-cpu-aarch64.cpp Co-authored-by: Georgi Gerganov * add debug logs on repacks. --------- Co-authored-by: Georgi Gerganov --- Makefile | 22 +- Package.swift | 5 +- docs/build.md | 2 +- examples/quantize/README.md | 2 - examples/quantize/quantize.cpp | 3 - ggml/include/ggml-cpu.h | 17 - ggml/include/ggml.h | 29 +- ggml/src/CMakeLists.txt | 4 +- ggml/src/ggml-aarch64.c | 129 ---- ggml/src/ggml-aarch64.h | 19 - ggml/src/ggml-cann/ggml-cann.cpp | 2 +- ggml/src/ggml-common.h | 88 +-- ggml/src/ggml-cpu/CMakeLists.txt | 6 +- ggml/src/ggml-cpu/amx/amx.cpp | 160 ++-- ggml/src/ggml-cpu/amx/amx.h | 14 +- ggml/src/ggml-cpu/amx/common.h | 19 +- ggml/src/ggml-cpu/amx/mmq.cpp | 17 +- ggml/src/ggml-cpu/amx/mmq.h | 8 +- ...gml-cpu-aarch64.c => ggml-cpu-aarch64.cpp} | 717 ++++++++++++++---- ggml/src/ggml-cpu/ggml-cpu-aarch64.h | 28 +- ggml/src/ggml-cpu/ggml-cpu-hbm.cpp | 55 ++ ggml/src/ggml-cpu/ggml-cpu-hbm.h | 8 + ggml/src/ggml-cpu/ggml-cpu-traits.cpp | 36 + ggml/src/ggml-cpu/ggml-cpu-traits.h | 38 + ggml/src/ggml-cpu/ggml-cpu.c | 442 ++++------- ggml/src/ggml-cpu/ggml-cpu.cpp | 172 +---- ggml/src/ggml-cuda/ggml-cuda.cu | 2 +- ggml/src/ggml-quants.c | 9 - ggml/src/ggml-sycl/ggml-sycl.cpp | 2 +- ggml/src/ggml.c | 86 ++- gguf-py/gguf/constants.py | 12 +- include/llama.h | 6 +- src/llama.cpp | 24 +- 33 files changed, 1135 insertions(+), 1048 deletions(-) delete mode 100644 ggml/src/ggml-aarch64.c delete mode 100644 ggml/src/ggml-aarch64.h rename ggml/src/ggml-cpu/{ggml-cpu-aarch64.c => ggml-cpu-aarch64.cpp} (85%) create mode 100644 ggml/src/ggml-cpu/ggml-cpu-hbm.cpp create mode 100644 ggml/src/ggml-cpu/ggml-cpu-hbm.h create mode 100644 ggml/src/ggml-cpu/ggml-cpu-traits.cpp create mode 100644 ggml/src/ggml-cpu/ggml-cpu-traits.h diff --git a/Makefile b/Makefile index d76c4ad535815..bcea450e46be4 100644 --- a/Makefile +++ b/Makefile @@ -445,6 +445,10 @@ ifeq ($(UNAME_M),$(filter $(UNAME_M),x86_64 i686 amd64)) MK_CFLAGS += -march=native -mtune=native HOST_CXXFLAGS += -march=native -mtune=native + # Usage AMX build test + #MK_CFLAGS += -march=graniterapids -mtune=graniterapids + #HOST_CXXFLAGS += -march=graniterapids -mtune=graniterapids + # Usage AVX-only #MK_CFLAGS += -mfma -mf16c -mavx #MK_CXXFLAGS += -mfma -mf16c -mavx @@ -948,7 +952,6 @@ DIR_COMMON = common OBJ_GGML = \ $(DIR_GGML)/src/ggml.o \ - $(DIR_GGML)/src/ggml-aarch64.o \ $(DIR_GGML)/src/ggml-alloc.o \ $(DIR_GGML)/src/ggml-backend.o \ $(DIR_GGML)/src/ggml-backend-reg.o \ @@ -956,9 +959,11 @@ OBJ_GGML = \ $(DIR_GGML)/src/ggml-quants.o \ $(DIR_GGML)/src/ggml-threading.o \ $(DIR_GGML)/src/ggml-cpu/ggml-cpu.o \ - $(DIR_GGML)/src/ggml-cpu/ggml-cpu-cpp.o \ + $(DIR_GGML)/src/ggml-cpu/ggml-cpu_cpp.o \ $(DIR_GGML)/src/ggml-cpu/ggml-cpu-aarch64.o \ + $(DIR_GGML)/src/ggml-cpu/ggml-cpu-hbm.o \ $(DIR_GGML)/src/ggml-cpu/ggml-cpu-quants.o \ + $(DIR_GGML)/src/ggml-cpu/ggml-cpu-traits.o \ $(OBJ_GGML_EXT) OBJ_LLAMA = \ @@ -1098,17 +1103,10 @@ DEP_FILES = $(OBJ_GGML:.o=.d) $(OBJ_LLAMA:.o=.d) $(OBJ_COMMON:.o=.d) # Default target all: $(BUILD_TARGETS) +# force c++ build for source file that have same name as c file # Note: need this exception because `ggml-cpu.c` and `ggml-cpu.cpp` both produce the same obj/dep files -# g++ -M -I ./ggml/include/ -I ./ggml/src ggml/src/ggml-cpu/ggml-cpu.cpp | grep ggml -$(DIR_GGML)/src/ggml-cpu/ggml-cpu-cpp.o: \ - ggml/src/ggml-cpu/ggml-cpu.cpp \ - ggml/include/ggml-backend.h \ - ggml/include/ggml.h \ - ggml/include/ggml-alloc.h \ - ggml/src/ggml-backend-impl.h \ - ggml/include/ggml-cpu.h \ - ggml/src/ggml-impl.h - $(CXX) $(CXXFLAGS) -c $< -o $@ +$(DIR_GGML)/%_cpp.o: $(DIR_GGML)/%.cpp + $(CXX) $(CXXFLAGS) -MMD -c $< -o $@ # Rules for building object files $(DIR_GGML)/%.o: $(DIR_GGML)/%.c diff --git a/Package.swift b/Package.swift index d32b74a63fbfa..3afeb2f1930e4 100644 --- a/Package.swift +++ b/Package.swift @@ -10,14 +10,15 @@ var sources = [ "src/unicode.cpp", "src/unicode-data.cpp", "ggml/src/ggml.c", - "ggml/src/ggml-aarch64.c", "ggml/src/ggml-alloc.c", "ggml/src/ggml-backend.cpp", "ggml/src/ggml-backend-reg.cpp", "ggml/src/ggml-cpu/ggml-cpu.c", "ggml/src/ggml-cpu/ggml-cpu.cpp", - "ggml/src/ggml-cpu/ggml-cpu-aarch64.c", + "ggml/src/ggml-cpu/ggml-cpu-aarch64.cpp", + "ggml/src/ggml-cpu/ggml-cpu-hbm.cpp", "ggml/src/ggml-cpu/ggml-cpu-quants.c", + "ggml/src/ggml-cpu/ggml-cpu-traits.cpp", "ggml/src/ggml-threading.cpp", "ggml/src/ggml-quants.c", ] diff --git a/docs/build.md b/docs/build.md index a4964cbd14909..26e6737888b78 100644 --- a/docs/build.md +++ b/docs/build.md @@ -55,7 +55,7 @@ cmake --build build --config Release cmake --preset arm64-windows-llvm-release -D GGML_OPENMP=OFF cmake --build build-arm64-windows-llvm-release ``` - Building for arm64 can also be done with the MSVC compiler with the build-arm64-windows-MSVC preset, or the standard CMake build instructions. However, note that the MSVC compiler does not support inline ARM assembly code, used e.g. for the accelerated Q4_0_4_8 CPU kernels. + Building for arm64 can also be done with the MSVC compiler with the build-arm64-windows-MSVC preset, or the standard CMake build instructions. However, note that the MSVC compiler does not support inline ARM assembly code, used e.g. for the accelerated Q4_0_N_M CPU kernels. ## BLAS Build diff --git a/examples/quantize/README.md b/examples/quantize/README.md index 704f0d56bea72..5d1e11c67b13f 100644 --- a/examples/quantize/README.md +++ b/examples/quantize/README.md @@ -54,8 +54,6 @@ As the models are currently fully loaded into memory, you will need adequate dis Several quantization methods are supported. They differ in the resulting model disk size and inference speed. -The quantization formats `Q4_0_4_4`, `Q4_0_4_8` and `Q4_0_8_8` are block interleaved variants of the `Q4_0` format, providing a data layout that is better suited for specific implementations of optimized mulmat kernels. Since these formats differ only in data layout, they have the same quantized size as the `Q4_0` format. - *(outdated)* | Model | Measure | F16 | Q4_0 | Q4_1 | Q5_0 | Q5_1 | Q8_0 | diff --git a/examples/quantize/quantize.cpp b/examples/quantize/quantize.cpp index b989932107dba..8d47b17b6bce7 100644 --- a/examples/quantize/quantize.cpp +++ b/examples/quantize/quantize.cpp @@ -48,9 +48,6 @@ static const std::vector QUANT_OPTIONS = { { "Q5_K_M", LLAMA_FTYPE_MOSTLY_Q5_K_M, " 5.33G, +0.0569 ppl @ Llama-3-8B", }, { "Q6_K", LLAMA_FTYPE_MOSTLY_Q6_K, " 6.14G, +0.0217 ppl @ Llama-3-8B", }, { "Q8_0", LLAMA_FTYPE_MOSTLY_Q8_0, " 7.96G, +0.0026 ppl @ Llama-3-8B", }, - { "Q4_0_4_4", LLAMA_FTYPE_MOSTLY_Q4_0_4_4, " 4.34G, +0.4685 ppl @ Llama-3-8B", }, - { "Q4_0_4_8", LLAMA_FTYPE_MOSTLY_Q4_0_4_8, " 4.34G, +0.4685 ppl @ Llama-3-8B", }, - { "Q4_0_8_8", LLAMA_FTYPE_MOSTLY_Q4_0_8_8, " 4.34G, +0.4685 ppl @ Llama-3-8B", }, { "F16", LLAMA_FTYPE_MOSTLY_F16, "14.00G, +0.0020 ppl @ Mistral-7B", }, { "BF16", LLAMA_FTYPE_MOSTLY_BF16, "14.00G, -0.0050 ppl @ Mistral-7B", }, { "F32", LLAMA_FTYPE_ALL_F32, "26.00G @ 7B", }, diff --git a/ggml/include/ggml-cpu.h b/ggml/include/ggml-cpu.h index e14ea9ea5301f..3aa71badb5fb0 100644 --- a/ggml/include/ggml-cpu.h +++ b/ggml/include/ggml-cpu.h @@ -103,24 +103,14 @@ extern "C" { // Internal types and functions exposed for tests and benchmarks - typedef void (*ggml_from_float_to_mat_t) - (const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int64_t nr, int64_t k, int64_t bs); typedef void (*ggml_vec_dot_t) (int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT x, size_t bx, const void * GGML_RESTRICT y, size_t by, int nrc); - typedef void (*ggml_gemv_t) (int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT x, - const void * GGML_RESTRICT y, int nr, int nc); - typedef void (*ggml_gemm_t) (int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT x, - const void * GGML_RESTRICT y, int nr, int nc); struct ggml_type_traits_cpu { ggml_from_float_t from_float; - ggml_from_float_to_mat_t from_float_to_mat; ggml_vec_dot_t vec_dot; enum ggml_type vec_dot_type; int64_t nrows; // number of rows to process simultaneously - int64_t ncols; // number of columns to process simultaneously - ggml_gemv_t gemv; - ggml_gemm_t gemm; }; GGML_BACKEND_API const struct ggml_type_traits_cpu * ggml_get_type_traits_cpu(enum ggml_type type); @@ -140,13 +130,6 @@ extern "C" { GGML_BACKEND_API ggml_backend_reg_t ggml_backend_cpu_reg(void); -#ifdef GGML_USE_CPU_HBM - GGML_BACKEND_API ggml_backend_buffer_type_t ggml_backend_cpu_hbm_buffer_type(void); -#endif - - GGML_BACKEND_API ggml_backend_buffer_type_t ggml_backend_cpu_aarch64_buffer_type(void); - GGML_BACKEND_API bool ggml_backend_cpu_buft_is_aarch64(ggml_backend_buffer_type_t buft); - #ifdef __cplusplus } #endif diff --git a/ggml/include/ggml.h b/ggml/include/ggml.h index 1c8cc11b6b24d..386d5a15d81a1 100644 --- a/ggml/include/ggml.h +++ b/ggml/include/ggml.h @@ -384,15 +384,15 @@ extern "C" { GGML_TYPE_F64 = 28, GGML_TYPE_IQ1_M = 29, GGML_TYPE_BF16 = 30, - GGML_TYPE_Q4_0_4_4 = 31, - GGML_TYPE_Q4_0_4_8 = 32, - GGML_TYPE_Q4_0_8_8 = 33, + // GGML_TYPE_Q4_0_4_4 = 31, support has been removed from gguf files + // GGML_TYPE_Q4_0_4_8 = 32, + // GGML_TYPE_Q4_0_8_8 = 33, GGML_TYPE_TQ1_0 = 34, GGML_TYPE_TQ2_0 = 35, - GGML_TYPE_IQ4_NL_4_4 = 36, + // GGML_TYPE_IQ4_NL_4_4 = 36, // GGML_TYPE_IQ4_NL_4_8 = 37, // GGML_TYPE_IQ4_NL_8_8 = 38, - GGML_TYPE_COUNT, + GGML_TYPE_COUNT = 39, }; // precision @@ -433,9 +433,6 @@ extern "C" { GGML_FTYPE_MOSTLY_IQ4_XS = 22, // except 1d tensors GGML_FTYPE_MOSTLY_IQ1_M = 23, // except 1d tensors GGML_FTYPE_MOSTLY_BF16 = 24, // except 1d tensors - GGML_FTYPE_MOSTLY_Q4_0_4_4 = 25, // except 1d tensors - GGML_FTYPE_MOSTLY_Q4_0_4_8 = 26, // except 1d tensors - GGML_FTYPE_MOSTLY_Q4_0_8_8 = 27, // except 1d tensors }; // available tensor operations: @@ -2205,11 +2202,19 @@ extern "C" { GGML_API size_t gguf_get_meta_size(const struct gguf_context * ctx); GGML_API void gguf_get_meta_data(const struct gguf_context * ctx, void * data); -#ifdef __cplusplus -// restrict not standard in C++ -#define GGML_RESTRICT +#ifdef __cplusplus + // restrict not standard in C++ +# if defined(__GNUC__) +# define GGML_RESTRICT __restrict__ +# elif defined(__clang__) +# define GGML_RESTRICT __restrict +# elif defined(_MSC_VER) +# define GGML_RESTRICT __restrict +# else +# define GGML_RESTRICT +# endif #else -#define GGML_RESTRICT restrict +# define GGML_RESTRICT restrict #endif typedef void (*ggml_to_float_t) (const void * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k); typedef void (*ggml_from_float_t)(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int64_t k); diff --git a/ggml/src/CMakeLists.txt b/ggml/src/CMakeLists.txt index f07533fdb316a..a267a8b596bd1 100644 --- a/ggml/src/CMakeLists.txt +++ b/ggml/src/CMakeLists.txt @@ -220,9 +220,7 @@ add_library(ggml-base ggml-threading.cpp ggml-threading.h ggml-quants.c - ggml-quants.h - ggml-aarch64.c - ggml-aarch64.h) + ggml-quants.h) target_include_directories(ggml-base PRIVATE .) diff --git a/ggml/src/ggml-aarch64.c b/ggml/src/ggml-aarch64.c deleted file mode 100644 index 0139120519cea..0000000000000 --- a/ggml/src/ggml-aarch64.c +++ /dev/null @@ -1,129 +0,0 @@ -#define GGML_COMMON_DECL_C -#include "ggml-common.h" - -#include "ggml-aarch64.h" -#include "ggml-impl.h" -#include "ggml-quants.h" -#include - -#define UNUSED GGML_UNUSED - -static block_q4_0x4 make_block_q4_0x4(block_q4_0 * in, unsigned int blck_size_interleave) { - block_q4_0x4 out; - - for (int i = 0; i < 4; i++) { - out.d[i] = in[i].d; - } - - const int end = QK4_0 * 2 / blck_size_interleave; - - if (blck_size_interleave == 8) { - const uint64_t xor_mask = 0x8888888888888888ULL; - for (int i = 0; i < end; ++i) { - int src_id = i % 4; - int src_offset = (i / 4) * blck_size_interleave; - int dst_offset = i * blck_size_interleave; - - uint64_t elems; - // Using memcpy to avoid unaligned memory accesses - memcpy(&elems, &in[src_id].qs[src_offset], sizeof(uint64_t)); - elems ^= xor_mask; - memcpy(&out.qs[dst_offset], &elems, sizeof(uint64_t)); - } - } else if (blck_size_interleave == 4) { - const uint32_t xor_mask = 0x88888888; - for (int i = 0; i < end; ++i) { - int src_id = i % 4; - int src_offset = (i / 4) * blck_size_interleave; - int dst_offset = i * blck_size_interleave; - - uint32_t elems; - memcpy(&elems, &in[src_id].qs[src_offset], sizeof(uint32_t)); - elems ^= xor_mask; - memcpy(&out.qs[dst_offset], &elems, sizeof(uint32_t)); - } - } else { - GGML_ASSERT(false); - } - - return out; -} - -// interleave 8 block_q4_0s in blocks of blck_size_interleave -// returns an interleaved block_q4_0x8 -// in the interleaved block_q4_0x8, place deltas for 8 block_q4_0 blocks -// first, then interleave quants from 8 block_q4_0s in blocks of blck_size_interleave -static block_q4_0x8 make_block_q4_0x8(block_q4_0 * in, unsigned int blck_size_interleave) { - block_q4_0x8 out; - - for (int i = 0; i < 8; i++) { - out.d[i] = in[i].d; - } - - const int end = QK4_0 * 4 / blck_size_interleave; - const uint64_t xor_mask = 0x8888888888888888ULL; - - for (int i = 0; i < end; ++i) { - int src_id = i % 8; - int src_offset = (i / 8) * blck_size_interleave; - int dst_offset = i * blck_size_interleave; - - uint64_t elems; - memcpy(&elems, &in[src_id].qs[src_offset], sizeof(uint64_t)); - elems ^= xor_mask; - memcpy(&out.qs[dst_offset], &elems, sizeof(uint64_t)); - } - - return out; -} - -static size_t quantize_q4_0_nr_bl(const float * restrict src, void * restrict dst, int64_t nrow, int64_t n_per_row, int nrows_interleaved, int blck_size_interleave) { - assert(n_per_row % QK4_0 == 0); - const int nb = n_per_row / QK4_0; - - void * out_ptr = NULL; - if (nrows_interleaved == 8) { - out_ptr = (block_q4_0x8 *) dst; - } - else if (nrows_interleaved == 4) { - out_ptr = (block_q4_0x4 *) dst; - } - assert(nrows_interleaved <= 8); - block_q4_0 dst_tmp[8]; - - for (int b = 0; b < (nrow * n_per_row); b += nrows_interleaved * n_per_row) { - - for (int64_t x = 0; x < nb; x++) { - - for (int i = 0; i < nrows_interleaved; i++ ) { - quantize_row_q4_0_ref(src + b + i * n_per_row + x * QK4_0, (block_q4_0 *) dst_tmp + i, QK4_0); - } - - if (nrows_interleaved == 8) { - *(block_q4_0x8 *) out_ptr = make_block_q4_0x8(dst_tmp, blck_size_interleave); - out_ptr = (block_q4_0x8 *) out_ptr + 1; - } - else if (nrows_interleaved == 4) { - *(block_q4_0x4 *) out_ptr = make_block_q4_0x4(dst_tmp, blck_size_interleave); - out_ptr = (block_q4_0x4 *) out_ptr + 1; - } - } - } - - return ((nrow * n_per_row) / QK4_0 * sizeof(block_q4_0)); -} - -size_t quantize_q4_0_4x4(const float * restrict src, void * restrict dst, int64_t nrow, int64_t n_per_row, const float * quant_weights) { - UNUSED(quant_weights); - return quantize_q4_0_nr_bl(src, dst, nrow, n_per_row, 4, 4); -} - -size_t quantize_q4_0_4x8(const float * restrict src, void * restrict dst, int64_t nrow, int64_t n_per_row, const float * quant_weights) { - UNUSED(quant_weights); - return quantize_q4_0_nr_bl(src, dst, nrow, n_per_row, 4, 8); -} - -size_t quantize_q4_0_8x8(const float * restrict src, void * restrict dst, int64_t nrow, int64_t n_per_row, const float * quant_weights) { - UNUSED(quant_weights); - return quantize_q4_0_nr_bl(src, dst, nrow, n_per_row, 8, 8); -} diff --git a/ggml/src/ggml-aarch64.h b/ggml/src/ggml-aarch64.h deleted file mode 100644 index a578685911be6..0000000000000 --- a/ggml/src/ggml-aarch64.h +++ /dev/null @@ -1,19 +0,0 @@ -#pragma once - -#include "ggml.h" - -// GGML internal header - -#ifdef __cplusplus -extern "C" { -#endif - -// Quantization utilizing an importance matrix (a.k.a. "Activation aWare Quantization") -size_t quantize_q4_0_4x4(const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int64_t nrows, int64_t n_per_row, const float * imatrix); -size_t quantize_q4_0_4x8(const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int64_t nrows, int64_t n_per_row, const float * imatrix); -size_t quantize_q4_0_8x8(const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int64_t nrows, int64_t n_per_row, const float * imatrix); - -#ifdef __cplusplus -} -#endif - diff --git a/ggml/src/ggml-cann/ggml-cann.cpp b/ggml/src/ggml-cann/ggml-cann.cpp index 04e25b8ab1a23..fa04ab84f3f15 100644 --- a/ggml/src/ggml-cann/ggml-cann.cpp +++ b/ggml/src/ggml-cann/ggml-cann.cpp @@ -2089,7 +2089,7 @@ static void * ggml_backend_cann_reg_get_proc_address(ggml_backend_reg_t reg, con static const ggml_backend_reg_i ggml_backend_cann_reg_interface = { /* .get_name = */ ggml_backend_cann_reg_get_name, /* .get_device_count = */ ggml_backend_cann_reg_get_device_count, - /* .get_device_get = */ ggml_backend_cann_reg_get_device, + /* .get_device = */ ggml_backend_cann_reg_get_device, /* .get_proc_address = */ ggml_backend_cann_reg_get_proc_address, }; diff --git a/ggml/src/ggml-common.h b/ggml/src/ggml-common.h index 27253a6c2b3ca..7fd2aadeca1af 100644 --- a/ggml/src/ggml-common.h +++ b/ggml/src/ggml-common.h @@ -6,7 +6,20 @@ typedef uint16_t ggml_half; typedef uint32_t ggml_half2; -#define GGML_COMMON_AGGR +#define GGML_COMMON_AGGR_U +#define GGML_COMMON_AGGR_S + +#define GGML_COMMON_DECL +#elif defined(GGML_COMMON_DECL_CPP) +#include + +typedef uint16_t ggml_half; +typedef uint32_t ggml_half2; + +// std-c++ allow anonymous unions but some compiler warn on it +#define GGML_COMMON_AGGR_U data +// std-c++ do not allow it. +#define GGML_COMMON_AGGR_S data #define GGML_COMMON_DECL #elif defined(GGML_COMMON_DECL_METAL) @@ -15,7 +28,8 @@ typedef uint32_t ggml_half2; typedef half ggml_half; typedef half2 ggml_half2; -#define GGML_COMMON_AGGR +#define GGML_COMMON_AGGR_U +#define GGML_COMMON_AGGR_S #define GGML_COMMON_DECL #elif defined(GGML_COMMON_DECL_CUDA) @@ -29,7 +43,8 @@ typedef half2 ggml_half2; typedef half ggml_half; typedef half2 ggml_half2; -#define GGML_COMMON_AGGR data +#define GGML_COMMON_AGGR_U +#define GGML_COMMON_AGGR_S data #define GGML_COMMON_DECL #elif defined(GGML_COMMON_DECL_HIP) @@ -39,7 +54,8 @@ typedef half2 ggml_half2; typedef half ggml_half; typedef half2 ggml_half2; -#define GGML_COMMON_AGGR data +#define GGML_COMMON_AGGR_U +#define GGML_COMMON_AGGR_S data #define GGML_COMMON_DECL #elif defined(GGML_COMMON_DECL_SYCL) @@ -49,7 +65,8 @@ typedef half2 ggml_half2; typedef sycl::half ggml_half; typedef sycl::half2 ggml_half2; -#define GGML_COMMON_AGGR data +#define GGML_COMMON_AGGR_U +#define GGML_COMMON_AGGR_S data #define GGML_COMMON_DECL #endif @@ -154,9 +171,9 @@ typedef struct { struct { ggml_half d; // delta ggml_half m; // min - } GGML_COMMON_AGGR; + } GGML_COMMON_AGGR_S; ggml_half2 dm; - }; + } GGML_COMMON_AGGR_U; uint8_t qs[QK4_1 / 2]; // nibbles / quants } block_q4_1; static_assert(sizeof(block_q4_1) == 2 * sizeof(ggml_half) + QK4_1 / 2, "wrong q4_1 block size/padding"); @@ -175,9 +192,9 @@ typedef struct { struct { ggml_half d; // delta ggml_half m; // min - } GGML_COMMON_AGGR; + } GGML_COMMON_AGGR_S; ggml_half2 dm; - }; + } GGML_COMMON_AGGR_U; uint8_t qh[4]; // 5-th bit of quants uint8_t qs[QK5_1 / 2]; // nibbles / quants } block_q5_1; @@ -196,37 +213,13 @@ typedef struct { struct { ggml_half d; // delta ggml_half s; // d * sum(qs[i]) - } GGML_COMMON_AGGR; + } GGML_COMMON_AGGR_S; ggml_half2 ds; - }; + } GGML_COMMON_AGGR_U; int8_t qs[QK8_1]; // quants } block_q8_1; static_assert(sizeof(block_q8_1) == 2*sizeof(ggml_half) + QK8_1, "wrong q8_1 block size/padding"); -typedef struct { - ggml_half d[4]; // deltas for 4 q4_0 blocks - uint8_t qs[QK4_0 * 2]; // nibbles / quants for 4 q4_0 blocks -} block_q4_0x4; -static_assert(sizeof(block_q4_0x4) == 4 * sizeof(ggml_half) + QK4_0 * 2, "wrong q4_0x4 block size/padding"); - -typedef struct { - ggml_half d[8]; // deltas for 8 q4_0 blocks - uint8_t qs[QK4_0 * 4]; // nibbles / quants for 8 q4_0 blocks -} block_q4_0x8; -static_assert(sizeof(block_q4_0x8) == 8 * sizeof(ggml_half) + QK4_0 * 4, "wrong q4_0x8 block size/padding"); - -typedef struct { - ggml_half d[4]; // deltas for 4 q8_0 blocks - int8_t qs[QK8_0 * 4]; // quants for 4 q8_0 blocks -} block_q8_0x4; -static_assert(sizeof(block_q8_0x4) == 4 * sizeof(ggml_half) + QK8_0 * 4, "wrong q8_0x4 block size/padding"); - -typedef struct { - ggml_half d[8]; // deltas for 8 q8_0 blocks - int8_t qs[QK8_0 * 8]; // quants for 8 q8_0 blocks -} block_q8_0x8; -static_assert(sizeof(block_q8_0x8) == 8 * sizeof(ggml_half) + QK8_0 * 8, "wrong q8_0x8 block size/padding"); - // // Ternary quantization // @@ -261,9 +254,9 @@ typedef struct { struct { ggml_half d; // super-block scale for quantized scales ggml_half dmin; // super-block scale for quantized mins - } GGML_COMMON_AGGR; + } GGML_COMMON_AGGR_S; ggml_half2 dm; - }; + } GGML_COMMON_AGGR_U; } block_q2_K; static_assert(sizeof(block_q2_K) == 2*sizeof(ggml_half) + QK_K/16 + QK_K/4, "wrong q2_K block size/padding"); @@ -288,9 +281,9 @@ typedef struct { struct { ggml_half d; // super-block scale for quantized scales ggml_half dmin; // super-block scale for quantized mins - } GGML_COMMON_AGGR; + } GGML_COMMON_AGGR_S; ggml_half2 dm; - }; + } GGML_COMMON_AGGR_U; uint8_t scales[K_SCALE_SIZE]; // scales and mins, quantized with 6 bits uint8_t qs[QK_K/2]; // 4--bit quants } block_q4_K; @@ -305,9 +298,9 @@ typedef struct { struct { ggml_half d; // super-block scale for quantized scales ggml_half dmin; // super-block scale for quantized mins - } GGML_COMMON_AGGR; + } GGML_COMMON_AGGR_S; ggml_half2 dm; - }; + } GGML_COMMON_AGGR_U; uint8_t scales[K_SCALE_SIZE]; // scales and mins, quantized with 6 bits uint8_t qh[QK_K/8]; // quants, high bit uint8_t qs[QK_K/2]; // quants, low 4 bits @@ -418,12 +411,6 @@ typedef struct { } block_iq4_xs; static_assert(sizeof(block_iq4_xs) == sizeof(ggml_half) + sizeof(uint16_t) + QK_K/64 + QK_K/2, "wrong iq4_xs block size/padding"); -typedef struct { - ggml_half d[4]; // deltas for 4 iq4_nl blocks - uint8_t qs[QK4_NL * 2];// nibbles / quants for 4 iq4_nl blocks -} block_iq4_nlx4; -static_assert(sizeof(block_iq4_nlx4) == 4 * sizeof(ggml_half) + QK4_NL * 2, "wrong iq4_nlx4 block size/padding"); - #endif // GGML_COMMON_DECL #endif // GGML_COMMON_DECL @@ -437,6 +424,13 @@ static_assert(sizeof(block_iq4_nlx4) == 4 * sizeof(ggml_half) + QK4_NL * 2, "wro #define GGML_TABLE_BEGIN(type, name, size) static const type name[size] = { #define GGML_TABLE_END() }; +#define GGML_COMMON_IMPL +#elif defined(GGML_COMMON_IMPL_CPP) +#include + +#define GGML_TABLE_BEGIN(type, name, size) static const type name[size] = { +#define GGML_TABLE_END() }; + #define GGML_COMMON_IMPL #elif defined(GGML_COMMON_IMPL_METAL) #include diff --git a/ggml/src/ggml-cpu/CMakeLists.txt b/ggml/src/ggml-cpu/CMakeLists.txt index bc326c0593024..0e0556703620f 100644 --- a/ggml/src/ggml-cpu/CMakeLists.txt +++ b/ggml/src/ggml-cpu/CMakeLists.txt @@ -10,10 +10,14 @@ function(ggml_add_cpu_backend_variant_impl tag_name) list (APPEND GGML_CPU_SOURCES ggml-cpu/ggml-cpu.c ggml-cpu/ggml-cpu.cpp - ggml-cpu/ggml-cpu-aarch64.c + ggml-cpu/ggml-cpu-aarch64.cpp ggml-cpu/ggml-cpu-aarch64.h + ggml-cpu/ggml-cpu-hbm.cpp + ggml-cpu/ggml-cpu-hbm.h ggml-cpu/ggml-cpu-quants.c ggml-cpu/ggml-cpu-quants.h + ggml-cpu/ggml-cpu-traits.cpp + ggml-cpu/ggml-cpu-traits.h ggml-cpu/amx/amx.cpp ggml-cpu/amx/amx.h ggml-cpu/amx/mmq.cpp diff --git a/ggml/src/ggml-cpu/amx/amx.cpp b/ggml/src/ggml-cpu/amx/amx.cpp index 09c0df0f5c253..b9074cb3aca55 100644 --- a/ggml/src/ggml-cpu/amx/amx.cpp +++ b/ggml/src/ggml-cpu/amx/amx.cpp @@ -5,6 +5,7 @@ #include "ggml-backend.h" #include "ggml-impl.h" #include "ggml-cpu.h" +#include "ggml-cpu-traits.h" #if defined(__gnu_linux__) #include @@ -17,31 +18,65 @@ #if defined(__AMX_INT8__) && defined(__AVX512VNNI__) +// AMX type_trais +namespace ggml::cpu::amx { +class tensor_traits : public ggml::cpu::tensor_traits { + bool work_size(int /* n_threads */, const struct ggml_tensor * op, size_t & size) override { + size = ggml_backend_amx_desired_wsize(op); + return true; + } + + bool compute_forward(struct ggml_compute_params * params, struct ggml_tensor * op) override { + if (op->op == GGML_OP_MUL_MAT) { + ggml_backend_amx_mul_mat(params, op); + return true; + } + return false; + } +}; + +static ggml::cpu::tensor_traits * get_tensor_traits(ggml_backend_buffer_t, struct ggml_tensor *) { + static tensor_traits traits; + return &traits; +} +} // namespace ggml::cpu::amx + // AMX buffer interface static void ggml_backend_amx_buffer_free_buffer(ggml_backend_buffer_t buffer) { free(buffer->context); } static void * ggml_backend_amx_buffer_get_base(ggml_backend_buffer_t buffer) { - return (void *)(buffer->context); + return (void *) (buffer->context); } -static void ggml_backend_amx_buffer_memset_tensor(ggml_backend_buffer_t buffer, struct ggml_tensor * tensor, uint8_t value, size_t offset, size_t size) { - memset((char *)tensor->data + offset, value, size); +static void ggml_backend_amx_buffer_init_tensor(ggml_backend_buffer_t buffer, struct ggml_tensor * tensor) { + tensor->extra = (void *) ggml::cpu::amx::get_tensor_traits(buffer, tensor); GGML_UNUSED(buffer); } -static void ggml_backend_amx_buffer_set_tensor(ggml_backend_buffer_t buffer, struct ggml_tensor * tensor, const void * data, size_t offset, size_t size) { +static void ggml_backend_amx_buffer_memset_tensor(ggml_backend_buffer_t buffer, struct ggml_tensor * tensor, + uint8_t value, size_t offset, size_t size) { + memset((char *) tensor->data + offset, value, size); + + GGML_UNUSED(buffer); +} + +static void ggml_backend_amx_buffer_set_tensor(ggml_backend_buffer_t buffer, struct ggml_tensor * tensor, + const void * data, size_t offset, size_t size) { if (qtype_has_amx_kernels(tensor->type)) { + GGML_LOG_DEBUG("%s: amx repack tensor %s of type %s\n", __func__, tensor->name, ggml_type_name(tensor->type)); ggml_backend_amx_convert_weight(tensor, data, offset, size); } else { - memcpy((char *)tensor->data + offset, data, size); + memcpy((char *) tensor->data + offset, data, size); } GGML_UNUSED(buffer); } +/* +// need to figure what we need to do with buffer->extra. static void ggml_backend_amx_buffer_get_tensor(ggml_backend_buffer_t buffer, const struct ggml_tensor * tensor, void * data, size_t offset, size_t size) { GGML_ASSERT(!qtype_has_amx_kernels(tensor->type)); memcpy(data, (const char *)tensor->data + offset, size); @@ -62,6 +97,7 @@ static bool ggml_backend_amx_buffer_cpy_tensor(ggml_backend_buffer_t buffer, con GGML_UNUSED(buffer); } +*/ static void ggml_backend_amx_buffer_clear(ggml_backend_buffer_t buffer, uint8_t value) { memset(buffer->context, value, buffer->size); @@ -70,13 +106,13 @@ static void ggml_backend_amx_buffer_clear(ggml_backend_buffer_t buffer, uint8_t static ggml_backend_buffer_i ggml_backend_amx_buffer_interface = { /* .free_buffer = */ ggml_backend_amx_buffer_free_buffer, /* .get_base = */ ggml_backend_amx_buffer_get_base, - /* .init_tensor = */ NULL, // no initialization required + /* .init_tensor = */ ggml_backend_amx_buffer_init_tensor, /* .memset_tensor = */ ggml_backend_amx_buffer_memset_tensor, /* .set_tensor = */ ggml_backend_amx_buffer_set_tensor, - /* .get_tensor = */ ggml_backend_amx_buffer_get_tensor, - /* .cpy_tensor = */ ggml_backend_amx_buffer_cpy_tensor, + /* .get_tensor = */ nullptr, + /* .cpy_tensor = */ nullptr, /* .clear = */ ggml_backend_amx_buffer_clear, - /* .reset = */ NULL, + /* .reset = */ nullptr, }; static const char * ggml_backend_amx_buffer_type_get_name(ggml_backend_buffer_type_t buft) { @@ -101,14 +137,44 @@ static size_t ggml_backend_amx_buffer_type_get_alignment(ggml_backend_buffer_typ GGML_UNUSED(buft); } -static size_t ggml_backend_amx_buffer_type_get_alloc_size(ggml_backend_buffer_type_t buft, const ggml_tensor* tensor) { - return ggml_backend_amx_get_alloc_size(tensor); +namespace ggml::cpu::amx { +class extra_buffer_type : ggml::cpu::extra_buffer_type { + bool supports_op(ggml_backend_dev_t, const struct ggml_tensor * op) override { + // handle only 2d gemm for now + auto is_contiguous_2d = [](const struct ggml_tensor * t) { + return ggml_is_contiguous(t) && t->ne[3] == 1 && t->ne[2] == 1; + }; + + if (op->op == GGML_OP_MUL_MAT && is_contiguous_2d(op->src[0]) && // src0 must be contiguous + is_contiguous_2d(op->src[1]) && // src1 must be contiguous + op->src[0]->buffer && op->src[0]->buffer->buft == ggml_backend_amx_buffer_type() && + op->ne[0] % (TILE_N * 2) == 0 && // out_features is 32x + (qtype_has_amx_kernels(op->src[0]->type) || (op->src[0]->type == GGML_TYPE_F16))) { + // src1 must be host buffer + if (op->src[1]->buffer && !ggml_backend_buft_is_host(op->src[1]->buffer->buft)) { + return false; + } + // src1 must be float32 + if (op->src[1]->type == GGML_TYPE_F32) { + return true; + } + } + return false; + } - GGML_UNUSED(buft); -} + ggml::cpu::tensor_traits * get_tensor_traits(const struct ggml_tensor * op) override { + if (op->op == GGML_OP_MUL_MAT && op->src[0]->buffer && + op->src[0]->buffer->buft == ggml_backend_amx_buffer_type()) { + return (ggml::cpu::tensor_traits *) op->src[0]->extra; + } -static bool ggml_backend_amx_buffer_type_is_host(ggml_backend_buffer_type_t buft) { - return false; + return nullptr; + } +}; +} // namespace ggml::cpu::amx + +static size_t ggml_backend_amx_buffer_type_get_alloc_size(ggml_backend_buffer_type_t buft, const ggml_tensor * tensor) { + return ggml_backend_amx_get_alloc_size(tensor); GGML_UNUSED(buft); } @@ -129,68 +195,26 @@ static bool ggml_amx_init() { return true; #endif } + ggml_backend_buffer_type_t ggml_backend_amx_buffer_type() { static struct ggml_backend_buffer_type ggml_backend_buffer_type_amx = { /* .iface = */ { - /* .get_name = */ ggml_backend_amx_buffer_type_get_name, - /* .alloc_buffer = */ ggml_backend_amx_buffer_type_alloc_buffer, - /* .get_alignment = */ ggml_backend_amx_buffer_type_get_alignment, - /* .get_max_size = */ NULL, // defaults to SIZE_MAX - /* .get_alloc_size = */ ggml_backend_amx_buffer_type_get_alloc_size, - /* .is_host = */ ggml_backend_amx_buffer_type_is_host, - }, + /* .get_name = */ ggml_backend_amx_buffer_type_get_name, + /* .alloc_buffer = */ ggml_backend_amx_buffer_type_alloc_buffer, + /* .get_alignment = */ ggml_backend_amx_buffer_type_get_alignment, + /* .get_max_size = */ nullptr, // defaults to SIZE_MAX + /* .get_alloc_size = */ ggml_backend_amx_buffer_type_get_alloc_size, + /* .is_host = */ nullptr, + }, /* .device = */ ggml_backend_reg_dev_get(ggml_backend_cpu_reg(), 0), - /* .context = */ NULL, + /* .context = */ new ggml::cpu::amx::extra_buffer_type(), }; if (!ggml_amx_init()) { - return NULL; + return nullptr; } return &ggml_backend_buffer_type_amx; } -bool ggml_backend_amx_buft_is_amx(ggml_backend_buffer_type_t buft) { - return buft->iface.get_name == ggml_backend_amx_buffer_type_get_name; -} - -bool ggml_backend_amx_device_supports_op(const struct ggml_tensor * op) { - // handle only 2d gemm for now - auto is_contiguous_2d = [](const struct ggml_tensor * t) { - return ggml_is_contiguous(t) && t->ne[3] == 1 && t->ne[2] == 1; - }; - - switch (op->op) { - case GGML_OP_NONE: - case GGML_OP_RESHAPE: - case GGML_OP_VIEW: - case GGML_OP_PERMUTE: - case GGML_OP_TRANSPOSE: - return true; - - case GGML_OP_MUL_MAT: { - const struct ggml_tensor * src0 = op->src[0]; - const struct ggml_tensor * src1 = op->src[1]; - - const enum ggml_type type = src0->type; - const int64_t ne0 = op->ne[0]; - - // amx kernels enables for Q4_0, Q4_1, Q8_0, F16 - // Q4_K, Q5_K, Q6_K, IQ4_XS enabled for QK_K = 256 - bool has_amx_kernels = qtype_has_amx_kernels(type) || (type == GGML_TYPE_F16); - - bool can_use_amx = - is_contiguous_2d(src0) && // src0 must be contiguous - is_contiguous_2d(src1) && // src1 must be contiguous - src1->type == GGML_TYPE_F32 && // src1 must be float32 - has_amx_kernels && // with amx kernel impls - ne0 % (TILE_N * 2) == 0; // out_features is 32x - - return can_use_amx; - } - default: - return false; - } -} - -#endif // defined(__AMX_INT8__) && defined(__AVX512VNNI__) +#endif // defined(__AMX_INT8__) && defined(__AVX512VNNI__) diff --git a/ggml/src/ggml-cpu/amx/amx.h b/ggml/src/ggml-cpu/amx/amx.h index c4354627361d6..5b65d76bdc89c 100644 --- a/ggml/src/ggml-cpu/amx/amx.h +++ b/ggml/src/ggml-cpu/amx/amx.h @@ -1,20 +1,8 @@ #include "ggml-backend.h" #include "ggml-cpu-impl.h" -#ifdef __cplusplus -extern "C" { -#endif +// GGML internal header #if defined(__AMX_INT8__) && defined(__AVX512VNNI__) - ggml_backend_buffer_type_t ggml_backend_amx_buffer_type(void); -bool ggml_backend_amx_buft_is_amx(ggml_backend_buffer_type_t buft); -bool ggml_backend_amx_device_supports_op(const struct ggml_tensor * op); -void ggml_backend_amx_mul_mat(const struct ggml_compute_params * params, struct ggml_tensor * dst); -size_t ggml_backend_amx_desired_wsize(const struct ggml_tensor * dst); - -#endif - -#ifdef __cplusplus -} #endif diff --git a/ggml/src/ggml-cpu/amx/common.h b/ggml/src/ggml-cpu/amx/common.h index 40074c3fc92fa..f392e898518a7 100644 --- a/ggml/src/ggml-cpu/amx/common.h +++ b/ggml/src/ggml-cpu/amx/common.h @@ -7,7 +7,7 @@ #include #include -#if defined(_OPENMP) +#if defined(GGML_USE_OPENMP) #include #endif @@ -56,11 +56,11 @@ inline void balance211(T n, T nth, T ith, T& n_start, T& n_end) { } template -inline void parallel_for(int nth, int n, const func_t& f) { -#if defined(_OPENMP) -#pragma omp parallel num_threads(nth) +inline void parallel_for(int n, const func_t& f) { +#if defined(GGML_USE_OPENMP) +#pragma omp parallel { - //int nth = omp_get_num_threads(); + int nth = omp_get_num_threads(); int ith = omp_get_thread_num(); int tbegin, tend; balance211(n, nth, ith, tbegin, tend); @@ -68,8 +68,6 @@ inline void parallel_for(int nth, int n, const func_t& f) { } #else f(0, n); - - GGML_UNUSED(nth); #endif } @@ -91,10 +89,3 @@ inline bool qtype_has_amx_kernels(const enum ggml_type type) { (type == GGML_TYPE_Q6_K) || (type == GGML_TYPE_IQ4_XS); } - -// ggml backend context -struct ggml_backend_amx_context { - int n_threads = GGML_DEFAULT_N_THREADS; - std::unique_ptr work_data; - size_t work_size = 0; -}; diff --git a/ggml/src/ggml-cpu/amx/mmq.cpp b/ggml/src/ggml-cpu/amx/mmq.cpp index 0ec3aa86df8a2..0ea91596bc7e2 100644 --- a/ggml/src/ggml-cpu/amx/mmq.cpp +++ b/ggml/src/ggml-cpu/amx/mmq.cpp @@ -18,10 +18,6 @@ #include #endif -#if defined(_OPENMP) -#include -#endif - #if (defined(_WIN32) || defined(_WIN64)) #define RESTRICT __restrict #else @@ -1382,13 +1378,13 @@ struct tinygemm_kernel_avx #define PACKED_INDEX(n, k, KB, tile_size) (n * KB + k) * tile_size template -void convert_B_packed_format(void * RESTRICT packed_B, const TB * RESTRICT B, int N, int K, int n_threads) { +void convert_B_packed_format(void * RESTRICT packed_B, const TB * RESTRICT B, int N, int K) { const int NB = N / TILE_N; const int KB = K / BLOCK_K; const int TILE_SIZE = get_tile_size(); // parallel on NB should be enough - parallel_for(n_threads, NB, [&](int begin, int end) { + parallel_for(NB, [&](int begin, int end) { for (int n = begin; n < end; ++n) { for (int k = 0; k < KB; ++k) { int n0 = n * TILE_N; @@ -2334,15 +2330,8 @@ void ggml_backend_amx_convert_weight(struct ggml_tensor * tensor, const void * d const int K = tensor->ne[0]; // ne0: in_features const int N = tensor->ne[1]; // ne1: out_features -#if defined(_OPENMP) - // the buffer ctx is not initialized when .set_tensor is called - int n_threads = omp_get_num_threads(); -#else - int n_threads = 1; -#endif - GGML_DISPATCH_QTYPES(TYPE, [&] { - convert_B_packed_format((void *)((char *)tensor->data + offset), (const type *)data, N, K, n_threads); + convert_B_packed_format((void *)((char *)tensor->data + offset), (const type *)data, N, K); }); } diff --git a/ggml/src/ggml-cpu/amx/mmq.h b/ggml/src/ggml-cpu/amx/mmq.h index f3736609315bd..baf7684773453 100644 --- a/ggml/src/ggml-cpu/amx/mmq.h +++ b/ggml/src/ggml-cpu/amx/mmq.h @@ -1,16 +1,10 @@ #pragma once #include "common.h" -#ifdef __cplusplus -extern "C" { -#endif +size_t ggml_backend_amx_desired_wsize(const struct ggml_tensor * dst); size_t ggml_backend_amx_get_alloc_size(const struct ggml_tensor * tensor); void ggml_backend_amx_convert_weight(struct ggml_tensor * tensor, const void * data, size_t offset, size_t size); void ggml_backend_amx_mul_mat(const struct ggml_compute_params * params, struct ggml_tensor * dst); - -#ifdef __cplusplus -} -#endif diff --git a/ggml/src/ggml-cpu/ggml-cpu-aarch64.c b/ggml/src/ggml-cpu/ggml-cpu-aarch64.cpp similarity index 85% rename from ggml/src/ggml-cpu/ggml-cpu-aarch64.c rename to ggml/src/ggml-cpu/ggml-cpu-aarch64.cpp index 11152385e677a..9b9e3c92a1d69 100644 --- a/ggml/src/ggml-cpu/ggml-cpu-aarch64.c +++ b/ggml/src/ggml-cpu/ggml-cpu-aarch64.cpp @@ -1,20 +1,57 @@ -#define GGML_COMMON_IMPL_C +#define GGML_COMMON_IMPL_CPP +#define GGML_COMMON_DECL_CPP #include "ggml-common.h" +#include "ggml-backend-impl.h" #include "ggml-quants.h" #include "ggml-impl.h" #include "ggml-cpu.h" -#include "ggml-cpu/ggml-cpu-impl.h" +#include "ggml-cpu-impl.h" +#include "ggml-cpu-traits.h" -#include -#include -#include -#include -#include // for qsort -#include // for GGML_ASSERT +#include +#include +#include +#include +#include // for qsort +#include // for GGML_ASSERT #include "ggml-cpu-aarch64.h" +// TODO: move to include file? +template constexpr int QK_0() { + if constexpr (K == 4) { + return QK4_0; + } + if constexpr (K == 8) { + return QK8_0; + } + return -1; +} + +template struct block { + ggml_half d[N]; // deltas for N qK_0 blocks + int8_t qs[(QK_0() * N * K) / 8]; // quants for N qK_0 blocks +}; + +// control size +static_assert(sizeof(block<4, 4>) == 4 * sizeof(ggml_half) + QK8_0 * 2, "wrong block<4,4> size/padding"); +static_assert(sizeof(block<4, 8>) == 8 * sizeof(ggml_half) + QK8_0 * 4, "wrong block<4,8> size/padding"); +static_assert(sizeof(block<8, 4>) == 4 * sizeof(ggml_half) + QK8_0 * 4, "wrong block<8,4> size/padding"); +static_assert(sizeof(block<8, 8>) == 8 * sizeof(ggml_half) + QK8_0 * 8, "wrong block<8,8> size/padding"); + +using block_q4_0x4 = block<4, 4>; +using block_q4_0x8 = block<4, 8>; +using block_q8_0x4 = block<8, 4>; +using block_q8_0x8 = block<8, 8>; + +struct block_iq4_nlx4 { + ggml_half d[4]; // deltas for 4 iq4_nl blocks + uint8_t qs[QK4_NL * 2]; // nibbles / quants for 4 iq4_nl blocks +}; + +static_assert(sizeof(block_iq4_nlx4) == 4 * sizeof(ggml_half) + QK4_NL * 2, "wrong iq4_nlx4 block size/padding"); + #if defined(__GNUC__) #pragma GCC diagnostic ignored "-Woverlength-strings" #elif defined(_MSC_VER) @@ -185,12 +222,12 @@ static inline __m256i mul_sum_i8_pairs_int32x8(const __m256i x, const __m256i y) static const int8_t kvalues_iq4nl[16] = {-127, -104, -83, -65, -49, -35, -22, -10, 1, 13, 25, 38, 53, 69, 89, 113}; -static void quantize_q8_0_4x4(const float * restrict x, void * restrict vy, int64_t k) { +static void quantize_q8_0_4x4(const float * GGML_RESTRICT x, void * GGML_RESTRICT vy, int64_t k) { assert(QK8_0 == 32); assert(k % QK8_0 == 0); const int nb = k / QK8_0; - block_q8_0x4 * restrict y = (block_q8_0x4 *) vy; + block_q8_0x4 * GGML_RESTRICT y = (block_q8_0x4 *) vy; #if defined(__ARM_NEON) float32x4_t srcv[4][8]; @@ -279,12 +316,12 @@ static void quantize_q8_0_4x4(const float * restrict x, void * restrict vy, int6 #endif } -static void quantize_q8_0_4x8(const float * restrict x, void * restrict vy, int64_t k) { +static void quantize_q8_0_4x8(const float * GGML_RESTRICT x, void * GGML_RESTRICT vy, int64_t k) { assert(QK8_0 == 32); assert(k % QK8_0 == 0); const int nb = k / QK8_0; - block_q8_0x4 * restrict y = (block_q8_0x4 *) vy; + block_q8_0x4 * GGML_RESTRICT y = (block_q8_0x4 *) vy; #if defined(__ARM_NEON) float32x4_t srcv[4][8]; @@ -494,7 +531,7 @@ static void quantize_q8_0_4x8(const float * restrict x, void * restrict vy, int6 #endif } -void quantize_mat_q8_0(const float * restrict x, void * restrict vy, int64_t nrow, int64_t n_per_row, int64_t blck_size_interleave) { +static void quantize_mat_q8_0(const float * GGML_RESTRICT x, void * GGML_RESTRICT vy, int64_t nrow, int64_t n_per_row, int64_t blck_size_interleave) { assert(nrow == 4); UNUSED(nrow); if (blck_size_interleave == 4) { @@ -506,7 +543,7 @@ void quantize_mat_q8_0(const float * restrict x, void * restrict vy, int64_t nro } } -void ggml_gemv_q4_0_4x4_q8_0(int n, float * restrict s, size_t bs, const void * restrict vx, const void * restrict vy, int nr, int nc) { +static void ggml_gemv_q4_0_4x4_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc) { const int qk = QK8_0; const int nb = n / qk; const int ncols_interleaved = 4; @@ -591,7 +628,7 @@ void ggml_gemv_q4_0_4x4_q8_0(int n, float * restrict s, size_t bs, const void * } } -void ggml_gemv_q4_0_4x8_q8_0(int n, float * restrict s, size_t bs, const void * restrict vx, const void * restrict vy, int nr, int nc) { +static void ggml_gemv_q4_0_4x8_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc) { const int qk = QK8_0; const int nb = n / qk; const int ncols_interleaved = 4; @@ -701,7 +738,7 @@ void ggml_gemv_q4_0_4x8_q8_0(int n, float * restrict s, size_t bs, const void * } } -void ggml_gemv_q4_0_8x8_q8_0(int n, float * restrict s, size_t bs, const void * restrict vx, const void * restrict vy, int nr, int nc) { +static void ggml_gemv_q4_0_8x8_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc) { const int qk = QK8_0; const int nb = n / qk; const int ncols_interleaved = 8; @@ -974,7 +1011,7 @@ void ggml_gemv_q4_0_8x8_q8_0(int n, float * restrict s, size_t bs, const void * } } -void ggml_gemv_iq4_nl_4x4_q8_0(int n, float * restrict s, size_t bs, const void * restrict vx, const void * restrict vy, int nr, int nc) { +static void ggml_gemv_iq4_nl_4x4_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc) { const int qk = QK8_0; const int nb = n / qk; const int ncols_interleaved = 4; @@ -1070,7 +1107,7 @@ void ggml_gemv_iq4_nl_4x4_q8_0(int n, float * restrict s, size_t bs, const void } } -void ggml_gemm_q4_0_4x4_q8_0(int n, float * restrict s, size_t bs, const void * restrict vx, const void * restrict vy, int nr, int nc) { +static void ggml_gemm_q4_0_4x4_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc) { const int qk = QK8_0; const int nb = n / qk; const int ncols_interleaved = 4; @@ -1586,7 +1623,7 @@ void ggml_gemm_q4_0_4x4_q8_0(int n, float * restrict s, size_t bs, const void * } } -void ggml_gemm_q4_0_4x8_q8_0(int n, float * restrict s, size_t bs, const void * restrict vx, const void * restrict vy, int nr, int nc) { +static void ggml_gemm_q4_0_4x8_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc) { const int qk = QK8_0; const int nb = n / qk; const int ncols_interleaved = 4; @@ -2040,7 +2077,7 @@ void ggml_gemm_q4_0_4x8_q8_0(int n, float * restrict s, size_t bs, const void * } } -void ggml_gemm_q4_0_8x8_q8_0(int n, float * restrict s, size_t bs, const void * restrict vx, const void * restrict vy, int nr, int nc) { +static void ggml_gemm_q4_0_8x8_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc) { const int qk = QK8_0; const int nb = n / qk; const int ncols_interleaved = 8; @@ -2560,31 +2597,31 @@ void ggml_gemm_q4_0_8x8_q8_0(int n, float * restrict s, size_t bs, const void * const __m512i rhs_mat_2367ABEF_3 = _mm512_shuffle_epi8(signextendlutexpanded, _mm512_and_si512(_mm512_srli_epi16(rhs_raw_mat_2367ABEF_1, 4), m4bexpanded)); //B2(24-31) B3(24-31) B6(24-31) B7(24-31) BA(24-31) BB(24-31) BE(24-31) BF(24-31) // Shuffle pattern one - right side input - const __m512i rhs_mat_014589CD_0_sp1 = _mm512_shuffle_epi32(rhs_mat_014589CD_0, 136); //B0(0-3) B1(0-3) B0(0-3) B1(0-3) B4(0-3) B5(0-3) B4(0-3) B5(0-3) B8(0-3) B9(0-3) B8(0-3) B9(0-3) BC(0-3) BD(0-3) BC(0-3) BD(0-3) - const __m512i rhs_mat_2367ABEF_0_sp1 = _mm512_shuffle_epi32(rhs_mat_2367ABEF_0, 136); //B2(0-3) B3(0-3) B2(0-3) B3(0-3) B6(0-3) B7(0-3) B6(0-3) B7(0-3) BA(0-3) BB(0-3) BA(0-3) BB(0-3) BE(0-3) BF(0-3) BE(0-3) BF(0-3) + const __m512i rhs_mat_014589CD_0_sp1 = _mm512_shuffle_epi32(rhs_mat_014589CD_0, (_MM_PERM_ENUM)136); //B0(0-3) B1(0-3) B0(0-3) B1(0-3) B4(0-3) B5(0-3) B4(0-3) B5(0-3) B8(0-3) B9(0-3) B8(0-3) B9(0-3) BC(0-3) BD(0-3) BC(0-3) BD(0-3) + const __m512i rhs_mat_2367ABEF_0_sp1 = _mm512_shuffle_epi32(rhs_mat_2367ABEF_0, (_MM_PERM_ENUM)136); //B2(0-3) B3(0-3) B2(0-3) B3(0-3) B6(0-3) B7(0-3) B6(0-3) B7(0-3) BA(0-3) BB(0-3) BA(0-3) BB(0-3) BE(0-3) BF(0-3) BE(0-3) BF(0-3) - const __m512i rhs_mat_014589CD_1_sp1 = _mm512_shuffle_epi32(rhs_mat_014589CD_1, 136); //B0(8-11) B1(8-11) B0(8-11) B1(8-11) B4(8-11) B5(8-11) B4(8-11) B5(8-11) B8(8-11) B9(8-11) B8(8-11) B9(8-11) BC(8-11) BD(8-11) BC(8-11) BD(8-11) - const __m512i rhs_mat_2367ABEF_1_sp1 = _mm512_shuffle_epi32(rhs_mat_2367ABEF_1, 136); //B2(8-11) B3(8-11) B2(8-11) B3(8-11) B6(8-11) B7(8-11) B6(8-11) B7(8-11) BA(8-11) BB(8-11) BA(8-11) BB(8-11) BE(8-11) BF(8-11) BE(8-11) BF(8-11) + const __m512i rhs_mat_014589CD_1_sp1 = _mm512_shuffle_epi32(rhs_mat_014589CD_1, (_MM_PERM_ENUM)136); //B0(8-11) B1(8-11) B0(8-11) B1(8-11) B4(8-11) B5(8-11) B4(8-11) B5(8-11) B8(8-11) B9(8-11) B8(8-11) B9(8-11) BC(8-11) BD(8-11) BC(8-11) BD(8-11) + const __m512i rhs_mat_2367ABEF_1_sp1 = _mm512_shuffle_epi32(rhs_mat_2367ABEF_1, (_MM_PERM_ENUM)136); //B2(8-11) B3(8-11) B2(8-11) B3(8-11) B6(8-11) B7(8-11) B6(8-11) B7(8-11) BA(8-11) BB(8-11) BA(8-11) BB(8-11) BE(8-11) BF(8-11) BE(8-11) BF(8-11) - const __m512i rhs_mat_014589CD_2_sp1 = _mm512_shuffle_epi32(rhs_mat_014589CD_2, 136); //B0(16-19) B1(16-19) B0(16-19) B1(16-19) B4(16-19) B5(16-19) B4(16-19) B5(16-19) B8(16-19) B9(16-19) B8(16-19) B9(16-19) BC(16-19) BD(16-19) BC(16-19) BD(16-19) - const __m512i rhs_mat_2367ABEF_2_sp1 = _mm512_shuffle_epi32(rhs_mat_2367ABEF_2, 136); //B2(16-19) B3(16-19) B2(16-19) B3(16-19) B6(16-19) B7(16-19) B6(16-19) B7(16-19) BA(16-19) BB(16-19) BA(16-19) BB(16-19) BE(16-19) BF(16-19) BE(16-19) BF(16-19) + const __m512i rhs_mat_014589CD_2_sp1 = _mm512_shuffle_epi32(rhs_mat_014589CD_2, (_MM_PERM_ENUM)136); //B0(16-19) B1(16-19) B0(16-19) B1(16-19) B4(16-19) B5(16-19) B4(16-19) B5(16-19) B8(16-19) B9(16-19) B8(16-19) B9(16-19) BC(16-19) BD(16-19) BC(16-19) BD(16-19) + const __m512i rhs_mat_2367ABEF_2_sp1 = _mm512_shuffle_epi32(rhs_mat_2367ABEF_2, (_MM_PERM_ENUM)136); //B2(16-19) B3(16-19) B2(16-19) B3(16-19) B6(16-19) B7(16-19) B6(16-19) B7(16-19) BA(16-19) BB(16-19) BA(16-19) BB(16-19) BE(16-19) BF(16-19) BE(16-19) BF(16-19) - const __m512i rhs_mat_014589CD_3_sp1 = _mm512_shuffle_epi32(rhs_mat_014589CD_3, 136); //B0(24-27) B1(24-27) B0(24-27) B1(24-27) B4(24-27) B5(24-27) B4(24-27) B5(24-27) B8(24-27) B9(24-27) B8(24-27) B9(24-27) BC(24-27) BD(24-27) BC(24-27) BD(24-27) - const __m512i rhs_mat_2367ABEF_3_sp1 = _mm512_shuffle_epi32(rhs_mat_2367ABEF_3, 136); //B2(24-27) B3(24-27) B2(24-27) B3(24-27) B6(24-27) B7(24-27) B6(24-27) B7(24-27) BA(24-27) BB(24-27) BA(24-27) BB(24-27) BE(24-27) BF(24-27) BE(24-27) BF(24-27) + const __m512i rhs_mat_014589CD_3_sp1 = _mm512_shuffle_epi32(rhs_mat_014589CD_3, (_MM_PERM_ENUM)136); //B0(24-27) B1(24-27) B0(24-27) B1(24-27) B4(24-27) B5(24-27) B4(24-27) B5(24-27) B8(24-27) B9(24-27) B8(24-27) B9(24-27) BC(24-27) BD(24-27) BC(24-27) BD(24-27) + const __m512i rhs_mat_2367ABEF_3_sp1 = _mm512_shuffle_epi32(rhs_mat_2367ABEF_3, (_MM_PERM_ENUM)136); //B2(24-27) B3(24-27) B2(24-27) B3(24-27) B6(24-27) B7(24-27) B6(24-27) B7(24-27) BA(24-27) BB(24-27) BA(24-27) BB(24-27) BE(24-27) BF(24-27) BE(24-27) BF(24-27) // Shuffle pattern two - right side input - const __m512i rhs_mat_014589CD_0_sp2 = _mm512_shuffle_epi32(rhs_mat_014589CD_0, 221); //B0(4-7) B1(4-7) B0(4-7) B1(4-7) B4(4-7) B5(4-7) B4(4-7) B5(4-7) B8(4-7) B9(4-7) B8(4-7) B9(4-7) BC(4-7) BD(4-7) BC(4-7) BD(4-7) - const __m512i rhs_mat_2367ABEF_0_sp2 = _mm512_shuffle_epi32(rhs_mat_2367ABEF_0, 221); //B2(4-7) B3(4-7) B2(4-7) B3(4-7) B6(4-7) B7(4-7) B6(4-7) B7(4-7) BA(4-7) BB(4-7) BA(4-7) BB(4-7) BE(4-7) BF(4-7) BE(4-7) BF(4-7) + const __m512i rhs_mat_014589CD_0_sp2 = _mm512_shuffle_epi32(rhs_mat_014589CD_0, (_MM_PERM_ENUM)221); //B0(4-7) B1(4-7) B0(4-7) B1(4-7) B4(4-7) B5(4-7) B4(4-7) B5(4-7) B8(4-7) B9(4-7) B8(4-7) B9(4-7) BC(4-7) BD(4-7) BC(4-7) BD(4-7) + const __m512i rhs_mat_2367ABEF_0_sp2 = _mm512_shuffle_epi32(rhs_mat_2367ABEF_0, (_MM_PERM_ENUM)221); //B2(4-7) B3(4-7) B2(4-7) B3(4-7) B6(4-7) B7(4-7) B6(4-7) B7(4-7) BA(4-7) BB(4-7) BA(4-7) BB(4-7) BE(4-7) BF(4-7) BE(4-7) BF(4-7) - const __m512i rhs_mat_014589CD_1_sp2 = _mm512_shuffle_epi32(rhs_mat_014589CD_1, 221); //B0(12-15) B1(12-15) B0(12-15) B1(12-15) B4(12-15) B5(12-15) B4(12-15) B5(12-15) B8(12-15) B9(12-15) B8(12-15) B9(12-15) BC(12-15) BD(12-15) BC(12-15) BD(12-15) - const __m512i rhs_mat_2367ABEF_1_sp2 = _mm512_shuffle_epi32(rhs_mat_2367ABEF_1, 221); //B2(12-15) B3(12-15) B2(12-15) B3(12-15) B6(12-15) B7(12-15) B6(12-15) B7(12-15) BA(12-15) BB(12-15) BA(12-15) BB(12-15) BE(12-15) BF(12-15) BE(12-15) BF(12-15) + const __m512i rhs_mat_014589CD_1_sp2 = _mm512_shuffle_epi32(rhs_mat_014589CD_1, (_MM_PERM_ENUM)221); //B0(12-15) B1(12-15) B0(12-15) B1(12-15) B4(12-15) B5(12-15) B4(12-15) B5(12-15) B8(12-15) B9(12-15) B8(12-15) B9(12-15) BC(12-15) BD(12-15) BC(12-15) BD(12-15) + const __m512i rhs_mat_2367ABEF_1_sp2 = _mm512_shuffle_epi32(rhs_mat_2367ABEF_1, (_MM_PERM_ENUM)221); //B2(12-15) B3(12-15) B2(12-15) B3(12-15) B6(12-15) B7(12-15) B6(12-15) B7(12-15) BA(12-15) BB(12-15) BA(12-15) BB(12-15) BE(12-15) BF(12-15) BE(12-15) BF(12-15) - const __m512i rhs_mat_014589CD_2_sp2 = _mm512_shuffle_epi32(rhs_mat_014589CD_2, 221); //B0(20-23) B1(20-23) B0(20-23) B1(20-23) B4(20-23) B5(20-23) B4(20-23) B5(20-23) B8(20-23) B9(20-23) B8(20-23) B9(20-23) BC(20-23) BD(20-23) BC(20-23) BD(20-23) - const __m512i rhs_mat_2367ABEF_2_sp2 = _mm512_shuffle_epi32(rhs_mat_2367ABEF_2, 221); //B2(20-23) B3(20-23) B2(20-23) B3(20-23) B6(20-23) B7(20-23) B6(20-23) B7(20-23) BA(20-23) BB(20-23) BA(20-23) BB(20-23) BE(20-23) BF(20-23) BE(20-23) BF(20-23) + const __m512i rhs_mat_014589CD_2_sp2 = _mm512_shuffle_epi32(rhs_mat_014589CD_2, (_MM_PERM_ENUM)221); //B0(20-23) B1(20-23) B0(20-23) B1(20-23) B4(20-23) B5(20-23) B4(20-23) B5(20-23) B8(20-23) B9(20-23) B8(20-23) B9(20-23) BC(20-23) BD(20-23) BC(20-23) BD(20-23) + const __m512i rhs_mat_2367ABEF_2_sp2 = _mm512_shuffle_epi32(rhs_mat_2367ABEF_2, (_MM_PERM_ENUM)221); //B2(20-23) B3(20-23) B2(20-23) B3(20-23) B6(20-23) B7(20-23) B6(20-23) B7(20-23) BA(20-23) BB(20-23) BA(20-23) BB(20-23) BE(20-23) BF(20-23) BE(20-23) BF(20-23) - const __m512i rhs_mat_014589CD_3_sp2 = _mm512_shuffle_epi32(rhs_mat_014589CD_3, 221); //B0(28-31) B1(28-31) B0(28-31) B1(28-31) B4(28-31) B5(28-31) B4(28-31) B5(28-31) B8(28-31) B9(28-31) B8(28-31) B9(28-31) BC(28-31) BD(28-31) BC(28-31) BD(28-31) - const __m512i rhs_mat_2367ABEF_3_sp2 = _mm512_shuffle_epi32(rhs_mat_2367ABEF_3, 221); //B2(28-31) B3(28-31) B2(28-31) B3(28-31) B6(28-31) B7(28-31) B6(28-31) B7(28-31) BA(28-31) BB(28-31) BA(28-31) BB(28-31) BE(28-31) BF(28-31) BE(28-31) BF(28-31) + const __m512i rhs_mat_014589CD_3_sp2 = _mm512_shuffle_epi32(rhs_mat_014589CD_3, (_MM_PERM_ENUM)221); //B0(28-31) B1(28-31) B0(28-31) B1(28-31) B4(28-31) B5(28-31) B4(28-31) B5(28-31) B8(28-31) B9(28-31) B8(28-31) B9(28-31) BC(28-31) BD(28-31) BC(28-31) BD(28-31) + const __m512i rhs_mat_2367ABEF_3_sp2 = _mm512_shuffle_epi32(rhs_mat_2367ABEF_3, (_MM_PERM_ENUM)221); //B2(28-31) B3(28-31) B2(28-31) B3(28-31) B6(28-31) B7(28-31) B6(28-31) B7(28-31) BA(28-31) BB(28-31) BA(28-31) BB(28-31) BE(28-31) BF(28-31) BE(28-31) BF(28-31) // Scale values - Load the weight scale values of two block_q4_0x8 const __m512 col_scale_f32 = GGML_F32Cx8x2_LOAD(b_ptr_0[b].d, b_ptr_1[b].d); @@ -2618,31 +2655,31 @@ void ggml_gemm_q4_0_8x8_q8_0(int n, float * restrict s, size_t bs, const void * // Shuffle pattern one - left side input - const __m512i lhs_mat_01_0_sp1 = _mm512_shuffle_epi32(lhs_mat_01_0, 160); //A0(0-3) A0(0-3) A1(0-3) A1(0-3) A0(0-3) A0(0-3) A1(0-3) A1(0-3) A0(0-3) A0(0-3) A1(0-3) A1(0-3) A0(0-3) A0(0-3) A1(0-3) A1(0-3) - const __m512i lhs_mat_23_0_sp1 = _mm512_shuffle_epi32(lhs_mat_23_0, 160); //A2(0-3) A2(0-3) A3(0-3) A3(0-3) A2(0-3) A2(0-3) A3(0-3) A3(0-3) A2(0-3) A2(0-3) A3(0-3) A3(0-3) A2(0-3) A2(0-3) A3(0-3) A3(0-3) + const __m512i lhs_mat_01_0_sp1 = _mm512_shuffle_epi32(lhs_mat_01_0, (_MM_PERM_ENUM)160); //A0(0-3) A0(0-3) A1(0-3) A1(0-3) A0(0-3) A0(0-3) A1(0-3) A1(0-3) A0(0-3) A0(0-3) A1(0-3) A1(0-3) A0(0-3) A0(0-3) A1(0-3) A1(0-3) + const __m512i lhs_mat_23_0_sp1 = _mm512_shuffle_epi32(lhs_mat_23_0, (_MM_PERM_ENUM)160); //A2(0-3) A2(0-3) A3(0-3) A3(0-3) A2(0-3) A2(0-3) A3(0-3) A3(0-3) A2(0-3) A2(0-3) A3(0-3) A3(0-3) A2(0-3) A2(0-3) A3(0-3) A3(0-3) - const __m512i lhs_mat_01_1_sp1 = _mm512_shuffle_epi32(lhs_mat_01_1, 160); //A0(8-11) A0(8-11) A1(8-11) A1(8-11) A0(8-11) A0(8-11) A1(8-11) A1(8-11) A0(8-11) A0(8-11) A1(8-11) A1(8-11) A0(8-11) A0(8-11) A1(8-11) A1(8-11) - const __m512i lhs_mat_23_1_sp1 = _mm512_shuffle_epi32(lhs_mat_23_1, 160); //A2(8-11) A2(8-11) A3(8-11) A3(8-11) A2(8-11) A2(8-11) A3(8-11) A3(8-11) A2(8-11) A2(8-11) A3(8-11) A3(8-11) A2(8-11) A2(8-11) A3(8-11) A3(8-11) + const __m512i lhs_mat_01_1_sp1 = _mm512_shuffle_epi32(lhs_mat_01_1, (_MM_PERM_ENUM)160); //A0(8-11) A0(8-11) A1(8-11) A1(8-11) A0(8-11) A0(8-11) A1(8-11) A1(8-11) A0(8-11) A0(8-11) A1(8-11) A1(8-11) A0(8-11) A0(8-11) A1(8-11) A1(8-11) + const __m512i lhs_mat_23_1_sp1 = _mm512_shuffle_epi32(lhs_mat_23_1, (_MM_PERM_ENUM)160); //A2(8-11) A2(8-11) A3(8-11) A3(8-11) A2(8-11) A2(8-11) A3(8-11) A3(8-11) A2(8-11) A2(8-11) A3(8-11) A3(8-11) A2(8-11) A2(8-11) A3(8-11) A3(8-11) - const __m512i lhs_mat_01_2_sp1 = _mm512_shuffle_epi32(lhs_mat_01_2, 160); //A0(16-19) A0(16-19) A1(16-19) A1(16-19) A0(16-19) A0(16-19) A1(16-19) A1(16-19) A0(16-19) A0(16-19) A1(16-19) A1(16-19) A0(16-19) A0(16-19) A1(16-19) A1(16-19) - const __m512i lhs_mat_23_2_sp1 = _mm512_shuffle_epi32(lhs_mat_23_2, 160); //A2(16-19) A2(16-19) A3(16-19) A3(16-19) A2(16-19) A2(16-19) A3(16-19) A3(16-19) A2(16-19) A2(16-19) A3(16-19) A3(16-19) A2(16-19) A2(16-19) A3(16-19) A3(16-19) + const __m512i lhs_mat_01_2_sp1 = _mm512_shuffle_epi32(lhs_mat_01_2, (_MM_PERM_ENUM)160); //A0(16-19) A0(16-19) A1(16-19) A1(16-19) A0(16-19) A0(16-19) A1(16-19) A1(16-19) A0(16-19) A0(16-19) A1(16-19) A1(16-19) A0(16-19) A0(16-19) A1(16-19) A1(16-19) + const __m512i lhs_mat_23_2_sp1 = _mm512_shuffle_epi32(lhs_mat_23_2, (_MM_PERM_ENUM)160); //A2(16-19) A2(16-19) A3(16-19) A3(16-19) A2(16-19) A2(16-19) A3(16-19) A3(16-19) A2(16-19) A2(16-19) A3(16-19) A3(16-19) A2(16-19) A2(16-19) A3(16-19) A3(16-19) - const __m512i lhs_mat_01_3_sp1 = _mm512_shuffle_epi32(lhs_mat_01_3, 160); //A0(24-27) A0(24-27) A1(24-27) A1(24-27) A0(24-27) A0(24-27) A1(24-27) A1(24-27) A0(24-27) A0(24-27) A1(24-27) A1(24-27) A0(24-27) A0(24-27) A1(24-27) A1(24-27) - const __m512i lhs_mat_23_3_sp1 = _mm512_shuffle_epi32(lhs_mat_23_3, 160); //A2(24-27) A2(24-27) A3(24-27) A3(24-27) A2(24-27) A2(24-27) A3(24-27) A3(24-27) A2(24-27) A2(24-27) A3(24-27) A3(24-27) A2(24-27) A2(24-27) A3(24-27) A3(24-27) + const __m512i lhs_mat_01_3_sp1 = _mm512_shuffle_epi32(lhs_mat_01_3, (_MM_PERM_ENUM)160); //A0(24-27) A0(24-27) A1(24-27) A1(24-27) A0(24-27) A0(24-27) A1(24-27) A1(24-27) A0(24-27) A0(24-27) A1(24-27) A1(24-27) A0(24-27) A0(24-27) A1(24-27) A1(24-27) + const __m512i lhs_mat_23_3_sp1 = _mm512_shuffle_epi32(lhs_mat_23_3, (_MM_PERM_ENUM)160); //A2(24-27) A2(24-27) A3(24-27) A3(24-27) A2(24-27) A2(24-27) A3(24-27) A3(24-27) A2(24-27) A2(24-27) A3(24-27) A3(24-27) A2(24-27) A2(24-27) A3(24-27) A3(24-27) // Shuffle pattern two - left side input - const __m512i lhs_mat_01_0_sp2 = _mm512_shuffle_epi32(lhs_mat_01_0, 245); //A0(4-7) A0(4-7) A1(4-7) A1(4-7) A0(4-7) A0(4-7) A1(4-7) A1(4-7) A0(4-7) A0(4-7) A1(4-7) A1(4-7) A0(4-7) A0(4-7) A1(4-7) A1(4-7) - const __m512i lhs_mat_23_0_sp2 = _mm512_shuffle_epi32(lhs_mat_23_0, 245); //A2(4-7) A2(4-7) A3(4-7) A3(4-7) A2(4-7) A2(4-7) A3(4-7) A3(4-7) A2(4-7) A2(4-7) A3(4-7) A3(4-7) A2(4-7) A2(4-7) A3(4-7) A3(4-7) + const __m512i lhs_mat_01_0_sp2 = _mm512_shuffle_epi32(lhs_mat_01_0, (_MM_PERM_ENUM)245); //A0(4-7) A0(4-7) A1(4-7) A1(4-7) A0(4-7) A0(4-7) A1(4-7) A1(4-7) A0(4-7) A0(4-7) A1(4-7) A1(4-7) A0(4-7) A0(4-7) A1(4-7) A1(4-7) + const __m512i lhs_mat_23_0_sp2 = _mm512_shuffle_epi32(lhs_mat_23_0, (_MM_PERM_ENUM)245); //A2(4-7) A2(4-7) A3(4-7) A3(4-7) A2(4-7) A2(4-7) A3(4-7) A3(4-7) A2(4-7) A2(4-7) A3(4-7) A3(4-7) A2(4-7) A2(4-7) A3(4-7) A3(4-7) - const __m512i lhs_mat_01_1_sp2 = _mm512_shuffle_epi32(lhs_mat_01_1, 245); //A0(12-15) A0(12-15) A1(12-15) A1(12-15) A0(12-15) A0(12-15) A1(12-15) A1(12-15) A0(12-15) A0(12-15) A1(12-15) A1(12-15) A0(12-15) A0(12-15) A1(12-15) A1(12-15) - const __m512i lhs_mat_23_1_sp2 = _mm512_shuffle_epi32(lhs_mat_23_1, 245); //A2(12-15) A2(12-15) A3(12-15) A3(12-15) A2(12-15) A2(12-15) A3(12-15) A3(12-15) A2(12-15) A2(12-15) A3(12-15) A3(12-15) A2(12-15) A2(12-15) A3(12-15) A3(12-15) + const __m512i lhs_mat_01_1_sp2 = _mm512_shuffle_epi32(lhs_mat_01_1, (_MM_PERM_ENUM)245); //A0(12-15) A0(12-15) A1(12-15) A1(12-15) A0(12-15) A0(12-15) A1(12-15) A1(12-15) A0(12-15) A0(12-15) A1(12-15) A1(12-15) A0(12-15) A0(12-15) A1(12-15) A1(12-15) + const __m512i lhs_mat_23_1_sp2 = _mm512_shuffle_epi32(lhs_mat_23_1, (_MM_PERM_ENUM)245); //A2(12-15) A2(12-15) A3(12-15) A3(12-15) A2(12-15) A2(12-15) A3(12-15) A3(12-15) A2(12-15) A2(12-15) A3(12-15) A3(12-15) A2(12-15) A2(12-15) A3(12-15) A3(12-15) - const __m512i lhs_mat_01_2_sp2 = _mm512_shuffle_epi32(lhs_mat_01_2, 245); //A0(20-23) A0(20-23) A1(20-23) A1(20-23) A0(20-23) A0(20-23) A1(20-23) A1(20-23) A0(20-23) A0(20-23) A1(20-23) A1(20-23) A0(20-23) A0(20-23) A1(20-23) A1(20-23) - const __m512i lhs_mat_23_2_sp2 = _mm512_shuffle_epi32(lhs_mat_23_2, 245); //A2(20-23) A2(20-23) A3(20-23) A3(20-23) A2(20-23) A2(20-23) A3(20-23) A3(20-23) A2(20-23) A2(20-23) A3(20-23) A3(20-23) A2(20-23) A2(20-23) A3(20-23) A3(20-23) + const __m512i lhs_mat_01_2_sp2 = _mm512_shuffle_epi32(lhs_mat_01_2, (_MM_PERM_ENUM)245); //A0(20-23) A0(20-23) A1(20-23) A1(20-23) A0(20-23) A0(20-23) A1(20-23) A1(20-23) A0(20-23) A0(20-23) A1(20-23) A1(20-23) A0(20-23) A0(20-23) A1(20-23) A1(20-23) + const __m512i lhs_mat_23_2_sp2 = _mm512_shuffle_epi32(lhs_mat_23_2, (_MM_PERM_ENUM)245); //A2(20-23) A2(20-23) A3(20-23) A3(20-23) A2(20-23) A2(20-23) A3(20-23) A3(20-23) A2(20-23) A2(20-23) A3(20-23) A3(20-23) A2(20-23) A2(20-23) A3(20-23) A3(20-23) - const __m512i lhs_mat_01_3_sp2 = _mm512_shuffle_epi32(lhs_mat_01_3, 245); //A0(28-31) A0(28-31) A1(28-31) A1(28-31) A0(28-31) A0(28-31) A1(28-31) A1(28-31) A0(28-31) A0(28-31) A1(28-31) A1(28-31) A0(28-31) A0(28-31) A1(28-31) A1(28-31) - const __m512i lhs_mat_23_3_sp2 = _mm512_shuffle_epi32(lhs_mat_23_3, 245); //A2(28-31) A2(28-31) A3(28-31) A3(28-31) A2(28-31) A2(28-31) A3(28-31) A3(28-31) A2(28-31) A2(28-31) A3(28-31) A3(28-31) A2(28-31) A2(28-31) A3(28-31) A3(28-31) + const __m512i lhs_mat_01_3_sp2 = _mm512_shuffle_epi32(lhs_mat_01_3, (_MM_PERM_ENUM)245); //A0(28-31) A0(28-31) A1(28-31) A1(28-31) A0(28-31) A0(28-31) A1(28-31) A1(28-31) A0(28-31) A0(28-31) A1(28-31) A1(28-31) A0(28-31) A0(28-31) A1(28-31) A1(28-31) + const __m512i lhs_mat_23_3_sp2 = _mm512_shuffle_epi32(lhs_mat_23_3, (_MM_PERM_ENUM)245); //A2(28-31) A2(28-31) A3(28-31) A3(28-31) A2(28-31) A2(28-31) A3(28-31) A3(28-31) A2(28-31) A2(28-31) A3(28-31) A3(28-31) A2(28-31) A2(28-31) A3(28-31) A3(28-31) // The values arranged in shuffle patterns are operated with dot product operation within 32 bit lane i.e corresponding bytes and multiplied and added into 32 bit integers within 32 bit lane // Resembles MMLAs into 2x2 matrices in ARM Version @@ -2671,10 +2708,10 @@ void ggml_gemm_q4_0_8x8_q8_0(int n, float * restrict s, size_t bs, const void * // Straighten out to make 4 row vectors - __m512i iacc_row_0 = _mm512_mask_blend_epi32(0xCCCC, iacc_mat_00, _mm512_shuffle_epi32(iacc_mat_01, 78)); - __m512i iacc_row_1 = _mm512_mask_blend_epi32(0xCCCC, _mm512_shuffle_epi32(iacc_mat_00, 78), iacc_mat_01); - __m512i iacc_row_2 = _mm512_mask_blend_epi32(0xCCCC, iacc_mat_10, _mm512_shuffle_epi32(iacc_mat_11, 78)); - __m512i iacc_row_3 = _mm512_mask_blend_epi32(0xCCCC, _mm512_shuffle_epi32(iacc_mat_10, 78), iacc_mat_11); + __m512i iacc_row_0 = _mm512_mask_blend_epi32(0xCCCC, iacc_mat_00, _mm512_shuffle_epi32(iacc_mat_01, (_MM_PERM_ENUM)78)); + __m512i iacc_row_1 = _mm512_mask_blend_epi32(0xCCCC, _mm512_shuffle_epi32(iacc_mat_00, (_MM_PERM_ENUM)78), iacc_mat_01); + __m512i iacc_row_2 = _mm512_mask_blend_epi32(0xCCCC, iacc_mat_10, _mm512_shuffle_epi32(iacc_mat_11, (_MM_PERM_ENUM)78)); + __m512i iacc_row_3 = _mm512_mask_blend_epi32(0xCCCC, _mm512_shuffle_epi32(iacc_mat_10, (_MM_PERM_ENUM)78), iacc_mat_11); // Load the scale(d) values for all the 4 Q8_0 blocks and repeat it across lanes const __m128i row_scale_f16 = _mm_shuffle_epi32(_mm_maskload_epi32((int const*)(a_ptrs[rp][b].d), loadMask), 68); @@ -2753,31 +2790,31 @@ void ggml_gemm_q4_0_8x8_q8_0(int n, float * restrict s, size_t bs, const void * const __m512i rhs_mat_2367ABEF_3 = _mm512_shuffle_epi8(signextendlutexpanded, _mm512_and_si512(_mm512_srli_epi16(rhs_raw_mat_2367ABEF_1, 4), m4bexpanded)); //B2(24-31) B3(24-31) B6(24-31) B7(24-31) BA(24-31) BB(24-31) BE(24-31) BF(24-31) // Shuffle pattern one - right side input - const __m512i rhs_mat_014589CD_0_sp1 = _mm512_shuffle_epi32(rhs_mat_014589CD_0, 136); //B0(0-3) B1(0-3) B0(0-3) B1(0-3) B4(0-3) B5(0-3) B4(0-3) B5(0-3) B8(0-3) B9(0-3) B8(0-3) B9(0-3) BC(0-3) BD(0-3) BC(0-3) BD(0-3) - const __m512i rhs_mat_2367ABEF_0_sp1 = _mm512_shuffle_epi32(rhs_mat_2367ABEF_0, 136); //B2(0-3) B3(0-3) B2(0-3) B3(0-3) B6(0-3) B7(0-3) B6(0-3) B7(0-3) BA(0-3) BB(0-3) BA(0-3) BB(0-3) BE(0-3) BF(0-3) BE(0-3) BF(0-3) + const __m512i rhs_mat_014589CD_0_sp1 = _mm512_shuffle_epi32(rhs_mat_014589CD_0, (_MM_PERM_ENUM)136); //B0(0-3) B1(0-3) B0(0-3) B1(0-3) B4(0-3) B5(0-3) B4(0-3) B5(0-3) B8(0-3) B9(0-3) B8(0-3) B9(0-3) BC(0-3) BD(0-3) BC(0-3) BD(0-3) + const __m512i rhs_mat_2367ABEF_0_sp1 = _mm512_shuffle_epi32(rhs_mat_2367ABEF_0, (_MM_PERM_ENUM)136); //B2(0-3) B3(0-3) B2(0-3) B3(0-3) B6(0-3) B7(0-3) B6(0-3) B7(0-3) BA(0-3) BB(0-3) BA(0-3) BB(0-3) BE(0-3) BF(0-3) BE(0-3) BF(0-3) - const __m512i rhs_mat_014589CD_1_sp1 = _mm512_shuffle_epi32(rhs_mat_014589CD_1, 136); //B0(8-11) B1(8-11) B0(8-11) B1(8-11) B4(8-11) B5(8-11) B4(8-11) B5(8-11) B8(8-11) B9(8-11) B8(8-11) B9(8-11) BC(8-11) BD(8-11) BC(8-11) BD(8-11) - const __m512i rhs_mat_2367ABEF_1_sp1 = _mm512_shuffle_epi32(rhs_mat_2367ABEF_1, 136); //B2(8-11) B3(8-11) B2(8-11) B3(8-11) B6(8-11) B7(8-11) B6(8-11) B7(8-11) BA(8-11) BB(8-11) BA(8-11) BB(8-11) BE(8-11) BF(8-11) BE(8-11) BF(8-11) + const __m512i rhs_mat_014589CD_1_sp1 = _mm512_shuffle_epi32(rhs_mat_014589CD_1, (_MM_PERM_ENUM)136); //B0(8-11) B1(8-11) B0(8-11) B1(8-11) B4(8-11) B5(8-11) B4(8-11) B5(8-11) B8(8-11) B9(8-11) B8(8-11) B9(8-11) BC(8-11) BD(8-11) BC(8-11) BD(8-11) + const __m512i rhs_mat_2367ABEF_1_sp1 = _mm512_shuffle_epi32(rhs_mat_2367ABEF_1, (_MM_PERM_ENUM)136); //B2(8-11) B3(8-11) B2(8-11) B3(8-11) B6(8-11) B7(8-11) B6(8-11) B7(8-11) BA(8-11) BB(8-11) BA(8-11) BB(8-11) BE(8-11) BF(8-11) BE(8-11) BF(8-11) - const __m512i rhs_mat_014589CD_2_sp1 = _mm512_shuffle_epi32(rhs_mat_014589CD_2, 136); //B0(16-19) B1(16-19) B0(16-19) B1(16-19) B4(16-19) B5(16-19) B4(16-19) B5(16-19) B8(16-19) B9(16-19) B8(16-19) B9(16-19) BC(16-19) BD(16-19) BC(16-19) BD(16-19) - const __m512i rhs_mat_2367ABEF_2_sp1 = _mm512_shuffle_epi32(rhs_mat_2367ABEF_2, 136); //B2(16-19) B3(16-19) B2(16-19) B3(16-19) B6(16-19) B7(16-19) B6(16-19) B7(16-19) BA(16-19) BB(16-19) BA(16-19) BB(16-19) BE(16-19) BF(16-19) BE(16-19) BF(16-19) + const __m512i rhs_mat_014589CD_2_sp1 = _mm512_shuffle_epi32(rhs_mat_014589CD_2, (_MM_PERM_ENUM)136); //B0(16-19) B1(16-19) B0(16-19) B1(16-19) B4(16-19) B5(16-19) B4(16-19) B5(16-19) B8(16-19) B9(16-19) B8(16-19) B9(16-19) BC(16-19) BD(16-19) BC(16-19) BD(16-19) + const __m512i rhs_mat_2367ABEF_2_sp1 = _mm512_shuffle_epi32(rhs_mat_2367ABEF_2, (_MM_PERM_ENUM)136); //B2(16-19) B3(16-19) B2(16-19) B3(16-19) B6(16-19) B7(16-19) B6(16-19) B7(16-19) BA(16-19) BB(16-19) BA(16-19) BB(16-19) BE(16-19) BF(16-19) BE(16-19) BF(16-19) - const __m512i rhs_mat_014589CD_3_sp1 = _mm512_shuffle_epi32(rhs_mat_014589CD_3, 136); //B0(24-27) B1(24-27) B0(24-27) B1(24-27) B4(24-27) B5(24-27) B4(24-27) B5(24-27) B8(24-27) B9(24-27) B8(24-27) B9(24-27) BC(24-27) BD(24-27) BC(24-27) BD(24-27) - const __m512i rhs_mat_2367ABEF_3_sp1 = _mm512_shuffle_epi32(rhs_mat_2367ABEF_3, 136); //B2(24-27) B3(24-27) B2(24-27) B3(24-27) B6(24-27) B7(24-27) B6(24-27) B7(24-27) BA(24-27) BB(24-27) BA(24-27) BB(24-27) BE(24-27) BF(24-27) BE(24-27) BF(24-27) + const __m512i rhs_mat_014589CD_3_sp1 = _mm512_shuffle_epi32(rhs_mat_014589CD_3, (_MM_PERM_ENUM)136); //B0(24-27) B1(24-27) B0(24-27) B1(24-27) B4(24-27) B5(24-27) B4(24-27) B5(24-27) B8(24-27) B9(24-27) B8(24-27) B9(24-27) BC(24-27) BD(24-27) BC(24-27) BD(24-27) + const __m512i rhs_mat_2367ABEF_3_sp1 = _mm512_shuffle_epi32(rhs_mat_2367ABEF_3, (_MM_PERM_ENUM)136); //B2(24-27) B3(24-27) B2(24-27) B3(24-27) B6(24-27) B7(24-27) B6(24-27) B7(24-27) BA(24-27) BB(24-27) BA(24-27) BB(24-27) BE(24-27) BF(24-27) BE(24-27) BF(24-27) // Shuffle pattern two - right side input - const __m512i rhs_mat_014589CD_0_sp2 = _mm512_shuffle_epi32(rhs_mat_014589CD_0, 221); //B0(4-7) B1(4-7) B0(4-7) B1(4-7) B4(4-7) B5(4-7) B4(4-7) B5(4-7) B8(4-7) B9(4-7) B8(4-7) B9(4-7) BC(4-7) BD(4-7) BC(4-7) BD(4-7) - const __m512i rhs_mat_2367ABEF_0_sp2 = _mm512_shuffle_epi32(rhs_mat_2367ABEF_0, 221); //B2(4-7) B3(4-7) B2(4-7) B3(4-7) B6(4-7) B7(4-7) B6(4-7) B7(4-7) BA(4-7) BB(4-7) BA(4-7) BB(4-7) BE(4-7) BF(4-7) BE(4-7) BF(4-7) + const __m512i rhs_mat_014589CD_0_sp2 = _mm512_shuffle_epi32(rhs_mat_014589CD_0, (_MM_PERM_ENUM)221); //B0(4-7) B1(4-7) B0(4-7) B1(4-7) B4(4-7) B5(4-7) B4(4-7) B5(4-7) B8(4-7) B9(4-7) B8(4-7) B9(4-7) BC(4-7) BD(4-7) BC(4-7) BD(4-7) + const __m512i rhs_mat_2367ABEF_0_sp2 = _mm512_shuffle_epi32(rhs_mat_2367ABEF_0, (_MM_PERM_ENUM)221); //B2(4-7) B3(4-7) B2(4-7) B3(4-7) B6(4-7) B7(4-7) B6(4-7) B7(4-7) BA(4-7) BB(4-7) BA(4-7) BB(4-7) BE(4-7) BF(4-7) BE(4-7) BF(4-7) - const __m512i rhs_mat_014589CD_1_sp2 = _mm512_shuffle_epi32(rhs_mat_014589CD_1, 221); //B0(12-15) B1(12-15) B0(12-15) B1(12-15) B4(12-15) B5(12-15) B4(12-15) B5(12-15) B8(12-15) B9(12-15) B8(12-15) B9(12-15) BC(12-15) BD(12-15) BC(12-15) BD(12-15) - const __m512i rhs_mat_2367ABEF_1_sp2 = _mm512_shuffle_epi32(rhs_mat_2367ABEF_1, 221); //B2(12-15) B3(12-15) B2(12-15) B3(12-15) B6(12-15) B7(12-15) B6(12-15) B7(12-15) BA(12-15) BB(12-15) BA(12-15) BB(12-15) BE(12-15) BF(12-15) BE(12-15) BF(12-15) + const __m512i rhs_mat_014589CD_1_sp2 = _mm512_shuffle_epi32(rhs_mat_014589CD_1, (_MM_PERM_ENUM)221); //B0(12-15) B1(12-15) B0(12-15) B1(12-15) B4(12-15) B5(12-15) B4(12-15) B5(12-15) B8(12-15) B9(12-15) B8(12-15) B9(12-15) BC(12-15) BD(12-15) BC(12-15) BD(12-15) + const __m512i rhs_mat_2367ABEF_1_sp2 = _mm512_shuffle_epi32(rhs_mat_2367ABEF_1, (_MM_PERM_ENUM)221); //B2(12-15) B3(12-15) B2(12-15) B3(12-15) B6(12-15) B7(12-15) B6(12-15) B7(12-15) BA(12-15) BB(12-15) BA(12-15) BB(12-15) BE(12-15) BF(12-15) BE(12-15) BF(12-15) - const __m512i rhs_mat_014589CD_2_sp2 = _mm512_shuffle_epi32(rhs_mat_014589CD_2, 221); //B0(20-23) B1(20-23) B0(20-23) B1(20-23) B4(20-23) B5(20-23) B4(20-23) B5(20-23) B8(20-23) B9(20-23) B8(20-23) B9(20-23) BC(20-23) BD(20-23) BC(20-23) BD(20-23) - const __m512i rhs_mat_2367ABEF_2_sp2 = _mm512_shuffle_epi32(rhs_mat_2367ABEF_2, 221); //B2(20-23) B3(20-23) B2(20-23) B3(20-23) B6(20-23) B7(20-23) B6(20-23) B7(20-23) BA(20-23) BB(20-23) BA(20-23) BB(20-23) BE(20-23) BF(20-23) BE(20-23) BF(20-23) + const __m512i rhs_mat_014589CD_2_sp2 = _mm512_shuffle_epi32(rhs_mat_014589CD_2, (_MM_PERM_ENUM)221); //B0(20-23) B1(20-23) B0(20-23) B1(20-23) B4(20-23) B5(20-23) B4(20-23) B5(20-23) B8(20-23) B9(20-23) B8(20-23) B9(20-23) BC(20-23) BD(20-23) BC(20-23) BD(20-23) + const __m512i rhs_mat_2367ABEF_2_sp2 = _mm512_shuffle_epi32(rhs_mat_2367ABEF_2, (_MM_PERM_ENUM)221); //B2(20-23) B3(20-23) B2(20-23) B3(20-23) B6(20-23) B7(20-23) B6(20-23) B7(20-23) BA(20-23) BB(20-23) BA(20-23) BB(20-23) BE(20-23) BF(20-23) BE(20-23) BF(20-23) - const __m512i rhs_mat_014589CD_3_sp2 = _mm512_shuffle_epi32(rhs_mat_014589CD_3, 221); //B0(28-31) B1(28-31) B0(28-31) B1(28-31) B4(28-31) B5(28-31) B4(28-31) B5(28-31) B8(28-31) B9(28-31) B8(28-31) B9(28-31) BC(28-31) BD(28-31) BC(28-31) BD(28-31) - const __m512i rhs_mat_2367ABEF_3_sp2 = _mm512_shuffle_epi32(rhs_mat_2367ABEF_3, 221); //B2(28-31) B3(28-31) B2(28-31) B3(28-31) B6(28-31) B7(28-31) B6(28-31) B7(28-31) BA(28-31) BB(28-31) BA(28-31) BB(28-31) BE(28-31) BF(28-31) BE(28-31) BF(28-31) + const __m512i rhs_mat_014589CD_3_sp2 = _mm512_shuffle_epi32(rhs_mat_014589CD_3, (_MM_PERM_ENUM)221); //B0(28-31) B1(28-31) B0(28-31) B1(28-31) B4(28-31) B5(28-31) B4(28-31) B5(28-31) B8(28-31) B9(28-31) B8(28-31) B9(28-31) BC(28-31) BD(28-31) BC(28-31) BD(28-31) + const __m512i rhs_mat_2367ABEF_3_sp2 = _mm512_shuffle_epi32(rhs_mat_2367ABEF_3, (_MM_PERM_ENUM)221); //B2(28-31) B3(28-31) B2(28-31) B3(28-31) B6(28-31) B7(28-31) B6(28-31) B7(28-31) BA(28-31) BB(28-31) BA(28-31) BB(28-31) BE(28-31) BF(28-31) BE(28-31) BF(28-31) // Scale values - Load the weight scale values of two block_q4_0x8 @@ -2809,31 +2846,31 @@ void ggml_gemm_q4_0_8x8_q8_0(int n, float * restrict s, size_t bs, const void * // Shuffle pattern one - left side input - const __m512i lhs_mat_01_0_sp1 = _mm512_shuffle_epi32(lhs_mat_01_0, 160); //A0(0-3) A0(0-3) A1(0-3) A1(0-3) A0(0-3) A0(0-3) A1(0-3) A1(0-3) A0(0-3) A0(0-3) A1(0-3) A1(0-3) A0(0-3) A0(0-3) A1(0-3) A1(0-3) - const __m512i lhs_mat_23_0_sp1 = _mm512_shuffle_epi32(lhs_mat_23_0, 160); //A2(0-3) A2(0-3) A3(0-3) A3(0-3) A2(0-3) A2(0-3) A3(0-3) A3(0-3) A2(0-3) A2(0-3) A3(0-3) A3(0-3) A2(0-3) A2(0-3) A3(0-3) A3(0-3) + const __m512i lhs_mat_01_0_sp1 = _mm512_shuffle_epi32(lhs_mat_01_0, (_MM_PERM_ENUM)160); //A0(0-3) A0(0-3) A1(0-3) A1(0-3) A0(0-3) A0(0-3) A1(0-3) A1(0-3) A0(0-3) A0(0-3) A1(0-3) A1(0-3) A0(0-3) A0(0-3) A1(0-3) A1(0-3) + const __m512i lhs_mat_23_0_sp1 = _mm512_shuffle_epi32(lhs_mat_23_0, (_MM_PERM_ENUM)160); //A2(0-3) A2(0-3) A3(0-3) A3(0-3) A2(0-3) A2(0-3) A3(0-3) A3(0-3) A2(0-3) A2(0-3) A3(0-3) A3(0-3) A2(0-3) A2(0-3) A3(0-3) A3(0-3) - const __m512i lhs_mat_01_1_sp1 = _mm512_shuffle_epi32(lhs_mat_01_1, 160); //A0(8-11) A0(8-11) A1(8-11) A1(8-11) A0(8-11) A0(8-11) A1(8-11) A1(8-11) A0(8-11) A0(8-11) A1(8-11) A1(8-11) A0(8-11) A0(8-11) A1(8-11) A1(8-11) - const __m512i lhs_mat_23_1_sp1 = _mm512_shuffle_epi32(lhs_mat_23_1, 160); //A2(8-11) A2(8-11) A3(8-11) A3(8-11) A2(8-11) A2(8-11) A3(8-11) A3(8-11) A2(8-11) A2(8-11) A3(8-11) A3(8-11) A2(8-11) A2(8-11) A3(8-11) A3(8-11) + const __m512i lhs_mat_01_1_sp1 = _mm512_shuffle_epi32(lhs_mat_01_1, (_MM_PERM_ENUM)160); //A0(8-11) A0(8-11) A1(8-11) A1(8-11) A0(8-11) A0(8-11) A1(8-11) A1(8-11) A0(8-11) A0(8-11) A1(8-11) A1(8-11) A0(8-11) A0(8-11) A1(8-11) A1(8-11) + const __m512i lhs_mat_23_1_sp1 = _mm512_shuffle_epi32(lhs_mat_23_1, (_MM_PERM_ENUM)160); //A2(8-11) A2(8-11) A3(8-11) A3(8-11) A2(8-11) A2(8-11) A3(8-11) A3(8-11) A2(8-11) A2(8-11) A3(8-11) A3(8-11) A2(8-11) A2(8-11) A3(8-11) A3(8-11) - const __m512i lhs_mat_01_2_sp1 = _mm512_shuffle_epi32(lhs_mat_01_2, 160); //A0(16-19) A0(16-19) A1(16-19) A1(16-19) A0(16-19) A0(16-19) A1(16-19) A1(16-19) A0(16-19) A0(16-19) A1(16-19) A1(16-19) A0(16-19) A0(16-19) A1(16-19) A1(16-19) - const __m512i lhs_mat_23_2_sp1 = _mm512_shuffle_epi32(lhs_mat_23_2, 160); //A2(16-19) A2(16-19) A3(16-19) A3(16-19) A2(16-19) A2(16-19) A3(16-19) A3(16-19) A2(16-19) A2(16-19) A3(16-19) A3(16-19) A2(16-19) A2(16-19) A3(16-19) A3(16-19) + const __m512i lhs_mat_01_2_sp1 = _mm512_shuffle_epi32(lhs_mat_01_2, (_MM_PERM_ENUM)160); //A0(16-19) A0(16-19) A1(16-19) A1(16-19) A0(16-19) A0(16-19) A1(16-19) A1(16-19) A0(16-19) A0(16-19) A1(16-19) A1(16-19) A0(16-19) A0(16-19) A1(16-19) A1(16-19) + const __m512i lhs_mat_23_2_sp1 = _mm512_shuffle_epi32(lhs_mat_23_2, (_MM_PERM_ENUM)160); //A2(16-19) A2(16-19) A3(16-19) A3(16-19) A2(16-19) A2(16-19) A3(16-19) A3(16-19) A2(16-19) A2(16-19) A3(16-19) A3(16-19) A2(16-19) A2(16-19) A3(16-19) A3(16-19) - const __m512i lhs_mat_01_3_sp1 = _mm512_shuffle_epi32(lhs_mat_01_3, 160); //A0(24-27) A0(24-27) A1(24-27) A1(24-27) A0(24-27) A0(24-27) A1(24-27) A1(24-27) A0(24-27) A0(24-27) A1(24-27) A1(24-27) A0(24-27) A0(24-27) A1(24-27) A1(24-27) - const __m512i lhs_mat_23_3_sp1 = _mm512_shuffle_epi32(lhs_mat_23_3, 160); //A2(24-27) A2(24-27) A3(24-27) A3(24-27) A2(24-27) A2(24-27) A3(24-27) A3(24-27) A2(24-27) A2(24-27) A3(24-27) A3(24-27) A2(24-27) A2(24-27) A3(24-27) A3(24-27) + const __m512i lhs_mat_01_3_sp1 = _mm512_shuffle_epi32(lhs_mat_01_3, (_MM_PERM_ENUM)160); //A0(24-27) A0(24-27) A1(24-27) A1(24-27) A0(24-27) A0(24-27) A1(24-27) A1(24-27) A0(24-27) A0(24-27) A1(24-27) A1(24-27) A0(24-27) A0(24-27) A1(24-27) A1(24-27) + const __m512i lhs_mat_23_3_sp1 = _mm512_shuffle_epi32(lhs_mat_23_3, (_MM_PERM_ENUM)160); //A2(24-27) A2(24-27) A3(24-27) A3(24-27) A2(24-27) A2(24-27) A3(24-27) A3(24-27) A2(24-27) A2(24-27) A3(24-27) A3(24-27) A2(24-27) A2(24-27) A3(24-27) A3(24-27) // Shuffle pattern two - left side input - const __m512i lhs_mat_01_0_sp2 = _mm512_shuffle_epi32(lhs_mat_01_0, 245); //A0(4-7) A0(4-7) A1(4-7) A1(4-7) A0(4-7) A0(4-7) A1(4-7) A1(4-7) A0(4-7) A0(4-7) A1(4-7) A1(4-7) A0(4-7) A0(4-7) A1(4-7) A1(4-7) - const __m512i lhs_mat_23_0_sp2 = _mm512_shuffle_epi32(lhs_mat_23_0, 245); //A2(4-7) A2(4-7) A3(4-7) A3(4-7) A2(4-7) A2(4-7) A3(4-7) A3(4-7) A2(4-7) A2(4-7) A3(4-7) A3(4-7) A2(4-7) A2(4-7) A3(4-7) A3(4-7) + const __m512i lhs_mat_01_0_sp2 = _mm512_shuffle_epi32(lhs_mat_01_0, (_MM_PERM_ENUM)245); //A0(4-7) A0(4-7) A1(4-7) A1(4-7) A0(4-7) A0(4-7) A1(4-7) A1(4-7) A0(4-7) A0(4-7) A1(4-7) A1(4-7) A0(4-7) A0(4-7) A1(4-7) A1(4-7) + const __m512i lhs_mat_23_0_sp2 = _mm512_shuffle_epi32(lhs_mat_23_0, (_MM_PERM_ENUM)245); //A2(4-7) A2(4-7) A3(4-7) A3(4-7) A2(4-7) A2(4-7) A3(4-7) A3(4-7) A2(4-7) A2(4-7) A3(4-7) A3(4-7) A2(4-7) A2(4-7) A3(4-7) A3(4-7) - const __m512i lhs_mat_01_1_sp2 = _mm512_shuffle_epi32(lhs_mat_01_1, 245); //A0(12-15) A0(12-15) A1(12-15) A1(12-15) A0(12-15) A0(12-15) A1(12-15) A1(12-15) A0(12-15) A0(12-15) A1(12-15) A1(12-15) A0(12-15) A0(12-15) A1(12-15) A1(12-15) - const __m512i lhs_mat_23_1_sp2 = _mm512_shuffle_epi32(lhs_mat_23_1, 245); //A2(12-15) A2(12-15) A3(12-15) A3(12-15) A2(12-15) A2(12-15) A3(12-15) A3(12-15) A2(12-15) A2(12-15) A3(12-15) A3(12-15) A2(12-15) A2(12-15) A3(12-15) A3(12-15) + const __m512i lhs_mat_01_1_sp2 = _mm512_shuffle_epi32(lhs_mat_01_1, (_MM_PERM_ENUM)245); //A0(12-15) A0(12-15) A1(12-15) A1(12-15) A0(12-15) A0(12-15) A1(12-15) A1(12-15) A0(12-15) A0(12-15) A1(12-15) A1(12-15) A0(12-15) A0(12-15) A1(12-15) A1(12-15) + const __m512i lhs_mat_23_1_sp2 = _mm512_shuffle_epi32(lhs_mat_23_1, (_MM_PERM_ENUM)245); //A2(12-15) A2(12-15) A3(12-15) A3(12-15) A2(12-15) A2(12-15) A3(12-15) A3(12-15) A2(12-15) A2(12-15) A3(12-15) A3(12-15) A2(12-15) A2(12-15) A3(12-15) A3(12-15) - const __m512i lhs_mat_01_2_sp2 = _mm512_shuffle_epi32(lhs_mat_01_2, 245); //A0(20-23) A0(20-23) A1(20-23) A1(20-23) A0(20-23) A0(20-23) A1(20-23) A1(20-23) A0(20-23) A0(20-23) A1(20-23) A1(20-23) A0(20-23) A0(20-23) A1(20-23) A1(20-23) - const __m512i lhs_mat_23_2_sp2 = _mm512_shuffle_epi32(lhs_mat_23_2, 245); //A2(20-23) A2(20-23) A3(20-23) A3(20-23) A2(20-23) A2(20-23) A3(20-23) A3(20-23) A2(20-23) A2(20-23) A3(20-23) A3(20-23) A2(20-23) A2(20-23) A3(20-23) A3(20-23) + const __m512i lhs_mat_01_2_sp2 = _mm512_shuffle_epi32(lhs_mat_01_2, (_MM_PERM_ENUM)245); //A0(20-23) A0(20-23) A1(20-23) A1(20-23) A0(20-23) A0(20-23) A1(20-23) A1(20-23) A0(20-23) A0(20-23) A1(20-23) A1(20-23) A0(20-23) A0(20-23) A1(20-23) A1(20-23) + const __m512i lhs_mat_23_2_sp2 = _mm512_shuffle_epi32(lhs_mat_23_2, (_MM_PERM_ENUM)245); //A2(20-23) A2(20-23) A3(20-23) A3(20-23) A2(20-23) A2(20-23) A3(20-23) A3(20-23) A2(20-23) A2(20-23) A3(20-23) A3(20-23) A2(20-23) A2(20-23) A3(20-23) A3(20-23) - const __m512i lhs_mat_01_3_sp2 = _mm512_shuffle_epi32(lhs_mat_01_3, 245); //A0(28-31) A0(28-31) A1(28-31) A1(28-31) A0(28-31) A0(28-31) A1(28-31) A1(28-31) A0(28-31) A0(28-31) A1(28-31) A1(28-31) A0(28-31) A0(28-31) A1(28-31) A1(28-31) - const __m512i lhs_mat_23_3_sp2 = _mm512_shuffle_epi32(lhs_mat_23_3, 245); //A2(28-31) A2(28-31) A3(28-31) A3(28-31) A2(28-31) A2(28-31) A3(28-31) A3(28-31) A2(28-31) A2(28-31) A3(28-31) A3(28-31) A2(28-31) A2(28-31) A3(28-31) A3(28-31) + const __m512i lhs_mat_01_3_sp2 = _mm512_shuffle_epi32(lhs_mat_01_3, (_MM_PERM_ENUM)245); //A0(28-31) A0(28-31) A1(28-31) A1(28-31) A0(28-31) A0(28-31) A1(28-31) A1(28-31) A0(28-31) A0(28-31) A1(28-31) A1(28-31) A0(28-31) A0(28-31) A1(28-31) A1(28-31) + const __m512i lhs_mat_23_3_sp2 = _mm512_shuffle_epi32(lhs_mat_23_3, (_MM_PERM_ENUM)245); //A2(28-31) A2(28-31) A3(28-31) A3(28-31) A2(28-31) A2(28-31) A3(28-31) A3(28-31) A2(28-31) A2(28-31) A3(28-31) A3(28-31) A2(28-31) A2(28-31) A3(28-31) A3(28-31) // The values arranged in shuffle patterns are operated with dot product operation within 32 bit lane i.e corresponding bytes and multiplied and added into 32 bit integers within 32 bit lane // Resembles MMLAs into 2x2 matrices in ARM Version @@ -2862,10 +2899,10 @@ void ggml_gemm_q4_0_8x8_q8_0(int n, float * restrict s, size_t bs, const void * // Straighten out to make 4 row vectors - __m512i iacc_row_0 = _mm512_mask_blend_epi32(0xCCCC, iacc_mat_00, _mm512_shuffle_epi32(iacc_mat_01, 78)); - __m512i iacc_row_1 = _mm512_mask_blend_epi32(0xCCCC, _mm512_shuffle_epi32(iacc_mat_00, 78), iacc_mat_01); - __m512i iacc_row_2 = _mm512_mask_blend_epi32(0xCCCC, iacc_mat_10, _mm512_shuffle_epi32(iacc_mat_11, 78)); - __m512i iacc_row_3 = _mm512_mask_blend_epi32(0xCCCC, _mm512_shuffle_epi32(iacc_mat_10, 78), iacc_mat_11); + __m512i iacc_row_0 = _mm512_mask_blend_epi32(0xCCCC, iacc_mat_00, _mm512_shuffle_epi32(iacc_mat_01, (_MM_PERM_ENUM)78)); + __m512i iacc_row_1 = _mm512_mask_blend_epi32(0xCCCC, _mm512_shuffle_epi32(iacc_mat_00, (_MM_PERM_ENUM)78), iacc_mat_01); + __m512i iacc_row_2 = _mm512_mask_blend_epi32(0xCCCC, iacc_mat_10, _mm512_shuffle_epi32(iacc_mat_11, (_MM_PERM_ENUM)78)); + __m512i iacc_row_3 = _mm512_mask_blend_epi32(0xCCCC, _mm512_shuffle_epi32(iacc_mat_10, (_MM_PERM_ENUM)78), iacc_mat_11); // Load the scale(d) values for all the 4 Q8_0 blocks and repeat it across lanes const __m128i row_scale_f16 = _mm_shuffle_epi32(_mm_maskload_epi32((int const*)(a_ptr[b].d), loadMask), 68); @@ -3460,7 +3497,7 @@ void ggml_gemm_q4_0_8x8_q8_0(int n, float * restrict s, size_t bs, const void * } } -void ggml_gemm_iq4_nl_4x4_q8_0(int n, float * restrict s, size_t bs, const void * restrict vx, const void * restrict vy, int nr, int nc) { +static void ggml_gemm_iq4_nl_4x4_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc) { const int qk = QK8_0; const int nb = n / qk; const int ncols_interleaved = 4; @@ -3571,7 +3608,6 @@ void ggml_gemm_iq4_nl_4x4_q8_0(int n, float * restrict s, size_t bs, const void } } -// FIXME: this code is duplicated from ggml-aarch64.c static block_q4_0x4 make_block_q4_0x4(block_q4_0 * in, unsigned int blck_size_interleave) { block_q4_0x4 out; @@ -3641,20 +3677,20 @@ static block_q4_0x8 make_block_q4_0x8(block_q4_0 * in, unsigned int blck_size_in return out; } -static int repack_q4_0_to_q4_0_4_bl(struct ggml_tensor * t, int interleave_block, const void * restrict data, size_t data_size) { +static int repack_q4_0_to_q4_0_4_bl(struct ggml_tensor * t, int interleave_block, const void * GGML_RESTRICT data, size_t data_size) { GGML_ASSERT(t->type == GGML_TYPE_Q4_0); GGML_ASSERT(interleave_block == 4 || interleave_block == 8); + constexpr int nrows_interleaved = 4; block_q4_0x4 * dst = (block_q4_0x4 *)t->data; const block_q4_0 * src = (const block_q4_0 *)data; block_q4_0 dst_tmp[4]; - int nrow = t->ne[1]; // Number of rows - int nrows_interleaved = 4; + int nrow = ggml_nrows(t); int nblocks = t->ne[0] / QK4_0; GGML_ASSERT(data_size == nrow * nblocks * sizeof(block_q4_0)); - if (nrow % nrows_interleaved != 0 || t->ne[0] % 8 != 0) { + if (t->ne[1] % nrows_interleaved != 0 || t->ne[0] % 8 != 0) { return -1; } @@ -3672,20 +3708,20 @@ static int repack_q4_0_to_q4_0_4_bl(struct ggml_tensor * t, int interleave_block GGML_UNUSED(data_size); } -static int repack_q4_0_to_q4_0_8_bl(struct ggml_tensor *t, int interleave_block, const void * restrict data, size_t data_size) { +static int repack_q4_0_to_q4_0_8_bl(struct ggml_tensor * t, int interleave_block, const void * GGML_RESTRICT data, size_t data_size) { GGML_ASSERT(t->type == GGML_TYPE_Q4_0); GGML_ASSERT(interleave_block == 8); + constexpr int nrows_interleaved = 8; block_q4_0x8 * dst = (block_q4_0x8*)t->data; const block_q4_0 * src = (const block_q4_0*) data; block_q4_0 dst_tmp[8]; - int nrow = t->ne[1]; // Number of rows - int nrows_interleaved = 8; + int nrow = ggml_nrows(t); int nblocks = t->ne[0] / QK4_0; GGML_ASSERT(data_size == nrow * nblocks * sizeof(block_q4_0)); - if (nrow % nrows_interleaved != 0 || t->ne[0] % 8 != 0) { + if (t->ne[1] % nrows_interleaved != 0 || t->ne[0] % 8 != 0) { return -1; } @@ -3736,20 +3772,20 @@ static block_iq4_nlx4 make_block_iq4_nlx4(block_iq4_nl * in, unsigned int blck_s return out; } -static int repack_iq4_nl_to_iq4_nl_4_bl(struct ggml_tensor * t, int interleave_block, const void * restrict data, size_t data_size) { +static int repack_iq4_nl_to_iq4_nl_4_bl(struct ggml_tensor * t, int interleave_block, const void * GGML_RESTRICT data, size_t data_size) { GGML_ASSERT(t->type == GGML_TYPE_IQ4_NL); GGML_ASSERT(interleave_block == 4 || interleave_block == 8); block_iq4_nlx4 * dst = (block_iq4_nlx4 *)t->data; const block_iq4_nl * src = (const block_iq4_nl *)data; block_iq4_nl dst_tmp[4]; - int nrow = t->ne[1]; // Number of rows + int nrow = ggml_nrows(t); int nrows_interleaved = 4; int nblocks = t->ne[0] / QK4_0; GGML_ASSERT(data_size == nrow * nblocks * sizeof(block_iq4_nl)); - if (nrow % nrows_interleaved != 0 || t->ne[0] % 8 != 0) { + if (t->ne[1] % nrows_interleaved != 0 || t->ne[0] % 8 != 0) { return -1; } @@ -3767,57 +3803,456 @@ static int repack_iq4_nl_to_iq4_nl_4_bl(struct ggml_tensor * t, int interleave_b GGML_UNUSED(data_size); } -// Prepare for optimized kernels if applicable -void ggml_aarch64_repack_tensor(struct ggml_tensor * cur, enum ggml_type repack_type, const void * restrict data, size_t data_size) { - if (cur->type == repack_type) { - memcpy(cur->data, data, data_size); - return; +namespace ggml::cpu::aarch64 { +// repack +template +int repack(struct ggml_tensor *, const void *, size_t); + +// TODO: generalise. +template <> int repack(struct ggml_tensor * t, const void * data, size_t data_size) { + return repack_q4_0_to_q4_0_4_bl(t, 4, data, data_size); +} + +template <> int repack(struct ggml_tensor * t, const void * data, size_t data_size) { + return repack_q4_0_to_q4_0_4_bl(t, 8, data, data_size); +} + +template <> int repack(struct ggml_tensor * t, const void * data, size_t data_size) { + return repack_q4_0_to_q4_0_8_bl(t, 8, data, data_size); +} + +template <> int repack(struct ggml_tensor * t, const void * data, size_t data_size) { + return repack_iq4_nl_to_iq4_nl_4_bl(t, 4, data, data_size); +} + +template <> int repack(struct ggml_tensor * t, const void * data, size_t data_size) { + return repack_iq4_nl_to_iq4_nl_4_bl(t, 8, data, data_size); +} + +// gemv +template +void gemv(int, float *, size_t, const void *, const void *, int, int); + +template <> void gemv(int n, float * s, size_t bs, const void * vx, const void * vy, int nr, int nc) { + ggml_gemv_q4_0_4x4_q8_0(n, s, bs, vx, vy, nr, nc); +} + +template <> void gemv(int n, float * s, size_t bs, const void * vx, const void * vy, int nr, int nc) { + ggml_gemv_q4_0_4x8_q8_0(n, s, bs, vx, vy, nr, nc); +} + +template <> void gemv(int n, float * s, size_t bs, const void * vx, const void * vy, int nr, int nc) { + ggml_gemv_q4_0_8x8_q8_0(n, s, bs, vx, vy, nr, nc); +} + +template <> +void gemv(int n, float * s, size_t bs, const void * vx, const void * vy, int nr, int nc) { + ggml_gemv_iq4_nl_4x4_q8_0(n, s, bs, vx, vy, nr, nc); +} + +// gemm +template +void gemm(int, float *, size_t, const void *, const void *, int, int); + +template <> void gemm(int n, float * s, size_t bs, const void * vx, const void * vy, int nr, int nc) { + ggml_gemm_q4_0_4x4_q8_0(n, s, bs, vx, vy, nr, nc); +} + +template <> void gemm(int n, float * s, size_t bs, const void * vx, const void * vy, int nr, int nc) { + ggml_gemm_q4_0_4x8_q8_0(n, s, bs, vx, vy, nr, nc); +} + +template <> void gemm(int n, float * s, size_t bs, const void * vx, const void * vy, int nr, int nc) { + ggml_gemm_q4_0_8x8_q8_0(n, s, bs, vx, vy, nr, nc); +} + +template <> +void gemm(int n, float * s, size_t bs, const void * vx, const void * vy, int nr, int nc) { + ggml_gemm_iq4_nl_4x4_q8_0(n, s, bs, vx, vy, nr, nc); +} + +class tensor_traits_base : public ggml::cpu::tensor_traits { + public: + virtual int repack(struct ggml_tensor * t, const void * data, size_t data_size) = 0; +}; + +template class tensor_traits : public tensor_traits_base { + + bool work_size(int /* n_threads */, const struct ggml_tensor * op, size_t & size) override { + // not realy a GGML_TYPE_Q8_0 but same size. + switch (op->op) { + case GGML_OP_MUL_MAT: + size = ggml_row_size(GGML_TYPE_Q8_0, ggml_nelements(op->src[1])); + return true; + case GGML_OP_MUL_MAT_ID: + size = ggml_row_size(GGML_TYPE_Q8_0, ggml_nelements(op->src[1])); + size = GGML_PAD(size, sizeof(int64_t)); // + padding for next bloc. + size += sizeof(int64_t) * (1+op->src[0]->ne[2]) * op->src[1]->ne[2]; + return true; + default: + // GGML_ABORT("fatal error"); + break; + } + return false; } - if (cur->type == GGML_TYPE_Q4_0) { - switch (repack_type) { - case GGML_TYPE_Q4_0_8_8: - repack_q4_0_to_q4_0_8_bl(cur, 8, data, data_size); - break; - case GGML_TYPE_Q4_0_4_8: - repack_q4_0_to_q4_0_4_bl(cur, 8, data, data_size); - break; - case GGML_TYPE_Q4_0_4_4: - repack_q4_0_to_q4_0_4_bl(cur, 4, data, data_size); - break; - default: - GGML_ABORT("Unsupported type"); + bool compute_forward(struct ggml_compute_params * params, struct ggml_tensor * op) override { + switch (op->op) { + case GGML_OP_MUL_MAT: + forward_mul_mat(params, op); + return true; + case GGML_OP_MUL_MAT_ID: + forward_mul_mat_id(params, op); + return true; + default: + // GGML_ABORT("fatal error"); + break; } - } else if (cur->type == GGML_TYPE_IQ4_NL) { - switch (repack_type) { - case GGML_TYPE_IQ4_NL_4_4: - repack_iq4_nl_to_iq4_nl_4_bl(cur, 4, data, data_size); - break; - default: - GGML_ABORT("Unsupported type"); + return false; + } + + void forward_mul_mat(ggml_compute_params * params, ggml_tensor * op) { + const ggml_tensor * src0 = op->src[0]; + const ggml_tensor * src1 = op->src[1]; + ggml_tensor * dst = op; + + GGML_TENSOR_BINARY_OP_LOCALS + + const int ith = params->ith; + const int nth = params->nth; + + GGML_ASSERT(ne0 == ne01); + GGML_ASSERT(ne1 == ne11); + GGML_ASSERT(ne2 == ne12); + GGML_ASSERT(ne3 == ne13); + + // dst cannot be transposed or permuted + GGML_ASSERT(nb0 == sizeof(float)); + GGML_ASSERT(nb0 <= nb1); + GGML_ASSERT(nb1 <= nb2); + GGML_ASSERT(nb2 <= nb3); + + GGML_ASSERT(src1->type == GGML_TYPE_F32); + + GGML_ASSERT(ggml_n_dims(op->src[0]) == 2); + // GGML_ASSERT(ggml_n_dims(op->src[1]) == 2); + + char * wdata = static_cast(params->wdata); + const size_t nbw1 = ggml_row_size(GGML_TYPE_Q8_0, ne10); + + assert(params->wsize >= nbw1 * ne11); + + const ggml_from_float_t from_float = ggml_get_type_traits_cpu(GGML_TYPE_Q8_0)->from_float; + + int64_t i11_processed = 0; + for (int64_t i11 = ith * 4; i11 < ne11 - ne11 % 4; i11 += nth * 4) { + quantize_mat_q8_0((float *) ((char *) src1->data + i11 * nb11), (void *) (wdata + i11 * nbw1), 4, ne10, + INTER_SIZE); + } + i11_processed = ne11 - ne11 % 4; + for (int64_t i11 = i11_processed + ith; i11 < ne11; i11 += nth) { + from_float((float *) ((char *) src1->data + i11 * nb11), (void *) (wdata + i11 * nbw1), ne10); + } + + ggml_barrier(params->threadpool); + + const void * src1_wdata = params->wdata; + const size_t src1_col_stride = ggml_row_size(GGML_TYPE_Q8_0, ne10); + int64_t src0_start = (ith * ne01) / nth; + int64_t src0_end = ((ith + 1) * ne01) / nth; + src0_start = (src0_start % NB_COLS) ? src0_start + NB_COLS - (src0_start % NB_COLS) : src0_start; + src0_end = (src0_end % NB_COLS) ? src0_end + NB_COLS - (src0_end % NB_COLS) : src0_end; + if (src0_start >= src0_end) { + return; + } + + // If there are more than three rows in src1, use gemm; otherwise, use gemv. + if (ne11 > 3) { + gemm(ne00, (float *) ((char *) dst->data) + src0_start, ne01, + (const char *) src0->data + src0_start * nb01, + (const char *) src1_wdata, ne11 - ne11 % 4, src0_end - src0_start); + } + for (int iter = ne11 - ne11 % 4; iter < ne11; iter++) { + gemv(ne00, (float *) ((char *) dst->data + (iter * nb1)) + src0_start, ne01, + (const char *) src0->data + src0_start * nb01, + (const char *) src1_wdata + (src1_col_stride * iter), 1, + src0_end - src0_start); } - } else { - GGML_ABORT("Unsupported type"); } -} -enum ggml_type ggml_aarch64_get_optimal_repack_type(const struct ggml_tensor * cur) { + void forward_mul_mat_id(ggml_compute_params * params, ggml_tensor * op) { + const ggml_tensor * src0 = op->src[0]; + const ggml_tensor * src1 = op->src[1]; + const ggml_tensor * ids = op->src[2]; + ggml_tensor * dst = op; + + GGML_TENSOR_BINARY_OP_LOCALS + + const int ith = params->ith; + const int nth = params->nth; + + const ggml_from_float_t from_float = ggml_get_type_traits_cpu(GGML_TYPE_Q8_0)->from_float; + + // we don't support permuted src0 or src1 + GGML_ASSERT(nb00 == ggml_type_size(src0->type)); + GGML_ASSERT(nb10 == ggml_type_size(src1->type)); + + // dst cannot be transposed or permuted + GGML_ASSERT(nb0 == sizeof(float)); + GGML_ASSERT(nb0 <= nb1); + GGML_ASSERT(nb1 <= nb2); + GGML_ASSERT(nb2 <= nb3); + + GGML_ASSERT(ne03 == 1); + GGML_ASSERT(ne13 == 1); + GGML_ASSERT(ne3 == 1); + + GGML_ASSERT(src1->type == GGML_TYPE_F32); + + // row groups + const int n_ids = ids->ne[0]; // n_expert_used + const int n_as = ne02; // n_expert + + const size_t nbw1 = ggml_row_size(GGML_TYPE_Q8_0, ne10); + const size_t nbw2 = nbw1*ne11; + const size_t nbw3 = nbw2*ne12; + + struct mmid_row_mapping { + int32_t i1; + int32_t i2; + }; + + GGML_ASSERT(params->wsize >= (GGML_PAD(nbw3, sizeof(int64_t)) + n_as * sizeof(int64_t) + + n_as * ne12 * sizeof(mmid_row_mapping))); + + auto wdata = (char *) params->wdata; + auto wdata_src1_end = (char *) wdata + GGML_PAD(nbw3, sizeof(int64_t)); + int64_t * matrix_row_counts = (int64_t *) (wdata_src1_end); // [n_as] + struct mmid_row_mapping * matrix_rows = (struct mmid_row_mapping *) (matrix_row_counts + n_as); // [n_as][ne12] + + // src1: float32 => block_q8_0 + for (int64_t i12 = 0; i12 < ne12; ++i12) { + for (int64_t i11 = ith; i11 < ne11; i11 += nth) { + from_float((float *)((char *) src1->data + i12 * nb12 + i11 * nb11), + (void *) (wdata + i12 * nbw2 + i11 * nbw1), + ne10); + } + } + +#define MMID_MATRIX_ROW(row_id, i1) matrix_rows[(row_id) * ne12 + (i1)] + + if (ith == 0) { + // initialize matrix_row_counts + memset(matrix_row_counts, 0, n_as * sizeof(int64_t)); + + // group rows by src0 matrix + for (int32_t iid1 = 0; iid1 < ids->ne[1]; ++iid1) { + for (int32_t id = 0; id < n_ids; ++id) { + const int32_t i02 = + *(const int32_t *) ((const char *) ids->data + iid1 * ids->nb[1] + id * ids->nb[0]); + + GGML_ASSERT(i02 >= 0 && i02 < n_as); + + MMID_MATRIX_ROW(i02, matrix_row_counts[i02]) = { id, iid1 }; + matrix_row_counts[i02] += 1; + } + } + } + + ggml_barrier(params->threadpool); + + // compute each matrix multiplication in sequence + for (int cur_a = 0; cur_a < n_as; ++cur_a) { + const int64_t cne1 = matrix_row_counts[cur_a]; + + if (cne1 == 0) { + continue; + } + + auto src0_cur = (const char *) src0->data + cur_a*nb02; + + //const int64_t nr0 = ne01; // src0 rows + const int64_t nr1 = cne1; // src1 rows + + int64_t src0_cur_start = (ith * ne01) / nth; + int64_t src0_cur_end = ((ith + 1) * ne01) / nth; + src0_cur_start = + (src0_cur_start % NB_COLS) ? src0_cur_start + NB_COLS - (src0_cur_start % NB_COLS) : src0_cur_start; + src0_cur_end = (src0_cur_end % NB_COLS) ? src0_cur_end + NB_COLS - (src0_cur_end % NB_COLS) : src0_cur_end; + + if (src0_cur_start >= src0_cur_end) return; + + for (int ir1 = 0; ir1 < nr1; ir1++) { + struct mmid_row_mapping row_mapping = MMID_MATRIX_ROW(cur_a, ir1); + const int id = row_mapping.i1; // selected expert index + + const int64_t i11 = id % ne11; + const int64_t i12 = row_mapping.i2; // row index in src1 + + const int64_t i1 = id; // selected expert index + const int64_t i2 = i12; // row + + auto src1_col = (const char *) wdata + (i11 * nbw1 + i12 * nbw2); + + gemv( + ne00, (float *)((char *) dst->data + (i1 * nb1 + i2 * nb2)) + src0_cur_start, + ne01, src0_cur + src0_cur_start * nb01, + src1_col, 1, src0_cur_end - src0_cur_start); + } + } +#undef MMID_MATRIX_ROW + } + + int repack(struct ggml_tensor * t, const void * data, size_t data_size) override { + GGML_LOG_DEBUG("%s: repack tensor %s with %s_%dx%d\n", __func__, t->name, ggml_type_name(t->type), + (int) NB_COLS, (int) INTER_SIZE); + return ggml::cpu::aarch64::repack(t, data, data_size); + } +}; + +// instance for Q4 +static const tensor_traits q4_0_4x4_q8_0; +static const tensor_traits q4_0_4x8_q8_0; +static const tensor_traits q4_0_8x8_q8_0; + +// instance for IQ4 +static const tensor_traits iq4_nl_4x4_q8_0; + +} // namespace ggml::cpu::aarch64 + +static const ggml::cpu::tensor_traits * ggml_aarch64_get_optimal_repack_type(const struct ggml_tensor * cur) { if (cur->type == GGML_TYPE_Q4_0) { - // TODO: enable for AVX2 - currently disabled due to bad gemv performance - if (/* ggml_cpu_has_avx2() || */ (ggml_cpu_has_sve() && ggml_cpu_has_matmul_int8() && ggml_cpu_get_sve_cnt() == QK8_0)) { - return GGML_TYPE_Q4_0_8_8; + if (ggml_cpu_has_avx2() || (ggml_cpu_has_sve() && ggml_cpu_has_matmul_int8() && ggml_cpu_get_sve_cnt() == QK8_0)) { + if (cur->ne[1] % 8 == 0) { + return &ggml::cpu::aarch64::q4_0_8x8_q8_0; + } } if (ggml_cpu_has_neon() && ggml_cpu_has_matmul_int8()) { - return GGML_TYPE_Q4_0_4_8; + if (cur->ne[1] % 4 == 0) { + return &ggml::cpu::aarch64::q4_0_4x8_q8_0; + } } if (ggml_cpu_has_neon() && ggml_cpu_has_dotprod()) { - return GGML_TYPE_Q4_0_4_4; + if (cur->ne[1] % 4 == 0) { + return &ggml::cpu::aarch64::q4_0_4x4_q8_0; + } } } else if (cur->type == GGML_TYPE_IQ4_NL) { if (ggml_cpu_has_neon() && ggml_cpu_has_dotprod()) { - return GGML_TYPE_IQ4_NL_4_4; + if (cur->ne[1] % 4 == 0) { + return &ggml::cpu::aarch64::iq4_nl_4x4_q8_0; + } } } - return cur->type; + return nullptr; +} + +static void ggml_backend_cpu_aarch64_buffer_init_tensor(ggml_backend_buffer_t buffer, struct ggml_tensor * tensor) { + tensor->extra = (void *) const_cast(ggml_aarch64_get_optimal_repack_type(tensor)); + + GGML_UNUSED(buffer); +} + +static void ggml_backend_cpu_aarch64_buffer_set_tensor(ggml_backend_buffer_t buffer, struct ggml_tensor * tensor, + const void * data, size_t offset, size_t size) { + GGML_ASSERT(offset == 0); + GGML_ASSERT(size == ggml_nbytes(tensor)); + + auto tensor_traits = (ggml::cpu::aarch64::tensor_traits_base *) tensor->extra; + auto OK = tensor_traits->repack(tensor, data, size); + + GGML_ASSERT(OK == 0); + GGML_UNUSED(buffer); +} + +static const char * ggml_backend_cpu_aarch64_buffer_type_get_name(ggml_backend_buffer_type_t buft) { + return "CPU_AARCH64"; + + GGML_UNUSED(buft); +} + +static ggml_backend_buffer_t ggml_backend_cpu_aarch64_buffer_type_alloc_buffer(ggml_backend_buffer_type_t buft, size_t size) { + ggml_backend_buffer_t buffer = ggml_backend_buft_alloc_buffer(ggml_backend_cpu_buffer_type(), size); + + if (buffer == nullptr) { + return nullptr; + } + + buffer->buft = buft; + buffer->iface.init_tensor = ggml_backend_cpu_aarch64_buffer_init_tensor; + buffer->iface.set_tensor = ggml_backend_cpu_aarch64_buffer_set_tensor; + return buffer; +} + +static size_t ggml_backend_cpu_aarch64_buffer_type_get_alignment(ggml_backend_buffer_type_t buft) { + return TENSOR_ALIGNMENT; + + GGML_UNUSED(buft); +} + +namespace ggml::cpu::aarch64 { +class extra_buffer_type : ggml::cpu::extra_buffer_type { + bool supports_op(ggml_backend_dev_t, const struct ggml_tensor * op) override { + if ( op->op == GGML_OP_MUL_MAT && + op->src[0]->buffer && + (ggml_n_dims(op->src[0]) == 2) && + op->src[0]->buffer->buft == ggml_backend_cpu_aarch64_buffer_type() && + ggml_aarch64_get_optimal_repack_type(op->src[0]) + ) { + if (op->src[1]->buffer && !ggml_backend_buft_is_host(op->src[1]->buffer->buft)) { + return false; + } + if (op->src[1]->type == GGML_TYPE_F32) { + return true; + } + //if (op->src[1]->type == GGML_TYPE_Q8_0) { + // return true; + //} + // may be possible if Q8_0 packed... + } else if (op->op == GGML_OP_MUL_MAT_ID + && op->src[0]->buffer + && (ggml_n_dims(op->src[0]) == 3) + && op->src[0]->buffer->buft == ggml_backend_cpu_aarch64_buffer_type() + && ggml_aarch64_get_optimal_repack_type(op->src[0]) + ) { + if (op->src[1]->buffer && !ggml_backend_buft_is_host(op->src[1]->buffer->buft)) { + return false; + } + if (op->src[1]->type == GGML_TYPE_F32) { + return true; + } + //if (op->src[1]->type == GGML_TYPE_Q8_0) { + // return true; + //} + } + return false; + } + + ggml::cpu::tensor_traits * get_tensor_traits(const struct ggml_tensor * op) override { + if (op->op == GGML_OP_MUL_MAT || op->op == GGML_OP_MUL_MAT_ID) { + if (op->src[0]->buffer && op->src[0]->buffer->buft == ggml_backend_cpu_aarch64_buffer_type()) { + return (ggml::cpu::tensor_traits *) op->src[0]->extra; + } + } + return nullptr; + } +}; +} // namespace ggml::cpu::aarch64 + +ggml_backend_buffer_type_t ggml_backend_cpu_aarch64_buffer_type(void) { + static struct ggml_backend_buffer_type ggml_backend_cpu_buffer_type_aarch64 = { + /* .iface = */ { + /* .get_name = */ ggml_backend_cpu_aarch64_buffer_type_get_name, + /* .alloc_buffer = */ ggml_backend_cpu_aarch64_buffer_type_alloc_buffer, + /* .get_alignment = */ ggml_backend_cpu_aarch64_buffer_type_get_alignment, + /* .get_max_size = */ nullptr, // defaults to SIZE_MAX + /* .get_alloc_size = */ nullptr, // defaults to ggml_nbytes + /* .is_host = */ nullptr, + }, + /* .device = */ ggml_backend_reg_dev_get(ggml_backend_cpu_reg(), 0), + /* .context = */ new ggml::cpu::aarch64::extra_buffer_type(), + }; + + return &ggml_backend_cpu_buffer_type_aarch64; } diff --git a/ggml/src/ggml-cpu/ggml-cpu-aarch64.h b/ggml/src/ggml-cpu/ggml-cpu-aarch64.h index 3d9db6a19eb87..6e84c826b4091 100644 --- a/ggml/src/ggml-cpu/ggml-cpu-aarch64.h +++ b/ggml/src/ggml-cpu/ggml-cpu-aarch64.h @@ -1,32 +1,8 @@ #pragma once +#include "ggml-cpu-traits.h" #include "ggml.h" // GGML internal header -#ifdef __cplusplus -extern "C" { -#endif - -// Quantization -void quantize_mat_q8_0(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int64_t nrows, int64_t n_per_row, int64_t blck_size_interleave); - -// GEMV -void ggml_gemv_q4_0_4x4_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc); -void ggml_gemv_q4_0_4x8_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc); -void ggml_gemv_q4_0_8x8_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc); -void ggml_gemv_iq4_nl_4x4_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc); - -// GEMM -void ggml_gemm_q4_0_4x4_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc); -void ggml_gemm_q4_0_4x8_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc); -void ggml_gemm_q4_0_8x8_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc); -void ggml_gemm_iq4_nl_4x4_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc); - -void ggml_aarch64_repack_tensor(struct ggml_tensor * cur, enum ggml_type repack_type, const void * data, size_t data_size); -enum ggml_type ggml_aarch64_get_optimal_repack_type(const struct ggml_tensor * cur); - -#ifdef __cplusplus -} -#endif - +ggml_backend_buffer_type_t ggml_backend_cpu_aarch64_buffer_type(void); diff --git a/ggml/src/ggml-cpu/ggml-cpu-hbm.cpp b/ggml/src/ggml-cpu/ggml-cpu-hbm.cpp new file mode 100644 index 0000000000000..fa8dea2af9c72 --- /dev/null +++ b/ggml/src/ggml-cpu/ggml-cpu-hbm.cpp @@ -0,0 +1,55 @@ +#ifdef GGML_USE_CPU_HBM + +#include "ggml-backend.h" +#include "ggml-backend-impl.h" +#include "ggml-cpu.h" +#include "ggml-impl.h" + +#include "ggml-cpu-hbm.h" + +// buffer type HBM + +#include + +static const char * ggml_backend_cpu_hbm_buffer_type_get_name(ggml_backend_buffer_type_t buft) { + return "CPU_HBM"; + + GGML_UNUSED(buft); +} + +static void ggml_backend_cpu_hbm_buffer_free_buffer(ggml_backend_buffer_t buffer) { + hbw_free(buffer->context); +} + +static ggml_backend_buffer_t ggml_backend_cpu_hbm_buffer_type_alloc_buffer(ggml_backend_buffer_type_t buft, + size_t size) { + void * ptr; + int result = hbw_posix_memalign(&ptr, ggml_backend_cpu_buffer_type_get_alignment(buft), size); + if (result != 0) { + GGML_LOG_ERROR("failed to allocate HBM buffer of size %zu\n", size); + return NULL; + } + + ggml_backend_buffer_t buffer = ggml_backend_cpu_buffer_from_ptr(ptr, size); + buffer->buft = buft; + buffer->iface.free_buffer = ggml_backend_cpu_hbm_buffer_free_buffer; + + return buffer; +} + +ggml_backend_buffer_type_t ggml_backend_cpu_hbm_buffer_type(void) { + static struct ggml_backend_buffer_type ggml_backend_cpu_buffer_type_hbm = { + /* .iface = */ { + /* .get_name = */ ggml_backend_cpu_hbm_buffer_type_get_name, + /* .alloc_buffer = */ ggml_backend_cpu_hbm_buffer_type_alloc_buffer, + /* .get_alignment = */ ggml_backend_cpu_buffer_type_get_alignment, + /* .get_max_size = */ nullptr, // defaults to SIZE_MAX + /* .get_alloc_size = */ nullptr, // defaults to ggml_nbytes + /* .is_host = */ ggml_backend_cpu_buffer_type_is_host, + }, + /* .context = */ nullptr, + }; + + return &ggml_backend_cpu_buffer_type_hbm; +} +#endif diff --git a/ggml/src/ggml-cpu/ggml-cpu-hbm.h b/ggml/src/ggml-cpu/ggml-cpu-hbm.h new file mode 100644 index 0000000000000..09a1f09d72be2 --- /dev/null +++ b/ggml/src/ggml-cpu/ggml-cpu-hbm.h @@ -0,0 +1,8 @@ +#pragma once + +#include "ggml-backend.h" +#include "ggml.h" + +// GGML CPU internal header + +ggml_backend_buffer_type_t ggml_backend_cpu_hbm_buffer_type(void); diff --git a/ggml/src/ggml-cpu/ggml-cpu-traits.cpp b/ggml/src/ggml-cpu/ggml-cpu-traits.cpp new file mode 100644 index 0000000000000..62a0712dabbf6 --- /dev/null +++ b/ggml/src/ggml-cpu/ggml-cpu-traits.cpp @@ -0,0 +1,36 @@ +#include "ggml-cpu-traits.h" + +#include "ggml-backend-impl.h" +#include "ggml-backend.h" + +namespace ggml::cpu { +tensor_traits::~tensor_traits() {} + +extra_buffer_type::~extra_buffer_type() {} +} // namespace ggml::cpu + +bool ggml_cpu_extra_compute_forward(struct ggml_compute_params * params, struct ggml_tensor * op) { + for (auto extra : ggml_backend_cpu_get_extra_buffers_type()) { + if (extra && extra->context) { + auto buf_extra = (ggml::cpu::extra_buffer_type *) extra->context; + auto tensor_traits = buf_extra->get_tensor_traits(op); + if (tensor_traits && tensor_traits->compute_forward(params, op)) { + return true; + } + } + } + return false; +} + +bool ggml_cpu_extra_work_size(int n_threads, const struct ggml_tensor * op, size_t * size) { + for (auto extra : ggml_backend_cpu_get_extra_buffers_type()) { + if (extra && extra->context) { + auto buf_extra = (ggml::cpu::extra_buffer_type *) extra->context; + auto tensor_traits = buf_extra->get_tensor_traits(op); + if (tensor_traits && tensor_traits->work_size(n_threads, op, *size)) { + return true; + } + } + } + return false; +} diff --git a/ggml/src/ggml-cpu/ggml-cpu-traits.h b/ggml/src/ggml-cpu/ggml-cpu-traits.h new file mode 100644 index 0000000000000..99a6186b1d6b5 --- /dev/null +++ b/ggml/src/ggml-cpu/ggml-cpu-traits.h @@ -0,0 +1,38 @@ +#pragma once +#include "ggml-backend-impl.h" +#include "ggml-cpu-impl.h" +#include "ggml.h" + +#ifdef __cplusplus +# include +extern "C" { +#endif + +// return true if op part of extra "accelerator" +bool ggml_cpu_extra_compute_forward(struct ggml_compute_params * params, struct ggml_tensor * op); +bool ggml_cpu_extra_work_size(int n_threads, const struct ggml_tensor * op, size_t * size); + +#ifdef __cplusplus +} + +namespace ggml::cpu { +// register in tensor->extra +class tensor_traits { + public: + virtual ~tensor_traits(); + virtual bool work_size(int n_threads, const struct ggml_tensor * op, size_t & size) = 0; + virtual bool compute_forward(struct ggml_compute_params * params, struct ggml_tensor * op) = 0; +}; + +class extra_buffer_type { + public: + virtual ~extra_buffer_type(); + virtual bool supports_op(ggml_backend_dev_t dev, const struct ggml_tensor * op) = 0; + virtual tensor_traits * get_tensor_traits(const struct ggml_tensor * op) = 0; +}; +} // namespace ggml::cpu + +// implemented in ggml-cpu.cpp. +std::vector & ggml_backend_cpu_get_extra_buffers_type(); + +#endif diff --git a/ggml/src/ggml-cpu/ggml-cpu.c b/ggml/src/ggml-cpu/ggml-cpu.c index f12b62e3bd8b9..ea17d6077e7cf 100644 --- a/ggml/src/ggml-cpu/ggml-cpu.c +++ b/ggml/src/ggml-cpu/ggml-cpu.c @@ -3,7 +3,7 @@ #include "ggml-backend-impl.h" #include "ggml-backend.h" -#include "ggml-cpu-aarch64.h" +#include "ggml-cpu-traits.h" #include "ggml-cpu-impl.h" #include "ggml-cpu.h" #include "ggml-impl.h" @@ -224,10 +224,6 @@ typedef void * thread_ret_t; typedef pthread_t ggml_thread_t; -#ifdef GGML_USE_CPU_HBM -#include -#endif - #if defined(__APPLE__) #include #include @@ -301,7 +297,6 @@ static const struct ggml_type_traits_cpu type_traits_cpu[GGML_TYPE_COUNT] = { }, [GGML_TYPE_Q8_0] = { .from_float = quantize_row_q8_0, - .from_float_to_mat = quantize_mat_q8_0, .vec_dot = ggml_vec_dot_q8_0_q8_0, .vec_dot_type = GGML_TYPE_Q8_0, #if defined (__ARM_FEATURE_MATMUL_INT8) @@ -409,33 +404,6 @@ static const struct ggml_type_traits_cpu type_traits_cpu[GGML_TYPE_COUNT] = { .vec_dot_type = GGML_TYPE_BF16, .nrows = 1, }, - [GGML_TYPE_Q4_0_4_4] = { - .from_float = NULL, - .vec_dot = NULL, - .vec_dot_type = GGML_TYPE_Q8_0, - .nrows = 1, - .ncols = 4, - .gemv = ggml_gemv_q4_0_4x4_q8_0, - .gemm = ggml_gemm_q4_0_4x4_q8_0, - }, - [GGML_TYPE_Q4_0_4_8] = { - .from_float = NULL, - .vec_dot = NULL, - .vec_dot_type = GGML_TYPE_Q8_0, - .nrows = 1, - .ncols = 4, - .gemv = ggml_gemv_q4_0_4x8_q8_0, - .gemm = ggml_gemm_q4_0_4x8_q8_0, - }, - [GGML_TYPE_Q4_0_8_8] = { - .from_float = NULL, - .vec_dot = NULL, - .vec_dot_type = GGML_TYPE_Q8_0, - .nrows = 1, - .ncols = 8, - .gemv = ggml_gemv_q4_0_8x8_q8_0, - .gemm = ggml_gemm_q4_0_8x8_q8_0, - }, [GGML_TYPE_TQ1_0] = { .from_float = quantize_row_tq1_0, .vec_dot = ggml_vec_dot_tq1_0_q8_K, @@ -448,15 +416,6 @@ static const struct ggml_type_traits_cpu type_traits_cpu[GGML_TYPE_COUNT] = { .vec_dot_type = GGML_TYPE_Q8_K, .nrows = 1, }, - [GGML_TYPE_IQ4_NL_4_4] = { - .from_float = NULL, - .vec_dot = NULL, - .vec_dot_type = GGML_TYPE_Q8_0, - .nrows = 1, - .ncols = 4, - .gemv = ggml_gemv_iq4_nl_4x4_q8_0, - .gemm = ggml_gemm_iq4_nl_4x4_q8_0, - }, }; const struct ggml_type_traits_cpu * ggml_get_type_traits_cpu(enum ggml_type type) { @@ -4509,9 +4468,6 @@ static void ggml_compute_forward_add( case GGML_TYPE_IQ4_XS: case GGML_TYPE_IQ3_S: case GGML_TYPE_IQ2_S: - case GGML_TYPE_Q4_0_4_4: - case GGML_TYPE_Q4_0_4_8: - case GGML_TYPE_Q4_0_8_8: { ggml_compute_forward_add_q_f32(params, dst); } break; @@ -4889,9 +4845,6 @@ static void ggml_compute_forward_add1( case GGML_TYPE_IQ4_XS: case GGML_TYPE_IQ3_S: case GGML_TYPE_IQ2_S: - case GGML_TYPE_Q4_0_4_4: - case GGML_TYPE_Q4_0_4_8: - case GGML_TYPE_Q4_0_8_8: { ggml_compute_forward_add1_q_f32(params, dst); } break; @@ -5019,9 +4972,6 @@ static void ggml_compute_forward_acc( case GGML_TYPE_IQ4_XS: case GGML_TYPE_IQ3_S: case GGML_TYPE_IQ2_S: - case GGML_TYPE_Q4_0_4_4: - case GGML_TYPE_Q4_0_4_8: - case GGML_TYPE_Q4_0_8_8: default: { GGML_ABORT("fatal error"); @@ -7437,27 +7387,9 @@ static void ggml_compute_forward_mul_mat( const int ith = params->ith; const int nth = params->nth; - enum ggml_type type = src0->type; - - if (src0->buffer && ggml_backend_cpu_buft_is_aarch64(src0->buffer->buft)) { - type = (enum ggml_type)(intptr_t)src0->extra; - } - -#if defined(__AMX_INT8__) && defined(__AVX512VNNI__) - if (src0->buffer && ggml_backend_amx_buft_is_amx(src0->buffer->buft)) { - ggml_backend_amx_mul_mat(params, dst); - return; - } -#endif - - enum ggml_type const vec_dot_type = type_traits_cpu[type].vec_dot_type; + enum ggml_type const vec_dot_type = type_traits_cpu[src0->type].vec_dot_type; ggml_from_float_t const from_float = type_traits_cpu[vec_dot_type].from_float; - ggml_from_float_to_mat_t const from_float_to_mat = type_traits_cpu[vec_dot_type].from_float_to_mat; - int64_t const vec_dot_num_rows = type_traits_cpu[type].nrows; - int64_t const matmul_num_cols = type_traits_cpu[type].ncols; - int64_t const blck_size_interleave = ggml_get_type_traits(type)->blck_size_interleave; - ggml_gemv_t const gemv = type_traits_cpu[type].gemv; - ggml_gemm_t const gemm = type_traits_cpu[type].gemm; + int64_t const vec_dot_num_rows = type_traits_cpu[src0->type].nrows; GGML_ASSERT(ne0 == ne01); GGML_ASSERT(ne1 == ne11); @@ -7465,7 +7397,7 @@ static void ggml_compute_forward_mul_mat( GGML_ASSERT(ne3 == ne13); // we don't support permuted src0 or src1 - GGML_ASSERT(nb00 == ggml_type_size(type)); + GGML_ASSERT(nb00 == ggml_type_size(src0->type)); GGML_ASSERT(nb10 == ggml_type_size(src1->type)); // dst cannot be transposed or permuted @@ -7477,6 +7409,7 @@ static void ggml_compute_forward_mul_mat( // nb01 >= nb00 - src0 is not transposed // compute by src0 rows + // TODO: extract to "extra_op" #if GGML_USE_LLAMAFILE // broadcast factors const int64_t r2 = ne12 / ne02; @@ -7487,15 +7420,15 @@ static void ggml_compute_forward_mul_mat( if (src1_cont) { for (int64_t i13 = 0; i13 < ne13; i13++) for (int64_t i12 = 0; i12 < ne12; i12++) - if (!llamafile_sgemm(ne01, ne11, ne00/ggml_blck_size(type), + if (!llamafile_sgemm(ne01, ne11, ne00/ggml_blck_size(src0->type), (const char *)src0->data + i12/r2*nb02 + i13/r3*nb03, - nb01/ggml_type_size(type), + nb01/ggml_type_size(src0->type), (const char *)src1->data + i12*nb12 + i13*nb13, nb11/ggml_type_size(src1->type), (char *)dst->data + i12*nb2 + i13*nb3, nb1/ggml_type_size(dst->type), ith, nth, - type, + src0->type, src1->type, dst->type)) goto UseGgmlGemm1; @@ -7516,19 +7449,10 @@ UseGgmlGemm1:; for (int64_t i13 = 0; i13 < ne13; ++i13) { for (int64_t i12 = 0; i12 < ne12; ++i12) { - int64_t i11_processed = 0; - if ((ggml_n_dims(src1) == 2) && from_float_to_mat && gemm) { - for (int64_t i11 = ith * 4; i11 < ne11 - ne11 % 4; i11 += nth * 4) { - from_float_to_mat((float *)((char *) src1->data + i13*nb13 + i12*nb12 + i11*nb11), - (void *) (wdata + i13*nbw3 + i12*nbw2 + i11*nbw1), - 4, ne10, blck_size_interleave); - } - i11_processed = ne11 - ne11 % 4; - } - for (int64_t i11 = i11_processed + ith; i11 < ne11; i11 += nth) { + for (int64_t i11 = ith; i11 < ne11; i11 += nth) { from_float((float *)((char *) src1->data + i13*nb13 + i12*nb12 + i11*nb11), - (void *) (wdata + i13*nbw3 + i12*nbw2 + i11*nbw1), - ne10); + (void *) (wdata + i13*nbw3 + i12*nbw2 + i11*nbw1), + ne10); } } } @@ -7548,15 +7472,15 @@ UseGgmlGemm1:; for (int64_t i13 = 0; i13 < ne13; i13++) for (int64_t i12 = 0; i12 < ne12; i12++) - if (!llamafile_sgemm(ne01, ne11, ne00/ggml_blck_size(type), + if (!llamafile_sgemm(ne01, ne11, ne00/ggml_blck_size(src0->type), (const char *)src0->data + i12/r2*nb02 + i13/r3*nb03, - nb01/ggml_type_size(type), + nb01/ggml_type_size(src0->type), (const char *)wdata + (i12*ne11 + i13*ne12*ne11)*row_size, row_size/ggml_type_size(vec_dot_type), (char *)dst->data + i12*nb2 + i13*nb3, nb1/ggml_type_size(dst->type), ith, nth, - type, + src0->type, vec_dot_type, dst->type)) goto UseGgmlGemm2; @@ -7598,28 +7522,6 @@ UseGgmlGemm2:; const int64_t dr0 = (nr0 + nchunk0 - 1) / nchunk0; const int64_t dr1 = (nr1 + nchunk1 - 1) / nchunk1; - if ((ggml_n_dims(src0) == 2) && gemv) { - const void * src1_wdata = (src1->type == vec_dot_type) ? src1->data : params->wdata; - const size_t src1_col_stride = ggml_is_contiguous(src1) || src1->type != vec_dot_type ? ggml_row_size(vec_dot_type, ne10) : nb11; - int64_t src0_start = (ith * ne01) / nth; - int64_t src0_end = ((ith + 1) * ne01) / nth; - src0_start = (src0_start % matmul_num_cols) ? src0_start + matmul_num_cols - (src0_start % matmul_num_cols): src0_start; - src0_end = (src0_end % matmul_num_cols) ? src0_end + matmul_num_cols - (src0_end % matmul_num_cols): src0_end; - if (src0_start >= src0_end) return; - - // If there are more than three rows in src1, use gemm; otherwise, use gemv. - if (gemm && (ne11 > 3)) { - gemm(ne00, (float *)((char *) dst->data) + src0_start, ne01, (const char *) src0->data + src0_start * nb01, - (const char *) src1_wdata, ne11 - ne11 % 4, src0_end - src0_start); - } - for (int iter = gemm ? ne11 - ne11 % 4 : 0; iter < ne11; iter++) { - gemv(ne00, (float *)((char *) dst->data + (iter * nb1)) + src0_start, ne01, - (const char *) src0->data + src0_start * nb01, (const char *) src1_wdata + (src1_col_stride * iter), 1, - src0_end - src0_start); - } - return; - } - // The first chunk comes from our thread_id, the rest will get auto-assigned. int current_chunk = ith; @@ -7642,7 +7544,7 @@ UseGgmlGemm2:; num_rows_per_vec_dot = 1; } - ggml_compute_forward_mul_mat_one_chunk(params, dst, type, num_rows_per_vec_dot, ir0_start, ir0_end, ir1_start, ir1_end); + ggml_compute_forward_mul_mat_one_chunk(params, dst, src0->type, num_rows_per_vec_dot, ir0_start, ir0_end, ir1_start, ir1_end); if (nth >= nchunk0 * nchunk1) { break; @@ -7674,8 +7576,6 @@ static void ggml_compute_forward_mul_mat_id( ggml_vec_dot_t const vec_dot = type_traits_cpu[type].vec_dot; enum ggml_type const vec_dot_type = type_traits_cpu[type].vec_dot_type; ggml_from_float_t const from_float = type_traits_cpu[vec_dot_type].from_float; - int64_t const matmul_num_cols = type_traits_cpu[type].ncols; - ggml_gemv_t const gemv = type_traits_cpu[type].gemv; // we don't support permuted src0 or src1 GGML_ASSERT(nb00 == ggml_type_size(type)); @@ -7761,34 +7661,6 @@ static void ggml_compute_forward_mul_mat_id( const int64_t nr0 = ne01; // src0 rows const int64_t nr1 = cne1; // src1 rows - if (((ggml_n_dims(src0) - 1) == 2) && gemv) { - int64_t src0_cur_start = (ith * ne01) / nth; - int64_t src0_cur_end = ((ith + 1) * ne01) / nth; - src0_cur_start = (src0_cur_start % matmul_num_cols) ? src0_cur_start + matmul_num_cols - (src0_cur_start % matmul_num_cols): src0_cur_start; - src0_cur_end = (src0_cur_end % matmul_num_cols) ? src0_cur_end + matmul_num_cols - (src0_cur_end % matmul_num_cols): src0_cur_end; - if (src0_cur_start >= src0_cur_end) return; - - for (int ir1 = 0; ir1 < nr1; ir1++) { - struct mmid_row_mapping row_mapping = MMID_MATRIX_ROW(cur_a, ir1); - const int id = row_mapping.i1; // selected expert index - - const int64_t i11 = id % ne11; - const int64_t i12 = row_mapping.i2; // row index in src1 - - const int64_t i1 = id; // selected expert index - const int64_t i2 = i12; // row - - const char * src1_col = (const char *) wdata + - (src1_cont || src1->type != vec_dot_type - ? (i11 + i12 * ne11) * row_size - : (i11 * nb11 + i12 * nb12)); - - gemv(ne00, (float *)((char *) dst->data + (i1 * nb1 + i2 * nb2)) + src0_cur_start, ne01, - (const char *) src0_cur + src0_cur_start * nb01, src1_col, 1, src0_cur_end - src0_cur_start); - } - continue; - } - // distribute the thread work across the inner or outer loop based on which one is larger const int64_t nth0 = nr0 > nr1 ? nth : 1; // parallelize by src0 rows @@ -8096,9 +7968,6 @@ static void ggml_compute_forward_out_prod( case GGML_TYPE_IQ4_XS: case GGML_TYPE_IQ3_S: case GGML_TYPE_IQ2_S: - case GGML_TYPE_Q4_0_4_4: - case GGML_TYPE_Q4_0_4_8: - case GGML_TYPE_Q4_0_8_8: { ggml_compute_forward_out_prod_q_f32(params, dst); } break; @@ -8361,9 +8230,6 @@ static void ggml_compute_forward_set( case GGML_TYPE_IQ4_XS: case GGML_TYPE_IQ3_S: case GGML_TYPE_IQ2_S: - case GGML_TYPE_Q4_0_4_4: - case GGML_TYPE_Q4_0_4_8: - case GGML_TYPE_Q4_0_8_8: default: { GGML_ABORT("fatal error"); @@ -8625,9 +8491,6 @@ static void ggml_compute_forward_get_rows( case GGML_TYPE_IQ4_XS: case GGML_TYPE_IQ3_S: case GGML_TYPE_IQ2_S: - case GGML_TYPE_Q4_0_4_4: - case GGML_TYPE_Q4_0_4_8: - case GGML_TYPE_Q4_0_8_8: { ggml_compute_forward_get_rows_q(params, dst); } break; @@ -9217,10 +9080,6 @@ static void ggml_compute_forward_clamp( case GGML_TYPE_IQ3_S: case GGML_TYPE_IQ2_S: case GGML_TYPE_Q8_K: - case GGML_TYPE_Q4_0_4_4: - case GGML_TYPE_Q4_0_4_8: - case GGML_TYPE_Q4_0_8_8: - case GGML_TYPE_IQ4_NL_4_4: case GGML_TYPE_I8: case GGML_TYPE_I16: case GGML_TYPE_I32: @@ -12426,6 +12285,9 @@ static void ggml_compute_forward(struct ggml_compute_params * params, struct ggm return; } + // extra_buffer op? + if (ggml_cpu_extra_compute_forward(params, tensor)) return; + switch (tensor->op) { case GGML_OP_DUP: { @@ -13373,146 +13235,142 @@ struct ggml_cplan ggml_graph_plan( size_t cur = 0; - switch (node->op) { - case GGML_OP_CPY: - case GGML_OP_DUP: - { - if (ggml_is_quantized(node->type) || - // F16 -> BF16 and BF16 -> F16 copies go through intermediate F32 - (node->src[0]->type == GGML_TYPE_F16 && node->src[1] && node->src[1]->type == GGML_TYPE_BF16) || - (node->src[0]->type == GGML_TYPE_BF16 && node->src[1] && node->src[1]->type == GGML_TYPE_F16)) { + if (!ggml_cpu_extra_work_size(n_threads, node, &cur)) { + + switch (node->op) { + case GGML_OP_CPY: + case GGML_OP_DUP: + { + if (ggml_is_quantized(node->type) || + // F16 -> BF16 and BF16 -> F16 copies go through intermediate F32 + (node->src[0]->type == GGML_TYPE_F16 && node->src[1] && node->src[1]->type == GGML_TYPE_BF16) || + (node->src[0]->type == GGML_TYPE_BF16 && node->src[1] && node->src[1]->type == GGML_TYPE_F16)) { + cur = ggml_type_size(GGML_TYPE_F32) * node->ne[0] * n_tasks; + } + } break; + case GGML_OP_ADD: + case GGML_OP_ADD1: + { + if (ggml_is_quantized(node->src[0]->type)) { + cur = ggml_type_size(GGML_TYPE_F32) * node->src[0]->ne[0] * n_tasks; + } + } break; + case GGML_OP_ACC: + { + if (ggml_is_quantized(node->src[0]->type)) { + cur = ggml_type_size(GGML_TYPE_F32) * node->src[1]->ne[0] * n_tasks; + } + } break; + case GGML_OP_COUNT_EQUAL: + { + cur = ggml_type_size(node->type)*n_tasks; + } break; + case GGML_OP_MUL_MAT: + { + const enum ggml_type vec_dot_type = type_traits_cpu[node->src[0]->type].vec_dot_type; + + if (node->src[1]->type != vec_dot_type) { + cur = ggml_row_size(vec_dot_type, ggml_nelements(node->src[1])); + } + } break; + case GGML_OP_MUL_MAT_ID: + { + cur = 0; + const struct ggml_tensor * src0 = node->src[0]; + const struct ggml_tensor * src1 = node->src[1]; + const enum ggml_type vec_dot_type = type_traits_cpu[src0->type].vec_dot_type; + if (src1->type != vec_dot_type) { + cur += ggml_row_size(vec_dot_type, ggml_nelements(src1)); + } + const int n_as = src0->ne[2]; + cur += GGML_PAD(cur, sizeof(int64_t)); // align + cur += n_as * sizeof(int64_t); // matrix_row_counts + cur += n_as * src1->ne[2] * sizeof(int64_t); // matrix_rows + } break; + case GGML_OP_OUT_PROD: + { + if (ggml_is_quantized(node->src[0]->type)) { + cur = ggml_type_size(GGML_TYPE_F32) * node->src[0]->ne[0] * n_tasks; + } + } break; + case GGML_OP_SOFT_MAX: + case GGML_OP_ROPE: + { cur = ggml_type_size(GGML_TYPE_F32) * node->ne[0] * n_tasks; - } - } break; - case GGML_OP_ADD: - case GGML_OP_ADD1: - { - if (ggml_is_quantized(node->src[0]->type)) { - cur = ggml_type_size(GGML_TYPE_F32) * node->src[0]->ne[0] * n_tasks; - } - } break; - case GGML_OP_ACC: - { - if (ggml_is_quantized(node->src[0]->type)) { - cur = ggml_type_size(GGML_TYPE_F32) * node->src[1]->ne[0] * n_tasks; - } - } break; - case GGML_OP_COUNT_EQUAL: - { - cur = ggml_type_size(node->type)*n_tasks; - } break; - case GGML_OP_MUL_MAT: - { -#if defined(__AMX_INT8__) && defined(__AVX512VNNI__) - if (node->src[0]->buffer && ggml_backend_amx_buft_is_amx(node->src[0]->buffer->buft)) { - cur = ggml_backend_amx_desired_wsize(node); - } -#endif - const enum ggml_type vec_dot_type = type_traits_cpu[node->src[0]->type].vec_dot_type; + } break; + case GGML_OP_CONV_TRANSPOSE_1D: + { + GGML_ASSERT(node->src[0]->ne[3] == 1); + GGML_ASSERT(node->src[1]->ne[2] == 1); + GGML_ASSERT(node->src[1]->ne[3] == 1); + + const int64_t ne00 = node->src[0]->ne[0]; // K + const int64_t ne01 = node->src[0]->ne[1]; // Cout + const int64_t ne02 = node->src[0]->ne[2]; // Cin + const int64_t ne10 = node->src[1]->ne[0]; // L + const int64_t ne11 = node->src[1]->ne[1]; // Cin + + if ((node->src[0]->type == GGML_TYPE_F16 || + node->src[0]->type == GGML_TYPE_BF16) && + node->src[1]->type == GGML_TYPE_F32) { + cur += sizeof(ggml_fp16_t)*ne00*ne01*ne02; + cur += sizeof(ggml_fp16_t)*ne10*ne11; + } else if (node->src[0]->type == GGML_TYPE_F32 && + node->src[1]->type == GGML_TYPE_F32) { + cur += sizeof(float)*ne00*ne01*ne02; + cur += sizeof(float)*ne10*ne11; + } else { + GGML_ABORT("fatal error"); + } + } break; + case GGML_OP_CONV_TRANSPOSE_2D: + { + const int64_t ne00 = node->src[0]->ne[0]; // W + const int64_t ne01 = node->src[0]->ne[1]; // H + const int64_t ne02 = node->src[0]->ne[2]; // Channels Out + const int64_t ne03 = node->src[0]->ne[3]; // Channels In - if (node->src[1]->type != vec_dot_type) { - size_t cur2 = ggml_row_size(vec_dot_type, ggml_nelements(node->src[1])); - cur = MAX(cur, cur2); - } - } break; - case GGML_OP_MUL_MAT_ID: - { - cur = 0; - const struct ggml_tensor * src0 = node->src[0]; - const struct ggml_tensor * src1 = node->src[1]; - const enum ggml_type vec_dot_type = type_traits_cpu[src0->type].vec_dot_type; - if (src1->type != vec_dot_type) { - cur += ggml_row_size(vec_dot_type, ggml_nelements(src1)); - } - const int n_as = src0->ne[2]; - cur += GGML_PAD(cur, sizeof(int64_t)); // align - cur += n_as * sizeof(int64_t); // matrix_row_counts - cur += n_as * src1->ne[2] * sizeof(int64_t); // matrix_rows - } break; - case GGML_OP_OUT_PROD: - { - if (ggml_is_quantized(node->src[0]->type)) { - cur = ggml_type_size(GGML_TYPE_F32) * node->src[0]->ne[0] * n_tasks; - } - } break; - case GGML_OP_SOFT_MAX: - case GGML_OP_ROPE: - { - cur = ggml_type_size(GGML_TYPE_F32) * node->ne[0] * n_tasks; - } break; - case GGML_OP_CONV_TRANSPOSE_1D: - { - GGML_ASSERT(node->src[0]->ne[3] == 1); - GGML_ASSERT(node->src[1]->ne[2] == 1); - GGML_ASSERT(node->src[1]->ne[3] == 1); - - const int64_t ne00 = node->src[0]->ne[0]; // K - const int64_t ne01 = node->src[0]->ne[1]; // Cout - const int64_t ne02 = node->src[0]->ne[2]; // Cin - - const int64_t ne10 = node->src[1]->ne[0]; // L - const int64_t ne11 = node->src[1]->ne[1]; // Cin - - if ((node->src[0]->type == GGML_TYPE_F16 || - node->src[0]->type == GGML_TYPE_BF16) && - node->src[1]->type == GGML_TYPE_F32) { - cur += sizeof(ggml_fp16_t)*ne00*ne01*ne02; - cur += sizeof(ggml_fp16_t)*ne10*ne11; - } else if (node->src[0]->type == GGML_TYPE_F32 && - node->src[1]->type == GGML_TYPE_F32) { - cur += sizeof(float)*ne00*ne01*ne02; - cur += sizeof(float)*ne10*ne11; - } else { - GGML_ABORT("fatal error"); - } - } break; - case GGML_OP_CONV_TRANSPOSE_2D: - { - const int64_t ne00 = node->src[0]->ne[0]; // W - const int64_t ne01 = node->src[0]->ne[1]; // H - const int64_t ne02 = node->src[0]->ne[2]; // Channels Out - const int64_t ne03 = node->src[0]->ne[3]; // Channels In - - const int64_t ne10 = node->src[1]->ne[0]; // W - const int64_t ne11 = node->src[1]->ne[1]; // H - const int64_t ne12 = node->src[1]->ne[2]; // Channels In - - cur += sizeof(ggml_fp16_t)*ne00*ne01*ne02*ne03; - cur += sizeof(ggml_fp16_t)*ne10*ne11*ne12; - } break; - case GGML_OP_FLASH_ATTN_EXT: - { - const int64_t ne00 = node->src[0]->ne[0]; // D + const int64_t ne10 = node->src[1]->ne[0]; // W + const int64_t ne11 = node->src[1]->ne[1]; // H + const int64_t ne12 = node->src[1]->ne[2]; // Channels In - cur = 3*sizeof(float)*ne00*n_tasks; // 3x head size/thread - } break; - case GGML_OP_FLASH_ATTN_BACK: - { - const int64_t D = node->src[0]->ne[0]; - const int64_t ne11 = ggml_up(node->src[1]->ne[1], GGML_SOFT_MAX_UNROLL); - const int64_t mxDn = MAX(D, ne11) * 2; // *2 because of S and SM in ggml_compute_forward_flash_attn_back - if (node->src[1]->type == GGML_TYPE_F32) { - cur = sizeof(float)*mxDn*n_tasks; // TODO: this can become (n_tasks-1) - cur += sizeof(float)*mxDn*n_tasks; // this is overestimated by x2 - } else if (node->src[1]->type == GGML_TYPE_F16) { - cur = sizeof(float)*mxDn*n_tasks; // TODO: this can become (n_tasks-1) - cur += sizeof(float)*mxDn*n_tasks; // this is overestimated by x2 - } else if (node->src[1]->type == GGML_TYPE_BF16) { - cur = sizeof(float)*mxDn*n_tasks; // TODO: this can become (n_tasks-1) - cur += sizeof(float)*mxDn*n_tasks; // this is overestimated by x2 - } - } break; + cur += sizeof(ggml_fp16_t)*ne00*ne01*ne02*ne03; + cur += sizeof(ggml_fp16_t)*ne10*ne11*ne12; + } break; + case GGML_OP_FLASH_ATTN_EXT: + { + const int64_t ne00 = node->src[0]->ne[0]; // D - case GGML_OP_CROSS_ENTROPY_LOSS: - { - cur = ggml_type_size(node->type)*(n_tasks + node->src[0]->ne[0]*n_tasks); - } break; - case GGML_OP_COUNT: - { - GGML_ABORT("fatal error"); - } - default: - break; + cur = 3*sizeof(float)*ne00*n_tasks; // 3x head size/thread + } break; + case GGML_OP_FLASH_ATTN_BACK: + { + const int64_t D = node->src[0]->ne[0]; + const int64_t ne11 = ggml_up(node->src[1]->ne[1], GGML_SOFT_MAX_UNROLL); + const int64_t mxDn = MAX(D, ne11) * 2; // *2 because of S and SM in ggml_compute_forward_flash_attn_back + if (node->src[1]->type == GGML_TYPE_F32) { + cur = sizeof(float)*mxDn*n_tasks; // TODO: this can become (n_tasks-1) + cur += sizeof(float)*mxDn*n_tasks; // this is overestimated by x2 + } else if (node->src[1]->type == GGML_TYPE_F16) { + cur = sizeof(float)*mxDn*n_tasks; // TODO: this can become (n_tasks-1) + cur += sizeof(float)*mxDn*n_tasks; // this is overestimated by x2 + } else if (node->src[1]->type == GGML_TYPE_BF16) { + cur = sizeof(float)*mxDn*n_tasks; // TODO: this can become (n_tasks-1) + cur += sizeof(float)*mxDn*n_tasks; // this is overestimated by x2 + } + } break; + + case GGML_OP_CROSS_ENTROPY_LOSS: + { + cur = ggml_type_size(node->type)*(n_tasks + node->src[0]->ne[0]*n_tasks); + } break; + case GGML_OP_COUNT: + { + GGML_ABORT("fatal error"); + } + default: + break; + } } work_size = MAX(work_size, cur); diff --git a/ggml/src/ggml-cpu/ggml-cpu.cpp b/ggml/src/ggml-cpu/ggml-cpu.cpp index d3b4bdb965092..c390957afa8e3 100644 --- a/ggml/src/ggml-cpu/ggml-cpu.cpp +++ b/ggml/src/ggml-cpu/ggml-cpu.cpp @@ -2,12 +2,18 @@ #include "ggml-backend-impl.h" #include "ggml-cpu.h" #include "ggml-cpu-aarch64.h" +#include "ggml-cpu-traits.h" #include "ggml-impl.h" #include "amx/amx.h" + #include #include #include +#ifdef GGML_USE_CPU_HBM +#include "ggml-cpu-hbm.h" +#endif + #if defined(__APPLE__) #include #include @@ -23,115 +29,7 @@ // ggml-backend interface -#ifdef GGML_USE_CPU_HBM - -// buffer type HBM - -#include - -static const char * ggml_backend_cpu_hbm_buffer_type_get_name(ggml_backend_buffer_type_t buft) { - return "CPU_HBM"; - - GGML_UNUSED(buft); -} - -static void ggml_backend_cpu_hbm_buffer_free_buffer(ggml_backend_buffer_t buffer) { - hbw_free(buffer->context); -} - -static ggml_backend_buffer_t ggml_backend_cpu_hbm_buffer_type_alloc_buffer(ggml_backend_buffer_type_t buft, size_t size) { - void * ptr; - int result = hbw_posix_memalign(&ptr, ggml_backend_cpu_buffer_type_get_alignment(buft), size); - if (result != 0) { - GGML_LOG_ERROR("failed to allocate HBM buffer of size %zu\n", size); - return NULL; - } - - ggml_backend_buffer_t buffer = ggml_backend_cpu_buffer_from_ptr(ptr, size); - buffer->buft = buft; - buffer->iface.free_buffer = ggml_backend_cpu_hbm_buffer_free_buffer; - - return buffer; -} - -ggml_backend_buffer_type_t ggml_backend_cpu_hbm_buffer_type(void) { - static struct ggml_backend_buffer_type ggml_backend_cpu_buffer_type_hbm = { - /* .iface = */ { - /* .get_name = */ ggml_backend_cpu_hbm_buffer_type_get_name, - /* .alloc_buffer = */ ggml_backend_cpu_hbm_buffer_type_alloc_buffer, - /* .get_alignment = */ ggml_backend_cpu_buffer_type_get_alignment, - /* .get_max_size = */ NULL, // defaults to SIZE_MAX - /* .get_alloc_size = */ NULL, // defaults to ggml_nbytes - /* .is_host = */ ggml_backend_cpu_buffer_type_is_host, - }, - /* .context = */ NULL, - }; - - return &ggml_backend_cpu_buffer_type_hbm; -} -#endif - -// buffer type AARCH64 - -static void ggml_backend_cpu_aarch64_buffer_init_tensor(ggml_backend_buffer_t buffer, struct ggml_tensor * tensor) { - tensor->extra = (void *)ggml_aarch64_get_optimal_repack_type(tensor); // NOLINT - - GGML_UNUSED(buffer); -} - -static void ggml_backend_cpu_aarch64_buffer_set_tensor(ggml_backend_buffer_t buffer, struct ggml_tensor * tensor, const void * data, size_t offset, size_t size) { - GGML_ASSERT(offset == 0); - GGML_ASSERT(size == ggml_nbytes(tensor)); - - enum ggml_type repack_type = (enum ggml_type)(intptr_t)tensor->extra; - - ggml_aarch64_repack_tensor(tensor, repack_type, data, size); - - GGML_UNUSED(buffer); -} - -static const char * ggml_backend_cpu_aarch64_buffer_type_get_name(ggml_backend_buffer_type_t buft) { - return "CPU_AARCH64"; - - GGML_UNUSED(buft); -} - -static ggml_backend_buffer_t ggml_backend_cpu_aarch64_buffer_type_alloc_buffer(ggml_backend_buffer_type_t buft, size_t size) { - auto * buffer = ggml_backend_buft_alloc_buffer(ggml_backend_cpu_buffer_type(), size); - - if (buffer == NULL) { - return NULL; - } - - buffer->buft = buft; - buffer->iface.init_tensor = ggml_backend_cpu_aarch64_buffer_init_tensor; - buffer->iface.set_tensor = ggml_backend_cpu_aarch64_buffer_set_tensor; - - return buffer; -} - -ggml_backend_buffer_type_t ggml_backend_cpu_aarch64_buffer_type(void) { - static struct ggml_backend_buffer_type ggml_backend_cpu_buffer_type_aarch64 = { - /* .iface = */ { - /* .get_name = */ ggml_backend_cpu_aarch64_buffer_type_get_name, - /* .alloc_buffer = */ ggml_backend_cpu_aarch64_buffer_type_alloc_buffer, - /* .get_alignment = */ ggml_backend_cpu_buffer_type()->iface.get_alignment, - /* .get_max_size = */ NULL, // defaults to SIZE_MAX - /* .get_alloc_size = */ NULL, // defaults to ggml_nbytes - /* .is_host = */ NULL, - }, - /* .device = */ ggml_backend_reg_dev_get(ggml_backend_cpu_reg(), 0), - /* .context = */ NULL, - }; - - return &ggml_backend_cpu_buffer_type_aarch64; -} - -bool ggml_backend_cpu_buft_is_aarch64(ggml_backend_buffer_type_t buft) { - return buft == ggml_backend_cpu_aarch64_buffer_type(); -} - -static ggml_backend_buffer_type_t * ggml_backend_cpu_get_extra_bufts(ggml_backend_dev_t device) { +std::vector& ggml_backend_cpu_get_extra_buffers_type() { static std::vector bufts = []() { std::vector bufts; @@ -152,11 +50,22 @@ static ggml_backend_buffer_type_t * ggml_backend_cpu_get_extra_bufts(ggml_backen return bufts; }(); - return bufts.data(); + return bufts; +} + +static ggml_backend_buffer_type_t * ggml_backend_cpu_device_get_extra_buffers_type(ggml_backend_dev_t device) { + return ggml_backend_cpu_get_extra_buffers_type().data(); GGML_UNUSED(device); } +static bool ggml_backend_cpu_is_extra_buffer_type(ggml_backend_buffer_type_t buft) { + for (auto extra : ggml_backend_cpu_get_extra_buffers_type()) { + if (extra && extra == buft) return true; + } + return false; +} + // CPU backend - backend (stream) struct ggml_backend_cpu_context { @@ -465,25 +374,19 @@ static bool ggml_backend_cpu_device_supports_op(ggml_backend_dev_t dev, const st return true; } - if (src0 && src0->buffer && ggml_backend_cpu_buft_is_aarch64(src0->buffer->buft)) { - if (op->op != GGML_OP_MUL_MAT || src0->type == ggml_aarch64_get_optimal_repack_type(src0)) { - return false; + // extra_buffer_op? + for (auto extra : ggml_backend_cpu_get_extra_buffers_type()) { + if (extra) { + auto buf_extra = (ggml::cpu::extra_buffer_type*) extra->context; + if (buf_extra && buf_extra->supports_op(dev, op)) { + return true; + } } } -#if defined(__AMX_INT8__) && defined(__AVX512VNNI__) - if (src0 && src0->buffer && ggml_backend_amx_buft_is_amx(src0->buffer->buft)) { - return ggml_backend_amx_device_supports_op(op); - } - for (int i = 1; i < GGML_MAX_SRC; i++) { - if (op->src[i] && op->src[i]->buffer && ggml_backend_amx_buft_is_amx(op->src[i]->buffer->buft)) { - return false; - } - } -#endif - - for (int i = 1; i < GGML_MAX_SRC; i++) { - if (op->src[i] && op->src[i]->buffer && ggml_backend_cpu_buft_is_aarch64(op->src[i]->buffer->buft)) { + // the other case need host buffer. + for (int i = 0; i < GGML_MAX_SRC; i++) { + if (op->src[i] && op->src[i]->buffer && !ggml_backend_buft_is_host(op->src[i]->buffer->buft)) { return false; } } @@ -506,19 +409,10 @@ static bool ggml_backend_cpu_device_supports_op(ggml_backend_dev_t dev, const st default: return true; } - - GGML_UNUSED(dev); } static bool ggml_backend_cpu_device_supports_buft(ggml_backend_dev_t dev, ggml_backend_buffer_type_t buft) { - bool supported = ggml_backend_buft_is_host(buft) || ggml_backend_cpu_buft_is_aarch64(buft); - -#if defined(__AMX_INT8__) && defined(__AVX512VNNI__) - supported = supported || ggml_backend_amx_buft_is_amx(buft); -#endif - - return supported; - + return ggml_backend_buft_is_host(buft) || ggml_backend_cpu_is_extra_buffer_type(buft); GGML_UNUSED(dev); } @@ -666,10 +560,12 @@ static ggml_backend_feature * ggml_backend_cpu_get_features(ggml_backend_reg_t r static void * ggml_backend_cpu_get_proc_address(ggml_backend_reg_t reg, const char * name) { if (strcmp(name, "ggml_backend_set_n_threads") == 0) { - return (void *)ggml_backend_cpu_set_n_threads; + ggml_backend_set_n_threads_t fct = ggml_backend_cpu_set_n_threads; + return (void *)fct; } if (strcmp(name, "ggml_backend_dev_get_extra_bufts") == 0) { - return (void *)ggml_backend_cpu_get_extra_bufts; + ggml_backend_dev_get_extra_bufts_t fct = ggml_backend_cpu_device_get_extra_buffers_type; + return (void *)fct; } if (strcmp(name, "ggml_backend_get_features") == 0) { return (void *)ggml_backend_cpu_get_features; diff --git a/ggml/src/ggml-cuda/ggml-cuda.cu b/ggml/src/ggml-cuda/ggml-cuda.cu index d6e4bfdd0d437..15fcb2a65cbcf 100644 --- a/ggml/src/ggml-cuda/ggml-cuda.cu +++ b/ggml/src/ggml-cuda/ggml-cuda.cu @@ -3210,7 +3210,7 @@ static void * ggml_backend_cuda_reg_get_proc_address(ggml_backend_reg_t reg, con static const ggml_backend_reg_i ggml_backend_cuda_reg_interface = { /* .get_name = */ ggml_backend_cuda_reg_get_name, /* .get_device_count = */ ggml_backend_cuda_reg_get_device_count, - /* .get_device_get = */ ggml_backend_cuda_reg_get_device, + /* .get_device = */ ggml_backend_cuda_reg_get_device, /* .get_proc_address = */ ggml_backend_cuda_reg_get_proc_address, }; diff --git a/ggml/src/ggml-quants.c b/ggml/src/ggml-quants.c index 7301a9c6caab8..7918388ae9f2e 100644 --- a/ggml/src/ggml-quants.c +++ b/ggml/src/ggml-quants.c @@ -5220,15 +5220,6 @@ bool ggml_validate_row_data(enum ggml_type type, const void * data, size_t nbyte { VALIDATE_ROW_DATA_D_F16_IMPL(block_iq4_nl, data, nb); } break; - case GGML_TYPE_Q4_0_4_4: - case GGML_TYPE_Q4_0_4_8: - { - VALIDATE_ROW_DATA_DVEC_F16_IMPL(block_q4_0x4, data, nbytes / sizeof(block_q4_0x4), 4); - } break; - case GGML_TYPE_Q4_0_8_8: - { - VALIDATE_ROW_DATA_DVEC_F16_IMPL(block_q4_0x8, data, nbytes / sizeof(block_q4_0x8), 8); - } break; case GGML_TYPE_I8: case GGML_TYPE_I16: diff --git a/ggml/src/ggml-sycl/ggml-sycl.cpp b/ggml/src/ggml-sycl/ggml-sycl.cpp index 46be0181c179f..eb7707bd6d6a2 100644 --- a/ggml/src/ggml-sycl/ggml-sycl.cpp +++ b/ggml/src/ggml-sycl/ggml-sycl.cpp @@ -4536,7 +4536,7 @@ static void *ggml_backend_sycl_reg_get_proc_address(ggml_backend_reg_t reg, cons static const ggml_backend_reg_i ggml_backend_sycl_reg_interface = { /* .get_name = */ ggml_backend_sycl_reg_get_name, /* .get_device_count = */ ggml_backend_sycl_reg_get_device_count, - /* .get_device_get = */ ggml_backend_sycl_reg_get_device, + /* .get_device = */ ggml_backend_sycl_reg_get_device, /* .get_proc_address = */ ggml_backend_sycl_reg_get_proc_address, }; diff --git a/ggml/src/ggml.c b/ggml/src/ggml.c index ba65160c0eec3..7948e752ec768 100644 --- a/ggml/src/ggml.c +++ b/ggml/src/ggml.c @@ -8,7 +8,10 @@ // FIXME: required here for quantization functions #include "ggml-quants.h" -#include "ggml-aarch64.h" + +#ifdef GGML_USE_CPU_HBM +#include +#endif #if defined(_MSC_VER) || defined(__MINGW32__) #include // using malloc.h with MSC/MINGW @@ -788,32 +791,23 @@ static const struct ggml_type_traits type_traits[GGML_TYPE_COUNT] = { .to_float = (ggml_to_float_t) ggml_bf16_to_fp32_row, .from_float_ref = (ggml_from_float_t) ggml_fp32_to_bf16_row_ref, }, - [GGML_TYPE_Q4_0_4_4] = { - .type_name = "q4_0_4x4", - .blck_size = QK4_0, - .blck_size_interleave = 4, - .type_size = sizeof(block_q4_0), - .is_quantized = true, - .to_float = NULL, - .from_float_ref = NULL, + [31] = { // GGML_TYPE_Q4_0_4_4 + .type_name = "TYPE_Q4_0_4_4 REMOVED, use Q4_0 with runtime repacking", + .blck_size = 0, + .type_size = 0, + .is_quantized = false, }, - [GGML_TYPE_Q4_0_4_8] = { - .type_name = "q4_0_4x8", - .blck_size = QK4_0, - .blck_size_interleave = 8, - .type_size = sizeof(block_q4_0), - .is_quantized = true, - .to_float = NULL, - .from_float_ref = NULL, + [32] = { // GGML_TYPE_Q4_0_4_8 + .type_name = "TYPE_Q4_0_4_8 REMOVED, use Q4_0 with runtime repacking", + .blck_size = 0, + .type_size = 0, + .is_quantized = false, }, - [GGML_TYPE_Q4_0_8_8] = { - .type_name = "q4_0_8x8", - .blck_size = QK4_0, - .blck_size_interleave = 8, - .type_size = sizeof(block_q4_0), - .is_quantized = true, - .to_float = NULL, - .from_float_ref = NULL, + [33] = { // GGML_TYPE_Q4_0_8_8 + .type_name = "TYPE_Q4_0_8_8 REMOVED, use Q4_0 with runtime repacking", + .blck_size = 0, + .type_size = 0, + .is_quantized = false, }, [GGML_TYPE_TQ1_0] = { .type_name = "tq1_0", @@ -831,14 +825,23 @@ static const struct ggml_type_traits type_traits[GGML_TYPE_COUNT] = { .to_float = (ggml_to_float_t) dequantize_row_tq2_0, .from_float_ref = (ggml_from_float_t) quantize_row_tq2_0_ref, }, - [GGML_TYPE_IQ4_NL_4_4] = { - .type_name = "iq4_nl_4x4", - .blck_size = QK4_NL, - .blck_size_interleave = 4, - .type_size = sizeof(block_iq4_nl), - .is_quantized = true, - .to_float = NULL, - .from_float_ref = NULL, + [36] = { // GGML_TYPE_IQ4_NL_4_4 + .type_name = "TYPE_IQ4_NL_4_4 REMOVED, use IQ4_NL with runtime repacking", + .blck_size = 0, + .type_size = 0, + .is_quantized = false, + }, + [37] = { // GGML_TYPE_IQ4_NL_4_8 + .type_name = "TYPE_IQ4_NL_4_8 REMOVED, use IQ4_NL with runtime repacking", + .blck_size = 0, + .type_size = 0, + .is_quantized = false, + }, + [38] = { // GGML_TYPE_IQ4_NL_8_8 + .type_name = "TYPE_IQ4_NL_8_8 REMOVED, use IQ4_NL with runtime repacking", + .blck_size = 0, + .type_size = 0, + .is_quantized = false, }, }; @@ -1270,9 +1273,6 @@ enum ggml_type ggml_ftype_to_ggml_type(enum ggml_ftype ftype) { case GGML_FTYPE_MOSTLY_IQ4_XS: wtype = GGML_TYPE_IQ4_XS; break; case GGML_FTYPE_MOSTLY_IQ3_S: wtype = GGML_TYPE_IQ3_S; break; case GGML_FTYPE_MOSTLY_IQ2_S: wtype = GGML_TYPE_IQ2_S; break; - case GGML_FTYPE_MOSTLY_Q4_0_4_4: wtype = GGML_TYPE_Q4_0_4_4; break; - case GGML_FTYPE_MOSTLY_Q4_0_4_8: wtype = GGML_TYPE_Q4_0_4_8; break; - case GGML_FTYPE_MOSTLY_Q4_0_8_8: wtype = GGML_TYPE_Q4_0_8_8; break; case GGML_FTYPE_UNKNOWN: wtype = GGML_TYPE_COUNT; break; case GGML_FTYPE_MOSTLY_Q4_1_SOME_F16: wtype = GGML_TYPE_COUNT; break; } @@ -6304,9 +6304,6 @@ size_t ggml_quantize_chunk( case GGML_TYPE_IQ1_M: result = quantize_iq1_m (src + start, (char *) dst + start_row * row_size, nrows, n_per_row, imatrix); break; case GGML_TYPE_IQ4_NL: result = quantize_iq4_nl (src + start, (char *) dst + start_row * row_size, nrows, n_per_row, imatrix); break; case GGML_TYPE_IQ4_XS: result = quantize_iq4_xs (src + start, (char *) dst + start_row * row_size, nrows, n_per_row, imatrix); break; - case GGML_TYPE_Q4_0_4_4: result = quantize_q4_0_4x4(src + start, (char *) dst + start_row * row_size, nrows, n_per_row, imatrix); break; - case GGML_TYPE_Q4_0_4_8: result = quantize_q4_0_4x8(src + start, (char *) dst + start_row * row_size, nrows, n_per_row, imatrix); break; - case GGML_TYPE_Q4_0_8_8: result = quantize_q4_0_8x8(src + start, (char *) dst + start_row * row_size, nrows, n_per_row, imatrix); break; case GGML_TYPE_F16: { size_t elemsize = sizeof(ggml_fp16_t); @@ -6838,7 +6835,16 @@ struct gguf_context * gguf_init_from_file(const char * fname, struct gguf_init_p (int64_t) info->ne[2] * (int64_t) info->ne[3]; - if (ggml_blck_size(info->type) == 0 || ne % ggml_blck_size(info->type) != 0) { + if (ggml_blck_size(info->type) == 0 ) { + // this tensor type support have been removed: + fprintf(stderr, "%s: tensor '%s' of type %d: %s\n", + __func__, info->name.data, (int) info->type, ggml_type_name(info->type)); + fclose(file); + gguf_free(ctx); + return NULL; + } + + if (ne % ggml_blck_size(info->type) != 0) { fprintf(stderr, "%s: tensor '%s' of type %d (%s) number of elements (%" PRId64 ") is not a multiple of block size (%" PRId64 ")\n", __func__, info->name.data, (int) info->type, ggml_type_name(info->type), ne, ggml_blck_size(info->type)); fclose(file); diff --git a/gguf-py/gguf/constants.py b/gguf-py/gguf/constants.py index 703199fcb3f68..66247b80302e6 100644 --- a/gguf-py/gguf/constants.py +++ b/gguf-py/gguf/constants.py @@ -1432,9 +1432,6 @@ class GGMLQuantizationType(IntEnum): F64 = 28 IQ1_M = 29 BF16 = 30 - Q4_0_4_4 = 31 - Q4_0_4_8 = 32 - Q4_0_8_8 = 33 TQ1_0 = 34 TQ2_0 = 35 @@ -1478,9 +1475,9 @@ class LlamaFileType(IntEnum): MOSTLY_IQ4_XS = 30 # except 1d tensors MOSTLY_IQ1_M = 31 # except 1d tensors MOSTLY_BF16 = 32 # except 1d tensors - MOSTLY_Q4_0_4_4 = 33 # except 1d tensors - MOSTLY_Q4_0_4_8 = 34 # except 1d tensors - MOSTLY_Q4_0_8_8 = 35 # except 1d tensors + # MOSTLY_Q4_0_4_4 = 33 # removed from gguf files, use Q4_0 and runtime repack + # MOSTLY_Q4_0_4_8 = 34 # removed from gguf files, use Q4_0 and runtime repack + # MOSTLY_Q4_0_8_8 = 35 # removed from gguf files, use Q4_0 and runtime repack MOSTLY_TQ1_0 = 36 # except 1d tensors MOSTLY_TQ2_0 = 37 # except 1d tensors @@ -1556,9 +1553,6 @@ def get_type(val: Any) -> GGUFValueType: GGMLQuantizationType.F64: (1, 8), GGMLQuantizationType.IQ1_M: (256, QK_K // 8 + QK_K // 16 + QK_K // 32), GGMLQuantizationType.BF16: (1, 2), - GGMLQuantizationType.Q4_0_4_4:(32, 2 + 16), - GGMLQuantizationType.Q4_0_4_8:(32, 2 + 16), - GGMLQuantizationType.Q4_0_8_8:(32, 2 + 16), GGMLQuantizationType.TQ1_0: (256, 2 + 4 * 13), GGMLQuantizationType.TQ2_0: (256, 2 + 64), } diff --git a/include/llama.h b/include/llama.h index d121354c19d56..36945cde335dc 100644 --- a/include/llama.h +++ b/include/llama.h @@ -172,9 +172,9 @@ extern "C" { LLAMA_FTYPE_MOSTLY_IQ4_XS = 30, // except 1d tensors LLAMA_FTYPE_MOSTLY_IQ1_M = 31, // except 1d tensors LLAMA_FTYPE_MOSTLY_BF16 = 32, // except 1d tensors - LLAMA_FTYPE_MOSTLY_Q4_0_4_4 = 33, // except 1d tensors - LLAMA_FTYPE_MOSTLY_Q4_0_4_8 = 34, // except 1d tensors - LLAMA_FTYPE_MOSTLY_Q4_0_8_8 = 35, // except 1d tensors + //LLAMA_FTYPE_MOSTLY_Q4_0_4_4 = 33, // removed from gguf files, use Q4_0 and runtime repack + //LLAMA_FTYPE_MOSTLY_Q4_0_4_8 = 34, // removed from gguf files, use Q4_0 and runtime repack + //LLAMA_FTYPE_MOSTLY_Q4_0_8_8 = 35, // removed from gguf files, use Q4_0 and runtime repack LLAMA_FTYPE_MOSTLY_TQ1_0 = 36, // except 1d tensors LLAMA_FTYPE_MOSTLY_TQ2_0 = 37, // except 1d tensors diff --git a/src/llama.cpp b/src/llama.cpp index de38a2b4ac43a..d7283fd9957f3 100644 --- a/src/llama.cpp +++ b/src/llama.cpp @@ -4582,9 +4582,6 @@ struct llama_model_loader { case GGML_TYPE_IQ4_NL: ftype = LLAMA_FTYPE_MOSTLY_IQ4_NL; break; case GGML_TYPE_IQ4_XS: ftype = LLAMA_FTYPE_MOSTLY_IQ4_XS; break; case GGML_TYPE_IQ3_S: ftype = LLAMA_FTYPE_MOSTLY_IQ3_S; break; - case GGML_TYPE_Q4_0_4_4: ftype = LLAMA_FTYPE_MOSTLY_Q4_0_4_4; break; - case GGML_TYPE_Q4_0_4_8: ftype = LLAMA_FTYPE_MOSTLY_Q4_0_4_8; break; - case GGML_TYPE_Q4_0_8_8: ftype = LLAMA_FTYPE_MOSTLY_Q4_0_8_8; break; default: { LLAMA_LOG_WARN("%s: unknown type %s\n", __func__, ggml_type_name(type_max)); @@ -5348,9 +5345,6 @@ static std::string llama_model_ftype_name(llama_ftype ftype) { case LLAMA_FTYPE_MOSTLY_IQ4_XS: return "IQ4_XS - 4.25 bpw"; case LLAMA_FTYPE_MOSTLY_IQ3_S: return "IQ3_S - 3.4375 bpw"; case LLAMA_FTYPE_MOSTLY_IQ3_M: return "IQ3_S mix - 3.66 bpw"; - case LLAMA_FTYPE_MOSTLY_Q4_0_4_4: return "Q4_0_4_4"; - case LLAMA_FTYPE_MOSTLY_Q4_0_4_8: return "Q4_0_4_8"; - case LLAMA_FTYPE_MOSTLY_Q4_0_8_8: return "Q4_0_8_8"; default: return "unknown, may not work"; } @@ -18376,10 +18370,6 @@ static ggml_type llama_tensor_get_type(quantize_state_internal & qs, ggml_type n else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_XXS) { new_type = GGML_TYPE_IQ3_S; } - else if (new_type == GGML_TYPE_Q4_0_4_4 || new_type == GGML_TYPE_Q4_0_4_8 || - new_type == GGML_TYPE_Q4_0_8_8) { - new_type = GGML_TYPE_Q4_0; - } else if (ftype == LLAMA_FTYPE_MOSTLY_TQ1_0 || ftype == LLAMA_FTYPE_MOSTLY_TQ2_0) { new_type = GGML_TYPE_Q4_K; } @@ -18702,9 +18692,6 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s case LLAMA_FTYPE_MOSTLY_IQ4_XS: default_type = GGML_TYPE_IQ4_XS; break; case LLAMA_FTYPE_MOSTLY_IQ3_S: default_type = GGML_TYPE_IQ3_S; break; case LLAMA_FTYPE_MOSTLY_IQ3_M: default_type = GGML_TYPE_IQ3_S; break; - case LLAMA_FTYPE_MOSTLY_Q4_0_4_4: default_type = GGML_TYPE_Q4_0_4_4; break; - case LLAMA_FTYPE_MOSTLY_Q4_0_4_8: default_type = GGML_TYPE_Q4_0_4_8; break; - case LLAMA_FTYPE_MOSTLY_Q4_0_8_8: default_type = GGML_TYPE_Q4_0_8_8; break; default: throw std::runtime_error(format("invalid output file type %d\n", ftype)); } @@ -19043,14 +19030,6 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s f32_data = (float *) f32_conv_buf.data(); } - int chunk_size_multiplier = 1; - if (new_type == GGML_TYPE_Q4_0_4_4 || new_type == GGML_TYPE_Q4_0_4_8 || new_type == GGML_TYPE_Q4_0_8_8) { - if ((new_type == GGML_TYPE_Q4_0_8_8) && (tensor->ne[1] % 8 != 0)) new_type = GGML_TYPE_Q4_0; - else if (tensor->ne[1] % 4 != 0) new_type = GGML_TYPE_Q4_0; - if (new_type == GGML_TYPE_Q4_0_8_8) chunk_size_multiplier = 8; - else if (new_type == GGML_TYPE_Q4_0_4_4 || new_type == GGML_TYPE_Q4_0_4_8) chunk_size_multiplier = 4; - } - LLAMA_LOG_INFO("converting to %s .. ", ggml_type_name(new_type)); fflush(stdout); @@ -19063,8 +19042,7 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s const int64_t nrows = tensor->ne[1]; static const int64_t min_chunk_size = 32 * 512; - const int64_t chunk_size = (n_per_row >= min_chunk_size ? n_per_row : n_per_row * ((min_chunk_size + n_per_row - 1)/n_per_row)) * - chunk_size_multiplier; + const int64_t chunk_size = (n_per_row >= min_chunk_size ? n_per_row : n_per_row * ((min_chunk_size + n_per_row - 1)/n_per_row)); const int64_t nelements_matrix = tensor->ne[0] * tensor->ne[1]; const int64_t nchunk = (nelements_matrix + chunk_size - 1)/chunk_size; From 65b3db9fcf194988aa78b21b163e864aee275f70 Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Sat, 7 Dec 2024 18:02:05 +0200 Subject: [PATCH 161/245] server : various fixes (#10704) * server : various fixes ggml-ci * server : show curent seed in slot_params ggml-ci * fix /slots endpoint * Update examples/server/server.cpp Co-authored-by: Georgi Gerganov * server : reflect endpoint response changes in the readme ggml-ci --------- Co-authored-by: Xuan Son Nguyen Co-authored-by: Xuan Son Nguyen --- examples/server/CMakeLists.txt | 8 -- examples/server/README.md | 187 ++++++++++++++++++++++++--------- examples/server/server.cpp | 76 +++++++------- examples/server/utils.hpp | 4 +- 4 files changed, 178 insertions(+), 97 deletions(-) diff --git a/examples/server/CMakeLists.txt b/examples/server/CMakeLists.txt index 0035859a6732e..63fca1d590f52 100644 --- a/examples/server/CMakeLists.txt +++ b/examples/server/CMakeLists.txt @@ -34,14 +34,6 @@ endforeach() add_executable(${TARGET} ${TARGET_SRCS}) install(TARGETS ${TARGET} RUNTIME) -# clean up generated files in pre-build step -foreach(asset ${PUBLIC_ASSETS}) - set(output "${CMAKE_CURRENT_BINARY_DIR}/${asset}.hpp") - add_custom_command(TARGET ${TARGET} PRE_BUILD - COMMAND "${CMAKE_COMMAND}" -E remove -f "${output}" - ) -endforeach() - target_link_libraries(${TARGET} PRIVATE common ${CMAKE_THREAD_LIBS_INIT}) if (LLAMA_SERVER_SSL) diff --git a/examples/server/README.md b/examples/server/README.md index 8dbed2626a444..0bab40a82250c 100644 --- a/examples/server/README.md +++ b/examples/server/README.md @@ -618,9 +618,76 @@ This endpoint is public (no API key check). By default, it is read-only. To make ```json { - "default_generation_settings": { ... }, + "default_generation_settings": { + "id": 0, + "id_task": -1, + "n_ctx": 1024, + "speculative": false, + "is_processing": false, + "params": { + "n_predict": -1, + "seed": 4294967295, + "temperature": 0.800000011920929, + "dynatemp_range": 0.0, + "dynatemp_exponent": 1.0, + "top_k": 40, + "top_p": 0.949999988079071, + "min_p": 0.05000000074505806, + "xtc_probability": 0.0, + "xtc_threshold": 0.10000000149011612, + "typical_p": 1.0, + "repeat_last_n": 64, + "repeat_penalty": 1.0, + "presence_penalty": 0.0, + "frequency_penalty": 0.0, + "dry_multiplier": 0.0, + "dry_base": 1.75, + "dry_allowed_length": 2, + "dry_penalty_last_n": -1, + "dry_sequence_breakers": [ + "\n", + ":", + "\"", + "*" + ], + "mirostat": 0, + "mirostat_tau": 5.0, + "mirostat_eta": 0.10000000149011612, + "penalize_nl": false, + "stop": [], + "max_tokens": -1, + "n_keep": 0, + "n_discard": 0, + "ignore_eos": false, + "stream": true, + "n_probs": 0, + "min_keep": 0, + "grammar": "", + "samplers": [ + "dry", + "top_k", + "typ_p", + "top_p", + "min_p", + "xtc", + "temperature" + ], + "speculative.n_max": 16, + "speculative.n_min": 5, + "speculative.p_min": 0.8999999761581421, + "timings_per_token": false + }, + "prompt": "", + "next_token": { + "has_next_token": true, + "has_new_line": false, + "n_remain": -1, + "n_decoded": 0, + "stopping_word": "" + } + }, "total_slots": 1, - "chat_template": "" + "chat_template": "..." } ``` @@ -739,56 +806,74 @@ Example: ```json [ - { - "dynatemp_exponent": 1.0, - "dynatemp_range": 0.0, - "frequency_penalty": 0.0, - "grammar": "", - "id": 0, - "ignore_eos": false, - "is_processing": false, - "logit_bias": [], - "min_p": 0.05000000074505806, - "mirostat": 0, - "mirostat_eta": 0.10000000149011612, - "mirostat_tau": 5.0, - "model": "llama-2-7b-32k-instruct.Q2_K.gguf", - "n_ctx": 2048, - "n_keep": 0, - "n_predict": 100000, - "n_probs": 0, - "next_token": { - "has_next_token": true, - "n_remain": -1, - "n_decoded": 0, - "stopped_eos": false, - "stopped_limit": false, - "stopped_word": false, - "stopping_word": "" - }, - "penalize_nl": true, - "presence_penalty": 0.0, - "prompt": "Say hello to llama.cpp", - "repeat_last_n": 64, - "repeat_penalty": 1.100000023841858, - "samplers": [ - "top_k", - "typical_p", - "top_p", - "min_p", - "temperature" - ], - "seed": 42, - "stop": [ - "\n" - ], - "stream": false, - "task_id": 0, - "temperature": 0.0, - "top_k": 40, - "top_p": 0.949999988079071, - "typical_p": 1.0 + { + "id": 0, + "id_task": -1, + "n_ctx": 1024, + "speculative": false, + "is_processing": false, + "params": { + "n_predict": -1, + "seed": 4294967295, + "temperature": 0.800000011920929, + "dynatemp_range": 0.0, + "dynatemp_exponent": 1.0, + "top_k": 40, + "top_p": 0.949999988079071, + "min_p": 0.05000000074505806, + "xtc_probability": 0.0, + "xtc_threshold": 0.10000000149011612, + "typical_p": 1.0, + "repeat_last_n": 64, + "repeat_penalty": 1.0, + "presence_penalty": 0.0, + "frequency_penalty": 0.0, + "dry_multiplier": 0.0, + "dry_base": 1.75, + "dry_allowed_length": 2, + "dry_penalty_last_n": -1, + "dry_sequence_breakers": [ + "\n", + ":", + "\"", + "*" + ], + "mirostat": 0, + "mirostat_tau": 5.0, + "mirostat_eta": 0.10000000149011612, + "penalize_nl": false, + "stop": [], + "max_tokens": -1, + "n_keep": 0, + "n_discard": 0, + "ignore_eos": false, + "stream": true, + "n_probs": 0, + "min_keep": 0, + "grammar": "", + "samplers": [ + "dry", + "top_k", + "typ_p", + "top_p", + "min_p", + "xtc", + "temperature" + ], + "speculative.n_max": 16, + "speculative.n_min": 5, + "speculative.p_min": 0.8999999761581421, + "timings_per_token": false + }, + "prompt": "", + "next_token": { + "has_next_token": true, + "has_new_line": false, + "n_remain": -1, + "n_decoded": 0, + "stopping_word": "" } + } ] ``` diff --git a/examples/server/server.cpp b/examples/server/server.cpp index d57a296a2de07..1ce8fbae23d5c 100644 --- a/examples/server/server.cpp +++ b/examples/server/server.cpp @@ -122,11 +122,6 @@ struct slot_params { struct common_params_sampling sampling; struct common_params_speculative speculative; - // params only used in to_json() - int32_t n_ctx; - uint32_t seed_cur; - bool can_speculative; - // OAI-compat fields bool verbose = false; bool oaicompat = false; @@ -134,7 +129,7 @@ struct slot_params { std::string oaicompat_model; std::string oaicompat_cmpl_id; - json to_json() { + json to_json() const { std::vector samplers; samplers.reserve(sampling.samplers.size()); for (const auto & sampler : sampling.samplers) { @@ -142,8 +137,8 @@ struct slot_params { } return json { - {"n_ctx", n_ctx}, {"n_predict", n_predict}, // Server configured n_predict + {"seed", sampling.seed}, {"temperature", sampling.temp}, {"dynatemp_range", sampling.dynatemp_range}, {"dynatemp_exponent", sampling.dynatemp_exponent}, @@ -177,7 +172,6 @@ struct slot_params { {"min_keep", sampling.min_keep}, {"grammar", sampling.grammar}, {"samplers", samplers}, - {"speculative", can_speculative}, {"speculative.n_max", speculative.n_max}, {"speculative.n_min", speculative.n_min}, {"speculative.p_min", speculative.p_min}, @@ -483,12 +477,6 @@ struct server_task_result_cmpl_partial : server_task_result { return std::vector({initial_ret, second_ret}); } } else { - // Some idiosyncrasy in task processing logic makes several trailing calls - // with empty content, we ignore these at the calee site. - if (content.empty()) { - return std::vector({json::object()}); - } - choices = json::array({json{ {"finish_reason", nullptr}, {"index", 0}, @@ -722,6 +710,7 @@ struct server_slot { llama_batch batch_spec = {}; + llama_context * ctx = nullptr; llama_context * ctx_dft = nullptr; common_speculative * spec = nullptr; @@ -906,6 +895,27 @@ struct server_slot { t_token_generation, n_decoded, t_gen, n_gen_second, t_prompt_processing + t_token_generation, n_prompt_tokens_processed + n_decoded); } + + json to_json() const { + return json { + {"id", id}, + {"id_task", id_task}, + {"n_ctx", n_ctx}, + {"speculative", can_speculate()}, + {"is_processing", is_processing()}, + {"params", params.to_json()}, + {"prompt", common_detokenize(ctx, prompt_tokens)}, + {"next_token", + { + {"has_next_token", has_next_token}, + {"has_new_line", has_new_line}, + {"n_remain", n_remaining}, + {"n_decoded", n_decoded}, + {"stopping_word", stopping_word}, + } + }, + }; + } }; struct server_metrics { @@ -1338,6 +1348,7 @@ struct server_context { server_slot slot; slot.id = i; + slot.ctx = ctx; slot.n_ctx = n_ctx_slot; slot.n_predict = params_base.n_predict; @@ -1370,8 +1381,7 @@ struct server_context { slots.push_back(slot); } - default_generation_settings_for_props = slots[0].params.to_json(); - default_generation_settings_for_props["seed"] = -1; + default_generation_settings_for_props = slots[0].to_json(); // the update_slots() logic will always submit a maximum of n_batch or n_parallel tokens // note that n_batch can be > n_ctx (e.g. for non-causal attention models such as BERT where the KV cache is not used) @@ -1848,17 +1858,18 @@ struct server_context { queue_results.send(std::move(res)); } - void send_partial_response(server_slot & slot, completion_token_output tkn) { + void send_partial_response(server_slot & slot, const completion_token_output & tkn) { auto res = std::make_unique(); - res->id = slot.id_task; - res->index = slot.index; - res->content = tkn.text_to_send; + + res->id = slot.id_task; + res->index = slot.index; + res->content = tkn.text_to_send; res->truncated = slot.truncated; res->n_decoded = slot.n_decoded; res->n_prompt_tokens = slot.n_prompt_tokens; - res->stop = slot.stop; + res->stop = slot.stop; res->verbose = slot.params.verbose; res->oaicompat = slot.params.oaicompat; @@ -1869,6 +1880,7 @@ struct server_context { // populate res.probs_output if (slot.params.sampling.n_probs > 0) { const llama_tokens to_send_toks = common_tokenize(ctx, tkn.text_to_send, false); + const size_t probs_pos = std::min(slot.n_sent_token_probs, slot.generated_token_probs.size()); const size_t probs_stop_pos = std::min(slot.n_sent_token_probs + to_send_toks.size(), slot.generated_token_probs.size()); @@ -1891,7 +1903,8 @@ struct server_context { void send_final_response(server_slot & slot) { if (slot.params.stream) { // if in stream mode, send the last partial response - return send_partial_response(slot, {0, "", {}}); + send_partial_response(slot, {0, "", {}}); + return; } auto res = std::make_unique(); @@ -2012,6 +2025,7 @@ struct server_context { std::vector tasks; auto create_task = [&](json & task_data, llama_tokens & prompt_tokens) { SRV_DBG("create task, n_tokens = %d\n", (int) prompt_tokens.size()); + server_task task; task.id = queue_tasks.get_new_id(); task.inf_type = inf_type; @@ -2205,18 +2219,7 @@ struct server_context { int n_processing_slots = 0; for (server_slot & slot : slots) { - json slot_data = slot.params.to_json(); - slot_data["id"] = slot.id; - slot_data["id_task"] = slot.id_task; - slot_data["is_processing"] = slot.is_processing(); - slot_data["prompt"] = common_detokenize(ctx, slot.prompt_tokens); - slot_data["next_token"] = { - {"has_next_token", slot.has_next_token}, - {"has_new_line", slot.has_new_line}, - {"n_remain", slot.n_remaining}, - {"n_decoded", slot.n_decoded}, - {"stopping_word", slot.stopping_word}, - }; + json slot_data = slot.to_json(); if (slot.is_processing()) { n_processing_slots++; @@ -2230,6 +2233,7 @@ struct server_context { auto res = std::make_unique(); res->id = task.id; + res->slots_data = std::move(slots_data); res->n_idle_slots = n_idle_slots; res->n_processing_slots = n_processing_slots; res->n_tasks_deferred = queue_tasks.queue_tasks_deferred.size(); @@ -3003,11 +3007,11 @@ int main(int argc, char ** argv) { res.status = 200; }; - svr->set_exception_handler([&res_error](const httplib::Request &, httplib::Response & res, std::exception_ptr ep) { + svr->set_exception_handler([&res_error](const httplib::Request &, httplib::Response & res, const std::exception_ptr & ep) { std::string message; try { std::rethrow_exception(ep); - } catch (std::exception & e) { + } catch (const std::exception & e) { message = e.what(); } catch (...) { message = "Unknown Exception"; diff --git a/examples/server/utils.hpp b/examples/server/utils.hpp index a96116ac36caa..c9fe7d966b209 100644 --- a/examples/server/utils.hpp +++ b/examples/server/utils.hpp @@ -327,12 +327,12 @@ static std::string llama_get_chat_template(const struct llama_model * model) { std::string template_key = "tokenizer.chat_template"; // call with NULL buffer to get the total size of the string int32_t res = llama_model_meta_val_str(model, template_key.c_str(), NULL, 0); - if (res < 0) { + if (res < 2) { return ""; } else { std::vector model_template(res, 0); llama_model_meta_val_str(model, template_key.c_str(), model_template.data(), model_template.size()); - return std::string(model_template.data(), model_template.size()); + return std::string(model_template.data(), model_template.size() - 1); } } From a0b1dcc47082cbcc4e382a7148516b971f6b0282 Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Sat, 7 Dec 2024 18:38:15 +0200 Subject: [PATCH 162/245] ggml : disable iq4_nl interleave size 8 (#10709) ggml-ci --- ggml/src/ggml-cpu/ggml-cpu-aarch64.cpp | 32 +++++++++++++++----------- 1 file changed, 18 insertions(+), 14 deletions(-) diff --git a/ggml/src/ggml-cpu/ggml-cpu-aarch64.cpp b/ggml/src/ggml-cpu/ggml-cpu-aarch64.cpp index 9b9e3c92a1d69..a51d1a6c59517 100644 --- a/ggml/src/ggml-cpu/ggml-cpu-aarch64.cpp +++ b/ggml/src/ggml-cpu/ggml-cpu-aarch64.cpp @@ -3748,16 +3748,18 @@ static block_iq4_nlx4 make_block_iq4_nlx4(block_iq4_nl * in, unsigned int blck_s const int end = QK4_NL * 2 / blck_size_interleave; - if (blck_size_interleave == 8) { - for (int i = 0; i < end; ++i) { - int src_id = i % 4; - int src_offset = (i / 4) * blck_size_interleave; - int dst_offset = i * blck_size_interleave; - - // Using memcpy to avoid unaligned memory accesses - memcpy(&out.qs[dst_offset], &in[src_id].qs[src_offset], sizeof(uint64_t)); - } - } else if (blck_size_interleave == 4) { + // TODO: this branch seems wrong + //if (blck_size_interleave == 8) { + // for (int i = 0; i < end; ++i) { + // int src_id = i % 4; + // int src_offset = (i / 4) * blck_size_interleave; + // int dst_offset = i * blck_size_interleave; + + // // Using memcpy to avoid unaligned memory accesses + // memcpy(&out.qs[dst_offset], &in[src_id].qs[src_offset], sizeof(uint64_t)); + // } + //} else + if (blck_size_interleave == 4) { for (int i = 0; i < end; ++i) { int src_id = i % 4; int src_offset = (i / 4) * blck_size_interleave; @@ -3774,7 +3776,8 @@ static block_iq4_nlx4 make_block_iq4_nlx4(block_iq4_nl * in, unsigned int blck_s static int repack_iq4_nl_to_iq4_nl_4_bl(struct ggml_tensor * t, int interleave_block, const void * GGML_RESTRICT data, size_t data_size) { GGML_ASSERT(t->type == GGML_TYPE_IQ4_NL); - GGML_ASSERT(interleave_block == 4 || interleave_block == 8); + //GGML_ASSERT(interleave_block == 4 || interleave_block == 8); + GGML_ASSERT(interleave_block == 4); block_iq4_nlx4 * dst = (block_iq4_nlx4 *)t->data; const block_iq4_nl * src = (const block_iq4_nl *)data; @@ -3825,9 +3828,10 @@ template <> int repack(struct ggml_tensor * t, const void * return repack_iq4_nl_to_iq4_nl_4_bl(t, 4, data, data_size); } -template <> int repack(struct ggml_tensor * t, const void * data, size_t data_size) { - return repack_iq4_nl_to_iq4_nl_4_bl(t, 8, data, data_size); -} +// TODO: needs to be revisited +//template <> int repack(struct ggml_tensor * t, const void * data, size_t data_size) { +// return repack_iq4_nl_to_iq4_nl_4_bl(t, 8, data, data_size); +//} // gemv template From c5ad0bfe46c517c9ac1ffc3fde8d66eea9cd01e3 Mon Sep 17 00:00:00 2001 From: Xuan Son Nguyen Date: Sat, 7 Dec 2024 20:21:09 +0100 Subject: [PATCH 163/245] server : (refactor) no more json in server_task input (#10691) * server : (refactor) no more json in server_task input * add test for slots endpoint * add tests for /props and /slots * remove task inf_type * fix CI by adding safe_json_to_str * add "model_path" to /props * update readme --- examples/server/README.md | 2 + examples/server/server.cpp | 744 +++++++++--------- examples/server/tests/unit/test_basic.py | 30 + .../server/tests/unit/test_chat_completion.py | 5 + examples/server/tests/utils.py | 10 +- examples/server/utils.hpp | 20 +- 6 files changed, 427 insertions(+), 384 deletions(-) diff --git a/examples/server/README.md b/examples/server/README.md index 0bab40a82250c..117c52d3f1310 100644 --- a/examples/server/README.md +++ b/examples/server/README.md @@ -687,12 +687,14 @@ This endpoint is public (no API key check). By default, it is read-only. To make } }, "total_slots": 1, + "model_path": "../models/Meta-Llama-3.1-8B-Instruct-Q4_K_M.gguf", "chat_template": "..." } ``` - `default_generation_settings` - the default generation settings for the `/completion` endpoint, which has the same fields as the `generation_settings` response object from the `/completion` endpoint. - `total_slots` - the total number of slots for process requests (defined by `--parallel` option) +- `model_path` - the path to model file (same with `-m` argument) - `chat_template` - the model's original Jinja2 prompt template ### POST `/props`: Change server global properties. diff --git a/examples/server/server.cpp b/examples/server/server.cpp index 1ce8fbae23d5c..1c21e55aaa011 100644 --- a/examples/server/server.cpp +++ b/examples/server/server.cpp @@ -54,7 +54,10 @@ enum server_state { }; enum server_task_type { - SERVER_TASK_TYPE_INFERENCE, + SERVER_TASK_TYPE_COMPLETION, + SERVER_TASK_TYPE_EMBEDDING, + SERVER_TASK_TYPE_RERANK, + SERVER_TASK_TYPE_INFILL, SERVER_TASK_TYPE_CANCEL, SERVER_TASK_TYPE_NEXT_RESPONSE, SERVER_TASK_TYPE_METRICS, @@ -64,13 +67,6 @@ enum server_task_type { SERVER_TASK_TYPE_SET_LORA, }; -enum server_task_inf_type { - SERVER_TASK_INF_TYPE_COMPLETION, - SERVER_TASK_INF_TYPE_EMBEDDING, - SERVER_TASK_INF_TYPE_RERANK, - SERVER_TASK_INF_TYPE_INFILL, -}; - // https://community.openai.com/t/openai-chat-list-of-error-codes-and-types/357791/11 enum error_type { ERROR_TYPE_INVALID_REQUEST, @@ -82,28 +78,6 @@ enum error_type { ERROR_TYPE_NOT_SUPPORTED, // custom error }; -struct server_task { - int id = -1; // to be filled by server_queue - int id_target = -1; // used by SERVER_TASK_TYPE_CANCEL - - llama_tokens prompt_tokens; - server_task_type type; - - // TODO @ngxson : we should get rid of json type here - json data; - - server_task_inf_type inf_type = SERVER_TASK_INF_TYPE_COMPLETION; - - // utility function - static std::unordered_set get_list_id(const std::vector & tasks) { - std::unordered_set ids(tasks.size()); - for (size_t i = 0; i < tasks.size(); i++) { - ids.insert(tasks[i].id); - } - return ids; - } -}; - struct slot_params { bool stream = true; bool cache_prompt = true; // remember the prompt to avoid reprocessing all prompt @@ -118,6 +92,7 @@ struct slot_params { std::vector antiprompt; bool timings_per_token = false; + bool ignore_eos = false; struct common_params_sampling sampling; struct common_params_speculative speculative; @@ -167,7 +142,7 @@ struct slot_params { {"n_discard", n_discard}, {"ignore_eos", sampling.ignore_eos}, {"stream", stream}, - //{"logit_bias", sampling.logit_bias}, + {"logit_bias", format_logit_bias(sampling.logit_bias)}, {"n_probs", sampling.n_probs}, {"min_keep", sampling.min_keep}, {"grammar", sampling.grammar}, @@ -180,6 +155,209 @@ struct slot_params { } }; +struct server_task { + int id = -1; // to be filled by server_queue + int index = -1; // used when there are multiple prompts (batch request) + + server_task_type type; + + // used by SERVER_TASK_TYPE_CANCEL + int id_target = -1; + + // used by SERVER_TASK_TYPE_INFERENCE + slot_params params; + llama_tokens prompt_tokens; + int id_selected_slot = -1; + + // used by SERVER_TASK_TYPE_SLOT_SAVE, SERVER_TASK_TYPE_SLOT_RESTORE, SERVER_TASK_TYPE_SLOT_ERASE + struct slot_action { + int slot_id; + std::string filename; + std::string filepath; + }; + slot_action slot_action; + + // used by SERVER_TASK_TYPE_METRICS + bool metrics_reset_bucket = false; + + server_task(server_task_type type) : type(type) {} + + static slot_params params_from_json_cmpl( + const llama_model * model, + const common_params & params_base, + const json & data) { + slot_params params; + + // Sampling parameter defaults are loaded from the global server context (but individual requests can still override them) + slot_params defaults; + defaults.sampling = params_base.sampling; + defaults.speculative = params_base.speculative; + + // enabling this will output extra debug information in the HTTP responses from the server + params.verbose = params_base.verbosity > 9; + params.timings_per_token = json_value(data, "timings_per_token", false); + + params.stream = json_value(data, "stream", false); + params.cache_prompt = json_value(data, "cache_prompt", true); + params.n_predict = json_value(data, "n_predict", json_value(data, "max_tokens", defaults.n_predict)); + params.n_indent = json_value(data, "n_indent", defaults.n_indent); + params.n_keep = json_value(data, "n_keep", defaults.n_keep); + params.n_discard = json_value(data, "n_discard", defaults.n_discard); + //params.t_max_prompt_ms = json_value(data, "t_max_prompt_ms", defaults.t_max_prompt_ms); // TODO: implement + params.t_max_predict_ms = json_value(data, "t_max_predict_ms", defaults.t_max_predict_ms); + + params.sampling.top_k = json_value(data, "top_k", defaults.sampling.top_k); + params.sampling.top_p = json_value(data, "top_p", defaults.sampling.top_p); + params.sampling.min_p = json_value(data, "min_p", defaults.sampling.min_p); + params.sampling.xtc_probability = json_value(data, "xtc_probability", defaults.sampling.xtc_probability); + params.sampling.xtc_threshold = json_value(data, "xtc_threshold", defaults.sampling.xtc_threshold); + params.sampling.typ_p = json_value(data, "typical_p", defaults.sampling.typ_p); + params.sampling.temp = json_value(data, "temperature", defaults.sampling.temp); + params.sampling.dynatemp_range = json_value(data, "dynatemp_range", defaults.sampling.dynatemp_range); + params.sampling.dynatemp_exponent = json_value(data, "dynatemp_exponent", defaults.sampling.dynatemp_exponent); + params.sampling.penalty_last_n = json_value(data, "repeat_last_n", defaults.sampling.penalty_last_n); + params.sampling.penalty_repeat = json_value(data, "repeat_penalty", defaults.sampling.penalty_repeat); + params.sampling.penalty_freq = json_value(data, "frequency_penalty", defaults.sampling.penalty_freq); + params.sampling.penalty_present = json_value(data, "presence_penalty", defaults.sampling.penalty_present); + params.sampling.dry_multiplier = json_value(data, "dry_multiplier", defaults.sampling.dry_multiplier); + params.sampling.dry_base = json_value(data, "dry_base", defaults.sampling.dry_base); + params.sampling.dry_allowed_length = json_value(data, "dry_allowed_length", defaults.sampling.dry_allowed_length); + params.sampling.dry_penalty_last_n = json_value(data, "dry_penalty_last_n", defaults.sampling.dry_penalty_last_n); + params.sampling.mirostat = json_value(data, "mirostat", defaults.sampling.mirostat); + params.sampling.mirostat_tau = json_value(data, "mirostat_tau", defaults.sampling.mirostat_tau); + params.sampling.mirostat_eta = json_value(data, "mirostat_eta", defaults.sampling.mirostat_eta); + params.sampling.penalize_nl = json_value(data, "penalize_nl", defaults.sampling.penalize_nl); + params.sampling.seed = json_value(data, "seed", defaults.sampling.seed); + params.sampling.n_probs = json_value(data, "n_probs", defaults.sampling.n_probs); + params.sampling.min_keep = json_value(data, "min_keep", defaults.sampling.min_keep); + + params.speculative.n_min = json_value(data, "speculative.n_min", defaults.speculative.n_min); + params.speculative.n_max = json_value(data, "speculative.n_max", defaults.speculative.n_max); + params.speculative.p_min = json_value(data, "speculative.p_min", defaults.speculative.p_min); + + params.speculative.n_min = std::min(params.speculative.n_max, params.speculative.n_min); + params.speculative.n_min = std::max(params.speculative.n_min, 2); + params.speculative.n_max = std::max(params.speculative.n_max, 0); + + if (params.sampling.dry_base < 1.0f) { + params.sampling.dry_base = defaults.sampling.dry_base; + } + + // sequence breakers for DRY + { + // Currently, this is not compatible with TextGen WebUI, Koboldcpp and SillyTavern format + // Ref: https://github.com/oobabooga/text-generation-webui/blob/d1af7a41ade7bd3c3a463bfa640725edb818ebaf/extensions/openai/typing.py#L39 + + if (data.contains("dry_sequence_breakers")) { + params.sampling.dry_sequence_breakers = json_value(data, "dry_sequence_breakers", std::vector()); + if (params.sampling.dry_sequence_breakers.empty()) { + throw std::runtime_error("Error: dry_sequence_breakers must be a non-empty array of strings"); + } + } + } + + // process "json_schema" and "grammar" + if (data.contains("json_schema") && !data.at("json_schema").is_null() && data.contains("grammar") && !data.at("grammar").is_null()) { + throw std::runtime_error("Either \"json_schema\" or \"grammar\" can be specified, but not both"); + } + if (data.contains("json_schema") && !data.contains("grammar")) { + try { + auto schema = json_value(data, "json_schema", json::object()); + params.sampling.grammar = json_schema_to_grammar(schema); + } catch (const std::exception & e) { + throw std::runtime_error(std::string("\"json_schema\": ") + e.what()); + } + } else { + params.sampling.grammar = json_value(data, "grammar", defaults.sampling.grammar); + } + + { + params.sampling.logit_bias.clear(); + params.ignore_eos = json_value(data, "ignore_eos", false); + + const auto & logit_bias = data.find("logit_bias"); + if (logit_bias != data.end() && logit_bias->is_array()) { + const int n_vocab = llama_n_vocab(model); + for (const auto & el : *logit_bias) { + // TODO: we may want to throw errors here, in case "el" is incorrect + if (el.is_array() && el.size() == 2) { + float bias; + if (el[1].is_number()) { + bias = el[1].get(); + } else if (el[1].is_boolean() && !el[1].get()) { + bias = -INFINITY; + } else { + continue; + } + + if (el[0].is_number_integer()) { + llama_token tok = el[0].get(); + if (tok >= 0 && tok < n_vocab) { + params.sampling.logit_bias.push_back({tok, bias}); + } + } else if (el[0].is_string()) { + auto toks = common_tokenize(model, el[0].get(), false); + for (auto tok : toks) { + params.sampling.logit_bias.push_back({tok, bias}); + } + } + } + } + } + } + + { + params.antiprompt.clear(); + + const auto & stop = data.find("stop"); + if (stop != data.end() && stop->is_array()) { + for (const auto & word : *stop) { + if (!word.empty()) { + params.antiprompt.push_back(word); + } + } + } + } + + { + const auto & samplers = data.find("samplers"); + if (samplers != data.end()) { + if (samplers->is_array()) { + std::vector sampler_names; + for (const auto & name : *samplers) { + if (name.is_string()) { + sampler_names.emplace_back(name); + } + } + params.sampling.samplers = common_sampler_types_from_names(sampler_names, false); + } else if (samplers->is_string()){ + std::string sampler_string; + for (const auto & name : *samplers) { + sampler_string += name; + } + params.sampling.samplers = common_sampler_types_from_chars(sampler_string); + } + } else { + params.sampling.samplers = defaults.sampling.samplers; + } + } + + std::string model_name = params_base.model_alias.empty() ? DEFAULT_OAICOMPAT_MODEL : params_base.model_alias; + params.oaicompat_model = json_value(data, "model", model_name); + + return params; + } + + // utility function + static std::unordered_set get_list_id(const std::vector & tasks) { + std::unordered_set ids(tasks.size()); + for (size_t i = 0; i < tasks.size(); i++) { + ids.insert(tasks[i].id); + } + return ids; + } +}; + struct result_timings { int32_t prompt_n = -1; double prompt_ms; @@ -191,7 +369,7 @@ struct result_timings { double predicted_per_token_ms; double predicted_per_second; - json to_json() { + json to_json() const { return { {"prompt_n", prompt_n}, {"prompt_ms", prompt_ms}, @@ -623,7 +801,8 @@ struct server_task_result_metrics : server_task_result { uint64_t n_decode_total = 0; uint64_t n_busy_slots_total = 0; - // TODO: get rid of this json object and use to_json() instead + // while we can also use std::vector this requires copying the slot object which can be quite messy + // therefore, we use json to temporarily store the slot.to_json() result json slots_data = json::array(); virtual json to_json() override { @@ -708,6 +887,9 @@ struct server_slot { int id; int id_task = -1; + // only used for completion/embedding/infill/rerank + server_task_type task_type = SERVER_TASK_TYPE_COMPLETION; + llama_batch batch_spec = {}; llama_context * ctx = nullptr; @@ -746,8 +928,6 @@ struct server_slot { llama_tokens cache_tokens; std::vector generated_token_probs; - server_task_inf_type inf_type = SERVER_TASK_INF_TYPE_COMPLETION; - bool has_next_token = true; bool has_new_line = false; bool truncated = false; @@ -787,11 +967,15 @@ struct server_slot { n_past = 0; n_sent_text = 0; n_sent_token_probs = 0; - inf_type = SERVER_TASK_INF_TYPE_COMPLETION; + task_type = SERVER_TASK_TYPE_COMPLETION; generated_token_probs.clear(); } + bool is_non_causal() const { + return task_type == SERVER_TASK_TYPE_EMBEDDING || task_type == SERVER_TASK_TYPE_RERANK; + } + bool has_budget(const common_params & global_params) { if (params.n_predict == -1 && global_params.n_predict == -1) { return true; // limitless @@ -903,6 +1087,7 @@ struct server_slot { {"n_ctx", n_ctx}, {"speculative", can_speculate()}, {"is_processing", is_processing()}, + {"non_causal", is_non_causal()}, {"params", params.to_json()}, {"prompt", common_detokenize(ctx, prompt_tokens)}, {"next_token", @@ -988,9 +1173,7 @@ struct server_queue { // Add a new task to the end of the queue int post(server_task task, bool front = false) { std::unique_lock lock(mutex_tasks); - if (task.id == -1) { - task.id = id++; - } + GGML_ASSERT(task.id != -1); QUE_DBG("new task, id = %d, front = %d\n", task.id, front); if (front) { queue_tasks.push_front(std::move(task)); @@ -1468,104 +1651,14 @@ struct server_context { } bool launch_slot_with_task(server_slot & slot, const server_task & task) { - // Sampling parameter defaults are loaded from the global server context (but individual requests can still override them) - slot_params defaults; - defaults.sampling = params_base.sampling; - defaults.speculative = params_base.speculative; + slot.reset(); + slot.id_task = task.id; + slot.index = task.index; + slot.task_type = task.type; + slot.params = std::move(task.params); + slot.prompt_tokens = std::move(task.prompt_tokens); - const auto & data = task.data; - - if (data.count("__oaicompat") != 0) { - std::string model_name = params_base.model_alias.empty() ? DEFAULT_OAICOMPAT_MODEL : params_base.model_alias; - slot.params.oaicompat = true; - slot.params.oaicompat_chat = json_value(data, "__oaicompat_chat", false); - slot.params.oaicompat_model = json_value(data, "model", model_name); - slot.params.oaicompat_cmpl_id = json_value(data, "completion_id", std::string()); - } else { - slot.params.oaicompat = false; - } - - - // enabling this will output extra debug information in the HTTP responses from the server - slot.params.verbose = params_base.verbosity > 9; - slot.params.timings_per_token = json_value(data, "timings_per_token", false); - - slot.params.stream = json_value(data, "stream", false); - slot.params.cache_prompt = json_value(data, "cache_prompt", true); - slot.params.n_predict = json_value(data, "n_predict", json_value(data, "max_tokens", defaults.n_predict)); - slot.params.n_indent = json_value(data, "n_indent", defaults.n_indent); - slot.params.n_keep = json_value(data, "n_keep", defaults.n_keep); - slot.params.n_discard = json_value(data, "n_discard", defaults.n_discard); - //slot.params.t_max_prompt_ms = json_value(data, "t_max_prompt_ms", defaults.t_max_prompt_ms); // TODO: implement - slot.params.t_max_predict_ms = json_value(data, "t_max_predict_ms", defaults.t_max_predict_ms); - - slot.params.sampling.top_k = json_value(data, "top_k", defaults.sampling.top_k); - slot.params.sampling.top_p = json_value(data, "top_p", defaults.sampling.top_p); - slot.params.sampling.min_p = json_value(data, "min_p", defaults.sampling.min_p); - slot.params.sampling.xtc_probability = json_value(data, "xtc_probability", defaults.sampling.xtc_probability); - slot.params.sampling.xtc_threshold = json_value(data, "xtc_threshold", defaults.sampling.xtc_threshold); - slot.params.sampling.typ_p = json_value(data, "typical_p", defaults.sampling.typ_p); - slot.params.sampling.temp = json_value(data, "temperature", defaults.sampling.temp); - slot.params.sampling.dynatemp_range = json_value(data, "dynatemp_range", defaults.sampling.dynatemp_range); - slot.params.sampling.dynatemp_exponent = json_value(data, "dynatemp_exponent", defaults.sampling.dynatemp_exponent); - slot.params.sampling.penalty_last_n = json_value(data, "repeat_last_n", defaults.sampling.penalty_last_n); - slot.params.sampling.penalty_repeat = json_value(data, "repeat_penalty", defaults.sampling.penalty_repeat); - slot.params.sampling.penalty_freq = json_value(data, "frequency_penalty", defaults.sampling.penalty_freq); - slot.params.sampling.penalty_present = json_value(data, "presence_penalty", defaults.sampling.penalty_present); - slot.params.sampling.dry_multiplier = json_value(data, "dry_multiplier", defaults.sampling.dry_multiplier); - slot.params.sampling.dry_base = json_value(data, "dry_base", defaults.sampling.dry_base); - slot.params.sampling.dry_allowed_length = json_value(data, "dry_allowed_length", defaults.sampling.dry_allowed_length); - slot.params.sampling.dry_penalty_last_n = json_value(data, "dry_penalty_last_n", defaults.sampling.dry_penalty_last_n); - slot.params.sampling.mirostat = json_value(data, "mirostat", defaults.sampling.mirostat); - slot.params.sampling.mirostat_tau = json_value(data, "mirostat_tau", defaults.sampling.mirostat_tau); - slot.params.sampling.mirostat_eta = json_value(data, "mirostat_eta", defaults.sampling.mirostat_eta); - slot.params.sampling.penalize_nl = json_value(data, "penalize_nl", defaults.sampling.penalize_nl); - slot.params.sampling.seed = json_value(data, "seed", defaults.sampling.seed); - slot.params.sampling.n_probs = json_value(data, "n_probs", defaults.sampling.n_probs); - slot.params.sampling.min_keep = json_value(data, "min_keep", defaults.sampling.min_keep); - - slot.params.speculative.n_min = json_value(data, "speculative.n_min", defaults.speculative.n_min); - slot.params.speculative.n_max = json_value(data, "speculative.n_max", defaults.speculative.n_max); - slot.params.speculative.p_min = json_value(data, "speculative.p_min", defaults.speculative.p_min); - - slot.params.speculative.n_min = std::min(slot.params.speculative.n_max, slot.params.speculative.n_min); - slot.params.speculative.n_min = std::max(slot.params.speculative.n_min, 2); - slot.params.speculative.n_max = std::max(slot.params.speculative.n_max, 0); - - if (slot.params.sampling.dry_base < 1.0f) { - slot.params.sampling.dry_base = defaults.sampling.dry_base; - } - - // sequence breakers for DRY - { - // Currently, this is not compatible with TextGen WebUI, Koboldcpp and SillyTavern format - // Ref: https://github.com/oobabooga/text-generation-webui/blob/d1af7a41ade7bd3c3a463bfa640725edb818ebaf/extensions/openai/typing.py#L39 - - if (data.contains("dry_sequence_breakers")) { - slot.params.sampling.dry_sequence_breakers = json_value(data, "dry_sequence_breakers", std::vector()); - if (slot.params.sampling.dry_sequence_breakers.empty()) { - send_error(task, "Error: dry_sequence_breakers must be a non-empty array of strings", ERROR_TYPE_INVALID_REQUEST); - return false; - } - } - } - - // process "json_schema" and "grammar" - if (data.contains("json_schema") && !data.at("json_schema").is_null() && data.contains("grammar") && !data.at("grammar").is_null()) { - send_error(task, "Either \"json_schema\" or \"grammar\" can be specified, but not both", ERROR_TYPE_INVALID_REQUEST); - return false; - } - if (data.contains("json_schema") && !data.contains("grammar")) { - try { - auto schema = json_value(data, "json_schema", json::object()); - slot.params.sampling.grammar = json_schema_to_grammar(schema); - } catch (const std::exception & e) { - send_error(task, std::string("\"json_schema\": ") + e.what(), ERROR_TYPE_INVALID_REQUEST); - return false; - } - } else { - slot.params.sampling.grammar = json_value(data, "grammar", defaults.sampling.grammar); - } + SLT_DBG(slot, "launching slot : %s\n", safe_json_to_str(slot.to_json()).c_str()); if (slot.n_predict > 0 && slot.params.n_predict > slot.n_predict) { // Might be better to reject the request with a 400 ? @@ -1573,78 +1666,8 @@ struct server_context { SLT_WRN(slot, "n_predict = %d exceeds server configuration, setting to %d", slot.n_predict, slot.n_predict); } - { - slot.params.sampling.logit_bias.clear(); - - if (json_value(data, "ignore_eos", false) && has_eos_token) { - slot.params.sampling.logit_bias.push_back({llama_token_eos(model), -INFINITY}); - } - - const auto & logit_bias = data.find("logit_bias"); - if (logit_bias != data.end() && logit_bias->is_array()) { - const int n_vocab = llama_n_vocab(model); - for (const auto & el : *logit_bias) { - // TODO: we may want to throw errors here, in case "el" is incorrect - if (el.is_array() && el.size() == 2) { - float bias; - if (el[1].is_number()) { - bias = el[1].get(); - } else if (el[1].is_boolean() && !el[1].get()) { - bias = -INFINITY; - } else { - continue; - } - - if (el[0].is_number_integer()) { - llama_token tok = el[0].get(); - if (tok >= 0 && tok < n_vocab) { - slot.params.sampling.logit_bias.push_back({tok, bias}); - } - } else if (el[0].is_string()) { - auto toks = common_tokenize(model, el[0].get(), false); - for (auto tok : toks) { - slot.params.sampling.logit_bias.push_back({tok, bias}); - } - } - } - } - } - } - - { - slot.params.antiprompt.clear(); - - const auto & stop = data.find("stop"); - if (stop != data.end() && stop->is_array()) { - for (const auto & word : *stop) { - if (!word.empty()) { - slot.params.antiprompt.push_back(word); - } - } - } - } - - { - const auto & samplers = data.find("samplers"); - if (samplers != data.end()) { - if (samplers->is_array()) { - std::vector sampler_names; - for (const auto & name : *samplers) { - if (name.is_string()) { - sampler_names.emplace_back(name); - } - } - slot.params.sampling.samplers = common_sampler_types_from_names(sampler_names, false); - } else if (samplers->is_string()){ - std::string sampler_string; - for (const auto & name : *samplers) { - sampler_string += name; - } - slot.params.sampling.samplers = common_sampler_types_from_chars(sampler_string); - } - } else { - slot.params.sampling.samplers = defaults.sampling.samplers; - } + if (slot.params.ignore_eos && has_eos_token) { + slot.params.sampling.logit_bias.push_back({llama_token_eos(model), -INFINITY}); } { @@ -2020,82 +2043,13 @@ struct server_context { // Functions to create new task(s) and receive result(s) // - // break the input "prompt" into multiple tasks if needed, then format and tokenize the input prompt(s) - std::vector create_tasks_inference(json data, server_task_inf_type inf_type) { - std::vector tasks; - auto create_task = [&](json & task_data, llama_tokens & prompt_tokens) { - SRV_DBG("create task, n_tokens = %d\n", (int) prompt_tokens.size()); - - server_task task; - task.id = queue_tasks.get_new_id(); - task.inf_type = inf_type; - task.type = SERVER_TASK_TYPE_INFERENCE; - task.data = task_data; - task.prompt_tokens = std::move(prompt_tokens); - tasks.push_back(std::move(task)); - }; - - static constexpr const char * error_msg = "\"prompt\" must be a string, an array of token ids or an array of prompts"; - if (!data.contains("prompt")) { - throw std::runtime_error(error_msg); - } - - // because llama_tokenize api is thread-safe, we can tokenize the prompt from HTTP thread - bool add_special = inf_type != SERVER_TASK_INF_TYPE_RERANK && inf_type != SERVER_TASK_INF_TYPE_INFILL; - std::vector tokenized_prompts = tokenize_input_prompts(ctx, data.at("prompt"), add_special, true); - switch (inf_type) { - case SERVER_TASK_INF_TYPE_RERANK: - { - // prompts[0] is the question - // the rest are the answers/documents - GGML_ASSERT(tokenized_prompts.size() > 1); - SRV_DBG("creating rerank tasks, n_prompts = %d\n", (int) tokenized_prompts.size() - 1); - for (size_t i = 1; i < tokenized_prompts.size(); i++) { - data["index"] = i - 1; - auto tokens = format_rerank(model, tokenized_prompts[0], tokenized_prompts[i]); - create_task(data, tokens); - } - } break; - case SERVER_TASK_INF_TYPE_INFILL: - { - SRV_DBG("creating infill tasks, n_prompts = %d\n", (int) tokenized_prompts.size()); - for (size_t i = 0; i < tokenized_prompts.size(); i++) { - data["index"] = i; - auto tokens = format_infill( - ctx, - data.at("input_prefix"), - data.at("input_suffix"), - data.at("input_extra"), - params_base.n_batch, - params_base.n_predict, - slots[0].n_ctx, // TODO: there should be a better way - params_base.spm_infill, - tokenized_prompts[i] - ); - create_task(data, tokens); - } - } break; - default: - { - SRV_DBG("creating multi-prompt tasks, n_prompts = %d\n", (int) tokenized_prompts.size()); - for (size_t i = 0; i < tokenized_prompts.size(); i++) { - data["index"] = i; - create_task(data, tokenized_prompts[i]); - } - } - } - - return tasks; - } - void cancel_tasks(const std::unordered_set & id_tasks) { std::vector cancel_tasks; cancel_tasks.reserve(id_tasks.size()); for (const auto & id_task : id_tasks) { SRV_WRN("cancel task, id_task = %d\n", id_task); - server_task task; - task.type = SERVER_TASK_TYPE_CANCEL; + server_task task(SERVER_TASK_TYPE_CANCEL); task.id_target = id_task; cancel_tasks.push_back(task); queue_results.remove_waiting_task_id(id_task); @@ -2104,7 +2058,7 @@ struct server_context { queue_tasks.post(cancel_tasks, true); } - // receive the results from task(s) created by create_tasks_inference + // receive the results from task(s) void receive_multi_results( const std::unordered_set & id_tasks, const std::function&)> & result_handler, @@ -2131,7 +2085,7 @@ struct server_context { result_handler(results); } - // receive the results from task(s) created by create_tasks_inference, in stream mode + // receive the results from task(s), in stream mode void receive_cmpl_results_stream( const std::unordered_set & id_tasks, const std::function & result_handler, @@ -2166,9 +2120,12 @@ struct server_context { void process_single_task(server_task task) { switch (task.type) { - case SERVER_TASK_TYPE_INFERENCE: + case SERVER_TASK_TYPE_COMPLETION: + case SERVER_TASK_TYPE_INFILL: + case SERVER_TASK_TYPE_EMBEDDING: + case SERVER_TASK_TYPE_RERANK: { - const int id_slot = json_value(task.data, "id_slot", -1); + const int id_slot = task.id_selected_slot; server_slot * slot = id_slot != -1 ? get_slot_by_id(id_slot) : get_available_slot(task); @@ -2185,13 +2142,6 @@ struct server_context { break; } - slot->reset(); - - slot->id_task = task.id; - slot->inf_type = task.inf_type; - slot->index = json_value(task.data, "index", 0); - slot->prompt_tokens = std::move(task.prompt_tokens); - if (!launch_slot_with_task(*slot, task)) { SRV_ERR("failed to launch slot with task, id_task = %d\n", task.id); break; @@ -2255,14 +2205,14 @@ struct server_context { res->n_decode_total = metrics.n_decode_total; res->n_busy_slots_total = metrics.n_busy_slots_total; - if (json_value(task.data, "reset_bucket", false)) { + if (task.metrics_reset_bucket) { metrics.reset_bucket(); } queue_results.send(std::move(res)); } break; case SERVER_TASK_TYPE_SLOT_SAVE: { - int id_slot = task.data.at("id_slot"); + int id_slot = task.slot_action.slot_id; server_slot * slot = get_slot_by_id(id_slot); if (slot == nullptr) { send_error(task, "Invalid slot ID", ERROR_TYPE_INVALID_REQUEST); @@ -2278,8 +2228,8 @@ struct server_context { const size_t token_count = slot->cache_tokens.size(); const int64_t t_start = ggml_time_us(); - std::string filename = task.data.at("filename"); - std::string filepath = task.data.at("filepath"); + std::string filename = task.slot_action.filename; + std::string filepath = task.slot_action.filepath; const size_t nwrite = llama_state_seq_save_file(ctx, filepath.c_str(), slot->id, slot->cache_tokens.data(), token_count); @@ -2298,7 +2248,7 @@ struct server_context { } break; case SERVER_TASK_TYPE_SLOT_RESTORE: { - int id_slot = task.data.at("id_slot"); + int id_slot = task.slot_action.slot_id; server_slot * slot = get_slot_by_id(id_slot); if (slot == nullptr) { send_error(task, "Invalid slot ID", ERROR_TYPE_INVALID_REQUEST); @@ -2313,8 +2263,8 @@ struct server_context { const int64_t t_start = ggml_time_us(); - std::string filename = task.data.at("filename"); - std::string filepath = task.data.at("filepath"); + std::string filename = task.slot_action.filename; + std::string filepath = task.slot_action.filepath; slot->cache_tokens.resize(slot->n_ctx); size_t token_count = 0; @@ -2341,7 +2291,7 @@ struct server_context { } break; case SERVER_TASK_TYPE_SLOT_ERASE: { - int id_slot = task.data.at("id_slot"); + int id_slot = task.slot_action.slot_id; server_slot * slot = get_slot_by_id(id_slot); if (slot == nullptr) { send_error(task, "Invalid slot ID", ERROR_TYPE_INVALID_REQUEST); @@ -2400,10 +2350,8 @@ struct server_context { { SRV_DBG("%s", "posting NEXT_RESPONSE\n"); - server_task task; - task.type = SERVER_TASK_TYPE_NEXT_RESPONSE; - task.id_target = -1; - + server_task task(SERVER_TASK_TYPE_NEXT_RESPONSE); + task.id = queue_tasks.get_new_id(); queue_tasks.post(task); } @@ -2517,7 +2465,7 @@ struct server_context { continue; } - if (slot.inf_type == SERVER_TASK_INF_TYPE_EMBEDDING || slot.inf_type == SERVER_TASK_INF_TYPE_RERANK) { + if (slot.is_non_causal()) { if (slot.n_prompt_tokens > n_ubatch) { slot.release(); send_error(slot, "input is too large to process. increase the physical batch size", ERROR_TYPE_SERVER); @@ -2632,7 +2580,7 @@ struct server_context { } // non-causal tasks require to fit the entire prompt in the physical batch - if (slot.inf_type == SERVER_TASK_INF_TYPE_EMBEDDING || slot.inf_type == SERVER_TASK_INF_TYPE_RERANK) { + if (slot.is_non_causal()) { // cannot fit the prompt in the current batch - will try next iter if (batch.n_tokens + slot.n_prompt_tokens > n_batch) { continue; @@ -2640,10 +2588,7 @@ struct server_context { } // check that we are in the right batch_type, if not defer the slot - const bool slot_type = - slot.inf_type == SERVER_TASK_INF_TYPE_EMBEDDING || - slot.inf_type == SERVER_TASK_INF_TYPE_RERANK ? 1 : 0; - + int slot_type = slot.is_non_causal(); if (batch_type == -1) { batch_type = slot_type; } else if (batch_type != slot_type) { @@ -2760,7 +2705,7 @@ struct server_context { } if (slot.state == SLOT_STATE_DONE_PROMPT) { - if (slot.inf_type == SERVER_TASK_INF_TYPE_EMBEDDING) { + if (slot.task_type == SERVER_TASK_TYPE_EMBEDDING) { // prompt evaluated for embedding send_embedding(slot, batch_view); slot.release(); @@ -2768,7 +2713,7 @@ struct server_context { continue; // continue loop of slots } - if (slot.inf_type == SERVER_TASK_INF_TYPE_RERANK) { + if (slot.task_type == SERVER_TASK_TYPE_RERANK) { send_rerank(slot, batch_view); slot.release(); slot.i_batch = -1; @@ -2998,12 +2943,12 @@ int main(int argc, char ** argv) { auto res_error = [](httplib::Response & res, const json & error_data) { json final_response {{"error", error_data}}; - res.set_content(final_response.dump(-1, ' ', false, json::error_handler_t::replace), MIMETYPE_JSON); + res.set_content(safe_json_to_str(final_response), MIMETYPE_JSON); res.status = json_value(error_data, "code", 500); }; auto res_ok = [](httplib::Response & res, const json & data) { - res.set_content(data.dump(-1, ' ', false, json::error_handler_t::replace), MIMETYPE_JSON); + res.set_content(safe_json_to_str(data), MIMETYPE_JSON); res.status = 200; }; @@ -3140,10 +3085,8 @@ int main(int argc, char ** argv) { } // request slots data using task queue - server_task task; + server_task task(SERVER_TASK_TYPE_METRICS); task.id = ctx_server.queue_tasks.get_new_id(); - task.type = SERVER_TASK_TYPE_METRICS; - ctx_server.queue_results.add_waiting_task_id(task.id); ctx_server.queue_tasks.post(task, true); // high-priority task @@ -3178,11 +3121,9 @@ int main(int argc, char ** argv) { } // request slots data using task queue - server_task task; + server_task task(SERVER_TASK_TYPE_METRICS); task.id = ctx_server.queue_tasks.get_new_id(); - task.id_target = -1; - task.type = SERVER_TASK_TYPE_METRICS; - task.data.push_back({{"reset_bucket", true}}); + task.metrics_reset_bucket = true; ctx_server.queue_results.add_waiting_task_id(task.id); ctx_server.queue_tasks.post(task, true); // high-priority task @@ -3286,19 +3227,17 @@ int main(int argc, char ** argv) { } std::string filepath = params.slot_save_path + filename; - server_task task; - task.type = SERVER_TASK_TYPE_SLOT_SAVE; - task.data = { - { "id_slot", id_slot }, - { "filename", filename }, - { "filepath", filepath }, - }; + server_task task(SERVER_TASK_TYPE_SLOT_SAVE); + task.id = ctx_server.queue_tasks.get_new_id(); + task.slot_action.slot_id = id_slot; + task.slot_action.filename = filename; + task.slot_action.filepath = filepath; - const int id_task = ctx_server.queue_tasks.post(task); - ctx_server.queue_results.add_waiting_task_id(id_task); + ctx_server.queue_results.add_waiting_task_id(task.id); + ctx_server.queue_tasks.post(task); - server_task_result_ptr result = ctx_server.queue_results.recv(id_task); - ctx_server.queue_results.remove_waiting_task_id(id_task); + server_task_result_ptr result = ctx_server.queue_results.recv(task.id); + ctx_server.queue_results.remove_waiting_task_id(task.id); if (result->is_error()) { res_error(res, result->to_json()); @@ -3317,19 +3256,17 @@ int main(int argc, char ** argv) { } std::string filepath = params.slot_save_path + filename; - server_task task; - task.type = SERVER_TASK_TYPE_SLOT_RESTORE; - task.data = { - { "id_slot", id_slot }, - { "filename", filename }, - { "filepath", filepath }, - }; + server_task task(SERVER_TASK_TYPE_SLOT_RESTORE); + task.id = ctx_server.queue_tasks.get_new_id(); + task.slot_action.slot_id = id_slot; + task.slot_action.filename = filename; + task.slot_action.filepath = filepath; - const int id_task = ctx_server.queue_tasks.post(task); - ctx_server.queue_results.add_waiting_task_id(id_task); + ctx_server.queue_results.add_waiting_task_id(task.id); + ctx_server.queue_tasks.post(task); - server_task_result_ptr result = ctx_server.queue_results.recv(id_task); - ctx_server.queue_results.remove_waiting_task_id(id_task); + server_task_result_ptr result = ctx_server.queue_results.recv(task.id); + ctx_server.queue_results.remove_waiting_task_id(task.id); if (result->is_error()) { res_error(res, result->to_json()); @@ -3341,17 +3278,15 @@ int main(int argc, char ** argv) { }; const auto handle_slots_erase = [&ctx_server, &res_error, &res_ok](const httplib::Request & /* req */, httplib::Response & res, int id_slot) { - server_task task; - task.type = SERVER_TASK_TYPE_SLOT_ERASE; - task.data = { - { "id_slot", id_slot }, - }; + server_task task(SERVER_TASK_TYPE_SLOT_ERASE); + task.id = ctx_server.queue_tasks.get_new_id(); + task.slot_action.slot_id = id_slot; - const int id_task = ctx_server.queue_tasks.post(task); - ctx_server.queue_results.add_waiting_task_id(id_task); + ctx_server.queue_results.add_waiting_task_id(task.id); + ctx_server.queue_tasks.post(task); - server_task_result_ptr result = ctx_server.queue_results.recv(id_task); - ctx_server.queue_results.remove_waiting_task_id(id_task); + server_task_result_ptr result = ctx_server.queue_results.recv(task.id); + ctx_server.queue_results.remove_waiting_task_id(task.id); if (result->is_error()) { res_error(res, result->to_json()); @@ -3392,9 +3327,11 @@ int main(int argc, char ** argv) { }; const auto handle_props = [&ctx_server, &res_ok](const httplib::Request &, httplib::Response & res) { + // this endpoint is publicly available, please only return what is safe to be exposed json data = { { "default_generation_settings", ctx_server.default_generation_settings_for_props }, { "total_slots", ctx_server.params_base.n_parallel }, + { "model_path", ctx_server.params_base.model }, { "chat_template", llama_get_chat_template(ctx_server.model) }, }; @@ -3417,17 +3354,47 @@ int main(int argc, char ** argv) { // handle completion-like requests (completion, chat, infill) // we can optionally provide a custom format for partial results and final results const auto handle_completions_generic = [&ctx_server, &res_error, &res_ok]( - server_task_inf_type inf_type, + server_task_type type, json & data, httplib::Response & res, - bool oai_compat = false) { + bool oaicompat = false, + bool oaicompat_chat = false) { + GGML_ASSERT(type == SERVER_TASK_TYPE_COMPLETION || type == SERVER_TASK_TYPE_INFILL); + if (ctx_server.params_base.embedding) { res_error(res, format_error_response("This server does not support completions. Start it without `--embeddings`", ERROR_TYPE_NOT_SUPPORTED)); return; } - data["completion_id"] = gen_chatcmplid(); - std::vector tasks = ctx_server.create_tasks_inference(data, inf_type); + auto completion_id = gen_chatcmplid(); + std::vector tasks; + + try { + std::vector tokenized_prompts = tokenize_input_prompts(ctx_server.ctx, data.at("prompt"), true, true); + tasks.reserve(tokenized_prompts.size()); + for (size_t i = 0; i < tokenized_prompts.size(); i++) { + server_task task = server_task(type); + + task.id = ctx_server.queue_tasks.get_new_id(); + task.index = i; + + task.prompt_tokens = std::move(tokenized_prompts[i]); + task.params = server_task::params_from_json_cmpl(ctx_server.model, ctx_server.params_base, data); + task.id_selected_slot = json_value(data, "id_slot", -1); + + // OAI-compat + task.params.oaicompat = oaicompat; + task.params.oaicompat_chat = oaicompat_chat; + task.params.oaicompat_cmpl_id = completion_id; + // oaicompat_model is already populated by params_from_json_cmpl + + tasks.push_back(task); + } + } catch (const std::exception & e) { + res_error(res, format_error_response(e.what(), ERROR_TYPE_INVALID_REQUEST)); + return; + } + ctx_server.queue_results.add_waiting_tasks(tasks); ctx_server.queue_tasks.post(tasks); @@ -3453,7 +3420,7 @@ int main(int argc, char ** argv) { ctx_server.queue_results.remove_waiting_task_ids(task_ids); } else { - const auto chunked_content_provider = [task_ids, &ctx_server, oai_compat](size_t, httplib::DataSink & sink) { + const auto chunked_content_provider = [task_ids, &ctx_server, oaicompat](size_t, httplib::DataSink & sink) { ctx_server.receive_cmpl_results_stream(task_ids, [&](server_task_result_ptr & result) -> bool { json res_json = result->to_json(); if (res_json.is_array()) { @@ -3469,7 +3436,7 @@ int main(int argc, char ** argv) { }, [&](const json & error_data) { server_sent_event(sink, "error", error_data); }); - if (oai_compat) { + if (oaicompat) { static const std::string ev_done = "data: [DONE]\n\n"; sink.write(ev_done.data(), ev_done.size()); } @@ -3487,7 +3454,12 @@ int main(int argc, char ** argv) { const auto handle_completions = [&handle_completions_generic](const httplib::Request & req, httplib::Response & res) { json data = json::parse(req.body); - return handle_completions_generic(SERVER_TASK_INF_TYPE_COMPLETION, data, res); + return handle_completions_generic( + SERVER_TASK_TYPE_COMPLETION, + data, + res, + /* oaicompat */ false, + /* oaicompat_chat */ false); }; const auto handle_infill = [&ctx_server, &res_error, &handle_completions_generic](const httplib::Request & req, httplib::Response & res) { @@ -3537,7 +3509,7 @@ int main(int argc, char ** argv) { } data["input_extra"] = input_extra; // default to empty array if it's not exist - return handle_completions_generic(SERVER_TASK_INF_TYPE_INFILL, data, res); + return handle_completions_generic(SERVER_TASK_TYPE_INFILL, data, res); }; const auto handle_chat_completions = [&ctx_server, ¶ms, &res_error, &handle_completions_generic](const httplib::Request & req, httplib::Response & res) { @@ -3547,11 +3519,15 @@ int main(int argc, char ** argv) { } json data = oaicompat_completion_params_parse(ctx_server.model, json::parse(req.body), params.chat_template); - data["__oaicompat_chat"] = true; - return handle_completions_generic(SERVER_TASK_INF_TYPE_COMPLETION, data, res, true); + return handle_completions_generic( + SERVER_TASK_TYPE_COMPLETION, + data, + res, + /* oaicompat */ true, + /* oaicompat_chat */ true); }; - const auto handle_models = [¶ms, &ctx_server](const httplib::Request &, httplib::Response & res) { + const auto handle_models = [¶ms, &ctx_server, &res_ok](const httplib::Request &, httplib::Response & res) { json models = { {"object", "list"}, {"data", { @@ -3565,7 +3541,7 @@ int main(int argc, char ** argv) { }} }; - res.set_content(models.dump(), MIMETYPE_JSON); + res_ok(res, models); }; const auto handle_tokenize = [&ctx_server, &res_ok](const httplib::Request & req, httplib::Response & res) { @@ -3642,7 +3618,16 @@ int main(int argc, char ** argv) { json responses = json::array(); bool error = false; { - std::vector tasks = ctx_server.create_tasks_inference({{"prompt", prompt}}, SERVER_TASK_INF_TYPE_EMBEDDING); + std::vector tasks; + std::vector tokenized_prompts = tokenize_input_prompts(ctx_server.ctx, prompt, /* add_special */ false, true); + for (size_t i = 0; i < tokenized_prompts.size(); i++) { + server_task task = server_task(SERVER_TASK_TYPE_EMBEDDING); + task.id = ctx_server.queue_tasks.get_new_id(); + task.index = i; + task.prompt_tokens = std::move(tokenized_prompts[i]); + tasks.push_back(task); + } + ctx_server.queue_results.add_waiting_tasks(tasks); ctx_server.queue_tasks.post(tasks); @@ -3669,7 +3654,7 @@ int main(int argc, char ** argv) { // write JSON response json root = oaicompat ? format_embeddings_response_oaicompat(body, responses) - : responses[0]; + : responses.size() == 1 ? responses[0] : json(responses); res_ok(res, root); }; @@ -3708,20 +3693,23 @@ int main(int argc, char ** argv) { return; } - // construct prompt object: array of ["query", "doc0", "doc1", ...] - json prompt; - prompt.push_back(query); - for (const auto & doc : documents) { - prompt.push_back(doc); - } - - LOG_DBG("rerank prompt: %s\n", prompt.dump().c_str()); + llama_tokens tokenized_query = tokenize_input_prompts(ctx_server.ctx, query, /* add_special */ false, true)[0]; // create and queue the task json responses = json::array(); bool error = false; { - std::vector tasks = ctx_server.create_tasks_inference({{"prompt", prompt}}, SERVER_TASK_INF_TYPE_RERANK); + std::vector tasks; + std::vector tokenized_docs = tokenize_input_prompts(ctx_server.ctx, documents, /* add_special */ false, true); + tasks.reserve(tokenized_docs.size()); + for (size_t i = 0; i < tokenized_docs.size(); i++) { + server_task task = server_task(SERVER_TASK_TYPE_RERANK); + task.id = ctx_server.queue_tasks.get_new_id(); + task.index = i; + task.prompt_tokens = format_rerank(ctx_server.model, tokenized_query, tokenized_docs[i]); + tasks.push_back(task); + } + ctx_server.queue_results.add_waiting_tasks(tasks); ctx_server.queue_tasks.post(tasks); @@ -3782,13 +3770,13 @@ int main(int argc, char ** argv) { } } - server_task task; - task.type = SERVER_TASK_TYPE_SET_LORA; - const int id_task = ctx_server.queue_tasks.post(task); - ctx_server.queue_results.add_waiting_task_id(id_task); + server_task task(SERVER_TASK_TYPE_SET_LORA); + task.id = ctx_server.queue_tasks.get_new_id(); + ctx_server.queue_results.add_waiting_task_id(task.id); + ctx_server.queue_tasks.post(task); - server_task_result_ptr result = ctx_server.queue_results.recv(id_task); - ctx_server.queue_results.remove_waiting_task_id(id_task); + server_task_result_ptr result = ctx_server.queue_results.recv(task.id); + ctx_server.queue_results.remove_waiting_task_id(task.id); if (result->is_error()) { res_error(res, result->to_json()); diff --git a/examples/server/tests/unit/test_basic.py b/examples/server/tests/unit/test_basic.py index d82d54a5a6f47..1d512401685fb 100644 --- a/examples/server/tests/unit/test_basic.py +++ b/examples/server/tests/unit/test_basic.py @@ -22,7 +22,12 @@ def test_server_props(): server.start() res = server.make_request("GET", "/props") assert res.status_code == 200 + assert ".gguf" in res.body["model_path"] assert res.body["total_slots"] == server.n_slots + default_val = res.body["default_generation_settings"] + assert server.n_ctx is not None and server.n_slots is not None + assert default_val["n_ctx"] == server.n_ctx / server.n_slots + assert default_val["params"]["seed"] == server.seed def test_server_models(): @@ -33,6 +38,31 @@ def test_server_models(): assert len(res.body["data"]) == 1 assert res.body["data"][0]["id"] == server.model_alias + +def test_server_slots(): + global server + + # without slots endpoint enabled, this should return error + server.server_slots = False + server.start() + res = server.make_request("GET", "/slots") + assert res.status_code == 501 # ERROR_TYPE_NOT_SUPPORTED + assert "error" in res.body + server.stop() + + # with slots endpoint enabled, this should return slots info + server.server_slots = True + server.n_slots = 2 + server.start() + res = server.make_request("GET", "/slots") + assert res.status_code == 200 + assert len(res.body) == server.n_slots + assert server.n_ctx is not None and server.n_slots is not None + assert res.body[0]["n_ctx"] == server.n_ctx / server.n_slots + assert "params" in res.body[0] + assert res.body[0]["params"]["seed"] == server.seed + + def test_load_split_model(): global server server.model_hf_repo = "ggml-org/models" diff --git a/examples/server/tests/unit/test_chat_completion.py b/examples/server/tests/unit/test_chat_completion.py index f13c6c4ca4bd3..6573cc17f7b87 100644 --- a/examples/server/tests/unit/test_chat_completion.py +++ b/examples/server/tests/unit/test_chat_completion.py @@ -30,6 +30,7 @@ def test_chat_completion(model, system_prompt, user_prompt, max_tokens, re_conte ], }) assert res.status_code == 200 + assert "cmpl" in res.body["id"] # make sure the completion id has the expected format assert res.body["model"] == model if model is not None else server.model_alias assert res.body["usage"]["prompt_tokens"] == n_prompt assert res.body["usage"]["completion_tokens"] == n_predicted @@ -59,9 +60,13 @@ def test_chat_completion_stream(system_prompt, user_prompt, max_tokens, re_conte "stream": True, }) content = "" + last_cmpl_id = None for data in res: choice = data["choices"][0] assert "gpt-3.5" in data["model"] # DEFAULT_OAICOMPAT_MODEL, maybe changed in the future + if last_cmpl_id is None: + last_cmpl_id = data["id"] + assert last_cmpl_id == data["id"] # make sure the completion id is the same for all events in the stream if choice["finish_reason"] in ["stop", "length"]: assert data["usage"]["prompt_tokens"] == n_prompt assert data["usage"]["completion_tokens"] == n_predicted diff --git a/examples/server/tests/utils.py b/examples/server/tests/utils.py index e17a05ff6902a..69215eaa4ebb7 100644 --- a/examples/server/tests/utils.py +++ b/examples/server/tests/utils.py @@ -64,6 +64,7 @@ class ServerProcess: server_embeddings: bool | None = False server_reranking: bool | None = False server_metrics: bool | None = False + server_slots: bool | None = False draft: int | None = None api_key: str | None = None response_format: str | None = None @@ -91,7 +92,6 @@ def start(self, timeout_seconds: int = 10) -> None: else: server_path = "../../../build/bin/llama-server" server_args = [ - "--slots", # requires to get slot status via /slots endpoint "--host", self.server_host, "--port", @@ -129,6 +129,8 @@ def start(self, timeout_seconds: int = 10) -> None: server_args.append("--reranking") if self.server_metrics: server_args.append("--metrics") + if self.server_slots: + server_args.append("--slots") if self.model_alias: server_args.extend(["--alias", self.model_alias]) if self.n_ctx: @@ -181,7 +183,7 @@ def start(self, timeout_seconds: int = 10) -> None: start_time = time.time() while time.time() - start_time < timeout_seconds: try: - response = self.make_request("GET", "/slots", headers={ + response = self.make_request("GET", "/health", headers={ "Authorization": f"Bearer {self.api_key}" if self.api_key else None }) if response.status_code == 200: @@ -224,7 +226,7 @@ def make_request( result.headers = dict(response.headers) result.status_code = response.status_code result.body = response.json() if parse_body else None - print("Response from server", result.body) + print("Response from server", json.dumps(result.body, indent=2)) return result def make_stream_request( @@ -245,7 +247,7 @@ def make_stream_request( break elif line.startswith('data: '): data = json.loads(line[6:]) - print("Partial response from server", data) + print("Partial response from server", json.dumps(data, indent=2)) yield data diff --git a/examples/server/utils.hpp b/examples/server/utils.hpp index c9fe7d966b209..8f545aea52dc4 100644 --- a/examples/server/utils.hpp +++ b/examples/server/utils.hpp @@ -164,6 +164,9 @@ static std::vector tokenize_input_prompts(llama_context * ctx, con } else { throw std::runtime_error("\"prompt\" must be a string, an list of tokens, a list of mixed strings & tokens, or a list of prompts"); } + if (result.empty()) { + throw std::runtime_error("\"prompt\" must not be empty"); + } return result; } @@ -496,8 +499,6 @@ static json oaicompat_completion_params_parse( const std::string & chat_template) { json llama_params; - llama_params["__oaicompat"] = true; - // Apply chat template to the list of messages llama_params["prompt"] = format_chat(model, chat_template, body.at("messages")); @@ -648,3 +649,18 @@ static json format_detokenized_response(const std::string & content) { {"content", content} }; } + +static json format_logit_bias(const std::vector & logit_bias) { + json data = json::array(); + for (const auto & lb : logit_bias) { + data.push_back(json{ + {"bias", lb.bias}, + {"token", lb.token}, + }); + } + return data; +} + +static std::string safe_json_to_str(json data) { + return data.dump(-1, ' ', false, json::error_handler_t::replace); +} From 9651dc8a80656ccb99d43e8ac85c616e77c424af Mon Sep 17 00:00:00 2001 From: Robert Collins Date: Sat, 7 Dec 2024 16:12:27 -0500 Subject: [PATCH 164/245] llama : add 128k yarn context for Qwen (#10698) * add 128k yarn context for Qwen * added property for model tensors * removing useless line --- convert_hf_to_gguf.py | 8 ++++++++ gguf-py/gguf/constants.py | 1 + 2 files changed, 9 insertions(+) diff --git a/convert_hf_to_gguf.py b/convert_hf_to_gguf.py index a4eece934021f..c63d929c187a8 100755 --- a/convert_hf_to_gguf.py +++ b/convert_hf_to_gguf.py @@ -1992,6 +1992,14 @@ def set_vocab(self): except FileNotFoundError: self._set_vocab_gpt2() + def set_gguf_parameters(self): + super().set_gguf_parameters() + if self.hparams.get("rope_scaling") is not None and "factor" in self.hparams["rope_scaling"]: + if self.hparams["rope_scaling"].get("type") == "yarn": + self.gguf_writer.add_rope_scaling_type(gguf.RopeScalingType.YARN) + self.gguf_writer.add_rope_scaling_factor(self.hparams["rope_scaling"]["factor"]) + self.gguf_writer.add_rope_scaling_orig_ctx_len(self.hparams["rope_scaling"]["original_max_position_embeddings"]) + @Model.register("Qwen2MoeForCausalLM") class Qwen2MoeModel(Model): diff --git a/gguf-py/gguf/constants.py b/gguf-py/gguf/constants.py index 66247b80302e6..4c8710b39e830 100644 --- a/gguf-py/gguf/constants.py +++ b/gguf-py/gguf/constants.py @@ -761,6 +761,7 @@ class MODEL_TENSOR(IntEnum): MODEL_TENSOR.TOKEN_EMBD, MODEL_TENSOR.OUTPUT_NORM, MODEL_TENSOR.OUTPUT, + MODEL_TENSOR.ROPE_FREQS, MODEL_TENSOR.ATTN_NORM, MODEL_TENSOR.ATTN_Q, MODEL_TENSOR.ATTN_K, From 14d46933b9dde09da930b874681222a2fdb638cd Mon Sep 17 00:00:00 2001 From: Jeff Bolz Date: Sun, 8 Dec 2024 02:05:55 -0600 Subject: [PATCH 165/245] vulkan: compile a test shader in cmake to check for coopmat2 support (#10713) --- ggml/src/ggml-vulkan/CMakeLists.txt | 14 ++++++++++++++ ggml/src/ggml-vulkan/ggml-vulkan.cpp | 19 +++++++++++++------ .../vulkan-shaders/test_coopmat2_support.comp | 7 +++++++ .../vulkan-shaders/vulkan-shaders-gen.cpp | 4 ++-- 4 files changed, 36 insertions(+), 8 deletions(-) create mode 100644 ggml/src/ggml-vulkan/vulkan-shaders/test_coopmat2_support.comp diff --git a/ggml/src/ggml-vulkan/CMakeLists.txt b/ggml/src/ggml-vulkan/CMakeLists.txt index ae0485e04255d..546c853b6b728 100644 --- a/ggml/src/ggml-vulkan/CMakeLists.txt +++ b/ggml/src/ggml-vulkan/CMakeLists.txt @@ -8,6 +8,20 @@ if (Vulkan_FOUND) ../../include/ggml-vulkan.h ) + # Compile a test shader to determine whether GL_NV_cooperative_matrix2 is supported. + # If it's not, there will be an error to stderr. + # If it's supported, set a define to indicate that we should compile those shaders + execute_process(COMMAND ${Vulkan_GLSLC_EXECUTABLE} -o - -fshader-stage=compute --target-env=vulkan1.3 "${CMAKE_CURRENT_SOURCE_DIR}/vulkan-shaders/test_coopmat2_support.comp" + OUTPUT_VARIABLE glslc_output + ERROR_VARIABLE glslc_error) + + if (${glslc_error} MATCHES ".*extension not supported: GL_NV_cooperative_matrix2.*") + message(STATUS "GL_NV_cooperative_matrix2 not supported by glslc") + else() + message(STATUS "GL_NV_cooperative_matrix2 supported by glslc") + add_compile_definitions(GGML_VULKAN_COOPMAT2_GLSLC_SUPPORT) + endif() + target_link_libraries(ggml-vulkan PRIVATE Vulkan::Vulkan) target_include_directories(ggml-vulkan PRIVATE ${CMAKE_CURRENT_BINARY_DIR}) diff --git a/ggml/src/ggml-vulkan/ggml-vulkan.cpp b/ggml/src/ggml-vulkan/ggml-vulkan.cpp index 4b2e449a58b40..9e2de9439abbc 100644 --- a/ggml/src/ggml-vulkan/ggml-vulkan.cpp +++ b/ggml/src/ggml-vulkan/ggml-vulkan.cpp @@ -1513,7 +1513,7 @@ static void ggml_vk_load_shaders(vk_device& device) { compiles.push_back(std::async(ggml_vk_create_pipeline_func, std::ref(device), std::ref(pipeline), name, spv_size, spv_data, entrypoint, parameter_count, push_constant_size, wg_denoms, specialization_constants, align, disable_robustness)); }; -#if defined(VK_NV_cooperative_matrix2) +#if defined(VK_NV_cooperative_matrix2) && defined(GGML_VULKAN_COOPMAT2_GLSLC_SUPPORT) if (device->coopmat2) { auto const &fa_wg_denoms = [&](uint32_t D, uint32_t clamp, ggml_type type, bool small_rows) -> std::array { @@ -1611,7 +1611,7 @@ static void ggml_vk_load_shaders(vk_device& device) { #undef CREATE_MM #undef CREATE_MM2 } else -#endif // defined(VK_NV_cooperative_matrix2) +#endif // defined(VK_NV_cooperative_matrix2) && defined(GGML_VULKAN_COOPMAT2_GLSLC_SUPPORT) if (device->coopmat_support) { // Create 6 variants, {s,m,l}x{unaligned,aligned} #define CREATE_MM(PIPELINE_NAME, NAMELC, F16ACC, WG_DENOMS, WARPTILE, PUSHCONST, PARAMCOUNT, ID) \ @@ -2153,7 +2153,7 @@ static vk_device ggml_vk_get_device(size_t idx) { device->coopmat_support = device->coopmat_support && coopmat_features.cooperativeMatrix; if (coopmat2_support) { -#if defined(VK_NV_cooperative_matrix2) +#if defined(VK_NV_cooperative_matrix2) && defined(GGML_VULKAN_COOPMAT2_GLSLC_SUPPORT) if (coopmat2_features.cooperativeMatrixWorkgroupScope && coopmat2_features.cooperativeMatrixFlexibleDimensions && coopmat2_features.cooperativeMatrixReductions && @@ -2414,14 +2414,19 @@ static void ggml_vk_print_gpu_info(size_t idx) { bool fp16_storage = false; bool fp16_compute = false; bool coopmat_support = false; + bool coopmat2_support = false; for (auto properties : ext_props) { if (strcmp("VK_KHR_16bit_storage", properties.extensionName) == 0) { fp16_storage = true; } else if (strcmp("VK_KHR_shader_float16_int8", properties.extensionName) == 0) { fp16_compute = true; - } else if (strcmp("VK_KHR_cooperative_matrix", properties.extensionName) == 0) { + } else if (strcmp("VK_KHR_cooperative_matrix", properties.extensionName) == 0 && + !getenv("GGML_VK_DISABLE_COOPMAT")) { coopmat_support = true; + } else if (strcmp("VK_NV_cooperative_matrix2", properties.extensionName) == 0 && + !getenv("GGML_VK_DISABLE_COOPMAT2")) { + coopmat2_support = true; } } @@ -2472,9 +2477,11 @@ static void ggml_vk_print_gpu_info(size_t idx) { coopmat_support = coopmat_support && coopmat_features.cooperativeMatrix; + std::string matrix_cores = coopmat2_support ? "NV_coopmat2" : coopmat_support ? "KHR_coopmat" : "none"; + std::string device_name = props2.properties.deviceName.data(); - GGML_LOG_DEBUG("ggml_vulkan: %zu = %s (%s) | uma: %d | fp16: %d | warp size: %zu | matrix cores: %d\n", - idx, device_name.c_str(), driver_props.driverName.data(), uma, fp16, subgroup_size, coopmat_support); + GGML_LOG_DEBUG("ggml_vulkan: %zu = %s (%s) | uma: %d | fp16: %d | warp size: %zu | matrix cores: %s\n", + idx, device_name.c_str(), driver_props.driverName.data(), uma, fp16, subgroup_size, matrix_cores.c_str()); if (props2.properties.deviceType == vk::PhysicalDeviceType::eCpu) { GGML_LOG_DEBUG("ggml_vulkan: Warning: Device type is CPU. This is probably not the device you want.\n"); diff --git a/ggml/src/ggml-vulkan/vulkan-shaders/test_coopmat2_support.comp b/ggml/src/ggml-vulkan/vulkan-shaders/test_coopmat2_support.comp new file mode 100644 index 0000000000000..28eb24e11f871 --- /dev/null +++ b/ggml/src/ggml-vulkan/vulkan-shaders/test_coopmat2_support.comp @@ -0,0 +1,7 @@ +#version 460 + +#extension GL_NV_cooperative_matrix2 : require + +void main() +{ +} diff --git a/ggml/src/ggml-vulkan/vulkan-shaders/vulkan-shaders-gen.cpp b/ggml/src/ggml-vulkan/vulkan-shaders/vulkan-shaders-gen.cpp index c4967ed658eb2..bc6fca5066998 100644 --- a/ggml/src/ggml-vulkan/vulkan-shaders/vulkan-shaders-gen.cpp +++ b/ggml/src/ggml-vulkan/vulkan-shaders/vulkan-shaders-gen.cpp @@ -342,14 +342,14 @@ void process_shaders() { matmul_shaders(true, matmul_id, true, false, false); matmul_shaders(true, matmul_id, true, false, true); -#if defined(VK_NV_cooperative_matrix2) +#if defined(GGML_VULKAN_COOPMAT2_GLSLC_SUPPORT) // Coopmat2, fp32acc and fp16acc matmul_shaders(true, matmul_id, false, true, false); matmul_shaders(true, matmul_id, false, true, true); #endif } -#if defined(VK_NV_cooperative_matrix2) +#if defined(GGML_VULKAN_COOPMAT2_GLSLC_SUPPORT) // flash attention for (const auto& f16acc : {false, true}) { std::string acctype = f16acc ? "float16_t" : "float"; From fd9cf67133aa4312844adb6a6323bc844118572b Mon Sep 17 00:00:00 2001 From: Diego Devesa Date: Sun, 8 Dec 2024 12:14:54 +0100 Subject: [PATCH 166/245] llama : use cmake for swift build (#10525) * llama : use cmake for swift build * swift : <> -> "" * ci : remove make * ci : disable ios build * Revert "swift : <> -> """ This reverts commit d39ffd9556482b77d4ea5b118b453fc1c097a31d. * ci : try fix ios build * ci : cont * ci : cont --------- Co-authored-by: Georgi Gerganov --- .github/workflows/build.yml | 107 ++++++++++-------- Package.swift | 78 +------------ Sources/llama/llama.h | 4 + Sources/llama/module.modulemap | 5 + cmake/llama.pc.in | 2 +- .../llama.cpp.swift/LibLlama.swift | 8 +- .../llama.swiftui.xcodeproj/project.pbxproj | 8 +- 7 files changed, 81 insertions(+), 131 deletions(-) create mode 100644 Sources/llama/llama.h create mode 100644 Sources/llama/module.modulemap diff --git a/.github/workflows/build.yml b/.github/workflows/build.yml index f3326a5fbab82..886d33d2d5640 100644 --- a/.github/workflows/build.yml +++ b/.github/workflows/build.yml @@ -552,35 +552,44 @@ jobs: -DCMAKE_XCODE_ATTRIBUTE_DEVELOPMENT_TEAM=ggml cmake --build . --config Release -j $(sysctl -n hw.logicalcpu) -- CODE_SIGNING_ALLOWED=NO -# TODO: tmp disabled. see for possible re-enable: -# https://github.com/ggerganov/llama.cpp/pull/10525 -# macOS-latest-swift: -# runs-on: macos-latest -# -# strategy: -# matrix: -# destination: ['generic/platform=macOS', 'generic/platform=iOS', 'generic/platform=tvOS'] -# -# steps: -# - name: Clone -# id: checkout -# uses: actions/checkout@v4 -# -# - name: Dependencies -# id: depends -# continue-on-error: true -# run: | -# brew update -# -# - name: xcodebuild for swift package -# id: xcodebuild -# run: | -# xcodebuild -scheme llama -destination "${{ matrix.destination }}" -# -# - name: Build Swift Example -# id: make_build_swift_example -# run: | -# make swift + macOS-latest-swift: + runs-on: macos-latest + + strategy: + matrix: + destination: ['generic/platform=macOS', 'generic/platform=iOS', 'generic/platform=tvOS'] + + steps: + - name: Clone + id: checkout + uses: actions/checkout@v4 + + - name: Dependencies + id: depends + continue-on-error: true + run: | + brew update + + - name: Build llama.cpp with CMake + id: cmake_build + run: | + sysctl -a + mkdir build + cd build + cmake -G Xcode .. \ + -DGGML_METAL_USE_BF16=ON \ + -DGGML_METAL_EMBED_LIBRARY=ON \ + -DLLAMA_BUILD_EXAMPLES=OFF \ + -DLLAMA_BUILD_TESTS=OFF \ + -DLLAMA_BUILD_SERVER=OFF \ + -DCMAKE_OSX_ARCHITECTURES="arm64;x86_64" + cmake --build . --config Release -j $(sysctl -n hw.logicalcpu) + sudo cmake --install . --config Release + + - name: xcodebuild for swift package + id: xcodebuild + run: | + xcodebuild -scheme llama-Package -destination "${{ matrix.destination }}" windows-msys2: runs-on: windows-latest @@ -1104,6 +1113,29 @@ jobs: - name: Checkout code uses: actions/checkout@v4 + - name: Build + id: cmake_build + run: | + sysctl -a + mkdir build + cd build + cmake -G Xcode .. \ + -DGGML_METAL_USE_BF16=ON \ + -DGGML_METAL_EMBED_LIBRARY=ON \ + -DLLAMA_BUILD_EXAMPLES=OFF \ + -DLLAMA_BUILD_TESTS=OFF \ + -DLLAMA_BUILD_SERVER=OFF \ + -DCMAKE_SYSTEM_NAME=iOS \ + -DCMAKE_OSX_DEPLOYMENT_TARGET=14.0 \ + -DCMAKE_XCODE_ATTRIBUTE_DEVELOPMENT_TEAM=ggml + cmake --build . --config Release -j $(sysctl -n hw.logicalcpu) -- CODE_SIGNING_ALLOWED=NO + sudo cmake --install . --config Release + + - name: xcodebuild for swift package + id: xcodebuild + run: | + xcodebuild -scheme llama-Package -destination 'generic/platform=iOS' + - name: Build Xcode project run: xcodebuild -project examples/llama.swiftui/llama.swiftui.xcodeproj -scheme llama.swiftui -sdk iphoneos CODE_SIGNING_REQUIRED=NO CODE_SIGN_IDENTITY= -destination 'generic/platform=iOS' build @@ -1131,23 +1163,6 @@ jobs: ./gradlew build --no-daemon -# freeBSD-latest: -# runs-on: macos-12 -# steps: -# - name: Clone -# uses: actions/checkout@v4 -# -# - name: Build -# uses: cross-platform-actions/action@v0.19.0 -# with: -# operating_system: freebsd -# version: '13.2' -# hypervisor: 'qemu' -# run: | -# sudo pkg update -# sudo pkg install -y gmake automake autoconf pkgconf llvm15 openblas -# gmake CC=/usr/local/bin/clang15 CXX=/usr/local/bin/clang++15 -j `sysctl -n hw.ncpu` - release: if: ${{ ( github.event_name == 'push' && github.ref == 'refs/heads/master' ) || github.event.inputs.create_release == 'true' }} diff --git a/Package.swift b/Package.swift index 3afeb2f1930e4..01c996d242037 100644 --- a/Package.swift +++ b/Package.swift @@ -2,60 +2,6 @@ import PackageDescription -var sources = [ - "src/llama.cpp", - "src/llama-vocab.cpp", - "src/llama-grammar.cpp", - "src/llama-sampling.cpp", - "src/unicode.cpp", - "src/unicode-data.cpp", - "ggml/src/ggml.c", - "ggml/src/ggml-alloc.c", - "ggml/src/ggml-backend.cpp", - "ggml/src/ggml-backend-reg.cpp", - "ggml/src/ggml-cpu/ggml-cpu.c", - "ggml/src/ggml-cpu/ggml-cpu.cpp", - "ggml/src/ggml-cpu/ggml-cpu-aarch64.cpp", - "ggml/src/ggml-cpu/ggml-cpu-hbm.cpp", - "ggml/src/ggml-cpu/ggml-cpu-quants.c", - "ggml/src/ggml-cpu/ggml-cpu-traits.cpp", - "ggml/src/ggml-threading.cpp", - "ggml/src/ggml-quants.c", -] - -var resources: [Resource] = [] -var linkerSettings: [LinkerSetting] = [] -var cSettings: [CSetting] = [ - .unsafeFlags(["-Wno-shorten-64-to-32", "-O3", "-DNDEBUG"]), - .unsafeFlags(["-fno-objc-arc"]), - .headerSearchPath("ggml/src"), - .headerSearchPath("ggml/src/ggml-cpu"), - // NOTE: NEW_LAPACK will required iOS version 16.4+ - // We should consider add this in the future when we drop support for iOS 14 - // (ref: ref: https://developer.apple.com/documentation/accelerate/1513264-cblas_sgemm?language=objc) - // .define("ACCELERATE_NEW_LAPACK"), - // .define("ACCELERATE_LAPACK_ILP64") - .define("GGML_USE_CPU"), -] - - -#if canImport(Darwin) -sources.append("ggml/src/ggml-common.h") -sources.append("ggml/src/ggml-metal/ggml-metal.m") -resources.append(.process("ggml/src/ggml-metal/ggml-metal.metal")) -linkerSettings.append(.linkedFramework("Accelerate")) -cSettings.append( - contentsOf: [ - .define("GGML_USE_ACCELERATE"), - .define("GGML_USE_METAL"), - ] -) -#endif - -#if os(Linux) - cSettings.append(.define("_GNU_SOURCE")) -#endif - let package = Package( name: "llama", platforms: [ @@ -68,26 +14,6 @@ let package = Package( .library(name: "llama", targets: ["llama"]), ], targets: [ - .target( - name: "llama", - path: ".", - exclude: [ - "build", - "cmake", - "examples", - "scripts", - "models", - "tests", - "CMakeLists.txt", - "Makefile", - "ggml/src/ggml-metal-embed.metal" - ], - sources: sources, - resources: resources, - publicHeadersPath: "spm-headers", - cSettings: cSettings, - linkerSettings: linkerSettings - ) - ], - cxxLanguageStandard: .cxx17 + .systemLibrary(name: "llama", pkgConfig: "llama"), + ] ) diff --git a/Sources/llama/llama.h b/Sources/llama/llama.h new file mode 100644 index 0000000000000..41725880ed8c0 --- /dev/null +++ b/Sources/llama/llama.h @@ -0,0 +1,4 @@ +#pragma once + +#include + diff --git a/Sources/llama/module.modulemap b/Sources/llama/module.modulemap new file mode 100644 index 0000000000000..d010555b1cb65 --- /dev/null +++ b/Sources/llama/module.modulemap @@ -0,0 +1,5 @@ +module llama [system] { + header "llama.h" + link "llama" + export * +} diff --git a/cmake/llama.pc.in b/cmake/llama.pc.in index 326acbb6108fd..0b2b6bcfabfd1 100644 --- a/cmake/llama.pc.in +++ b/cmake/llama.pc.in @@ -6,5 +6,5 @@ includedir=${prefix}/include Name: llama Description: Port of Facebook's LLaMA model in C/C++ Version: @PROJECT_VERSION@ -Libs: -L${libdir} -lllama +Libs: -L${libdir} -lggml -lggml-base -lllama Cflags: -I${includedir} diff --git a/examples/llama.swiftui/llama.cpp.swift/LibLlama.swift b/examples/llama.swiftui/llama.cpp.swift/LibLlama.swift index 65cd4eb515c7f..998c673d5d31f 100644 --- a/examples/llama.swiftui/llama.cpp.swift/LibLlama.swift +++ b/examples/llama.swiftui/llama.cpp.swift/LibLlama.swift @@ -210,20 +210,20 @@ actor LlamaContext { llama_kv_cache_clear(context) - let t_pp_start = ggml_time_us() + let t_pp_start = DispatchTime.now().uptimeNanoseconds / 1000; if llama_decode(context, batch) != 0 { print("llama_decode() failed during prompt") } llama_synchronize(context) - let t_pp_end = ggml_time_us() + let t_pp_end = DispatchTime.now().uptimeNanoseconds / 1000; // bench text generation llama_kv_cache_clear(context) - let t_tg_start = ggml_time_us() + let t_tg_start = DispatchTime.now().uptimeNanoseconds / 1000; for i in 0.. Date: Sun, 8 Dec 2024 19:19:19 +0100 Subject: [PATCH 167/245] Vulkan: fix NaN in tanh.comp with AMD proprietary driver on Windows (#10723) * Vulkan: fix NaN in tanh.comp * Faster NaN-free tanh --- ggml/src/ggml-vulkan/vulkan-shaders/tanh.comp | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/ggml/src/ggml-vulkan/vulkan-shaders/tanh.comp b/ggml/src/ggml-vulkan/vulkan-shaders/tanh.comp index 74630dc7fef12..495f966bdc6e5 100644 --- a/ggml/src/ggml-vulkan/vulkan-shaders/tanh.comp +++ b/ggml/src/ggml-vulkan/vulkan-shaders/tanh.comp @@ -16,6 +16,5 @@ void main() { if (i >= p.KX) { return; } - - data_d[i] = D_TYPE(tanh(data_a[i])); + data_d[i] = D_TYPE(1. - 2. / (exp(2.*data_a[i]) + 1.)); } From 5eb735876f2bb5cbd98b5f8704fdba946dfe1b9c Mon Sep 17 00:00:00 2001 From: Xuan Son Nguyen Date: Sun, 8 Dec 2024 20:38:51 +0100 Subject: [PATCH 168/245] server : bring back info of final chunk in stream mode (#10722) * server : bring back into to final chunk in stream mode * clarify a bit * traling space --- examples/server/server.cpp | 174 +++++++++--------- examples/server/tests/unit/test_completion.py | 6 + 2 files changed, 94 insertions(+), 86 deletions(-) diff --git a/examples/server/server.cpp b/examples/server/server.cpp index 1c21e55aaa011..1d9c0533d4c40 100644 --- a/examples/server/server.cpp +++ b/examples/server/server.cpp @@ -392,7 +392,7 @@ struct server_task_result { return false; } virtual bool is_stop() { - // only used by server_task_result_cmpl_partial + // only used by server_task_result_cmpl_* return false; } virtual int get_index() { @@ -478,14 +478,20 @@ struct server_task_result_cmpl_final : server_task_result { return index; } + virtual bool is_stop() override { + return true; // in stream mode, final responses are considered stop + } + virtual json to_json() override { - return oaicompat ? to_json_oaicompat_chat() : to_json_non_oaicompat(); + return oaicompat + ? (stream ? to_json_oaicompat_chat_stream() : to_json_oaicompat_chat()) + : to_json_non_oaicompat(); } json to_json_non_oaicompat() { json res = json { {"index", index}, - {"content", content}, + {"content", stream ? "" : content}, // in stream mode, content is already in last partial chunk {"id_slot", id_slot}, {"stop", true}, {"model", oaicompat_model}, @@ -546,18 +552,46 @@ struct server_task_result_cmpl_final : server_task_result { return res; } + + json to_json_oaicompat_chat_stream() { + std::time_t t = std::time(0); + std::string finish_reason = "length"; + if (stop == STOP_TYPE_WORD || stop == STOP_TYPE_EOS) { + finish_reason = "stop"; + } + + json choices = json::array({json{{"finish_reason", finish_reason}, + {"index", 0}, + {"delta", json::object()}}}); + + json ret = json { + {"choices", choices}, + {"created", t}, + {"id", oaicompat_cmpl_id}, + {"model", oaicompat_model}, + {"object", "chat.completion.chunk"}, + {"usage", json { + {"completion_tokens", n_decoded}, + {"prompt_tokens", n_prompt_tokens}, + {"total_tokens", n_decoded + n_prompt_tokens}, + }}, + }; + + if (timings.prompt_n >= 0) { + ret.push_back({"timings", timings.to_json()}); + } + + return ret; + } }; struct server_task_result_cmpl_partial : server_task_result { int index = 0; std::string content; - bool truncated; int32_t n_decoded; int32_t n_prompt_tokens; - stop_type stop = STOP_TYPE_NONE; - std::vector probs_output; result_timings timings; @@ -573,20 +607,19 @@ struct server_task_result_cmpl_partial : server_task_result { } virtual bool is_stop() override { - return stop != STOP_TYPE_NONE; + return false; // in stream mode, partial responses are not considered stop } virtual json to_json() override { - if (oaicompat) { - return to_json_oaicompat(); - } - bool is_stop = stop != STOP_TYPE_NONE; + return oaicompat ? to_json_oaicompat() : to_json_non_oaicompat(); + } + + json to_json_non_oaicompat() { // non-OAI-compat JSON json res = json { {"index", index}, {"content", content}, - {"stop_type", stop_type_to_str(stop)}, - {"stop", is_stop}, + {"stop", false}, {"id_slot", id_slot}, {"tokens_predicted", n_decoded}, {"tokens_evaluated", n_prompt_tokens}, @@ -598,72 +631,54 @@ struct server_task_result_cmpl_partial : server_task_result { if (!probs_output.empty()) { res["completion_probabilities"] = completion_token_output::probs_vector_to_json(probs_output); } - if (is_stop) { - res.push_back({"truncated", truncated}); - } return res; } json to_json_oaicompat() { bool first = n_decoded == 0; - - std::string finish_reason; - if (stop == STOP_TYPE_WORD || stop == STOP_TYPE_EOS) { - finish_reason = "stop"; - } else if (stop == STOP_TYPE_LIMIT) { - finish_reason = "length"; - } - std::time_t t = std::time(0); - json choices; - if (!finish_reason.empty()) { - choices = json::array({json{{"finish_reason", finish_reason}, - {"index", 0}, - {"delta", json::object()}}}); - } else { - if (first) { - if (content.empty()) { - choices = json::array({json{{"finish_reason", nullptr}, - {"index", 0}, - {"delta", json{{"role", "assistant"}}}}}); - } else { - // We have to send this as two updates to conform to openai behavior - json initial_ret = json{{"choices", json::array({json{ - {"finish_reason", nullptr}, + if (first) { + if (content.empty()) { + choices = json::array({json{{"finish_reason", nullptr}, {"index", 0}, - {"delta", json{ - {"role", "assistant"} - }}}})}, - {"created", t}, - {"id", oaicompat_cmpl_id}, - {"model", oaicompat_model}, - {"object", "chat.completion.chunk"}}; - - json second_ret = json{ - {"choices", json::array({json{{"finish_reason", nullptr}, - {"index", 0}, - {"delta", json{ - {"content", content}}} - }})}, - {"created", t}, - {"id", oaicompat_cmpl_id}, - {"model", oaicompat_model}, - {"object", "chat.completion.chunk"}}; - - return std::vector({initial_ret, second_ret}); - } + {"delta", json{{"role", "assistant"}}}}}); } else { - choices = json::array({json{ - {"finish_reason", nullptr}, - {"index", 0}, - {"delta", - json{ - {"content", content}, - }}, - }}); + // We have to send this as two updates to conform to openai behavior + json initial_ret = json{{"choices", json::array({json{ + {"finish_reason", nullptr}, + {"index", 0}, + {"delta", json{ + {"role", "assistant"} + }}}})}, + {"created", t}, + {"id", oaicompat_cmpl_id}, + {"model", oaicompat_model}, + {"object", "chat.completion.chunk"}}; + + json second_ret = json{ + {"choices", json::array({json{{"finish_reason", nullptr}, + {"index", 0}, + {"delta", json{ + {"content", content}}} + }})}, + {"created", t}, + {"id", oaicompat_cmpl_id}, + {"model", oaicompat_model}, + {"object", "chat.completion.chunk"}}; + + return std::vector({initial_ret, second_ret}); } + } else { + choices = json::array({json{ + {"finish_reason", nullptr}, + {"index", 0}, + {"delta", + json{ + {"content", content}, + }}, + }}); } json ret = json { @@ -678,14 +693,6 @@ struct server_task_result_cmpl_partial : server_task_result { ret.push_back({"timings", timings.to_json()}); } - if (!finish_reason.empty()) { - ret.push_back({"usage", json { - {"completion_tokens", n_decoded}, - {"prompt_tokens", n_prompt_tokens}, - {"total_tokens", n_decoded + n_prompt_tokens}, - }}); - } - return std::vector({ret}); } }; @@ -1888,12 +1895,9 @@ struct server_context { res->index = slot.index; res->content = tkn.text_to_send; - res->truncated = slot.truncated; res->n_decoded = slot.n_decoded; res->n_prompt_tokens = slot.n_prompt_tokens; - res->stop = slot.stop; - res->verbose = slot.params.verbose; res->oaicompat = slot.params.oaicompat; res->oaicompat_chat = slot.params.oaicompat_chat; @@ -1924,12 +1928,6 @@ struct server_context { } void send_final_response(server_slot & slot) { - if (slot.params.stream) { - // if in stream mode, send the last partial response - send_partial_response(slot, {0, "", {}}); - return; - } - auto res = std::make_unique(); res->id = slot.id_task; res->id_slot = slot.id; @@ -1948,6 +1946,7 @@ struct server_context { res->stop = slot.stop; res->verbose = slot.params.verbose; + res->stream = slot.params.stream; res->oaicompat = slot.params.oaicompat; res->oaicompat_chat = slot.params.oaicompat_chat; res->oaicompat_model = slot.params.oaicompat_model; @@ -2100,7 +2099,10 @@ struct server_context { return; } - GGML_ASSERT(dynamic_cast(result.get()) != nullptr); + GGML_ASSERT( + dynamic_cast(result.get()) != nullptr + || dynamic_cast(result.get()) != nullptr + ); if (!result_handler(result)) { cancel_tasks(id_tasks); break; diff --git a/examples/server/tests/unit/test_completion.py b/examples/server/tests/unit/test_completion.py index 1c3aa77de5bba..7f4f9cd038be4 100644 --- a/examples/server/tests/unit/test_completion.py +++ b/examples/server/tests/unit/test_completion.py @@ -42,10 +42,16 @@ def test_completion_stream(prompt: str, n_predict: int, re_content: str, n_promp }) content = "" for data in res: + assert "stop" in data and type(data["stop"]) == bool if data["stop"]: assert data["timings"]["prompt_n"] == n_prompt assert data["timings"]["predicted_n"] == n_predicted assert data["truncated"] == truncated + assert data["stop_type"] == "limit" + assert "generation_settings" in data + assert server.n_predict is not None + assert data["generation_settings"]["n_predict"] == min(n_predict, server.n_predict) + assert data["generation_settings"]["seed"] == server.seed assert match_regex(re_content, content) else: content += data["content"] From efc76613a0f2856a5db9d1a7a69ab9db21c40eca Mon Sep 17 00:00:00 2001 From: Xuan Son Nguyen Date: Sun, 8 Dec 2024 23:04:29 +0100 Subject: [PATCH 169/245] server : fix format_infill (#10724) * server : fix format_infill * fix * rename * update test * use another model * update test * update test * test_invalid_input_extra_req --- examples/server/server.cpp | 22 ++++++++++++++ examples/server/tests/unit/test_infill.py | 36 ++++++++++++++++++----- examples/server/tests/utils.py | 3 ++ 3 files changed, 53 insertions(+), 8 deletions(-) diff --git a/examples/server/server.cpp b/examples/server/server.cpp index 1d9c0533d4c40..47bfd6c4aac90 100644 --- a/examples/server/server.cpp +++ b/examples/server/server.cpp @@ -3484,6 +3484,11 @@ int main(int argc, char ** argv) { json data = json::parse(req.body); // validate input + if (data.contains("prompt") && !data.at("prompt").is_string()) { + // prompt is optional + res_error(res, format_error_response("\"prompt\" must be a string", ERROR_TYPE_INVALID_REQUEST)); + } + if (!data.contains("input_prefix")) { res_error(res, format_error_response("\"input_prefix\" is required", ERROR_TYPE_INVALID_REQUEST)); } @@ -3493,9 +3498,11 @@ int main(int argc, char ** argv) { } if (data.contains("input_extra") && !data.at("input_extra").is_array()) { + // input_extra is optional res_error(res, format_error_response("\"input_extra\" must be an array of {\"filename\": string, \"text\": string}", ERROR_TYPE_INVALID_REQUEST)); return; } + json input_extra = json_value(data, "input_extra", json::array()); for (const auto & chunk : input_extra) { // { "text": string, "filename": string } @@ -3511,6 +3518,21 @@ int main(int argc, char ** argv) { } data["input_extra"] = input_extra; // default to empty array if it's not exist + std::string prompt = json_value(data, "prompt", std::string()); + std::vector tokenized_prompts = tokenize_input_prompts(ctx_server.ctx, prompt, true, true); + SRV_DBG("creating infill tasks, n_prompts = %d\n", (int) tokenized_prompts.size()); + data["prompt"] = format_infill( + ctx_server.ctx, + data.at("input_prefix"), + data.at("input_suffix"), + data.at("input_extra"), + ctx_server.params_base.n_batch, + ctx_server.params_base.n_predict, + ctx_server.slots[0].n_ctx, // TODO: there should be a better way + ctx_server.params_base.spm_infill, + tokenized_prompts[0] + ); + return handle_completions_generic(SERVER_TASK_TYPE_INFILL, data, res); }; diff --git a/examples/server/tests/unit/test_infill.py b/examples/server/tests/unit/test_infill.py index 6a6d40a1cbc8b..ad4b8192a7875 100644 --- a/examples/server/tests/unit/test_infill.py +++ b/examples/server/tests/unit/test_infill.py @@ -13,28 +13,28 @@ def test_infill_without_input_extra(): global server server.start() res = server.make_request("POST", "/infill", data={ - "prompt": "Complete this", - "input_prefix": "#include \n#include \"llama.h\"\n\nint main() {\n int n_threads = llama_", + "input_prefix": "#include \n#include \"llama.h\"\n\nint main() {\n", + "prompt": " int n_threads = llama_", "input_suffix": "}\n", }) assert res.status_code == 200 - assert match_regex("(One|day|she|saw|big|scary|bird)+", res.body["content"]) + assert match_regex("(Ann|small|shiny)+", res.body["content"]) def test_infill_with_input_extra(): global server server.start() res = server.make_request("POST", "/infill", data={ - "prompt": "Complete this", "input_extra": [{ "filename": "llama.h", "text": "LLAMA_API int32_t llama_n_threads();\n" }], - "input_prefix": "#include \n#include \"llama.h\"\n\nint main() {\n int n_threads = llama_", + "input_prefix": "#include \n#include \"llama.h\"\n\nint main() {\n", + "prompt": " int n_threads = llama_", "input_suffix": "}\n", }) assert res.status_code == 200 - assert match_regex("(cuts|Jimmy|mom|came|into|the|room)+", res.body["content"]) + assert match_regex("(Dad|excited|park)+", res.body["content"]) @pytest.mark.parametrize("input_extra", [ @@ -48,10 +48,30 @@ def test_invalid_input_extra_req(input_extra): global server server.start() res = server.make_request("POST", "/infill", data={ - "prompt": "Complete this", "input_extra": [input_extra], - "input_prefix": "#include \n#include \"llama.h\"\n\nint main() {\n int n_threads = llama_", + "input_prefix": "#include \n#include \"llama.h\"\n\nint main() {\n", + "prompt": " int n_threads = llama_", "input_suffix": "}\n", }) assert res.status_code == 400 assert "error" in res.body + + +@pytest.mark.skipif(not is_slow_test_allowed(), reason="skipping slow test") +def test_with_qwen_model(): + global server + server.model_file = None + server.model_hf_repo = "ggml-org/Qwen2.5-Coder-1.5B-IQ3_XXS-GGUF" + server.model_hf_file = "qwen2.5-coder-1.5b-iq3_xxs-imat.gguf" + server.start(timeout_seconds=600) + res = server.make_request("POST", "/infill", data={ + "input_extra": [{ + "filename": "llama.h", + "text": "LLAMA_API int32_t llama_n_threads();\n" + }], + "input_prefix": "#include \n#include \"llama.h\"\n\nint main() {\n", + "prompt": " int n_threads = llama_", + "input_suffix": "}\n", + }) + assert res.status_code == 200 + assert res.body["content"] == "n_threads();\n printf(\"Number of threads: %d\\n\", n_threads);\n return 0;\n" diff --git a/examples/server/tests/utils.py b/examples/server/tests/utils.py index 69215eaa4ebb7..7c89b9cd37505 100644 --- a/examples/server/tests/utils.py +++ b/examples/server/tests/utils.py @@ -371,3 +371,6 @@ def match_regex(regex: str, text: str) -> bool: ).search(text) is not None ) + +def is_slow_test_allowed(): + return os.environ.get("SLOW_TESTS") == "1" or os.environ.get("SLOW_TESTS") == "ON" From e2edfebf2a5c3c403a0baecacbd909013ef33bb5 Mon Sep 17 00:00:00 2001 From: Borislav Stanimirov Date: Mon, 9 Dec 2024 09:15:13 +0200 Subject: [PATCH 170/245] cmake : simplify msvc charsets (#10672) --- CMakeLists.txt | 8 +++----- 1 file changed, 3 insertions(+), 5 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index f84fff9e6ad42..a717a508ff5d9 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -46,11 +46,9 @@ if (WIN32) add_compile_definitions(_CRT_SECURE_NO_WARNINGS) endif() -if ("${CMAKE_CXX_COMPILER_ID}" STREQUAL "MSVC") - add_compile_options("$<$:/source-charset:utf-8>") - add_compile_options("$<$:/source-charset:utf-8>") - add_compile_options("$<$:/execution-charset:utf-8>") - add_compile_options("$<$:/execution-charset:utf-8>") +if (MSVC) + add_compile_options("$<$:/utf-8>") + add_compile_options("$<$:/utf-8>") endif() # From 24d7c8f92b5af7763bfa44901d6816835c0b0cbf Mon Sep 17 00:00:00 2001 From: Jeff Bolz Date: Mon, 9 Dec 2024 01:24:01 -0600 Subject: [PATCH 171/245] vulkan: fix compile warnings (#10731) --- ggml/src/ggml-vulkan/ggml-vulkan.cpp | 24 +++++++++++++++++------- 1 file changed, 17 insertions(+), 7 deletions(-) diff --git a/ggml/src/ggml-vulkan/ggml-vulkan.cpp b/ggml/src/ggml-vulkan/ggml-vulkan.cpp index 9e2de9439abbc..5d9eba9833fa3 100644 --- a/ggml/src/ggml-vulkan/ggml-vulkan.cpp +++ b/ggml/src/ggml-vulkan/ggml-vulkan.cpp @@ -427,7 +427,7 @@ static_assert(sizeof(vk_op_unary_push_constants) <= 128, "sizeof(vk_op_unary_pus // and a shift: // // n/d = (mulhi(n, mp) + n) >> L; -void init_fastdiv_values(uint32_t d, uint32_t &mp, uint32_t &L) +static void init_fastdiv_values(uint32_t d, uint32_t &mp, uint32_t &L) { // compute L = ceil(log2(d)); L = 0; @@ -439,6 +439,7 @@ void init_fastdiv_values(uint32_t d, uint32_t &mp, uint32_t &L) } template void init_pushconst_fastdiv(T &p) { + GGML_UNUSED(p); static_assert(!std::is_const::value, "unexpected type"); } @@ -3417,7 +3418,7 @@ static uint32_t ggml_vk_guess_split_k(ggml_backend_vk_context * ctx, int m, int return split_k; } -static vk_pipeline ggml_vk_guess_matmul_pipeline(ggml_backend_vk_context * ctx, vk_matmul_pipeline& mmp, int m, int n, bool aligned, ggml_type type_a) { +static vk_pipeline ggml_vk_guess_matmul_pipeline(ggml_backend_vk_context * ctx, vk_matmul_pipeline& mmp, int m, int n, bool aligned) { VK_LOG_DEBUG("ggml_vk_guess_matmul_pipeline(" << m << ", " << n << ", " << aligned << ")"); if (ctx->device->coopmat2) { @@ -3439,9 +3440,9 @@ static vk_pipeline ggml_vk_guess_matmul_pipeline(ggml_backend_vk_context * ctx, return aligned ? mmp->a_l : mmp->l; } -static uint32_t ggml_vk_guess_matmul_pipeline_align(ggml_backend_vk_context * ctx, vk_matmul_pipeline& mmp, int m, int n, ggml_type type_a) { +static uint32_t ggml_vk_guess_matmul_pipeline_align(ggml_backend_vk_context * ctx, vk_matmul_pipeline& mmp, int m, int n) { VK_LOG_DEBUG("ggml_vk_guess_matmul_pipeline_align(" << m << ", " << n << ")"); - return ggml_vk_guess_matmul_pipeline(ctx, mmp, m, n, true, type_a)->align; + return ggml_vk_guess_matmul_pipeline(ctx, mmp, m, n, true)->align; } static void ggml_vk_matmul( @@ -3571,6 +3572,7 @@ static void ggml_vk_cpy_to_contiguous(ggml_backend_vk_context * ctx, vk_context& (uint32_t)tensor->ne[0], (uint32_t)tensor->ne[1], (uint32_t)tensor->ne[2], (uint32_t)tensor->ne[3], 1 , (uint32_t)tensor->ne[0] , (uint32_t)(tensor->ne[0] * tensor->ne[1]) , (uint32_t)(tensor->ne[0] * tensor->ne[1] * tensor->ne[2]), 0, 0.0f, 0.0f, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, }; init_pushconst_fastdiv(pc); ggml_vk_sync_buffers(subctx); @@ -3644,10 +3646,10 @@ static void ggml_vk_mul_mat_q_f16(ggml_backend_vk_context * ctx, vk_context& sub const int y_ne = ne11 * ne10; const int d_ne = ne11 * ne01; - const uint32_t kpad = ggml_vk_align_size(ne10, ggml_vk_guess_matmul_pipeline_align(ctx, mmp, ne01, ne11, src0->type)); + const uint32_t kpad = ggml_vk_align_size(ne10, ggml_vk_guess_matmul_pipeline_align(ctx, mmp, ne01, ne11)); const bool aligned = ne10 == kpad && ne01 > 8 && ne11 > 8; - vk_pipeline pipeline = ggml_vk_guess_matmul_pipeline(ctx, mmp, ne01, ne11, aligned, src0->type); + vk_pipeline pipeline = ggml_vk_guess_matmul_pipeline(ctx, mmp, ne01, ne11, aligned); const uint32_t split_k = ggml_vk_guess_split_k(ctx, ne01, ne11, ne10, pipeline); @@ -5351,7 +5353,8 @@ static void ggml_vk_scale(ggml_backend_vk_context * ctx, vk_context& subctx, con (uint32_t)src0->ne[0], (uint32_t)src0->ne[1], (uint32_t)src0->ne[2], (uint32_t)src0->ne[3], (uint32_t)src0->nb[0] / src0_type_size, (uint32_t)src0->nb[1] / src0_type_size, (uint32_t)src0->nb[2] / src0_type_size, (uint32_t)src0->nb[3] / src0_type_size, (uint32_t) dst->ne[0], (uint32_t) dst->ne[1], (uint32_t) dst->ne[2], (uint32_t) dst->ne[3], (uint32_t) dst->nb[0] / dst_type_size, (uint32_t) dst->nb[1] / dst_type_size, (uint32_t) dst->nb[2] / dst_type_size, (uint32_t) dst->nb[3] / dst_type_size, 0, - op_params[0], 0.0f + op_params[0], 0.0f, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, }, dryrun); } @@ -5365,6 +5368,7 @@ static void ggml_vk_sqr(ggml_backend_vk_context * ctx, vk_context& subctx, const (uint32_t) dst->ne[0], (uint32_t) dst->ne[1], (uint32_t) dst->ne[2], (uint32_t) dst->ne[3], (uint32_t) dst->nb[0] / dst_type_size, (uint32_t) dst->nb[1] / dst_type_size, (uint32_t) dst->nb[2] / dst_type_size, (uint32_t) dst->nb[3] / dst_type_size, 0, 0.0f, 0.0f, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, }, dryrun); } @@ -5378,6 +5382,7 @@ static void ggml_vk_sin(ggml_backend_vk_context * ctx, vk_context& subctx, const (uint32_t) dst->ne[0], (uint32_t) dst->ne[1], (uint32_t) dst->ne[2], (uint32_t) dst->ne[3], (uint32_t) dst->nb[0] / dst_type_size, (uint32_t) dst->nb[1] / dst_type_size, (uint32_t) dst->nb[2] / dst_type_size, (uint32_t) dst->nb[3] / dst_type_size, 0, 0.0f, 0.0f, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, }, dryrun); } @@ -5391,6 +5396,7 @@ static void ggml_vk_cos(ggml_backend_vk_context * ctx, vk_context& subctx, const (uint32_t) dst->ne[0], (uint32_t) dst->ne[1], (uint32_t) dst->ne[2], (uint32_t) dst->ne[3], (uint32_t) dst->nb[0] / dst_type_size, (uint32_t) dst->nb[1] / dst_type_size, (uint32_t) dst->nb[2] / dst_type_size, (uint32_t) dst->nb[3] / dst_type_size, 0, 0.0f, 0.0f, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, }, dryrun); } @@ -5405,6 +5411,7 @@ static void ggml_vk_clamp(ggml_backend_vk_context * ctx, vk_context& subctx, con (uint32_t) dst->ne[0], (uint32_t) dst->ne[1], (uint32_t) dst->ne[2], (uint32_t) dst->ne[3], (uint32_t) dst->nb[0] / dst_type_size, (uint32_t) dst->nb[1] / dst_type_size, (uint32_t) dst->nb[2] / dst_type_size, (uint32_t) dst->nb[3] / dst_type_size, 0, op_params[0], op_params[1], + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, }, dryrun); } @@ -5418,6 +5425,7 @@ static void ggml_vk_pad(ggml_backend_vk_context * ctx, vk_context& subctx, const (uint32_t) dst->ne[0], (uint32_t) dst->ne[1], (uint32_t) dst->ne[2], (uint32_t) dst->ne[3], (uint32_t) dst->nb[0] / dst_type_size, (uint32_t) dst->nb[1] / dst_type_size, (uint32_t) dst->nb[2] / dst_type_size, (uint32_t) dst->nb[3] / dst_type_size, 0, 0.0f, 0.0f, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, }, dryrun); } @@ -5431,6 +5439,7 @@ static void ggml_vk_repeat(ggml_backend_vk_context * ctx, vk_context& subctx, co (uint32_t) dst->ne[0], (uint32_t) dst->ne[1], (uint32_t) dst->ne[2], (uint32_t) dst->ne[3], (uint32_t) dst->nb[0] / dst_type_size, (uint32_t) dst->nb[1] / dst_type_size, (uint32_t) dst->nb[2] / dst_type_size, (uint32_t) dst->nb[3] / dst_type_size, 0, 0.0f, 0.0f, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, }, dryrun); } @@ -5445,6 +5454,7 @@ static void ggml_vk_cpy(ggml_backend_vk_context * ctx, vk_context& subctx, const (uint32_t) dst->ne[0], (uint32_t) dst->ne[1], (uint32_t) dst->ne[2], (uint32_t) dst->ne[3], (uint32_t) dst->nb[0] / dst_type_size, (uint32_t) dst->nb[1] / dst_type_size, (uint32_t) dst->nb[2] / dst_type_size, (uint32_t) dst->nb[3] / dst_type_size, d_offset, 0.0f, 0.0f, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, }, dryrun); } From 458f369a033a24fabd0b0d6866309e268cff7bfb Mon Sep 17 00:00:00 2001 From: Srihari-mcw <96763064+Srihari-mcw@users.noreply.github.com> Date: Mon, 9 Dec 2024 23:10:19 +0530 Subject: [PATCH 172/245] Changes to CMakePresets.json to add ninja clang target on windows (#10668) * Update cmakepreset.json to use clang with ninja by default * Update cmakepreset.json to add clang and ninja based configs * Updates to build.md file * Make updates to rename preset targets * Update with .cmake file * Remove additional whitespaces * Add .cmake file for x64-windows-llvm * Update docs/build.md * Update docs/build.md --------- Co-authored-by: Max Krasnyansky --- CMakePresets.json | 12 ++++++++++++ cmake/x64-windows-llvm.cmake | 11 +++++++++++ docs/build.md | 7 +++++++ 3 files changed, 30 insertions(+) create mode 100644 cmake/x64-windows-llvm.cmake diff --git a/CMakePresets.json b/CMakePresets.json index 436448967c9af..13bdd7907ab40 100644 --- a/CMakePresets.json +++ b/CMakePresets.json @@ -31,6 +31,13 @@ { "name": "sycl_f16", "hidden": true, "cacheVariables": { "GGML_SYCL_F16": "ON" } }, { "name": "vulkan", "hidden": true, "cacheVariables": { "GGML_VULKAN": "ON" } }, + { + "name": "x64-windows-llvm", "hidden": true, + "cacheVariables": { + "CMAKE_TOOLCHAIN_FILE": "${sourceDir}/cmake/x64-windows-llvm.cmake" + } + }, + { "name": "arm64-windows-msvc", "hidden": true, "architecture": { "value": "arm64", "strategy": "external" }, @@ -70,6 +77,11 @@ { "name": "arm64-windows-msvc-release", "inherits": [ "base", "arm64-windows-msvc", "reldbg" ] }, { "name": "arm64-windows-msvc+static-release", "inherits": [ "base", "arm64-windows-msvc", "reldbg", "static" ] }, + { "name": "x64-windows-llvm-debug", "inherits": [ "base", "x64-windows-llvm", "debug" ] }, + { "name": "x64-windows-llvm-release", "inherits": [ "base", "x64-windows-llvm", "release" ] }, + { "name": "x64-windows-llvm-reldbg", "inherits": [ "base", "x64-windows-llvm", "reldbg" ] }, + { "name": "x64-windows-llvm+static-release", "inherits": [ "base", "x64-windows-llvm", "reldbg", "static" ] }, + { "name": "x64-windows-msvc-debug", "inherits": [ "base", "debug" ] }, { "name": "x64-windows-msvc-release", "inherits": [ "base", "reldbg" ] }, { "name": "x64-windows-msvc+static-release", "inherits": [ "base", "reldbg", "static" ] }, diff --git a/cmake/x64-windows-llvm.cmake b/cmake/x64-windows-llvm.cmake new file mode 100644 index 0000000000000..0603d738fbef0 --- /dev/null +++ b/cmake/x64-windows-llvm.cmake @@ -0,0 +1,11 @@ +set( CMAKE_SYSTEM_NAME Windows ) +set( CMAKE_SYSTEM_PROCESSOR x86_64 ) + +set( CMAKE_C_COMPILER clang ) +set( CMAKE_CXX_COMPILER clang++ ) + +set( arch_c_flags "-march=native" ) + +set( CMAKE_C_FLAGS_INIT "${arch_c_flags}" ) +set( CMAKE_CXX_FLAGS_INIT "${arch_c_flags}" ) + diff --git a/docs/build.md b/docs/build.md index 26e6737888b78..84019b2046a89 100644 --- a/docs/build.md +++ b/docs/build.md @@ -57,6 +57,13 @@ cmake --build build --config Release ``` Building for arm64 can also be done with the MSVC compiler with the build-arm64-windows-MSVC preset, or the standard CMake build instructions. However, note that the MSVC compiler does not support inline ARM assembly code, used e.g. for the accelerated Q4_0_N_M CPU kernels. + For building with ninja generator and clang compiler as default: + -set path:set LIB=C:\Program Files (x86)\Windows Kits\10\Lib\10.0.22621.0\um\x64;C:\Program Files\Microsoft Visual Studio\2022\Community\VC\Tools\MSVC\14.41.34120\lib\x64\uwp;C:\Program Files (x86)\Windows Kits\10\Lib\10.0.22621.0\ucrt\x64 + ```bash + cmake --preset x64-windows-llvm-release + cmake --build build-x64-windows-llvm-release + ``` + ## BLAS Build Building the program with BLAS support may lead to some performance improvements in prompt processing using batch sizes higher than 32 (the default is 512). Using BLAS doesn't affect the generation performance. There are currently several different BLAS implementations available for build and use: From 44aee1cdf4416274590c2100122104bc70ec88ec Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Johannes=20G=C3=A4=C3=9Fler?= Date: Mon, 9 Dec 2024 20:07:12 +0100 Subject: [PATCH 173/245] CUDA: fix shared memory access condition for mmv (#10740) --- ggml/src/ggml-cuda/mmv.cu | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/ggml/src/ggml-cuda/mmv.cu b/ggml/src/ggml-cuda/mmv.cu index cfe91f4283fa6..a4b4f6bc10d98 100644 --- a/ggml/src/ggml-cuda/mmv.cu +++ b/ggml/src/ggml-cuda/mmv.cu @@ -57,7 +57,7 @@ static __global__ void mul_mat_vec( if (block_size > WARP_SIZE) { buf_iw[tid/WARP_SIZE] = sumf; __syncthreads(); - if (tid > WARP_SIZE) { + if (tid >= WARP_SIZE) { return; } sumf = buf_iw[tid]; From 43fdc54452eae10ba4888b614b03c6379260d4dd Mon Sep 17 00:00:00 2001 From: Jeff Bolz Date: Tue, 10 Dec 2024 11:22:20 -0600 Subject: [PATCH 174/245] vulkan: disable spirv-opt for coopmat shaders (#10763) There are some bugs in the 1.3.296 SDK, so disable this. It isn't strictly necessary anyway. Add missing dependency on vulkan-shaders-gen, so shaders get recompiled when it changes. Fix coopmat support reporting when glslc doesn't support NV_coopmat2. --- ggml/src/ggml-vulkan/CMakeLists.txt | 2 +- ggml/src/ggml-vulkan/ggml-vulkan.cpp | 2 ++ ggml/src/ggml-vulkan/vulkan-shaders/vulkan-shaders-gen.cpp | 7 +++++-- 3 files changed, 8 insertions(+), 3 deletions(-) diff --git a/ggml/src/ggml-vulkan/CMakeLists.txt b/ggml/src/ggml-vulkan/CMakeLists.txt index 546c853b6b728..6d46e5f24c169 100644 --- a/ggml/src/ggml-vulkan/CMakeLists.txt +++ b/ggml/src/ggml-vulkan/CMakeLists.txt @@ -81,7 +81,7 @@ if (Vulkan_FOUND) --target-cpp ${_ggml_vk_source} --no-clean - DEPENDS ${_ggml_vk_shader_deps} + DEPENDS ${_ggml_vk_shader_deps} ${_ggml_vk_genshaders_cmd} COMMENT "Generate vulkan shaders" ) diff --git a/ggml/src/ggml-vulkan/ggml-vulkan.cpp b/ggml/src/ggml-vulkan/ggml-vulkan.cpp index 5d9eba9833fa3..ad5535c11d4a1 100644 --- a/ggml/src/ggml-vulkan/ggml-vulkan.cpp +++ b/ggml/src/ggml-vulkan/ggml-vulkan.cpp @@ -2425,9 +2425,11 @@ static void ggml_vk_print_gpu_info(size_t idx) { } else if (strcmp("VK_KHR_cooperative_matrix", properties.extensionName) == 0 && !getenv("GGML_VK_DISABLE_COOPMAT")) { coopmat_support = true; +#if defined(GGML_VULKAN_COOPMAT2_GLSLC_SUPPORT) } else if (strcmp("VK_NV_cooperative_matrix2", properties.extensionName) == 0 && !getenv("GGML_VK_DISABLE_COOPMAT2")) { coopmat2_support = true; +#endif } } diff --git a/ggml/src/ggml-vulkan/vulkan-shaders/vulkan-shaders-gen.cpp b/ggml/src/ggml-vulkan/vulkan-shaders/vulkan-shaders-gen.cpp index bc6fca5066998..ee35208337463 100644 --- a/ggml/src/ggml-vulkan/vulkan-shaders/vulkan-shaders-gen.cpp +++ b/ggml/src/ggml-vulkan/vulkan-shaders/vulkan-shaders-gen.cpp @@ -206,10 +206,13 @@ void string_to_spv_func(const std::string& _name, const std::string& in_fname, c std::string target_env = (name.find("_cm2") != std::string::npos) ? "--target-env=vulkan1.3" : "--target-env=vulkan1.2"; + // disable spirv-opt for coopmat shaders for https://github.com/ggerganov/llama.cpp/issues/10734 + std::string opt_level = coopmat ? "" : "-O"; + #ifdef _WIN32 - std::vector cmd = {GLSLC, "-fshader-stage=compute", target_env, "-O", "\"" + in_path + "\"", "-o", "\"" + out_fname + "\""}; + std::vector cmd = {GLSLC, "-fshader-stage=compute", target_env, opt_level, "\"" + in_path + "\"", "-o", "\"" + out_fname + "\""}; #else - std::vector cmd = {GLSLC, "-fshader-stage=compute", target_env, "-O", in_path, "-o", out_fname}; + std::vector cmd = {GLSLC, "-fshader-stage=compute", target_env, opt_level, in_path, "-o", out_fname}; #endif #ifdef GGML_VULKAN_SHADER_DEBUG_INFO From 99e9454bf9653ddaa73113c086bab89db584ff58 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Y=C3=BCg?= Date: Tue, 10 Dec 2024 17:22:34 +0000 Subject: [PATCH 175/245] server : add flag to disable the web-ui (#10762) (#10751) Co-authored-by: eugenio.segala --- common/arg.cpp | 7 ++++++ examples/server/README.md | 1 + examples/server/server.cpp | 30 ++++++++++++++---------- examples/server/tests/unit/test_basic.py | 18 ++++++++++++++ examples/server/tests/utils.py | 3 +++ 5 files changed, 46 insertions(+), 13 deletions(-) diff --git a/common/arg.cpp b/common/arg.cpp index 0db59f70187e8..808ec1c3e65fc 100644 --- a/common/arg.cpp +++ b/common/arg.cpp @@ -1711,6 +1711,13 @@ common_params_context common_params_parser_init(common_params & params, llama_ex params.public_path = value; } ).set_examples({LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_STATIC_PATH")); + add_opt(common_arg( + {"--no-webui"}, + string_format("Disable the Web UI (default: %s)", params.webui ? "enabled" : "disabled"), + [](common_params & params) { + params.webui = false; + } + ).set_examples({LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_NO_WEBUI")); add_opt(common_arg( {"--embedding", "--embeddings"}, string_format("restrict to only support embedding use case; use only with dedicated embedding models (default: %s)", params.embedding ? "enabled" : "disabled"), diff --git a/examples/server/README.md b/examples/server/README.md index 117c52d3f1310..6294f541fc7d7 100644 --- a/examples/server/README.md +++ b/examples/server/README.md @@ -146,6 +146,7 @@ The project is under active development, and we are [looking for feedback and co | `--host HOST` | ip address to listen (default: 127.0.0.1)
(env: LLAMA_ARG_HOST) | | `--port PORT` | port to listen (default: 8080)
(env: LLAMA_ARG_PORT) | | `--path PATH` | path to serve static files from (default: )
(env: LLAMA_ARG_STATIC_PATH) | +| `--no-webui` | disable the Web UI
(env: LLAMA_ARG_NO_WEBUI) | | `--embedding, --embeddings` | restrict to only support embedding use case; use only with dedicated embedding models (default: disabled)
(env: LLAMA_ARG_EMBEDDINGS) | | `--reranking, --rerank` | enable reranking endpoint on server (default: disabled)
(env: LLAMA_ARG_RERANKING) | | `--api-key KEY` | API key to use for authentication (default: none)
(env: LLAMA_API_KEY) | diff --git a/examples/server/server.cpp b/examples/server/server.cpp index 47bfd6c4aac90..8cb992470a302 100644 --- a/examples/server/server.cpp +++ b/examples/server/server.cpp @@ -3815,20 +3815,24 @@ int main(int argc, char ** argv) { // Router // - // register static assets routes - if (!params.public_path.empty()) { - // Set the base directory for serving static files - bool is_found = svr->set_mount_point("/", params.public_path); - if (!is_found) { - LOG_ERR("%s: static assets path not found: %s\n", __func__, params.public_path.c_str()); - return 1; - } + if (!params.webui) { + LOG_INF("Web UI is disabled\n"); } else { - // using embedded static index.html - svr->Get("/", [](const httplib::Request &, httplib::Response & res) { - res.set_content(reinterpret_cast(index_html), index_html_len, "text/html; charset=utf-8"); - return false; - }); + // register static assets routes + if (!params.public_path.empty()) { + // Set the base directory for serving static files + bool is_found = svr->set_mount_point("/", params.public_path); + if (!is_found) { + LOG_ERR("%s: static assets path not found: %s\n", __func__, params.public_path.c_str()); + return 1; + } + } else { + // using embedded static index.html + svr->Get("/", [](const httplib::Request &, httplib::Response & res) { + res.set_content(reinterpret_cast(index_html), index_html_len, "text/html; charset=utf-8"); + return false; + }); + } } // register API routes diff --git a/examples/server/tests/unit/test_basic.py b/examples/server/tests/unit/test_basic.py index 1d512401685fb..1485de8ceb3fc 100644 --- a/examples/server/tests/unit/test_basic.py +++ b/examples/server/tests/unit/test_basic.py @@ -1,4 +1,5 @@ import pytest +import requests from utils import * server = ServerPreset.tinyllama2() @@ -76,3 +77,20 @@ def test_load_split_model(): }) assert res.status_code == 200 assert match_regex("(little|girl)+", res.body["content"]) + + +def test_no_webui(): + global server + # default: webui enabled + server.start() + url = f"http://{server.server_host}:{server.server_port}" + res = requests.get(url) + assert res.status_code == 200 + assert "" in res.text + server.stop() + + # with --no-webui + server.no_webui = True + server.start() + res = requests.get(url) + assert res.status_code == 404 diff --git a/examples/server/tests/utils.py b/examples/server/tests/utils.py index 7c89b9cd37505..d988ccf5e3061 100644 --- a/examples/server/tests/utils.py +++ b/examples/server/tests/utils.py @@ -72,6 +72,7 @@ class ServerProcess: disable_ctx_shift: int | None = False draft_min: int | None = None draft_max: int | None = None + no_webui: bool | None = None # session variables process: subprocess.Popen | None = None @@ -158,6 +159,8 @@ def start(self, timeout_seconds: int = 10) -> None: server_args.extend(["--draft-max", self.draft_max]) if self.draft_min: server_args.extend(["--draft-min", self.draft_min]) + if self.no_webui: + server_args.append("--no-webui") args = [str(arg) for arg in [server_path, *server_args]] print(f"bench: starting server with: {' '.join(args)}") From 371229563f2f2d3e5f8cc637f43427e224ee9bcc Mon Sep 17 00:00:00 2001 From: Andreas Kieslinger <47689530+aendk@users.noreply.github.com> Date: Tue, 10 Dec 2024 18:23:24 +0100 Subject: [PATCH 176/245] CUDA: rename macros to avoid conflicts with WinAPI (#10736) * Renames NVIDIA GPU-architecture flags to avoid name clashes with WinAPI. (e.g. CC_PASCAL, GPU architecture or WinAPI pascal compiler flag?) * Reverts erroneous rename in SYCL-code. * Renames GGML_CUDA_MIN_CC_DP4A to GGML_CUDA_CC_DP4A. * Renames the rest of the compute capability macros for consistency. --- ggml/src/ggml-common.h | 2 +- ggml/src/ggml-cuda/common.cuh | 70 ++++++++++++++++----------------- ggml/src/ggml-cuda/convert.cu | 6 +-- ggml/src/ggml-cuda/fattn.cu | 2 +- ggml/src/ggml-cuda/ggml-cuda.cu | 12 +++--- ggml/src/ggml-cuda/mma.cuh | 8 ++-- ggml/src/ggml-cuda/mmq.cu | 10 ++--- ggml/src/ggml-cuda/mmq.cuh | 26 ++++++------ ggml/src/ggml-cuda/mmvq.cu | 2 +- ggml/src/ggml-cuda/sum.cu | 2 - 10 files changed, 69 insertions(+), 71 deletions(-) diff --git a/ggml/src/ggml-common.h b/ggml/src/ggml-common.h index 7fd2aadeca1af..f13fd4dea6f41 100644 --- a/ggml/src/ggml-common.h +++ b/ggml/src/ggml-common.h @@ -473,7 +473,7 @@ GGML_TABLE_BEGIN(uint8_t, ksigns_iq2xs, 128) 240, 113, 114, 243, 116, 245, 246, 119, 120, 249, 250, 123, 252, 125, 126, 255, GGML_TABLE_END() -//#if __CUDA_ARCH__ >= MIN_CC_DP4A // lowest compute capability for integer intrinsics +//#if __CUDA_ARCH__ >= GGML_CUDA_CC_DP4A // lowest compute capability for integer intrinsics GGML_TABLE_BEGIN(uint64_t, ksigns64, 128) 0x0000000000000000, 0xff000000000000ff, 0xff0000000000ff00, 0x000000000000ffff, 0xff00000000ff0000, 0x0000000000ff00ff, 0x0000000000ffff00, 0xff00000000ffffff, diff --git a/ggml/src/ggml-cuda/common.cuh b/ggml/src/ggml-cuda/common.cuh index 535118d87928e..2c0a562267281 100644 --- a/ggml/src/ggml-cuda/common.cuh +++ b/ggml/src/ggml-cuda/common.cuh @@ -41,28 +41,28 @@ #define CUDART_HMAX 11070 // CUDA 11.7, min. ver. for which __hmax and __hmax2 are known to work (may be higher than needed) #define CUDART_HMASK 12000 // CUDA 12.0, min. ver. for half2 -> uint mask comparisons -#define CC_PASCAL 600 -#define MIN_CC_DP4A 610 // minimum compute capability for __dp4a, an intrinsic for byte-wise dot products -#define CC_VOLTA 700 -#define CC_TURING 750 -#define CC_AMPERE 800 -#define CC_OFFSET_AMD 1000000 +#define GGML_CUDA_CC_PASCAL 600 +#define GGML_CUDA_CC_DP4A 610 // minimum compute capability for __dp4a, an intrinsic for byte-wise dot products +#define GGML_CUDA_CC_VOLTA 700 +#define GGML_CUDA_CC_TURING 750 +#define GGML_CUDA_CC_AMPERE 800 +#define GGML_CUDA_CC_OFFSET_AMD 1000000 // GCN/CNDA, wave size is 64 -#define CC_GCN4 (CC_OFFSET_AMD + 803) // Tonga, Fiji, Polaris, minimum for fast fp16 -#define CC_VEGA (CC_OFFSET_AMD + 900) // Vega56/64, minimum for fp16 dual issue -#define CC_VEGA20 (CC_OFFSET_AMD + 906) // MI50/Radeon VII, minimum for dp4a -#define CC_CDNA (CC_OFFSET_AMD + 908) // MI100, minimum for MFMA, acc registers -#define CC_CDNA2 (CC_OFFSET_AMD + 910) // MI210, minimum acc register renameing -#define CC_CDNA3 (CC_OFFSET_AMD + 942) // MI300 +#define GGML_CUDA_CC_GCN4 (GGML_CUDA_CC_OFFSET_AMD + 803) // Tonga, Fiji, Polaris, minimum for fast fp16 +#define GGML_CUDA_CC_VEGA (GGML_CUDA_CC_OFFSET_AMD + 900) // Vega56/64, minimum for fp16 dual issue +#define GGML_CUDA_CC_VEGA20 (GGML_CUDA_CC_OFFSET_AMD + 906) // MI50/Radeon VII, minimum for dp4a +#define GGML_CUDA_CC_CDNA (GGML_CUDA_CC_OFFSET_AMD + 908) // MI100, minimum for MFMA, acc registers +#define GGML_CUDA_CC_CDNA2 (GGML_CUDA_CC_OFFSET_AMD + 910) // MI210, minimum acc register renameing +#define GGML_CUDA_CC_CDNA3 (GGML_CUDA_CC_OFFSET_AMD + 942) // MI300 // RNDA removes MFMA, dp4a, xnack, acc registers, wave size is 32 -#define CC_RDNA1 (CC_OFFSET_AMD + 1010) // RX 5000 -#define CC_RDNA2 (CC_OFFSET_AMD + 1030) // RX 6000, minimum for dp4a -#define CC_RDNA3 (CC_OFFSET_AMD + 1100) // RX 7000, minimum for WMMA +#define GGML_CUDA_CC_RDNA1 (GGML_CUDA_CC_OFFSET_AMD + 1010) // RX 5000 +#define GGML_CUDA_CC_RDNA2 (GGML_CUDA_CC_OFFSET_AMD + 1030) // RX 6000, minimum for dp4a +#define GGML_CUDA_CC_RDNA3 (GGML_CUDA_CC_OFFSET_AMD + 1100) // RX 7000, minimum for WMMA -#define CC_QY1 210 -#define CC_QY2 220 +#define GGML_CUDA_CC_QY1 210 +#define GGML_CUDA_CC_QY2 220 #define MATRIX_ROW_PADDING 512 // last row of quant. matrices is a multiple of this to avoid out-of-bounds memory accesses @@ -131,36 +131,36 @@ typedef float dfloat; // dequantize float typedef float2 dfloat2; #endif // GGML_CUDA_F16 -#if (defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__)) || __CUDA_ARCH__ >= CC_PASCAL +#if (defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__)) || __CUDA_ARCH__ >= GGML_CUDA_CC_PASCAL #define FP16_AVAILABLE -#endif // (defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__)) || __CUDA_ARCH__ >= CC_PASCAL +#endif // (defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__)) || __CUDA_ARCH__ >= GGML_CUDA_CC_PASCAL #if defined(FP16_AVAILABLE) && __CUDA_ARCH__ != 610 #define FAST_FP16_AVAILABLE #endif // defined(FP16_AVAILABLE) && __CUDA_ARCH__ != 610 -#if !(defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__)) && __CUDA_ARCH__ >= CC_VOLTA +#if !(defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__)) && __CUDA_ARCH__ >= GGML_CUDA_CC_VOLTA #define FP16_MMA_AVAILABLE -#endif // !(defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__)) && __CUDA_ARCH__ >= CC_VOLTA +#endif // !(defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__)) && __CUDA_ARCH__ >= GGML_CUDA_CC_VOLTA -#if !(defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__)) && __CUDA_ARCH__ >= CC_TURING +#if !(defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__)) && __CUDA_ARCH__ >= GGML_CUDA_CC_TURING #define INT8_MMA_AVAILABLE -#endif // !(defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__)) && __CUDA_ARCH__ >= CC_TURING +#endif // !(defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__)) && __CUDA_ARCH__ >= GGML_CUDA_CC_TURING -#if !(defined(GGML_USE_MUSA) && __MUSA_ARCH__ <= CC_QY1) +#if !(defined(GGML_USE_MUSA) && __MUSA_ARCH__ <= GGML_CUDA_CC_QY1) #define FLASH_ATTN_AVAILABLE -#endif // !(defined(GGML_USE_MUSA) && __MUSA_ARCH__ <= CC_QY1) +#endif // !(defined(GGML_USE_MUSA) && __MUSA_ARCH__ <= GGML_CUDA_CC_QY1) static constexpr bool fast_fp16_available(const int cc) { - return cc >= CC_PASCAL && cc != 610; + return cc >= GGML_CUDA_CC_PASCAL && cc != 610; } static constexpr bool fp16_mma_available(const int cc) { - return cc < CC_OFFSET_AMD && cc >= CC_VOLTA; + return cc < GGML_CUDA_CC_OFFSET_AMD && cc >= GGML_CUDA_CC_VOLTA; } static constexpr bool int8_mma_available(const int cc) { - return cc < CC_OFFSET_AMD && cc >= CC_TURING; + return cc < GGML_CUDA_CC_OFFSET_AMD && cc >= GGML_CUDA_CC_TURING; } [[noreturn]] @@ -187,7 +187,7 @@ static __device__ void no_device_code( #endif // __CUDA_ARCH__ static __device__ __forceinline__ int warp_reduce_sum(int x) { -#if !(defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__)) && __CUDA_ARCH__ >= CC_AMPERE +#if !(defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__)) && __CUDA_ARCH__ >= GGML_CUDA_CC_AMPERE return __reduce_add_sync(0xffffffff, x); #else #pragma unroll @@ -195,7 +195,7 @@ static __device__ __forceinline__ int warp_reduce_sum(int x) { x += __shfl_xor_sync(0xffffffff, x, offset, 32); } return x; -#endif // !(defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__)) && __CUDA_ARCH__ >= CC_AMPERE +#endif // !(defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__)) && __CUDA_ARCH__ >= GGML_CUDA_CC_AMPERE } static __device__ __forceinline__ float warp_reduce_sum(float x) { @@ -284,7 +284,7 @@ static __device__ __forceinline__ half2 ggml_cuda_hmax2(const half2 a, const hal } static __device__ __forceinline__ half2 warp_reduce_max(half2 x) { -#if !(defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__)) && __CUDA_ARCH__ >= CC_PASCAL +#if !(defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__)) && __CUDA_ARCH__ >= GGML_CUDA_CC_PASCAL #pragma unroll for (int offset = 16; offset > 0; offset >>= 1) { x = ggml_cuda_hmax2(x, __shfl_xor_sync(0xffffffff, x, offset, 32)); @@ -293,7 +293,7 @@ static __device__ __forceinline__ half2 warp_reduce_max(half2 x) { #else GGML_UNUSED(x); NO_DEVICE_CODE; -#endif // !(defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__)) && __CUDA_ARCH__ >= CC_PASCAL +#endif // !(defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__)) && __CUDA_ARCH__ >= GGML_CUDA_CC_PASCAL } #if CUDART_VERSION < CUDART_HMASK @@ -333,13 +333,13 @@ static __device__ __forceinline__ int ggml_cuda_dp4a(const int a, const int b, i #else // defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__) -#if __CUDA_ARCH__ >= MIN_CC_DP4A +#if __CUDA_ARCH__ >= GGML_CUDA_CC_DP4A return __dp4a(a, b, c); -#else // __CUDA_ARCH__ >= MIN_CC_DP4A +#else // __CUDA_ARCH__ >= GGML_CUDA_CC_DP4A const int8_t * a8 = (const int8_t *) &a; const int8_t * b8 = (const int8_t *) &b; return c + a8[0]*b8[0] + a8[1]*b8[1] + a8[2]*b8[2] + a8[3]*b8[3]; -#endif // __CUDA_ARCH__ >= MIN_CC_DP4A +#endif // __CUDA_ARCH__ >= GGML_CUDA_CC_DP4A #endif // defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__) } diff --git a/ggml/src/ggml-cuda/convert.cu b/ggml/src/ggml-cuda/convert.cu index c0a4447075c6e..3896f956d7338 100644 --- a/ggml/src/ggml-cuda/convert.cu +++ b/ggml/src/ggml-cuda/convert.cu @@ -26,7 +26,7 @@ static __global__ void dequantize_block(const void * __restrict__ vx, dst_t * __ template static __global__ void dequantize_block_q8_0_f16(const void * __restrict__ vx, half * __restrict__ y, const int64_t k) { -#if __CUDA_ARCH__ >= CC_PASCAL +#if __CUDA_ARCH__ >= GGML_CUDA_CC_PASCAL constexpr int nint = CUDA_Q8_0_NE_ALIGN/sizeof(int) + WARP_SIZE; const int64_t i0 = CUDA_Q8_0_NE_ALIGN*blockIdx.x; @@ -64,7 +64,7 @@ static __global__ void dequantize_block_q8_0_f16(const void * __restrict__ vx, h GGML_UNUSED(y); GGML_UNUSED(k); NO_DEVICE_CODE; -#endif // __CUDA_ARCH__ >= CC_PASCAL +#endif // __CUDA_ARCH__ >= GGML_CUDA_CC_PASCAL } template @@ -599,7 +599,7 @@ to_fp16_cuda_t ggml_get_to_fp16_cuda(ggml_type type) { case GGML_TYPE_Q5_1: return dequantize_block_cuda; case GGML_TYPE_Q8_0: - if (ggml_cuda_info().devices[ggml_cuda_get_device()].cc >= CC_PASCAL) { + if (ggml_cuda_info().devices[ggml_cuda_get_device()].cc >= GGML_CUDA_CC_PASCAL) { return dequantize_block_q8_0_f16_cuda; } return dequantize_block_cuda; diff --git a/ggml/src/ggml-cuda/fattn.cu b/ggml/src/ggml-cuda/fattn.cu index 0e7ebbc539352..0b26b0f8e0595 100644 --- a/ggml/src/ggml-cuda/fattn.cu +++ b/ggml/src/ggml-cuda/fattn.cu @@ -304,7 +304,7 @@ void ggml_cuda_flash_attn_ext(ggml_backend_cuda_context & ctx, ggml_tensor * dst const enum ggml_prec prec = ggml_flash_attn_ext_get_prec(KQV); // On AMD the tile kernels perform poorly, use the vec kernel instead: - if (cc >= CC_OFFSET_AMD) { + if (cc >= GGML_CUDA_CC_OFFSET_AMD) { if (prec == GGML_PREC_DEFAULT && fast_fp16_available(cc)) { ggml_cuda_flash_attn_ext_vec_f16(ctx, dst); } else { diff --git a/ggml/src/ggml-cuda/ggml-cuda.cu b/ggml/src/ggml-cuda/ggml-cuda.cu index 15fcb2a65cbcf..c180adc84b861 100644 --- a/ggml/src/ggml-cuda/ggml-cuda.cu +++ b/ggml/src/ggml-cuda/ggml-cuda.cu @@ -177,7 +177,7 @@ static ggml_cuda_device_info ggml_cuda_init() { info.devices[id].smpb = prop.sharedMemPerBlock; #if defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__) info.devices[id].smpbo = prop.sharedMemPerBlock; - info.devices[id].cc = 100*prop.major + 10*prop.minor + CC_OFFSET_AMD; + info.devices[id].cc = 100*prop.major + 10*prop.minor + GGML_CUDA_CC_OFFSET_AMD; #else info.devices[id].smpbo = prop.sharedMemPerBlockOptin; info.devices[id].cc = 100*prop.major + 10*prop.minor; @@ -1081,7 +1081,7 @@ static void ggml_cuda_op_mul_mat_cublas( const int compute_capability = ggml_cuda_info().devices[id].cc; - if (compute_capability >= CC_VOLTA && (src0->type == GGML_TYPE_F16 || ggml_is_quantized(src0->type)) && ggml_is_contiguous(src0) && row_diff == src0->ne[1] && dst->op_params[0] == GGML_PREC_DEFAULT) { + if (compute_capability >= GGML_CUDA_CC_VOLTA && (src0->type == GGML_TYPE_F16 || ggml_is_quantized(src0->type)) && ggml_is_contiguous(src0) && row_diff == src0->ne[1] && dst->op_params[0] == GGML_PREC_DEFAULT) { // convert src0 and src1 to fp16, multiply as fp16, convert dst to fp32 ggml_cuda_pool_alloc src0_as_f16(ctx.pool(id)); if (src0->type != GGML_TYPE_F16) { @@ -1108,7 +1108,7 @@ static void ggml_cuda_op_mul_mat_cublas( const half beta_f16 = 0.0f; cublasComputeType_t cu_compute_type = CUBLAS_COMPUTE_16F; - if (ggml_cuda_info().devices[ctx.device].cc == CC_CDNA) { + if (ggml_cuda_info().devices[ctx.device].cc == GGML_CUDA_CC_CDNA) { cu_compute_type = CUBLAS_COMPUTE_32F; } @@ -1612,7 +1612,7 @@ static void ggml_cuda_mul_mat_batched_cublas(ggml_backend_cuda_context & ctx, co cublasComputeType_t cu_compute_type = CUBLAS_COMPUTE_16F; cudaDataType_t cu_data_type = CUDA_R_16F; - if (ggml_cuda_info().devices[ctx.device].cc == CC_CDNA) { + if (ggml_cuda_info().devices[ctx.device].cc == GGML_CUDA_CC_CDNA) { cu_compute_type = CUBLAS_COMPUTE_32F; } @@ -2357,7 +2357,7 @@ static enum ggml_status ggml_backend_cuda_graph_compute(ggml_backend_t backend, std::vector ggml_cuda_cpy_fn_ptrs; if (cuda_ctx->cuda_graph->graph == nullptr) { - if (ggml_cuda_info().devices[cuda_ctx->device].cc < CC_AMPERE) { + if (ggml_cuda_info().devices[cuda_ctx->device].cc < GGML_CUDA_CC_AMPERE) { cuda_ctx->cuda_graph->disable_due_to_gpu_arch = true; #ifndef NDEBUG GGML_LOG_DEBUG("%s: disabling CUDA graphs due to GPU architecture\n", __func__); @@ -3028,7 +3028,7 @@ static bool ggml_backend_cuda_device_supports_op(ggml_backend_dev_t dev, const g return true; } const int cc = ggml_cuda_info().devices[dev_ctx->device].cc; - return cc >= CC_VOLTA && cc < CC_OFFSET_AMD && op->src[1]->type == GGML_TYPE_F16 && op->src[2]->type == GGML_TYPE_F16; + return cc >= GGML_CUDA_CC_VOLTA && cc < GGML_CUDA_CC_OFFSET_AMD && op->src[1]->type == GGML_TYPE_F16 && op->src[2]->type == GGML_TYPE_F16; } case GGML_OP_CROSS_ENTROPY_LOSS: case GGML_OP_CROSS_ENTROPY_LOSS_BACK: diff --git a/ggml/src/ggml-cuda/mma.cuh b/ggml/src/ggml-cuda/mma.cuh index a452a3cc3d152..7d11540afd237 100644 --- a/ggml/src/ggml-cuda/mma.cuh +++ b/ggml/src/ggml-cuda/mma.cuh @@ -171,7 +171,7 @@ struct mma_int_C_I16J8 { __device__ __forceinline__ void mma_K4(const mma_int_A_I16K4 & mma_A, const mma_int_B_J8K4 & mma_B) { #ifdef INT8_MMA_AVAILABLE -#if __CUDA_ARCH__ >= CC_AMPERE +#if __CUDA_ARCH__ >= GGML_CUDA_CC_AMPERE asm("mma.sync.aligned.m16n8k16.row.col.s32.s8.s8.s32 {%0, %1, %2, %3}, {%4, %5}, {%6}, {%0, %1, %2, %3};" : "+r"(x[0]), "+r"(x[1]), "+r"(x[2]), "+r"(x[3]) : "r"(mma_A.x[0]), "r"(mma_A.x[1]), "r"(mma_B.x[0])); @@ -183,7 +183,7 @@ struct mma_int_C_I16J8 { asm("mma.sync.aligned.m8n8k16.row.col.s32.s8.s8.s32 {%0, %1}, {%2}, {%3}, {%0, %1};" : "+r"(x[2]), "+r"(x[3]) : "r"(mma_A.x[1]), "r"(mma_B.x[0])); -#endif // __CUDA_ARCH__ >= CC_AMPERE +#endif // __CUDA_ARCH__ >= GGML_CUDA_CC_AMPERE #else GGML_UNUSED(mma_A); GGML_UNUSED(mma_B); @@ -193,7 +193,7 @@ struct mma_int_C_I16J8 { __device__ __forceinline__ void mma_K8(const mma_int_A_I16K8 & mma_A, const mma_int_B_J8K8 & mma_B) { #ifdef INT8_MMA_AVAILABLE -#if __CUDA_ARCH__ >= CC_AMPERE +#if __CUDA_ARCH__ >= GGML_CUDA_CC_AMPERE asm("mma.sync.aligned.m16n8k32.row.col.s32.s8.s8.s32 {%0, %1, %2, %3}, {%4, %5, %6, %7}, {%8, %9}, {%0, %1, %2, %3};" : "+r"(x[0]), "+r"(x[1]), "+r"(x[2]), "+r"(x[3]) : "r"(mma_A.x[0]), "r"(mma_A.x[1]), "r"(mma_A.x[2]), "r"(mma_A.x[3]), "r"(mma_B.x[0]), "r"(mma_B.x[1])); @@ -211,7 +211,7 @@ struct mma_int_C_I16J8 { asm("mma.sync.aligned.m8n8k16.row.col.s32.s8.s8.s32 {%0, %1}, {%2}, {%3}, {%0, %1};" : "+r"(x[2]), "+r"(x[3]) : "r"(mma_A.x[3]), "r"(mma_B.x[1])); -#endif // __CUDA_ARCH__ >= CC_AMPERE +#endif // __CUDA_ARCH__ >= GGML_CUDA_CC_AMPERE #else GGML_UNUSED(mma_A); GGML_UNUSED(mma_B); diff --git a/ggml/src/ggml-cuda/mmq.cu b/ggml/src/ggml-cuda/mmq.cu index 7f7c8c90b6fe2..270251df4f115 100644 --- a/ggml/src/ggml-cuda/mmq.cu +++ b/ggml/src/ggml-cuda/mmq.cu @@ -27,7 +27,7 @@ void ggml_cuda_op_mul_mat_q( // The stream-k decomposition is only faster for recent NVIDIA GPUs. // Also its fixup needs to allocate a temporary buffer in the memory pool. // There are multiple parallel CUDA streams for src1_ncols != ne11 which would introduce a race condition for this buffer. - const bool use_stream_k = compute_capability >= CC_VOLTA && compute_capability < CC_OFFSET_AMD && src1_ncols == ne11; + const bool use_stream_k = compute_capability >= GGML_CUDA_CC_VOLTA && compute_capability < GGML_CUDA_CC_OFFSET_AMD && src1_ncols == ne11; const mmq_args args = {src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, stride00, src1_padded_row_size, src1_ncols, ne11, nrows_dst, use_stream_k}; switch (src0->type) { @@ -136,7 +136,7 @@ bool ggml_cuda_should_use_mmq(enum ggml_type type, int cc, int64_t ne11) { return true; } - if (cc < MIN_CC_DP4A) { + if (cc < GGML_CUDA_CC_DP4A) { return false; } @@ -144,9 +144,9 @@ bool ggml_cuda_should_use_mmq(enum ggml_type type, int cc, int64_t ne11) { return true; #endif //GGML_CUDA_FORCE_MMQ - if (cc < CC_OFFSET_AMD) { - return cc < CC_VOLTA || ne11 < MMQ_DP4A_MAX_BATCH_SIZE; + if (cc < GGML_CUDA_CC_OFFSET_AMD) { + return cc < GGML_CUDA_CC_VOLTA || ne11 < MMQ_DP4A_MAX_BATCH_SIZE; } - return (cc < CC_RDNA3 && cc != CC_CDNA && cc != CC_VEGA20) || ne11 < MMQ_DP4A_MAX_BATCH_SIZE; + return (cc < GGML_CUDA_CC_RDNA3 && cc != GGML_CUDA_CC_CDNA && cc != GGML_CUDA_CC_VEGA20) || ne11 < MMQ_DP4A_MAX_BATCH_SIZE; } diff --git a/ggml/src/ggml-cuda/mmq.cuh b/ggml/src/ggml-cuda/mmq.cuh index 8d8867121f321..3cd508a1d4a1c 100644 --- a/ggml/src/ggml-cuda/mmq.cuh +++ b/ggml/src/ggml-cuda/mmq.cuh @@ -89,9 +89,9 @@ struct tile_x_sizes { static constexpr int get_mmq_x_max_host(const int cc) { return int8_mma_available(cc) ? 128 : #ifdef GGML_CUDA_FORCE_MMQ - cc >= CC_VOLTA && cc < CC_OFFSET_AMD ? 128 : 64; + cc >= GGML_CUDA_CC_VOLTA && cc < GGML_CUDA_CC_OFFSET_AMD ? 128 : 64; #else - cc >= CC_VOLTA && cc < CC_OFFSET_AMD ? MMQ_DP4A_MAX_BATCH_SIZE : 64; + cc >= GGML_CUDA_CC_VOLTA && cc < GGML_CUDA_CC_OFFSET_AMD ? MMQ_DP4A_MAX_BATCH_SIZE : 64; #endif // GGML_CUDA_FORCE_MMQ } @@ -104,23 +104,23 @@ static constexpr __device__ int get_mmq_x_max_device() { return 128; #else // defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__) -#if __CUDA_ARCH__ >= CC_VOLTA +#if __CUDA_ARCH__ >= GGML_CUDA_CC_VOLTA #ifdef GGML_CUDA_FORCE_MMQ return MMQ_DP4A_MAX_BATCH_SIZE; #else // GGML_CUDA_FORCE_MMQ return 128; #endif // GGML_CUDA_FORCE_MMQ -#else // __CUDA_ARCH__ >= CC_VOLTA +#else // __CUDA_ARCH__ >= GGML_CUDA_CC_VOLTA return 64; -#endif // __CUDA_ARCH__ >= CC_VOLTA +#endif // __CUDA_ARCH__ >= GGML_CUDA_CC_VOLTA #endif // defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__) #endif // INT8_MMA_AVAILABLE } static constexpr int get_mmq_y_host(const int cc) { - return cc >= CC_OFFSET_AMD ? (cc == CC_RDNA1 ? 64 : 128) : (cc >= CC_VOLTA ? 128 : 64); + return cc >= GGML_CUDA_CC_OFFSET_AMD ? (cc == GGML_CUDA_CC_RDNA1 ? 64 : 128) : (cc >= GGML_CUDA_CC_VOLTA ? 128 : 64); } static constexpr __device__ int get_mmq_y_device() { @@ -131,11 +131,11 @@ static constexpr __device__ int get_mmq_y_device() { return 128; #endif // defined RDNA1 #else -#if __CUDA_ARCH__ >= CC_VOLTA +#if __CUDA_ARCH__ >= GGML_CUDA_CC_VOLTA return 128; #else return 64; -#endif // __CUDA_ARCH__ >= CC_VOLTA +#endif // __CUDA_ARCH__ >= GGML_CUDA_CC_VOLTA #endif // defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__) } @@ -2574,11 +2574,11 @@ template __launch_bounds__(WARP_SIZE*nwarps, 2) #endif // defined(RDNA3) || defined(RDNA2) || defined(CDNA) || defined(GCN) #else -#if __CUDA_ARCH__ >= CC_VOLTA +#if __CUDA_ARCH__ >= GGML_CUDA_CC_VOLTA __launch_bounds__(WARP_SIZE*nwarps, 1) #else __launch_bounds__(WARP_SIZE*nwarps, 2) -#endif // __CUDA_ARCH__ >= CC_VOLTA +#endif // __CUDA_ARCH__ >= GGML_CUDA_CC_VOLTA #endif // defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__) static __global__ void mul_mat_q( const char * __restrict__ x, const char * __restrict__ yc, float * __restrict__ dst, float * __restrict__ tmp_fixup, @@ -2594,7 +2594,7 @@ static __global__ void mul_mat_q( constexpr int mmq_y = get_mmq_y_device(); // On AMD or old CUDA the performance with stream-k was worse, use conventional tiling instead: -#if (defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__)) || __CUDA_ARCH__ < CC_VOLTA +#if (defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__)) || __CUDA_ARCH__ < GGML_CUDA_CC_VOLTA { constexpr bool fixup = false; mul_mat_q_process_tile @@ -2602,7 +2602,7 @@ static __global__ void mul_mat_q( blockIdx.x, blockIdx.y, 0, ne00/qk); return; } -#endif // (defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__)) || __CUDA_ARCH__ < CC_VOLTA +#endif // (defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__)) || __CUDA_ARCH__ < GGML_CUDA_CC_VOLTA const int64_t blocks_per_ne00 = ne00 / qk; constexpr int blocks_per_iter = MMQ_ITER_K / qk; @@ -2825,7 +2825,7 @@ void mul_mat_q_case(ggml_backend_cuda_context & ctx, const mmq_args & args, cuda const int mmq_x_max = get_mmq_x_max_host(cc); const int mmq_y = get_mmq_y_host(cc); const int block_num_y = (args.ne01 + mmq_y - 1) / mmq_y; - const bool use_stream_k = cc >= CC_VOLTA && cc < CC_OFFSET_AMD; + const bool use_stream_k = cc >= GGML_CUDA_CC_VOLTA && cc < GGML_CUDA_CC_OFFSET_AMD; int mmq_x_best = 0; int nparts_best = INT_MAX; diff --git a/ggml/src/ggml-cuda/mmvq.cu b/ggml/src/ggml-cuda/mmvq.cu index 02d1509836c40..e3b912d875b5b 100644 --- a/ggml/src/ggml-cuda/mmvq.cu +++ b/ggml/src/ggml-cuda/mmvq.cu @@ -142,7 +142,7 @@ static void mul_mat_vec_q_cuda( int64_t nwarps = 1; int64_t rows_per_cuda_block = 1; - if (ggml_cuda_info().devices[id].cc < CC_CDNA || ggml_cuda_info().devices[id].cc == CC_RDNA1) { // NVIDIA and AMD older than RDNA2 but not CDNA + if (ggml_cuda_info().devices[id].cc < GGML_CUDA_CC_CDNA || ggml_cuda_info().devices[id].cc == GGML_CUDA_CC_RDNA1) { // NVIDIA and AMD older than RDNA2 but not CDNA switch(ncols_y) { case 1: nwarps = 4; diff --git a/ggml/src/ggml-cuda/sum.cu b/ggml/src/ggml-cuda/sum.cu index 31cfe53941fe0..e0dafc1d2049d 100644 --- a/ggml/src/ggml-cuda/sum.cu +++ b/ggml/src/ggml-cuda/sum.cu @@ -3,8 +3,6 @@ #endif // !defined(GGML_USE_HIP) && !defined(GGML_USE_MUSA) && CUDART_VERSION >= 11700 #ifdef USE_CUB -// On Windows CUB uses libraries with variables called CC_PASCAL which conflict with the define in common.cuh. -// For this reason CUB must be included BEFORE anything else. #include using namespace cub; #endif // USE_CUB From 8d07314fe09bb9046964edd715436e62655cedf8 Mon Sep 17 00:00:00 2001 From: Bartowski Date: Tue, 10 Dec 2024 12:23:50 -0500 Subject: [PATCH 177/245] imatrix : Add imatrix to --no-context-shift (#10766) This allows for setting the --no-context-shift value in llama-imatrix which is required for models like DeepSeek --- common/arg.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/common/arg.cpp b/common/arg.cpp index 808ec1c3e65fc..49af31682510d 100644 --- a/common/arg.cpp +++ b/common/arg.cpp @@ -591,7 +591,7 @@ common_params_context common_params_parser_init(common_params & params, llama_ex [](common_params & params) { params.ctx_shift = false; } - ).set_examples({LLAMA_EXAMPLE_MAIN, LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_NO_CONTEXT_SHIFT")); + ).set_examples({LLAMA_EXAMPLE_MAIN, LLAMA_EXAMPLE_SERVER, LLAMA_EXAMPLE_IMATRIX}).set_env("LLAMA_ARG_NO_CONTEXT_SHIFT")); add_opt(common_arg( {"--chunks"}, "N", string_format("max number of chunks to process (default: %d, -1 = all)", params.n_chunks), From f4892ca472450d01f9fa2df6f841035548bd7e5e Mon Sep 17 00:00:00 2001 From: Eve <139727413+netrunnereve@users.noreply.github.com> Date: Tue, 10 Dec 2024 19:33:23 +0000 Subject: [PATCH 178/245] vulkan: dynamic subgroup size for the remaining k quants (#10745) * q5_k q4_k q3_k q2_k q6_k multi row example * revert as multi row isnt faster for k quants --- ggml/src/ggml-vulkan/ggml-vulkan.cpp | 30 ++++++++----------- .../vulkan-shaders/mul_mat_vec_base.comp | 2 -- .../vulkan-shaders/mul_mat_vec_q2_k.comp | 27 ++++++++++------- .../vulkan-shaders/mul_mat_vec_q3_k.comp | 27 ++++++++++------- .../vulkan-shaders/mul_mat_vec_q4_k.comp | 26 +++++++++------- .../vulkan-shaders/mul_mat_vec_q5_k.comp | 21 ++++++++----- 6 files changed, 72 insertions(+), 61 deletions(-) diff --git a/ggml/src/ggml-vulkan/ggml-vulkan.cpp b/ggml/src/ggml-vulkan/ggml-vulkan.cpp index ad5535c11d4a1..bb9572d76c7c7 100644 --- a/ggml/src/ggml-vulkan/ggml-vulkan.cpp +++ b/ggml/src/ggml-vulkan/ggml-vulkan.cpp @@ -44,12 +44,6 @@ #define MAX_VK_BUFFERS 256 -#ifndef K_QUANTS_PER_ITERATION -#define K_QUANTS_PER_ITERATION 1 -#else -static_assert(K_QUANTS_PER_ITERATION == 1 || K_QUANTS_PER_ITERATION == 2, "K_QUANTS_PER_ITERATION must be 1 or 2"); -#endif - #define VK_CHECK(err, msg) \ do { \ vk::Result err_ = (err); \ @@ -1792,10 +1786,10 @@ static void ggml_vk_load_shaders(vk_device& device) { ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_f32_f32[GGML_TYPE_Q5_0], "mul_mat_vec_q5_0_f32_f32", mul_mat_vec_q5_0_f32_f32_len, mul_mat_vec_q5_0_f32_f32_data, "main", 3, sizeof(vk_mat_vec_push_constants), {2, 1, 1}, {device->subgroup_size, 2}, 1, true); ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_f32_f32[GGML_TYPE_Q5_1], "mul_mat_vec_q5_1_f32_f32", mul_mat_vec_q5_1_f32_f32_len, mul_mat_vec_q5_1_f32_f32_data, "main", 3, sizeof(vk_mat_vec_push_constants), {2, 1, 1}, {device->subgroup_size, 2}, 1, true); ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_f32_f32[GGML_TYPE_Q8_0], "mul_mat_vec_q8_0_f32_f32", mul_mat_vec_q8_0_f32_f32_len, mul_mat_vec_q8_0_f32_f32_data, "main", 3, sizeof(vk_mat_vec_push_constants), {1, 1, 1}, {device->subgroup_size, 1}, 1, true); - ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_f32_f32[GGML_TYPE_Q2_K], "mul_mat_vec_q2_k_f32_f32", mul_mat_vec_q2_k_f32_f32_len, mul_mat_vec_q2_k_f32_f32_data, "main", 3, sizeof(vk_mat_vec_push_constants), {1, 1, 1}, {device->subgroup_size}, 1, true); - ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_f32_f32[GGML_TYPE_Q3_K], "mul_mat_vec_q3_k_f32_f32", mul_mat_vec_q3_k_f32_f32_len, mul_mat_vec_q3_k_f32_f32_data, "main", 3, sizeof(vk_mat_vec_push_constants), {1, 1, 1}, {device->subgroup_size}, 1, true); - ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_f32_f32[GGML_TYPE_Q4_K], "mul_mat_vec_q4_k_f32_f32", mul_mat_vec_q4_k_f32_f32_len, mul_mat_vec_q4_k_f32_f32_data, "main", 3, sizeof(vk_mat_vec_push_constants), {1, 1, 1}, {device->subgroup_size}, 1, true); - ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_f32_f32[GGML_TYPE_Q5_K], "mul_mat_vec_q5_k_f32_f32", mul_mat_vec_q5_k_f32_f32_len, mul_mat_vec_q5_k_f32_f32_data, "main", 3, sizeof(vk_mat_vec_push_constants), {1, 1, 1}, {device->subgroup_size}, 1, true); + ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_f32_f32[GGML_TYPE_Q2_K], "mul_mat_vec_q2_k_f32_f32", mul_mat_vec_q2_k_f32_f32_len, mul_mat_vec_q2_k_f32_f32_data, "main", 3, sizeof(vk_mat_vec_push_constants), {1, 1, 1}, {subgroup_size_16}, 1, true); + ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_f32_f32[GGML_TYPE_Q3_K], "mul_mat_vec_q3_k_f32_f32", mul_mat_vec_q3_k_f32_f32_len, mul_mat_vec_q3_k_f32_f32_data, "main", 3, sizeof(vk_mat_vec_push_constants), {1, 1, 1}, {subgroup_size_16}, 1, true); + ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_f32_f32[GGML_TYPE_Q4_K], "mul_mat_vec_q4_k_f32_f32", mul_mat_vec_q4_k_f32_f32_len, mul_mat_vec_q4_k_f32_f32_data, "main", 3, sizeof(vk_mat_vec_push_constants), {1, 1, 1}, {subgroup_size_16}, 1, true); + ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_f32_f32[GGML_TYPE_Q5_K], "mul_mat_vec_q5_k_f32_f32", mul_mat_vec_q5_k_f32_f32_len, mul_mat_vec_q5_k_f32_f32_data, "main", 3, sizeof(vk_mat_vec_push_constants), {1, 1, 1}, {subgroup_size_16}, 1, true); ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_f32_f32[GGML_TYPE_Q6_K], "mul_mat_vec_q6_k_f32_f32", mul_mat_vec_q6_k_f32_f32_len, mul_mat_vec_q6_k_f32_f32_data, "main", 3, sizeof(vk_mat_vec_push_constants), {1, 1, 1}, {subgroup_size_16}, 1, true); ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_f32_f32[GGML_TYPE_IQ4_NL], "mul_mat_vec_iq4_nl_f32_f32", mul_mat_vec_iq4_nl_f32_f32_len, mul_mat_vec_iq4_nl_f32_f32_data, "main", 3, sizeof(vk_mat_vec_push_constants), {2, 1, 1}, {device->subgroup_size, 2}, 1, true); @@ -1806,10 +1800,10 @@ static void ggml_vk_load_shaders(vk_device& device) { ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_f16_f32[GGML_TYPE_Q5_0], "mul_mat_vec_q5_0_f16_f32", mul_mat_vec_q5_0_f16_f32_len, mul_mat_vec_q5_0_f16_f32_data, "main", 3, sizeof(vk_mat_vec_push_constants), {2, 1, 1}, {device->subgroup_size, 2}, 1, true); ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_f16_f32[GGML_TYPE_Q5_1], "mul_mat_vec_q5_1_f16_f32", mul_mat_vec_q5_1_f16_f32_len, mul_mat_vec_q5_1_f16_f32_data, "main", 3, sizeof(vk_mat_vec_push_constants), {2, 1, 1}, {device->subgroup_size, 2}, 1, true); ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_f16_f32[GGML_TYPE_Q8_0], "mul_mat_vec_q8_0_f16_f32", mul_mat_vec_q8_0_f16_f32_len, mul_mat_vec_q8_0_f16_f32_data, "main", 3, sizeof(vk_mat_vec_push_constants), {1, 1, 1}, {device->subgroup_size, 1}, 1, true); - ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_f16_f32[GGML_TYPE_Q2_K], "mul_mat_vec_q2_k_f16_f32", mul_mat_vec_q2_k_f16_f32_len, mul_mat_vec_q2_k_f16_f32_data, "main", 3, sizeof(vk_mat_vec_push_constants), {1, 1, 1}, {device->subgroup_size}, 1, true); - ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_f16_f32[GGML_TYPE_Q3_K], "mul_mat_vec_q3_k_f16_f32", mul_mat_vec_q3_k_f16_f32_len, mul_mat_vec_q3_k_f16_f32_data, "main", 3, sizeof(vk_mat_vec_push_constants), {1, 1, 1}, {device->subgroup_size}, 1, true); - ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_f16_f32[GGML_TYPE_Q4_K], "mul_mat_vec_q4_k_f16_f32", mul_mat_vec_q4_k_f16_f32_len, mul_mat_vec_q4_k_f16_f32_data, "main", 3, sizeof(vk_mat_vec_push_constants), {1, 1, 1}, {device->subgroup_size}, 1, true); - ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_f16_f32[GGML_TYPE_Q5_K], "mul_mat_vec_q5_k_f16_f32", mul_mat_vec_q5_k_f16_f32_len, mul_mat_vec_q5_k_f16_f32_data, "main", 3, sizeof(vk_mat_vec_push_constants), {1, 1, 1}, {device->subgroup_size}, 1, true); + ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_f16_f32[GGML_TYPE_Q2_K], "mul_mat_vec_q2_k_f16_f32", mul_mat_vec_q2_k_f16_f32_len, mul_mat_vec_q2_k_f16_f32_data, "main", 3, sizeof(vk_mat_vec_push_constants), {1, 1, 1}, {subgroup_size_16}, 1, true); + ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_f16_f32[GGML_TYPE_Q3_K], "mul_mat_vec_q3_k_f16_f32", mul_mat_vec_q3_k_f16_f32_len, mul_mat_vec_q3_k_f16_f32_data, "main", 3, sizeof(vk_mat_vec_push_constants), {1, 1, 1}, {subgroup_size_16}, 1, true); + ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_f16_f32[GGML_TYPE_Q4_K], "mul_mat_vec_q4_k_f16_f32", mul_mat_vec_q4_k_f16_f32_len, mul_mat_vec_q4_k_f16_f32_data, "main", 3, sizeof(vk_mat_vec_push_constants), {1, 1, 1}, {subgroup_size_16}, 1, true); + ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_f16_f32[GGML_TYPE_Q5_K], "mul_mat_vec_q5_k_f16_f32", mul_mat_vec_q5_k_f16_f32_len, mul_mat_vec_q5_k_f16_f32_data, "main", 3, sizeof(vk_mat_vec_push_constants), {1, 1, 1}, {subgroup_size_16}, 1, true); ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_f16_f32[GGML_TYPE_Q6_K], "mul_mat_vec_q6_k_f16_f32", mul_mat_vec_q6_k_f16_f32_len, mul_mat_vec_q6_k_f16_f32_data, "main", 3, sizeof(vk_mat_vec_push_constants), {1, 1, 1}, {subgroup_size_16}, 1, true); ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_f16_f32[GGML_TYPE_IQ4_NL], "mul_mat_vec_iq4_nl_f16_f32", mul_mat_vec_iq4_nl_f16_f32_len, mul_mat_vec_iq4_nl_f16_f32_data, "main", 3, sizeof(vk_mat_vec_push_constants), {2, 1, 1}, {device->subgroup_size}, 1, true); @@ -1820,10 +1814,10 @@ static void ggml_vk_load_shaders(vk_device& device) { ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_id_f32[GGML_TYPE_Q5_0], "mul_mat_vec_id_q5_0_f32", mul_mat_vec_id_q5_0_f32_len, mul_mat_vec_id_q5_0_f32_data, "main", 4, sizeof(vk_mat_vec_id_push_constants), {2, 1, 1}, {device->subgroup_size, 2}, 1, true); ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_id_f32[GGML_TYPE_Q5_1], "mul_mat_vec_id_q5_1_f32", mul_mat_vec_id_q5_1_f32_len, mul_mat_vec_id_q5_1_f32_data, "main", 4, sizeof(vk_mat_vec_id_push_constants), {2, 1, 1}, {device->subgroup_size, 2}, 1, true); ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_id_f32[GGML_TYPE_Q8_0], "mul_mat_vec_id_q8_0_f32", mul_mat_vec_id_q8_0_f32_len, mul_mat_vec_id_q8_0_f32_data, "main", 4, sizeof(vk_mat_vec_id_push_constants), {1, 1, 1}, {device->subgroup_size, 1}, 1, true); - ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_id_f32[GGML_TYPE_Q2_K], "mul_mat_vec_id_q2_k_f32", mul_mat_vec_id_q2_k_f32_len, mul_mat_vec_id_q2_k_f32_data, "main", 4, sizeof(vk_mat_vec_id_push_constants), {1, 1, 1}, {device->subgroup_size}, 1, true); - ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_id_f32[GGML_TYPE_Q3_K], "mul_mat_vec_id_q3_k_f32", mul_mat_vec_id_q3_k_f32_len, mul_mat_vec_id_q3_k_f32_data, "main", 4, sizeof(vk_mat_vec_id_push_constants), {1, 1, 1}, {device->subgroup_size}, 1, true); - ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_id_f32[GGML_TYPE_Q4_K], "mul_mat_vec_id_q4_k_f32", mul_mat_vec_id_q4_k_f32_len, mul_mat_vec_id_q4_k_f32_data, "main", 4, sizeof(vk_mat_vec_id_push_constants), {1, 1, 1}, {device->subgroup_size}, 1, true); - ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_id_f32[GGML_TYPE_Q5_K], "mul_mat_vec_id_q5_k_f32", mul_mat_vec_id_q5_k_f32_len, mul_mat_vec_id_q5_k_f32_data, "main", 4, sizeof(vk_mat_vec_id_push_constants), {1, 1, 1}, {device->subgroup_size}, 1, true); + ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_id_f32[GGML_TYPE_Q2_K], "mul_mat_vec_id_q2_k_f32", mul_mat_vec_id_q2_k_f32_len, mul_mat_vec_id_q2_k_f32_data, "main", 4, sizeof(vk_mat_vec_id_push_constants), {1, 1, 1}, {subgroup_size_16}, 1, true); + ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_id_f32[GGML_TYPE_Q3_K], "mul_mat_vec_id_q3_k_f32", mul_mat_vec_id_q3_k_f32_len, mul_mat_vec_id_q3_k_f32_data, "main", 4, sizeof(vk_mat_vec_id_push_constants), {1, 1, 1}, {subgroup_size_16}, 1, true); + ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_id_f32[GGML_TYPE_Q4_K], "mul_mat_vec_id_q4_k_f32", mul_mat_vec_id_q4_k_f32_len, mul_mat_vec_id_q4_k_f32_data, "main", 4, sizeof(vk_mat_vec_id_push_constants), {1, 1, 1}, {subgroup_size_16}, 1, true); + ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_id_f32[GGML_TYPE_Q5_K], "mul_mat_vec_id_q5_k_f32", mul_mat_vec_id_q5_k_f32_len, mul_mat_vec_id_q5_k_f32_data, "main", 4, sizeof(vk_mat_vec_id_push_constants), {1, 1, 1}, {subgroup_size_16}, 1, true); ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_id_f32[GGML_TYPE_Q6_K], "mul_mat_vec_id_q6_k_f32", mul_mat_vec_id_q6_k_f32_len, mul_mat_vec_id_q6_k_f32_data, "main", 4, sizeof(vk_mat_vec_id_push_constants), {1, 1, 1}, {subgroup_size_16}, 1, true); ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_id_f32[GGML_TYPE_IQ4_NL], "mul_mat_vec_id_iq4_nl_f32", mul_mat_vec_id_iq4_nl_f32_len, mul_mat_vec_id_iq4_nl_f32_data, "main", 4, sizeof(vk_mat_vec_id_push_constants), {2, 1, 1}, {device->subgroup_size, 2}, 1, true); diff --git a/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vec_base.comp b/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vec_base.comp index 2ec1af5c75542..3894fca8260a5 100644 --- a/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vec_base.comp +++ b/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vec_base.comp @@ -2,8 +2,6 @@ #extension GL_EXT_shader_16bit_storage : require #extension GL_EXT_shader_8bit_storage : require -#define K_QUANTS_PER_ITERATION 2 - #ifdef MUL_MAT_ID #define EXPERT_COUNT 8 #endif diff --git a/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vec_q2_k.comp b/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vec_q2_k.comp index fcf02210ea6c7..1a5350d99ea14 100644 --- a/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vec_q2_k.comp +++ b/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vec_q2_k.comp @@ -3,9 +3,11 @@ #include "mul_mat_vec_base.comp" -layout(local_size_x = 32, local_size_y = 1, local_size_z = 1) in; +layout(local_size_x_id = 0, local_size_y = 1, local_size_z = 1) in; -shared FLOAT_TYPE tmp[32]; +layout (constant_id = 0) const uint BLOCK_SIZE = 32; + +shared FLOAT_TYPE tmp[BLOCK_SIZE]; void main() { const uint row = gl_WorkGroupID.x + gl_NumWorkGroups.x * gl_WorkGroupID.z; @@ -20,22 +22,25 @@ void main() { const uint num_blocks_per_row = p.ncols / QUANT_K; const uint ib0 = a_offset / QUANT_K + row*num_blocks_per_row; - const uint tid = gl_LocalInvocationID.x/K_QUANTS_PER_ITERATION; // 0...31 or 0...16 - const uint ix = gl_LocalInvocationID.x%K_QUANTS_PER_ITERATION; // 0 or 0, 1 + // 16 threads are used to process each block + const uint it_size = gl_WorkGroupSize.x/16; + const uint tid = gl_LocalInvocationID.x; + const uint itid = tid%16; // 0...16 + const uint ix = tid/16; - const uint step = 16/K_QUANTS_PER_ITERATION; // 16 or 8 + const uint step = 8; - const uint v_im = tid/step; // 0 or 1. 0 computes 0..., 1 computes 128... - const uint v_in = tid - step*v_im; // 0...15 or 0...7 + const uint v_im = itid/step; // 0 or 1. 0 computes 0..., 1 computes 128... + const uint v_in = itid - step*v_im; // 0...15 or 0...7 - const uint l0 = K_QUANTS_PER_ITERATION*v_in; // 0...15 + const uint l0 = 2*v_in; // 0...15 const uint q_offset = 32*v_im + l0; const uint s_offset = 8*v_im; const uint y_offset = 128*v_im + l0; FLOAT_TYPE temp = FLOAT_TYPE(0.0); // partial sum for thread in warp - [[unroll]] for (uint i = ix; i < num_blocks_per_row; i += K_QUANTS_PER_ITERATION) { + [[unroll]] for (uint i = ix; i < num_blocks_per_row; i += it_size) { const uint y_idx = i * QUANT_K + y_offset; f16vec2 d = data_a[ib0 + i].d; @@ -71,7 +76,7 @@ void main() { FLOAT_TYPE sum1 = FLOAT_TYPE(0.0); FLOAT_TYPE sum2 = FLOAT_TYPE(0.0); - [[unroll]] for (int l = 0; l < K_QUANTS_PER_ITERATION; ++l) { + [[unroll]] for (int l = 0; l < 2; ++l) { sum1 = fma(FLOAT_TYPE(b0[l]), FLOAT_TYPE(s0_lo4[0]) * FLOAT_TYPE((qs0[l] >> 0) & 3), fma(FLOAT_TYPE(b16[l]), FLOAT_TYPE(s0_lo4[1]) * FLOAT_TYPE((qs16[l] >> 0) & 3), fma(FLOAT_TYPE(b32[l]), FLOAT_TYPE(s0_lo4[2]) * FLOAT_TYPE((qs0[l] >> 2) & 3), @@ -96,7 +101,7 @@ void main() { // sum up partial sums and write back result barrier(); - [[unroll]] for (uint s = 16; s > 0; s >>= 1) { + [[unroll]] for (uint s = gl_WorkGroupSize.x/2; s > 0; s >>= 1) { if (tid < s) { tmp[tid] += tmp[tid + s]; } diff --git a/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vec_q3_k.comp b/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vec_q3_k.comp index 723fadde0d52b..b19c3811136bb 100644 --- a/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vec_q3_k.comp +++ b/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vec_q3_k.comp @@ -3,9 +3,11 @@ #include "mul_mat_vec_base.comp" -layout(local_size_x = 32, local_size_y = 1, local_size_z = 1) in; +layout(local_size_x_id = 0, local_size_y = 1, local_size_z = 1) in; -shared FLOAT_TYPE tmp[32]; +layout (constant_id = 0) const uint BLOCK_SIZE = 32; + +shared FLOAT_TYPE tmp[BLOCK_SIZE]; void main() { const uint row = gl_WorkGroupID.x + gl_NumWorkGroups.x * gl_WorkGroupID.z; @@ -20,17 +22,20 @@ void main() { const uint num_blocks_per_row = p.ncols / QUANT_K; const uint ib0 = a_offset / QUANT_K + row*num_blocks_per_row; - const uint tid = gl_LocalInvocationID.x/K_QUANTS_PER_ITERATION; // 0...31 or 0...16 - const uint ix = gl_LocalInvocationID.x%K_QUANTS_PER_ITERATION; // 0 or 0, 1 + // 16 threads are used to process each block + const uint it_size = gl_WorkGroupSize.x/16; + const uint tid = gl_LocalInvocationID.x; + const uint itid = tid%16; // 0...16 + const uint ix = tid/16; - const uint step = 16/K_QUANTS_PER_ITERATION; // 16 or 8 + const uint step = 8; - const uint v_im = tid/step; // 0 or 1. 0 computes 0..., 1 computes 128... - const uint v_in = tid - step*v_im; // 0...15 or 0...7 + const uint v_im = itid/step; // 0 or 1. 0 computes 0..., 1 computes 128... + const uint v_in = itid - step*v_im; // 0...15 or 0...7 const uint8_t m = uint8_t(1 << (4 * v_im)); - const uint l0 = K_QUANTS_PER_ITERATION*v_in; // 0...15 + const uint l0 = 2*v_in; // 0...15 const uint q_offset = 32*v_im + l0; const uint y_offset = 128*v_im + l0; @@ -38,7 +43,7 @@ void main() { const uint s_shift = 4 * v_im; - [[unroll]] for (uint i = ix; i < num_blocks_per_row; i += K_QUANTS_PER_ITERATION) { + [[unroll]] for (uint i = ix; i < num_blocks_per_row; i += it_size) { const uint y_idx = i * QUANT_K + y_offset; const FLOAT_TYPE d = FLOAT_TYPE(data_a[ib0 + i].d); @@ -66,7 +71,7 @@ void main() { u8vec2 s10 = unpack8(s10_16); FLOAT_TYPE sum = FLOAT_TYPE(0.0); - for (int l = 0; l < K_QUANTS_PER_ITERATION; ++l) { + [[unroll]] for (int l = 0; l < 2; ++l) { sum = fma(FLOAT_TYPE(b0[l]) * FLOAT_TYPE(int8_t(((s0[0] >> s_shift) & 0xF) | ((s8[0] >> (s_shift + 0) & 0x3) << 4)) - 32), FLOAT_TYPE(((data_a[ib0 + i].qs[q_offset + l ] ) & 3) - (((data_a[ib0 + i].hmask[l0 + l ] & (m << 0)) != 0) ? 0 : 4)), fma(FLOAT_TYPE(b32[l]) * FLOAT_TYPE(int8_t(((s2[0] >> s_shift) & 0xF) | ((s10[0] >> (s_shift + 0) & 0x3) << 4)) - 32), FLOAT_TYPE(((data_a[ib0 + i].qs[q_offset + l ] >> 2) & 3) - (((data_a[ib0 + i].hmask[l0 + l ] & (m << 1)) != 0) ? 0 : 4)), fma(FLOAT_TYPE(b64[l]) * FLOAT_TYPE(int8_t(((s4[0] >> s_shift) & 0xF) | ((s8[0] >> (s_shift + 2) & 0x3) << 4)) - 32), FLOAT_TYPE(((data_a[ib0 + i].qs[q_offset + l ] >> 4) & 3) - (((data_a[ib0 + i].hmask[l0 + l ] & (m << 2)) != 0) ? 0 : 4)), @@ -83,7 +88,7 @@ void main() { // sum up partial sums and write back result barrier(); - [[unroll]] for (uint s = 16; s > 0; s >>= 1) { + [[unroll]] for (uint s = gl_WorkGroupSize.x/2; s > 0; s >>= 1) { if (tid < s) { tmp[tid] += tmp[tid + s]; } diff --git a/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vec_q4_k.comp b/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vec_q4_k.comp index 5846f2e86f3e3..b86d28589c64c 100644 --- a/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vec_q4_k.comp +++ b/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vec_q4_k.comp @@ -4,11 +4,12 @@ #include "mul_mat_vec_base.comp" -layout(local_size_x = 32, local_size_y = 1, local_size_z = 1) in; +layout(local_size_x_id = 0, local_size_y = 1, local_size_z = 1) in; -shared FLOAT_TYPE tmp[32]; +layout (constant_id = 0) const uint BLOCK_SIZE = 32; + +shared FLOAT_TYPE tmp[BLOCK_SIZE]; -// This shader assumes K_QUANTS_PER_ITERATION == 2 for alignment of loads void main() { const uint row = gl_WorkGroupID.x + gl_NumWorkGroups.x * gl_WorkGroupID.z; @@ -22,14 +23,17 @@ void main() { const uint num_blocks_per_row = p.ncols / QUANT_K; const uint ib0 = a_offset / QUANT_K + row*num_blocks_per_row; - const uint tid = gl_LocalInvocationID.x/K_QUANTS_PER_ITERATION; // 0...31 or 0...16 - const uint ix = gl_LocalInvocationID.x%K_QUANTS_PER_ITERATION; // 0 or 0, 1 + // 16 threads are used to process each block + const uint it_size = gl_WorkGroupSize.x/16; + const uint tid = gl_LocalInvocationID.x; + const uint itid = tid%16; // 0...16 + const uint ix = tid/16; - const uint step = 8/K_QUANTS_PER_ITERATION; // 8 or 4 + const uint step = 4; - const uint il = tid/step; // 0...3 - const uint ir = tid - step*il; // 0...7 or 0...3 - const uint n = 2 * K_QUANTS_PER_ITERATION; // 2 or 4 + const uint il = itid/step; // 0...3 + const uint ir = itid - step*il; // 0...7 or 0...3 + const uint n = 4; const uint v_im = il / 2; // 0 or 1. 0 computes 0,32 + 128,160, 1 computes 64,96 + 192,224 const uint v_in = il % 2; @@ -40,7 +44,7 @@ void main() { FLOAT_TYPE temp = FLOAT_TYPE(0.0); // partial sum for thread in warp - [[unroll]] for (uint i = ix; i < num_blocks_per_row; i += K_QUANTS_PER_ITERATION) { + [[unroll]] for (uint i = ix; i < num_blocks_per_row; i += it_size) { const uint y1_idx = i * QUANT_K + y_offset; const uint y2_idx = y1_idx + 128; @@ -115,7 +119,7 @@ void main() { // sum up partial sums and write back result barrier(); - [[unroll]] for (uint s = 16; s > 0; s >>= 1) { + [[unroll]] for (uint s = gl_WorkGroupSize.x/2; s > 0; s >>= 1) { if (tid < s) { tmp[tid] += tmp[tid + s]; } diff --git a/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vec_q5_k.comp b/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vec_q5_k.comp index b455cbd31ec91..fd243cf9161fc 100644 --- a/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vec_q5_k.comp +++ b/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vec_q5_k.comp @@ -4,9 +4,11 @@ #include "mul_mat_vec_base.comp" -layout(local_size_x = 32, local_size_y = 1, local_size_z = 1) in; +layout(local_size_x_id = 0, local_size_y = 1, local_size_z = 1) in; -shared FLOAT_TYPE tmp[32]; +layout (constant_id = 0) const uint BLOCK_SIZE = 32; + +shared FLOAT_TYPE tmp[BLOCK_SIZE]; void main() { const uint row = gl_WorkGroupID.x + gl_NumWorkGroups.x * gl_WorkGroupID.z; @@ -21,11 +23,14 @@ void main() { const uint num_blocks_per_row = p.ncols / QUANT_K; const uint ib0 = a_offset / QUANT_K + row*num_blocks_per_row; - const uint tid = gl_LocalInvocationID.x/2; // 0...31 or 0...16 - const uint ix = gl_LocalInvocationID.x%2; // 0 or 0, 1 + // 16 threads are used to process each block + const uint it_size = gl_WorkGroupSize.x/16; + const uint tid = gl_LocalInvocationID.x; + const uint itid = tid%16; // 0...16 + const uint ix = tid/16; - const uint il = tid/4; // 0...3 - const uint ir = tid - 4*il; // 0...7 or 0...3 + const uint il = itid/4; // 0...3 + const uint ir = itid - 4*il; // 0...7 or 0...3 const uint v_im = il / 2; // 0 or 1. 0 computes 0,32 + 128,160, 1 computes 64,96 + 192,224 const uint v_in = il % 2; @@ -36,7 +41,7 @@ void main() { FLOAT_TYPE temp = FLOAT_TYPE(0.0); // partial sum for thread in warp - [[unroll]] for (uint i = ix; i < num_blocks_per_row; i += 2) { + [[unroll]] for (uint i = ix; i < num_blocks_per_row; i += it_size) { const uint y1_idx = i * QUANT_K + y_offset; const uint y2_idx = y1_idx + 128; @@ -143,7 +148,7 @@ void main() { // sum up partial sums and write back result barrier(); - [[unroll]] for (uint s = 16; s > 0; s >>= 1) { + [[unroll]] for (uint s = gl_WorkGroupSize.x/2; s > 0; s >>= 1) { if (tid < s) { tmp[tid] += tmp[tid + s]; } From aeab6a70f378c737dca413f62bc1cdf33828f1a1 Mon Sep 17 00:00:00 2001 From: Jeff Bolz Date: Tue, 10 Dec 2024 14:23:17 -0600 Subject: [PATCH 179/245] vulkan: request round-to-even for fp16 in im2col/rope_head (#10767) Vulkan doesn't mandate a specific rounding mode, but the shader_float_controls feature allows rounding mode to be requested if the implementation supports it. --- ggml/src/ggml-vulkan/ggml-vulkan.cpp | 23 +++++++++++++++---- .../ggml-vulkan/vulkan-shaders/im2col.comp | 5 ++++ .../ggml-vulkan/vulkan-shaders/rope_head.comp | 5 ++++ .../vulkan-shaders/vulkan-shaders-gen.cpp | 3 +++ 4 files changed, 31 insertions(+), 5 deletions(-) diff --git a/ggml/src/ggml-vulkan/ggml-vulkan.cpp b/ggml/src/ggml-vulkan/ggml-vulkan.cpp index bb9572d76c7c7..a8ae58ee2ce85 100644 --- a/ggml/src/ggml-vulkan/ggml-vulkan.cpp +++ b/ggml/src/ggml-vulkan/ggml-vulkan.cpp @@ -162,6 +162,7 @@ struct vk_device_struct { uint32_t subgroup_size; uint32_t shader_core_count; bool uma; + bool float_controls_rte_fp16; bool coopmat2; bool coopmat_support; @@ -1916,17 +1917,26 @@ static void ggml_vk_load_shaders(vk_device& device) { ggml_vk_create_pipeline(device, device->pipeline_soft_max_f32_f16_wg512, "soft_max_f32_f16_wg512", soft_max_f32_f16_len, soft_max_f32_f16_data, "main", 3, sizeof(vk_op_soft_max_push_constants), {1, 1, 1}, { 512 }, 1); ggml_vk_create_pipeline(device, device->pipeline_rope_norm_f32, "rope_norm_f32", rope_norm_f32_len, rope_norm_f32_data, "main", 4, sizeof(vk_op_rope_push_constants), {1, 512, 1}, {}, 1); - ggml_vk_create_pipeline(device, device->pipeline_rope_norm_f16, "rope_norm_f16", rope_norm_f16_len, rope_norm_f16_data, "main", 4, sizeof(vk_op_rope_push_constants), {1, 512, 1}, {}, 1); - ggml_vk_create_pipeline(device, device->pipeline_rope_neox_f32, "rope_neox_f32", rope_neox_f32_len, rope_neox_f32_data, "main", 4, sizeof(vk_op_rope_push_constants), {1, 512, 1}, {}, 1); - ggml_vk_create_pipeline(device, device->pipeline_rope_neox_f16, "rope_neox_f16", rope_neox_f16_len, rope_neox_f16_data, "main", 4, sizeof(vk_op_rope_push_constants), {1, 512, 1}, {}, 1); + + if (device->float_controls_rte_fp16) { + ggml_vk_create_pipeline(device, device->pipeline_rope_norm_f16, "rope_norm_f16", rope_norm_f16_rte_len, rope_norm_f16_rte_data, "main", 4, sizeof(vk_op_rope_push_constants), {1, 512, 1}, {}, 1); + ggml_vk_create_pipeline(device, device->pipeline_rope_neox_f16, "rope_neox_f16", rope_neox_f16_rte_len, rope_neox_f16_rte_data, "main", 4, sizeof(vk_op_rope_push_constants), {1, 512, 1}, {}, 1); + } else { + ggml_vk_create_pipeline(device, device->pipeline_rope_norm_f16, "rope_norm_f16", rope_norm_f16_len, rope_norm_f16_data, "main", 4, sizeof(vk_op_rope_push_constants), {1, 512, 1}, {}, 1); + ggml_vk_create_pipeline(device, device->pipeline_rope_neox_f16, "rope_neox_f16", rope_neox_f16_len, rope_neox_f16_data, "main", 4, sizeof(vk_op_rope_push_constants), {1, 512, 1}, {}, 1); + } ggml_vk_create_pipeline(device, device->pipeline_argsort_f32, "argsort_f32", argsort_f32_len, argsort_f32_data, "main", 2, sizeof(vk_op_argsort_push_constants), {1024, 1, 1}, {}, 1); ggml_vk_create_pipeline(device, device->pipeline_sum_rows_f32, "sum_rows_f32", sum_rows_f32_len, sum_rows_f32_data, "main", 2, sizeof(vk_op_push_constants), {1, 1, 1}, { device->subgroup_size }, 1); ggml_vk_create_pipeline(device, device->pipeline_im2col_f32, "im2col_f32", im2col_f32_len, im2col_f32_data, "main", 2, sizeof(vk_op_im2col_push_constants), {256, 1, 1}, {}, 1); - ggml_vk_create_pipeline(device, device->pipeline_im2col_f32_f16, "im2col_f32_f16", im2col_f32_f16_len, im2col_f32_f16_data, "main", 2, sizeof(vk_op_im2col_push_constants), {256, 1, 1}, {}, 1); + if (device->float_controls_rte_fp16) { + ggml_vk_create_pipeline(device, device->pipeline_im2col_f32_f16, "im2col_f32_f16", im2col_f32_f16_rte_len, im2col_f32_f16_rte_data, "main", 2, sizeof(vk_op_im2col_push_constants), {256, 1, 1}, {}, 1); + } else { + ggml_vk_create_pipeline(device, device->pipeline_im2col_f32_f16, "im2col_f32_f16", im2col_f32_f16_len, im2col_f32_f16_data, "main", 2, sizeof(vk_op_im2col_push_constants), {256, 1, 1}, {}, 1); + } ggml_vk_create_pipeline(device, device->pipeline_timestep_embedding_f32, "timestep_embedding_f32", timestep_embedding_f32_len, timestep_embedding_f32_data, "main", 2, sizeof(vk_op_timestep_embedding_push_constants), {256, 1, 1}, {}, 1); @@ -2007,11 +2017,13 @@ static vk_device ggml_vk_get_device(size_t idx) { vk::PhysicalDeviceDriverProperties driver_props; vk::PhysicalDeviceShaderSMBuiltinsPropertiesNV sm_props; vk::PhysicalDeviceShaderCoreProperties2AMD amd_shader_core_properties2_props; + vk::PhysicalDeviceVulkan12Properties vk12_props; props2.pNext = &props3; props3.pNext = &subgroup_props; subgroup_props.pNext = &driver_props; + driver_props.pNext = &vk12_props; - VkBaseOutStructure * last_struct = (VkBaseOutStructure *)&driver_props; + VkBaseOutStructure * last_struct = (VkBaseOutStructure *)&vk12_props; if (maintenance4_support) { last_struct->pNext = (VkBaseOutStructure *)&props4; @@ -2057,6 +2069,7 @@ static vk_device ggml_vk_get_device(size_t idx) { } else { device->shader_core_count = 0; } + device->float_controls_rte_fp16 = vk12_props.shaderRoundingModeRTEFloat16; const bool force_disable_f16 = getenv("GGML_VK_DISABLE_F16") != nullptr; diff --git a/ggml/src/ggml-vulkan/vulkan-shaders/im2col.comp b/ggml/src/ggml-vulkan/vulkan-shaders/im2col.comp index 4d48610a3adcb..966fedf8fa7a0 100644 --- a/ggml/src/ggml-vulkan/vulkan-shaders/im2col.comp +++ b/ggml/src/ggml-vulkan/vulkan-shaders/im2col.comp @@ -1,6 +1,11 @@ #version 450 #extension GL_EXT_shader_16bit_storage : require +#extension GL_EXT_spirv_intrinsics: enable + +#if RTE16 +spirv_execution_mode(capabilities = [4467], 4462, 16); // RoundingModeRTE, 16 bits +#endif layout (push_constant) uniform parameter { diff --git a/ggml/src/ggml-vulkan/vulkan-shaders/rope_head.comp b/ggml/src/ggml-vulkan/vulkan-shaders/rope_head.comp index ea89542261cc4..574b51ca55389 100644 --- a/ggml/src/ggml-vulkan/vulkan-shaders/rope_head.comp +++ b/ggml/src/ggml-vulkan/vulkan-shaders/rope_head.comp @@ -1,6 +1,11 @@ #include "types.comp" #extension GL_EXT_shader_16bit_storage : require +#extension GL_EXT_spirv_intrinsics: enable + +#if RTE16 +spirv_execution_mode(capabilities = [4467], 4462, 16); // RoundingModeRTE, 16 bits +#endif layout(local_size_x = 1, local_size_y = 256, local_size_z = 1) in; diff --git a/ggml/src/ggml-vulkan/vulkan-shaders/vulkan-shaders-gen.cpp b/ggml/src/ggml-vulkan/vulkan-shaders/vulkan-shaders-gen.cpp index ee35208337463..c48a228aef65d 100644 --- a/ggml/src/ggml-vulkan/vulkan-shaders/vulkan-shaders-gen.cpp +++ b/ggml/src/ggml-vulkan/vulkan-shaders/vulkan-shaders-gen.cpp @@ -461,9 +461,11 @@ void process_shaders() { string_to_spv("rope_norm_f32", "rope_norm.comp", {{"A_TYPE", "float"}, {"D_TYPE", "float"}}); string_to_spv("rope_norm_f16", "rope_norm.comp", {{"A_TYPE", "float16_t"}, {"D_TYPE", "float16_t"}}); + string_to_spv("rope_norm_f16_rte", "rope_norm.comp", {{"A_TYPE", "float16_t"}, {"D_TYPE", "float16_t"}, {"RTE16", "1"}}); string_to_spv("rope_neox_f32", "rope_neox.comp", {{"A_TYPE", "float"}, {"D_TYPE", "float"}}); string_to_spv("rope_neox_f16", "rope_neox.comp", {{"A_TYPE", "float16_t"}, {"D_TYPE", "float16_t"}}); + string_to_spv("rope_neox_f16_rte", "rope_neox.comp", {{"A_TYPE", "float16_t"}, {"D_TYPE", "float16_t"}, {"RTE16", "1"}}); string_to_spv("argsort_f32", "argsort.comp", {{"A_TYPE", "float"}}); @@ -471,6 +473,7 @@ void process_shaders() { string_to_spv("im2col_f32", "im2col.comp", merge_maps(base_dict, {{"A_TYPE", "float"}, {"D_TYPE", "float"}})); string_to_spv("im2col_f32_f16", "im2col.comp", merge_maps(base_dict, {{"A_TYPE", "float"}, {"D_TYPE", "float16_t"}})); + string_to_spv("im2col_f32_f16_rte", "im2col.comp", merge_maps(base_dict, {{"A_TYPE", "float"}, {"D_TYPE", "float16_t"}, {"RTE16", "1"}})); string_to_spv("timestep_embedding_f32", "timestep_embedding.comp", merge_maps(base_dict, {{"A_TYPE", "float"}, {"D_TYPE", "float"}})); From 6c52736b61a35c7744545e9d2ea6801156ec9628 Mon Sep 17 00:00:00 2001 From: "Gilad S." <7817232+giladgd@users.noreply.github.com> Date: Wed, 11 Dec 2024 02:47:21 +0200 Subject: [PATCH 180/245] ggml: load all backends from a user-provided search path (#10699) * feat: load all backends from a user-provided search path * fix: Windows search path * refactor: rename `ggml_backend_load_all_in_search_path` to `ggml_backend_load_all_from_path` * refactor: rename `search_path` to `dir_path` * fix: change `NULL` to `nullptr` Co-authored-by: Diego Devesa * fix: change `NULL` to `nullptr` --------- Co-authored-by: Diego Devesa --- ggml/include/ggml-backend.h | 1 + ggml/src/ggml-backend-reg.cpp | 40 +++++++++++++++++++++++------------ 2 files changed, 28 insertions(+), 13 deletions(-) diff --git a/ggml/include/ggml-backend.h b/ggml/include/ggml-backend.h index 19881a5059f17..7221a08309274 100644 --- a/ggml/include/ggml-backend.h +++ b/ggml/include/ggml-backend.h @@ -228,6 +228,7 @@ extern "C" { GGML_API void ggml_backend_unload(ggml_backend_reg_t reg); // Load all known backends from dynamic libraries GGML_API void ggml_backend_load_all(void); + GGML_API void ggml_backend_load_all_from_path(const char * dir_path); // // Backend scheduler diff --git a/ggml/src/ggml-backend-reg.cpp b/ggml/src/ggml-backend-reg.cpp index 5cb0fb9d159db..2e7340145eed2 100644 --- a/ggml/src/ggml-backend-reg.cpp +++ b/ggml/src/ggml-backend-reg.cpp @@ -449,11 +449,21 @@ static std::string backend_filename_suffix() { #endif } -static ggml_backend_reg_t ggml_backend_load_best(const char * name, bool silent) { +static ggml_backend_reg_t ggml_backend_load_best(const char * name, bool silent, const char * user_search_path) { // enumerate all the files that match [lib]ggml-name-*.[so|dll] in the search paths // TODO: search system paths - std::vector search_paths = { "./", get_executable_path() }; std::string file_prefix = backend_filename_prefix() + name + "-"; + std::vector search_paths; + if (user_search_path == nullptr) { + search_paths.push_back("./"); + search_paths.push_back(get_executable_path()); + } else { +#if defined(_WIN32) + search_paths.push_back(std::string(user_search_path) + "\\"); +#else + search_paths.push_back(std::string(user_search_path) + "/"); +#endif + } int best_score = 0; std::string best_path; @@ -509,21 +519,25 @@ static ggml_backend_reg_t ggml_backend_load_best(const char * name, bool silent) } void ggml_backend_load_all() { + ggml_backend_load_all_from_path(nullptr); +} + +void ggml_backend_load_all_from_path(const char * dir_path) { #ifdef NDEBUG bool silent = true; #else bool silent = false; #endif - ggml_backend_load_best("blas", silent); - ggml_backend_load_best("cann", silent); - ggml_backend_load_best("cuda", silent); - ggml_backend_load_best("hip", silent); - ggml_backend_load_best("kompute", silent); - ggml_backend_load_best("metal", silent); - ggml_backend_load_best("rpc", silent); - ggml_backend_load_best("sycl", silent); - ggml_backend_load_best("vulkan", silent); - ggml_backend_load_best("musa", silent); - ggml_backend_load_best("cpu", silent); + ggml_backend_load_best("blas", silent, dir_path); + ggml_backend_load_best("cann", silent, dir_path); + ggml_backend_load_best("cuda", silent, dir_path); + ggml_backend_load_best("hip", silent, dir_path); + ggml_backend_load_best("kompute", silent, dir_path); + ggml_backend_load_best("metal", silent, dir_path); + ggml_backend_load_best("rpc", silent, dir_path); + ggml_backend_load_best("sycl", silent, dir_path); + ggml_backend_load_best("vulkan", silent, dir_path); + ggml_backend_load_best("musa", silent, dir_path); + ggml_backend_load_best("cpu", silent, dir_path); } From d3e0eee3c8a457e3b3db21c1b6741ba76641ed01 Mon Sep 17 00:00:00 2001 From: CentricStorm Date: Wed, 11 Dec 2024 10:47:43 +0000 Subject: [PATCH 181/245] docs: fix server documentation formatting (#10776) --- examples/server/README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/examples/server/README.md b/examples/server/README.md index 6294f541fc7d7..2bd23d1a6de2c 100644 --- a/examples/server/README.md +++ b/examples/server/README.md @@ -442,7 +442,7 @@ These words will not be included in the completion, so make sure to add them to `samplers`: The order the samplers should be applied in. An array of strings representing sampler type names. If a sampler is not set, it will not be used. If a sampler is specified more than once, it will be applied multiple times. Default: `["dry", "top_k", "typ_p", "top_p", "min_p", "xtc", "temperature"]` - these are all the available values. - `timings_per_token`: Include prompt processing and text generation speed information in each response. Default: `false` +`timings_per_token`: Include prompt processing and text generation speed information in each response. Default: `false` **Response format** From c8d98b0df8711d82085cfaee6bf2193b508445ce Mon Sep 17 00:00:00 2001 From: kallewoof Date: Wed, 11 Dec 2024 22:48:04 +0900 Subject: [PATCH 182/245] bug-fix: snprintf prints NULL in place of the last character (#10419) * bug-fix: snprintf prints NULL in place of the last character We need to give snprintf enough space to print the last character and the null character, thus we allocate one extra byte and then ignore it when converting to std::string. * add comment about extra null-term byte requirement --- examples/server/utils.hpp | 2 +- include/llama.h | 1 + 2 files changed, 2 insertions(+), 1 deletion(-) diff --git a/examples/server/utils.hpp b/examples/server/utils.hpp index 8f545aea52dc4..2fcb895abe7e4 100644 --- a/examples/server/utils.hpp +++ b/examples/server/utils.hpp @@ -333,7 +333,7 @@ static std::string llama_get_chat_template(const struct llama_model * model) { if (res < 2) { return ""; } else { - std::vector model_template(res, 0); + std::vector model_template(res + 1, 0); llama_model_meta_val_str(model, template_key.c_str(), model_template.data(), model_template.size()); return std::string(model_template.data(), model_template.size() - 1); } diff --git a/include/llama.h b/include/llama.h index 36945cde335dc..eebbacb803454 100644 --- a/include/llama.h +++ b/include/llama.h @@ -456,6 +456,7 @@ extern "C" { // Functions to access the model's GGUF metadata scalar values // - The functions return the length of the string on success, or -1 on failure // - The output string is always null-terminated and cleared on failure + // - When retrieving a string, an extra byte must be allocated to account for the null terminator // - GGUF array values are not supported by these functions // Get metadata value as a string by key name From 2c0b614af21c1968865e8c19900dbc5f5442399c Mon Sep 17 00:00:00 2001 From: Xuan Son Nguyen Date: Wed, 11 Dec 2024 14:59:41 +0100 Subject: [PATCH 183/245] ci : pin nodejs to 22.11.0 (#10779) --- .github/workflows/server.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/server.yml b/.github/workflows/server.yml index 9e66fb68cffe8..671fe595cdfcb 100644 --- a/.github/workflows/server.yml +++ b/.github/workflows/server.yml @@ -79,7 +79,7 @@ jobs: # Setup nodejs (to be used for verifying bundled index.html) - uses: actions/setup-node@v4 with: - node-version: 22 + node-version: '22.11.0' - name: Verify bundled index.html id: verify_server_index_html From 1a47802598a6b42fc734f1d8dc678b3b81528136 Mon Sep 17 00:00:00 2001 From: qingy1337 Date: Wed, 11 Dec 2024 07:16:32 -0800 Subject: [PATCH 184/245] Update README.md (#10772) --- examples/quantize/README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/examples/quantize/README.md b/examples/quantize/README.md index 5d1e11c67b13f..f9cce7b213334 100644 --- a/examples/quantize/README.md +++ b/examples/quantize/README.md @@ -81,7 +81,7 @@ Several quantization methods are supported. They differ in the resulting model d - [#4930 - imatrix for all k-quants](https://github.com/ggerganov/llama.cpp/pull/4930) - [#4951 - imatrix on the GPU](https://github.com/ggerganov/llama.cpp/pull/4957) - [#4969 - imatrix for legacy quants](https://github.com/ggerganov/llama.cpp/pull/4969) - - [#4996 - k-qunats tuning](https://github.com/ggerganov/llama.cpp/pull/4996) + - [#4996 - k-quants tuning](https://github.com/ggerganov/llama.cpp/pull/4996) - [#5060 - Q3_K_XS](https://github.com/ggerganov/llama.cpp/pull/5060) - [#5196 - 3-bit i-quants](https://github.com/ggerganov/llama.cpp/pull/5196) - [quantization tuning](https://github.com/ggerganov/llama.cpp/pull/5320), [another one](https://github.com/ggerganov/llama.cpp/pull/5334), and [another one](https://github.com/ggerganov/llama.cpp/pull/5361) From 367e86b7ba5fe943b36140339a6da4a64134af98 Mon Sep 17 00:00:00 2001 From: Xuan Son Nguyen Date: Wed, 11 Dec 2024 20:52:14 +0100 Subject: [PATCH 185/245] server : (UI) add tok/s, get rid of completion.js (#10786) * get rid of completion.js * extract chat bubble to a component * add tok/s info * sync * fix BASE_URL * only extract timings when it's enabled * fix auto scroll --- examples/server/public/index.html | 198 ++++++++++++--------- examples/server/webui/index.html | 124 ++++++++----- examples/server/webui/package-lock.json | 7 + examples/server/webui/package.json | 1 + examples/server/webui/src/completion.js | 225 ------------------------ examples/server/webui/src/main.js | 128 +++++++++++--- 6 files changed, 307 insertions(+), 376 deletions(-) delete mode 100644 examples/server/webui/src/completion.js diff --git a/examples/server/public/index.html b/examples/server/public/index.html index 250729a441272..9a19c5e83e35b 100644 --- a/examples/server/public/index.html +++ b/examples/server/public/index.html @@ -11,84 +11,84 @@ 🦙 llama.cpp - chat - - + @@ -99,7 +99,7 @@
-
+

Conversations

@@ -204,51 +204,25 @@

Conversations

{{ messages.length === 0 ? 'Send a message to start' : '' }}
-
-
- - - - -
-
- - -
- - - - - -
+
-
-
- - -
+
+
@@ -311,6 +285,10 @@

Settings

Advanced config
+
+ + Show tokens per second +
+ + + + +