Skip to content
1 change: 1 addition & 0 deletions ggml/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -180,6 +180,7 @@ option(GGML_HIP_EXPORT_METRICS "ggml: enable kernel perf metrics ou
option(GGML_MUSA_GRAPHS "ggml: use MUSA graph, experimental, unstable" OFF)
option(GGML_MUSA_MUDNN_COPY "ggml: enable muDNN for accelerated copy" OFF)
option(GGML_VULKAN "ggml: use Vulkan" OFF)
# Builds alternate shader code paths that avoid unpack8/dotPacked4x8EXT
# (see the ADRENO guards in ggml-vulkan/vulkan-shaders); presumably works
# around Qualcomm Adreno driver limitations — TODO confirm with backend owners.
option(GGML_VULKAN_BUILD_ADRENO_SHADERS "ggml: build Adreno-supported shader variants" ON)
option(GGML_VULKAN_CHECK_RESULTS "ggml: run Vulkan op checks" OFF)
option(GGML_VULKAN_DEBUG "ggml: enable Vulkan debug output" OFF)
option(GGML_VULKAN_MEMORY_DEBUG "ggml: enable Vulkan memory debug output" OFF)
Expand Down
5 changes: 5 additions & 0 deletions ggml/src/ggml-vulkan/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -106,6 +106,11 @@ if (Vulkan_FOUND)
add_compile_definitions(GGML_VULKAN_DEBUG)
endif()

if (GGML_VULKAN_BUILD_ADRENO_SHADERS)
    add_compile_definitions(GGML_VULKAN_BUILD_ADRENO_SHADERS)
endif()
# Forward the option value unconditionally: the vulkan-shaders-gen subproject
# declares its own copy of this option with a default of ON, so appending the
# flag only when enabled would make -DGGML_VULKAN_BUILD_ADRENO_SHADERS=OFF
# silently ineffective for the shader generator.
list(APPEND VULKAN_SHADER_GEN_CMAKE_ARGS -DGGML_VULKAN_BUILD_ADRENO_SHADERS=${GGML_VULKAN_BUILD_ADRENO_SHADERS})

# Compile-time toggle: defines GGML_VULKAN_MEMORY_DEBUG for the backend sources.
if (GGML_VULKAN_MEMORY_DEBUG)
add_compile_definitions(GGML_VULKAN_MEMORY_DEBUG)
endif()
Expand Down
607 changes: 569 additions & 38 deletions ggml/src/ggml-vulkan/ggml-vulkan.cpp

Large diffs are not rendered by default.

6 changes: 6 additions & 0 deletions ggml/src/ggml-vulkan/vulkan-shaders/CMakeLists.txt
Original file line number Diff line number Diff line change
@@ -1,6 +1,8 @@
cmake_minimum_required(VERSION 3.19)
project("vulkan-shaders-gen" C CXX)

# Local copy of the top-level option (this project may be configured standalone
# by the parent build); the parent forwards its value via the cmake args it
# passes, otherwise this default of ON applies.
option(GGML_VULKAN_BUILD_ADRENO_SHADERS "Build Adreno-specific shader variants" ON)

find_package (Threads REQUIRED)

if (GGML_VULKAN_COOPMAT_GLSLC_SUPPORT)
Expand All @@ -23,6 +25,10 @@ if (GGML_VULKAN_SHADER_DEBUG_INFO)
add_compile_definitions(GGML_VULKAN_SHADER_DEBUG_INFO)
message(STATUS "Enabling shader debug info")
endif()
# When enabled, the shader generator also emits the Adreno-compatible
# shader variants (guarded by GGML_VULKAN_BUILD_ADRENO_SHADERS in its source).
if (GGML_VULKAN_BUILD_ADRENO_SHADERS)
    message(STATUS "Building Adreno-specific shaders")
    add_compile_definitions(GGML_VULKAN_BUILD_ADRENO_SHADERS)
endif()

# Host-side code generator executable that produces the SPIR-V shader sources.
set(TARGET vulkan-shaders-gen)
add_executable(${TARGET} vulkan-shaders-gen.cpp)
Expand Down
5 changes: 5 additions & 0 deletions ggml/src/ggml-vulkan/vulkan-shaders/dequant_funcs.comp
Original file line number Diff line number Diff line change
Expand Up @@ -88,8 +88,13 @@ vec2 dequantize(uint ib, uint iqs, uint a_offset) {
return vec2(int(data_a[a_offset + ib].qs[iqs]), int(data_a[a_offset + ib].qs[iqs + 1]));
}
// Dequantizes four consecutive 8-bit quants starting at index iqs of block ib.
vec4 dequantize4(uint ib, uint iqs, uint a_offset) {
#if defined(ADRENO)
// Adreno path: avoid unpack8 by reusing the scalar dequantize() helper for
// each int8 pair; same values as the packed path below.
const vec2 v0 = dequantize(ib, iqs, a_offset);
const vec2 v1 = dequantize(ib, iqs + 2, a_offset);
#else
const i8vec2 v0 = unpack8(int32_t(data_a_packed16[a_offset + ib].qs[iqs/2])).xy; // vec4 used due to #12147
const i8vec2 v1 = unpack8(int32_t(data_a_packed16[a_offset + ib].qs[iqs/2 + 1])).xy;
#endif
return vec4(v0.x, v0.y, v1.x, v1.y);
}
#endif
Expand Down
28 changes: 28 additions & 0 deletions ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vec_q4_k.comp
Original file line number Diff line number Diff line change
Expand Up @@ -24,8 +24,25 @@ void calc_superblock(const uint a_offset, const uint b_offset, const uint v_im,

const uint32_t scale_0_4_l = (scale4_u32 << 16) | scale0_u32;
const uint32_t scale_0_4_h = (scale_0_4_l & 0xC0C0C0C0) >> 2;

#if defined(ADRENO)
const vec4 scale_0_4_l_f = vec4(
float((scale_0_4_l >> 0) & 0x3Fu),
float((scale_0_4_l >> 8) & 0x3Fu),
float((scale_0_4_l >> 16) & 0x3Fu),
float((scale_0_4_l >> 24) & 0x3Fu)
);

const vec4 scale8_f = vec4(
float(((((scale8_u32 << 12) | scale8_u32) & 0x0F0F0F0Fu) | scale_0_4_h) >> 0 & 0xFFu),
float(((((scale8_u32 << 12) | scale8_u32) & 0x0F0F0F0Fu) | scale_0_4_h) >> 8 & 0xFFu),
float(((((scale8_u32 << 12) | scale8_u32) & 0x0F0F0F0Fu) | scale_0_4_h) >> 16 & 0xFFu),
float(((((scale8_u32 << 12) | scale8_u32) & 0x0F0F0F0Fu) | scale_0_4_h) >> 24 & 0xFFu)
);
#else
const vec4 scale_0_4_l_f = vec4(unpack8(scale_0_4_l & 0x3F3F3F3F));
const vec4 scale8_f = vec4(unpack8((((scale8_u32 << 12) | scale8_u32) & 0x0F0F0F0F) | scale_0_4_h));
#endif

const FLOAT_TYPE sc0 = scale_0_4_l_f.x;
const FLOAT_TYPE sc1 = scale_0_4_l_f.y;
Expand All @@ -44,10 +61,17 @@ void calc_superblock(const uint a_offset, const uint b_offset, const uint v_im,
const uint32_t qs64_u32_lo4 = qs64_u32 & 0x0F0F0F0F;
const uint32_t qs64_u32_hi4 = (qs64_u32 >> 4) & 0x0F0F0F0F;

#if defined(ADRENO)
const vec4 qs0_lo4 = vec4(float(qs0_u32_lo4 & 0xFFu), float((qs0_u32_lo4 >> 8) & 0xFFu), float((qs0_u32_lo4 >> 16) & 0xFFu), float((qs0_u32_lo4 >> 24) & 0xFFu));
const vec4 qs64_lo4 = vec4(float(qs64_u32_lo4 & 0xFFu), float((qs64_u32_lo4 >> 8) & 0xFFu), float((qs64_u32_lo4 >> 16) & 0xFFu), float((qs64_u32_lo4 >> 24) & 0xFFu));
const vec4 qs0_hi4 = vec4(float(qs0_u32_hi4 & 0xFFu), float((qs0_u32_hi4 >> 8) & 0xFFu), float((qs0_u32_hi4 >> 16) & 0xFFu), float((qs0_u32_hi4 >> 24) & 0xFFu));
const vec4 qs64_hi4 = vec4(float(qs64_u32_hi4 & 0xFFu), float((qs64_u32_hi4 >> 8) & 0xFFu), float((qs64_u32_hi4 >> 16) & 0xFFu), float((qs64_u32_hi4 >> 24) & 0xFFu));
#else
const vec4 qs0_lo4 = vec4(unpack8(qs0_u32_lo4));
const vec4 qs64_lo4 = vec4(unpack8(qs64_u32_lo4));
const vec4 qs0_hi4 = vec4(unpack8(qs0_u32_hi4));
const vec4 qs64_hi4 = vec4(unpack8(qs64_u32_hi4));
#endif

const FLOAT_TYPE q4_0 = qs0_lo4.x;
const FLOAT_TYPE q4_1 = qs0_lo4.y;
Expand All @@ -66,7 +90,11 @@ void calc_superblock(const uint a_offset, const uint b_offset, const uint v_im,
const FLOAT_TYPE q4_14 = qs64_hi4.z;
const FLOAT_TYPE q4_15 = qs64_hi4.w;

#if defined(ADRENO)
for (uint j = 0; j < NUM_COLS; ++j) {
#else
[[unroll]] for (uint j = 0; j < NUM_COLS; ++j) {
#endif
vec4 by10 = vec4(data_b_v4[(j*p.batch_stride_b + b_offset + y1_idx) / 4 ]);
vec4 by132 = vec4(data_b_v4[(j*p.batch_stride_b + b_offset + y1_idx) / 4 + 8]);
vec4 by20 = vec4(data_b_v4[(j*p.batch_stride_b + b_offset + y2_idx) / 4 ]);
Expand Down
7 changes: 7 additions & 0 deletions ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vec_q6_k.comp
Original file line number Diff line number Diff line change
Expand Up @@ -46,10 +46,17 @@ void calc_superblock(const uint a_offset, const uint b_offset, const uint itid,
const uint32_t q2_u32 = ql0_u32_hi4 | qh4_u32;
const uint32_t q3_u32 = ql32_u32_hi4 | qh6_u32;

#if defined(ADRENO)
const vec4 q0 = vec4(float(q0_u32 & 0xFF), float((q0_u32 >> 8) & 0xFF), float((q0_u32 >> 16) & 0xFF), float(q0_u32 >> 24)) - 32;
const vec4 q1 = vec4(float(q1_u32 & 0xFF), float((q1_u32 >> 8) & 0xFF), float((q1_u32 >> 16) & 0xFF), float(q1_u32 >> 24)) - 32;
const vec4 q2 = vec4(float(q2_u32 & 0xFF), float((q2_u32 >> 8) & 0xFF), float((q2_u32 >> 16) & 0xFF), float(q2_u32 >> 24)) - 32;
const vec4 q3 = vec4(float(q3_u32 & 0xFF), float((q3_u32 >> 8) & 0xFF), float((q3_u32 >> 16) & 0xFF), float(q3_u32 >> 24)) - 32;
#else
const vec4 q0 = vec4(unpack8(q0_u32)) - 32;
const vec4 q1 = vec4(unpack8(q1_u32)) - 32;
const vec4 q2 = vec4(unpack8(q2_u32)) - 32;
const vec4 q3 = vec4(unpack8(q3_u32)) - 32;
#endif

if (all_threads) {
sccache[csel][ix][itid] = FLOAT_TYPE(data_a[ib0 + i].scales[itid]);
Expand Down
43 changes: 43 additions & 0 deletions ggml/src/ggml-vulkan/vulkan-shaders/mul_mm.comp
Original file line number Diff line number Diff line change
Expand Up @@ -363,8 +363,25 @@ void main() {

const float d = float(data_a_packed16[ib].d);
const uint vui = uint(data_a_packed16[ib].qs[2*iqs]) | (uint(data_a_packed16[ib].qs[2*iqs + 1]) << 16);

#if defined(ADRENO)
vec4 v0 = (vec4(
float((vui >> 0) & 0xF),
float((vui >> 8) & 0xF),
float((vui >> 16) & 0xF),
float((vui >> 24) & 0xF)
) - 8.0) * d;

vec4 v1 = (vec4(
float((vui >> 4) & 0xF),
float((vui >> 12) & 0xF),
float((vui >> 20) & 0xF),
float((vui >> 28) & 0xF)
) - 8.0) * d;
#else
const vec4 v0 = (vec4(unpack8(vui & 0x0F0F0F0F)) - 8.0f) * d;
const vec4 v1 = (vec4(unpack8((vui >> 4) & 0x0F0F0F0F)) - 8.0f) * d;
#endif

buf_a[buf_idx ] = FLOAT_TYPE(v0.x);
buf_a[buf_idx + 1 ] = FLOAT_TYPE(v0.y);
Expand All @@ -384,8 +401,24 @@ void main() {
const float d = float(data_a_packed16[ib].d);
const float m = float(data_a_packed16[ib].m);
const uint vui = uint(data_a_packed16[ib].qs[2*iqs]) | (uint(data_a_packed16[ib].qs[2*iqs + 1]) << 16);

#if defined(ADRENO)
vec4 v0 = vec4(
float((vui >> 0) & 0xF),
float((vui >> 8) & 0xF),
float((vui >> 16) & 0xF),
float((vui >> 24) & 0xF)
) * d + m;
vec4 v1 = vec4(
float((vui >> 4) & 0xF),
float((vui >> 12) & 0xF),
float((vui >> 20) & 0xF),
float((vui >> 28) & 0xF)
) * d + m;
#else
const vec4 v0 = vec4(unpack8(vui & 0x0F0F0F0F)) * d + m;
const vec4 v1 = vec4(unpack8((vui >> 4) & 0x0F0F0F0F)) * d + m;
#endif

buf_a[buf_idx ] = FLOAT_TYPE(v0.x);
buf_a[buf_idx + 1 ] = FLOAT_TYPE(v0.y);
Expand Down Expand Up @@ -441,10 +474,20 @@ void main() {
const uint ib = idx / 8;
const uint iqs = idx & 0x07;

#if defined(ADRENO)
const float d = float(data_a[ib].d);
const vec4 v = vec4(
int(data_a[ib].qs[4*iqs]),
int(data_a[ib].qs[4*iqs + 1]),
int(data_a[ib].qs[4*iqs + 2]),
int(data_a[ib].qs[4*iqs + 3])
) * d;
#else
const float d = float(data_a_packed16[ib].d);
const i8vec2 v0 = unpack8(int32_t(data_a_packed16[ib].qs[2*iqs])).xy; // vec4 used due to #12147
const i8vec2 v1 = unpack8(int32_t(data_a_packed16[ib].qs[2*iqs + 1])).xy;
const vec4 v = vec4(v0.x, v0.y, v1.x, v1.y) * d;
#endif

buf_a[buf_idx ] = FLOAT_TYPE(v.x);
buf_a[buf_idx + 1] = FLOAT_TYPE(v.y);
Expand Down
20 changes: 20 additions & 0 deletions ggml/src/ggml-vulkan/vulkan-shaders/mul_mmq.comp
Original file line number Diff line number Diff line change
Expand Up @@ -112,6 +112,20 @@ shared ACC_TYPE coopmat_stage[TM * TN * NUM_WARPS];

#include "mul_mmq_funcs.comp"

// Portable stand-in for dotPacked4x8EXT (used on the ADRENO path below where
// the EXT builtin is not used): each 32-bit word is treated as four packed
// signed 8-bit lanes; returns the sum of the four lane products.
int dotPacked4x8(uint a, uint b) {
    // Sign-extend each byte: shift the lane to the top, then arithmetic
    // shift-right on a signed int brings it back with the sign preserved.
    const ivec4 va = ivec4(int(a << 24) >> 24,
                           int(a << 16) >> 24,
                           int(a <<  8) >> 24,
                           int(a)       >> 24);
    const ivec4 vb = ivec4(int(b << 24) >> 24,
                           int(b << 16) >> 24,
                           int(b <<  8) >> 24,
                           int(b)       >> 24);
    return va.x * vb.x + va.y * vb.y + va.z * vb.z + va.w * vb.w;
}

void main() {
#ifdef NEEDS_INIT_IQ_SHMEM
init_iq_shmem(gl_WorkGroupSize);
Expand Down Expand Up @@ -352,8 +366,14 @@ void main() {
const uint sums_idx = (wsic * TN + cc) * (WMITER * TM) + wsir * TM + cr;
int32_t q_sum = 0;
[[unroll]] for (uint idx_k = 0; idx_k < BK / 4; idx_k++) {

#if defined(ADRENO)
q_sum += dotPacked4x8(cache_a_qs[cache_a_idx * (BK / 4) + idx_k],
cache_b_qs[cc * (BK / 4) + idx_k]);
#else
q_sum += dotPacked4x8EXT(cache_a_qs[cache_a_idx * (BK / 4) + idx_k],
cache_b_qs[cc * (BK / 4) + idx_k]);
#endif
}

sums[sums_idx] += mul_q8_1(q_sum, cache_a_dm[cache_a_idx], cache_b_ds[cc], 1);
Expand Down
56 changes: 33 additions & 23 deletions ggml/src/ggml-vulkan/vulkan-shaders/out_prod_q8_0.comp
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,8 @@
#include "generic_binary_head.comp"
#include "dequant_funcs.comp"

const uint num_threads = 256;
const uint quant_group_sz = 2;
const uint num_threads = 512 / quant_group_sz;
layout(local_size_x = num_threads, local_size_y = 1, local_size_z = 1) in;

void get_dst_indices(uint idx, out uint i20, out uint i21, out uint i22, out uint i23) {
Expand All @@ -17,38 +18,47 @@ void get_dst_indices(uint idx, out uint i20, out uint i21, out uint i22, out uin
}

void main() {
// num_threads * num_iter must equal 512 to match the wg_denoms and get_idx
const uint num_iter = 2;

const uint broadcast2 = uint(p.param2);
const uint broadcast3 = p.param3;

uint idx = get_idx();
uint idx = gl_GlobalInvocationID.z * 262144 + gl_GlobalInvocationID.y * 512 + gl_GlobalInvocationID.x * quant_group_sz;

uint aoffset = get_aoffset();
uint boffset = get_boffset();
uint doffset = get_doffset();

[[unroll]] for (uint it = 0; it < num_iter; ++it) {
if (idx < p.ne) {
uint i0, i1, i2, i3;
get_dst_indices(idx, i0, i1, i2, i3);
if (idx < p.ne) {
uint i0, i1, i2, i3;
get_dst_indices(idx, i0, i1, i2, i3);

float acc = 0.0f;
vec2 acc = vec2(0.0);

for (uint k = 0; k < p.ne01; k++) {
if (i0 + 1 >= p.ne20) {
continue;
}

for (uint k = 0; k < p.ne01; k += 1) {
const uint a_block_base = get_aoffset() + (i3 / broadcast3) * p.nb03 + (i2 / broadcast2) * p.nb02 + k * p.nb01;
const uint ib = a_block_base + (i0 / QUANT_K);
const uint iqs = (i0 % QUANT_K) / QUANT_R;
const uint a_block_base = aoffset + (i3 / broadcast3) * p.nb03 + (i2 / broadcast2) * p.nb02 + k * p.nb01;
const uint ib = a_block_base + ((i0) / QUANT_K) * p.nb00;
const uint iqs = ((i0) % QUANT_K) / QUANT_R;

const vec2 v = dequantize(ib, iqs, 0);
const vec2 dm = get_dm(ib, 0);
const float a_val = v.x * dm.x + dm.y;
const vec2 v = dequantize(ib, iqs, 0);
const vec2 dm = get_dm(ib, 0);
const vec2 a_vals = v * dm.x + dm.y;

const uint b_idx = src1_idx(i1, k, i2, i3);
const float b = data_b[boffset + b_idx];

acc += a_vals * b;
}

const uint b_idx = src1_idx(i1, k, i2, i3);
const float b = data_b[get_boffset() + b_idx];
acc += a_val * b;
uint d_idx = dst_idx(i0, i1, i2, i3);
for (uint q = 0; q < quant_group_sz; q++) {
if (d_idx + q >= p.ne) {
continue;
}

uint d_idx = dst_idx(i0, i1, i2, i3);
data_d[get_doffset() + d_idx] = acc;
data_d[doffset + d_idx + q] = acc[q];
}
idx += num_threads;
}
}
18 changes: 18 additions & 0 deletions ggml/src/ggml-vulkan/vulkan-shaders/quantize_q8_1.comp
Original file line number Diff line number Diff line change
Expand Up @@ -86,10 +86,28 @@ void quantize() {
vals = round(vals * d_inv);

#ifndef QBLOCK_X4
#if defined(ADRENO)
i8vec4 q = i8vec4(round(vals));
data_b[ib].qs[iqs] =
int((uint(q.x) & 0xFFu) |
((uint(q.y) & 0xFFu) << 8) |
((uint(q.z) & 0xFFu) << 16) |
((uint(q.w) & 0xFFu) << 24));
#else
data_b[ib].qs[iqs] = pack32(i8vec4(round(vals)));
#endif
#else
#if defined(ADRENO)
i8vec4 q = i8vec4(round(vals));
data_b[ibx4_outer].qs[ibx4_inner * 8 + iqs] =
int((uint(q.x) & 0xFFu) |
((uint(q.y) & 0xFFu) << 8) |
((uint(q.z) & 0xFFu) << 16) |
((uint(q.w) & 0xFFu) << 24));
#else
data_b[ibx4_outer].qs[ibx4_inner * 8 + iqs] = pack32(i8vec4(round(vals)));
#endif
#endif

#ifndef USE_SUBGROUPS
barrier();
Expand Down
Loading
Loading