CUDA: Remove unneeded bias/gate dims in fused mmvq

ORippler · ORippler · commit 44987f773ed5 · 2025-10-30T12:34:59.000+01:00
As pointed out [here](ggml-org#16847 (comment)), only a single value is needed per target column per thread, so the per-row dimension of the bias and gate-bias arrays can be dropped.
diff --git a/ggml/src/ggml-cuda/mmvq.cu b/ggml/src/ggml-cuda/mmvq.cu
@@ -190,8 +190,8 @@ static __global__ void mul_mat_vec_q(
 
     const uint32_t channel_bias = ids ? channel_x : channel_dst;
 
-    float x_biases[ncols_dst][rows_per_cuda_block]    = { { 0.0f } };
-    float gate_biases[ncols_dst][rows_per_cuda_block] = { { 0.0f } };
+    float x_biases[ncols_dst]    = { { 0.0f } };
+    float gate_biases[ncols_dst] = { { 0.0f } };
     if constexpr (has_fusion) {
         if (use_bias) {
             x_bias = x_bias + sample_dst*stride_sample_dst + channel_bias*stride_channel_dst + row0;
@@ -200,7 +200,7 @@ static __global__ void mul_mat_vec_q(
             if (threadIdx.x < rows_per_cuda_block && threadIdx.y == 0 &&
                 (rows_per_cuda_block == 1 || uint32_t(row0 + threadIdx.x) < stride_col_dst)) {
                 for (int j = 0; j < ncols_dst; ++j) {
-                    x_biases[j][threadIdx.x] = x_bias[j * stride_col_dst + threadIdx.x];
+                    x_biases[j] = x_bias[j * stride_col_dst + threadIdx.x];
                 }
             }
         }
@@ -209,7 +209,7 @@ static __global__ void mul_mat_vec_q(
             if (threadIdx.x < rows_per_cuda_block && threadIdx.y == 0 &&
                 (rows_per_cuda_block == 1 || uint32_t(row0 + threadIdx.x) < stride_col_dst)) {
                 for (int j = 0; j < ncols_dst; ++j) {
-                    gate_biases[j][threadIdx.x] = gate_bias[j * stride_col_dst + threadIdx.x];
+                    gate_biases[j] = gate_bias[j * stride_col_dst + threadIdx.x];
                 }
             }
         }
@@ -299,12 +299,12 @@ static __global__ void mul_mat_vec_q(
             float result = tmp[j][threadIdx.x];
             if constexpr (has_fusion) {
                 if (use_bias) {
-                    result += x_biases[j][threadIdx.x];
+                    result += x_biases[j];
                 }
                 if (use_gate) {
                     float gate_value = tmp_gate[j][threadIdx.x];
                     if (use_gate_bias) {
-                        gate_value += gate_biases[j][threadIdx.x];
+                        gate_value += gate_biases[j];
                     }
                     switch (active_glu) {
                         case GGML_GLU_OP_SWIGLU:

Original file line number	Diff line number	Diff line change
`@@ -190,8 +190,8 @@ static __global__ void mul_mat_vec_q(`
`190`	`190`
`191`	`191`	`const uint32_t channel_bias = ids ? channel_x : channel_dst;`
`192`	`192`
`193`		`- float x_biases[ncols_dst][rows_per_cuda_block] = { { 0.0f } };`
`194`		`- float gate_biases[ncols_dst][rows_per_cuda_block] = { { 0.0f } };`
	`193`	`+ float x_biases[ncols_dst] = { { 0.0f } };`
	`194`	`+ float gate_biases[ncols_dst] = { { 0.0f } };`
`195`	`195`	`if constexpr (has_fusion) {`
`196`	`196`	`if (use_bias) {`
`197`	`197`	`x_bias = x_bias + sample_dst*stride_sample_dst + channel_bias*stride_channel_dst + row0;`
`@@ -200,7 +200,7 @@ static __global__ void mul_mat_vec_q(`
`200`	`200`	`if (threadIdx.x < rows_per_cuda_block && threadIdx.y == 0 &&`
`201`	`201`	`(rows_per_cuda_block == 1 \|\| uint32_t(row0 + threadIdx.x) < stride_col_dst)) {`
`202`	`202`	`for (int j = 0; j < ncols_dst; ++j) {`
`203`		`- x_biases[j][threadIdx.x] = x_bias[j * stride_col_dst + threadIdx.x];`
	`203`	`+ x_biases[j] = x_bias[j * stride_col_dst + threadIdx.x];`
`204`	`204`	`}`
`205`	`205`	`}`
`206`	`206`	`}`
`@@ -209,7 +209,7 @@ static __global__ void mul_mat_vec_q(`
`209`	`209`	`if (threadIdx.x < rows_per_cuda_block && threadIdx.y == 0 &&`
`210`	`210`	`(rows_per_cuda_block == 1 \|\| uint32_t(row0 + threadIdx.x) < stride_col_dst)) {`
`211`	`211`	`for (int j = 0; j < ncols_dst; ++j) {`
`212`		`- gate_biases[j][threadIdx.x] = gate_bias[j * stride_col_dst + threadIdx.x];`
	`212`	`+ gate_biases[j] = gate_bias[j * stride_col_dst + threadIdx.x];`
`213`	`213`	`}`
`214`	`214`	`}`
`215`	`215`	`}`
`@@ -299,12 +299,12 @@ static __global__ void mul_mat_vec_q(`
`299`	`299`	`float result = tmp[j][threadIdx.x];`
`300`	`300`	`if constexpr (has_fusion) {`
`301`	`301`	`if (use_bias) {`
`302`		`- result += x_biases[j][threadIdx.x];`
	`302`	`+ result += x_biases[j];`
`303`	`303`	`}`
`304`	`304`	`if (use_gate) {`
`305`	`305`	`float gate_value = tmp_gate[j][threadIdx.x];`
`306`	`306`	`if (use_gate_bias) {`
`307`		`- gate_value += gate_biases[j][threadIdx.x];`
	`307`	`+ gate_value += gate_biases[j];`
`308`	`308`	`}`
`309`	`309`	`switch (active_glu) {`
`310`	`310`	`case GGML_GLU_OP_SWIGLU:`