@@ -190,8 +190,8 @@ static __global__ void mul_mat_vec_q(
190190
191191 const uint32_t channel_bias = ids ? channel_x : channel_dst;
192192
193- float x_biases[ncols_dst][rows_per_cuda_block] = { { 0 .0f } };
194- float gate_biases[ncols_dst][rows_per_cuda_block] = { { 0 .0f } };
193+ float x_biases[ncols_dst] = { { 0 .0f } };
194+ float gate_biases[ncols_dst] = { { 0 .0f } };
195195 if constexpr (has_fusion) {
196196 if (use_bias) {
197197 x_bias = x_bias + sample_dst*stride_sample_dst + channel_bias*stride_channel_dst + row0;
@@ -200,7 +200,7 @@ static __global__ void mul_mat_vec_q(
200200 if (threadIdx .x < rows_per_cuda_block && threadIdx .y == 0 &&
201201 (rows_per_cuda_block == 1 || uint32_t (row0 + threadIdx .x ) < stride_col_dst)) {
202202 for (int j = 0 ; j < ncols_dst; ++j) {
203- x_biases[j][ threadIdx . x ] = x_bias[j * stride_col_dst + threadIdx .x ];
203+ x_biases[j] = x_bias[j * stride_col_dst + threadIdx .x ];
204204 }
205205 }
206206 }
@@ -209,7 +209,7 @@ static __global__ void mul_mat_vec_q(
209209 if (threadIdx .x < rows_per_cuda_block && threadIdx .y == 0 &&
210210 (rows_per_cuda_block == 1 || uint32_t (row0 + threadIdx .x ) < stride_col_dst)) {
211211 for (int j = 0 ; j < ncols_dst; ++j) {
212- gate_biases[j][ threadIdx . x ] = gate_bias[j * stride_col_dst + threadIdx .x ];
212+ gate_biases[j] = gate_bias[j * stride_col_dst + threadIdx .x ];
213213 }
214214 }
215215 }
@@ -299,12 +299,12 @@ static __global__ void mul_mat_vec_q(
299299 float result = tmp[j][threadIdx .x ];
300300 if constexpr (has_fusion) {
301301 if (use_bias) {
302- result += x_biases[j][ threadIdx . x ] ;
302+ result += x_biases[j];
303303 }
304304 if (use_gate) {
305305 float gate_value = tmp_gate[j][threadIdx .x ];
306306 if (use_gate_bias) {
307- gate_value += gate_biases[j][ threadIdx . x ] ;
307+ gate_value += gate_biases[j];
308308 }
309309 switch (active_glu) {
310310 case GGML_GLU_OP_SWIGLU:
0 commit comments