@@ -293,3 +293,47 @@ FLOAT_TYPE mmvq_dot_product(const uint ib_a, const uint iqs) {
293293 return FLOAT_TYPE(float (cache_b_ds.x) * float (dm_scale.x) * float (q_sum) - float (dm_scale.y) * float (cache_b_ds.y / 4 ));
294294}
295295#endif
296+
297+ #if defined(DATA_A_Q6_K)
298+ // 2-byte loads for Q6_K blocks (210 bytes)
// Rebuilds eight signed 8-bit quants from the packed Q6_K storage and returns
// them as two 32-bit words (four quants per word).
//
// ib  : combined index; ib / 8 selects the data_a_packed16 element and
//       ib % 8 the group of eight positions inside it.
// iqs : offset inside that group, added to (ib % 8) * 8.
//
// Each quant is assembled from 4 bits taken from `ql` and 2 bits taken from
// `qh` (moved up to bit positions 4-5), then biased by -32.
i32vec2 repack2(uint ib, uint iqs) {
    const uint blk      = ib / 8;
    const uint pos      = (ib % 8) * 8 + iqs;
    const uint half_idx = pos / 32;
    const uint in_half  = pos % 32;

    // Index of the first of four consecutive 16-bit words read from each array.
    const uint ql_base = (half_idx * 16 + pos % 16) * 2;
    const uint qh_base = (half_idx * 8 + iqs) * 2;

    // ql: choose the low or high nibble of each byte; qh: choose one of the
    // four 2-bit fields of each byte.
    const uint ql_shift = (in_half / 16) * 4;
    const uint qh_shift = (in_half / 8) * 2;

    const uint16_t nibble_mask = uint16_t(0x0F0F);
    const uint16_t pair_mask   = uint16_t(0x0303);

    // Lower four bits of each quant, two quants per 16-bit load.
    const i8vec2 lo0 = unpack8(int16_t((data_a_packed16[blk].ql[ql_base    ] >> ql_shift) & nibble_mask));
    const i8vec2 lo1 = unpack8(int16_t((data_a_packed16[blk].ql[ql_base + 1] >> ql_shift) & nibble_mask));
    const i8vec2 lo2 = unpack8(int16_t((data_a_packed16[blk].ql[ql_base + 2] >> ql_shift) & nibble_mask));
    const i8vec2 lo3 = unpack8(int16_t((data_a_packed16[blk].ql[ql_base + 3] >> ql_shift) & nibble_mask));

    // Upper two bits of each quant, already shifted into bits 4-5.
    const i8vec2 hi0 = unpack8(int16_t(((data_a_packed16[blk].qh[qh_base    ] >> qh_shift) & pair_mask) << 4));
    const i8vec2 hi1 = unpack8(int16_t(((data_a_packed16[blk].qh[qh_base + 1] >> qh_shift) & pair_mask) << 4));
    const i8vec2 hi2 = unpack8(int16_t(((data_a_packed16[blk].qh[qh_base + 2] >> qh_shift) & pair_mask) << 4));
    const i8vec2 hi3 = unpack8(int16_t(((data_a_packed16[blk].qh[qh_base + 3] >> qh_shift) & pair_mask) << 4));

    // Merge both parts and remove the +32 storage bias.
    const i8vec2 q0 = (lo0 | hi0) - int8_t(32);
    const i8vec2 q1 = (lo1 | hi1) - int8_t(32);
    const i8vec2 q2 = (lo2 | hi2) - int8_t(32);
    const i8vec2 q3 = (lo3 | hi3) - int8_t(32);

    return i32vec2(pack32(i8vec4(q0.x, q0.y, q1.x, q1.y)),
                   pack32(i8vec4(q2.x, q2.y, q3.x, q3.y)));
}
321+
// Returns the combined scale for position (ib, iqs): the `d` field of
// data_a[ib / 8] multiplied by the entry of its `scales` array that covers
// the position (one entry per four consecutive positions).
float get_d_scale(uint ib, uint iqs) {
    const uint blk = ib / 8;
    const uint pos = (ib % 8) * 8 + iqs;

    const float block_d   = float(data_a[blk].d);
    const float sub_scale = float(data_a[blk].scales[pos / 4]);

    return block_d * sub_scale;
}
327+
// Q6_K variant of the dot product: multiplies the eight repacked A quants at
// (ib_a, iqs * 2) with the two packed words in cache_b_qs, then scales the
// integer sum by cache_b_ds.x and the A-side scale from get_d_scale().
FLOAT_TYPE mmvq_dot_product(const uint ib_a, const uint iqs) {
    // repack2() and get_d_scale() are both called with the doubled offset.
    const uint iqs2 = iqs * 2;

    const i32vec2 a_words = repack2(ib_a, iqs2);
    const float   a_scale = get_d_scale(ib_a, iqs2);

    // Two 4x8-bit integer dot products, accumulated in 32 bits.
    const int32_t acc = dotPacked4x8EXT(a_words.x, cache_b_qs[0])
                      + dotPacked4x8EXT(a_words.y, cache_b_qs[1]);

    return FLOAT_TYPE(float(cache_b_ds.x) * a_scale * float(acc));
}
339+ #endif
0 commit comments