@@ -9779,6 +9779,9 @@ void ggml_vec_dot_iq1_m_q8_K (int n, float * restrict s, size_t bs, const void
9779
9779
9780
9780
iq1m_scale_t scale;
9781
9781
9782
+ uint32_t aux32;
9783
+ const uint8_t * aux8 = (const uint8_t *)&aux32;
9784
+
9782
9785
float sumf = 0;
9783
9786
for (int i = 0; i < nb; ++i) {
9784
9787
@@ -9809,13 +9812,11 @@ void ggml_vec_dot_iq1_m_q8_K (int n, float * restrict s, size_t bs, const void
9809
9812
const int32x4_t p2 = vpaddq_s32(ggml_vdotq_s32(mzero, q1b.val[2], q8b.val[2]), ggml_vdotq_s32(mzero, q1b.val[3], q8b.val[3]));
9810
9813
const int32x4_t p12 = vpaddq_s32(p1, p2);
9811
9814
9812
- delta.val[0] = deltas.val[((qh[0] & 0x08) >> 3) | ((qh[0] & 0x80) >> 6)];
9813
- delta.val[1] = deltas.val[((qh[1] & 0x08) >> 3) | ((qh[1] & 0x80) >> 6)];
9814
- delta.val[2] = deltas.val[((qh[2] & 0x08) >> 3) | ((qh[2] & 0x80) >> 6)];
9815
- delta.val[3] = deltas.val[((qh[3] & 0x08) >> 3) | ((qh[3] & 0x80) >> 6)];
9815
+ const uint32_t * qh32 = (const uint32_t *)qh; // we are 4-byte aligned, so we can do that
9816
+ aux32 = ((qh32[0] >> 3) & 0x01010101) | ((qh32[0] >> 6) & 0x02020202);
9816
9817
9817
- const int32x4_t p3 = vpaddq_s32(ggml_vdotq_s32(mzero, delta .val[0] , q8b.val[0]), ggml_vdotq_s32(mzero, delta .val[1 ], q8b.val[1]));
9818
- const int32x4_t p4 = vpaddq_s32(ggml_vdotq_s32(mzero, delta .val[2] , q8b.val[2]), ggml_vdotq_s32(mzero, delta .val[3 ], q8b.val[3]));
9818
+ const int32x4_t p3 = vpaddq_s32(ggml_vdotq_s32(mzero, deltas .val[aux8[0]] , q8b.val[0]), ggml_vdotq_s32(mzero, deltas .val[aux8[1] ], q8b.val[1]));
9819
+ const int32x4_t p4 = vpaddq_s32(ggml_vdotq_s32(mzero, deltas .val[aux8[2]] , q8b.val[2]), ggml_vdotq_s32(mzero, deltas .val[aux8[3] ], q8b.val[3]));
9819
9820
const int32x4_t p34 = vpaddq_s32(p3, p4);
9820
9821
9821
9822
int32x4_t scales_4 = {sc[ib/2] >> 0, sc[ib/2] >> 3, sc[ib/2] >> 6, sc[ib/2] >> 9};
0 commit comments