@@ -501,6 +501,42 @@ static __global__ void dequantize_block_iq1_s(const void * __restrict__ vx, dst_
501
501
502
502
}
503
503
504
// Type-punning union for the IQ1_M block scale: the 16 bits reassembled
// from the nibbles scattered over the block's scales words are written via
// .u16 and then read back as an fp16 value via .f16.
typedef union {
    half f16;
    uint16_t u16;
} iq1m_scale_t;
508
+
509
// Dequantizes IQ1_M blocks: one block_iq1_m (QK_K values) per CUDA block.
// Expected launch layout: gridDim.x = number of quantized blocks,
// blockDim.x = 32; each thread expands one group of 8 output values.
// Only QK_K == 256 is supported (asserts otherwise).
template<typename dst_t>
static __global__ void dequantize_block_iq1_m(const void * __restrict__ vx, dst_t * __restrict__ yy) {

    const int i = blockIdx.x;
    const block_iq1_m * x = (const block_iq1_m *) vx;

    const int tid = threadIdx.x;
#if QK_K == 256
    const int il = tid/8; // 0...3
    const int ib = tid%8; // 0...7
    dst_t * y = yy + i*QK_K + 32*ib + 8*il;
    // The block-wide fp16 scale is stored scattered across the top nibbles
    // of the first four 16-bit words of x[i].scales; reassemble it here.
    const uint16_t * sc = (const uint16_t *)x[i].scales;
    iq1m_scale_t scale;
    scale.u16 = (sc[0] >> 12) | ((sc[1] >> 8) & 0x00f0) | ((sc[2] >> 4) & 0x0f00) | (sc[3] & 0xf000);
    const int ib16 = 2*ib + il/2; // sc[ib16/4] >> 3*(ib16%4) -> sc[ib/2] >> 3*((2*ib+il/2)%4);
    // 3-bit sub-scale mapped to an odd value in [1,15], times the fp16 super-scale.
    const float d = (float)scale.f16 * (2*((sc[ib16/4] >> 3*(ib16%4)) & 0x7) + 1);
    // A high bit in qh selects which of the two per-group offsets applies.
    const float delta = x[i].qh[2*ib+il/2] & (0x08 << 4*(il%2)) ? -1 - IQ1M_DELTA : -1 + IQ1M_DELTA;
    // The grid entry packs 8 codebook values as two nibble planes of a
    // 32-bit word; split them into 8 bytes readable through q.
    uint32_t grid32[2]; const int8_t * q = (const int8_t *)grid32;
    grid32[0] = iq1s_grid_gpu[x[i].qs[4*ib+il] | (((x[i].qh[2*ib+il/2] >> 4*(il%2)) & 7) << 8)];
    grid32[1] = (grid32[0] >> 4) & 0x0f0f0f0f;
    grid32[0] &= 0x0f0f0f0f;
    for (int j = 0; j < 8; ++j) {
        y[j] = d * (q[j] + delta);
    }
#else
    assert(false);
#endif
}
538
+
539
+
504
540
template <typename dst_t >
505
541
static __global__ void dequantize_block_iq4_nl (const void * __restrict__ vx, dst_t * __restrict__ yy) {
506
542
@@ -658,6 +694,12 @@ static void dequantize_row_iq4_nl_cuda(const void * vx, dst_t * y, const int k,
658
694
dequantize_block_iq4_nl<<<nb, 32 , 0 , stream>>> (vx, y);
659
695
}
660
696
697
// Host-side launcher: dequantizes k IQ1_M-quantized values from vx into y on
// the given stream. Precondition: k is a multiple of QK_K (nb uses exact
// division, unlike the ceil-div used by e.g. dequantize_row_iq4_xs_cuda).
// Launch: one 32-thread CUDA block per quantized block.
template<typename dst_t>
static void dequantize_row_iq1_m_cuda(const void * vx, dst_t * y, const int k, cudaStream_t stream) {
    const int nb = k / QK_K;
    dequantize_block_iq1_m<<<nb, 32, 0, stream>>>(vx, y);
}
702
+
661
703
template <typename dst_t >
662
704
static void dequantize_row_iq4_xs_cuda (const void * vx, dst_t * y, const int k, cudaStream_t stream) {
663
705
const int nb = (k + QK_K - 1 ) / QK_K;
@@ -724,6 +766,8 @@ to_fp16_cuda_t ggml_get_to_fp16_cuda(ggml_type type) {
724
766
return dequantize_row_iq3_xxs_cuda;
725
767
case GGML_TYPE_IQ1_S:
726
768
return dequantize_row_iq1_s_cuda;
769
+ case GGML_TYPE_IQ1_M:
770
+ return dequantize_row_iq1_m_cuda;
727
771
case GGML_TYPE_IQ4_NL:
728
772
return dequantize_row_iq4_nl_cuda;
729
773
case GGML_TYPE_IQ4_XS:
@@ -769,6 +813,8 @@ to_fp32_cuda_t ggml_get_to_fp32_cuda(ggml_type type) {
769
813
return dequantize_row_iq3_xxs_cuda;
770
814
case GGML_TYPE_IQ1_S:
771
815
return dequantize_row_iq1_s_cuda;
816
+ case GGML_TYPE_IQ1_M:
817
+ return dequantize_row_iq1_m_cuda;
772
818
case GGML_TYPE_IQ4_NL:
773
819
return dequantize_row_iq4_nl_cuda;
774
820
case GGML_TYPE_IQ4_XS:
0 commit comments