@@ -569,33 +569,30 @@ static __device__ __forceinline__ float ggml_cuda_e8m0_to_fp32(uint8_t x) {
// and a shift:
//
// n/d = (mulhi(n, mp) + n) >> L;
// Precomputes the magic multiplier mp and shift amount L such that, for the
// fixed divisor d, n / d == (umulhi(n, mp) + n) >> L holds for 32-bit n.
// The divisor itself is packed into .z so the same constants can also serve
// fastmodulo without a separate init function.
// NOTE(review): assumes d != 0 — d == 0 would divide by zero below; confirm callers.
static const uint3 init_fastdiv_values(uint32_t d) {
    // L = ceil(log2(d)); the short-circuit keeps the shift amount below 32.
    uint32_t L = 0;
    while (L < 32 && (uint32_t{1} << L) < d) {
        L++;
    }

    const uint32_t mp = (uint32_t) ((uint64_t{1} << 32) * ((uint64_t{1} << L) - d) / d + 1);
    // Pack <mp, L, d> together; carrying d along reduces the error surface.
    return make_uint3(mp, L, d);
}

// Fast 32-bit unsigned division by the divisor baked into div_consts
// (as produced by init_fastdiv_values): n / d == (umulhi(n, mp) + n) >> L.
// div_consts holds <mp, L, divisor> in <x, y, z>; .z is not read here and
// is optimized away by the compiler.
static __device__ __forceinline__ uint32_t fastdiv(uint32_t n, const uint3 div_consts) {
    // High 32 bits of the 64-bit product n * mp.
    const uint32_t hi = __umulhi(n, div_consts.x);
    // Add n, then apply the bit shift.
    return (hi + n) >> div_consts.y;
}

// Fast 32-bit unsigned modulo via the identity n % d == n - (n / d) * d,
// with the division done by fastdiv. modulo_consts holds <mp, L, divisor>
// in <x, y, z> (see init_fastdiv_values).
static __device__ __forceinline__ uint32_t fastmodulo(uint32_t n, const uint3 modulo_consts) {
    return n - fastdiv(n, modulo_consts) * modulo_consts.z;
}

// Function-pointer type for dequantization routines. Presumably vx points to
// the quantized source data, ib is the block index, iqs the intra-block quant
// position, and v receives two dequantized floats — confirm against the
// concrete kernel implementations.
typedef void (*dequantize_kernel_t)(const void * vx, const int64_t ib, const int iqs, float2 & v);