Skip to content

Fix more int overflow during quant (PPL/CUDA). #6563

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 3 commits into from
Apr 28, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
168 changes: 84 additions & 84 deletions ggml-cuda/convert.cu
Original file line number Diff line number Diff line change
Expand Up @@ -5,16 +5,16 @@

template <int qk, int qr, dequantize_kernel_t dequantize_kernel, typename dst_t>
static __global__ void dequantize_block(const void * __restrict__ vx, dst_t * __restrict__ y, const int64_t k) {
const int64_t i = 2*(blockDim.x*blockIdx.x + threadIdx.x);
const int64_t i = (int64_t)2*(blockDim.x*blockIdx.x + threadIdx.x);

if (i >= k) {
return;
}

const int64_t ib = i/qk; // block index
const int iqs = (i%qk)/qr; // quant index
const int iybs = i - i%qk; // y block start index
const int y_offset = qr == 1 ? 1 : qk/2;
const int64_t iqs = (i%qk)/qr; // quant index
const int64_t iybs = i - i%qk; // y block start index
const int64_t y_offset = qr == 1 ? 1 : qk/2;

// dequantize
dfloat2 v;
Expand All @@ -29,7 +29,7 @@ static __global__ void dequantize_block_q8_0_f16(const void * __restrict__ vx, h
#if __CUDA_ARCH__ >= CC_PASCAL
constexpr int nint = CUDA_Q8_0_NE_ALIGN/sizeof(int) + WARP_SIZE;

const int i0 = CUDA_Q8_0_NE_ALIGN*blockIdx.x;
const int64_t i0 = CUDA_Q8_0_NE_ALIGN*blockIdx.x;
const int * x0 = ((int *) vx) + blockIdx.x * nint;
half2 * y2 = (half2 *) (y + i0);

Expand Down Expand Up @@ -71,9 +71,9 @@ static __global__ void dequantize_block_q4_0(const void * __restrict__ vx, dst_t
const int64_t i = blockIdx.x;

// assume 32 threads
const int tid = threadIdx.x;
const int il = tid/8;
const int ir = tid%8;
const int64_t tid = threadIdx.x;
const int64_t il = tid/8;
const int64_t ir = tid%8;
const int64_t ib = 8*i + ir;
if (ib >= nb32) {
return;
Expand All @@ -99,9 +99,9 @@ static __global__ void dequantize_block_q4_1(const void * __restrict__ vx, dst_t
const int64_t i = blockIdx.x;

// assume 32 threads
const int tid = threadIdx.x;
const int il = tid/8;
const int ir = tid%8;
const int64_t tid = threadIdx.x;
const int64_t il = tid/8;
const int64_t ir = tid%8;
const int64_t ib = 8*i + ir;
if (ib >= nb32) {
return;
Expand All @@ -125,14 +125,14 @@ static __global__ void dequantize_block_q4_1(const void * __restrict__ vx, dst_t
template<typename dst_t>
static __global__ void dequantize_block_q2_K(const void * __restrict__ vx, dst_t * __restrict__ yy) {

const int i = blockIdx.x;
const int64_t i = blockIdx.x;
const block_q2_K * x = (const block_q2_K *) vx;

const int tid = threadIdx.x;
const int64_t tid = threadIdx.x;
#if QK_K == 256
const int n = tid/32;
const int l = tid - 32*n;
const int is = 8*n + l/16;
const int64_t n = tid/32;
const int64_t l = tid - 32*n;
const int64_t is = 8*n + l/16;

const uint8_t q = x[i].qs[32*n + l];
dst_t * y = yy + i*QK_K + 128*n;
Expand All @@ -144,8 +144,8 @@ static __global__ void dequantize_block_q2_K(const void * __restrict__ vx, dst_t
y[l+64] = dall * (x[i].scales[is+4] & 0xF) * ((q >> 4) & 3) - dmin * (x[i].scales[is+4] >> 4);
y[l+96] = dall * (x[i].scales[is+6] & 0xF) * ((q >> 6) & 3) - dmin * (x[i].scales[is+6] >> 4);
#else
const int is = tid/16; // 0 or 1
const int il = tid%16; // 0...15
const int64_t is = tid/16; // 0 or 1
const int64_t il = tid%16; // 0...15
const uint8_t q = x[i].qs[il] >> (2*is);
dst_t * y = yy + i*QK_K + 16*is + il;
float dall = __low2half(x[i].dm);
Expand All @@ -159,19 +159,19 @@ static __global__ void dequantize_block_q2_K(const void * __restrict__ vx, dst_t
template<typename dst_t>
static __global__ void dequantize_block_q3_K(const void * __restrict__ vx, dst_t * __restrict__ yy) {

const int i = blockIdx.x;
const int64_t i = blockIdx.x;
const block_q3_K * x = (const block_q3_K *) vx;

#if QK_K == 256
const int r = threadIdx.x/4;
const int tid = r/2;
const int is0 = r%2;
const int l0 = 16*is0 + 4*(threadIdx.x%4);
const int n = tid / 4;
const int j = tid - 4*n;
const int64_t r = threadIdx.x/4;
const int64_t tid = r/2;
const int64_t is0 = r%2;
const int64_t l0 = 16*is0 + 4*(threadIdx.x%4);
const int64_t n = tid / 4;
const int64_t j = tid - 4*n;

uint8_t m = 1 << (4*n + j);
int is = 8*n + 2*j + is0;
int64_t is = 8*n + 2*j + is0;
int shift = 2*j;

int8_t us = is < 4 ? (x[i].scales[is-0] & 0xF) | (((x[i].scales[is+8] >> 0) & 3) << 4) :
Expand All @@ -187,11 +187,11 @@ static __global__ void dequantize_block_q3_K(const void * __restrict__ vx, dst_t

for (int l = l0; l < l0+4; ++l) y[l] = dl * ((int8_t)((q[l] >> shift) & 3) - ((hm[l] & m) ? 0 : 4));
#else
const int tid = threadIdx.x;
const int is = tid/16; // 0 or 1
const int il = tid%16; // 0...15
const int im = il/8; // 0...1
const int in = il%8; // 0...7
const int64_t tid = threadIdx.x;
const int64_t is = tid/16; // 0 or 1
const int64_t il = tid%16; // 0...15
const int64_t im = il/8; // 0...1
const int64_t in = il%8; // 0...7

dst_t * y = yy + i*QK_K + 16*is + il;

Expand Down Expand Up @@ -225,15 +225,15 @@ template<typename dst_t>
static __global__ void dequantize_block_q4_K(const void * __restrict__ vx, dst_t * __restrict__ yy) {
const block_q4_K * x = (const block_q4_K *) vx;

const int i = blockIdx.x;
const int64_t i = blockIdx.x;

#if QK_K == 256
// assume 32 threads
const int tid = threadIdx.x;
const int il = tid/8;
const int ir = tid%8;
const int is = 2*il;
const int n = 4;
const int64_t tid = threadIdx.x;
const int64_t il = tid/8;
const int64_t ir = tid%8;
const int64_t is = 2*il;
const int64_t n = 4;

dst_t * y = yy + i*QK_K + 64*il + n*ir;

Expand All @@ -252,7 +252,7 @@ static __global__ void dequantize_block_q4_K(const void * __restrict__ vx, dst_t
y[l +32] = d2 * (q[l] >> 4) - m2;
}
#else
const int tid = threadIdx.x;
const int64_t tid = threadIdx.x;
const uint8_t * q = x[i].qs;
dst_t * y = yy + i*QK_K;
const float d = (float)x[i].dm[0];
Expand All @@ -266,14 +266,14 @@ template<typename dst_t>
static __global__ void dequantize_block_q5_K(const void * __restrict__ vx, dst_t * __restrict__ yy) {
const block_q5_K * x = (const block_q5_K *) vx;

const int i = blockIdx.x;
const int64_t i = blockIdx.x;

#if QK_K == 256
// assume 64 threads - this is very slightly better than the one below
const int tid = threadIdx.x;
const int il = tid/16; // il is in 0...3
const int ir = tid%16; // ir is in 0...15
const int is = 2*il; // is is in 0...6
const int64_t tid = threadIdx.x;
const int64_t il = tid/16; // il is in 0...3
const int64_t ir = tid%16; // ir is in 0...15
const int64_t is = 2*il; // is is in 0...6

dst_t * y = yy + i*QK_K + 64*il + 2*ir;

Expand All @@ -296,11 +296,11 @@ static __global__ void dequantize_block_q5_K(const void * __restrict__ vx, dst_t
y[32] = d2 * ((ql[ 0] >> 4) + (qh[ 0] & hm ? 16 : 0)) - m2;
y[33] = d2 * ((ql[ 1] >> 4) + (qh[ 1] & hm ? 16 : 0)) - m2;
#else
const int tid = threadIdx.x;
const int64_t tid = threadIdx.x;
const uint8_t q = x[i].qs[tid];
const int im = tid/8; // 0...3
const int in = tid%8; // 0...7
const int is = tid/16; // 0 or 1
const int64_t im = tid/8; // 0...3
const int64_t in = tid%8; // 0...7
const int64_t is = tid/16; // 0 or 1
const uint8_t h = x[i].qh[in] >> im;
const float d = x[i].d;
dst_t * y = yy + i*QK_K + tid;
Expand Down Expand Up @@ -357,13 +357,13 @@ static __global__ void dequantize_block_q6_K(const void * __restrict__ vx, dst_t
template<typename dst_t>
static __global__ void dequantize_block_iq2_xxs(const void * __restrict__ vx, dst_t * __restrict__ yy) {

const int i = blockIdx.x;
const int64_t i = blockIdx.x;
const block_iq2_xxs * x = (const block_iq2_xxs *) vx;

const int tid = threadIdx.x;
const int64_t tid = threadIdx.x;
#if QK_K == 256
const int il = tid/8; // 0...3
const int ib = tid%8; // 0...7
const int64_t il = tid/8; // 0...3
const int64_t ib = tid%8; // 0...7
dst_t * y = yy + i*QK_K + 32*ib + 8*il;
const uint16_t * q2 = x[i].qs + 4*ib;
const uint8_t * aux8 = (const uint8_t *)q2;
Expand All @@ -381,13 +381,13 @@ static __global__ void dequantize_block_iq2_xxs(const void * __restrict__ vx, ds
template<typename dst_t>
static __global__ void dequantize_block_iq2_xs(const void * __restrict__ vx, dst_t * __restrict__ yy) {

const int i = blockIdx.x;
const int64_t i = blockIdx.x;
const block_iq2_xs * x = (const block_iq2_xs *) vx;

const int tid = threadIdx.x;
const int64_t tid = threadIdx.x;
#if QK_K == 256
const int il = tid/8; // 0...3
const int ib = tid%8; // 0...7
const int64_t il = tid/8; // 0...3
const int64_t ib = tid%8; // 0...7
dst_t * y = yy + i*QK_K + 32*ib + 8*il;
const uint16_t * q2 = x[i].qs + 4*ib;
const uint8_t * grid = (const uint8_t *)(iq2xs_grid + (q2[il] & 511));
Expand All @@ -403,13 +403,13 @@ static __global__ void dequantize_block_iq2_xs(const void * __restrict__ vx, dst
template<typename dst_t>
static __global__ void dequantize_block_iq2_s(const void * __restrict__ vx, dst_t * __restrict__ yy) {

const int i = blockIdx.x;
const int64_t i = blockIdx.x;
const block_iq2_s * x = (const block_iq2_s *) vx;

const int tid = threadIdx.x;
const int64_t tid = threadIdx.x;
#if QK_K == 256
const int il = tid/8; // 0...3
const int ib = tid%8; // 0...7
const int64_t il = tid/8; // 0...3
const int64_t ib = tid%8; // 0...7
dst_t * y = yy + i*QK_K + 32*ib + 8*il;
const uint8_t * grid = (const uint8_t *)(iq2s_grid + (x[i].qs[4*ib+il] | ((x[i].qh[ib] << (8-2*il)) & 0x300)));
const float d = (float)x[i].d * (0.5f + ((x[i].scales[ib] >> 4*(il/2)) & 0xf)) * 0.25f;
Expand All @@ -424,13 +424,13 @@ static __global__ void dequantize_block_iq2_s(const void * __restrict__ vx, dst_
template<typename dst_t>
static __global__ void dequantize_block_iq3_xxs(const void * __restrict__ vx, dst_t * __restrict__ yy) {

const int i = blockIdx.x;
const int64_t i = blockIdx.x;
const block_iq3_xxs * x = (const block_iq3_xxs *) vx;

const int tid = threadIdx.x;
const int64_t tid = threadIdx.x;
#if QK_K == 256
const int il = tid/8; // 0...3
const int ib = tid%8; // 0...7
const int64_t il = tid/8; // 0...3
const int64_t ib = tid%8; // 0...7
dst_t * y = yy + i*QK_K + 32*ib + 8*il;
const uint8_t * q3 = x[i].qs + 8*ib;
const uint16_t * gas = (const uint16_t *)(x[i].qs + QK_K/4) + 2*ib;
Expand All @@ -452,13 +452,13 @@ static __global__ void dequantize_block_iq3_xxs(const void * __restrict__ vx, ds
template<typename dst_t>
static __global__ void dequantize_block_iq3_s(const void * __restrict__ vx, dst_t * __restrict__ yy) {

const int i = blockIdx.x;
const int64_t i = blockIdx.x;
const block_iq3_s * x = (const block_iq3_s *) vx;

const int tid = threadIdx.x;
const int64_t tid = threadIdx.x;
#if QK_K == 256
const int il = tid/8; // 0...3
const int ib = tid%8; // 0...7
const int64_t il = tid/8; // 0...3
const int64_t ib = tid%8; // 0...7
dst_t * y = yy + i*QK_K + 32*ib + 8*il;
const uint8_t * qs = x[i].qs + 8*ib;
const uint8_t * grid1 = (const uint8_t *)(iq3s_grid + (qs[2*il+0] | ((x[i].qh[ib] << (8-2*il)) & 256)));
Expand All @@ -478,13 +478,13 @@ static __global__ void dequantize_block_iq3_s(const void * __restrict__ vx, dst_
template<typename dst_t>
static __global__ void dequantize_block_iq1_s(const void * __restrict__ vx, dst_t * __restrict__ yy) {

const int i = blockIdx.x;
const int64_t i = blockIdx.x;
const block_iq1_s * x = (const block_iq1_s *) vx;

const int tid = threadIdx.x;
const int64_t tid = threadIdx.x;
#if QK_K == 256
const int il = tid/8; // 0...3
const int ib = tid%8; // 0...7
const int64_t il = tid/8; // 0...3
const int64_t ib = tid%8; // 0...7
dst_t * y = yy + i*QK_K + 32*ib + 8*il;
const float delta = x[i].qh[ib] & 0x8000 ? -1 - IQ1S_DELTA : -1 + IQ1S_DELTA;
const float d = (float)x[i].d * (2*((x[i].qh[ib] >> 12) & 7) + 1);
Expand All @@ -504,18 +504,18 @@ static __global__ void dequantize_block_iq1_s(const void * __restrict__ vx, dst_
template<typename dst_t>
static __global__ void dequantize_block_iq1_m(const void * __restrict__ vx, dst_t * __restrict__ yy) {

const int i = blockIdx.x;
const int64_t i = blockIdx.x;
const block_iq1_m * x = (const block_iq1_m *) vx;

const int tid = threadIdx.x;
const int64_t tid = threadIdx.x;
#if QK_K == 256
const int il = tid/8; // 0...3
const int ib = tid%8; // 0...7
const int64_t il = tid/8; // 0...3
const int64_t ib = tid%8; // 0...7
dst_t * y = yy + i*QK_K + 32*ib + 8*il;
const uint16_t * sc = (const uint16_t *)x[i].scales;
iq1m_scale_t scale;
scale.u16 = (sc[0] >> 12) | ((sc[1] >> 8) & 0x00f0) | ((sc[2] >> 4) & 0x0f00) | (sc[3] & 0xf000);
const int ib16 = 2*ib + il/2; // sc[ib16/4] >> 3*(ib16%4) -> sc[ib/2] >> 3*((2*ib+il/2)%4);
const int64_t ib16 = 2*ib + il/2; // sc[ib16/4] >> 3*(ib16%4) -> sc[ib/2] >> 3*((2*ib+il/2)%4);
const float d = (float)scale.f16 * (2*((sc[ib16/4] >> 3*(ib16%4)) & 0x7) + 1);
const float delta = x[i].qh[2*ib+il/2] & (0x08 << 4*(il%2)) ? -1 - IQ1M_DELTA : -1 + IQ1M_DELTA;
uint32_t grid32[2]; const int8_t * q = (const int8_t *)grid32;
Expand All @@ -535,12 +535,12 @@ static __global__ void dequantize_block_iq1_m(const void * __restrict__ vx, dst_
template<typename dst_t>
static __global__ void dequantize_block_iq4_nl(const void * __restrict__ vx, dst_t * __restrict__ yy) {

const int i = blockIdx.x;
const int64_t i = blockIdx.x;
const block_iq4_nl * x = (const block_iq4_nl *) vx + i*(QK_K/QK4_NL);

const int tid = threadIdx.x;
const int il = tid/8; // 0...3
const int ib = tid%8; // 0...7
const int64_t tid = threadIdx.x;
const int64_t il = tid/8; // 0...3
const int64_t ib = tid%8; // 0...7
dst_t * y = yy + i*QK_K + 32*ib + 4*il;
const uint8_t * q4 = x[ib].qs + 4*il;
const float d = (float)x[ib].d;
Expand All @@ -554,12 +554,12 @@ static __global__ void dequantize_block_iq4_nl(const void * __restrict__ vx, dst
#if QK_K != 64
template<typename dst_t>
static __global__ void dequantize_block_iq4_xs(const void * __restrict__ vx, dst_t * __restrict__ yy) {
const int i = blockIdx.x;
const int64_t i = blockIdx.x;
const block_iq4_xs * x = (const block_iq4_xs *)vx;

const int tid = threadIdx.x;
const int il = tid/8; // 0...3
const int ib = tid%8; // 0...7
const int64_t tid = threadIdx.x;
const int64_t il = tid/8; // 0...3
const int64_t ib = tid%8; // 0...7
dst_t * y = yy + i*QK_K + 32*ib + 4*il;
const uint8_t * q4 = x[i].qs + 16*ib + 4*il;
const float d = (float)x[i].d * ((((x[i].scales_l[ib/2] >> 4*(ib%2)) & 0xf) | (((x[i].scales_h >> 2*ib) & 3) << 4)) - 32);
Expand Down
8 changes: 4 additions & 4 deletions ggml-cuda/softmax.cu
Original file line number Diff line number Diff line change
Expand Up @@ -28,7 +28,7 @@ static __global__ void soft_max_f32(const float * x, const float * mask, const f
extern __shared__ float data_soft_max_f32[];
float * buf_iw = data_soft_max_f32; // shared memory buffer for inter-warp communication
// shared memory buffer to cache values between iterations:
float * vals = vals_smem ? buf_iw + WARP_SIZE : dst + rowx*ncols;
float * vals = vals_smem ? buf_iw + WARP_SIZE : dst + (int64_t)rowx*ncols;

float max_val = -INFINITY;

Expand All @@ -40,8 +40,8 @@ static __global__ void soft_max_f32(const float * x, const float * mask, const f
break;
}

const int ix = rowx*ncols + col;
const int iy = rowy*ncols + col;
const int64_t ix = (int64_t)rowx*ncols + col;
const int64_t iy = (int64_t)rowy*ncols + col;

const float val = x[ix]*scale + (mask ? mask[iy] : 0.0f) + (pos ? slope*pos[col] : 0.0f);

Expand Down Expand Up @@ -109,7 +109,7 @@ static __global__ void soft_max_f32(const float * x, const float * mask, const f
return;
}

const int idst = rowx*ncols + col;
const int64_t idst = (int64_t)rowx*ncols + col;
dst[idst] = vals[col] * inv_sum;
}
}
Expand Down
Loading