Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Added RISC-V Vector Support for K-Quants and improved the existing intrinsics #3453

Merged
merged 2 commits into from
Oct 3, 2023
Merged
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
250 changes: 153 additions & 97 deletions ggml.c
Original file line number Diff line number Diff line change
@@ -1272,6 +1272,33 @@ static void quantize_row_q8_0(const float * restrict x, void * restrict vy, int
_mm_storeu_si128((__m128i *)(y[i].qs + 16), ni4);
#endif
}
#elif defined(__riscv_v_intrinsic)

size_t vl = __riscv_vsetvl_e32m4(QK8_0);

for (int i = 0; i < nb; i++) {
// load elements
vfloat32m4_t v_x = __riscv_vle32_v_f32m4(x+i*QK8_0, vl);

vfloat32m4_t vfabs = __riscv_vfabs_v_f32m4(v_x, vl);
vfloat32m1_t tmp = __riscv_vfmv_v_f_f32m1(0.0f, vl);
vfloat32m1_t vmax = __riscv_vfredmax_vs_f32m4_f32m1(vfabs, tmp, vl);
float amax = __riscv_vfmv_f_s_f32m1_f32(vmax);

const float d = amax / ((1 << 7) - 1);
const float id = d ? 1.0f/d : 0.0f;

y[i].d = GGML_FP32_TO_FP16(d);

vfloat32m4_t x0 = __riscv_vfmul_vf_f32m4(v_x, id, vl);

// convert to integer
vint16m2_t vi = __riscv_vfncvt_x_f_w_i16m2(x0, vl);
vint8m1_t vs = __riscv_vncvt_x_x_w_i8m1(vi, vl);

// store result
__riscv_vse8_v_i8m1(y[i].qs , vs, vl);
}
#else
// scalar
quantize_row_q8_0_reference(x, y, k);
@@ -1490,6 +1517,41 @@ static void quantize_row_q8_1(const float * restrict x, void * restrict vy, int
_mm_storeu_si128((__m128i *)(y[i].qs + 16), ni4);
#endif
}
#elif defined(__riscv_v_intrinsic)

size_t vl = __riscv_vsetvl_e32m4(QK8_1);

for (int i = 0; i < nb; i++) {
// load elements
vfloat32m4_t v_x = __riscv_vle32_v_f32m4(x+i*QK8_1, vl);

vfloat32m4_t vfabs = __riscv_vfabs_v_f32m4(v_x, vl);
vfloat32m1_t tmp = __riscv_vfmv_v_f_f32m1(0.0, vl);
vfloat32m1_t vmax = __riscv_vfredmax_vs_f32m4_f32m1(vfabs, tmp, vl);
float amax = __riscv_vfmv_f_s_f32m1_f32(vmax);

const float d = amax / ((1 << 7) - 1);
const float id = d ? 1.0f/d : 0.0f;

y[i].d = d;

vfloat32m4_t x0 = __riscv_vfmul_vf_f32m4(v_x, id, vl);

// convert to integer
vint16m2_t vi = __riscv_vfncvt_x_f_w_i16m2(x0, vl);
vint8m1_t vs = __riscv_vncvt_x_x_w_i8m1(vi, vl);

// store result
__riscv_vse8_v_i8m1(y[i].qs , vs, vl);

// compute sum for y[i].s
vint16m1_t tmp2 = __riscv_vmv_v_x_i16m1(0, vl);
vint16m1_t vwrs = __riscv_vwredsum_vs_i8m1_i16m1(vs, tmp2, vl);

// set y[i].s
int sum = __riscv_vmv_x_s_i16m1_i16(vwrs);
y[i].s = sum*d;
}
#else
// scalar
quantize_row_q8_1_reference(x, y, k);
@@ -2662,30 +2724,32 @@ static void ggml_vec_dot_q4_0_q8_0(const int n, float * restrict s, const void *
size_t vl = __riscv_vsetvl_e8m1(qk/2);

for (int i = 0; i < nb; i++) {
vuint8m1_t tx = __riscv_vle8_v_u8m1(x[i].qs, vl);
// load elements
vuint8mf2_t tx = __riscv_vle8_v_u8mf2(x[i].qs, vl);

vint8m1_t y0 = __riscv_vle8_v_i8m1(y[i].qs, vl);
vint8m1_t y1 = __riscv_vle8_v_i8m1(y[i].qs+16, vl);
vint8mf2_t y0 = __riscv_vle8_v_i8mf2(y[i].qs, vl);
vint8mf2_t y1 = __riscv_vle8_v_i8mf2(y[i].qs+16, vl);

vuint8m1_t x_a = __riscv_vand_vx_u8m1(tx, 0x0F, vl);
vuint8m1_t x_l = __riscv_vsrl_vx_u8m1(tx, 0x04, vl);
// mask and store lower part of x, and then upper part
vuint8mf2_t x_a = __riscv_vand_vx_u8mf2(tx, 0x0F, vl);
vuint8mf2_t x_l = __riscv_vsrl_vx_u8mf2(tx, 0x04, vl);

vint8m1_t x_ai = __riscv_vreinterpret_v_u8m1_i8m1(x_a);
vint8m1_t x_li = __riscv_vreinterpret_v_u8m1_i8m1(x_l);
vint8mf2_t x_ai = __riscv_vreinterpret_v_u8mf2_i8mf2(x_a);
vint8mf2_t x_li = __riscv_vreinterpret_v_u8mf2_i8mf2(x_l);

vint8m1_t v0 = __riscv_vsub_vx_i8m1(x_ai, 8, vl);
vint8m1_t v1 = __riscv_vsub_vx_i8m1(x_li, 8, vl);
// subtract offset
vint8mf2_t v0 = __riscv_vsub_vx_i8mf2(x_ai, 8, vl);
vint8mf2_t v1 = __riscv_vsub_vx_i8mf2(x_li, 8, vl);

vint16m2_t vec_mul1 = __riscv_vwmul_vv_i16m2(v0, y0, vl);
vint16m2_t vec_mul2 = __riscv_vwmul_vv_i16m2(v1, y1, vl);
vint16m1_t vec_mul1 = __riscv_vwmul_vv_i16m1(v0, y0, vl);
vint16m1_t vec_mul2 = __riscv_vwmul_vv_i16m1(v1, y1, vl);

vint32m1_t vec_zero = __riscv_vmv_v_x_i32m1(0, vl);

vint32m1_t vs1 = __riscv_vwredsum_vs_i16m2_i32m1(vec_mul1, vec_zero, vl);
vint32m1_t vs2 = __riscv_vwredsum_vs_i16m2_i32m1(vec_mul2, vec_zero, vl);
vint32m1_t vs1 = __riscv_vwredsum_vs_i16m1_i32m1(vec_mul1, vec_zero, vl);
vint32m1_t vs2 = __riscv_vwredsum_vs_i16m1_i32m1(vec_mul2, vs1, vl);

int sumi = __riscv_vmv_x_s_i32m1_i32(vs1);
sumi += __riscv_vmv_x_s_i32m1_i32(vs2);
int sumi = __riscv_vmv_x_s_i32m1_i32(vs2);

sumf += sumi*GGML_FP16_TO_FP32(x[i].d)*GGML_FP16_TO_FP32(y[i].d);
}
@@ -2823,27 +2887,28 @@ static void ggml_vec_dot_q4_1_q8_1(const int n, float * restrict s, const void *
size_t vl = __riscv_vsetvl_e8m1(qk/2);

for (int i = 0; i < nb; i++) {
vuint8m1_t tx = __riscv_vle8_v_u8m1(x[i].qs, vl);
// load elements
vuint8mf2_t tx = __riscv_vle8_v_u8mf2(x[i].qs, vl);

vint8m1_t y0 = __riscv_vle8_v_i8m1(y[i].qs, vl);
vint8m1_t y1 = __riscv_vle8_v_i8m1(y[i].qs+16, vl);
vint8mf2_t y0 = __riscv_vle8_v_i8mf2(y[i].qs, vl);
vint8mf2_t y1 = __riscv_vle8_v_i8mf2(y[i].qs+16, vl);

vuint8m1_t x_a = __riscv_vand_vx_u8m1(tx, 0x0F, vl);
vuint8m1_t x_l = __riscv_vsrl_vx_u8m1(tx, 0x04, vl);
// mask and store lower part of x, and then upper part
vuint8mf2_t x_a = __riscv_vand_vx_u8mf2(tx, 0x0F, vl);
vuint8mf2_t x_l = __riscv_vsrl_vx_u8mf2(tx, 0x04, vl);

vint8m1_t v0 = __riscv_vreinterpret_v_u8m1_i8m1(x_a);
vint8m1_t v1 = __riscv_vreinterpret_v_u8m1_i8m1(x_l);
vint8mf2_t v0 = __riscv_vreinterpret_v_u8mf2_i8mf2(x_a);
vint8mf2_t v1 = __riscv_vreinterpret_v_u8mf2_i8mf2(x_l);

vint16m2_t vec_mul1 = __riscv_vwmul_vv_i16m2(v0, y0, vl);
vint16m2_t vec_mul2 = __riscv_vwmul_vv_i16m2(v1, y1, vl);
vint16m1_t vec_mul1 = __riscv_vwmul_vv_i16m1(v0, y0, vl);
vint16m1_t vec_mul2 = __riscv_vwmul_vv_i16m1(v1, y1, vl);

vint32m1_t vec_zero = __riscv_vmv_v_x_i32m1(0, vl);

vint32m1_t vs1 = __riscv_vwredsum_vs_i16m2_i32m1(vec_mul1, vec_zero, vl);
vint32m1_t vs2 = __riscv_vwredsum_vs_i16m2_i32m1(vec_mul2, vec_zero, vl);
vint32m1_t vs1 = __riscv_vwredsum_vs_i16m1_i32m1(vec_mul1, vec_zero, vl);
vint32m1_t vs2 = __riscv_vwredsum_vs_i16m1_i32m1(vec_mul2, vs1, vl);

int sumi = __riscv_vmv_x_s_i32m1_i32(vs1);
sumi += __riscv_vmv_x_s_i32m1_i32(vs2);
int sumi = __riscv_vmv_x_s_i32m1_i32(vs2);

sumf += (GGML_FP16_TO_FP32(x[i].d)*y[i].d)*sumi + GGML_FP16_TO_FP32(x[i].m)*y[i].s;
}
@@ -3088,66 +3153,61 @@ static void ggml_vec_dot_q5_0_q8_0(const int n, float * restrict s, const void *

uint32_t qh;

// These temp values are for masking and shift operations
uint32_t temp_1[16] = {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15};
uint32_t temp_2[16] = {0x1, 0x2, 0x4, 0x8, 0x10, 0x20, 0x40, 0x80,
0x100, 0x200, 0x400, 0x800, 0x1000, 0x2000, 0x4000, 0x8000};

size_t vl = __riscv_vsetvl_e8m1(qk/2);

// These tempory registers are for masking and shift operations
vuint32m2_t vt_1 = __riscv_vid_v_u32m2(vl);
vuint32m2_t vt_2 = __riscv_vsll_vv_u32m2(__riscv_vmv_v_x_u32m2(1, vl), vt_1, vl);

vuint32m2_t vt_3 = __riscv_vsll_vx_u32m2(vt_2, 16, vl);
vuint32m2_t vt_4 = __riscv_vadd_vx_u32m2(vt_1, 12, vl);

for (int i = 0; i < nb; i++) {
memcpy(&qh, x[i].qh, sizeof(uint32_t));

// temporary registers
vuint32m4_t vt_1 = __riscv_vle32_v_u32m4(temp_2, vl);
vuint32m4_t vt_2 = __riscv_vle32_v_u32m4(temp_1, vl);
vuint32m4_t vt_3 = __riscv_vsll_vx_u32m4(vt_1, 16, vl);
vuint32m4_t vt_4 = __riscv_vadd_vx_u32m4(vt_2, 12, vl);

// ((qh & (1u << (j + 0 ))) >> (j + 0 )) << 4;
vuint32m4_t xha_0 = __riscv_vand_vx_u32m4(vt_1, qh, vl);
vuint32m4_t xhr_0 = __riscv_vsrl_vv_u32m4(xha_0, vt_2, vl);
vuint32m4_t xhl_0 = __riscv_vsll_vx_u32m4(xhr_0, 4, vl);
vuint32m2_t xha_0 = __riscv_vand_vx_u32m2(vt_2, qh, vl);
vuint32m2_t xhr_0 = __riscv_vsrl_vv_u32m2(xha_0, vt_1, vl);
vuint32m2_t xhl_0 = __riscv_vsll_vx_u32m2(xhr_0, 4, vl);

// ((qh & (1u << (j + 16))) >> (j + 12));
vuint32m4_t xha_1 = __riscv_vand_vx_u32m4(vt_3, qh, vl);
vuint32m4_t xhl_1 = __riscv_vsrl_vv_u32m4(xha_1, vt_4, vl);
vuint32m2_t xha_1 = __riscv_vand_vx_u32m2(vt_3, qh, vl);
vuint32m2_t xhl_1 = __riscv_vsrl_vv_u32m2(xha_1, vt_4, vl);

// narrowing
vuint16m2_t xhc_0 = __riscv_vncvt_x_x_w_u16m2(xhl_0, vl);
vuint8m1_t xh_0 = __riscv_vncvt_x_x_w_u8m1(xhc_0, vl);
vuint16m1_t xhc_0 = __riscv_vncvt_x_x_w_u16m1(xhl_0, vl);
vuint8mf2_t xh_0 = __riscv_vncvt_x_x_w_u8mf2(xhc_0, vl);

vuint16m2_t xhc_1 = __riscv_vncvt_x_x_w_u16m2(xhl_1, vl);
vuint8m1_t xh_1 = __riscv_vncvt_x_x_w_u8m1(xhc_1, vl);
vuint16m1_t xhc_1 = __riscv_vncvt_x_x_w_u16m1(xhl_1, vl);
vuint8mf2_t xh_1 = __riscv_vncvt_x_x_w_u8mf2(xhc_1, vl);

// load
vuint8m1_t tx = __riscv_vle8_v_u8m1(x[i].qs, vl);
vuint8mf2_t tx = __riscv_vle8_v_u8mf2(x[i].qs, vl);

vint8m1_t y0 = __riscv_vle8_v_i8m1(y[i].qs, vl);
vint8m1_t y1 = __riscv_vle8_v_i8m1(y[i].qs+16, vl);
vint8mf2_t y0 = __riscv_vle8_v_i8mf2(y[i].qs, vl);
vint8mf2_t y1 = __riscv_vle8_v_i8mf2(y[i].qs+16, vl);

vuint8m1_t x_at = __riscv_vand_vx_u8m1(tx, 0x0F, vl);
vuint8m1_t x_lt = __riscv_vsrl_vx_u8m1(tx, 0x04, vl);
vuint8mf2_t x_at = __riscv_vand_vx_u8mf2(tx, 0x0F, vl);
vuint8mf2_t x_lt = __riscv_vsrl_vx_u8mf2(tx, 0x04, vl);

vuint8m1_t x_a = __riscv_vor_vv_u8m1(x_at, xh_0, vl);
vuint8m1_t x_l = __riscv_vor_vv_u8m1(x_lt, xh_1, vl);
vuint8mf2_t x_a = __riscv_vor_vv_u8mf2(x_at, xh_0, vl);
vuint8mf2_t x_l = __riscv_vor_vv_u8mf2(x_lt, xh_1, vl);

vint8m1_t x_ai = __riscv_vreinterpret_v_u8m1_i8m1(x_a);
vint8m1_t x_li = __riscv_vreinterpret_v_u8m1_i8m1(x_l);
vint8mf2_t x_ai = __riscv_vreinterpret_v_u8mf2_i8mf2(x_a);
vint8mf2_t x_li = __riscv_vreinterpret_v_u8mf2_i8mf2(x_l);

vint8m1_t v0 = __riscv_vsub_vx_i8m1(x_ai, 16, vl);
vint8m1_t v1 = __riscv_vsub_vx_i8m1(x_li, 16, vl);
vint8mf2_t v0 = __riscv_vsub_vx_i8mf2(x_ai, 16, vl);
vint8mf2_t v1 = __riscv_vsub_vx_i8mf2(x_li, 16, vl);

vint16m2_t vec_mul1 = __riscv_vwmul_vv_i16m2(v0, y0, vl);
vint16m2_t vec_mul2 = __riscv_vwmul_vv_i16m2(v1, y1, vl);
vint16m1_t vec_mul1 = __riscv_vwmul_vv_i16m1(v0, y0, vl);
vint16m1_t vec_mul2 = __riscv_vwmul_vv_i16m1(v1, y1, vl);

vint32m1_t vec_zero = __riscv_vmv_v_x_i32m1(0, vl);

vint32m1_t vs1 = __riscv_vwredsum_vs_i16m2_i32m1(vec_mul1, vec_zero, vl);
vint32m1_t vs2 = __riscv_vwredsum_vs_i16m2_i32m1(vec_mul2, vec_zero, vl);
vint32m1_t vs1 = __riscv_vwredsum_vs_i16m1_i32m1(vec_mul1, vec_zero, vl);
vint32m1_t vs2 = __riscv_vwredsum_vs_i16m1_i32m1(vec_mul2, vs1, vl);

int sumi = __riscv_vmv_x_s_i32m1_i32(vs1);
sumi += __riscv_vmv_x_s_i32m1_i32(vs2);
int sumi = __riscv_vmv_x_s_i32m1_i32(vs2);

sumf += (GGML_FP16_TO_FP32(x[i].d)*GGML_FP16_TO_FP32(y[i].d)) * sumi;
}
@@ -3414,62 +3474,58 @@ static void ggml_vec_dot_q5_1_q8_1(const int n, float * restrict s, const void *

uint32_t qh;

// These temp values are for shift operations
uint32_t temp_1[16] = {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15};

size_t vl = __riscv_vsetvl_e8m1(qk/2);

// temporary registers for shift operations
vuint32m2_t vt_1 = __riscv_vid_v_u32m2(vl);
vuint32m2_t vt_2 = __riscv_vadd_vx_u32m2(vt_1, 12, vl);

for (int i = 0; i < nb; i++) {
memcpy(&qh, x[i].qh, sizeof(uint32_t));

// temporary registers
vuint32m4_t vt_1 = __riscv_vle32_v_u32m4(temp_1, vl);
vuint32m4_t vt_2 = __riscv_vadd_vx_u32m4(vt_1, 12, vl);

// load qh
vuint32m4_t vqh = __riscv_vmv_v_x_u32m4(qh, vl);
vuint32m2_t vqh = __riscv_vmv_v_x_u32m2(qh, vl);

// ((qh >> (j + 0)) << 4) & 0x10;
vuint32m4_t xhr_0 = __riscv_vsrl_vv_u32m4(vqh, vt_1, vl);
vuint32m4_t xhl_0 = __riscv_vsll_vx_u32m4(xhr_0, 4, vl);
vuint32m4_t xha_0 = __riscv_vand_vx_u32m4(xhl_0, 0x10, vl);
vuint32m2_t xhr_0 = __riscv_vsrl_vv_u32m2(vqh, vt_1, vl);
vuint32m2_t xhl_0 = __riscv_vsll_vx_u32m2(xhr_0, 4, vl);
vuint32m2_t xha_0 = __riscv_vand_vx_u32m2(xhl_0, 0x10, vl);

// ((qh >> (j + 12)) ) & 0x10;
vuint32m4_t xhr_1 = __riscv_vsrl_vv_u32m4(vqh, vt_2, vl);
vuint32m4_t xha_1 = __riscv_vand_vx_u32m4(xhr_1, 0x10, vl);
vuint32m2_t xhr_1 = __riscv_vsrl_vv_u32m2(vqh, vt_2, vl);
vuint32m2_t xha_1 = __riscv_vand_vx_u32m2(xhr_1, 0x10, vl);

// narrowing
vuint16m2_t xhc_0 = __riscv_vncvt_x_x_w_u16m2(xha_0, vl);
vuint8m1_t xh_0 = __riscv_vncvt_x_x_w_u8m1(xhc_0, vl);
vuint16m1_t xhc_0 = __riscv_vncvt_x_x_w_u16m1(xha_0, vl);
vuint8mf2_t xh_0 = __riscv_vncvt_x_x_w_u8mf2(xhc_0, vl);

vuint16m2_t xhc_1 = __riscv_vncvt_x_x_w_u16m2(xha_1, vl);
vuint8m1_t xh_1 = __riscv_vncvt_x_x_w_u8m1(xhc_1, vl);
vuint16m1_t xhc_1 = __riscv_vncvt_x_x_w_u16m1(xha_1, vl);
vuint8mf2_t xh_1 = __riscv_vncvt_x_x_w_u8mf2(xhc_1, vl);

// load
vuint8m1_t tx = __riscv_vle8_v_u8m1(x[i].qs, vl);
vuint8mf2_t tx = __riscv_vle8_v_u8mf2(x[i].qs, vl);

vint8m1_t y0 = __riscv_vle8_v_i8m1(y[i].qs, vl);
vint8m1_t y1 = __riscv_vle8_v_i8m1(y[i].qs+16, vl);
vint8mf2_t y0 = __riscv_vle8_v_i8mf2(y[i].qs, vl);
vint8mf2_t y1 = __riscv_vle8_v_i8mf2(y[i].qs+16, vl);

vuint8m1_t x_at = __riscv_vand_vx_u8m1(tx, 0x0F, vl);
vuint8m1_t x_lt = __riscv_vsrl_vx_u8m1(tx, 0x04, vl);
vuint8mf2_t x_at = __riscv_vand_vx_u8mf2(tx, 0x0F, vl);
vuint8mf2_t x_lt = __riscv_vsrl_vx_u8mf2(tx, 0x04, vl);

vuint8m1_t x_a = __riscv_vor_vv_u8m1(x_at, xh_0, vl);
vuint8m1_t x_l = __riscv_vor_vv_u8m1(x_lt, xh_1, vl);
vuint8mf2_t x_a = __riscv_vor_vv_u8mf2(x_at, xh_0, vl);
vuint8mf2_t x_l = __riscv_vor_vv_u8mf2(x_lt, xh_1, vl);

vint8m1_t v0 = __riscv_vreinterpret_v_u8m1_i8m1(x_a);
vint8m1_t v1 = __riscv_vreinterpret_v_u8m1_i8m1(x_l);
vint8mf2_t v0 = __riscv_vreinterpret_v_u8mf2_i8mf2(x_a);
vint8mf2_t v1 = __riscv_vreinterpret_v_u8mf2_i8mf2(x_l);

vint16m2_t vec_mul1 = __riscv_vwmul_vv_i16m2(v0, y0, vl);
vint16m2_t vec_mul2 = __riscv_vwmul_vv_i16m2(v1, y1, vl);
vint16m1_t vec_mul1 = __riscv_vwmul_vv_i16m1(v0, y0, vl);
vint16m1_t vec_mul2 = __riscv_vwmul_vv_i16m1(v1, y1, vl);

vint32m1_t vec_zero = __riscv_vmv_v_x_i32m1(0, vl);

vint32m1_t vs1 = __riscv_vwredsum_vs_i16m2_i32m1(vec_mul1, vec_zero, vl);
vint32m1_t vs2 = __riscv_vwredsum_vs_i16m2_i32m1(vec_mul2, vec_zero, vl);
vint32m1_t vs1 = __riscv_vwredsum_vs_i16m1_i32m1(vec_mul1, vec_zero, vl);
vint32m1_t vs2 = __riscv_vwredsum_vs_i16m1_i32m1(vec_mul2, vs1, vl);

int sumi = __riscv_vmv_x_s_i32m1_i32(vs1);
sumi += __riscv_vmv_x_s_i32m1_i32(vs2);
int sumi = __riscv_vmv_x_s_i32m1_i32(vs2);

sumf += (GGML_FP16_TO_FP32(x[i].d)*y[i].d)*sumi + GGML_FP16_TO_FP32(x[i].m)*y[i].s;
}
Loading