From 3c69f93d6caf1409bb50f7ad4ad116f510f8d3e0 Mon Sep 17 00:00:00 2001 From: Iwan Kawrakow Date: Fri, 21 Apr 2023 10:26:49 +0200 Subject: [PATCH 1/3] RMSE-optimized quants for all quantization types By default this new option is ON. One can turn it off by setting LLAMA_NO_RMSE. With this option enabled, the Q4_3 quantization results in a perplexity of 6.0344, so 0.0273 lower than simple Q4_3 quantization. --- CMakeLists.txt | 7 + Makefile | 4 + ggml.c | 357 ++++++++++++++++++++++++++++++++++++++----------- 3 files changed, 287 insertions(+), 81 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index 11ebe9eb66fae..1f31cfa20baf6 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -68,6 +68,9 @@ option(LLAMA_ACCELERATE "llama: enable Accelerate framework" option(LLAMA_OPENBLAS "llama: use OpenBLAS" OFF) option(LLAMA_CUBLAS "llama: use cuBLAS" OFF) +# RMSE minimization when quantizing +option(LLAMA_NO_RMSE "llama: disable RMSE minimization" OFF) + option(LLAMA_BUILD_TESTS "llama: build tests" ${LLAMA_STANDALONE}) option(LLAMA_BUILD_EXAMPLES "llama: build examples" ${LLAMA_STANDALONE}) @@ -99,6 +102,10 @@ if (NOT MSVC) endif() endif() +if (LLAMA_NO_RMSE) + add_compile_definitions(GGML_NO_RMSE) +endif() + if (APPLE AND LLAMA_ACCELERATE) find_library(ACCELERATE_FRAMEWORK Accelerate) if (ACCELERATE_FRAMEWORK) diff --git a/Makefile b/Makefile index b297959c937da..04ddc4f10b291 100644 --- a/Makefile +++ b/Makefile @@ -134,6 +134,10 @@ ifneq ($(filter armv8%,$(UNAME_M)),) CFLAGS += -mfp16-format=ieee -mno-unaligned-access endif +ifdef LLAMA_NO_RMSE + CFLAGS += -DGGML_NO_RMSE +endif + # # Print build information # diff --git a/ggml.c b/ggml.c index 281b20283c16f..78982eec86ca7 100644 --- a/ggml.c +++ b/ggml.c @@ -670,10 +670,107 @@ typedef struct { } block_q8_0; static_assert(sizeof(block_q8_0) == 3*sizeof(float) + QK8_0, "wrong q8_0 block size/padding"); +#ifndef GGML_NO_RMSE +// Stuff for RMSE-minimizing quantization +static inline int nearest_int(float fval) { + 
assert(fval <= 4194303.f); + float val = fval + 12582912.f; + int i; memcpy(&i, &val, sizeof(int)); + return (i & 0x007fffff) - 0x00400000; +} + +static float kquantize_q4_with_bounds(int n, int nmin, int nmax, const float * restrict X, int nCandidates, + const float * restrict candidates, int8_t * restrict L) { + assert (nmin >= INT8_MIN); + assert (nmax <= INT8_MAX); + float amax = 0; + for (int i=0; i sumlxM2*suml2P) { + if (sumlxP2 > best*suml2P) { + best = sumlxP2/suml2P; bestScale = iscale; + } + } else { + if (sumlxM2 > best*suml2M) { + best = sumlxM2/suml2M; bestScale = -iscale; + } + } + } + float sumlx = 0; int suml2 = 0; + for (int i=0; i best*suml2) { + best = sumlx2/suml2; bestScale = iscale; + } + } + float sumlx = 0; int suml2 = 0; + for (int i=0; i= INT8_MIN); - assert (nmax <= INT8_MAX); - float amax = 0; - for (int i=0; i max) max = x[j]; + } + if (max == min) { + *result_a = min; + *result_b = 1.f; + for (int j=0; j sumlxM2*suml2P) { - if (sumlxP2 > best*suml2P) { - best = sumlxP2/suml2P; bestScale = iscale; - } - } else { - if (sumlxM2 > best*suml2M) { - best = sumlxM2/suml2M; bestScale = -iscale; - } + float a = min, b = (max - min)/15; + float bi = 15/(max - min); + float simple_err = 0; + for (int j=0; j 0 && fabsf(a - aold) < epsilon*fabsf(aold) && fabsf(b - bold) < epsilon*fabsf(bold)) break; + } + float err = 0; + for (int j=0; j simple_err) { + a = min; b = (max - min)/15; + for (int j=0; j> 4; + + hist[vi0]++; + hist[vi1]++; + } +} + size_t ggml_quantize_q4_0(const float * src, void * dst, int n, int k, int64_t * hist) { assert(k % QK4_0 == 0); const int nb = k / QK4_0; @@ -12084,13 +12304,7 @@ size_t ggml_quantize_q4_0(const float * src, void * dst, int n, int k, int64_t * quantize_row_q4_0_reference(src + j, y, k); for (int i = 0; i < nb; i++) { - for (int l = 0; l < QK4_0; l += 2) { - const uint8_t vi0 = y[i].qs[l/2] & 0xF; - const uint8_t vi1 = y[i].qs[l/2] >> 4; - - hist[vi0]++; - hist[vi1]++; - } + collect_quant_histogram(QK4_0, 
y[i].qs, hist); } } @@ -12107,13 +12321,7 @@ size_t ggml_quantize_q4_1(const float * src, void * dst, int n, int k, int64_t * quantize_row_q4_1_reference(src + j, y, k); for (int i = 0; i < nb; i++) { - for (int l = 0; l < QK4_1; l += 2) { - const uint8_t vi0 = y[i].qs[l/2] & 0xF; - const uint8_t vi1 = y[i].qs[l/2] >> 4; - - hist[vi0]++; - hist[vi1]++; - } + collect_quant_histogram(QK4_1, y[i].qs, hist); } } @@ -12127,17 +12335,10 @@ size_t ggml_quantize_q4_2(const float * src, void * dst, int n, int k, int64_t * for (int j = 0; j < n; j += k) { block_q4_2 * restrict y = (block_q4_2 *)dst + j/QK4_2; - //quantize_row_q4_2_reference(src + j, y, k); - quantize_row_q4_2_rmse(src + j, y, k); + quantize_row_q4_2_reference(src + j, y, k); for (int i = 0; i < nb; i++) { - for (int l = 0; l < QK4_2; l += 2) { - const uint8_t vi0 = y[i].qs[l/2] & 0xF; - const uint8_t vi1 = y[i].qs[l/2] >> 4; - - hist[vi0]++; - hist[vi1]++; - } + collect_quant_histogram(QK4_2, y[i].qs, hist); } } @@ -12154,13 +12355,7 @@ size_t ggml_quantize_q4_3(const float * src, void * dst, int n, int k, int64_t * quantize_row_q4_3_reference(src + j, y, k); for (int i = 0; i < nb; i++) { - for (int l = 0; l < QK4_3; l += 2) { - const uint8_t vi0 = y[i].qs[l/2] & 0xF; - const uint8_t vi1 = y[i].qs[l/2] >> 4; - - hist[vi0]++; - hist[vi1]++; - } + collect_quant_histogram(QK4_3, y[i].qs, hist); } } From 4f4f90c92e91ff407ab428d00a8eb708719b503e Mon Sep 17 00:00:00 2001 From: Iwan Kawrakow Date: Fri, 21 Apr 2023 17:39:17 +0200 Subject: [PATCH 2/3] Fix test-quantize Test does not work with RMSE-minimization enabled, so have to put the test cases between ifdefs. 
--- tests/test-quantize.c | 45 +++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 45 insertions(+) create mode 100644 tests/test-quantize.c diff --git a/tests/test-quantize.c b/tests/test-quantize.c new file mode 100644 index 0000000000000..9dcefc3a01e8f --- /dev/null +++ b/tests/test-quantize.c @@ -0,0 +1,45 @@ +#include "ggml.h" +#undef NDEBUG +#include <assert.h> +#include <math.h> + +int main(void) { +// Sorry, but I have to disable these for RMSE-optimized quantization +#ifdef GGML_NO_RMSE + #define QK 32 + float src[QK]; + uint8_t dst[24]; + int64_t hist[16]; + + for (int i = 0; i < QK; i++) { + src[i] = (float)(i + 1); + } + + size_t size = ggml_quantize_q4_0(src, dst, QK, QK, hist); + assert(size == 20); + float max_result = ((float *)dst)[0]; + float max_expected = src[31] / ((1 << 3) - 1); + assert(max_result == max_expected); + for (int i = 0; i < QK; i++) { + uint8_t q4_result = (i % 2) ? (dst[sizeof(float) + i/2] >> 4) : (dst[sizeof(float) + i/2] & 0xF); + uint8_t q4_expected = roundf(src[i] / max_expected) + 8; + assert(q4_result == q4_expected); + } + + size = ggml_quantize_q4_1(src, dst, QK, QK, hist); + assert(size == 24); + float delta_result = ((float *)dst)[0]; + float delta_expected = (src[31] - src[0]) / ((1 << 4) - 1); + assert(delta_result == delta_expected); + float min_result = ((float *)dst)[1]; + float min_expected = src[0]; + assert(min_result == min_expected); + for (int i = 0; i < QK; i++) { + uint8_t q4_result = (i % 2) ?
(dst[sizeof(float)*2 + i/2] >> 4) : (dst[sizeof(float)*2 + i/2] & 0xF); + uint8_t q4_expected = roundf((src[i] - min_expected) / delta_expected); + assert(q4_result == q4_expected); + } +#endif + + return 0; +} From 6fd49ed050c037579a3fb14d0df2c119c814b2ec Mon Sep 17 00:00:00 2001 From: Iwan Kawrakow Date: Fri, 21 Apr 2023 18:09:43 +0200 Subject: [PATCH 3/3] Minor, plus rebase on master --- ggml.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/ggml.c b/ggml.c index 78982eec86ca7..392dcc5105535 100644 --- a/ggml.c +++ b/ggml.c @@ -2012,7 +2012,7 @@ static const quantize_fns_t quantize_fns[GGML_TYPE_COUNT] = { [GGML_TYPE_Q4_3] = { .dequantize_row_q = dequantize_row_q4_3, .quantize_row_q = quantize_row_q4_3, - .quantize_row_q_reference = (quantize_row_q_t) quantize_row_q4_3_reference, // TODO: RMSE optimization + .quantize_row_q_reference = (quantize_row_q_t) quantize_row_q4_3_reference, .quantize_row_q_dot = quantize_row_q8_0, .vec_dot_q = ggml_vec_dot_q4_3_q8_0, },