
Commit

Use bf16 kv cache when it's faster
jart committed Aug 25, 2024
1 parent 98eff09 commit fa4c4e7
Showing 4 changed files with 13 additions and 6 deletions.
3 changes: 3 additions & 0 deletions llama.cpp/common.cpp
@@ -2159,6 +2159,9 @@ static ggml_type kv_cache_type_from_str(const std::string & s) {
     if (s == "f32") {
         return GGML_TYPE_F32;
     }
+    if (s == "bf16") {
+        return GGML_TYPE_BF16;
+    }
     if (s == "f16") {
         return GGML_TYPE_F16;
     }
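
For context, the "bf16" name registered above maps to GGML's bfloat16 storage type: the upper 16 bits of an IEEE 754 binary32 value. It keeps fp32's full exponent range while halving the KV cache footprint, same as f16. A minimal sketch of the conversion (illustrative only; it truncates rather than reproducing ggml's exact rounding behavior):

// Minimal sketch, not ggml's implementation: bf16 is the top 16 bits of a
// float, so converting down is a shift and converting back zero-fills the
// discarded mantissa bits.
#include <cstdint>
#include <cstring>

static uint16_t f32_to_bf16(float f) {      // truncating convert (no rounding)
    uint32_t bits;
    std::memcpy(&bits, &f, sizeof bits);
    return static_cast<uint16_t>(bits >> 16);
}

static float bf16_to_f32(uint16_t h) {      // exact: low mantissa bits are zero
    uint32_t bits = static_cast<uint32_t>(h) << 16;
    float f;
    std::memcpy(&f, &bits, sizeof f);
    return f;
}

Because the exponent field is unchanged, anything representable in fp32 also fits in bf16 without overflow or underflow; only significand precision drops, from 24 bits to 8.
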
5 changes: 3 additions & 2 deletions llama.cpp/common.h
@@ -22,6 +22,7 @@
 #include <thread>
 #include <unordered_map>
 #include <tuple>
+#include <cosmo.h>

 #ifdef _WIN32
 #define DIRECTORY_SEPARATOR '\\'
@@ -192,8 +193,8 @@ struct gpt_params {
     bool warmup = true; // warmup run
     bool check_tensors = false; // validate tensor data

-    std::string cache_type_k = "f16"; // KV cache data type for the K
-    std::string cache_type_v = "f16"; // KV cache data type for the V
+    std::string cache_type_k = X86_HAVE(AVX512_BF16) ? "bf16" : "f16"; // KV cache data type for the K [jart]
+    std::string cache_type_v = X86_HAVE(AVX512_BF16) ? "bf16" : "f16"; // KV cache data type for the V [jart]

     // multimodal models (see examples/llava)
     std::string mmproj = ""; // path to multimodal projector
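
The new defaults above lean on Cosmopolitan's X86_HAVE() runtime probe, which is why <cosmo.h> is now included. For builds without that header, a rough equivalent would read the CPUID flag directly; this is a sketch under the assumption that AVX512_BF16 is reported in CPUID leaf 7, sub-leaf 1, EAX bit 5, and a production check would also confirm the OS has enabled AVX-512 state:

// Sketch of a non-Cosmopolitan fallback for choosing the same default.
#include <cpuid.h>
#include <string>

static std::string default_kv_cache_type() {
    unsigned eax = 0, ebx = 0, ecx = 0, edx = 0;
    // AVX512_BF16 feature flag: CPUID.(EAX=7, ECX=1):EAX bit 5.
    if (__get_cpuid_count(7, 1, &eax, &ebx, &ecx, &edx) && (eax & (1u << 5)))
        return "bf16";  // mirrors the X86_HAVE(AVX512_BF16) default above
    return "f16";       // previous default on every other CPU
}
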
5 changes: 4 additions & 1 deletion llama.cpp/llama.cpp
@@ -16766,7 +16766,10 @@ struct llama_context * llama_new_context_with_model(
         params.flash_attn = false;
     }

-    if (params.type_v != GGML_TYPE_F16 && !params.flash_attn) {
+    // [jart] allow bf16
+    if (params.type_v != GGML_TYPE_F16 &&
+        params.type_v != GGML_TYPE_BF16 &&
+        !params.flash_attn) {
         LLAMA_LOG_ERROR("%s: V cache quantization requires flash_attn\n", __func__);
         return nullptr;
     }
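
Restated as a standalone predicate (the function name is invented here for illustration; the real check is the inline condition above), the rule after this change is: with flash attention any V cache type passes, and without it only the two 16-bit float layouts are accepted.

// Illustrative restatement of the guard above; not an actual llama.cpp function.
#include "ggml.h"

static bool v_cache_type_ok(ggml_type type_v, bool flash_attn) {
    if (flash_attn)
        return true;                   // flash-attn kernels handle quantized V
    return type_v == GGML_TYPE_F16     // previously the only non-flash option
        || type_v == GGML_TYPE_BF16;   // newly permitted by this commit
}
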
6 changes: 3 additions & 3 deletions llamafile/tinyblas_cpu_sgemm.inc
@@ -73,7 +73,7 @@ bool llamafile_sgemm_impl(long m, long n, long k, const void *A, long lda, const

     case GGML_TYPE_BF16: {
 #if defined(__AVX512BF16__)
-        if (Btype == GGML_TYPE_F32 && n < 2) {
+        if (Btype == GGML_TYPE_F32 && n <= 2) {
             tinyBLAS<0, 16, __m512, __m512, ggml_bf16_t, float, TC> tb{
                 k, (const ggml_bf16_t *)A, lda, (const float *)B, ldb, C, ldc, ith, nth};
             tb.matmul(m, n);
@@ -120,7 +120,7 @@ bool llamafile_sgemm_impl(long m, long n, long k, const void *A, long lda, const

     case GGML_TYPE_F16: {
 #if defined(__AVX512F__)
-        if (Btype == GGML_TYPE_F32 && n < 2) {
+        if (Btype == GGML_TYPE_F32 && n <= 2) {
             tinyBLAS<0, 16, __m512, __m512, ggml_fp16_t, float, TC> tb{
                 k, (const ggml_fp16_t *)A, lda, (const float *)B, ldb, C, ldc, ith, nth};
             tb.matmul(m, n);
@@ -136,7 +136,7 @@ bool llamafile_sgemm_impl(long m, long n, long k, const void *A, long lda, const
         return true;
 #elif (defined(__AVX__) || defined(__AVX2__)) && defined(__F16C__)
         if (X86_CHECK(F16C)) {
-            if (Btype == GGML_TYPE_F32 && n < 2) {
+            if (Btype == GGML_TYPE_F32 && n <= 2) {
                 tinyBLAS<0, 8, __m256, __m256, ggml_fp16_t, float, TC> tb{
                     k, (const ggml_fp16_t *)A, lda, (const float *)B, ldb, C, ldc, ith, nth};
                 tb.matmul(m, n);
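
All three hunks above widen the specialized narrow-B path from n < 2 to n <= 2, so B operands with two f32 columns now also reach the kernels that pair a bf16 or f16 A with an f32 B. In scalar form, the inner product those kernels vectorize looks roughly like the following reference sketch (not the tinyBLAS code itself):

// Scalar reference for the mixed-precision dot product: widen each bf16
// element of A to f32, then multiply-accumulate against the f32 column of B.
#include <cstdint>
#include <cstring>

static float bf16_dot_f32(const uint16_t *a_bf16, const float *b_f32, long k) {
    float acc = 0.0f;
    for (long i = 0; i < k; ++i) {
        uint32_t bits = static_cast<uint32_t>(a_bf16[i]) << 16;  // bf16 -> f32
        float ai;
        std::memcpy(&ai, &bits, sizeof ai);
        acc += ai * b_f32[i];
    }
    return acc;
}
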

2 comments on commit fa4c4e7

@Djip007 (Contributor)

@jart (Collaborator, Author) commented on fa4c4e7 Aug 29, 2024

Oooh good catch. c7c4d65
