From fd20638bbcb4b1ba69783312bb78545fa418d3f2 Mon Sep 17 00:00:00 2001
From: Kawrakow
Date: Sun, 29 Sep 2024 09:03:52 +0300
Subject: [PATCH] Allow bf16 kv-cache (#69)

On the CPU I get the exact same PPL with and without FA using bf16 for
kv-cache. But on CUDA the bf16 kv-cache result is about the same as the
fp16 kv-cache CPU result, so I'm missing some conversion somewhere.

Co-authored-by: Iwan Kawrakow
---
 ggml/src/ggml-cuda.cu     |  3 +++
 ggml/src/ggml-cuda/cpy.cu | 21 +++++++++++++++++++++
 src/llama.cpp             |  2 +-
 3 files changed, 25 insertions(+), 1 deletion(-)

diff --git a/ggml/src/ggml-cuda.cu b/ggml/src/ggml-cuda.cu
index 3d24cc6f..617dd58f 100644
--- a/ggml/src/ggml-cuda.cu
+++ b/ggml/src/ggml-cuda.cu
@@ -2862,6 +2862,9 @@ GGML_CALL static bool ggml_backend_cuda_supports_op(ggml_backend_t backend, cons
                 if (src0_type == GGML_TYPE_F32 && src1_type == GGML_TYPE_F16) {
                     return true;
                 }
+                if (src0_type == GGML_TYPE_F32 && src1_type == GGML_TYPE_BF16) {
+                    return true;
+                }
                 if (src0_type == GGML_TYPE_F32 && src1_type == GGML_TYPE_Q8_0) {
                     return true;
                 }
diff --git a/ggml/src/ggml-cuda/cpy.cu b/ggml/src/ggml-cuda/cpy.cu
index aad34bfe..6ec3b5f3 100644
--- a/ggml/src/ggml-cuda/cpy.cu
+++ b/ggml/src/ggml-cuda/cpy.cu
@@ -16,6 +16,13 @@ static __device__ void cpy_1_f32_f16(const char * cxi, char * cdsti) {
     *dsti = __float2half(*xi);
 }
 
+static __device__ void cpy_1_f32_bf16(const char * cxi, char * cdsti) {
+    const float * xi = (const float *) cxi;
+    nv_bfloat16 * dsti = (nv_bfloat16 *) cdsti;
+
+    *dsti = __float2bfloat16(*xi);
+}
+
 static __device__ void cpy_1_f16_f16(const char * cxi, char * cdsti) {
     const half * xi = (const half *) cxi;
     half * dsti = (half *) cdsti;
@@ -318,6 +325,16 @@ static void ggml_cpy_f32_f16_cuda(
         (cx, cdst, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13);
 }
 
+static void ggml_cpy_f32_bf16_cuda(
+    const char * cx, char * cdst, const int ne,
+    const int ne00, const int ne01, const int ne02, const int nb00, const int nb01, const int nb02,
+    const int nb03, const int ne10, const int ne11, const int ne12, const int nb10, const int nb11, const int nb12, const int nb13, cudaStream_t stream) {
+
+    const int num_blocks = (ne + CUDA_CPY_BLOCK_SIZE - 1) / CUDA_CPY_BLOCK_SIZE;
+    cpy_f32_f16<cpy_1_f32_bf16><<<num_blocks, CUDA_CPY_BLOCK_SIZE, 0, stream>>>
+        (cx, cdst, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13);
+}
+
 static void ggml_cpy_f32_q8_0_cuda(
     const char * cx, char * cdst, const int ne,
     const int ne00, const int ne01, const int ne02, const int nb00, const int nb01, const int nb02,
@@ -432,6 +449,8 @@ void ggml_cuda_cpy(ggml_backend_cuda_context & ctx, const ggml_tensor * src0, gg
         ggml_cpy_f32_f32_cuda (src0_ddc, src1_ddc, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13, main_stream);
     } else if (src0->type == GGML_TYPE_F32 && src1->type == GGML_TYPE_F16) {
         ggml_cpy_f32_f16_cuda (src0_ddc, src1_ddc, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13, main_stream);
+    } else if (src0->type == GGML_TYPE_F32 && src1->type == GGML_TYPE_BF16) {
+        ggml_cpy_f32_bf16_cuda (src0_ddc, src1_ddc, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13, main_stream);
     } else if (src0->type == GGML_TYPE_F32 && src1->type == GGML_TYPE_Q8_0) {
         ggml_cpy_f32_q8_0_cuda(src0_ddc, src1_ddc, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13, main_stream);
     } else if (src0->type == GGML_TYPE_F32 && src1->type == GGML_TYPE_Q4_0) {
@@ -465,6 +484,8 @@ void* ggml_cuda_cpy_fn(const ggml_tensor * src0, ggml_tensor * src1) {
         return (void*) cpy_f32_f16<cpy_1_f32_f32>;
     } else if (src0->type == GGML_TYPE_F32 && src1->type == GGML_TYPE_F16) {
         return (void*) cpy_f32_f16<cpy_1_f32_f16>;
+    } else if (src0->type == GGML_TYPE_F32 && src1->type == GGML_TYPE_BF16) {
+        return (void*) cpy_f32_f16<cpy_1_f32_bf16>;
     } else if (src0->type == GGML_TYPE_F32 && src1->type == GGML_TYPE_Q8_0) {
         return (void*) cpy_f32_q<cpy_blck_f32_q8_0, QK8_0>;
     } else if (src0->type == GGML_TYPE_F32 && src1->type == GGML_TYPE_Q4_0) {
diff --git a/src/llama.cpp b/src/llama.cpp
index 94a939d8..dca03ade 100644
--- a/src/llama.cpp
+++ b/src/llama.cpp
@@ -16816,7 +16816,7 @@ struct llama_context * llama_new_context_with_model(
         params.flash_attn = false;
     }
 
-    if (params.type_v != GGML_TYPE_F16 && !params.flash_attn) {
+    if (params.type_v != GGML_TYPE_F16 && params.type_v != GGML_TYPE_BF16 && !params.flash_attn) {
        LLAMA_LOG_ERROR("%s: V cache quantization requires flash_attn\n", __func__);
        return nullptr;
    }
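
Aside on the fp16-vs-bf16 discrepancy described in the commit message: the
sketch below is not part of the patch, just a minimal self-contained CUDA
program (file, kernel, and variable names are made up for illustration) that
round-trips a few floats through the same __float2half and __float2bfloat16
conversions the kv-cache copy paths use. fp16 keeps 10 mantissa bits and bf16
only 8, so the two formats round the same input differently; if a
supposedly-bf16 path produces values matching the fp16 column, a conversion
is being bypassed somewhere.

// roundtrip.cu -- standalone sanity check, not part of the patch.
// Names here are illustrative assumptions, not from the source.
#include <cstdio>
#include <cuda_runtime.h>
#include <cuda_fp16.h>
#include <cuda_bf16.h>

__global__ void roundtrip(const float * x, float * via_f16, float * via_bf16, int n) {
    int i = blockIdx.x*blockDim.x + threadIdx.x;
    if (i >= n) return;
    via_f16 [i] = __half2float(__float2half(x[i]));           // fp16 kv-cache rounding
    via_bf16[i] = __bfloat162float(__float2bfloat16(x[i]));   // bf16 kv-cache rounding
}

int main() {
    const int n = 4;
    const float h_x[n] = {1.0f/3.0f, 0.1f, 1234.5678f, 3.0e-5f};

    float *d_x, *d_f16, *d_bf16;
    cudaMalloc(&d_x,    n*sizeof(float));
    cudaMalloc(&d_f16,  n*sizeof(float));
    cudaMalloc(&d_bf16, n*sizeof(float));
    cudaMemcpy(d_x, h_x, n*sizeof(float), cudaMemcpyHostToDevice);

    roundtrip<<<1, n>>>(d_x, d_f16, d_bf16, n);

    float h_f16[n], h_bf16[n];
    cudaMemcpy(h_f16,  d_f16,  n*sizeof(float), cudaMemcpyDeviceToHost);
    cudaMemcpy(h_bf16, d_bf16, n*sizeof(float), cudaMemcpyDeviceToHost);

    // The fp16 and bf16 columns should disagree for most inputs;
    // identical columns would indicate a skipped conversion.
    for (int i = 0; i < n; ++i) {
        printf("x = %-14.8g  via fp16 = %-14.8g  via bf16 = %-14.8g\n",
               h_x[i], h_f16[i], h_bf16[i]);
    }

    cudaFree(d_x); cudaFree(d_f16); cudaFree(d_bf16);
    return 0;
}

Compiled with, e.g., nvcc -arch=sm_80 roundtrip.cu (bf16 conversion intrinsics
target sm_80 and newer; older architectures may rely on emulation, depending
on toolkit version).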