From 01f638bd3dbc8eca99fb2b6bbc110910ba4f180a Mon Sep 17 00:00:00 2001 From: juk Date: Mon, 10 Mar 2025 15:26:40 +0000 Subject: [PATCH 01/11] Initial commit of all reformatted changes for deepseek2-mla --- common/arg.cpp | 7 + common/common.h | 1 + convert_hf_to_gguf.py | 72 ++++++ examples/server/README.md | 1 + gguf-py/gguf/constants.py | 21 ++ gguf-py/gguf/tensor_mapping.py | 28 +++ src/llama-arch.cpp | 31 ++- src/llama-arch.h | 7 + src/llama-cparams.h | 1 + src/llama-kv-cache.cpp | 19 +- src/llama-model.cpp | 16 +- src/llama-model.h | 41 ++-- src/llama.cpp | 414 ++++++++++++++++++++------------- 13 files changed, 456 insertions(+), 203 deletions(-) diff --git a/common/arg.cpp b/common/arg.cpp index 8531f0871d44a..84dc6841e3866 100644 --- a/common/arg.cpp +++ b/common/arg.cpp @@ -811,6 +811,13 @@ common_params_context common_params_parser_init(common_params & params, llama_ex params.flash_attn = true; } ).set_env("LLAMA_ARG_FLASH_ATTN")); + add_opt(common_arg( + {"-mla", "--mla-attn"}, + string_format("enable Multi-head Latent Attention (default: %s)", params.mla_attn ? "enabled" : "disabled"), + [](common_params & params) { + params.mla_attn = true; + } + ).set_env("LLAMA_ARG_MLA_ATTN")); add_opt(common_arg( {"-p", "--prompt"}, "PROMPT", "prompt to start generation with; for system message, use -sys", diff --git a/common/common.h b/common/common.h index 1c0f199774976..6d6f98b7b632b 100644 --- a/common/common.h +++ b/common/common.h @@ -325,6 +325,7 @@ struct common_params { bool simple_io = false; // improves compatibility with subprocesses and limited consoles bool cont_batching = true; // insert new sequences for decoding on-the-fly bool flash_attn = false; // flash attention + bool mla_attn = false; // mla attention bool no_perf = false; // disable performance metrics bool ctx_shift = true; // context shift on inifinite text generation diff --git a/convert_hf_to_gguf.py b/convert_hf_to_gguf.py index 6358a94e9b55f..dc566b006374a 100755 --- a/convert_hf_to_gguf.py +++ b/convert_hf_to_gguf.py @@ -4141,6 +4141,78 @@ def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iter else: return [] + n_embed = self.hparams.get("hidden_size", self.hparams.get("n_embed")) + n_head_kv = self.hparams["num_key_value_heads"] + qk_nope_head_dim = self.hparams["qk_nope_head_dim"] + qk_rope_head_dim = self.hparams["qk_rope_head_dim"] + v_head_dim = self.hparams["v_head_dim"] + kv_lora_rank = self.hparams["kv_lora_rank"] + + # (v2-lite) split q_proj into: q_proj and q_mqa_proj + if name.endswith("q_proj.weight"): + assert data_torch.shape[0] == n_head_kv * (qk_nope_head_dim + qk_rope_head_dim) + assert data_torch.shape[1] == n_embed + + q_proj_with_mqa = data_torch.view(n_head_kv, qk_nope_head_dim + qk_rope_head_dim, n_embed) + q_proj, q_mqa_proj = torch.split(q_proj_with_mqa, [qk_nope_head_dim, qk_rope_head_dim], dim = 1) + + q_proj = q_proj.reshape(n_head_kv * qk_nope_head_dim, n_embed) + q_mqa_proj = q_mqa_proj.reshape(n_head_kv * qk_rope_head_dim, n_embed) + + return [ + (self.map_tensor_name(name), q_proj), + (self.map_tensor_name(name.replace("q_proj", "q_mqa_proj")), q_mqa_proj) + ] + + # (v2/v3/r1) split q_b_proj into: q_b_proj and q_b_mqa_proj + if name.endswith("q_b_proj.weight"): + q_lora_rank = self.hparams["q_lora_rank"] + + assert data_torch.shape[0] == n_head_kv * (qk_nope_head_dim + qk_rope_head_dim) + assert data_torch.shape[1] == q_lora_rank + + q_b_proj_with_mqa = data_torch.view(n_head_kv, qk_nope_head_dim + qk_rope_head_dim, q_lora_rank) + q_b_proj, q_b_mqa_proj 
= torch.split(q_b_proj_with_mqa, [qk_nope_head_dim, qk_rope_head_dim], dim = 1) + + q_b_proj = q_b_proj.reshape(n_head_kv * qk_nope_head_dim, q_lora_rank) + q_b_mqa_proj = q_b_mqa_proj.reshape(n_head_kv * qk_rope_head_dim, q_lora_rank) + + return [ + (self.map_tensor_name(name), q_b_proj), + (self.map_tensor_name(name.replace("q_b_proj", "q_b_mqa_proj")), q_b_mqa_proj) + ] + + # split kv_a_proj_with_mqa into: kv_a_proj and k_mqa_proj + if name.endswith("kv_a_proj_with_mqa.weight"): + assert data_torch.shape[0] == kv_lora_rank + qk_rope_head_dim + assert data_torch.shape[1] == n_embed + + kv_a_proj_with_mqa = data_torch.view(kv_lora_rank + qk_rope_head_dim, n_embed) + kv_a_proj, k_mqa_proj = torch.split(kv_a_proj_with_mqa, [kv_lora_rank, qk_rope_head_dim], dim = 0) + + return [ + (self.map_tensor_name(name.replace("kv_a_proj_with_mqa", "kv_a_proj")), kv_a_proj), + (self.map_tensor_name(name.replace("kv_a_proj_with_mqa", "k_mqa_proj")), k_mqa_proj) + ] + + # split kv_b_proj into: k_b_proj, v_b_proj, and k_b_trans_proj (for deepseek-mla) + if name.endswith("kv_b_proj.weight"): + assert data_torch.shape[0] == n_head_kv * (v_head_dim + qk_nope_head_dim) + assert data_torch.shape[1] == kv_lora_rank + + kv_b_proj = data_torch.view(n_head_kv, v_head_dim + qk_nope_head_dim, kv_lora_rank) + k_b_proj, v_b_proj = torch.split(kv_b_proj, [qk_nope_head_dim, v_head_dim], dim = 1) + + k_b_trans_proj = k_b_proj.transpose(1, 2).reshape(n_head_kv * kv_lora_rank, qk_nope_head_dim) + k_b_proj = k_b_proj.reshape(n_head_kv * qk_nope_head_dim, kv_lora_rank) + v_b_proj = v_b_proj.reshape(n_head_kv * v_head_dim, kv_lora_rank) + + return [ + (self.map_tensor_name(name.replace("kv_b_proj", "k_b_trans_proj")), k_b_trans_proj), + (self.map_tensor_name(name.replace("kv_b_proj", "k_b_proj")), k_b_proj), + (self.map_tensor_name(name.replace("kv_b_proj", "v_b_proj")), v_b_proj) + ] + return [(self.map_tensor_name(name), data_torch)] def prepare_tensors(self): diff --git a/examples/server/README.md b/examples/server/README.md index a2a0903261e31..043c725d8d548 100644 --- a/examples/server/README.md +++ b/examples/server/README.md @@ -46,6 +46,7 @@ The project is under active development, and we are [looking for feedback and co | `-ub, --ubatch-size N` | physical maximum batch size (default: 512)
(env: LLAMA_ARG_UBATCH) | | `--keep N` | number of tokens to keep from the initial prompt (default: 0, -1 = all) | | `-fa, --flash-attn` | enable Flash Attention (default: disabled)
(env: LLAMA_ARG_FLASH_ATTN) | +| `-mla, --mla-attn` | enable Multi-head Latent Attention (default: disabled)
(env: LLAMA_ARG_MLA_ATTN) | | `--no-perf` | disable internal libllama performance timings (default: false)
(env: LLAMA_ARG_NO_PERF) | | `-e, --escape` | process escapes sequences (\n, \r, \t, \', \", \\) (default: true) | | `--no-escape` | do not process escape sequences | diff --git a/gguf-py/gguf/constants.py b/gguf-py/gguf/constants.py index ecac5b4bb7f59..758efa2f3ef16 100644 --- a/gguf-py/gguf/constants.py +++ b/gguf-py/gguf/constants.py @@ -356,6 +356,13 @@ class MODEL_TENSOR(IntEnum): ATTN_Q_B = auto() ATTN_KV_A_MQA = auto() ATTN_KV_B = auto() + ATTN_Q_MQA = auto() + ATTN_Q_B_MQA = auto() + ATTN_KV_A = auto() + ATTN_K_MQA = auto() + ATTN_K_B_TRANS = auto() + ATTN_K_B = auto() + ATTN_V_B = auto() ATTN_Q_A_NORM = auto() ATTN_KV_A_NORM = auto() FFN_SUB_NORM = auto() @@ -543,6 +550,13 @@ class MODEL_TENSOR(IntEnum): MODEL_TENSOR.ATTN_Q_B: "blk.{bid}.attn_q_b", MODEL_TENSOR.ATTN_KV_A_MQA: "blk.{bid}.attn_kv_a_mqa", MODEL_TENSOR.ATTN_KV_B: "blk.{bid}.attn_kv_b", + MODEL_TENSOR.ATTN_Q_MQA: "blk.{bid}.attn_q_mqa", + MODEL_TENSOR.ATTN_Q_B_MQA: "blk.{bid}.attn_q_b_mqa", + MODEL_TENSOR.ATTN_KV_A: "blk.{bid}.attn_kv_a", + MODEL_TENSOR.ATTN_K_MQA: "blk.{bid}.attn_k_mqa", + MODEL_TENSOR.ATTN_K_B_TRANS: "blk.{bid}.attn_k_b_trans", + MODEL_TENSOR.ATTN_K_B: "blk.{bid}.attn_k_b", + MODEL_TENSOR.ATTN_V_B: "blk.{bid}.attn_v_b", MODEL_TENSOR.ATTN_Q_A_NORM: "blk.{bid}.attn_q_a_norm", MODEL_TENSOR.ATTN_KV_A_NORM: "blk.{bid}.attn_kv_a_norm", MODEL_TENSOR.ATTN_SUB_NORM: "blk.{bid}.attn_sub_norm", @@ -1041,6 +1055,13 @@ class MODEL_TENSOR(IntEnum): MODEL_TENSOR.ATTN_Q_B, MODEL_TENSOR.ATTN_KV_A_MQA, MODEL_TENSOR.ATTN_KV_B, + MODEL_TENSOR.ATTN_Q_MQA, + MODEL_TENSOR.ATTN_Q_B_MQA, + MODEL_TENSOR.ATTN_KV_A, + MODEL_TENSOR.ATTN_K_MQA, + MODEL_TENSOR.ATTN_K_B_TRANS, + MODEL_TENSOR.ATTN_K_B, + MODEL_TENSOR.ATTN_V_B, MODEL_TENSOR.ATTN_Q_A_NORM, MODEL_TENSOR.ATTN_KV_A_NORM, MODEL_TENSOR.ATTN_OUT, diff --git a/gguf-py/gguf/tensor_mapping.py b/gguf-py/gguf/tensor_mapping.py index 617791e240b60..ae17da73af674 100644 --- a/gguf-py/gguf/tensor_mapping.py +++ b/gguf-py/gguf/tensor_mapping.py @@ -586,6 +586,34 @@ class TensorNameMap: "model.layers.{bid}.self_attn.kv_b_proj", # deepseek2 ), + MODEL_TENSOR.ATTN_Q_MQA: ( + "model.layers.{bid}.self_attn.q_mqa_proj", # deepseek2 (v2-lite) + ), + + MODEL_TENSOR.ATTN_Q_B_MQA: ( + "model.layers.{bid}.self_attn.q_b_mqa_proj", # deepseek2 (v2/v3/r1) + ), + + MODEL_TENSOR.ATTN_KV_A: ( + "model.layers.{bid}.self_attn.kv_a_proj", # deepseek2 + ), + + MODEL_TENSOR.ATTN_K_MQA: ( + "model.layers.{bid}.self_attn.k_mqa_proj", # deepseek2 + ), + + MODEL_TENSOR.ATTN_K_B_TRANS: ( + "model.layers.{bid}.self_attn.k_b_trans_proj", # deepseek2 (mla only) + ), + + MODEL_TENSOR.ATTN_K_B: ( + "model.layers.{bid}.self_attn.k_b_proj", # deepseek2 + ), + + MODEL_TENSOR.ATTN_V_B: ( + "model.layers.{bid}.self_attn.v_b_proj", # deepseek2 + ), + MODEL_TENSOR.ATTN_Q_A_NORM: ( "model.layers.{bid}.self_attn.q_a_layernorm", # deepseek2 ), diff --git a/src/llama-arch.cpp b/src/llama-arch.cpp index 97a1e7e5e01ef..cca3cad2c6cb8 100644 --- a/src/llama-arch.cpp +++ b/src/llama-arch.cpp @@ -997,6 +997,13 @@ static const std::map> LLM_TENSOR_N { LLM_TENSOR_ATTN_Q, "blk.%d.attn_q" }, { LLM_TENSOR_ATTN_Q_A, "blk.%d.attn_q_a" }, { LLM_TENSOR_ATTN_Q_B, "blk.%d.attn_q_b" }, + { LLM_TENSOR_ATTN_Q_MQA, "blk.%d.attn_q_mqa" }, + { LLM_TENSOR_ATTN_Q_B_MQA, "blk.%d.attn_q_b_mqa" }, + { LLM_TENSOR_ATTN_KV_A, "blk.%d.attn_kv_a" }, + { LLM_TENSOR_ATTN_K_MQA, "blk.%d.attn_k_mqa" }, + { LLM_TENSOR_ATTN_K_B_TRANS, "blk.%d.attn_k_b_trans" }, + { LLM_TENSOR_ATTN_K_B, "blk.%d.attn_k_b" }, + { LLM_TENSOR_ATTN_V_B, "blk.%d.attn_v_b" }, { 
LLM_TENSOR_ATTN_KV_A_MQA, "blk.%d.attn_kv_a_mqa" }, { LLM_TENSOR_ATTN_KV_B, "blk.%d.attn_kv_b" }, { LLM_TENSOR_ATTN_OUT, "blk.%d.attn_output" }, @@ -1333,23 +1340,13 @@ static const std::map LLM_TENSOR_INFOS = { {LLM_TENSOR_ATTN_Q_B, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL_MAT}}, {LLM_TENSOR_ATTN_KV_A_MQA, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL_MAT}}, {LLM_TENSOR_ATTN_KV_B, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL_MAT}}, - {LLM_TENSOR_DEC_ATTN_Q, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL_MAT}}, - {LLM_TENSOR_DEC_ATTN_K, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL_MAT}}, - {LLM_TENSOR_ATTN_Q, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL_MAT}}, - {LLM_TENSOR_ATTN_K, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL_MAT}}, - {LLM_TENSOR_ATTN_V, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL_MAT}}, - {LLM_TENSOR_ATTN_QKV, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL_MAT}}, - {LLM_TENSOR_ATTN_OUT, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL_MAT}}, - {LLM_TENSOR_FFN_GATE, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL_MAT}}, - {LLM_TENSOR_FFN_DOWN, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL_MAT}}, - {LLM_TENSOR_FFN_UP, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL_MAT}}, - {LLM_TENSOR_FFN_DOWN_SHEXP, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL_MAT}}, - {LLM_TENSOR_FFN_GATE_SHEXP, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL_MAT}}, - {LLM_TENSOR_FFN_UP_SHEXP, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL_MAT}}, - {LLM_TENSOR_ATTN_Q_A, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL_MAT}}, - {LLM_TENSOR_ATTN_Q_B, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL_MAT}}, - {LLM_TENSOR_ATTN_KV_A_MQA, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL_MAT}}, - {LLM_TENSOR_ATTN_KV_B, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL_MAT}}, + {LLM_TENSOR_ATTN_Q_MQA, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL_MAT}}, + {LLM_TENSOR_ATTN_Q_B_MQA, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL_MAT}}, + {LLM_TENSOR_ATTN_KV_A, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL_MAT}}, + {LLM_TENSOR_ATTN_K_MQA, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL_MAT}}, + {LLM_TENSOR_ATTN_K_B_TRANS, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL_MAT}}, + {LLM_TENSOR_ATTN_K_B, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL_MAT}}, + {LLM_TENSOR_ATTN_V_B, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL_MAT}}, {LLM_TENSOR_DEC_ATTN_Q, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL_MAT}}, {LLM_TENSOR_DEC_ATTN_K, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL_MAT}}, {LLM_TENSOR_DEC_ATTN_V, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL_MAT}}, diff --git a/src/llama-arch.h b/src/llama-arch.h index 122fdcebe0af6..cae591373c2de 100644 --- a/src/llama-arch.h +++ b/src/llama-arch.h @@ -277,6 +277,13 @@ enum llm_tensor { LLM_TENSOR_ATTN_Q_B, LLM_TENSOR_ATTN_KV_A_MQA, LLM_TENSOR_ATTN_KV_B, + LLM_TENSOR_ATTN_Q_MQA, + LLM_TENSOR_ATTN_Q_B_MQA, + LLM_TENSOR_ATTN_KV_A, + LLM_TENSOR_ATTN_K_MQA, + LLM_TENSOR_ATTN_K_B_TRANS, + LLM_TENSOR_ATTN_K_B, + LLM_TENSOR_ATTN_V_B, LLM_TENSOR_ATTN_Q_A_NORM, LLM_TENSOR_ATTN_KV_A_NORM, LLM_TENSOR_ATTN_SUB_NORM, diff --git a/src/llama-cparams.h b/src/llama-cparams.h index 252012f3d9405..6ebab857e236a 100644 --- a/src/llama-cparams.h +++ b/src/llama-cparams.h @@ -28,6 +28,7 @@ struct llama_cparams { bool causal_attn; bool offload_kqv; bool flash_attn; + bool mla_attn; bool no_perf; enum llama_pooling_type pooling_type; diff --git a/src/llama-kv-cache.cpp b/src/llama-kv-cache.cpp index feffdf0de52cf..384465fb68845 100644 --- a/src/llama-kv-cache.cpp +++ b/src/llama-kv-cache.cpp @@ -32,7 +32,7 @@ bool llama_kv_cache_init( cache.recurrent = llama_model_is_recurrent(&model); cache.v_trans = !cache.recurrent && !cparams.flash_attn; - cache.can_shift = 
!cache.recurrent && model.arch != LLM_ARCH_DEEPSEEK2; // not supported due to MLA + cache.can_shift = !cache.recurrent && model.arch != LLM_ARCH_DEEPSEEK2; // not supported due to MLA (or YaRN?) LLAMA_LOG_INFO("%s: kv_size = %d, offload = %d, type_k = '%s', type_v = '%s', n_layer = %d, can_shift = %d\n", __func__, kv_size, offload, ggml_type_name(type_k), ggml_type_name(type_v), n_layer, cache.can_shift); @@ -91,8 +91,21 @@ bool llama_kv_cache_init( return false; } - ggml_tensor * k = ggml_new_tensor_1d(ctx, type_k, n_embd_k_gqa*kv_size); - ggml_tensor * v = ggml_new_tensor_1d(ctx, type_v, n_embd_v_gqa*kv_size); + int64_t n_embd_k; + int64_t n_embd_v; + + // note: deepseek-mla stores the compressed versions + if (cparams.mla_attn && model.arch == LLM_ARCH_DEEPSEEK2) { + n_embd_k = hparams.n_lora_kv + hparams.n_rot; + n_embd_v = hparams.n_lora_kv; + } else { + n_embd_k = hparams.n_embd_k_gqa(i); + n_embd_v = hparams.n_embd_v_gqa(i); + } + + ggml_tensor * k = ggml_new_tensor_1d(ctx, type_k, n_embd_k*kv_size); + ggml_tensor * v = ggml_new_tensor_1d(ctx, type_v, n_embd_v*kv_size); + ggml_format_name(k, "cache_k_l%d", i); ggml_format_name(v, "cache_v_l%d", i); cache.k_l.push_back(k); diff --git a/src/llama-model.cpp b/src/llama-model.cpp index 1da4eae7e63e2..dc83718b968c6 100644 --- a/src/llama-model.cpp +++ b/src/llama-model.cpp @@ -2890,14 +2890,20 @@ bool llama_model::load_tensors(llama_model_loader & ml) { if (!is_lite) { layer.wq_a = create_tensor(tn(LLM_TENSOR_ATTN_Q_A, "weight", i), {n_embd, q_lora_rank}, 0); - layer.wq_b = create_tensor(tn(LLM_TENSOR_ATTN_Q_B, "weight", i), {q_lora_rank, n_head * n_embd_head_k}, 0); + layer.wq_b = create_tensor(tn(LLM_TENSOR_ATTN_Q_B, "weight", i), {q_lora_rank, n_head * n_embd_head_qk_nope}, 0); + layer.wq_b_mqa = create_tensor(tn(LLM_TENSOR_ATTN_Q_B_MQA, "weight", i), {q_lora_rank, n_head * n_embd_head_qk_rope}, 0); } else { - layer.wq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "weight", i), {n_embd, n_embd_k_gqa}, 0); + layer.wq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "weight", i), {n_embd, n_head * n_embd_head_qk_nope}, 0); + layer.wq_mqa = create_tensor(tn(LLM_TENSOR_ATTN_Q_MQA, "weight", i), {n_embd, n_head * n_embd_head_qk_rope}, 0); } - layer.wkv_a_mqa = create_tensor(tn(LLM_TENSOR_ATTN_KV_A_MQA, "weight", i), {n_embd, kv_lora_rank + (n_embd_head_qk_rope)}, 0); - layer.wkv_b = create_tensor(tn(LLM_TENSOR_ATTN_KV_B, "weight", i), {kv_lora_rank, n_head * (n_embd_head_qk_nope + n_embd_head_v)}, 0); - layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), { n_head * ( n_embd_head_v), n_embd}, 0); + layer.wkv_a = create_tensor(tn(LLM_TENSOR_ATTN_KV_A, "weight", i), {n_embd, kv_lora_rank}, 0); + layer.wk_mqa = create_tensor(tn(LLM_TENSOR_ATTN_K_MQA, "weight", i), {n_embd, n_embd_head_qk_rope}, 0); + layer.wk_b_trans = create_tensor(tn(LLM_TENSOR_ATTN_K_B_TRANS, "weight", i), {n_embd_head_qk_nope, n_head * kv_lora_rank}, 0); + layer.wk_b = create_tensor(tn(LLM_TENSOR_ATTN_K_B, "weight", i), {kv_lora_rank, n_head * n_embd_head_qk_nope}, 0); + layer.wv_b = create_tensor(tn(LLM_TENSOR_ATTN_V_B, "weight", i), {kv_lora_rank, n_head * n_embd_head_v}, 0); + + layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_head * n_embd_head_v, n_embd}, 0); layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0); diff --git a/src/llama-model.h b/src/llama-model.h index a7c30444786fd..1b9852402d7b5 100644 --- a/src/llama-model.h +++ b/src/llama-model.h @@ -152,23 +152,30 @@ struct llama_layer { struct ggml_tensor * attn_norm_enc = 
nullptr; // attention - struct ggml_tensor * wq = nullptr; - struct ggml_tensor * wk = nullptr; - struct ggml_tensor * wv = nullptr; - struct ggml_tensor * wo = nullptr; - struct ggml_tensor * wqkv = nullptr; - struct ggml_tensor * wq_a = nullptr; - struct ggml_tensor * wq_b = nullptr; - struct ggml_tensor * wkv_a_mqa = nullptr; - struct ggml_tensor * wkv_b = nullptr; - struct ggml_tensor * wq_cross = nullptr; - struct ggml_tensor * wk_cross = nullptr; - struct ggml_tensor * wv_cross = nullptr; - struct ggml_tensor * wo_cross = nullptr; - struct ggml_tensor * wq_enc = nullptr; - struct ggml_tensor * wk_enc = nullptr; - struct ggml_tensor * wv_enc = nullptr; - struct ggml_tensor * wo_enc = nullptr; + struct ggml_tensor * wq = nullptr; + struct ggml_tensor * wk = nullptr; + struct ggml_tensor * wv = nullptr; + struct ggml_tensor * wo = nullptr; + struct ggml_tensor * wqkv = nullptr; + struct ggml_tensor * wq_a = nullptr; + struct ggml_tensor * wq_b = nullptr; + struct ggml_tensor * wkv_a_mqa = nullptr; + struct ggml_tensor * wkv_b = nullptr; + struct ggml_tensor * wq_mqa = nullptr; + struct ggml_tensor * wq_b_mqa = nullptr; + struct ggml_tensor * wkv_a = nullptr; + struct ggml_tensor * wk_mqa = nullptr; + struct ggml_tensor * wk_b_trans = nullptr; + struct ggml_tensor * wk_b = nullptr; + struct ggml_tensor * wv_b = nullptr; + struct ggml_tensor * wq_cross = nullptr; + struct ggml_tensor * wk_cross = nullptr; + struct ggml_tensor * wv_cross = nullptr; + struct ggml_tensor * wo_cross = nullptr; + struct ggml_tensor * wq_enc = nullptr; + struct ggml_tensor * wk_enc = nullptr; + struct ggml_tensor * wv_enc = nullptr; + struct ggml_tensor * wo_enc = nullptr; // attention bias struct ggml_tensor * bq = nullptr; diff --git a/src/llama.cpp b/src/llama.cpp index 607f278615969..5be8f11410a47 100644 --- a/src/llama.cpp +++ b/src/llama.cpp @@ -156,8 +156,7 @@ static struct ggml_tensor * llm_build_inp_embd( static void llm_build_kv_store( struct ggml_context * ctx, - const llama_hparams & hparams, - const llama_cparams & cparams, + struct llama_context & lctx, const llama_kv_cache & kv, struct ggml_cgraph * graph, struct ggml_tensor * k_cur, @@ -166,28 +165,41 @@ static void llm_build_kv_store( int32_t kv_head, const llm_build_cb & cb, int64_t il) { - const int64_t n_ctx = cparams.n_ctx; + const llama_model & model = lctx.model; + const llama_hparams & hparams = lctx.model.hparams; + const llama_cparams & cparams = lctx.cparams; - const int64_t n_embd_k_gqa = hparams.n_embd_k_gqa(il); - const int64_t n_embd_v_gqa = hparams.n_embd_v_gqa(il); + const int64_t n_ctx = cparams.n_ctx; GGML_ASSERT(kv.size == n_ctx); - struct ggml_tensor * k_cache_view = ggml_view_1d(ctx, kv.k_l[il], n_tokens*n_embd_k_gqa, ggml_row_size(kv.k_l[il]->type, n_embd_k_gqa)*kv_head); + int64_t n_embd_k; + int64_t n_embd_v; + + // note: deepseek-mla stores the compressed versions + if (cparams.mla_attn && model.arch == LLM_ARCH_DEEPSEEK2) { + n_embd_k = hparams.n_lora_kv + hparams.n_rot; + n_embd_v = hparams.n_lora_kv; + } else { + n_embd_k = hparams.n_embd_k_gqa(il); + n_embd_v = hparams.n_embd_v_gqa(il); + } + + struct ggml_tensor * k_cache_view = ggml_view_1d(ctx, kv.k_l[il], n_tokens*n_embd_k, ggml_row_size(kv.k_l[il]->type, n_embd_k)*kv_head); cb(k_cache_view, "k_cache_view", il); // note: storing RoPE-ed version of K in the KV cache ggml_build_forward_expand(graph, ggml_cpy(ctx, k_cur, k_cache_view)); - assert(v_cur->ne[0] == n_embd_v_gqa && v_cur->ne[1] == n_tokens); + assert(v_cur->ne[0] == n_embd_v && v_cur->ne[1] == 
n_tokens); struct ggml_tensor * v_cache_view = nullptr; if (cparams.flash_attn) { - v_cache_view = ggml_view_1d(ctx, kv.v_l[il], n_tokens*n_embd_v_gqa, ggml_row_size(kv.v_l[il]->type, n_embd_v_gqa)*kv_head); + v_cache_view = ggml_view_1d(ctx, kv.v_l[il], n_tokens*n_embd_v, ggml_row_size(kv.v_l[il]->type, n_embd_v)*kv_head); } else { // note: the V cache is transposed when not using flash attention - v_cache_view = ggml_view_2d(ctx, kv.v_l[il], n_tokens, n_embd_v_gqa, + v_cache_view = ggml_view_2d(ctx, kv.v_l[il], n_tokens, n_embd_v, ( n_ctx)*ggml_element_size(kv.v_l[il]), (kv_head)*ggml_element_size(kv.v_l[il])); @@ -542,8 +554,9 @@ static struct ggml_tensor * llm_build_kqv( struct llama_context & lctx, const llama_kv_cache & kv, struct ggml_cgraph * graph, + struct ggml_tensor * wv_b, struct ggml_tensor * wo, - struct ggml_tensor * wo_b, + struct ggml_tensor * bo, struct ggml_tensor * q_cur, struct ggml_tensor * kq_mask, int32_t n_tokens, @@ -555,13 +568,33 @@ static struct ggml_tensor * llm_build_kqv( const llama_hparams & hparams = lctx.model.hparams; const llama_cparams & cparams = lctx.cparams; - const int64_t n_ctx = cparams.n_ctx; - const int64_t n_head = hparams.n_head(il); - const int64_t n_head_kv = hparams.n_head_kv(il); - const int64_t n_embd_head_k = hparams.n_embd_head_k; - const int64_t n_embd_k_gqa = hparams.n_embd_k_gqa(il); - const int64_t n_embd_head_v = hparams.n_embd_head_v; - const int64_t n_embd_v_gqa = hparams.n_embd_v_gqa(il); + const int64_t n_ctx = cparams.n_ctx; + const int64_t n_head = hparams.n_head(il); + + int64_t n_head_kv; + int64_t n_embd_k; + int64_t n_embd_head_k; + int64_t n_embd_v; + int64_t n_embd_head_v; + int64_t n_embd_head_v_final; + + // note: MLA caches compressed KV and acts as MQA until the final wv_b expansion + if (cparams.mla_attn && model.arch == LLM_ARCH_DEEPSEEK2) { + GGML_ASSERT(wv_b); + n_head_kv = 1; + n_embd_head_k = hparams.n_lora_kv + hparams.n_rot; + n_embd_k = n_embd_head_k; + n_embd_head_v = hparams.n_lora_kv; + n_embd_v = n_embd_head_v; + n_embd_head_v_final = hparams.n_embd_head_v; // after multiplying by wv_b + } else { + n_head_kv = hparams.n_head_kv(il); + n_embd_head_k = hparams.n_embd_head_k; + n_embd_k = hparams.n_embd_k_gqa(il); + n_embd_head_v = hparams.n_embd_head_v; + n_embd_v = hparams.n_embd_v_gqa(il); + n_embd_head_v_final = n_embd_head_v; + } struct ggml_tensor * q = ggml_permute(ctx, q_cur, 0, 2, 1, 3); cb(q, "q", il); @@ -569,7 +602,7 @@ static struct ggml_tensor * llm_build_kqv( struct ggml_tensor * k = ggml_view_3d(ctx, kv.k_l[il], n_embd_head_k, n_kv, n_head_kv, - ggml_row_size(kv.k_l[il]->type, n_embd_k_gqa), + ggml_row_size(kv.k_l[il]->type, n_embd_k), ggml_row_size(kv.k_l[il]->type, n_embd_head_k), 0); cb(k, "k", il); @@ -580,11 +613,14 @@ static struct ggml_tensor * llm_build_kqv( GGML_UNUSED(model); GGML_UNUSED(n_ctx); + // note: MLA creates embeddings too large for FA, see: https://github.com/ggml-org/llama.cpp/pull/12227 + GGML_ASSERT(!wv_b); + // split cached v into n_head heads (not transposed) struct ggml_tensor * v = ggml_view_3d(ctx, kv.v_l[il], n_embd_head_v, n_kv, n_head_kv, - ggml_row_size(kv.v_l[il]->type, n_embd_v_gqa), + ggml_row_size(kv.v_l[il]->type, n_embd_v), ggml_row_size(kv.v_l[il]->type, n_embd_head_v), 0); cb(v, "v", il); @@ -594,7 +630,7 @@ static struct ggml_tensor * llm_build_kqv( ggml_flash_attn_ext_set_prec(cur, GGML_PREC_F32); - cur = ggml_reshape_2d(ctx, cur, n_embd_head_v*n_head, n_tokens); + cur = ggml_reshape_2d(ctx, cur, n_embd_head_v_final*n_head, n_tokens); } else {
struct ggml_tensor * kq = ggml_mul_mat(ctx, k, q); cb(kq, "kq", il); @@ -637,10 +673,22 @@ static struct ggml_tensor * llm_build_kqv( struct ggml_tensor * kqv = ggml_mul_mat(ctx, v, kq); cb(kqv, "kqv", il); + // note: MLA needs to expand KQV from MQA into MHA + if (wv_b) { + struct ggml_tensor * wv_b_view = ggml_view_3d(ctx, wv_b, n_embd_head_v, n_embd_head_v_final, n_head, + ggml_row_size(model.layers[il].wv_b->type, n_embd_head_v), + ggml_row_size(model.layers[il].wv_b->type, n_embd_head_v * n_embd_head_v_final), + 0); + cb(wv_b_view, "wv_b_view", il); + + kqv = ggml_mul_mat(ctx, wv_b_view, kqv); + cb(kqv, "kqv_wv_b", il); + } + struct ggml_tensor * kqv_merged = ggml_permute(ctx, kqv, 0, 2, 1, 3); cb(kqv_merged, "kqv_merged", il); - cur = ggml_cont_2d(ctx, kqv_merged, n_embd_head_v*n_head, n_tokens); + cur = ggml_cont_2d(ctx, kqv_merged, n_embd_head_v_final*n_head, n_tokens); cb(cur, "kqv_merged_cont", il); } @@ -650,12 +698,12 @@ static struct ggml_tensor * llm_build_kqv( cur = llm_build_lora_mm(lctx, ctx, wo, cur); } - if (wo_b) { + if (bo) { cb(cur, "kqv_wo", il); } - if (wo_b) { - cur = ggml_add(ctx, cur, wo_b); + if (bo) { + cur = ggml_add(ctx, cur, bo); } return cur; @@ -666,8 +714,9 @@ static struct ggml_tensor * llm_build_kv( struct llama_context & lctx, const llama_kv_cache & kv, struct ggml_cgraph * graph, + struct ggml_tensor * wv_b, struct ggml_tensor * wo, - struct ggml_tensor * wo_b, + struct ggml_tensor * bo, struct ggml_tensor * k_cur, struct ggml_tensor * v_cur, struct ggml_tensor * q_cur, @@ -687,11 +736,11 @@ static struct ggml_tensor * llm_build_kv( ggml_build_forward_expand(graph, k_cur); ggml_build_forward_expand(graph, v_cur); - llm_build_kv_store(ctx, hparams, cparams, kv, graph, k_cur, v_cur, n_tokens, kv_head, cb, il); + llm_build_kv_store(ctx, lctx, kv, graph, k_cur, v_cur, n_tokens, kv_head, cb, il); struct ggml_tensor * cur; - cur = llm_build_kqv(ctx, lctx, kv, graph, wo, wo_b, q_cur, kq_mask, n_tokens, n_kv, kq_scale, cb, il); + cur = llm_build_kqv(ctx, lctx, kv, graph, wv_b, wo, bo, q_cur, kq_mask, n_tokens, n_kv, kq_scale, cb, il); cb(cur, "kqv_out", il); return cur; @@ -1546,7 +1595,7 @@ struct llm_build_context { cb(Kcur, "Kcur", il); cur = llm_build_kv(ctx0, lctx, kv_self, gf, - model.layers[il].wo, model.layers[il].bo, + nullptr, model.layers[il].wo, model.layers[il].bo, Kcur, Vcur, Qcur, KQ_mask, n_tokens, kv_head, n_kv, kq_scale, cb, il); } @@ -1723,7 +1772,7 @@ struct llm_build_context { cb(Kcur, "Kcur", il); cur = llm_build_kv(ctx0, lctx, kv_self, gf, - model.layers[il].wo, model.layers[il].bo, + nullptr, model.layers[il].wo, model.layers[il].bo, Kcur, Vcur, Qcur, KQ_mask, n_tokens, kv_head, n_kv, kq_scale, cb, il); } @@ -1861,7 +1910,7 @@ struct llm_build_context { cb(Kcur, "Kcur", il); cur = llm_build_kv(ctx0, lctx, kv_self, gf, - model.layers[il].wo, NULL, + nullptr, model.layers[il].wo, nullptr, Kcur, Vcur, Qcur, KQ_mask, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il); } @@ -1966,7 +2015,7 @@ struct llm_build_context { ); cb(Kcur, "Kcur", il); cur = llm_build_kv(ctx0, lctx, kv_self, gf, - model.layers[il].wo, NULL, + nullptr, model.layers[il].wo, nullptr, Kcur, Vcur, Qcur, KQ_mask, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il); } @@ -2087,7 +2136,7 @@ struct llm_build_context { cb(Kcur, "Kcur", il); cur = llm_build_kv(ctx0, lctx, kv_self, gf, - model.layers[il].wo, NULL, + nullptr, model.layers[il].wo, nullptr, Kcur, Vcur, Qcur, KQ_mask, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il); } @@ 
-2211,7 +2260,7 @@ struct llm_build_context { cb(Kcur, "Kcur", il); cur = llm_build_kv(ctx0, lctx, kv_self, gf, - model.layers[il].wo, model.layers[il].bo, + nullptr, model.layers[il].wo, model.layers[il].bo, Kcur, Vcur, Qcur, KQ_mask, n_tokens, kv_head, n_kv, 1.0f, cb, il); } @@ -2363,7 +2412,7 @@ struct llm_build_context { cb(Kcur, "Kcur", il); cur = llm_build_kv(ctx0, lctx, kv_self, gf, - model.layers[il].wo, NULL, + nullptr, model.layers[il].wo, nullptr, Kcur, Vcur, Qcur, KQ_mask, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il); } @@ -2475,7 +2524,7 @@ struct llm_build_context { Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens); cur = llm_build_kv(ctx0, lctx, kv_self, gf, - model.layers[il].wo, model.layers[il].bo, + nullptr, model.layers[il].wo, model.layers[il].bo, Kcur, Vcur, Qcur, KQ_mask, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il); } @@ -2569,7 +2618,7 @@ struct llm_build_context { cb(Qcur, "Qcur", il); cur = llm_build_kv(ctx0, lctx, kv_self, gf, - model.layers[il].wo, NULL, + nullptr, model.layers[il].wo, nullptr, Kcur, Vcur, Qcur, KQ_mask, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il); } @@ -2864,7 +2913,7 @@ struct llm_build_context { Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens); cur = llm_build_kv(ctx0, lctx, kv_self, gf, - model.layers[il].wo, model.layers[il].bo, + nullptr, model.layers[il].wo, model.layers[il].bo, Kcur, Vcur, Qcur, KQ_mask, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il); } @@ -2996,13 +3045,13 @@ struct llm_build_context { Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens); cur = llm_build_kv(ctx0, lctx, kv_self, gf, - model.layers[il].wo, model.layers[il].bo, + nullptr, model.layers[il].wo, model.layers[il].bo, Kcur, Vcur, Qcur, KQ_mask, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il); } else { Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens); cur = llm_build_kv(ctx0, lctx, kv_self, gf, - model.layers[il].wo, model.layers[il].bo, + nullptr, model.layers[il].wo, model.layers[il].bo, Kcur, Vcur, Qcur, KQ_mask, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il); } } @@ -3147,7 +3196,7 @@ struct llm_build_context { cb(Kcur, "Kcur", il); cur = llm_build_kv(ctx0, lctx, kv_self, gf, - model.layers[il].wo, NULL, + nullptr, model.layers[il].wo, nullptr, Kcur, Vcur, Qcur, KQ_mask, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il); } @@ -3266,7 +3315,7 @@ struct llm_build_context { cb(Kcur, "Kcur", il); cur = llm_build_kv(ctx0, lctx, kv_self, gf, - model.layers[il].wo, NULL, + nullptr, model.layers[il].wo, nullptr, Kcur, Vcur, Qcur, KQ_mask, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il); } @@ -3380,7 +3429,7 @@ struct llm_build_context { cb(Kcur, "Kcur", il); cur = llm_build_kv(ctx0, lctx, kv_self, gf, - model.layers[il].wo, model.layers[il].bo, + nullptr, model.layers[il].wo, model.layers[il].bo, Kcur, Vcur, Qcur, KQ_mask, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il); } @@ -3498,7 +3547,7 @@ struct llm_build_context { cb(Kcur, "Kcur", il); cur = llm_build_kv(ctx0, lctx, kv_self, gf, - model.layers[il].wo, model.layers[il].bo, + nullptr, model.layers[il].wo, model.layers[il].bo, Kcur, Vcur, Qcur, KQ_mask, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il); } @@ -3613,7 +3662,7 @@ struct llm_build_context { cb(Kcur, "Kcur", il); cur = llm_build_kv(ctx0, lctx, kv_self, gf, - model.layers[il].wo, model.layers[il].bo, + nullptr, 
model.layers[il].wo, model.layers[il].bo, Kcur, Vcur, Qcur, KQ_mask, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il); } @@ -3772,7 +3821,7 @@ struct llm_build_context { cb(Kcur, "Kcur", il); cur = llm_build_kv(ctx0, lctx, kv_self, gf, - model.layers[il].wo, model.layers[il].bo, + nullptr, model.layers[il].wo, model.layers[il].bo, Kcur, Vcur, Qcur, KQ_mask, n_tokens, kv_head, n_kv, 1.0f, cb, il); } @@ -3897,7 +3946,7 @@ struct llm_build_context { cb(Kcur, "Kcur", il); cur = llm_build_kv(ctx0, lctx, kv_self, gf, - model.layers[il].wo, model.layers[il].bo, + nullptr, model.layers[il].wo, model.layers[il].bo, Kcur, Vcur, Qcur, KQ_mask, n_tokens, kv_head, n_kv, 1.0f, cb, il); } @@ -4022,7 +4071,7 @@ struct llm_build_context { cb(Kcur, "Kcur", il); cur = llm_build_kv(ctx0, lctx, kv_self, gf, - model.layers[il].wo, NULL, + nullptr, model.layers[il].wo, nullptr, Kcur, Vcur, Qcur, KQ_mask, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il); } struct ggml_tensor * sa_out = cur; @@ -4124,7 +4173,7 @@ struct llm_build_context { Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens); cur = llm_build_kv(ctx0, lctx, kv_self, gf, - model.layers[il].wo, model.layers[il].bo, + nullptr, model.layers[il].wo, model.layers[il].bo, Kcur, Vcur, Qcur, KQ_mask, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il); } @@ -4235,7 +4284,7 @@ struct llm_build_context { cb(Kcur, "Kcur", il); cur = llm_build_kv(ctx0, lctx, kv_self, gf, - model.layers[il].wo, model.layers[il].bo, + nullptr, model.layers[il].wo, model.layers[il].bo, Kcur, Vcur, Qcur, KQ_mask, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il); } @@ -4355,7 +4404,7 @@ struct llm_build_context { cb(Kcur, "Kcur", il); cur = llm_build_kv(ctx0, lctx, kv_self, gf, - model.layers[il].wo, NULL, + nullptr, model.layers[il].wo, nullptr, Kcur, Vcur, Qcur, KQ_mask, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il); } @@ -4473,7 +4522,7 @@ struct llm_build_context { cb(Kcur, "Kcur", il); cur = llm_build_kv(ctx0, lctx, kv_self, gf, - model.layers[il].wo, model.layers[il].bo, + nullptr, model.layers[il].wo, model.layers[il].bo, Kcur, Vcur, Qcur, KQ_mask, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il); } @@ -4667,7 +4716,7 @@ struct llm_build_context { cb(k_states, "k_states", il); cur = llm_build_kv(ctx0, lctx, kv_self, gf, - model.layers[il].wo, NULL, + nullptr, model.layers[il].wo, nullptr, k_states, v_states, q_states, KQ_mask, n_tokens, kv_head, n_kv, kq_scale, cb, il); } @@ -4789,7 +4838,7 @@ struct llm_build_context { cb(Kcur, "Kcur", il); cur = llm_build_kv(ctx0, lctx, kv_self, gf, - model.layers[il].wo, NULL, + nullptr, model.layers[il].wo, nullptr, Kcur, Vcur, Qcur, KQ_mask, n_tokens, kv_head, n_kv, 1.0f, cb, il); } @@ -4908,7 +4957,7 @@ struct llm_build_context { cb(Kcur, "Kcur", il); cur = llm_build_kv(ctx0, lctx, kv_self, gf, - model.layers[il].wo, NULL, + nullptr, model.layers[il].wo, nullptr, Kcur, Vcur, Qcur, KQ_mask_l, n_tokens, kv_head, n_kv, 1.0f, cb, il); } @@ -5045,7 +5094,7 @@ struct llm_build_context { cb(Kcur, "Kcur", il); cur = llm_build_kv(ctx0, lctx, kv_self, gf, - model.layers[il].wo, model.layers[il].bo, + nullptr, model.layers[il].wo, model.layers[il].bo, Kcur, Vcur, Qcur, KQ_mask, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il); } @@ -5244,7 +5293,7 @@ struct llm_build_context { cb(Kcur, "Kcur", il); cur = llm_build_kv(ctx0, lctx, kv_self, gf, - model.layers[il].wo, model.layers[il].bo, + nullptr, model.layers[il].wo, 
model.layers[il].bo, Kcur, Vcur, Qcur, KQ_mask, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il); } @@ -5381,8 +5430,9 @@ struct llm_build_context { cb(Kcur, "Kcur", il); } - cur = llm_build_kv(ctx0, lctx, kv_self, gf, model.layers[il].wo, model.layers[il].bo, Kcur, Vcur, Qcur, - KQ_mask_l, n_tokens, kv_head, n_kv, 1.0f / sqrtf(float(n_embd_head)), cb, il); + cur = llm_build_kv(ctx0, lctx, kv_self, gf, + nullptr, model.layers[il].wo, model.layers[il].bo, + Kcur, Vcur, Qcur, KQ_mask_l, n_tokens, kv_head, n_kv, 1.0f / sqrtf(float(n_embd_head)), cb, il); } if (il == n_layer - 1) { @@ -5507,7 +5557,7 @@ struct llm_build_context { cb(Kcur, "Kcur", il); cur = llm_build_kv(ctx0, lctx, kv_self, gf, - model.layers[il].wo, nullptr, + nullptr, model.layers[il].wo, nullptr, Kcur, Vcur, Qcur, KQ_mask, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il); } @@ -5626,7 +5676,7 @@ struct llm_build_context { cb(Kcur, "Kcur_rope", il); cur = llm_build_kv(ctx0, lctx, kv_self, gf, - model.layers[il].wo, NULL, + nullptr, model.layers[il].wo, nullptr, Kcur, Vcur, Qcur, KQ_mask, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il); } @@ -5758,7 +5808,7 @@ struct llm_build_context { cb(Kcur, "Kcur_rope", il); cur = llm_build_kv(ctx0, lctx, kv_self, gf, - model.layers[il].wo, NULL, + nullptr, model.layers[il].wo, nullptr, Kcur, Vcur, Qcur, KQ_mask, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il); } @@ -5888,7 +5938,7 @@ struct llm_build_context { cb(Qcur, "Vcur", il); cur = llm_build_kv(ctx0, lctx, kv_self, gf, - model.layers[il].wo, NULL, + nullptr, model.layers[il].wo, nullptr, Kcur, Vcur, Qcur, KQ_mask, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il); } @@ -5997,7 +6047,7 @@ struct llm_build_context { cb(Kcur, "Kcur", il); cur = llm_build_kv(ctx0, lctx, kv_self, gf, - model.layers[il].wo, model.layers[il].bo, + nullptr, model.layers[il].wo, model.layers[il].bo, Kcur, Vcur, Qcur, KQ_mask, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il); } @@ -6140,7 +6190,7 @@ struct llm_build_context { cb(Kcur, "Kcur", il); cur = llm_build_kv(ctx0, lctx, kv_self, gf, - model.layers[il].wo, NULL, + nullptr, model.layers[il].wo, nullptr, Kcur, Vcur, Qcur, KQ_mask, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il); } @@ -6289,7 +6339,7 @@ struct llm_build_context { cb(Kcur, "Kcur", il); cur = llm_build_kv(ctx0, lctx, kv_self, gf, - model.layers[il].wo, model.layers[il].bo, + nullptr, model.layers[il].wo, model.layers[il].bo, Kcur, Vcur, Qcur, KQ_mask, n_tokens, kv_head, n_kv, kq_scale, cb, il); } @@ -6388,8 +6438,8 @@ struct llm_build_context { const float kq_scale = 1.0f*mscale*mscale/sqrtf(float(hparams.n_embd_head_k)); const float attn_factor_scaled = 1.0f / (1.0f + 0.1f * logf(1.0f / freq_scale)); - const uint32_t n_embd_head_qk_rope = hparams.n_rot; const uint32_t n_embd_head_qk_nope = hparams.n_embd_head_k - hparams.n_rot; + const uint32_t n_embd_head_qk_rope = hparams.n_rot; const uint32_t kv_lora_rank = hparams.n_lora_kv; struct ggml_tensor * cur; @@ -6407,7 +6457,6 @@ struct llm_build_context { for (int il = 0; il < n_layer; ++il) { struct ggml_tensor * inpSA = inpL; - // norm cur = llm_build_norm(ctx0, inpL, hparams, model.layers[il].attn_norm, NULL, LLM_NORM_RMS, cb, il); @@ -6415,115 +6464,158 @@ struct llm_build_context { // self_attention { - struct ggml_tensor * q = NULL; + struct ggml_tensor * q_nope; + struct ggml_tensor * q_mqa; if (!is_lite) { // {n_embd, q_lora_rank} * {n_embd, n_tokens} -> {q_lora_rank, 
n_tokens} - q = ggml_mul_mat(ctx0, model.layers[il].wq_a, cur); - cb(q, "q", il); + struct ggml_tensor * q_compressed = ggml_mul_mat(ctx0, model.layers[il].wq_a, cur); + cb(q_compressed, "q_compressed", il); - q = llm_build_norm(ctx0, q, hparams, + q_compressed = llm_build_norm(ctx0, q_compressed, hparams, model.layers[il].attn_q_a_norm, NULL, LLM_NORM_RMS, cb, il); - cb(q, "q", il); + cb(q_compressed, "q_compressed_norm", il); + + // {q_lora_rank, n_head * n_embd_head_qk_nope} * {q_lora_rank, n_tokens} -> {n_head * n_embd_head_qk_nope, n_tokens} + q_nope = ggml_mul_mat(ctx0, model.layers[il].wq_b, q_compressed); + cb(q_nope, "q_nope", il); - // {q_lora_rank, n_head * hparams.n_embd_head_k} * {q_lora_rank, n_tokens} -> {n_head * hparams.n_embd_head_k, n_tokens} - q = ggml_mul_mat(ctx0, model.layers[il].wq_b, q); - cb(q, "q", il); + // {q_lora_rank, n_head * n_embd_head_qk_rope} * {q_lora_rank, n_tokens} -> {n_head * n_embd_head_qk_rope, n_tokens} + q_mqa = ggml_mul_mat(ctx0, model.layers[il].wq_b_mqa, q_compressed); + cb(q_mqa, "q_mqa", il); } else { - q = ggml_mul_mat(ctx0, model.layers[il].wq, cur); - cb(q, "q", il); + // {n_embd, n_head * n_embd_head_qk_nope} * {n_embd, n_tokens} -> {n_head * n_embd_head_qk_nope, n_tokens} + q_nope = ggml_mul_mat(ctx0, model.layers[il].wq, cur); + cb(q_nope, "q_nope", il); + + // {n_embd, n_head * n_embd_head_qk_rope} * {n_embd, n_tokens} -> {n_head * n_embd_head_qk_rope, n_tokens} + q_mqa = ggml_mul_mat(ctx0, model.layers[il].wq_mqa, cur); + cb(q_mqa, "q_mqa", il); } - // split into {n_head * n_embd_head_qk_nope, n_tokens} - struct ggml_tensor * q_nope = ggml_view_3d(ctx0, q, n_embd_head_qk_nope, n_head, n_tokens, - ggml_row_size(q->type, hparams.n_embd_head_k), - ggml_row_size(q->type, hparams.n_embd_head_k * n_head), + // {n_embd_head_qk_nope, n_head, n_tokens} + struct ggml_tensor * q_nope_view = ggml_view_3d(ctx0, q_nope, n_embd_head_qk_nope, n_head, n_tokens, + ggml_row_size(q_nope->type, n_embd_head_qk_nope), + ggml_row_size(q_nope->type, n_head * n_embd_head_qk_nope), 0); - cb(q_nope, "q_nope", il); - - // and {n_head * n_embd_head_qk_rope, n_tokens} - struct ggml_tensor * q_pe = ggml_view_3d(ctx0, q, n_embd_head_qk_rope, n_head, n_tokens, - ggml_row_size(q->type, hparams.n_embd_head_k), - ggml_row_size(q->type, hparams.n_embd_head_k * n_head), - ggml_row_size(q->type, n_embd_head_qk_nope)); - cb(q_pe, "q_pe", il); - - // {n_embd, kv_lora_rank + n_embd_head_qk_rope} * {n_embd, n_tokens} -> {kv_lora_rank + n_embd_head_qk_rope, n_tokens} - struct ggml_tensor * kv_pe_compresseed = ggml_mul_mat(ctx0, model.layers[il].wkv_a_mqa, cur); - cb(kv_pe_compresseed, "kv_pe_compresseed", il); + cb(q_nope_view, "q_nope_view", il); - // split into {kv_lora_rank, n_tokens} - struct ggml_tensor * kv_compressed = ggml_view_2d(ctx0, kv_pe_compresseed, kv_lora_rank, n_tokens, - kv_pe_compresseed->nb[1], + // {n_embd_head_qk_rope, n_head, n_tokens} + struct ggml_tensor * q_mqa_view = ggml_view_3d(ctx0, q_mqa, n_embd_head_qk_rope, n_head, n_tokens, + ggml_row_size(q_mqa->type, n_embd_head_qk_rope), + ggml_row_size(q_mqa->type, n_head * n_embd_head_qk_rope), 0); - cb(kv_compressed, "kv_compressed", il); + cb(q_mqa_view, "q_mqa_view", il); - // and {n_embd_head_qk_rope, n_tokens} - struct ggml_tensor * k_pe = ggml_view_3d(ctx0, kv_pe_compresseed, n_embd_head_qk_rope, 1, n_tokens, - kv_pe_compresseed->nb[1], - kv_pe_compresseed->nb[1], - ggml_row_size(kv_pe_compresseed->type, kv_lora_rank)); - cb(k_pe, "k_pe", il); - - // TODO: the CUDA backend used to not support 
non-cont. (RMS) norm, investigate removing ggml_cont - kv_compressed = ggml_cont(ctx0, kv_compressed); - kv_compressed = llm_build_norm(ctx0, kv_compressed, hparams, - model.layers[il].attn_kv_a_norm, NULL, - LLM_NORM_RMS, cb, il); - cb(kv_compressed, "kv_compressed", il); + q_mqa_view = ggml_rope_ext( + ctx0, q_mqa_view, inp_pos, nullptr, + n_rot, rope_type, n_ctx_orig, freq_base, freq_scale, + ext_factor, attn_factor_scaled, beta_fast, beta_slow + ); + cb(q_mqa_view, "q_mqa_view_rope", il); + + // {n_embd, kv_lora_rank} * {n_embd, n_tokens} -> {kv_lora_rank, n_tokens} + struct ggml_tensor * kv_compressed = ggml_mul_mat(ctx0, model.layers[il].wkv_a, cur); + cb(kv_compressed, "kv_compressed", il); + + kv_compressed = llm_build_norm(ctx0, kv_compressed, hparams, + model.layers[il].attn_kv_a_norm, NULL, + LLM_NORM_RMS, cb, il); + cb(kv_compressed, "kv_compressed_norm", il); + + // {n_embd, n_embd_head_qk_rope} * {n_embd, n_tokens} -> {n_embd_head_qk_rope, n_tokens} + struct ggml_tensor * k_mqa = ggml_mul_mat(ctx0, model.layers[il].wk_mqa, cur); + cb(k_mqa, "k_mqa", il); + + // {n_embd_head_qk_rope, 1, n_tokens} + struct ggml_tensor * k_mqa_view = ggml_view_3d(ctx0, k_mqa, n_embd_head_qk_rope, 1, n_tokens, + ggml_row_size(k_mqa->type, n_embd_head_qk_rope), + ggml_row_size(k_mqa->type, n_embd_head_qk_rope), + 0); + cb(k_mqa_view, "k_mqa_view", il); + + k_mqa_view = ggml_rope_ext( + ctx0, k_mqa_view, inp_pos, nullptr, + n_rot, rope_type, n_ctx_orig, freq_base, freq_scale, + ext_factor, attn_factor_scaled, beta_fast, beta_slow + ); + cb(k_mqa_view, "k_mqa_view_rope", il); + + // non-MLA + if (!cparams.mla_attn) { + // {kv_lora_rank, n_head * n_embd_head_qk_nope} * {kv_lora_rank, n_tokens} -> {n_head * n_embd_head_qk_nope, n_tokens} + struct ggml_tensor * k_nope = ggml_mul_mat(ctx0, model.layers[il].wk_b, kv_compressed); + cb(k_nope, "k_nope", il); + + // {n_embd_head_qk_nope, n_head, n_tokens} + struct ggml_tensor * k_nope_view = ggml_view_3d(ctx0, k_nope, n_embd_head_qk_nope, n_head, n_tokens, + ggml_row_size(k_nope->type, n_embd_head_qk_nope), + ggml_row_size(k_nope->type, n_head * n_embd_head_qk_nope), + 0); + cb(k_nope_view, "k_nope_view", il); + + // {n_embd_head_qk_nope + n_embd_head_qk_rope, n_head, n_tokens} + struct ggml_tensor * q_states = ggml_concat(ctx0, q_nope_view, q_mqa_view, 0); + cb(q_states, "q_states", il); + + // {n_embd_head_qk_nope + n_embd_head_qk_rope, n_head, n_tokens} + struct ggml_tensor * k_states = ggml_concat(ctx0, k_nope_view, ggml_repeat(ctx0, k_mqa_view, q_mqa_view), 0); + cb(k_states, "k_states", il); + + // {kv_lora_rank, n_head * n_embd_head_v} * {kv_lora_rank, n_tokens} -> {n_head * n_embd_head_v, n_tokens} + struct ggml_tensor * v_states = ggml_mul_mat(ctx0, model.layers[il].wv_b, kv_compressed); + cb(v_states, "v_states", il); + + cur = llm_build_kv(ctx0, lctx, kv_self, gf, + nullptr, model.layers[il].wo, nullptr, + k_states, v_states, q_states, KQ_mask, n_tokens, kv_head, n_kv, kq_scale, cb, il); + } + else { + // {n_embd_head_qk_nope, kv_lora_rank, n_head} + struct ggml_tensor * wk_b_trans_view = ggml_view_3d(ctx0, model.layers[il].wk_b_trans, + n_embd_head_qk_nope, kv_lora_rank, n_head, + ggml_row_size(model.layers[il].wk_b->type, n_embd_head_qk_nope), + ggml_row_size(model.layers[il].wk_b->type, kv_lora_rank * n_embd_head_qk_nope), + 0); + cb(wk_b_trans_view, "wk_b_trans_view", il); - // {kv_lora_rank, n_head * (n_embd_head_qk_nope + n_embd_head_v)} * {kv_lora_rank, n_tokens} -> {n_head * (n_embd_head_qk_nope + n_embd_head_v), n_tokens} - struct 
ggml_tensor * kv = ggml_mul_mat(ctx0, model.layers[il].wkv_b, kv_compressed); - cb(kv, "kv", il); + // {n_embd_head_qk_nope, n_tokens, n_head} + q_nope_view = ggml_permute(ctx0, q_nope_view, 0, 2, 1, 3); + cb(q_nope_view, "q_nope_view_perm", il); - // split into {n_head * n_embd_head_qk_nope, n_tokens} - struct ggml_tensor * k_nope = ggml_view_3d(ctx0, kv, n_embd_head_qk_nope, n_head, n_tokens, - ggml_row_size(kv->type, n_embd_head_qk_nope + hparams.n_embd_head_v), - ggml_row_size(kv->type, n_head * (n_embd_head_qk_nope + hparams.n_embd_head_v)), - 0); - cb(k_nope, "k_nope", il); + // {n_embd_head_qk_nope, kv_lora_rank, n_head} * {n_embd_head_qk_nope, n_tokens, n_head} = {kv_lora_rank, n_tokens, n_head} + struct ggml_tensor * q_nope_absorbed = ggml_mul_mat(ctx0, wk_b_trans_view, q_nope_view); + cb(q_nope_absorbed, "q_nope_absorbed", il); - // and {n_head * n_embd_head_v, n_tokens} - struct ggml_tensor * v_states = ggml_view_3d(ctx0, kv, hparams.n_embd_head_v, n_head, n_tokens, - ggml_row_size(kv->type, (n_embd_head_qk_nope + hparams.n_embd_head_v)), - ggml_row_size(kv->type, (n_embd_head_qk_nope + hparams.n_embd_head_v)*n_head), - ggml_row_size(kv->type, (n_embd_head_qk_nope))); - cb(v_states, "v_states", il); + // {kv_lora_rank, n_head, n_tokens} + q_nope_absorbed = ggml_permute(ctx0, q_nope_absorbed, 0, 2, 1, 3); + cb(q_nope_absorbed, "q_nope_absorbed_perm", il); - v_states = ggml_cont(ctx0, v_states); - cb(v_states, "v_states", il); + // {kv_lora_rank + n_embd_head_qk_rope, n_head, n_tokens} + struct ggml_tensor * q_states = ggml_concat(ctx0, q_nope_absorbed, q_mqa_view, 0); + cb(q_states, "q_states", il); - v_states = ggml_view_2d(ctx0, v_states, hparams.n_embd_head_v * n_head, n_tokens, - ggml_row_size(kv->type, hparams.n_embd_head_v * n_head), - 0); - cb(v_states, "v_states", il); - - q_pe = ggml_cont(ctx0, q_pe); // TODO: the CUDA backend used to not support non-cont. RoPE, investigate removing this - q_pe = ggml_rope_ext( - ctx0, q_pe, inp_pos, nullptr, - n_rot, rope_type, n_ctx_orig, freq_base, freq_scale, - ext_factor, attn_factor_scaled, beta_fast, beta_slow - ); - cb(q_pe, "q_pe", il); + // {kv_lora_rank, 1, n_tokens} + struct ggml_tensor * kv_compressed_view = ggml_view_3d(ctx0, kv_compressed, + kv_lora_rank, 1, n_tokens, + ggml_row_size(k_mqa->type, kv_lora_rank), + ggml_row_size(k_mqa->type, kv_lora_rank), + 0); + cb(kv_compressed_view, "kv_compressed_view", il); - // shared RoPE key - k_pe = ggml_cont(ctx0, k_pe); // TODO: the CUDA backend used to not support non-cont.
RoPE, investigate removing this - k_pe = ggml_rope_ext( - ctx0, k_pe, inp_pos, nullptr, - n_rot, rope_type, n_ctx_orig, freq_base, freq_scale, - ext_factor, attn_factor_scaled, beta_fast, beta_slow - ); - cb(k_pe, "k_pe", il); + // {kv_lora_rank + n_embd_head_qk_rope, 1, n_tokens} + struct ggml_tensor * k_states = ggml_concat(ctx0, kv_compressed_view, k_mqa_view, 0); + cb(k_states, "k_states", il); - struct ggml_tensor * q_states = ggml_concat(ctx0, q_nope, q_pe, 0); - cb(q_states, "q_states", il); + // {kv_lora_rank, 1, n_tokens} + struct ggml_tensor * v_states = kv_compressed; + cb(v_states, "v_states", il); - struct ggml_tensor * k_states = ggml_concat(ctx0, k_nope, ggml_repeat(ctx0, k_pe, q_pe), 0); - cb(k_states, "k_states", il); + cur = llm_build_kv(ctx0, lctx, kv_self, gf, + model.layers[il].wv_b, model.layers[il].wo, nullptr, + k_states, v_states, q_states, KQ_mask, n_tokens, kv_head, n_kv, kq_scale, cb, il); + } - cur = llm_build_kv(ctx0, lctx, kv_self, gf, - model.layers[il].wo, NULL, - k_states, v_states, q_states, KQ_mask, n_tokens, kv_head, n_kv, kq_scale, cb, il); } if (il == n_layer - 1) { @@ -6680,7 +6772,7 @@ struct llm_build_context { cb(Kcur, "Kcur", il); cur = llm_build_kv(ctx0, lctx, kv_self, gf, - NULL, NULL, + nullptr, nullptr, nullptr, Kcur, Vcur, Qcur, KQ_mask, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il); cur = llm_build_norm(ctx0, cur, hparams, @@ -6932,7 +7024,7 @@ struct llm_build_context { struct ggml_tensor * Vcur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wv, cur); cb(Vcur, "Vcur", il); - llm_build_kv_store(ctx0, hparams, cparams, kv_self, gf, Kcur, Vcur, n_tokens, kv_head, cb, il); + llm_build_kv_store(ctx0, lctx, kv_self, gf, Kcur, Vcur, n_tokens, kv_head, cb, il); struct ggml_tensor * k = ggml_view_3d(ctx0, kv_self.k_l[il], @@ -7134,7 +7226,7 @@ struct llm_build_context { Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens); cur = llm_build_kv(ctx0, lctx, kv_self, gf, - model.layers[il].wo, model.layers[il].bo, + nullptr, model.layers[il].wo, model.layers[il].bo, Kcur, Vcur, Qcur, KQ_mask, n_tokens, kv_head, n_kv, 1.0f/float(n_embd_head), cb, il); } @@ -7260,7 +7352,7 @@ struct llm_build_context { cb(Kcur, "Kcur_rope", il); cur = llm_build_kv(ctx0, lctx, kv_self, gf, - model.layers[il].wo, NULL, + nullptr, model.layers[il].wo, nullptr, Kcur, Vcur, Qcur, KQ_mask, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il); } @@ -7379,7 +7471,7 @@ struct llm_build_context { cb(Kcur, "Kcur", il); cur = llm_build_kv(ctx0, lctx, kv_self, gf, - model.layers[il].wo, model.layers[il].bo, + nullptr, model.layers[il].wo, model.layers[il].bo, Kcur, Vcur, Qcur, KQ_mask, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il); } @@ -7505,7 +7597,7 @@ struct llm_build_context { cb(Kcur, "Kcur", il); cur = llm_build_kv(ctx0, lctx, kv_self, gf, - model.layers[il].wo, model.layers[il].bo, + nullptr, model.layers[il].wo, model.layers[il].bo, Kcur, Vcur, Qcur, KQ_mask, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il); } @@ -7876,7 +7968,7 @@ struct llm_build_context { cb(Kcur, "Kcur", il); cur = llm_build_kv(ctx0, lctx, kv_self, gf, - model.layers[il].wo, nullptr, + nullptr, model.layers[il].wo, nullptr, Kcur, Vcur, Qcur, KQ_mask, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il); if (hparams.swin_norm) { From 9c0eb4fdaad0959945a7a619f24c9ed5e02402d3 Mon Sep 17 00:00:00 2001 From: juk Date: Mon, 10 Mar 2025 15:49:40 +0000 Subject: [PATCH 02/11] Added missing MLA flags --- common/common.cpp | 
1 + common/common.h | 2 +- include/llama.h | 1 + 3 files changed, 3 insertions(+), 1 deletion(-) diff --git a/common/common.cpp b/common/common.cpp index 6448b7b03d6d2..476e1e3764991 100644 --- a/common/common.cpp +++ b/common/common.cpp @@ -1132,6 +1132,7 @@ struct llama_context_params common_context_params_to_llama(const common_params & cparams.cb_eval_user_data = params.cb_eval_user_data; cparams.offload_kqv = !params.no_kv_offload; cparams.flash_attn = params.flash_attn; + cparams.mla_attn_attn = params.mla_attn_attn; cparams.no_perf = params.no_perf; if (params.reranking) { diff --git a/common/common.h b/common/common.h index 6d6f98b7b632b..207732a9957a8 100644 --- a/common/common.h +++ b/common/common.h @@ -325,7 +325,7 @@ struct common_params { bool simple_io = false; // improves compatibility with subprocesses and limited consoles bool cont_batching = true; // insert new sequences for decoding on-the-fly bool flash_attn = false; // flash attention - bool mla_attn = false; // mla attention + bool mla_attn = false; // MLA attention for deepseek2 bool no_perf = false; // disable performance metrics bool ctx_shift = true; // context shift on inifinite text generation diff --git a/include/llama.h b/include/llama.h index d62792c0a6760..be6dfc5d87154 100644 --- a/include/llama.h +++ b/include/llama.h @@ -343,6 +343,7 @@ extern "C" { bool embeddings; // if true, extract embeddings (together with logits) bool offload_kqv; // whether to offload the KQV ops (including the KV cache) to GPU bool flash_attn; // whether to use flash attention [EXPERIMENTAL] + bool mla_attn; // MLA attention for deepseek2 bool no_perf; // whether to measure performance timings // Abort callback From eefa5bb4e7c4abf115d3e3adc056bbc178f64338 Mon Sep 17 00:00:00 2001 From: juk Date: Mon, 10 Mar 2025 15:54:52 +0000 Subject: [PATCH 03/11] Added missing default to llama_context_default_params() --- src/llama.cpp | 2 ++ 1 file changed, 2 insertions(+) diff --git a/src/llama.cpp b/src/llama.cpp index 5be8f11410a47..368a954f48b86 100644 --- a/src/llama.cpp +++ b/src/llama.cpp @@ -9450,6 +9450,7 @@ struct llama_context_params llama_context_default_params() { /*.embeddings =*/ false, /*.offload_kqv =*/ true, /*.flash_attn =*/ false, + /*.mla_attn =*/ false, /*.no_perf =*/ true, /*.abort_callback =*/ nullptr, /*.abort_callback_data =*/ nullptr, @@ -9686,6 +9687,7 @@ struct llama_context * llama_init_from_model( cparams.embeddings = params.embeddings; cparams.offload_kqv = params.offload_kqv; cparams.flash_attn = params.flash_attn; + cparams.mla_attn = params.mla_attn; cparams.no_perf = params.no_perf; cparams.pooling_type = params.pooling_type; From 5636696808d3a8eac66d3ab04118fd1bc06617d2 Mon Sep 17 00:00:00 2001 From: juk Date: Mon, 10 Mar 2025 15:59:36 +0000 Subject: [PATCH 04/11] Fixed typo in mla_attn name --- common/common.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/common/common.cpp b/common/common.cpp index 476e1e3764991..b3438f2646bdf 100644 --- a/common/common.cpp +++ b/common/common.cpp @@ -1132,7 +1132,7 @@ struct llama_context_params common_context_params_to_llama(const common_params & cparams.cb_eval_user_data = params.cb_eval_user_data; cparams.offload_kqv = !params.no_kv_offload; cparams.flash_attn = params.flash_attn; - cparams.mla_attn_attn = params.mla_attn_attn; + cparams.mla_attn = params.mla_attn; cparams.no_perf = params.no_perf; if (params.reranking) { From e551cdcec0cfd27a3954761322dc17fa71914c90 Mon Sep 17 00:00:00 2001 From: juk Date: Mon, 10 Mar 2025 16:10:50 +0000 
Subject: [PATCH 05/11] Fixed bad formatting of build_deepseek2() --- src/llama.cpp | 126 ++++++++++++++++++++++++-------------------------- 1 file changed, 61 insertions(+), 65 deletions(-) diff --git a/src/llama.cpp b/src/llama.cpp index 368a954f48b86..78667cbb45b4d 100644 --- a/src/llama.cpp +++ b/src/llama.cpp @@ -6464,15 +6464,15 @@ struct llm_build_context { // self_attention { - struct ggml_tensor * q_nope; - struct ggml_tensor * q_mqa; + struct ggml_tensor * q_nope = nullptr; + struct ggml_tensor * q_mqa = nullptr; if (!is_lite) { // {n_embd, q_lora_rank} * {n_embd, n_tokens} -> {q_lora_rank, n_tokens} - struct ggml_tensor * q_compressed = ggml_mul_mat(ctx0, model.layers[il].wq_a, cur); + struct ggml_tensor * q_compressed = ggml_mul_mat(ctx0, model.layers[il].wq_a, cur); cb(q_compressed, "q_compressed", il); q_compressed = llm_build_norm(ctx0, q_compressed, hparams, - model.layers[il].attn_q_a_norm, NULL, + model.layers[il].attn_q_a_norm, nullptr, LLM_NORM_RMS, cb, il); cb(q_compressed, "q_compressed_norm", il); @@ -6507,70 +6507,66 @@ struct llm_build_context { 0); cb(q_mqa_view, "q_mqa_view", il); - q_mqa_view = ggml_rope_ext( - ctx0, q_mqa_view, inp_pos, nullptr, - n_rot, rope_type, n_ctx_orig, freq_base, freq_scale, - ext_factor, attn_factor_scaled, beta_fast, beta_slow + q_mqa_view = ggml_rope_ext(ctx0, q_mqa_view, inp_pos, nullptr, + n_rot, rope_type, n_ctx_orig, freq_base, freq_scale, + ext_factor, attn_factor_scaled, beta_fast, beta_slow ); cb(q_mqa_view, "q_mqa_view_rope", il); - // {n_embd, kv_lora_rank} * {n_embd, n_tokens} -> {kv_lora_rank, n_tokens} - struct ggml_tensor * kv_compressed = ggml_mul_mat(ctx0, model.layers[il].wkv_a, cur); - cb(kv_compressed, "kv_compressed", il); - - kv_compressed = llm_build_norm(ctx0, kv_compressed, hparams, - model.layers[il].attn_kv_a_norm, NULL, - LLM_NORM_RMS, cb, il); - cb(kv_compressed, "kv_compressed_norm", il); - - // {n_embd, n_embd_head_qk_rope} * {n_embd, n_tokens} -> {n_embd_head_qk_rope, n_tokens} - struct ggml_tensor * k_mqa = ggml_mul_mat(ctx0, model.layers[il].wk_mqa, cur); - cb(k_mqa, "k_mqa", il); - - // {n_embd_head_qk_rope, 1, n_tokens} - struct ggml_tensor * k_mqa_view = ggml_view_3d(ctx0, k_mqa, n_embd_head_qk_rope, 1, n_tokens, - ggml_row_size(k_mqa->type, n_embd_head_qk_rope), - ggml_row_size(k_mqa->type, n_embd_head_qk_rope), - 0); - cb(k_mqa_view, "k_mqa_view", il); - - k_mqa_view = ggml_rope_ext( - ctx0, k_mqa_view, inp_pos, nullptr, - n_rot, rope_type, n_ctx_orig, freq_base, freq_scale, - ext_factor, attn_factor_scaled, beta_fast, beta_slow - ); - cb(k_mqa_view, "k_mqa_view_rope", il); - - // non-MLA + // {n_embd, kv_lora_rank} * {n_embd, n_tokens} -> {kv_lora_rank, n_tokens} + struct ggml_tensor * kv_compressed = ggml_mul_mat(ctx0, model.layers[il].wkv_a, cur); + cb(kv_compressed, "kv_compressed", il); + + kv_compressed = llm_build_norm(ctx0, kv_compressed, hparams, + model.layers[il].attn_kv_a_norm, nullptr, + LLM_NORM_RMS, cb, il); + cb(kv_compressed, "kv_compressed_norm", il); + + // {n_embd, n_embd_head_qk_rope} * {n_embd, n_tokens} -> {n_embd_head_qk_rope, n_tokens} + struct ggml_tensor * k_mqa = ggml_mul_mat(ctx0, model.layers[il].wk_mqa, cur); + cb(k_mqa, "k_mqa", il); + + // {n_embd_head_qk_rope, 1, n_tokens} + struct ggml_tensor * k_mqa_view = ggml_view_3d(ctx0, k_mqa, n_embd_head_qk_rope, 1, n_tokens, + ggml_row_size(k_mqa->type, n_embd_head_qk_rope), + ggml_row_size(k_mqa->type, n_embd_head_qk_rope), + 0); + cb(k_mqa_view, "k_mqa_view", il); + + k_mqa_view = ggml_rope_ext(ctx0, k_mqa_view, inp_pos, 
nullptr, + n_rot, rope_type, n_ctx_orig, freq_base, freq_scale, + ext_factor, attn_factor_scaled, beta_fast, beta_slow + ); + cb(k_mqa_view, "k_mqa_view_rope", il); + if (!cparams.mla_attn) { - // {kv_lora_rank, n_head * n_embd_head_qk_nope} * {kv_lora_rank, n_tokens} -> {n_head * n_embd_head_qk_nope, n_tokens} - struct ggml_tensor * k_nope = ggml_mul_mat(ctx0, model.layers[il].wk_b, kv_compressed); - cb(k_nope, "k_nope", il); - - // {n_embd_head_qk_nope, n_head, n_tokens} - struct ggml_tensor * k_nope_view = ggml_view_3d(ctx0, k_nope, n_embd_head_qk_nope, n_head, n_tokens, - ggml_row_size(k_nope->type, n_embd_head_qk_nope), - ggml_row_size(k_nope->type, n_head * n_embd_head_qk_nope), - 0); - cb(k_nope_view, "k_nope_view", il); - - // {n_embd_head_qk_nope + n_embd_head_qk_rope, n_head, n_tokens} - struct ggml_tensor * q_states = ggml_concat(ctx0, q_nope_view, q_mqa_view, 0); - cb(q_states, "q_states", il); - - // {n_embd_head_qk_nope + n_embd_head_qk_rope, n_head, n_tokens} - struct ggml_tensor * k_states = ggml_concat(ctx0, k_nope_view, ggml_repeat(ctx0, k_mqa_view, q_mqa_view), 0); - cb(k_states, "k_states", il); - - // {kv_lora_rank, n_head * n_embd_head_v} * {kv_lora_rank, n_tokens} -> {n_head * n_embd_head_v, n_tokens} - struct ggml_tensor * v_states = ggml_mul_mat(ctx0, model.layers[il].wv_b, kv_compressed); - cb(v_states, "v_states", il); - - cur = llm_build_kv(ctx0, lctx, kv_self, gf, - nullptr, model.layers[il].wo, nullptr, - k_states, v_states, q_states, KQ_mask, n_tokens, kv_head, n_kv, kq_scale, cb, il); - } - else { + // {kv_lora_rank, n_head * n_embd_head_qk_nope} * {kv_lora_rank, n_tokens} -> {n_head * n_embd_head_qk_nope, n_tokens} + struct ggml_tensor * k_nope = ggml_mul_mat(ctx0, model.layers[il].wk_b, kv_compressed); + cb(k_nope, "k_nope", il); + + // {n_embd_head_qk_nope, n_head, n_tokens} + struct ggml_tensor * k_nope_view = ggml_view_3d(ctx0, k_nope, n_embd_head_qk_nope, n_head, n_tokens, + ggml_row_size(k_nope->type, n_embd_head_qk_nope), + ggml_row_size(k_nope->type, n_head * n_embd_head_qk_nope), + 0); + cb(k_nope_view, "k_nope_view", il); + + // {n_embd_head_qk_nope + n_embd_head_qk_rope, n_head, n_tokens} + struct ggml_tensor * q_states = ggml_concat(ctx0, q_nope_view, q_mqa_view, 0); + cb(q_states, "q_states", il); + + // {n_embd_head_qk_nope + n_embd_head_qk_rope, n_head, n_tokens} + struct ggml_tensor * k_states = ggml_concat(ctx0, k_nope_view, ggml_repeat(ctx0, k_mqa_view, q_mqa_view), 0); + cb(k_states, "k_states", il); + + // {kv_lora_rank, n_head * n_embd_head_v} * {kv_lora_rank, n_tokens} -> {n_head * n_embd_head_v, n_tokens} + struct ggml_tensor * v_states = ggml_mul_mat(ctx0, model.layers[il].wv_b, kv_compressed); + cb(v_states, "v_states", il); + + cur = llm_build_kv(ctx0, lctx, kv_self, gf, + nullptr, model.layers[il].wo, nullptr, + k_states, v_states, q_states, KQ_mask, n_tokens, kv_head, n_kv, kq_scale, cb, il); + } else { // {n_embd_head_qk_nope, kv_lora_rank, n_head} struct ggml_tensor * wk_b_trans_view = ggml_view_3d(ctx0, model.layers[il].wk_b_trans, n_embd_head_qk_nope, kv_lora_rank, n_head, @@ -6601,7 +6597,7 @@ struct llm_build_context { ggml_row_size(k_mqa->type, kv_lora_rank), ggml_row_size(k_mqa->type, kv_lora_rank), 0); - cb(kv_compressed_view, "kv_compressed_view", il); + cb(kv_compressed_view, "kv_compressed_view", il); // {kv_lora_rank + n_embd_head_qk_rope, 1, n_tokens} struct ggml_tensor * k_states = ggml_concat(ctx0, kv_compressed_view, k_mqa_view, 0); From 75e3d6ac286383f761e258593f36a1a137f67e88 Mon Sep 17 00:00:00 2001 From: juk 
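Note on the !cparams.mla_attn branch above: it keeps the model usable without the new flag by decompressing the cached latent back into full per-head keys and values, i.e. it turns MLA back into ordinary MHA before calling llm_build_kv(). Roughly, wk_b expands kv_compressed into the per-head NoPE keys, wv_b expands it into the per-head values, and the single RoPEd k_mqa row is repeated across all heads. The following is a minimal plain-PyTorch sketch of that linear algebra only, not the ggml graph: ggml's dimension ordering differs, the KV cache and callbacks are omitted, the weights are random placeholders, and the sizes are only roughly DeepSeek-V2-Lite-like assumptions.

    import torch

    torch.manual_seed(0)

    # assumed, roughly DeepSeek-V2-Lite-sized -- illustrative only
    n_head           = 16
    kv_lora_rank     = 512
    qk_nope_head_dim = 128
    qk_rope_head_dim = 64
    v_head_dim       = 128
    n_tokens         = 4

    kv_compressed = torch.randn(kv_lora_rank, n_tokens)        # cached latent, one column per token
    k_mqa         = torch.randn(qk_rope_head_dim, n_tokens)    # shared RoPEd part (a single head)

    wk_b = torch.randn(n_head * qk_nope_head_dim, kv_lora_rank)   # stand-in for model.layers[il].wk_b
    wv_b = torch.randn(n_head * v_head_dim,       kv_lora_rank)   # stand-in for model.layers[il].wv_b

    # (n_head*qk_nope, kv_lora_rank) @ (kv_lora_rank, n_tokens) -> (n_head*qk_nope, n_tokens)
    k_nope   = (wk_b @ kv_compressed).view(n_head, qk_nope_head_dim, n_tokens)
    v_states = (wv_b @ kv_compressed).view(n_head, v_head_dim, n_tokens)

    # repeat the single RoPEd row across heads and concatenate
    # (this hunk's order: NoPE first, RoPEd part last -- the next patch swaps it)
    k_states = torch.cat([k_nope, k_mqa.unsqueeze(0).expand(n_head, -1, -1)], dim=1)

    print(k_states.shape, v_states.shape)   # torch.Size([16, 192, 4]) torch.Size([16, 128, 4])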
Date: Mon, 10 Mar 2025 16:41:28 +0000 Subject: [PATCH 06/11] Switched to have first n_rot so can use context shifting later --- src/llama.cpp | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/src/llama.cpp b/src/llama.cpp index 78667cbb45b4d..33159d097f48e 100644 --- a/src/llama.cpp +++ b/src/llama.cpp @@ -6552,11 +6552,11 @@ struct llm_build_context { cb(k_nope_view, "k_nope_view", il); // {n_embd_head_qk_nope + n_embd_head_qk_rope, n_head, n_tokens} - struct ggml_tensor * q_states = ggml_concat(ctx0, q_nope_view, q_mqa_view, 0); + struct ggml_tensor * q_states = ggml_concat(ctx0, q_mqa_view, q_nope_view, 0); cb(q_states, "q_states", il); // {n_embd_head_qk_nope + n_embd_head_qk_rope, n_head, n_tokens} - struct ggml_tensor * k_states = ggml_concat(ctx0, k_nope_view, ggml_repeat(ctx0, k_mqa_view, q_mqa_view), 0); + struct ggml_tensor * k_states = ggml_concat(ctx0, ggml_repeat(ctx0, k_mqa_view, q_mqa_view), k_nope_view, 0); cb(k_states, "k_states", il); // {kv_lora_rank, n_head * n_embd_head_v} * {kv_lora_rank, n_tokens} -> {n_head * n_embd_head_v, n_tokens} @@ -6588,7 +6588,7 @@ struct llm_build_context { cb(q_nope_absorbed, "q_nope_absorbed_perm", il); // {kv_lora_rank + n_embd_head_qk_rope, n_head, n_tokens} - struct ggml_tensor * q_states = ggml_concat(ctx0, q_nope_absorbed, q_mqa_view, 0); + struct ggml_tensor * q_states = ggml_concat(ctx0, q_mqa_view, q_nope_absorbed, 0); cb(q_states, "q_states", il); // {kv_lora_rank, 1, n_tokens} @@ -6600,7 +6600,7 @@ struct llm_build_context { cb(kv_compressed_view, "kv_compressed_view", il); // {kv_lora_rank + n_embd_head_qk_rope, 1, n_tokens} - struct ggml_tensor * k_states = ggml_concat(ctx0, kv_compressed_view, k_mqa_view, 0); + struct ggml_tensor * k_states = ggml_concat(ctx0, k_mqa_view, kv_compressed_view, 0); cb(k_states, "k_states", il); // {kv_lora_rank, 1, n_tokens} From 559328274ad32825620d6f286de98725dbdf9338 Mon Sep 17 00:00:00 2001 From: juk Date: Mon, 10 Mar 2025 16:44:00 +0000 Subject: [PATCH 07/11] Changed comment as shifting *could* be implemented now --- src/llama-kv-cache.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/llama-kv-cache.cpp b/src/llama-kv-cache.cpp index 384465fb68845..37ab4adfdbd94 100644 --- a/src/llama-kv-cache.cpp +++ b/src/llama-kv-cache.cpp @@ -32,7 +32,7 @@ bool llama_kv_cache_init( cache.recurrent = llama_model_is_recurrent(&model); cache.v_trans = !cache.recurrent && !cparams.flash_attn; - cache.can_shift = !cache.recurrent && model.arch != LLM_ARCH_DEEPSEEK2; // not supported due to MLA (or YaRN?) 
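Patch 06 above only swaps the order of the ggml_concat arguments so that the RoPEd q_mqa/k_mqa slice lands in the first n_rot elements of every K and Q row, with the position-independent (NoPE / latent) part after it. That is the layout the generic K-shift and defrag code assumes: a context shift only has to re-rotate the first n_rot elements of each cached K vector and can leave the rest alone, which is why the next patch can soften the can_shift comment to "not supported yet". A small stand-in sketch of that property follows; toy_rope below is a simplified rotation, not ggml_rope_ext, and the sizes are arbitrary.

    import torch

    n_rot, n_nope, n_tokens = 64, 128, 3

    def toy_rope(x, pos):
        # simplified stand-in for ggml_rope_ext: rotate the two halves of the
        # n_rot-sized block by an angle proportional to the position delta
        half = x.shape[0] // 2
        theta = torch.tensor(pos * 0.01)
        cos, sin = torch.cos(theta), torch.sin(theta)
        a, b = x[:half], x[half:]
        return torch.cat([a * cos - b * sin, a * sin + b * cos], dim=0)

    k_rope = torch.randn(n_rot, n_tokens)    # k_mqa_view after RoPE (shared across heads)
    k_nope = torch.randn(n_nope, n_tokens)   # position-independent part

    # patch 06 layout: RoPEd block first, NoPE/latent block second
    k = torch.cat([k_rope, k_nope], dim=0)

    # shifting the context re-rotates only k[:n_rot]; the NoPE slice is untouched
    shifted = k.clone()
    shifted[:n_rot] = toy_rope(shifted[:n_rot], pos=-2)
    assert torch.equal(shifted[n_rot:], k_nope)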
+ cache.can_shift = !cache.recurrent && model.arch != LLM_ARCH_DEEPSEEK2; // not supported yet LLAMA_LOG_INFO("%s: kv_size = %d, offload = %d, type_k = '%s', type_v = '%s', n_layer = %d, can_shift = %d\n", __func__, kv_size, offload, ggml_type_name(type_k), ggml_type_name(type_v), n_layer, cache.can_shift); From c9faa1bcd9b7504d4f71998d0c00daf603457b38 Mon Sep 17 00:00:00 2001 From: juk Date: Mon, 10 Mar 2025 17:02:28 +0000 Subject: [PATCH 08/11] Final tidy up and added TODOs and extra comments --- src/llama.cpp | 16 +++++++++++----- 1 file changed, 11 insertions(+), 5 deletions(-) diff --git a/src/llama.cpp b/src/llama.cpp index 33159d097f48e..355a78900027e 100644 --- a/src/llama.cpp +++ b/src/llama.cpp @@ -6551,11 +6551,13 @@ struct llm_build_context { 0); cb(k_nope_view, "k_nope_view", il); - // {n_embd_head_qk_nope + n_embd_head_qk_rope, n_head, n_tokens} + // TODO: build_k_shift() and build_defrag(); the RoPEed part is the first n_rot as they expect + // {n_embd_head_qk_rope + n_embd_head_qk_nope, n_head, n_tokens} struct ggml_tensor * q_states = ggml_concat(ctx0, q_mqa_view, q_nope_view, 0); cb(q_states, "q_states", il); - // {n_embd_head_qk_nope + n_embd_head_qk_rope, n_head, n_tokens} + // TODO: build_k_shift() and build_defrag(); the RoPEed part is the first n_rot as they expect + // {n_embd_head_qk_rope + n_embd_head_qk_nope, n_head, n_tokens} struct ggml_tensor * k_states = ggml_concat(ctx0, ggml_repeat(ctx0, k_mqa_view, q_mqa_view), k_nope_view, 0); cb(k_states, "k_states", il); @@ -6563,6 +6565,7 @@ struct llm_build_context { struct ggml_tensor * v_states = ggml_mul_mat(ctx0, model.layers[il].wv_b, kv_compressed); cb(v_states, "v_states", il); + // note: this has essentially converted MLA into MHA (with very large KV-cache overhead) cur = llm_build_kv(ctx0, lctx, kv_self, gf, nullptr, model.layers[il].wo, nullptr, k_states, v_states, q_states, KQ_mask, n_tokens, kv_head, n_kv, kq_scale, cb, il); @@ -6587,7 +6590,8 @@ struct llm_build_context { q_nope_absorbed = ggml_permute(ctx0, q_nope_absorbed, 0, 2, 1, 3); cb(q_nope_absorbed, "q_nope_absorbed_perm", il); - // {kv_lora_rank + n_embd_head_qk_rope, n_head, n_tokens} + // TODO: build_k_shift() and build_defrag(); the RoPEed part is the first n_rot as they expect + // {n_embd_head_qk_rope + kv_lora_rank, n_head, n_tokens} struct ggml_tensor * q_states = ggml_concat(ctx0, q_mqa_view, q_nope_absorbed, 0); cb(q_states, "q_states", il); @@ -6599,7 +6603,8 @@ struct llm_build_context { 0); cb(kv_compressed_view, "kv_compressed_view", il); - // {kv_lora_rank + n_embd_head_qk_rope, 1, n_tokens} + // TODO: build_k_shift() and build_defrag(); the RoPEed part is the first n_rot as they expect + // {n_embd_head_qk_rope + kv_lora_rank, 1, n_tokens} struct ggml_tensor * k_states = ggml_concat(ctx0, k_mqa_view, kv_compressed_view, 0); cb(k_states, "k_states", il); @@ -6607,6 +6612,7 @@ struct llm_build_context { struct ggml_tensor * v_states = kv_compressed; cb(v_states, "v_states", il); + // note: this has essentially converted MLA into MQA (with very low KV-cache overhead) cur = llm_build_kv(ctx0, lctx, kv_self, gf, model.layers[il].wv_b, model.layers[il].wo, nullptr, k_states, v_states, q_states, KQ_mask, n_tokens, kv_head, n_kv, kq_scale, cb, il); @@ -9683,7 +9689,7 @@ struct llama_context * llama_init_from_model( cparams.embeddings = params.embeddings; cparams.offload_kqv = params.offload_kqv; cparams.flash_attn = params.flash_attn; - cparams.mla_attn = params.mla_attn; + cparams.mla_attn = params.mla_attn; cparams.no_perf = 
params.no_perf; cparams.pooling_type = params.pooling_type; From f1297b6a9c7c2c36e21b43984ee8e6d3f034366b Mon Sep 17 00:00:00 2001 From: juk Date: Mon, 10 Mar 2025 17:40:56 +0000 Subject: [PATCH 09/11] Removed unused local variables from llm_build_kv() --- src/llama.cpp | 3 --- 1 file changed, 3 deletions(-) diff --git a/src/llama.cpp b/src/llama.cpp index 355a78900027e..2929152cb9885 100644 --- a/src/llama.cpp +++ b/src/llama.cpp @@ -727,9 +727,6 @@ static struct ggml_tensor * llm_build_kv( float kq_scale, const llm_build_cb & cb, int il) { - const llama_hparams & hparams = lctx.model.hparams; - const llama_cparams & cparams = lctx.cparams; - // these nodes are added to the graph together so that they are not reordered // by doing so, the number of splits in the graph is reduced ggml_build_forward_expand(graph, q_cur); From 3649714b37a34420aeb84b5ff52e867b06780962 Mon Sep 17 00:00:00 2001 From: juk Date: Mon, 10 Mar 2025 17:46:50 +0000 Subject: [PATCH 10/11] Removed trailing whitespace in convert_hf_to_gguf.py --- convert_hf_to_gguf.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/convert_hf_to_gguf.py b/convert_hf_to_gguf.py index dc566b006374a..d13196696ce33 100755 --- a/convert_hf_to_gguf.py +++ b/convert_hf_to_gguf.py @@ -4146,7 +4146,7 @@ def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iter qk_nope_head_dim = self.hparams["qk_nope_head_dim"] qk_rope_head_dim = self.hparams["qk_rope_head_dim"] v_head_dim = self.hparams["v_head_dim"] - kv_lora_rank = self.hparams["kv_lora_rank"] + kv_lora_rank = self.hparams["kv_lora_rank"] # (v2-lite) split q_proj into: q_proj and q_mqa_proj if name.endswith("q_proj.weight"): @@ -4200,10 +4200,10 @@ def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iter assert data_torch.shape[0] == n_head_kv * (v_head_dim + qk_nope_head_dim) assert data_torch.shape[1] == kv_lora_rank - kv_b_proj = data_torch.view(n_head_kv, v_head_dim + qk_nope_head_dim, kv_lora_rank) + kv_b_proj = data_torch.view(n_head_kv, v_head_dim + qk_nope_head_dim, kv_lora_rank) k_b_proj, v_b_proj = torch.split(kv_b_proj, [qk_nope_head_dim, v_head_dim], dim = 1) - k_b_trans_proj = k_b_proj.transpose(1, 2).reshape(n_head_kv * kv_lora_rank, qk_nope_head_dim) + k_b_trans_proj = k_b_proj.transpose(1, 2).reshape(n_head_kv * kv_lora_rank, qk_nope_head_dim) k_b_proj = k_b_proj.reshape(n_head_kv * qk_nope_head_dim, kv_lora_rank) v_b_proj = v_b_proj.reshape(n_head_kv * v_head_dim, kv_lora_rank) From c6845359af8f6e727dcb3e34cdf9a4e06d8d0bf7 Mon Sep 17 00:00:00 2001 From: juk Date: Mon, 10 Mar 2025 22:46:39 +0000 Subject: [PATCH 11/11] Added optimised version of MQA for MLA to llm_build_kqv() --- src/llama.cpp | 215 +++++++++++++++++++++++++++++++------------------- 1 file changed, 133 insertions(+), 82 deletions(-) diff --git a/src/llama.cpp b/src/llama.cpp index 2929152cb9885..72e999e0ac0b0 100644 --- a/src/llama.cpp +++ b/src/llama.cpp @@ -176,7 +176,7 @@ static void llm_build_kv_store( int64_t n_embd_k; int64_t n_embd_v; - // note: deepseek-mla stores the compressed versions + // note: deepseek-mla converts MLA to MQA so n_embd_k/n_embd_v change too if (cparams.mla_attn && model.arch == LLM_ARCH_DEEPSEEK2) { n_embd_k = hparams.n_lora_kv + hparams.n_rot; n_embd_v = hparams.n_lora_kv; @@ -568,59 +568,36 @@ static struct ggml_tensor * llm_build_kqv( const llama_hparams & hparams = lctx.model.hparams; const llama_cparams & cparams = lctx.cparams; - const int64_t n_ctx = cparams.n_ctx; - const int64_t n_head = 
hparams.n_head(il); + const int64_t n_ctx = cparams.n_ctx; + const int64_t n_head = hparams.n_head(il); + const int64_t n_head_kv = hparams.n_head_kv(il); + const int64_t n_embd_k_gqa = hparams.n_embd_k_gqa(il); + const int64_t n_embd_head_k = hparams.n_embd_head_k; + const int64_t n_embd_v_gqa = hparams.n_embd_v_gqa(il); + const int64_t n_embd_head_v = hparams.n_embd_head_v; - int64_t n_head_kv; - int64_t n_embd_k; - int64_t n_embd_head_k; - int64_t n_embd_v; - int64_t n_embd_head_v; - int64_t n_embd_head_v_final; - - // note: MLA caches compressed KV and acts as MQA until the final wv_b expansion - if (cparams.mla_attn && model.arch == LLM_ARCH_DEEPSEEK2) { - GGML_ASSERT(wv_b); - n_head_kv = 1; - n_embd_head_k = hparams.n_lora_kv + hparams.n_rot; - n_embd_k = n_embd_head_k; - n_embd_head_v = hparams.n_lora_kv; - n_embd_v = n_embd_head_v; - n_embd_head_v_final = hparams.n_embd_head_v; // after multiplying by wv_b - } else { - n_head_kv = hparams.n_head_kv(il); - n_embd_head_k = hparams.n_embd_head_k; - n_embd_k = hparams.n_embd_k_gqa(il); - n_embd_head_v = hparams.n_embd_head_v; - n_embd_v = hparams.n_embd_v_gqa(il); - n_embd_head_v_final = n_embd_head_v; - } + struct ggml_tensor * cur; struct ggml_tensor * q = ggml_permute(ctx, q_cur, 0, 2, 1, 3); cb(q, "q", il); - struct ggml_tensor * k = - ggml_view_3d(ctx, kv.k_l[il], - n_embd_head_k, n_kv, n_head_kv, - ggml_row_size(kv.k_l[il]->type, n_embd_k), - ggml_row_size(kv.k_l[il]->type, n_embd_head_k), - 0); - cb(k, "k", il); - - struct ggml_tensor * cur; - if (cparams.flash_attn) { GGML_UNUSED(model); GGML_UNUSED(n_ctx); - // note: MLA creates emebddings too large for FA, see: https://github.com/ggml-org/llama.cpp/pull/12227 - GGML_ASSERT(!wv_b); + struct ggml_tensor * k = + ggml_view_3d(ctx, kv.k_l[il], + n_embd_head_k, n_kv, n_head_kv, + ggml_row_size(kv.k_l[il]->type, n_embd_k_gqa), + ggml_row_size(kv.k_l[il]->type, n_embd_head_k), + 0); + cb(k, "k", il); // split cached v into n_head heads (not transposed) struct ggml_tensor * v = ggml_view_3d(ctx, kv.v_l[il], n_embd_head_v, n_kv, n_head_kv, - ggml_row_size(kv.v_l[il]->type, n_embd_v), + ggml_row_size(kv.v_l[il]->type, n_embd_v_gqa), ggml_row_size(kv.v_l[il]->type, n_embd_head_v), 0); cb(v, "v", il); @@ -630,65 +607,139 @@ static struct ggml_tensor * llm_build_kqv( ggml_flash_attn_ext_set_prec(cur, GGML_PREC_F32); - cur = ggml_reshape_2d(ctx, cur, n_embd_head_v_final*n_head, n_tokens); + cur = ggml_reshape_2d(ctx, cur, n_embd_head_v*n_head, n_tokens); } else { - struct ggml_tensor * kq = ggml_mul_mat(ctx, k, q); - cb(kq, "kq", il); - // note: this op tends to require high floating point range - // while for some models F16 is enough, for others it is not, so we default to F32 here - ggml_mul_mat_set_prec(kq, GGML_PREC_F32); + // MLA converetd to MQA optimised to use non-batched matrix multiplies + if (cparams.mla_attn && model.arch == LLM_ARCH_DEEPSEEK2) { + const int64_t n_embd_head_k_mqa = hparams.n_lora_kv + hparams.n_rot; + const int64_t n_embd_head_v_mqa = hparams.n_lora_kv; - if (model.arch == LLM_ARCH_GROK) { - // need to do the following: - // multiply by attn_output_multiplyer of 0.08838834764831845 - // and then : - // kq = 30 * tanh(kq / 30) - // before the softmax below + // must cont for the 2D view or else kq with have n_tokens <-> n_head swapped... 
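The llm_build_kv_store change at the top of this patch, together with the MHA/MQA notes added in patch 08, spells out the trade-off between the two paths: without -mla the cache holds full per-head K and V rows (MLA converted to MHA), while with -mla it holds only the shared latent plus the RoPEd slice per token (MLA converted to MQA). A quick back-of-envelope comparison of the per-token, per-layer cache width follows; the hyperparameters are assumed, roughly DeepSeek-V2-sized values, so check the actual model config before relying on the exact numbers.

    # assumed, roughly DeepSeek-V2-sized hyperparameters -- illustrative only
    n_head           = 128
    qk_nope_head_dim = 128
    qk_rope_head_dim = 64     # n_rot
    v_head_dim       = 128
    kv_lora_rank     = 512    # n_lora_kv

    # non-MLA path ("MLA converted to MHA"): full per-head K and V rows are cached
    k_mha = n_head * (qk_nope_head_dim + qk_rope_head_dim)   # 24576
    v_mha = n_head * v_head_dim                              # 16384

    # MLA path ("MLA converted to MQA"): only the shared latent (+ RoPEd slice) is cached
    k_mqa = kv_lora_rank + qk_rope_head_dim                  # 576 == n_lora_kv + n_rot
    v_mqa = kv_lora_rank                                     # 512 == n_lora_kv

    print("per token, per layer:", k_mha + v_mha, "vs", k_mqa + v_mqa, "elements")
    print("reduction: ~%.1fx" % ((k_mha + v_mha) / (k_mqa + v_mqa)))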
+ q = ggml_cont(ctx, q); + cb(q, "q_cont", il); - kq = ggml_tanh(ctx, ggml_scale(ctx, kq, 0.08838834764831845f/30.0f)); - kq = ggml_scale(ctx, kq, 30); - } + q = ggml_view_2d(ctx, q, + n_embd_head_k_mqa, n_head * n_tokens, + ggml_row_size(q->type, n_embd_head_k_mqa), + 0); + cb(q, "q_view", il); - if (hparams.attn_soft_cap) { - kq = ggml_scale(ctx, kq, 1.0f / hparams.f_attn_logit_softcapping); - kq = ggml_tanh(ctx, kq); - kq = ggml_scale(ctx, kq, hparams.f_attn_logit_softcapping); - } + struct ggml_tensor * k = + ggml_view_2d(ctx, kv.k_l[il], + n_embd_head_k_mqa, n_kv, + ggml_row_size(kv.k_l[il]->type, n_embd_head_k_mqa), + 0); + cb(k, "k", il); - kq = ggml_soft_max_ext(ctx, kq, kq_mask, kq_scale, hparams.f_max_alibi_bias); - cb(kq, "kq_soft_max_ext", il); + struct ggml_tensor * kq = ggml_mul_mat(ctx, k, q); + cb(kq, "kq", il); - GGML_ASSERT(kv.size == n_ctx); + // note: this doesn't seem necessary + //ggml_mul_mat_set_prec(kq, GGML_PREC_F32); - // split cached v into n_head heads - struct ggml_tensor * v = - ggml_view_3d(ctx, kv.v_l[il], - n_kv, n_embd_head_v, n_head_kv, - ggml_element_size(kv.v_l[il])*n_ctx, - ggml_element_size(kv.v_l[il])*n_ctx*n_embd_head_v, + kq = ggml_view_3d(ctx, kq, + n_kv, n_tokens, n_head, + ggml_row_size(kq->type, n_kv), + ggml_row_size(kq->type, n_kv * n_tokens), 0); - cb(v, "v", il); + cb(kq, "kq_view", il); - struct ggml_tensor * kqv = ggml_mul_mat(ctx, v, kq); - cb(kqv, "kqv", il); + kq = ggml_soft_max_ext(ctx, kq, kq_mask, kq_scale, hparams.f_max_alibi_bias); + cb(kq, "kq_soft_max_ext", il); - // note: MLA needs to expand KQV from MQA into MHA - if (wv_b) { - struct ggml_tensor * wv_b_view = ggml_view_3d(ctx, wv_b, n_embd_head_v, n_embd_head_v_final, n_head, - ggml_row_size(model.layers[il].wv_b->type, n_embd_head_v), - ggml_row_size(model.layers[il].wv_b->type, n_embd_head_v * n_embd_head_v_final), + kq = ggml_view_2d(ctx, kq, + n_kv, n_tokens * n_head, + ggml_row_size(kq->type, n_kv), + 0); + cb(kq, "kq_soft_max_view", il); + + GGML_ASSERT(kv.size == n_ctx); + + struct ggml_tensor * v = + ggml_view_2d(ctx, kv.v_l[il], + n_kv, n_embd_head_v_mqa, + ggml_element_size(kv.v_l[il])*n_ctx, + 0); + cb(v, "v", il); + + struct ggml_tensor * kqv = ggml_mul_mat(ctx, v, kq); + cb(kqv, "kqv_compressed", il); + + kqv = ggml_view_3d(ctx, kqv, + n_embd_head_v_mqa, n_tokens, n_head, + ggml_row_size(kqv->type, n_embd_head_v_mqa), + ggml_row_size(kqv->type, n_embd_head_v_mqa * n_tokens), + 0); + cb(kqv, "kqv_view", il); + + struct ggml_tensor * wv_b_view = + ggml_view_3d(ctx, wv_b, n_embd_head_v_mqa, n_embd_head_v, n_head, + ggml_row_size(wv_b->type, n_embd_head_v_mqa), + ggml_row_size(wv_b->type, n_embd_head_v * n_embd_head_v_mqa), 0); cb(wv_b_view, "wv_b_view", il); - kqv = ggml_mul_mat(ctx, wv_b_view, kqv); - cb(kqv, "kqv_wv_b", il); + // dsecompress the MQA to MHA + cur = ggml_mul_mat(ctx, wv_b_view, kqv); + cb(cur, "kqv", il); + + // standard MHA/GQA non-flash-attension case + } else { + struct ggml_tensor * k = + ggml_view_3d(ctx, kv.k_l[il], + n_embd_head_k, n_kv, n_head_kv, + ggml_row_size(kv.k_l[il]->type, n_embd_k_gqa), + ggml_row_size(kv.k_l[il]->type, n_embd_head_k), + 0); + cb(k, "k", il); + + struct ggml_tensor * kq = ggml_mul_mat(ctx, k, q); + cb(kq, "kq", il); + + // note: this op tends to require high floating point range + // while for some models F16 is enough, for others it is not, so we default to F32 here + ggml_mul_mat_set_prec(kq, GGML_PREC_F32); + + if (model.arch == LLM_ARCH_GROK) { + // need to do the following: + // multiply by 
attn_output_multiplyer of 0.08838834764831845 + // and then : + // kq = 30 * tanh(kq / 30) + // before the softmax below + + kq = ggml_tanh(ctx, ggml_scale(ctx, kq, 0.08838834764831845f/30.0f)); + kq = ggml_scale(ctx, kq, 30); + } + + if (hparams.attn_soft_cap) { + kq = ggml_scale(ctx, kq, 1.0f / hparams.f_attn_logit_softcapping); + kq = ggml_tanh(ctx, kq); + kq = ggml_scale(ctx, kq, hparams.f_attn_logit_softcapping); + } + + kq = ggml_soft_max_ext(ctx, kq, kq_mask, kq_scale, hparams.f_max_alibi_bias); + cb(kq, "kq_soft_max_ext", il); + + GGML_ASSERT(kv.size == n_ctx); + + // split cached v into n_head heads + struct ggml_tensor * v = + ggml_view_3d(ctx, kv.v_l[il], + n_kv, n_embd_head_v, n_head_kv, + ggml_element_size(kv.v_l[il])*n_ctx, + ggml_element_size(kv.v_l[il])*n_ctx*n_embd_head_v, + 0); + cb(v, "v", il); + + cur = ggml_mul_mat(ctx, v, kq); + cb(cur, "kqv", il); } - struct ggml_tensor * kqv_merged = ggml_permute(ctx, kqv, 0, 2, 1, 3); - cb(kqv_merged, "kqv_merged", il); + cur = ggml_permute(ctx, cur, 0, 2, 1, 3); + cb(cur, "kqv_merged", il); - cur = ggml_cont_2d(ctx, kqv_merged, n_embd_head_v_final*n_head, n_tokens); + cur = ggml_cont_2d(ctx, cur, n_embd_head_v*n_head, n_tokens); cb(cur, "kqv_merged_cont", il); }
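The new llm_build_kqv branch in this last patch exploits the fact that MLA-as-MQA has a single shared K/V head: instead of batched per-head multiplies, the head dimension is folded into the token dimension with 2-D views (hence the ggml_cont on q), so K^T*Q and V*softmax(KQ) each become one large plain GEMM, and only the final wv_b multiply runs per head to decompress the result back to n_head * n_embd_head_v. The PyTorch sketch below mirrors that algebra and checks it against a naive per-head reference; the tensor names follow the hunk, but the KQ mask, ALiBi and ggml-specific layout/precision handling are omitted, and all sizes are made-up small numbers.

    import torch

    torch.manual_seed(0)

    n_head, n_tokens, n_kv = 4, 3, 5
    n_rot, kv_lora_rank, n_embd_head_v = 8, 16, 12

    d_k   = kv_lora_rank + n_rot      # n_embd_head_k_mqa
    scale = d_k ** -0.5               # stands in for kq_scale

    q    = torch.randn(n_head, n_tokens, d_k)                  # q_states, one row per (head, token)
    k    = torch.randn(n_kv, d_k)                              # cached K: latent + RoPEd slice, one shared head
    v    = torch.randn(n_kv, kv_lora_rank)                     # cached V: just the latent
    wv_b = torch.randn(n_head, n_embd_head_v, kv_lora_rank)    # per-head decompression of V

    # --- optimised path: fold heads into the token dim so the big matmuls are plain 2-D GEMMs ---
    q2  = q.reshape(n_head * n_tokens, d_k)                    # "q_view" in the hunk
    kq  = q2 @ k.T                                             # (n_head*n_tokens, n_kv)
    p   = torch.softmax(kq.view(n_head, n_tokens, n_kv) * scale, dim=-1)   # softmax per (head, token) row
    kqv = p.view(n_head * n_tokens, n_kv) @ v                  # (n_head*n_tokens, kv_lora_rank)
    kqv = kqv.view(n_head, n_tokens, kv_lora_rank)
    out = torch.einsum('hvc,htc->htv', wv_b, kqv)              # decompress MQA -> MHA per head

    # --- reference: naive per-head loop over the same shared K/V ---
    ref = torch.stack([
        torch.softmax(q[h] @ k.T * scale, dim=-1) @ v @ wv_b[h].T
        for h in range(n_head)
    ])

    assert torch.allclose(out, ref, atol=1e-5)

Presumably the point of the folding is that the shared K and V only have to be fed through two large matrix multiplies instead of n_head batched ones, at the cost of the extra cont and view bookkeeping visible in the hunk.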