From 01f638bd3dbc8eca99fb2b6bbc110910ba4f180a Mon Sep 17 00:00:00 2001 From: juk Date: Mon, 10 Mar 2025 15:26:40 +0000 Subject: [PATCH 01/11] Initial commit of all reformatted changes for deepseek2-mla --- common/arg.cpp | 7 + common/common.h | 1 + convert_hf_to_gguf.py | 72 ++++++ examples/server/README.md | 1 + gguf-py/gguf/constants.py | 21 ++ gguf-py/gguf/tensor_mapping.py | 28 +++ src/llama-arch.cpp | 31 ++- src/llama-arch.h | 7 + src/llama-cparams.h | 1 + src/llama-kv-cache.cpp | 19 +- src/llama-model.cpp | 16 +- src/llama-model.h | 41 ++-- src/llama.cpp | 414 ++++++++++++++++++++------------- 13 files changed, 456 insertions(+), 203 deletions(-) diff --git a/common/arg.cpp b/common/arg.cpp index 8531f0871d44a..84dc6841e3866 100644 --- a/common/arg.cpp +++ b/common/arg.cpp @@ -811,6 +811,13 @@ common_params_context common_params_parser_init(common_params & params, llama_ex params.flash_attn = true; } ).set_env("LLAMA_ARG_FLASH_ATTN")); + add_opt(common_arg( + {"-mla", "--mla-attn"}, + string_format("enable Multi-head Latent Attention (default: %s)", params.mla_attn ? "enabled" : "disabled"), + [](common_params & params) { + params.mla_attn = true; + } + ).set_env("LLAMA_ARG_MLA_ATTN")); add_opt(common_arg( {"-p", "--prompt"}, "PROMPT", "prompt to start generation with; for system message, use -sys", diff --git a/common/common.h b/common/common.h index 1c0f199774976..6d6f98b7b632b 100644 --- a/common/common.h +++ b/common/common.h @@ -325,6 +325,7 @@ struct common_params { bool simple_io = false; // improves compatibility with subprocesses and limited consoles bool cont_batching = true; // insert new sequences for decoding on-the-fly bool flash_attn = false; // flash attention + bool mla_attn = false; // mla attention bool no_perf = false; // disable performance metrics bool ctx_shift = true; // context shift on inifinite text generation diff --git a/convert_hf_to_gguf.py b/convert_hf_to_gguf.py index 6358a94e9b55f..dc566b006374a 100755 --- a/convert_hf_to_gguf.py +++ b/convert_hf_to_gguf.py @@ -4141,6 +4141,78 @@ def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iter else: return [] + n_embed = self.hparams.get("hidden_size", self.hparams.get("n_embed")) + n_head_kv = self.hparams["num_key_value_heads"] + qk_nope_head_dim = self.hparams["qk_nope_head_dim"] + qk_rope_head_dim = self.hparams["qk_rope_head_dim"] + v_head_dim = self.hparams["v_head_dim"] + kv_lora_rank = self.hparams["kv_lora_rank"] + + # (v2-lite) split q_proj into: q_proj and q_mqa_proj + if name.endswith("q_proj.weight"): + assert data_torch.shape[0] == n_head_kv * (qk_nope_head_dim + qk_rope_head_dim) + assert data_torch.shape[1] == n_embed + + q_proj_with_mqa = data_torch.view(n_head_kv, qk_nope_head_dim + qk_rope_head_dim, n_embed) + q_proj, q_mqa_proj = torch.split(q_proj_with_mqa, [qk_nope_head_dim, qk_rope_head_dim], dim = 1) + + q_proj = q_proj.reshape(n_head_kv * qk_nope_head_dim, n_embed) + q_mqa_proj = q_mqa_proj.reshape(n_head_kv * qk_rope_head_dim, n_embed) + + return [ + (self.map_tensor_name(name), q_proj), + (self.map_tensor_name(name.replace("q_proj", "q_mqa_proj")), q_mqa_proj) + ] + + # (v2/v3/r1) split q_b_proj into: q_b_proj and q_b_mqa_proj + if name.endswith("q_b_proj.weight"): + q_lora_rank = self.hparams["q_lora_rank"] + + assert data_torch.shape[0] == n_head_kv * (qk_nope_head_dim + qk_rope_head_dim) + assert data_torch.shape[1] == q_lora_rank + + q_b_proj_with_mqa = data_torch.view(n_head_kv, qk_nope_head_dim + qk_rope_head_dim, q_lora_rank) + q_b_proj, q_b_mqa_proj 
= torch.split(q_b_proj_with_mqa, [qk_nope_head_dim, qk_rope_head_dim], dim = 1) + + q_b_proj = q_b_proj.reshape(n_head_kv * qk_nope_head_dim, q_lora_rank) + q_b_mqa_proj = q_b_mqa_proj.reshape(n_head_kv * qk_rope_head_dim, q_lora_rank) + + return [ + (self.map_tensor_name(name), q_b_proj), + (self.map_tensor_name(name.replace("q_b_proj", "q_b_mqa_proj")), q_b_mqa_proj) + ] + + # split kv_a_proj_with_mqa into: kv_a_proj and k_mqa_proj + if name.endswith("kv_a_proj_with_mqa.weight"): + assert data_torch.shape[0] == kv_lora_rank + qk_rope_head_dim + assert data_torch.shape[1] == n_embed + + kv_a_proj_with_mqa = data_torch.view(kv_lora_rank + qk_rope_head_dim, n_embed) + kv_a_proj, k_mqa_proj = torch.split(kv_a_proj_with_mqa, [kv_lora_rank, qk_rope_head_dim], dim = 0) + + return [ + (self.map_tensor_name(name.replace("kv_a_proj_with_mqa", "kv_a_proj")), kv_a_proj), + (self.map_tensor_name(name.replace("kv_a_proj_with_mqa", "k_mqa_proj")), k_mqa_proj) + ] + + # split kv_b_proj into: k_b_proj, v_b_proj, and k_b_trans_proj (for deepseek-mla) + if name.endswith("kv_b_proj.weight"): + assert data_torch.shape[0] == n_head_kv * (v_head_dim + qk_nope_head_dim) + assert data_torch.shape[1] == kv_lora_rank + + kv_b_proj = data_torch.view(n_head_kv, v_head_dim + qk_nope_head_dim, kv_lora_rank) + k_b_proj, v_b_proj = torch.split(kv_b_proj, [qk_nope_head_dim, v_head_dim], dim = 1) + + k_b_trans_proj = k_b_proj.transpose(1, 2).reshape(n_head_kv * kv_lora_rank, qk_nope_head_dim) + k_b_proj = k_b_proj.reshape(n_head_kv * qk_nope_head_dim, kv_lora_rank) + v_b_proj = v_b_proj.reshape(n_head_kv * v_head_dim, kv_lora_rank) + + return [ + (self.map_tensor_name(name.replace("kv_b_proj", "k_b_trans_proj")), k_b_trans_proj), + (self.map_tensor_name(name.replace("kv_b_proj", "k_b_proj")), k_b_proj), + (self.map_tensor_name(name.replace("kv_b_proj", "v_b_proj")), v_b_proj) + ] + return [(self.map_tensor_name(name), data_torch)] def prepare_tensors(self): diff --git a/examples/server/README.md b/examples/server/README.md index a2a0903261e31..043c725d8d548 100644 --- a/examples/server/README.md +++ b/examples/server/README.md @@ -46,6 +46,7 @@ The project is under active development, and we are [looking for feedback and co | `-ub, --ubatch-size N` | physical maximum batch size (default: 512)
(env: LLAMA_ARG_UBATCH) | | `--keep N` | number of tokens to keep from the initial prompt (default: 0, -1 = all) | | `-fa, --flash-attn` | enable Flash Attention (default: disabled)
(env: LLAMA_ARG_FLASH_ATTN) | +| `-mla, --mla-attn` | enable Multi-head Latent Attention (default: disabled)
(env: LLAMA_ARG_MLA_ATTN) | | `--no-perf` | disable internal libllama performance timings (default: false)
(env: LLAMA_ARG_NO_PERF) | | `-e, --escape` | process escapes sequences (\n, \r, \t, \', \", \\) (default: true) | | `--no-escape` | do not process escape sequences | diff --git a/gguf-py/gguf/constants.py b/gguf-py/gguf/constants.py index ecac5b4bb7f59..758efa2f3ef16 100644 --- a/gguf-py/gguf/constants.py +++ b/gguf-py/gguf/constants.py @@ -356,6 +356,13 @@ class MODEL_TENSOR(IntEnum): ATTN_Q_B = auto() ATTN_KV_A_MQA = auto() ATTN_KV_B = auto() + ATTN_Q_MQA = auto() + ATTN_Q_B_MQA = auto() + ATTN_KV_A = auto() + ATTN_K_MQA = auto() + ATTN_K_B_TRANS = auto() + ATTN_K_B = auto() + ATTN_V_B = auto() ATTN_Q_A_NORM = auto() ATTN_KV_A_NORM = auto() FFN_SUB_NORM = auto() @@ -543,6 +550,13 @@ class MODEL_TENSOR(IntEnum): MODEL_TENSOR.ATTN_Q_B: "blk.{bid}.attn_q_b", MODEL_TENSOR.ATTN_KV_A_MQA: "blk.{bid}.attn_kv_a_mqa", MODEL_TENSOR.ATTN_KV_B: "blk.{bid}.attn_kv_b", + MODEL_TENSOR.ATTN_Q_MQA: "blk.{bid}.attn_q_mqa", + MODEL_TENSOR.ATTN_Q_B_MQA: "blk.{bid}.attn_q_b_mqa", + MODEL_TENSOR.ATTN_KV_A: "blk.{bid}.attn_kv_a", + MODEL_TENSOR.ATTN_K_MQA: "blk.{bid}.attn_k_mqa", + MODEL_TENSOR.ATTN_K_B_TRANS: "blk.{bid}.attn_k_b_trans", + MODEL_TENSOR.ATTN_K_B: "blk.{bid}.attn_k_b", + MODEL_TENSOR.ATTN_V_B: "blk.{bid}.attn_v_b", MODEL_TENSOR.ATTN_Q_A_NORM: "blk.{bid}.attn_q_a_norm", MODEL_TENSOR.ATTN_KV_A_NORM: "blk.{bid}.attn_kv_a_norm", MODEL_TENSOR.ATTN_SUB_NORM: "blk.{bid}.attn_sub_norm", @@ -1041,6 +1055,13 @@ class MODEL_TENSOR(IntEnum): MODEL_TENSOR.ATTN_Q_B, MODEL_TENSOR.ATTN_KV_A_MQA, MODEL_TENSOR.ATTN_KV_B, + MODEL_TENSOR.ATTN_Q_MQA, + MODEL_TENSOR.ATTN_Q_B_MQA, + MODEL_TENSOR.ATTN_KV_A, + MODEL_TENSOR.ATTN_K_MQA, + MODEL_TENSOR.ATTN_K_B_TRANS, + MODEL_TENSOR.ATTN_K_B, + MODEL_TENSOR.ATTN_V_B, MODEL_TENSOR.ATTN_Q_A_NORM, MODEL_TENSOR.ATTN_KV_A_NORM, MODEL_TENSOR.ATTN_OUT, diff --git a/gguf-py/gguf/tensor_mapping.py b/gguf-py/gguf/tensor_mapping.py index 617791e240b60..ae17da73af674 100644 --- a/gguf-py/gguf/tensor_mapping.py +++ b/gguf-py/gguf/tensor_mapping.py @@ -586,6 +586,34 @@ class TensorNameMap: "model.layers.{bid}.self_attn.kv_b_proj", # deepseek2 ), + MODEL_TENSOR.ATTN_Q_MQA: ( + "model.layers.{bid}.self_attn.q_mqa_proj", # deepseek2 (v2-lite) + ), + + MODEL_TENSOR.ATTN_Q_B_MQA: ( + "model.layers.{bid}.self_attn.q_b_mqa_proj", # deepseek2 (v2/v3/r1) + ), + + MODEL_TENSOR.ATTN_KV_A: ( + "model.layers.{bid}.self_attn.kv_a_proj", # deepseek2 + ), + + MODEL_TENSOR.ATTN_K_MQA: ( + "model.layers.{bid}.self_attn.k_mqa_proj", # deepseek2 + ), + + MODEL_TENSOR.ATTN_K_B_TRANS: ( + "model.layers.{bid}.self_attn.k_b_trans_proj", # deepseek2 (mla only) + ), + + MODEL_TENSOR.ATTN_K_B: ( + "model.layers.{bid}.self_attn.k_b_proj", # deepseek2 + ), + + MODEL_TENSOR.ATTN_V_B: ( + "model.layers.{bid}.self_attn.v_b_proj", # deepseek2 + ), + MODEL_TENSOR.ATTN_Q_A_NORM: ( "model.layers.{bid}.self_attn.q_a_layernorm", # deepseek2 ), diff --git a/src/llama-arch.cpp b/src/llama-arch.cpp index 97a1e7e5e01ef..cca3cad2c6cb8 100644 --- a/src/llama-arch.cpp +++ b/src/llama-arch.cpp @@ -997,6 +997,13 @@ static const std::map> LLM_TENSOR_N { LLM_TENSOR_ATTN_Q, "blk.%d.attn_q" }, { LLM_TENSOR_ATTN_Q_A, "blk.%d.attn_q_a" }, { LLM_TENSOR_ATTN_Q_B, "blk.%d.attn_q_b" }, + { LLM_TENSOR_ATTN_Q_MQA, "blk.%d.attn_q_mqa" }, + { LLM_TENSOR_ATTN_Q_B_MQA, "blk.%d.attn_q_b_mqa" }, + { LLM_TENSOR_ATTN_KV_A, "blk.%d.attn_kv_a" }, + { LLM_TENSOR_ATTN_K_MQA, "blk.%d.attn_k_mqa" }, + { LLM_TENSOR_ATTN_K_B_TRANS, "blk.%d.attn_k_b_trans" }, + { LLM_TENSOR_ATTN_K_B, "blk.%d.attn_k_b" }, + { LLM_TENSOR_ATTN_V_B, "blk.%d.attn_v_b" }, { 
LLM_TENSOR_ATTN_KV_A_MQA, "blk.%d.attn_kv_a_mqa" }, { LLM_TENSOR_ATTN_KV_B, "blk.%d.attn_kv_b" }, { LLM_TENSOR_ATTN_OUT, "blk.%d.attn_output" }, @@ -1333,23 +1340,13 @@ static const std::map LLM_TENSOR_INFOS = { {LLM_TENSOR_ATTN_Q_B, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL_MAT}}, {LLM_TENSOR_ATTN_KV_A_MQA, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL_MAT}}, {LLM_TENSOR_ATTN_KV_B, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL_MAT}}, - {LLM_TENSOR_DEC_ATTN_Q, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL_MAT}}, - {LLM_TENSOR_DEC_ATTN_K, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL_MAT}}, - {LLM_TENSOR_ATTN_Q, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL_MAT}}, - {LLM_TENSOR_ATTN_K, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL_MAT}}, - {LLM_TENSOR_ATTN_V, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL_MAT}}, - {LLM_TENSOR_ATTN_QKV, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL_MAT}}, - {LLM_TENSOR_ATTN_OUT, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL_MAT}}, - {LLM_TENSOR_FFN_GATE, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL_MAT}}, - {LLM_TENSOR_FFN_DOWN, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL_MAT}}, - {LLM_TENSOR_FFN_UP, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL_MAT}}, - {LLM_TENSOR_FFN_DOWN_SHEXP, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL_MAT}}, - {LLM_TENSOR_FFN_GATE_SHEXP, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL_MAT}}, - {LLM_TENSOR_FFN_UP_SHEXP, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL_MAT}}, - {LLM_TENSOR_ATTN_Q_A, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL_MAT}}, - {LLM_TENSOR_ATTN_Q_B, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL_MAT}}, - {LLM_TENSOR_ATTN_KV_A_MQA, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL_MAT}}, - {LLM_TENSOR_ATTN_KV_B, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL_MAT}}, + {LLM_TENSOR_ATTN_Q_MQA, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL_MAT}}, + {LLM_TENSOR_ATTN_Q_B_MQA, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL_MAT}}, + {LLM_TENSOR_ATTN_KV_A, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL_MAT}}, + {LLM_TENSOR_ATTN_K_MQA, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL_MAT}}, + {LLM_TENSOR_ATTN_K_B_TRANS, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL_MAT}}, + {LLM_TENSOR_ATTN_K_B, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL_MAT}}, + {LLM_TENSOR_ATTN_V_B, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL_MAT}}, {LLM_TENSOR_DEC_ATTN_Q, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL_MAT}}, {LLM_TENSOR_DEC_ATTN_K, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL_MAT}}, {LLM_TENSOR_DEC_ATTN_V, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL_MAT}}, diff --git a/src/llama-arch.h b/src/llama-arch.h index 122fdcebe0af6..cae591373c2de 100644 --- a/src/llama-arch.h +++ b/src/llama-arch.h @@ -277,6 +277,13 @@ enum llm_tensor { LLM_TENSOR_ATTN_Q_B, LLM_TENSOR_ATTN_KV_A_MQA, LLM_TENSOR_ATTN_KV_B, + LLM_TENSOR_ATTN_Q_MQA, + LLM_TENSOR_ATTN_Q_B_MQA, + LLM_TENSOR_ATTN_KV_A, + LLM_TENSOR_ATTN_K_MQA, + LLM_TENSOR_ATTN_K_B_TRANS, + LLM_TENSOR_ATTN_K_B, + LLM_TENSOR_ATTN_V_B, LLM_TENSOR_ATTN_Q_A_NORM, LLM_TENSOR_ATTN_KV_A_NORM, LLM_TENSOR_ATTN_SUB_NORM, diff --git a/src/llama-cparams.h b/src/llama-cparams.h index 252012f3d9405..6ebab857e236a 100644 --- a/src/llama-cparams.h +++ b/src/llama-cparams.h @@ -28,6 +28,7 @@ struct llama_cparams { bool causal_attn; bool offload_kqv; bool flash_attn; + bool mla_attn; bool no_perf; enum llama_pooling_type pooling_type; diff --git a/src/llama-kv-cache.cpp b/src/llama-kv-cache.cpp index feffdf0de52cf..384465fb68845 100644 --- a/src/llama-kv-cache.cpp +++ b/src/llama-kv-cache.cpp @@ -32,7 +32,7 @@ bool llama_kv_cache_init( cache.recurrent = llama_model_is_recurrent(&model); cache.v_trans = !cache.recurrent && !cparams.flash_attn; - cache.can_shift = 
!cache.recurrent && model.arch != LLM_ARCH_DEEPSEEK2; // not supported due to MLA + cache.can_shift = !cache.recurrent && model.arch != LLM_ARCH_DEEPSEEK2; // not supported due to MLA (or YaRN?) LLAMA_LOG_INFO("%s: kv_size = %d, offload = %d, type_k = '%s', type_v = '%s', n_layer = %d, can_shift = %d\n", __func__, kv_size, offload, ggml_type_name(type_k), ggml_type_name(type_v), n_layer, cache.can_shift); @@ -91,8 +91,21 @@ bool llama_kv_cache_init( return false; } - ggml_tensor * k = ggml_new_tensor_1d(ctx, type_k, n_embd_k_gqa*kv_size); - ggml_tensor * v = ggml_new_tensor_1d(ctx, type_v, n_embd_v_gqa*kv_size); + int64_t n_embd_k; + int64_t n_embd_v; + + // note: deepseek-mla stores the compressed versions + if (cparams.mla_attn && model.arch == LLM_ARCH_DEEPSEEK2) { + n_embd_k = hparams.n_lora_kv + hparams.n_rot; + n_embd_v = hparams.n_lora_kv; + } else { + n_embd_k = hparams.n_embd_k_gqa(i); + n_embd_v = hparams.n_embd_v_gqa(i); + } + + ggml_tensor * k = ggml_new_tensor_1d(ctx, type_k, n_embd_k*kv_size); + ggml_tensor * v = ggml_new_tensor_1d(ctx, type_v, n_embd_v*kv_size); + ggml_format_name(k, "cache_k_l%d", i); ggml_format_name(v, "cache_v_l%d", i); cache.k_l.push_back(k); diff --git a/src/llama-model.cpp b/src/llama-model.cpp index 1da4eae7e63e2..dc83718b968c6 100644 --- a/src/llama-model.cpp +++ b/src/llama-model.cpp @@ -2890,14 +2890,20 @@ bool llama_model::load_tensors(llama_model_loader & ml) { if (!is_lite) { layer.wq_a = create_tensor(tn(LLM_TENSOR_ATTN_Q_A, "weight", i), {n_embd, q_lora_rank}, 0); - layer.wq_b = create_tensor(tn(LLM_TENSOR_ATTN_Q_B, "weight", i), {q_lora_rank, n_head * n_embd_head_k}, 0); + layer.wq_b = create_tensor(tn(LLM_TENSOR_ATTN_Q_B, "weight", i), {q_lora_rank, n_head * n_embd_head_qk_nope}, 0); + layer.wq_b_mqa = create_tensor(tn(LLM_TENSOR_ATTN_Q_B_MQA, "weight", i), {q_lora_rank, n_head * n_embd_head_qk_rope}, 0); } else { - layer.wq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "weight", i), {n_embd, n_embd_k_gqa}, 0); + layer.wq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "weight", i), {n_embd, n_head * n_embd_head_qk_nope}, 0); + layer.wq_mqa = create_tensor(tn(LLM_TENSOR_ATTN_Q_MQA, "weight", i), {n_embd, n_head * n_embd_head_qk_rope}, 0); } - layer.wkv_a_mqa = create_tensor(tn(LLM_TENSOR_ATTN_KV_A_MQA, "weight", i), {n_embd, kv_lora_rank + (n_embd_head_qk_rope)}, 0); - layer.wkv_b = create_tensor(tn(LLM_TENSOR_ATTN_KV_B, "weight", i), {kv_lora_rank, n_head * (n_embd_head_qk_nope + n_embd_head_v)}, 0); - layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), { n_head * ( n_embd_head_v), n_embd}, 0); + layer.wkv_a = create_tensor(tn(LLM_TENSOR_ATTN_KV_A, "weight", i), {n_embd, kv_lora_rank}, 0); + layer.wk_mqa = create_tensor(tn(LLM_TENSOR_ATTN_K_MQA, "weight", i), {n_embd, n_embd_head_qk_rope}, 0); + layer.wk_b_trans = create_tensor(tn(LLM_TENSOR_ATTN_K_B_TRANS, "weight", i), {n_embd_head_qk_nope, n_head * kv_lora_rank}, 0); + layer.wk_b = create_tensor(tn(LLM_TENSOR_ATTN_K_B, "weight", i), {kv_lora_rank, n_head * n_embd_head_qk_nope}, 0); + layer.wv_b = create_tensor(tn(LLM_TENSOR_ATTN_V_B, "weight", i), {kv_lora_rank, n_head * n_embd_head_v}, 0); + + layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_head * n_embd_head_v, n_embd}, 0); layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0); diff --git a/src/llama-model.h b/src/llama-model.h index a7c30444786fd..1b9852402d7b5 100644 --- a/src/llama-model.h +++ b/src/llama-model.h @@ -152,23 +152,30 @@ struct llama_layer { struct ggml_tensor * attn_norm_enc = 
nullptr; // attention - struct ggml_tensor * wq = nullptr; - struct ggml_tensor * wk = nullptr; - struct ggml_tensor * wv = nullptr; - struct ggml_tensor * wo = nullptr; - struct ggml_tensor * wqkv = nullptr; - struct ggml_tensor * wq_a = nullptr; - struct ggml_tensor * wq_b = nullptr; - struct ggml_tensor * wkv_a_mqa = nullptr; - struct ggml_tensor * wkv_b = nullptr; - struct ggml_tensor * wq_cross = nullptr; - struct ggml_tensor * wk_cross = nullptr; - struct ggml_tensor * wv_cross = nullptr; - struct ggml_tensor * wo_cross = nullptr; - struct ggml_tensor * wq_enc = nullptr; - struct ggml_tensor * wk_enc = nullptr; - struct ggml_tensor * wv_enc = nullptr; - struct ggml_tensor * wo_enc = nullptr; + struct ggml_tensor * wq = nullptr; + struct ggml_tensor * wk = nullptr; + struct ggml_tensor * wv = nullptr; + struct ggml_tensor * wo = nullptr; + struct ggml_tensor * wqkv = nullptr; + struct ggml_tensor * wq_a = nullptr; + struct ggml_tensor * wq_b = nullptr; + struct ggml_tensor * wkv_a_mqa = nullptr; + struct ggml_tensor * wkv_b = nullptr; + struct ggml_tensor * wq_mqa = nullptr; + struct ggml_tensor * wq_b_mqa = nullptr; + struct ggml_tensor * wkv_a = nullptr; + struct ggml_tensor * wk_mqa = nullptr; + struct ggml_tensor * wk_b_trans = nullptr; + struct ggml_tensor * wk_b = nullptr; + struct ggml_tensor * wv_b = nullptr; + struct ggml_tensor * wq_cross = nullptr; + struct ggml_tensor * wk_cross = nullptr; + struct ggml_tensor * wv_cross = nullptr; + struct ggml_tensor * wo_cross = nullptr; + struct ggml_tensor * wq_enc = nullptr; + struct ggml_tensor * wk_enc = nullptr; + struct ggml_tensor * wv_enc = nullptr; + struct ggml_tensor * wo_enc = nullptr; // attention bias struct ggml_tensor * bq = nullptr; diff --git a/src/llama.cpp b/src/llama.cpp index 607f278615969..5be8f11410a47 100644 --- a/src/llama.cpp +++ b/src/llama.cpp @@ -156,8 +156,7 @@ static struct ggml_tensor * llm_build_inp_embd( static void llm_build_kv_store( struct ggml_context * ctx, - const llama_hparams & hparams, - const llama_cparams & cparams, + struct llama_context & lctx, const llama_kv_cache & kv, struct ggml_cgraph * graph, struct ggml_tensor * k_cur, @@ -166,28 +165,41 @@ static void llm_build_kv_store( int32_t kv_head, const llm_build_cb & cb, int64_t il) { - const int64_t n_ctx = cparams.n_ctx; + const llama_model & model = lctx.model; + const llama_hparams & hparams = lctx.model.hparams; + const llama_cparams & cparams = lctx.cparams; - const int64_t n_embd_k_gqa = hparams.n_embd_k_gqa(il); - const int64_t n_embd_v_gqa = hparams.n_embd_v_gqa(il); + const int64_t n_ctx = cparams.n_ctx; GGML_ASSERT(kv.size == n_ctx); - struct ggml_tensor * k_cache_view = ggml_view_1d(ctx, kv.k_l[il], n_tokens*n_embd_k_gqa, ggml_row_size(kv.k_l[il]->type, n_embd_k_gqa)*kv_head); + int64_t n_embd_k; + int64_t n_embd_v; + + // note: deepseek-mla stores the compressed versions + if (cparams.mla_attn && model.arch == LLM_ARCH_DEEPSEEK2) { + n_embd_k = hparams.n_lora_kv + hparams.n_rot; + n_embd_v = hparams.n_lora_kv; + } else { + n_embd_k = hparams.n_embd_k_gqa(il); + n_embd_v = hparams.n_embd_v_gqa(il); + } + + struct ggml_tensor * k_cache_view = ggml_view_1d(ctx, kv.k_l[il], n_tokens*n_embd_k, ggml_row_size(kv.k_l[il]->type, n_embd_k)*kv_head); cb(k_cache_view, "k_cache_view", il); // note: storing RoPE-ed version of K in the KV cache ggml_build_forward_expand(graph, ggml_cpy(ctx, k_cur, k_cache_view)); - assert(v_cur->ne[0] == n_embd_v_gqa && v_cur->ne[1] == n_tokens); + assert(v_cur->ne[0] == n_embd_v && v_cur->ne[1] == 
n_tokens); struct ggml_tensor * v_cache_view = nullptr; if (cparams.flash_attn) { - v_cache_view = ggml_view_1d(ctx, kv.v_l[il], n_tokens*n_embd_v_gqa, ggml_row_size(kv.v_l[il]->type, n_embd_v_gqa)*kv_head); + v_cache_view = ggml_view_1d(ctx, kv.v_l[il], n_tokens*n_embd_v, ggml_row_size(kv.v_l[il]->type, n_embd_v)*kv_head); } else { // note: the V cache is transposed when not using flash attention - v_cache_view = ggml_view_2d(ctx, kv.v_l[il], n_tokens, n_embd_v_gqa, + v_cache_view = ggml_view_2d(ctx, kv.v_l[il], n_tokens, n_embd_v, ( n_ctx)*ggml_element_size(kv.v_l[il]), (kv_head)*ggml_element_size(kv.v_l[il])); @@ -542,8 +554,9 @@ static struct ggml_tensor * llm_build_kqv( struct llama_context & lctx, const llama_kv_cache & kv, struct ggml_cgraph * graph, + struct ggml_tensor * wv_b, struct ggml_tensor * wo, - struct ggml_tensor * wo_b, + struct ggml_tensor * bo, struct ggml_tensor * q_cur, struct ggml_tensor * kq_mask, int32_t n_tokens, @@ -555,13 +568,33 @@ static struct ggml_tensor * llm_build_kqv( const llama_hparams & hparams = lctx.model.hparams; const llama_cparams & cparams = lctx.cparams; - const int64_t n_ctx = cparams.n_ctx; - const int64_t n_head = hparams.n_head(il); - const int64_t n_head_kv = hparams.n_head_kv(il); - const int64_t n_embd_head_k = hparams.n_embd_head_k; - const int64_t n_embd_k_gqa = hparams.n_embd_k_gqa(il); - const int64_t n_embd_head_v = hparams.n_embd_head_v; - const int64_t n_embd_v_gqa = hparams.n_embd_v_gqa(il); + const int64_t n_ctx = cparams.n_ctx; + const int64_t n_head = hparams.n_head(il); + + int64_t n_head_kv; + int64_t n_embd_k; + int64_t n_embd_head_k; + int64_t n_embd_v; + int64_t n_embd_head_v; + int64_t n_embd_head_v_final; + + // note: MLA caches compressed KV and acts as MQA until the final wv_b expansion + if (cparams.mla_attn && model.arch == LLM_ARCH_DEEPSEEK2) { + GGML_ASSERT(wv_b); + n_head_kv = 1; + n_embd_head_k = hparams.n_lora_kv + hparams.n_rot; + n_embd_k = n_embd_head_k; + n_embd_head_v = hparams.n_lora_kv; + n_embd_v = n_embd_head_v; + n_embd_head_v_final = hparams.n_embd_head_v; // after multiplying by wv_b + } else { + n_head_kv = hparams.n_head_kv(il); + n_embd_head_k = hparams.n_embd_head_k; + n_embd_k = hparams.n_embd_k_gqa(il); + n_embd_head_v = hparams.n_embd_head_v; + n_embd_v = hparams.n_embd_v_gqa(il); + n_embd_head_v_final = n_embd_head_v; + } struct ggml_tensor * q = ggml_permute(ctx, q_cur, 0, 2, 1, 3); cb(q, "q", il); @@ -569,7 +602,7 @@ static struct ggml_tensor * llm_build_kqv( struct ggml_tensor * k = ggml_view_3d(ctx, kv.k_l[il], n_embd_head_k, n_kv, n_head_kv, - ggml_row_size(kv.k_l[il]->type, n_embd_k_gqa), + ggml_row_size(kv.k_l[il]->type, n_embd_k), ggml_row_size(kv.k_l[il]->type, n_embd_head_k), 0); cb(k, "k", il); @@ -580,11 +613,14 @@ static struct ggml_tensor * llm_build_kqv( GGML_UNUSED(model); GGML_UNUSED(n_ctx); + // note: MLA creates embeddings too large for FA, see: https://github.com/ggml-org/llama.cpp/pull/12227 + GGML_ASSERT(!wv_b); + // split cached v into n_head heads (not transposed) struct ggml_tensor * v = ggml_view_3d(ctx, kv.v_l[il], n_embd_head_v, n_kv, n_head_kv, - ggml_row_size(kv.v_l[il]->type, n_embd_v_gqa), + ggml_row_size(kv.v_l[il]->type, n_embd_v), ggml_row_size(kv.v_l[il]->type, n_embd_head_v), 0); cb(v, "v", il); @@ -594,7 +630,7 @@ static struct ggml_tensor * llm_build_kqv( ggml_flash_attn_ext_set_prec(cur, GGML_PREC_F32); - cur = ggml_reshape_2d(ctx, cur, n_embd_head_v*n_head, n_tokens); + cur = ggml_reshape_2d(ctx, cur, n_embd_head_v_final*n_head, n_tokens); } else {
struct ggml_tensor * kq = ggml_mul_mat(ctx, k, q); cb(kq, "kq", il); @@ -637,10 +673,22 @@ static struct ggml_tensor * llm_build_kqv( struct ggml_tensor * kqv = ggml_mul_mat(ctx, v, kq); cb(kqv, "kqv", il); + // note: MLA needs to expand KQV from MQA into MHA + if (wv_b) { + struct ggml_tensor * wv_b_view = ggml_view_3d(ctx, wv_b, n_embd_head_v, n_embd_head_v_final, n_head, + ggml_row_size(model.layers[il].wv_b->type, n_embd_head_v), + ggml_row_size(model.layers[il].wv_b->type, n_embd_head_v * n_embd_head_v_final), + 0); + cb(wv_b_view, "wv_b_view", il); + + kqv = ggml_mul_mat(ctx, wv_b_view, kqv); + cb(kqv, "kqv_wv_b", il); + } + struct ggml_tensor * kqv_merged = ggml_permute(ctx, kqv, 0, 2, 1, 3); cb(kqv_merged, "kqv_merged", il); - cur = ggml_cont_2d(ctx, kqv_merged, n_embd_head_v*n_head, n_tokens); + cur = ggml_cont_2d(ctx, kqv_merged, n_embd_head_v_final*n_head, n_tokens); cb(cur, "kqv_merged_cont", il); } @@ -650,12 +698,12 @@ static struct ggml_tensor * llm_build_kqv( cur = llm_build_lora_mm(lctx, ctx, wo, cur); } - if (wo_b) { + if (bo) { cb(cur, "kqv_wo", il); } - if (wo_b) { - cur = ggml_add(ctx, cur, wo_b); + if (bo) { + cur = ggml_add(ctx, cur, bo); } return cur; @@ -666,8 +714,9 @@ static struct ggml_tensor * llm_build_kv( struct llama_context & lctx, const llama_kv_cache & kv, struct ggml_cgraph * graph, + struct ggml_tensor * wv_b, struct ggml_tensor * wo, - struct ggml_tensor * wo_b, + struct ggml_tensor * bo, struct ggml_tensor * k_cur, struct ggml_tensor * v_cur, struct ggml_tensor * q_cur, @@ -687,11 +736,11 @@ static struct ggml_tensor * llm_build_kv( ggml_build_forward_expand(graph, k_cur); ggml_build_forward_expand(graph, v_cur); - llm_build_kv_store(ctx, hparams, cparams, kv, graph, k_cur, v_cur, n_tokens, kv_head, cb, il); + llm_build_kv_store(ctx, lctx, kv, graph, k_cur, v_cur, n_tokens, kv_head, cb, il); struct ggml_tensor * cur; - cur = llm_build_kqv(ctx, lctx, kv, graph, wo, wo_b, q_cur, kq_mask, n_tokens, n_kv, kq_scale, cb, il); + cur = llm_build_kqv(ctx, lctx, kv, graph, wv_b, wo, bo, q_cur, kq_mask, n_tokens, n_kv, kq_scale, cb, il); cb(cur, "kqv_out", il); return cur; @@ -1546,7 +1595,7 @@ struct llm_build_context { cb(Kcur, "Kcur", il); cur = llm_build_kv(ctx0, lctx, kv_self, gf, - model.layers[il].wo, model.layers[il].bo, + nullptr, model.layers[il].wo, model.layers[il].bo, Kcur, Vcur, Qcur, KQ_mask, n_tokens, kv_head, n_kv, kq_scale, cb, il); } @@ -1723,7 +1772,7 @@ struct llm_build_context { cb(Kcur, "Kcur", il); cur = llm_build_kv(ctx0, lctx, kv_self, gf, - model.layers[il].wo, model.layers[il].bo, + nullptr, model.layers[il].wo, model.layers[il].bo, Kcur, Vcur, Qcur, KQ_mask, n_tokens, kv_head, n_kv, kq_scale, cb, il); } @@ -1861,7 +1910,7 @@ struct llm_build_context { cb(Kcur, "Kcur", il); cur = llm_build_kv(ctx0, lctx, kv_self, gf, - model.layers[il].wo, NULL, + nullptr, model.layers[il].wo, nullptr, Kcur, Vcur, Qcur, KQ_mask, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il); } @@ -1966,7 +2015,7 @@ struct llm_build_context { ); cb(Kcur, "Kcur", il); cur = llm_build_kv(ctx0, lctx, kv_self, gf, - model.layers[il].wo, NULL, + nullptr, model.layers[il].wo, nullptr, Kcur, Vcur, Qcur, KQ_mask, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il); } @@ -2087,7 +2136,7 @@ struct llm_build_context { cb(Kcur, "Kcur", il); cur = llm_build_kv(ctx0, lctx, kv_self, gf, - model.layers[il].wo, NULL, + nullptr, model.layers[il].wo, nullptr, Kcur, Vcur, Qcur, KQ_mask, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il); } @@ 
-2211,7 +2260,7 @@ struct llm_build_context { cb(Kcur, "Kcur", il); cur = llm_build_kv(ctx0, lctx, kv_self, gf, - model.layers[il].wo, model.layers[il].bo, + nullptr, model.layers[il].wo, model.layers[il].bo, Kcur, Vcur, Qcur, KQ_mask, n_tokens, kv_head, n_kv, 1.0f, cb, il); } @@ -2363,7 +2412,7 @@ struct llm_build_context { cb(Kcur, "Kcur", il); cur = llm_build_kv(ctx0, lctx, kv_self, gf, - model.layers[il].wo, NULL, + nullptr, model.layers[il].wo, nullptr, Kcur, Vcur, Qcur, KQ_mask, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il); } @@ -2475,7 +2524,7 @@ struct llm_build_context { Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens); cur = llm_build_kv(ctx0, lctx, kv_self, gf, - model.layers[il].wo, model.layers[il].bo, + nullptr, model.layers[il].wo, model.layers[il].bo, Kcur, Vcur, Qcur, KQ_mask, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il); } @@ -2569,7 +2618,7 @@ struct llm_build_context { cb(Qcur, "Qcur", il); cur = llm_build_kv(ctx0, lctx, kv_self, gf, - model.layers[il].wo, NULL, + nullptr, model.layers[il].wo, nullptr, Kcur, Vcur, Qcur, KQ_mask, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il); } @@ -2864,7 +2913,7 @@ struct llm_build_context { Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens); cur = llm_build_kv(ctx0, lctx, kv_self, gf, - model.layers[il].wo, model.layers[il].bo, + nullptr, model.layers[il].wo, model.layers[il].bo, Kcur, Vcur, Qcur, KQ_mask, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il); } @@ -2996,13 +3045,13 @@ struct llm_build_context { Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens); cur = llm_build_kv(ctx0, lctx, kv_self, gf, - model.layers[il].wo, model.layers[il].bo, + nullptr, model.layers[il].wo, model.layers[il].bo, Kcur, Vcur, Qcur, KQ_mask, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il); } else { Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens); cur = llm_build_kv(ctx0, lctx, kv_self, gf, - model.layers[il].wo, model.layers[il].bo, + nullptr, model.layers[il].wo, model.layers[il].bo, Kcur, Vcur, Qcur, KQ_mask, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il); } } @@ -3147,7 +3196,7 @@ struct llm_build_context { cb(Kcur, "Kcur", il); cur = llm_build_kv(ctx0, lctx, kv_self, gf, - model.layers[il].wo, NULL, + nullptr, model.layers[il].wo, nullptr, Kcur, Vcur, Qcur, KQ_mask, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il); } @@ -3266,7 +3315,7 @@ struct llm_build_context { cb(Kcur, "Kcur", il); cur = llm_build_kv(ctx0, lctx, kv_self, gf, - model.layers[il].wo, NULL, + nullptr, model.layers[il].wo, nullptr, Kcur, Vcur, Qcur, KQ_mask, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il); } @@ -3380,7 +3429,7 @@ struct llm_build_context { cb(Kcur, "Kcur", il); cur = llm_build_kv(ctx0, lctx, kv_self, gf, - model.layers[il].wo, model.layers[il].bo, + nullptr, model.layers[il].wo, model.layers[il].bo, Kcur, Vcur, Qcur, KQ_mask, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il); } @@ -3498,7 +3547,7 @@ struct llm_build_context { cb(Kcur, "Kcur", il); cur = llm_build_kv(ctx0, lctx, kv_self, gf, - model.layers[il].wo, model.layers[il].bo, + nullptr, model.layers[il].wo, model.layers[il].bo, Kcur, Vcur, Qcur, KQ_mask, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il); } @@ -3613,7 +3662,7 @@ struct llm_build_context { cb(Kcur, "Kcur", il); cur = llm_build_kv(ctx0, lctx, kv_self, gf, - model.layers[il].wo, model.layers[il].bo, + nullptr, 
model.layers[il].wo, model.layers[il].bo, Kcur, Vcur, Qcur, KQ_mask, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il); } @@ -3772,7 +3821,7 @@ struct llm_build_context { cb(Kcur, "Kcur", il); cur = llm_build_kv(ctx0, lctx, kv_self, gf, - model.layers[il].wo, model.layers[il].bo, + nullptr, model.layers[il].wo, model.layers[il].bo, Kcur, Vcur, Qcur, KQ_mask, n_tokens, kv_head, n_kv, 1.0f, cb, il); } @@ -3897,7 +3946,7 @@ struct llm_build_context { cb(Kcur, "Kcur", il); cur = llm_build_kv(ctx0, lctx, kv_self, gf, - model.layers[il].wo, model.layers[il].bo, + nullptr, model.layers[il].wo, model.layers[il].bo, Kcur, Vcur, Qcur, KQ_mask, n_tokens, kv_head, n_kv, 1.0f, cb, il); } @@ -4022,7 +4071,7 @@ struct llm_build_context { cb(Kcur, "Kcur", il); cur = llm_build_kv(ctx0, lctx, kv_self, gf, - model.layers[il].wo, NULL, + nullptr, model.layers[il].wo, nullptr, Kcur, Vcur, Qcur, KQ_mask, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il); } struct ggml_tensor * sa_out = cur; @@ -4124,7 +4173,7 @@ struct llm_build_context { Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens); cur = llm_build_kv(ctx0, lctx, kv_self, gf, - model.layers[il].wo, model.layers[il].bo, + nullptr, model.layers[il].wo, model.layers[il].bo, Kcur, Vcur, Qcur, KQ_mask, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il); } @@ -4235,7 +4284,7 @@ struct llm_build_context { cb(Kcur, "Kcur", il); cur = llm_build_kv(ctx0, lctx, kv_self, gf, - model.layers[il].wo, model.layers[il].bo, + nullptr, model.layers[il].wo, model.layers[il].bo, Kcur, Vcur, Qcur, KQ_mask, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il); } @@ -4355,7 +4404,7 @@ struct llm_build_context { cb(Kcur, "Kcur", il); cur = llm_build_kv(ctx0, lctx, kv_self, gf, - model.layers[il].wo, NULL, + nullptr, model.layers[il].wo, nullptr, Kcur, Vcur, Qcur, KQ_mask, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il); } @@ -4473,7 +4522,7 @@ struct llm_build_context { cb(Kcur, "Kcur", il); cur = llm_build_kv(ctx0, lctx, kv_self, gf, - model.layers[il].wo, model.layers[il].bo, + nullptr, model.layers[il].wo, model.layers[il].bo, Kcur, Vcur, Qcur, KQ_mask, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il); } @@ -4667,7 +4716,7 @@ struct llm_build_context { cb(k_states, "k_states", il); cur = llm_build_kv(ctx0, lctx, kv_self, gf, - model.layers[il].wo, NULL, + nullptr, model.layers[il].wo, nullptr, k_states, v_states, q_states, KQ_mask, n_tokens, kv_head, n_kv, kq_scale, cb, il); } @@ -4789,7 +4838,7 @@ struct llm_build_context { cb(Kcur, "Kcur", il); cur = llm_build_kv(ctx0, lctx, kv_self, gf, - model.layers[il].wo, NULL, + nullptr, model.layers[il].wo, nullptr, Kcur, Vcur, Qcur, KQ_mask, n_tokens, kv_head, n_kv, 1.0f, cb, il); } @@ -4908,7 +4957,7 @@ struct llm_build_context { cb(Kcur, "Kcur", il); cur = llm_build_kv(ctx0, lctx, kv_self, gf, - model.layers[il].wo, NULL, + nullptr, model.layers[il].wo, nullptr, Kcur, Vcur, Qcur, KQ_mask_l, n_tokens, kv_head, n_kv, 1.0f, cb, il); } @@ -5045,7 +5094,7 @@ struct llm_build_context { cb(Kcur, "Kcur", il); cur = llm_build_kv(ctx0, lctx, kv_self, gf, - model.layers[il].wo, model.layers[il].bo, + nullptr, model.layers[il].wo, model.layers[il].bo, Kcur, Vcur, Qcur, KQ_mask, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il); } @@ -5244,7 +5293,7 @@ struct llm_build_context { cb(Kcur, "Kcur", il); cur = llm_build_kv(ctx0, lctx, kv_self, gf, - model.layers[il].wo, model.layers[il].bo, + nullptr, model.layers[il].wo, 
model.layers[il].bo, Kcur, Vcur, Qcur, KQ_mask, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il); } @@ -5381,8 +5430,9 @@ struct llm_build_context { cb(Kcur, "Kcur", il); } - cur = llm_build_kv(ctx0, lctx, kv_self, gf, model.layers[il].wo, model.layers[il].bo, Kcur, Vcur, Qcur, - KQ_mask_l, n_tokens, kv_head, n_kv, 1.0f / sqrtf(float(n_embd_head)), cb, il); + cur = llm_build_kv(ctx0, lctx, kv_self, gf, + nullptr, model.layers[il].wo, model.layers[il].bo, + Kcur, Vcur, Qcur, KQ_mask_l, n_tokens, kv_head, n_kv, 1.0f / sqrtf(float(n_embd_head)), cb, il); } if (il == n_layer - 1) { @@ -5507,7 +5557,7 @@ struct llm_build_context { cb(Kcur, "Kcur", il); cur = llm_build_kv(ctx0, lctx, kv_self, gf, - model.layers[il].wo, nullptr, + nullptr, model.layers[il].wo, nullptr, Kcur, Vcur, Qcur, KQ_mask, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il); } @@ -5626,7 +5676,7 @@ struct llm_build_context { cb(Kcur, "Kcur_rope", il); cur = llm_build_kv(ctx0, lctx, kv_self, gf, - model.layers[il].wo, NULL, + nullptr, model.layers[il].wo, nullptr, Kcur, Vcur, Qcur, KQ_mask, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il); } @@ -5758,7 +5808,7 @@ struct llm_build_context { cb(Kcur, "Kcur_rope", il); cur = llm_build_kv(ctx0, lctx, kv_self, gf, - model.layers[il].wo, NULL, + nullptr, model.layers[il].wo, nullptr, Kcur, Vcur, Qcur, KQ_mask, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il); } @@ -5888,7 +5938,7 @@ struct llm_build_context { cb(Qcur, "Vcur", il); cur = llm_build_kv(ctx0, lctx, kv_self, gf, - model.layers[il].wo, NULL, + nullptr, model.layers[il].wo, nullptr, Kcur, Vcur, Qcur, KQ_mask, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il); } @@ -5997,7 +6047,7 @@ struct llm_build_context { cb(Kcur, "Kcur", il); cur = llm_build_kv(ctx0, lctx, kv_self, gf, - model.layers[il].wo, model.layers[il].bo, + nullptr, model.layers[il].wo, model.layers[il].bo, Kcur, Vcur, Qcur, KQ_mask, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il); } @@ -6140,7 +6190,7 @@ struct llm_build_context { cb(Kcur, "Kcur", il); cur = llm_build_kv(ctx0, lctx, kv_self, gf, - model.layers[il].wo, NULL, + nullptr, model.layers[il].wo, nullptr, Kcur, Vcur, Qcur, KQ_mask, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il); } @@ -6289,7 +6339,7 @@ struct llm_build_context { cb(Kcur, "Kcur", il); cur = llm_build_kv(ctx0, lctx, kv_self, gf, - model.layers[il].wo, model.layers[il].bo, + nullptr, model.layers[il].wo, model.layers[il].bo, Kcur, Vcur, Qcur, KQ_mask, n_tokens, kv_head, n_kv, kq_scale, cb, il); } @@ -6388,8 +6438,8 @@ struct llm_build_context { const float kq_scale = 1.0f*mscale*mscale/sqrtf(float(hparams.n_embd_head_k)); const float attn_factor_scaled = 1.0f / (1.0f + 0.1f * logf(1.0f / freq_scale)); - const uint32_t n_embd_head_qk_rope = hparams.n_rot; const uint32_t n_embd_head_qk_nope = hparams.n_embd_head_k - hparams.n_rot; + const uint32_t n_embd_head_qk_rope = hparams.n_rot; const uint32_t kv_lora_rank = hparams.n_lora_kv; struct ggml_tensor * cur; @@ -6407,7 +6457,6 @@ struct llm_build_context { for (int il = 0; il < n_layer; ++il) { struct ggml_tensor * inpSA = inpL; - // norm cur = llm_build_norm(ctx0, inpL, hparams, model.layers[il].attn_norm, NULL, LLM_NORM_RMS, cb, il); @@ -6415,115 +6464,158 @@ struct llm_build_context { // self_attention { - struct ggml_tensor * q = NULL; + struct ggml_tensor * q_nope; + struct ggml_tensor * q_mqa; if (!is_lite) { // {n_embd, q_lora_rank} * {n_embd, n_tokens} -> {q_lora_rank, 
n_tokens} - q = ggml_mul_mat(ctx0, model.layers[il].wq_a, cur); - cb(q, "q", il); + struct ggml_tensor * q_compressed = ggml_mul_mat(ctx0, model.layers[il].wq_a, cur); + cb(q_compressed, "q_compressed", il); - q = llm_build_norm(ctx0, q, hparams, + q_compressed = llm_build_norm(ctx0, q_compressed, hparams, model.layers[il].attn_q_a_norm, NULL, LLM_NORM_RMS, cb, il); - cb(q, "q", il); + cb(q_compressed, "q_compressed_norm", il); + + // {q_lora_rank, n_head * n_embd_head_qk_nope} * {q_lora_rank, n_tokens} -> {n_head * n_embd_head_qk_nope, n_tokens} + q_nope = ggml_mul_mat(ctx0, model.layers[il].wq_b, q_compressed); + cb(q_nope, "q_nope", il); - // {q_lora_rank, n_head * hparams.n_embd_head_k} * {q_lora_rank, n_tokens} -> {n_head * hparams.n_embd_head_k, n_tokens} - q = ggml_mul_mat(ctx0, model.layers[il].wq_b, q); - cb(q, "q", il); + // {q_lora_rank, n_head * n_embd_head_qk_rope} * {q_lora_rank, n_tokens} -> {n_head * n_embd_head_qk_rope, n_tokens} + q_mqa = ggml_mul_mat(ctx0, model.layers[il].wq_b_mqa, q_compressed); + cb(q_mqa, "q_mqa", il); } else { - q = ggml_mul_mat(ctx0, model.layers[il].wq, cur); - cb(q, "q", il); + // {n_embd, n_head * n_embd_head_qk_nope} * {n_embd, n_tokens} -> {n_head * n_embd_head_qk_nope, n_tokens} + q_nope = ggml_mul_mat(ctx0, model.layers[il].wq, cur); + cb(q_nope, "q_nope", il); + + // {n_embd, n_head * n_embd_head_qk_rope} * {n_embd, n_tokens} -> {n_head * n_embd_head_qk_rope, n_tokens} + q_mqa = ggml_mul_mat(ctx0, model.layers[il].wq_mqa, cur); + cb(q_mqa, "q_mqa", il); } - // split into {n_head * n_embd_head_qk_nope, n_tokens} - struct ggml_tensor * q_nope = ggml_view_3d(ctx0, q, n_embd_head_qk_nope, n_head, n_tokens, - ggml_row_size(q->type, hparams.n_embd_head_k), - ggml_row_size(q->type, hparams.n_embd_head_k * n_head), + // {n_embd_head_qk_nope, n_head, n_tokens} + struct ggml_tensor * q_nope_view = ggml_view_3d(ctx0, q_nope, n_embd_head_qk_nope, n_head, n_tokens, + ggml_row_size(q_nope->type, n_embd_head_qk_nope), + ggml_row_size(q_nope->type, n_head * n_embd_head_qk_nope), 0); - cb(q_nope, "q_nope", il); - - // and {n_head * n_embd_head_qk_rope, n_tokens} - struct ggml_tensor * q_pe = ggml_view_3d(ctx0, q, n_embd_head_qk_rope, n_head, n_tokens, - ggml_row_size(q->type, hparams.n_embd_head_k), - ggml_row_size(q->type, hparams.n_embd_head_k * n_head), - ggml_row_size(q->type, n_embd_head_qk_nope)); - cb(q_pe, "q_pe", il); - - // {n_embd, kv_lora_rank + n_embd_head_qk_rope} * {n_embd, n_tokens} -> {kv_lora_rank + n_embd_head_qk_rope, n_tokens} - struct ggml_tensor * kv_pe_compresseed = ggml_mul_mat(ctx0, model.layers[il].wkv_a_mqa, cur); - cb(kv_pe_compresseed, "kv_pe_compresseed", il); + cb(q_nope_view, "q_nope_view", il); - // split into {kv_lora_rank, n_tokens} - struct ggml_tensor * kv_compressed = ggml_view_2d(ctx0, kv_pe_compresseed, kv_lora_rank, n_tokens, - kv_pe_compresseed->nb[1], + // {n_embd_head_qk_rope, n_head, n_tokens} + struct ggml_tensor * q_mqa_view = ggml_view_3d(ctx0, q_mqa, n_embd_head_qk_rope, n_head, n_tokens, + ggml_row_size(q_mqa->type, n_embd_head_qk_rope), + ggml_row_size(q_mqa->type, n_head * n_embd_head_qk_rope), 0); - cb(kv_compressed, "kv_compressed", il); + cb(q_mqa_view, "q_mqa_view", il); - // and {n_embd_head_qk_rope, n_tokens} - struct ggml_tensor * k_pe = ggml_view_3d(ctx0, kv_pe_compresseed, n_embd_head_qk_rope, 1, n_tokens, - kv_pe_compresseed->nb[1], - kv_pe_compresseed->nb[1], - ggml_row_size(kv_pe_compresseed->type, kv_lora_rank)); - cb(k_pe, "k_pe", il); - - // TODO: the CUDA backend used to not support 
non-cont. (RMS) norm, investigate removing ggml_cont - kv_compressed = ggml_cont(ctx0, kv_compressed); - kv_compressed = llm_build_norm(ctx0, kv_compressed, hparams, - model.layers[il].attn_kv_a_norm, NULL, - LLM_NORM_RMS, cb, il); - cb(kv_compressed, "kv_compressed", il); + q_mqa_view = ggml_rope_ext( + ctx0, q_mqa_view, inp_pos, nullptr, + n_rot, rope_type, n_ctx_orig, freq_base, freq_scale, + ext_factor, attn_factor_scaled, beta_fast, beta_slow + ); + cb(q_mqa_view, "q_mqa_view_rope", il); + + // {n_embd, kv_lora_rank} * {n_embd, n_tokens} -> {kv_lora_rank, n_tokens} + struct ggml_tensor * kv_compressed = ggml_mul_mat(ctx0, model.layers[il].wkv_a, cur); + cb(kv_compressed, "kv_compressed", il); + + kv_compressed = llm_build_norm(ctx0, kv_compressed, hparams, + model.layers[il].attn_kv_a_norm, NULL, + LLM_NORM_RMS, cb, il); + cb(kv_compressed, "kv_compressed_norm", il); + + // {n_embd, n_embd_head_qk_rope} * {n_embd, n_tokens} -> {n_embd_head_qk_rope, n_tokens} + struct ggml_tensor * k_mqa = ggml_mul_mat(ctx0, model.layers[il].wk_mqa, cur); + cb(k_mqa, "k_mqa", il); + + // {n_embd_head_qk_rope, 1, n_tokens} + struct ggml_tensor * k_mqa_view = ggml_view_3d(ctx0, k_mqa, n_embd_head_qk_rope, 1, n_tokens, + ggml_row_size(k_mqa->type, n_embd_head_qk_rope), + ggml_row_size(k_mqa->type, n_embd_head_qk_rope), + 0); + cb(k_mqa_view, "k_mqa_view", il); + + k_mqa_view = ggml_rope_ext( + ctx0, k_mqa_view, inp_pos, nullptr, + n_rot, rope_type, n_ctx_orig, freq_base, freq_scale, + ext_factor, attn_factor_scaled, beta_fast, beta_slow + ); + cb(k_mqa_view, "k_mqa_view_rope", il); + + // non-MLA + if (!cparams.mla_attn) { + // {kv_lora_rank, n_head * n_embd_head_qk_nope} * {kv_lora_rank, n_tokens} -> {n_head * n_embd_head_qk_nope, n_tokens} + struct ggml_tensor * k_nope = ggml_mul_mat(ctx0, model.layers[il].wk_b, kv_compressed); + cb(k_nope, "k_nope", il); + + // {n_embd_head_qk_nope, n_head, n_tokens} + struct ggml_tensor * k_nope_view = ggml_view_3d(ctx0, k_nope, n_embd_head_qk_nope, n_head, n_tokens, + ggml_row_size(k_nope->type, n_embd_head_qk_nope), + ggml_row_size(k_nope->type, n_head * n_embd_head_qk_nope), + 0); + cb(k_nope_view, "k_nope_view", il); + + // {n_embd_head_qk_nope + n_embd_head_qk_rope, n_head, n_tokens} + struct ggml_tensor * q_states = ggml_concat(ctx0, q_nope_view, q_mqa_view, 0); + cb(q_states, "q_states", il); + + // {n_embd_head_qk_nope + n_embd_head_qk_rope, n_head, n_tokens} + struct ggml_tensor * k_states = ggml_concat(ctx0, k_nope_view, ggml_repeat(ctx0, k_mqa_view, q_mqa_view), 0); + cb(k_states, "k_states", il); + + // {kv_lora_rank, n_head * n_embd_head_v} * {kv_lora_rank, n_tokens} -> {n_head * n_embd_head_v, n_tokens} + struct ggml_tensor * v_states = ggml_mul_mat(ctx0, model.layers[il].wv_b, kv_compressed); + cb(v_states, "v_states", il); + + cur = llm_build_kv(ctx0, lctx, kv_self, gf, + nullptr, model.layers[il].wo, nullptr, + k_states, v_states, q_states, KQ_mask, n_tokens, kv_head, n_kv, kq_scale, cb, il); + } + else { + // {n_embd_head_qk_nope, kv_lora_rank, n_head} + struct ggml_tensor * wk_b_trans_view = ggml_view_3d(ctx0, model.layers[il].wk_b_trans, + n_embd_head_qk_nope, kv_lora_rank, n_head, + ggml_row_size(model.layers[il].wk_b->type, n_embd_head_qk_nope), + ggml_row_size(model.layers[il].wk_b->type, kv_lora_rank * n_embd_head_qk_nope), + 0); + cb(wk_b_trans_view, "wk_b_trans_view", il); - // {kv_lora_rank, n_head * (n_embd_head_qk_nope + n_embd_head_v)} * {kv_lora_rank, n_tokens} -> {n_head * (n_embd_head_qk_nope + n_embd_head_v), n_tokens} - struct 
ggml_tensor * kv = ggml_mul_mat(ctx0, model.layers[il].wkv_b, kv_compressed); - cb(kv, "kv", il); + // {n_embd_head_qk_nope, n_tokens, n_head} + q_nope_view = ggml_permute(ctx0, q_nope_view, 0, 2, 1, 3); + cb(q_nope_view, "q_nope_view_perm", il); - // split into {n_head * n_embd_head_qk_nope, n_tokens} - struct ggml_tensor * k_nope = ggml_view_3d(ctx0, kv, n_embd_head_qk_nope, n_head, n_tokens, - ggml_row_size(kv->type, n_embd_head_qk_nope + hparams.n_embd_head_v), - ggml_row_size(kv->type, n_head * (n_embd_head_qk_nope + hparams.n_embd_head_v)), - 0); - cb(k_nope, "k_nope", il); + // {n_embd_head_qk_nope, kv_lora_rank, n_head} * {n_embd_head_qk_nope, n_tokens, n_head} = {kv_lora_rank, n_tokens, n_head} + struct ggml_tensor * q_nope_absorbed = ggml_mul_mat(ctx0, wk_b_trans_view, q_nope_view); + cb(q_nope_absorbed, "q_nope_absorbed", il); - // and {n_head * n_embd_head_v, n_tokens} - struct ggml_tensor * v_states = ggml_view_3d(ctx0, kv, hparams.n_embd_head_v, n_head, n_tokens, - ggml_row_size(kv->type, (n_embd_head_qk_nope + hparams.n_embd_head_v)), - ggml_row_size(kv->type, (n_embd_head_qk_nope + hparams.n_embd_head_v)*n_head), - ggml_row_size(kv->type, (n_embd_head_qk_nope))); - cb(v_states, "v_states", il); + // {kv_lora_rank, n_head, n_tokens} + q_nope_absorbed = ggml_permute(ctx0, q_nope_absorbed, 0, 2, 1, 3); + cb(q_nope_absorbed, "q_nope_absorbed_perm", il); - v_states = ggml_cont(ctx0, v_states); - cb(v_states, "v_states", il); + // {kv_lora_rank + n_embd_head_qk_rope, n_head, n_tokens} + struct ggml_tensor * q_states = ggml_concat(ctx0, q_nope_absorbed, q_mqa_view, 0); + cb(q_states, "q_states", il); - v_states = ggml_view_2d(ctx0, v_states, hparams.n_embd_head_v * n_head, n_tokens, - ggml_row_size(kv->type, hparams.n_embd_head_v * n_head), - 0); - cb(v_states, "v_states", il); - - q_pe = ggml_cont(ctx0, q_pe); // TODO: the CUDA backend used to not support non-cont. RoPE, investigate removing this - q_pe = ggml_rope_ext( - ctx0, q_pe, inp_pos, nullptr, - n_rot, rope_type, n_ctx_orig, freq_base, freq_scale, - ext_factor, attn_factor_scaled, beta_fast, beta_slow - ); - cb(q_pe, "q_pe", il); + // {kv_lora_rank, 1, n_tokens} + struct ggml_tensor * kv_compressed_view = ggml_view_3d(ctx0, kv_compressed, + kv_lora_rank, 1, n_tokens, + ggml_row_size(k_mqa->type, kv_lora_rank), + ggml_row_size(k_mqa->type, kv_lora_rank), + 0); + cb(kv_compressed_view, "kv_compressed_view", il); - // shared RoPE key - k_pe = ggml_cont(ctx0, k_pe); // TODO: the CUDA backend used to not support non-cont.
RoPE, investigate removing this - k_pe = ggml_rope_ext( - ctx0, k_pe, inp_pos, nullptr, - n_rot, rope_type, n_ctx_orig, freq_base, freq_scale, - ext_factor, attn_factor_scaled, beta_fast, beta_slow - ); - cb(k_pe, "k_pe", il); + // {kv_lora_rank + n_embd_head_qk_rope, 1, n_tokens} + struct ggml_tensor * k_states = ggml_concat(ctx0, kv_compressed_view, k_mqa_view, 0); + cb(k_states, "k_states", il); - struct ggml_tensor * q_states = ggml_concat(ctx0, q_nope, q_pe, 0); - cb(q_states, "q_states", il); + // {kv_lora_rank, 1, n_tokens} + struct ggml_tensor * v_states = kv_compressed; + cb(v_states, "v_states", il); - struct ggml_tensor * k_states = ggml_concat(ctx0, k_nope, ggml_repeat(ctx0, k_pe, q_pe), 0); - cb(k_states, "k_states", il); + cur = llm_build_kv(ctx0, lctx, kv_self, gf, + model.layers[il].wv_b, model.layers[il].wo, nullptr, + k_states, v_states, q_states, KQ_mask, n_tokens, kv_head, n_kv, kq_scale, cb, il); + } - cur = llm_build_kv(ctx0, lctx, kv_self, gf, - model.layers[il].wo, NULL, - k_states, v_states, q_states, KQ_mask, n_tokens, kv_head, n_kv, kq_scale, cb, il); } if (il == n_layer - 1) { @@ -6680,7 +6772,7 @@ struct llm_build_context { cb(Kcur, "Kcur", il); cur = llm_build_kv(ctx0, lctx, kv_self, gf, - NULL, NULL, + nullptr, nullptr, nullptr, Kcur, Vcur, Qcur, KQ_mask, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il); cur = llm_build_norm(ctx0, cur, hparams, @@ -6932,7 +7024,7 @@ struct llm_build_context { struct ggml_tensor * Vcur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wv, cur); cb(Vcur, "Vcur", il); - llm_build_kv_store(ctx0, hparams, cparams, kv_self, gf, Kcur, Vcur, n_tokens, kv_head, cb, il); + llm_build_kv_store(ctx0, lctx, kv_self, gf, Kcur, Vcur, n_tokens, kv_head, cb, il); struct ggml_tensor * k = ggml_view_3d(ctx0, kv_self.k_l[il], @@ -7134,7 +7226,7 @@ struct llm_build_context { Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens); cur = llm_build_kv(ctx0, lctx, kv_self, gf, - model.layers[il].wo, model.layers[il].bo, + nullptr, model.layers[il].wo, model.layers[il].bo, Kcur, Vcur, Qcur, KQ_mask, n_tokens, kv_head, n_kv, 1.0f/float(n_embd_head), cb, il); } @@ -7260,7 +7352,7 @@ struct llm_build_context { cb(Kcur, "Kcur_rope", il); cur = llm_build_kv(ctx0, lctx, kv_self, gf, - model.layers[il].wo, NULL, + nullptr, model.layers[il].wo, nullptr, Kcur, Vcur, Qcur, KQ_mask, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il); } @@ -7379,7 +7471,7 @@ struct llm_build_context { cb(Kcur, "Kcur", il); cur = llm_build_kv(ctx0, lctx, kv_self, gf, - model.layers[il].wo, model.layers[il].bo, + nullptr, model.layers[il].wo, model.layers[il].bo, Kcur, Vcur, Qcur, KQ_mask, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il); } @@ -7505,7 +7597,7 @@ struct llm_build_context { cb(Kcur, "Kcur", il); cur = llm_build_kv(ctx0, lctx, kv_self, gf, - model.layers[il].wo, model.layers[il].bo, + nullptr, model.layers[il].wo, model.layers[il].bo, Kcur, Vcur, Qcur, KQ_mask, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il); } @@ -7876,7 +7968,7 @@ struct llm_build_context { cb(Kcur, "Kcur", il); cur = llm_build_kv(ctx0, lctx, kv_self, gf, - model.layers[il].wo, nullptr, + nullptr, model.layers[il].wo, nullptr, Kcur, Vcur, Qcur, KQ_mask, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il); if (hparams.swin_norm) { From 9c0eb4fdaad0959945a7a619f24c9ed5e02402d3 Mon Sep 17 00:00:00 2001 From: juk Date: Mon, 10 Mar 2025 15:49:40 +0000 Subject: [PATCH 02/11] Added missing MLA flags --- common/common.cpp | 
1 + common/common.h | 2 +- include/llama.h | 1 + 3 files changed, 3 insertions(+), 1 deletion(-) diff --git a/common/common.cpp b/common/common.cpp index 6448b7b03d6d2..476e1e3764991 100644 --- a/common/common.cpp +++ b/common/common.cpp @@ -1132,6 +1132,7 @@ struct llama_context_params common_context_params_to_llama(const common_params & cparams.cb_eval_user_data = params.cb_eval_user_data; cparams.offload_kqv = !params.no_kv_offload; cparams.flash_attn = params.flash_attn; + cparams.mla_attn_attn = params.mla_attn_attn; cparams.no_perf = params.no_perf; if (params.reranking) { diff --git a/common/common.h b/common/common.h index 6d6f98b7b632b..207732a9957a8 100644 --- a/common/common.h +++ b/common/common.h @@ -325,7 +325,7 @@ struct common_params { bool simple_io = false; // improves compatibility with subprocesses and limited consoles bool cont_batching = true; // insert new sequences for decoding on-the-fly bool flash_attn = false; // flash attention - bool mla_attn = false; // mla attention + bool mla_attn = false; // MLA attention for deepseek2 bool no_perf = false; // disable performance metrics bool ctx_shift = true; // context shift on inifinite text generation diff --git a/include/llama.h b/include/llama.h index d62792c0a6760..be6dfc5d87154 100644 --- a/include/llama.h +++ b/include/llama.h @@ -343,6 +343,7 @@ extern "C" { bool embeddings; // if true, extract embeddings (together with logits) bool offload_kqv; // whether to offload the KQV ops (including the KV cache) to GPU bool flash_attn; // whether to use flash attention [EXPERIMENTAL] + bool mla_attn; // MLA attention for deepseek2 bool no_perf; // whether to measure performance timings // Abort callback From eefa5bb4e7c4abf115d3e3adc056bbc178f64338 Mon Sep 17 00:00:00 2001 From: juk Date: Mon, 10 Mar 2025 15:54:52 +0000 Subject: [PATCH 03/11] Added missing default to llama_context_default_params() --- src/llama.cpp | 2 ++ 1 file changed, 2 insertions(+) diff --git a/src/llama.cpp b/src/llama.cpp index 5be8f11410a47..368a954f48b86 100644 --- a/src/llama.cpp +++ b/src/llama.cpp @@ -9450,6 +9450,7 @@ struct llama_context_params llama_context_default_params() { /*.embeddings =*/ false, /*.offload_kqv =*/ true, /*.flash_attn =*/ false, + /*.mla_attn =*/ false, /*.no_perf =*/ true, /*.abort_callback =*/ nullptr, /*.abort_callback_data =*/ nullptr, @@ -9686,6 +9687,7 @@ struct llama_context * llama_init_from_model( cparams.embeddings = params.embeddings; cparams.offload_kqv = params.offload_kqv; cparams.flash_attn = params.flash_attn; + cparams.mla_attn = params.mla_attn; cparams.no_perf = params.no_perf; cparams.pooling_type = params.pooling_type; From 5636696808d3a8eac66d3ab04118fd1bc06617d2 Mon Sep 17 00:00:00 2001 From: juk Date: Mon, 10 Mar 2025 15:59:36 +0000 Subject: [PATCH 04/11] Fixed typo in mla_attn name --- common/common.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/common/common.cpp b/common/common.cpp index 476e1e3764991..b3438f2646bdf 100644 --- a/common/common.cpp +++ b/common/common.cpp @@ -1132,7 +1132,7 @@ struct llama_context_params common_context_params_to_llama(const common_params & cparams.cb_eval_user_data = params.cb_eval_user_data; cparams.offload_kqv = !params.no_kv_offload; cparams.flash_attn = params.flash_attn; - cparams.mla_attn_attn = params.mla_attn_attn; + cparams.mla_attn = params.mla_attn; cparams.no_perf = params.no_perf; if (params.reranking) { From e551cdcec0cfd27a3954761322dc17fa71914c90 Mon Sep 17 00:00:00 2001 From: juk Date: Mon, 10 Mar 2025 16:10:50 +0000 
Subject: [PATCH 05/11] Fixed bad formatting of build_deepseek2() --- src/llama.cpp | 126 ++++++++++++++++++++++++-------------------------- 1 file changed, 61 insertions(+), 65 deletions(-) diff --git a/src/llama.cpp b/src/llama.cpp index 368a954f48b86..78667cbb45b4d 100644 --- a/src/llama.cpp +++ b/src/llama.cpp @@ -6464,15 +6464,15 @@ struct llm_build_context { // self_attention { - struct ggml_tensor * q_nope; - struct ggml_tensor * q_mqa; + struct ggml_tensor * q_nope = nullptr; + struct ggml_tensor * q_mqa = nullptr; if (!is_lite) { // {n_embd, q_lora_rank} * {n_embd, n_tokens} -> {q_lora_rank, n_tokens} - struct ggml_tensor * q_compressed = ggml_mul_mat(ctx0, model.layers[il].wq_a, cur); + struct ggml_tensor * q_compressed = ggml_mul_mat(ctx0, model.layers[il].wq_a, cur); cb(q_compressed, "q_compressed", il); q_compressed = llm_build_norm(ctx0, q_compressed, hparams, - model.layers[il].attn_q_a_norm, NULL, + model.layers[il].attn_q_a_norm, nullptr, LLM_NORM_RMS, cb, il); cb(q_compressed, "q_compressed_norm", il); @@ -6507,70 +6507,66 @@ struct llm_build_context { 0); cb(q_mqa_view, "q_mqa_view", il); - q_mqa_view = ggml_rope_ext( - ctx0, q_mqa_view, inp_pos, nullptr, - n_rot, rope_type, n_ctx_orig, freq_base, freq_scale, - ext_factor, attn_factor_scaled, beta_fast, beta_slow + q_mqa_view = ggml_rope_ext(ctx0, q_mqa_view, inp_pos, nullptr, + n_rot, rope_type, n_ctx_orig, freq_base, freq_scale, + ext_factor, attn_factor_scaled, beta_fast, beta_slow ); cb(q_mqa_view, "q_mqa_view_rope", il); - // {n_embd, kv_lora_rank} * {n_embd, n_tokens} -> {kv_lora_rank, n_tokens} - struct ggml_tensor * kv_compressed = ggml_mul_mat(ctx0, model.layers[il].wkv_a, cur); - cb(kv_compressed, "kv_compressed", il); - - kv_compressed = llm_build_norm(ctx0, kv_compressed, hparams, - model.layers[il].attn_kv_a_norm, NULL, - LLM_NORM_RMS, cb, il); - cb(kv_compressed, "kv_compressed_norm", il); - - // {n_embd, n_embd_head_qk_rope} * {n_embd, n_tokens} -> {n_embd_head_qk_rope, n_tokens} - struct ggml_tensor * k_mqa = ggml_mul_mat(ctx0, model.layers[il].wk_mqa, cur); - cb(k_mqa, "k_mqa", il); - - // {n_embd_head_qk_rope, 1, n_tokens} - struct ggml_tensor * k_mqa_view = ggml_view_3d(ctx0, k_mqa, n_embd_head_qk_rope, 1, n_tokens, - ggml_row_size(k_mqa->type, n_embd_head_qk_rope), - ggml_row_size(k_mqa->type, n_embd_head_qk_rope), - 0); - cb(k_mqa_view, "k_mqa_view", il); - - k_mqa_view = ggml_rope_ext( - ctx0, k_mqa_view, inp_pos, nullptr, - n_rot, rope_type, n_ctx_orig, freq_base, freq_scale, - ext_factor, attn_factor_scaled, beta_fast, beta_slow - ); - cb(k_mqa_view, "k_mqa_view_rope", il); - - // non-MLA + // {n_embd, kv_lora_rank} * {n_embd, n_tokens} -> {kv_lora_rank, n_tokens} + struct ggml_tensor * kv_compressed = ggml_mul_mat(ctx0, model.layers[il].wkv_a, cur); + cb(kv_compressed, "kv_compressed", il); + + kv_compressed = llm_build_norm(ctx0, kv_compressed, hparams, + model.layers[il].attn_kv_a_norm, nullptr, + LLM_NORM_RMS, cb, il); + cb(kv_compressed, "kv_compressed_norm", il); + + // {n_embd, n_embd_head_qk_rope} * {n_embd, n_tokens} -> {n_embd_head_qk_rope, n_tokens} + struct ggml_tensor * k_mqa = ggml_mul_mat(ctx0, model.layers[il].wk_mqa, cur); + cb(k_mqa, "k_mqa", il); + + // {n_embd_head_qk_rope, 1, n_tokens} + struct ggml_tensor * k_mqa_view = ggml_view_3d(ctx0, k_mqa, n_embd_head_qk_rope, 1, n_tokens, + ggml_row_size(k_mqa->type, n_embd_head_qk_rope), + ggml_row_size(k_mqa->type, n_embd_head_qk_rope), + 0); + cb(k_mqa_view, "k_mqa_view", il); + + k_mqa_view = ggml_rope_ext(ctx0, k_mqa_view, inp_pos, 
nullptr, + n_rot, rope_type, n_ctx_orig, freq_base, freq_scale, + ext_factor, attn_factor_scaled, beta_fast, beta_slow + ); + cb(k_mqa_view, "k_mqa_view_rope", il); + if (!cparams.mla_attn) { - // {kv_lora_rank, n_head * n_embd_head_qk_nope} * {kv_lora_rank, n_tokens} -> {n_head * n_embd_head_qk_nope, n_tokens} - struct ggml_tensor * k_nope = ggml_mul_mat(ctx0, model.layers[il].wk_b, kv_compressed); - cb(k_nope, "k_nope", il); - - // {n_embd_head_qk_nope, n_head, n_tokens} - struct ggml_tensor * k_nope_view = ggml_view_3d(ctx0, k_nope, n_embd_head_qk_nope, n_head, n_tokens, - ggml_row_size(k_nope->type, n_embd_head_qk_nope), - ggml_row_size(k_nope->type, n_head * n_embd_head_qk_nope), - 0); - cb(k_nope_view, "k_nope_view", il); - - // {n_embd_head_qk_nope + n_embd_head_qk_rope, n_head, n_tokens} - struct ggml_tensor * q_states = ggml_concat(ctx0, q_nope_view, q_mqa_view, 0); - cb(q_states, "q_states", il); - - // {n_embd_head_qk_nope + n_embd_head_qk_rope, n_head, n_tokens} - struct ggml_tensor * k_states = ggml_concat(ctx0, k_nope_view, ggml_repeat(ctx0, k_mqa_view, q_mqa_view), 0); - cb(k_states, "k_states", il); - - // {kv_lora_rank, n_head * n_embd_head_v} * {kv_lora_rank, n_tokens} -> {n_head * n_embd_head_v, n_tokens} - struct ggml_tensor * v_states = ggml_mul_mat(ctx0, model.layers[il].wv_b, kv_compressed); - cb(v_states, "v_states", il); - - cur = llm_build_kv(ctx0, lctx, kv_self, gf, - nullptr, model.layers[il].wo, nullptr, - k_states, v_states, q_states, KQ_mask, n_tokens, kv_head, n_kv, kq_scale, cb, il); - } - else { + // {kv_lora_rank, n_head * n_embd_head_qk_nope} * {kv_lora_rank, n_tokens} -> {n_head * n_embd_head_qk_nope, n_tokens} + struct ggml_tensor * k_nope = ggml_mul_mat(ctx0, model.layers[il].wk_b, kv_compressed); + cb(k_nope, "k_nope", il); + + // {n_embd_head_qk_nope, n_head, n_tokens} + struct ggml_tensor * k_nope_view = ggml_view_3d(ctx0, k_nope, n_embd_head_qk_nope, n_head, n_tokens, + ggml_row_size(k_nope->type, n_embd_head_qk_nope), + ggml_row_size(k_nope->type, n_head * n_embd_head_qk_nope), + 0); + cb(k_nope_view, "k_nope_view", il); + + // {n_embd_head_qk_nope + n_embd_head_qk_rope, n_head, n_tokens} + struct ggml_tensor * q_states = ggml_concat(ctx0, q_nope_view, q_mqa_view, 0); + cb(q_states, "q_states", il); + + // {n_embd_head_qk_nope + n_embd_head_qk_rope, n_head, n_tokens} + struct ggml_tensor * k_states = ggml_concat(ctx0, k_nope_view, ggml_repeat(ctx0, k_mqa_view, q_mqa_view), 0); + cb(k_states, "k_states", il); + + // {kv_lora_rank, n_head * n_embd_head_v} * {kv_lora_rank, n_tokens} -> {n_head * n_embd_head_v, n_tokens} + struct ggml_tensor * v_states = ggml_mul_mat(ctx0, model.layers[il].wv_b, kv_compressed); + cb(v_states, "v_states", il); + + cur = llm_build_kv(ctx0, lctx, kv_self, gf, + nullptr, model.layers[il].wo, nullptr, + k_states, v_states, q_states, KQ_mask, n_tokens, kv_head, n_kv, kq_scale, cb, il); + } else { // {n_embd_head_qk_nope, kv_lora_rank, n_head} struct ggml_tensor * wk_b_trans_view = ggml_view_3d(ctx0, model.layers[il].wk_b_trans, n_embd_head_qk_nope, kv_lora_rank, n_head, @@ -6601,7 +6597,7 @@ struct llm_build_context { ggml_row_size(k_mqa->type, kv_lora_rank), ggml_row_size(k_mqa->type, kv_lora_rank), 0); - cb(kv_compressed_view, "kv_compressed_view", il); + cb(kv_compressed_view, "kv_compressed_view", il); // {kv_lora_rank + n_embd_head_qk_rope, 1, n_tokens} struct ggml_tensor * k_states = ggml_concat(ctx0, kv_compressed_view, k_mqa_view, 0); From 75e3d6ac286383f761e258593f36a1a137f67e88 Mon Sep 17 00:00:00 2001 From: juk 
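Note on the !cparams.mla_attn branch above: it keeps the model usable without the new flag by decompressing the cached latent back into full per-head keys and values, i.e. it turns MLA back into ordinary MHA before calling llm_build_kv(). Roughly, wk_b expands kv_compressed into the per-head NoPE keys, wv_b expands it into the per-head values, and the single RoPEd k_mqa row is repeated across all heads. The following is a minimal plain-PyTorch sketch of that linear algebra only, not the ggml graph: ggml's dimension ordering differs, the KV cache and callbacks are omitted, the weights are random placeholders, and the sizes are only roughly DeepSeek-V2-Lite-like assumptions.

    import torch

    torch.manual_seed(0)

    # assumed, roughly DeepSeek-V2-Lite-sized -- illustrative only
    n_head           = 16
    kv_lora_rank     = 512
    qk_nope_head_dim = 128
    qk_rope_head_dim = 64
    v_head_dim       = 128
    n_tokens         = 4

    kv_compressed = torch.randn(kv_lora_rank, n_tokens)        # cached latent, one column per token
    k_mqa         = torch.randn(qk_rope_head_dim, n_tokens)    # shared RoPEd part (a single head)

    wk_b = torch.randn(n_head * qk_nope_head_dim, kv_lora_rank)   # stand-in for model.layers[il].wk_b
    wv_b = torch.randn(n_head * v_head_dim,       kv_lora_rank)   # stand-in for model.layers[il].wv_b

    # (n_head*qk_nope, kv_lora_rank) @ (kv_lora_rank, n_tokens) -> (n_head*qk_nope, n_tokens)
    k_nope   = (wk_b @ kv_compressed).view(n_head, qk_nope_head_dim, n_tokens)
    v_states = (wv_b @ kv_compressed).view(n_head, v_head_dim, n_tokens)

    # repeat the single RoPEd row across heads and concatenate
    # (this hunk's order: NoPE first, RoPEd part last -- the next patch swaps it)
    k_states = torch.cat([k_nope, k_mqa.unsqueeze(0).expand(n_head, -1, -1)], dim=1)

    print(k_states.shape, v_states.shape)   # torch.Size([16, 192, 4]) torch.Size([16, 128, 4])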
Date: Mon, 10 Mar 2025 16:41:28 +0000 Subject: [PATCH 06/11] Switched to have first n_rot so can use context shifting later --- src/llama.cpp | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/src/llama.cpp b/src/llama.cpp index 78667cbb45b4d..33159d097f48e 100644 --- a/src/llama.cpp +++ b/src/llama.cpp @@ -6552,11 +6552,11 @@ struct llm_build_context { cb(k_nope_view, "k_nope_view", il); // {n_embd_head_qk_nope + n_embd_head_qk_rope, n_head, n_tokens} - struct ggml_tensor * q_states = ggml_concat(ctx0, q_nope_view, q_mqa_view, 0); + struct ggml_tensor * q_states = ggml_concat(ctx0, q_mqa_view, q_nope_view, 0); cb(q_states, "q_states", il); // {n_embd_head_qk_nope + n_embd_head_qk_rope, n_head, n_tokens} - struct ggml_tensor * k_states = ggml_concat(ctx0, k_nope_view, ggml_repeat(ctx0, k_mqa_view, q_mqa_view), 0); + struct ggml_tensor * k_states = ggml_concat(ctx0, ggml_repeat(ctx0, k_mqa_view, q_mqa_view), k_nope_view, 0); cb(k_states, "k_states", il); // {kv_lora_rank, n_head * n_embd_head_v} * {kv_lora_rank, n_tokens} -> {n_head * n_embd_head_v, n_tokens} @@ -6588,7 +6588,7 @@ struct llm_build_context { cb(q_nope_absorbed, "q_nope_absorbed_perm", il); // {kv_lora_rank + n_embd_head_qk_rope, n_head, n_tokens} - struct ggml_tensor * q_states = ggml_concat(ctx0, q_nope_absorbed, q_mqa_view, 0); + struct ggml_tensor * q_states = ggml_concat(ctx0, q_mqa_view, q_nope_absorbed, 0); cb(q_states, "q_states", il); // {kv_lora_rank, 1, n_tokens} @@ -6600,7 +6600,7 @@ struct llm_build_context { cb(kv_compressed_view, "kv_compressed_view", il); // {kv_lora_rank + n_embd_head_qk_rope, 1, n_tokens} - struct ggml_tensor * k_states = ggml_concat(ctx0, kv_compressed_view, k_mqa_view, 0); + struct ggml_tensor * k_states = ggml_concat(ctx0, k_mqa_view, kv_compressed_view, 0); cb(k_states, "k_states", il); // {kv_lora_rank, 1, n_tokens} From 559328274ad32825620d6f286de98725dbdf9338 Mon Sep 17 00:00:00 2001 From: juk Date: Mon, 10 Mar 2025 16:44:00 +0000 Subject: [PATCH 07/11] Changed comment as shifting *could* be implemented now --- src/llama-kv-cache.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/llama-kv-cache.cpp b/src/llama-kv-cache.cpp index 384465fb68845..37ab4adfdbd94 100644 --- a/src/llama-kv-cache.cpp +++ b/src/llama-kv-cache.cpp @@ -32,7 +32,7 @@ bool llama_kv_cache_init( cache.recurrent = llama_model_is_recurrent(&model); cache.v_trans = !cache.recurrent && !cparams.flash_attn; - cache.can_shift = !cache.recurrent && model.arch != LLM_ARCH_DEEPSEEK2; // not supported due to MLA (or YaRN?) 
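Patch 06 above only swaps the order of the ggml_concat arguments so that the RoPEd q_mqa/k_mqa slice lands in the first n_rot elements of every K and Q row, with the position-independent (NoPE / latent) part after it. That is the layout the generic K-shift and defrag code assumes: a context shift only has to re-rotate the first n_rot elements of each cached K vector and can leave the rest alone, which is why the next patch can soften the can_shift comment to "not supported yet". A small stand-in sketch of that property follows; toy_rope below is a simplified rotation, not ggml_rope_ext, and the sizes are arbitrary.

    import torch

    n_rot, n_nope, n_tokens = 64, 128, 3

    def toy_rope(x, pos):
        # simplified stand-in for ggml_rope_ext: rotate the two halves of the
        # n_rot-sized block by an angle proportional to the position delta
        half = x.shape[0] // 2
        theta = torch.tensor(pos * 0.01)
        cos, sin = torch.cos(theta), torch.sin(theta)
        a, b = x[:half], x[half:]
        return torch.cat([a * cos - b * sin, a * sin + b * cos], dim=0)

    k_rope = torch.randn(n_rot, n_tokens)    # k_mqa_view after RoPE (shared across heads)
    k_nope = torch.randn(n_nope, n_tokens)   # position-independent part

    # patch 06 layout: RoPEd block first, NoPE/latent block second
    k = torch.cat([k_rope, k_nope], dim=0)

    # shifting the context re-rotates only k[:n_rot]; the NoPE slice is untouched
    shifted = k.clone()
    shifted[:n_rot] = toy_rope(shifted[:n_rot], pos=-2)
    assert torch.equal(shifted[n_rot:], k_nope)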
+ cache.can_shift = !cache.recurrent && model.arch != LLM_ARCH_DEEPSEEK2; // not supported yet LLAMA_LOG_INFO("%s: kv_size = %d, offload = %d, type_k = '%s', type_v = '%s', n_layer = %d, can_shift = %d\n", __func__, kv_size, offload, ggml_type_name(type_k), ggml_type_name(type_v), n_layer, cache.can_shift); From c9faa1bcd9b7504d4f71998d0c00daf603457b38 Mon Sep 17 00:00:00 2001 From: juk Date: Mon, 10 Mar 2025 17:02:28 +0000 Subject: [PATCH 08/11] Final tidy up and added TODOs and extra comments --- src/llama.cpp | 16 +++++++++++----- 1 file changed, 11 insertions(+), 5 deletions(-) diff --git a/src/llama.cpp b/src/llama.cpp index 33159d097f48e..355a78900027e 100644 --- a/src/llama.cpp +++ b/src/llama.cpp @@ -6551,11 +6551,13 @@ struct llm_build_context { 0); cb(k_nope_view, "k_nope_view", il); - // {n_embd_head_qk_nope + n_embd_head_qk_rope, n_head, n_tokens} + // TODO: build_k_shift() and build_defrag(); the RoPEed part is the first n_rot as they expect + // {n_embd_head_qk_rope + n_embd_head_qk_nope, n_head, n_tokens} struct ggml_tensor * q_states = ggml_concat(ctx0, q_mqa_view, q_nope_view, 0); cb(q_states, "q_states", il); - // {n_embd_head_qk_nope + n_embd_head_qk_rope, n_head, n_tokens} + // TODO: build_k_shift() and build_defrag(); the RoPEed part is the first n_rot as they expect + // {n_embd_head_qk_rope + n_embd_head_qk_nope, n_head, n_tokens} struct ggml_tensor * k_states = ggml_concat(ctx0, ggml_repeat(ctx0, k_mqa_view, q_mqa_view), k_nope_view, 0); cb(k_states, "k_states", il); @@ -6563,6 +6565,7 @@ struct llm_build_context { struct ggml_tensor * v_states = ggml_mul_mat(ctx0, model.layers[il].wv_b, kv_compressed); cb(v_states, "v_states", il); + // note: this has essentially converted MLA into MHA (with very large KV-cache overhead) cur = llm_build_kv(ctx0, lctx, kv_self, gf, nullptr, model.layers[il].wo, nullptr, k_states, v_states, q_states, KQ_mask, n_tokens, kv_head, n_kv, kq_scale, cb, il); @@ -6587,7 +6590,8 @@ struct llm_build_context { q_nope_absorbed = ggml_permute(ctx0, q_nope_absorbed, 0, 2, 1, 3); cb(q_nope_absorbed, "q_nope_absorbed_perm", il); - // {kv_lora_rank + n_embd_head_qk_rope, n_head, n_tokens} + // TODO: build_k_shift() and build_defrag(); the RoPEed part is the first n_rot as they expect + // {n_embd_head_qk_rope + kv_lora_rank, n_head, n_tokens} struct ggml_tensor * q_states = ggml_concat(ctx0, q_mqa_view, q_nope_absorbed, 0); cb(q_states, "q_states", il); @@ -6599,7 +6603,8 @@ struct llm_build_context { 0); cb(kv_compressed_view, "kv_compressed_view", il); - // {kv_lora_rank + n_embd_head_qk_rope, 1, n_tokens} + // TODO: build_k_shift() and build_defrag(); the RoPEed part is the first n_rot as they expect + // {n_embd_head_qk_rope + kv_lora_rank, 1, n_tokens} struct ggml_tensor * k_states = ggml_concat(ctx0, k_mqa_view, kv_compressed_view, 0); cb(k_states, "k_states", il); @@ -6607,6 +6612,7 @@ struct llm_build_context { struct ggml_tensor * v_states = kv_compressed; cb(v_states, "v_states", il); + // note: this has essentially converted MLA into MQA (with very low KV-cache overhead) cur = llm_build_kv(ctx0, lctx, kv_self, gf, model.layers[il].wv_b, model.layers[il].wo, nullptr, k_states, v_states, q_states, KQ_mask, n_tokens, kv_head, n_kv, kq_scale, cb, il); @@ -9683,7 +9689,7 @@ struct llama_context * llama_init_from_model( cparams.embeddings = params.embeddings; cparams.offload_kqv = params.offload_kqv; cparams.flash_attn = params.flash_attn; - cparams.mla_attn = params.mla_attn; + cparams.mla_attn = params.mla_attn; cparams.no_perf = 
params.no_perf; cparams.pooling_type = params.pooling_type; From f1297b6a9c7c2c36e21b43984ee8e6d3f034366b Mon Sep 17 00:00:00 2001 From: juk Date: Mon, 10 Mar 2025 17:40:56 +0000 Subject: [PATCH 09/11] Removed unused local variables from llm_build_kv() --- src/llama.cpp | 3 --- 1 file changed, 3 deletions(-) diff --git a/src/llama.cpp b/src/llama.cpp index 355a78900027e..2929152cb9885 100644 --- a/src/llama.cpp +++ b/src/llama.cpp @@ -727,9 +727,6 @@ static struct ggml_tensor * llm_build_kv( float kq_scale, const llm_build_cb & cb, int il) { - const llama_hparams & hparams = lctx.model.hparams; - const llama_cparams & cparams = lctx.cparams; - // these nodes are added to the graph together so that they are not reordered // by doing so, the number of splits in the graph is reduced ggml_build_forward_expand(graph, q_cur); From 3649714b37a34420aeb84b5ff52e867b06780962 Mon Sep 17 00:00:00 2001 From: juk Date: Mon, 10 Mar 2025 17:46:50 +0000 Subject: [PATCH 10/11] Removed trailing whitespace in convert_hf_to_gguf.py --- convert_hf_to_gguf.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/convert_hf_to_gguf.py b/convert_hf_to_gguf.py index dc566b006374a..d13196696ce33 100755 --- a/convert_hf_to_gguf.py +++ b/convert_hf_to_gguf.py @@ -4146,7 +4146,7 @@ def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iter qk_nope_head_dim = self.hparams["qk_nope_head_dim"] qk_rope_head_dim = self.hparams["qk_rope_head_dim"] v_head_dim = self.hparams["v_head_dim"] - kv_lora_rank = self.hparams["kv_lora_rank"] + kv_lora_rank = self.hparams["kv_lora_rank"] # (v2-lite) split q_proj into: q_proj and q_mqa_proj if name.endswith("q_proj.weight"): @@ -4200,10 +4200,10 @@ def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iter assert data_torch.shape[0] == n_head_kv * (v_head_dim + qk_nope_head_dim) assert data_torch.shape[1] == kv_lora_rank - kv_b_proj = data_torch.view(n_head_kv, v_head_dim + qk_nope_head_dim, kv_lora_rank) + kv_b_proj = data_torch.view(n_head_kv, v_head_dim + qk_nope_head_dim, kv_lora_rank) k_b_proj, v_b_proj = torch.split(kv_b_proj, [qk_nope_head_dim, v_head_dim], dim = 1) - k_b_trans_proj = k_b_proj.transpose(1, 2).reshape(n_head_kv * kv_lora_rank, qk_nope_head_dim) + k_b_trans_proj = k_b_proj.transpose(1, 2).reshape(n_head_kv * kv_lora_rank, qk_nope_head_dim) k_b_proj = k_b_proj.reshape(n_head_kv * qk_nope_head_dim, kv_lora_rank) v_b_proj = v_b_proj.reshape(n_head_kv * v_head_dim, kv_lora_rank) From c6845359af8f6e727dcb3e34cdf9a4e06d8d0bf7 Mon Sep 17 00:00:00 2001 From: juk Date: Mon, 10 Mar 2025 22:46:39 +0000 Subject: [PATCH 11/11] Added optimised version of MQA for MLA to llm_build_kqv() --- src/llama.cpp | 215 +++++++++++++++++++++++++++++++------------------- 1 file changed, 133 insertions(+), 82 deletions(-) diff --git a/src/llama.cpp b/src/llama.cpp index 2929152cb9885..72e999e0ac0b0 100644 --- a/src/llama.cpp +++ b/src/llama.cpp @@ -176,7 +176,7 @@ static void llm_build_kv_store( int64_t n_embd_k; int64_t n_embd_v; - // note: deepseek-mla stores the compressed versions + // note: deepseek-mla converts MLA to MQA so n_embd_k/n_embd_v change too if (cparams.mla_attn && model.arch == LLM_ARCH_DEEPSEEK2) { n_embd_k = hparams.n_lora_kv + hparams.n_rot; n_embd_v = hparams.n_lora_kv; @@ -568,59 +568,36 @@ static struct ggml_tensor * llm_build_kqv( const llama_hparams & hparams = lctx.model.hparams; const llama_cparams & cparams = lctx.cparams; - const int64_t n_ctx = cparams.n_ctx; - const int64_t n_head = 
hparams.n_head(il); + const int64_t n_ctx = cparams.n_ctx; + const int64_t n_head = hparams.n_head(il); + const int64_t n_head_kv = hparams.n_head_kv(il); + const int64_t n_embd_k_gqa = hparams.n_embd_k_gqa(il); + const int64_t n_embd_head_k = hparams.n_embd_head_k; + const int64_t n_embd_v_gqa = hparams.n_embd_v_gqa(il); + const int64_t n_embd_head_v = hparams.n_embd_head_v; - int64_t n_head_kv; - int64_t n_embd_k; - int64_t n_embd_head_k; - int64_t n_embd_v; - int64_t n_embd_head_v; - int64_t n_embd_head_v_final; - - // note: MLA caches compressed KV and acts as MQA until the final wv_b expansion - if (cparams.mla_attn && model.arch == LLM_ARCH_DEEPSEEK2) { - GGML_ASSERT(wv_b); - n_head_kv = 1; - n_embd_head_k = hparams.n_lora_kv + hparams.n_rot; - n_embd_k = n_embd_head_k; - n_embd_head_v = hparams.n_lora_kv; - n_embd_v = n_embd_head_v; - n_embd_head_v_final = hparams.n_embd_head_v; // after multiplying by wv_b - } else { - n_head_kv = hparams.n_head_kv(il); - n_embd_head_k = hparams.n_embd_head_k; - n_embd_k = hparams.n_embd_k_gqa(il); - n_embd_head_v = hparams.n_embd_head_v; - n_embd_v = hparams.n_embd_v_gqa(il); - n_embd_head_v_final = n_embd_head_v; - } + struct ggml_tensor * cur; struct ggml_tensor * q = ggml_permute(ctx, q_cur, 0, 2, 1, 3); cb(q, "q", il); - struct ggml_tensor * k = - ggml_view_3d(ctx, kv.k_l[il], - n_embd_head_k, n_kv, n_head_kv, - ggml_row_size(kv.k_l[il]->type, n_embd_k), - ggml_row_size(kv.k_l[il]->type, n_embd_head_k), - 0); - cb(k, "k", il); - - struct ggml_tensor * cur; - if (cparams.flash_attn) { GGML_UNUSED(model); GGML_UNUSED(n_ctx); - // note: MLA creates emebddings too large for FA, see: https://github.com/ggml-org/llama.cpp/pull/12227 - GGML_ASSERT(!wv_b); + struct ggml_tensor * k = + ggml_view_3d(ctx, kv.k_l[il], + n_embd_head_k, n_kv, n_head_kv, + ggml_row_size(kv.k_l[il]->type, n_embd_k_gqa), + ggml_row_size(kv.k_l[il]->type, n_embd_head_k), + 0); + cb(k, "k", il); // split cached v into n_head heads (not transposed) struct ggml_tensor * v = ggml_view_3d(ctx, kv.v_l[il], n_embd_head_v, n_kv, n_head_kv, - ggml_row_size(kv.v_l[il]->type, n_embd_v), + ggml_row_size(kv.v_l[il]->type, n_embd_v_gqa), ggml_row_size(kv.v_l[il]->type, n_embd_head_v), 0); cb(v, "v", il); @@ -630,65 +607,139 @@ static struct ggml_tensor * llm_build_kqv( ggml_flash_attn_ext_set_prec(cur, GGML_PREC_F32); - cur = ggml_reshape_2d(ctx, cur, n_embd_head_v_final*n_head, n_tokens); + cur = ggml_reshape_2d(ctx, cur, n_embd_head_v*n_head, n_tokens); } else { - struct ggml_tensor * kq = ggml_mul_mat(ctx, k, q); - cb(kq, "kq", il); - // note: this op tends to require high floating point range - // while for some models F16 is enough, for others it is not, so we default to F32 here - ggml_mul_mat_set_prec(kq, GGML_PREC_F32); + // MLA converetd to MQA optimised to use non-batched matrix multiplies + if (cparams.mla_attn && model.arch == LLM_ARCH_DEEPSEEK2) { + const int64_t n_embd_head_k_mqa = hparams.n_lora_kv + hparams.n_rot; + const int64_t n_embd_head_v_mqa = hparams.n_lora_kv; - if (model.arch == LLM_ARCH_GROK) { - // need to do the following: - // multiply by attn_output_multiplyer of 0.08838834764831845 - // and then : - // kq = 30 * tanh(kq / 30) - // before the softmax below + // must cont for the 2D view or else kq with have n_tokens <-> n_head swapped... 
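The llm_build_kv_store change at the top of this patch, together with the MHA/MQA notes added in patch 08, spells out the trade-off between the two paths: without -mla the cache holds full per-head K and V rows (MLA converted to MHA), while with -mla it holds only the shared latent plus the RoPEd slice per token (MLA converted to MQA). A quick back-of-envelope comparison of the per-token, per-layer cache width follows; the hyperparameters are assumed, roughly DeepSeek-V2-sized values, so check the actual model config before relying on the exact numbers.

    # assumed, roughly DeepSeek-V2-sized hyperparameters -- illustrative only
    n_head           = 128
    qk_nope_head_dim = 128
    qk_rope_head_dim = 64     # n_rot
    v_head_dim       = 128
    kv_lora_rank     = 512    # n_lora_kv

    # non-MLA path ("MLA converted to MHA"): full per-head K and V rows are cached
    k_mha = n_head * (qk_nope_head_dim + qk_rope_head_dim)   # 24576
    v_mha = n_head * v_head_dim                              # 16384

    # MLA path ("MLA converted to MQA"): only the shared latent (+ RoPEd slice) is cached
    k_mqa = kv_lora_rank + qk_rope_head_dim                  # 576 == n_lora_kv + n_rot
    v_mqa = kv_lora_rank                                     # 512 == n_lora_kv

    print("per token, per layer:", k_mha + v_mha, "vs", k_mqa + v_mqa, "elements")
    print("reduction: ~%.1fx" % ((k_mha + v_mha) / (k_mqa + v_mqa)))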
+ q = ggml_cont(ctx, q); + cb(q, "q_cont", il); - kq = ggml_tanh(ctx, ggml_scale(ctx, kq, 0.08838834764831845f/30.0f)); - kq = ggml_scale(ctx, kq, 30); - } + q = ggml_view_2d(ctx, q, + n_embd_head_k_mqa, n_head * n_tokens, + ggml_row_size(q->type, n_embd_head_k_mqa), + 0); + cb(q, "q_view", il); - if (hparams.attn_soft_cap) { - kq = ggml_scale(ctx, kq, 1.0f / hparams.f_attn_logit_softcapping); - kq = ggml_tanh(ctx, kq); - kq = ggml_scale(ctx, kq, hparams.f_attn_logit_softcapping); - } + struct ggml_tensor * k = + ggml_view_2d(ctx, kv.k_l[il], + n_embd_head_k_mqa, n_kv, + ggml_row_size(kv.k_l[il]->type, n_embd_head_k_mqa), + 0); + cb(k, "k", il); - kq = ggml_soft_max_ext(ctx, kq, kq_mask, kq_scale, hparams.f_max_alibi_bias); - cb(kq, "kq_soft_max_ext", il); + struct ggml_tensor * kq = ggml_mul_mat(ctx, k, q); + cb(kq, "kq", il); - GGML_ASSERT(kv.size == n_ctx); + // note: this doesn't seem necessary + //ggml_mul_mat_set_prec(kq, GGML_PREC_F32); - // split cached v into n_head heads - struct ggml_tensor * v = - ggml_view_3d(ctx, kv.v_l[il], - n_kv, n_embd_head_v, n_head_kv, - ggml_element_size(kv.v_l[il])*n_ctx, - ggml_element_size(kv.v_l[il])*n_ctx*n_embd_head_v, + kq = ggml_view_3d(ctx, kq, + n_kv, n_tokens, n_head, + ggml_row_size(kq->type, n_kv), + ggml_row_size(kq->type, n_kv * n_tokens), 0); - cb(v, "v", il); + cb(kq, "kq_view", il); - struct ggml_tensor * kqv = ggml_mul_mat(ctx, v, kq); - cb(kqv, "kqv", il); + kq = ggml_soft_max_ext(ctx, kq, kq_mask, kq_scale, hparams.f_max_alibi_bias); + cb(kq, "kq_soft_max_ext", il); - // note: MLA needs to expand KQV from MQA into MHA - if (wv_b) { - struct ggml_tensor * wv_b_view = ggml_view_3d(ctx, wv_b, n_embd_head_v, n_embd_head_v_final, n_head, - ggml_row_size(model.layers[il].wv_b->type, n_embd_head_v), - ggml_row_size(model.layers[il].wv_b->type, n_embd_head_v * n_embd_head_v_final), + kq = ggml_view_2d(ctx, kq, + n_kv, n_tokens * n_head, + ggml_row_size(kq->type, n_kv), + 0); + cb(kq, "kq_soft_max_view", il); + + GGML_ASSERT(kv.size == n_ctx); + + struct ggml_tensor * v = + ggml_view_2d(ctx, kv.v_l[il], + n_kv, n_embd_head_v_mqa, + ggml_element_size(kv.v_l[il])*n_ctx, + 0); + cb(v, "v", il); + + struct ggml_tensor * kqv = ggml_mul_mat(ctx, v, kq); + cb(kqv, "kqv_compressed", il); + + kqv = ggml_view_3d(ctx, kqv, + n_embd_head_v_mqa, n_tokens, n_head, + ggml_row_size(kqv->type, n_embd_head_v_mqa), + ggml_row_size(kqv->type, n_embd_head_v_mqa * n_tokens), + 0); + cb(kqv, "kqv_view", il); + + struct ggml_tensor * wv_b_view = + ggml_view_3d(ctx, wv_b, n_embd_head_v_mqa, n_embd_head_v, n_head, + ggml_row_size(wv_b->type, n_embd_head_v_mqa), + ggml_row_size(wv_b->type, n_embd_head_v * n_embd_head_v_mqa), 0); cb(wv_b_view, "wv_b_view", il); - kqv = ggml_mul_mat(ctx, wv_b_view, kqv); - cb(kqv, "kqv_wv_b", il); + // dsecompress the MQA to MHA + cur = ggml_mul_mat(ctx, wv_b_view, kqv); + cb(cur, "kqv", il); + + // standard MHA/GQA non-flash-attension case + } else { + struct ggml_tensor * k = + ggml_view_3d(ctx, kv.k_l[il], + n_embd_head_k, n_kv, n_head_kv, + ggml_row_size(kv.k_l[il]->type, n_embd_k_gqa), + ggml_row_size(kv.k_l[il]->type, n_embd_head_k), + 0); + cb(k, "k", il); + + struct ggml_tensor * kq = ggml_mul_mat(ctx, k, q); + cb(kq, "kq", il); + + // note: this op tends to require high floating point range + // while for some models F16 is enough, for others it is not, so we default to F32 here + ggml_mul_mat_set_prec(kq, GGML_PREC_F32); + + if (model.arch == LLM_ARCH_GROK) { + // need to do the following: + // multiply by 
attn_output_multiplyer of 0.08838834764831845 + // and then : + // kq = 30 * tanh(kq / 30) + // before the softmax below + + kq = ggml_tanh(ctx, ggml_scale(ctx, kq, 0.08838834764831845f/30.0f)); + kq = ggml_scale(ctx, kq, 30); + } + + if (hparams.attn_soft_cap) { + kq = ggml_scale(ctx, kq, 1.0f / hparams.f_attn_logit_softcapping); + kq = ggml_tanh(ctx, kq); + kq = ggml_scale(ctx, kq, hparams.f_attn_logit_softcapping); + } + + kq = ggml_soft_max_ext(ctx, kq, kq_mask, kq_scale, hparams.f_max_alibi_bias); + cb(kq, "kq_soft_max_ext", il); + + GGML_ASSERT(kv.size == n_ctx); + + // split cached v into n_head heads + struct ggml_tensor * v = + ggml_view_3d(ctx, kv.v_l[il], + n_kv, n_embd_head_v, n_head_kv, + ggml_element_size(kv.v_l[il])*n_ctx, + ggml_element_size(kv.v_l[il])*n_ctx*n_embd_head_v, + 0); + cb(v, "v", il); + + cur = ggml_mul_mat(ctx, v, kq); + cb(cur, "kqv", il); } - struct ggml_tensor * kqv_merged = ggml_permute(ctx, kqv, 0, 2, 1, 3); - cb(kqv_merged, "kqv_merged", il); + cur = ggml_permute(ctx, cur, 0, 2, 1, 3); + cb(cur, "kqv_merged", il); - cur = ggml_cont_2d(ctx, kqv_merged, n_embd_head_v_final*n_head, n_tokens); + cur = ggml_cont_2d(ctx, cur, n_embd_head_v*n_head, n_tokens); cb(cur, "kqv_merged_cont", il); }
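The new llm_build_kqv branch in this last patch exploits the fact that MLA-as-MQA has a single shared K/V head: instead of batched per-head multiplies, the head dimension is folded into the token dimension with 2-D views (hence the ggml_cont on q), so K^T*Q and V*softmax(KQ) each become one large plain GEMM, and only the final wv_b multiply runs per head to decompress the result back to n_head * n_embd_head_v. The PyTorch sketch below mirrors that algebra and checks it against a naive per-head reference; the tensor names follow the hunk, but the KQ mask, ALiBi and ggml-specific layout/precision handling are omitted, and all sizes are made-up small numbers.

    import torch

    torch.manual_seed(0)

    n_head, n_tokens, n_kv = 4, 3, 5
    n_rot, kv_lora_rank, n_embd_head_v = 8, 16, 12

    d_k   = kv_lora_rank + n_rot      # n_embd_head_k_mqa
    scale = d_k ** -0.5               # stands in for kq_scale

    q    = torch.randn(n_head, n_tokens, d_k)                  # q_states, one row per (head, token)
    k    = torch.randn(n_kv, d_k)                              # cached K: latent + RoPEd slice, one shared head
    v    = torch.randn(n_kv, kv_lora_rank)                     # cached V: just the latent
    wv_b = torch.randn(n_head, n_embd_head_v, kv_lora_rank)    # per-head decompression of V

    # --- optimised path: fold heads into the token dim so the big matmuls are plain 2-D GEMMs ---
    q2  = q.reshape(n_head * n_tokens, d_k)                    # "q_view" in the hunk
    kq  = q2 @ k.T                                             # (n_head*n_tokens, n_kv)
    p   = torch.softmax(kq.view(n_head, n_tokens, n_kv) * scale, dim=-1)   # softmax per (head, token) row
    kqv = p.view(n_head * n_tokens, n_kv) @ v                  # (n_head*n_tokens, kv_lora_rank)
    kqv = kqv.view(n_head, n_tokens, kv_lora_rank)
    out = torch.einsum('hvc,htc->htv', wv_b, kqv)              # decompress MQA -> MHA per head

    # --- reference: naive per-head loop over the same shared K/V ---
    ref = torch.stack([
        torch.softmax(q[h] @ k.T * scale, dim=-1) @ v @ wv_b[h].T
        for h in range(n_head)
    ])

    assert torch.allclose(out, ref, atol=1e-5)

Presumably the point of the folding is that the shared K and V only have to be fed through two large matrix multiplies instead of n_head batched ones, at the cost of the extra cont and view bookkeeping visible in the hunk.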