@@ -1248,6 +1248,9 @@ struct llama_layer {
     struct ggml_tensor * wqkv;
 
     // attention bias
+    struct ggml_tensor * bq;
+    struct ggml_tensor * bk;
+    struct ggml_tensor * bv;
     struct ggml_tensor * bo;
     struct ggml_tensor * bqkv;
 
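The new members slot in next to the existing `bo`/`bqkv` biases. Judging by the `if (model.layers[il].bq)` guards further down, the convention relied on here is that a bias the checkpoint does not provide stays NULL. A minimal sketch of that convention (the `toy_layer` struct and helper are hypothetical, for illustration only):

```cpp
struct ggml_tensor; // opaque handle, as declared in ggml.h

// Hypothetical miniature of llama_layer: members default to nullptr, so a
// missing bias is represented by a NULL pointer rather than a zero tensor.
struct toy_layer {
    struct ggml_tensor * wq = nullptr; // projection weight, always loaded
    struct ggml_tensor * bq = nullptr; // optional bias, NULL for bias-free models
};

// Same presence test as `if (model.layers[il].bq)` in the graph code below.
static bool has_q_bias(const toy_layer & l) {
    return l.bq != nullptr;
}
```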
@@ -2781,6 +2784,11 @@ static void llm_load_tensors(
                         layer.wk = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_K,   "weight", i), {n_embd, n_embd_gqa}, backend_split);
                         layer.wv = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_V,   "weight", i), {n_embd, n_embd_gqa}, backend_split);
                         layer.wo = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd},     backend_split);
+
+                        layer.bq = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_Q,   "bias", i), {n_embd},     backend);
+                        layer.bk = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_K,   "bias", i), {n_embd_gqa}, backend);
+                        layer.bv = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_V,   "bias", i), {n_embd_gqa}, backend);
+                        layer.bo = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_OUT, "bias", i), {n_embd},     backend);
 
                         layer.ffn_norm = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, backend);
 
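Note the shapes: `bq` covers all `n_embd` query channels, while `bk`/`bv` cover only `n_embd_gqa`, the narrower key/value width under grouped-query attention (`n_embd_gqa = n_embd_head * n_head_kv` in llama.cpp's hparams). A standalone sketch of that arithmetic, with made-up example sizes:

```cpp
#include <cstdio>

int main() {
    // Illustrative values only; a real model supplies these via its hparams.
    const int n_embd      = 4096;                    // hidden size
    const int n_head      = 32;                      // query heads
    const int n_head_kv   = 8;                       // key/value heads (GQA)
    const int n_embd_head = n_embd / n_head;         // 128 channels per head
    const int n_embd_gqa  = n_embd_head * n_head_kv; // 1024

    // bq has one element per query channel, bk/bv one per key/value channel:
    printf("bq: {%d}, bk/bv: {%d}\n", n_embd, n_embd_gqa);
    return 0;
}
```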
@@ -2791,8 +2799,9 @@ static void llm_load_tensors(
                         if (backend == GGML_BACKEND_GPU) {
                             vram_weights +=
                                 ggml_nbytes(layer.attn_norm) + ggml_nbytes(layer.wq) + ggml_nbytes(layer.wk) +
-                                ggml_nbytes(layer.wv)        + ggml_nbytes(layer.wo) + ggml_nbytes(layer.ffn_norm) +
-                                ggml_nbytes(layer.ffn_gate)  + ggml_nbytes(layer.ffn_down) + ggml_nbytes(layer.ffn_up);
+                                ggml_nbytes(layer.wv)        + ggml_nbytes(layer.wo) + ggml_nbytes(layer.bq) +
+                                ggml_nbytes(layer.bk)        + ggml_nbytes(layer.bv) + ggml_nbytes(layer.bo) +
+                                ggml_nbytes(layer.ffn_norm)  + ggml_nbytes(layer.ffn_gate) + ggml_nbytes(layer.ffn_down) + ggml_nbytes(layer.ffn_up);
                         }
                     }
                 } break;
@@ -3891,13 +3900,25 @@ struct llm_build_context {
                 // compute Q and K and RoPE them
                 struct ggml_tensor * Qcur = ggml_mul_mat(ctx0, model.layers[il].wq, cur);
                 cb(Qcur, "Qcur", il);
+                if (model.layers[il].bq) {
+                    Qcur = ggml_add(ctx0, Qcur, model.layers[il].bq);
+                    cb(Qcur, "Qcur", il);
+                }
 
                 struct ggml_tensor * Kcur = ggml_mul_mat(ctx0, model.layers[il].wk, cur);
                 cb(Kcur, "Kcur", il);
+                if (model.layers[il].bk) {
+                    Kcur = ggml_add(ctx0, Kcur, model.layers[il].bk);
+                    cb(Kcur, "Kcur", il);
+                }
 
                 struct ggml_tensor * Vcur = ggml_mul_mat(ctx0, model.layers[il].wv, cur);
                 cb(Vcur, "Vcur", il);
-
+                if (model.layers[il].bv) {
+                    Vcur = ggml_add(ctx0, Vcur, model.layers[il].bv);
+                    cb(Vcur, "Vcur", il);
+                }
+
                 Qcur = ggml_rope_custom(
                     ctx0, ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens), inp_pos,
                     n_embd_head, 0, 0, n_orig_ctx, freq_base, freq_scale,
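Each guarded `ggml_add` broadcasts a 1-D bias across the token dimension of the projected activations. A self-contained sketch of the same graph pattern, assuming a ggml checkout; the sizes are illustrative and the graph is only constructed, never computed:

```cpp
#include "ggml.h"

int main() {
    // no_alloc: build graph metadata only, no tensor data is allocated.
    struct ggml_init_params params = {
        /*.mem_size   =*/ 16*1024*1024,
        /*.mem_buffer =*/ NULL,
        /*.no_alloc   =*/ true,
    };
    struct ggml_context * ctx0 = ggml_init(params);

    struct ggml_tensor * cur = ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, 4096, 8);    // {n_embd, n_tokens}
    struct ggml_tensor * wq  = ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, 4096, 4096); // {n_embd, n_embd}
    struct ggml_tensor * bq  = ggml_new_tensor_1d(ctx0, GGML_TYPE_F32, 4096);       // {n_embd}

    struct ggml_tensor * Qcur = ggml_mul_mat(ctx0, wq, cur); // {n_embd, n_tokens}
    Qcur = ggml_add(ctx0, Qcur, bq); // bias broadcast over all n_tokens columns

    ggml_free(ctx0);
    return 0;
}
```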
@@ -3915,7 +3936,7 @@ struct llm_build_context {
                 llm_build_kv_store(ctx0, hparams, kv_self, gf, Kcur, Vcur, n_ctx, n_tokens, kv_head, cb, il);
 
                 cur = llm_build_kqv(ctx0, hparams, kv_self,
-                        model.layers[il].wo, NULL,
+                        model.layers[il].wo, model.layers[il].bo,
                         Qcur, KQ_scale, KQ_mask, n_ctx, n_tokens, n_kv, -1.0f, cb, il);
                 cb(cur, "kqv_out", il);
             }
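Passing `model.layers[il].bo` where `NULL` used to be lets the shared helper add the output-projection bias as well. The helper's body is not part of this diff; the sketch below is a hypothetical reconstruction of just the relevant tail, mirroring the Q/K/V pattern above:

```cpp
#include "ggml.h"

// Hypothetical tail of an llm_build_kqv-style helper: project the merged
// attention output through wo, then add bo only when the model provides one.
static struct ggml_tensor * out_proj(struct ggml_context * ctx0,
                                     struct ggml_tensor  * wo,
                                     struct ggml_tensor  * bo, // may be NULL
                                     struct ggml_tensor  * kqv_merged) {
    struct ggml_tensor * cur = ggml_mul_mat(ctx0, wo, kqv_merged);
    if (bo) {
        cur = ggml_add(ctx0, cur, bo); // skipped for bias-free checkpoints
    }
    return cur;
}
```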