#include "models.h"

llm_build_gemma2_iswa::llm_build_gemma2_iswa(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params) {
    const int64_t n_embd_head = hparams.n_embd_head_k;

    ggml_tensor * cur;
    ggml_tensor * inpL;

    inpL = build_inp_embd(model.tok_embd);

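    // scale token embeddings by sqrt(n_embd), as in the Gemma reference model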
    inpL = ggml_scale(ctx0, inpL, sqrtf(n_embd));
    cb(inpL, "inp_scaled", -1);

    // inp_pos - contains the positions
    ggml_tensor * inp_pos = build_inp_pos();

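    // Gemma 2 interleaves sliding-window and full attention layers, so the
    // attention input is built for the iSWA (interleaved SWA) KV cache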
    auto * inp_attn = build_attn_inp_kv_iswa();

    ggml_tensor * inp_out_ids = build_inp_out_ids();

    for (int il = 0; il < n_layer; ++il) {
        // norm
        cur = build_norm(inpL,
                model.layers[il].attn_norm, NULL,
                LLM_NORM_RMS, il);
        cb(cur, "attn_norm", il);

        // self-attention
        {
            // compute Q and K and RoPE them
            ggml_tensor * Qcur = build_lora_mm(model.layers[il].wq, cur);
            cb(Qcur, "Qcur", il);

            ggml_tensor * Kcur = build_lora_mm(model.layers[il].wk, cur);
            cb(Kcur, "Kcur", il);

            ggml_tensor * Vcur = build_lora_mm(model.layers[il].wv, cur);
            cb(Vcur, "Vcur", il);

            Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head,    n_tokens);
            Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens);
            Vcur = ggml_reshape_3d(ctx0, Vcur, n_embd_head, n_head_kv, n_tokens);

            Qcur = ggml_rope_ext(
                    ctx0, Qcur, inp_pos, nullptr,
                    n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
                    ext_factor, attn_factor, beta_fast, beta_slow);

            Kcur = ggml_rope_ext(
                    ctx0, Kcur, inp_pos, nullptr,
                    n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
                    ext_factor, attn_factor, beta_fast, beta_slow);

            cb(Qcur, "Qcur", il);
            cb(Kcur, "Kcur", il);
            cb(Vcur, "Vcur", il);

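            // Q is pre-scaled by f_attention_scale here, so build_attn below is
            // called with kq_scale = 1.0f instead of the usual 1/sqrt(n_embd_head)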
            Qcur = ggml_scale(ctx0, Qcur, hparams.f_attention_scale);

            cur = build_attn(inp_attn,
                    model.layers[il].wo, NULL,
                    Qcur, Kcur, Vcur, nullptr, nullptr, nullptr, 1.0f, il);
        }
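
        // on the last layer, keep only the rows (tokens) for which outputs were
        // requested, so the final norm and lm_head run on fewer rows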
        if (il == n_layer - 1 && inp_out_ids) {
            cur  = ggml_get_rows(ctx0,  cur, inp_out_ids);
            inpL = ggml_get_rows(ctx0, inpL, inp_out_ids);
        }
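
        // Gemma 2 uses "sandwich" normalization: RMSNorm both before (attn_norm)
        // and after (attn_post_norm) attention, and likewise around the FFN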
        cur = build_norm(cur,
                model.layers[il].attn_post_norm, NULL,
                LLM_NORM_RMS, il);
        cb(cur, "attn_post_norm", il);

        ggml_tensor * sa_out = ggml_add(ctx0, cur, inpL);
        cb(sa_out, "sa_out", il);

        cur = build_norm(sa_out,
                model.layers[il].ffn_norm, NULL,
                LLM_NORM_RMS, il);
        cb(cur, "ffn_norm", il);

        // feed-forward network
        {
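            // GeGLU: gate and up projections run in parallel (LLM_FFN_PAR) and
            // are combined as GELU(gate) * up before the down projection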
            cur = build_ffn(cur,
                    model.layers[il].ffn_up,   NULL, NULL,
                    model.layers[il].ffn_gate, NULL, NULL,
                    model.layers[il].ffn_down, NULL, NULL,
                    NULL,
                    LLM_FFN_GELU, LLM_FFN_PAR, il);
            cb(cur, "ffn_out", il);
        }
        cur = build_norm(cur,
                model.layers[il].ffn_post_norm, NULL,
                LLM_NORM_RMS, -1);
        cb(cur, "ffn_post_norm", -1);

        cur = ggml_add(ctx0, cur, sa_out);

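        // apply the per-layer control vector, if one is loaded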
        cur = build_cvec(cur, il);
        cb(cur, "l_out", il);

        // input for next layer
        inpL = cur;
    }

    cur = inpL;

    cur = build_norm(cur,
            model.output_norm, NULL,
            LLM_NORM_RMS, -1);

    cb(cur, "result_norm", -1);
    res->t_embd = cur;

    // lm_head
    cur = build_lora_mm(model.output, cur);

    // final logit soft-capping
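    // (logits = softcap * tanh(logits / softcap), softcap = f_final_logit_softcapping)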
    cur = ggml_scale(ctx0, cur, 1.0f / hparams.f_final_logit_softcapping);
    cur = ggml_tanh(ctx0, cur);
    cur = ggml_scale(ctx0, cur, hparams.f_final_logit_softcapping);

    cb(cur, "result_output", -1);
    res->t_logits = cur;

    ggml_build_forward_expand(gf, cur);
}