@@ -52,13 +52,16 @@ void QWEN::init(const char* path_model, model_context* ctx, int n_gpu_layer_, bo
   model.hparams = ml->file_loaders.at(0)->hparams;
   model_file_version file_version = ml->file_loaders.at(0)->file_version;
   auto& hparams = model.hparams;
-  n_ff = hparams.ffn_hidden_size / 2;
+  n_ff = hparams.ffn_hidden_size;
+  if (hparams.max_seq_len == 8192) {
+    n_ff = n_ff / 2;
+  }
   fprintf(stderr, "%s: n_vocab = %u\n", __func__, hparams.n_vocab);
   fprintf(stderr, "%s: n_embd = %u\n", __func__, hparams.n_embd);
   fprintf(stderr, "%s: n_head = %u\n", __func__, hparams.n_head);
   fprintf(stderr, "%s: n_layer = %u\n", __func__, hparams.n_layer);
   fprintf(stderr, "%s: n_rot = %u\n", __func__, hparams.n_rot);
-  fprintf(stderr, "%s: n_ff = %u\n", __func__, hparams.ffn_hidden_size / 2);
+  fprintf(stderr, "%s: n_ff = %u\n", __func__, hparams.ffn_hidden_size);
   fprintf(stderr, "%s: n_parts = %zu\n", __func__, ml->file_loaders.size());
   n_embd = hparams.n_embd;
   n_vocab = hparams.n_vocab;
@@ -102,7 +105,7 @@ void QWEN::load(model_context* ctx, model_progress_callback progress_callback, v
   model.layers.resize(n_layer);
   size_t vram_total = 0;
 
-  if (ml->verify_tensor("token_embd.weight")) {
+  if (ml->verify_tensor("token_embd.weight")) {  // gguf
     model.others[0] = ml->get_tensor("token_embd.weight", {n_embd, n_vocab}, NE_BACKEND_CPU);
     model.others[1] = ml->get_tensor("output_norm.weight", {n_embd}, NE_BACKEND_CPU);
     model.others[2] = ml->get_tensor("output.weight", {n_embd, n_vocab}, NE_BACKEND_CPU);
@@ -117,16 +120,26 @@ void QWEN::load(model_context* ctx, model_progress_callback progress_callback, v
       layer.norm[1] = ml->get_tensor(layers_i + ".ffn_norm.weight", {n_embd}, backend);
 
       // qkv GEMM
-      layer.attn[0] = ml->get_tensor(layers_i + ".attn_qkv.weight", {n_embd, 3 * n_embd}, backend);
-      layer.attn[1] = ml->get_tensor(layers_i + ".attn_qkv.bias", {3 * n_embd}, backend);
-      layer.attn[2] = ml->get_tensor(layers_i + ".attn_output.weight", {n_embd, n_embd}, backend);
+      if (ml->verify_tensor(layers_i + ".attn_qkv.weight")) {
+        layer.attn[0] = ml->get_tensor(layers_i + ".attn_qkv.weight", {n_embd, 3 * n_embd}, backend);
+        layer.attn[1] = ml->get_tensor(layers_i + ".attn_qkv.bias", {3 * n_embd}, backend);
+        layer.attn[2] = ml->get_tensor(layers_i + ".attn_output.weight", {n_embd, n_embd}, backend);
+      } else {  // qwen2 gguf
+        layer.attn[0] = ml->get_tensor(layers_i + ".attn_q.weight", {n_embd, n_embd}, backend);
+        layer.attn[1] = ml->get_tensor(layers_i + ".attn_q.bias", {n_embd}, backend);
+        layer.attn[2] = ml->get_tensor(layers_i + ".attn_k.weight", {n_embd, n_embd}, backend);
+        layer.attn[3] = ml->get_tensor(layers_i + ".attn_k.bias", {n_embd}, backend);
+        layer.attn[4] = ml->get_tensor(layers_i + ".attn_v.weight", {n_embd, n_embd}, backend);
+        layer.attn[5] = ml->get_tensor(layers_i + ".attn_v.bias", {n_embd}, backend);
+        layer.attn[6] = ml->get_tensor(layers_i + ".attn_output.weight", {n_embd, n_embd}, backend);
+      }
 
       // ffn GEMM
       layer.ffn[0] = ml->get_tensor(layers_i + ".ffn_up.weight", {n_embd, n_ff}, backend);
       layer.ffn[1] = ml->get_tensor(layers_i + ".ffn_gate.weight", {n_embd, n_ff}, backend);
       layer.ffn[2] = ml->get_tensor(layers_i + ".ffn_down.weight", {n_ff, n_embd}, backend);
     }
-  } else {
+  } else if (ml->verify_tensor("transformer.wte.weight")) {  // qwen1 bin
     model.others[0] = ml->get_tensor("transformer.wte.weight", {n_embd, n_vocab}, NE_BACKEND_CPU);
     model.others[1] = ml->get_tensor("transformer.ln_f.weight", {n_embd}, NE_BACKEND_CPU);
     model.others[2] = ml->get_tensor("lm_head.weight", {n_embd, n_vocab}, NE_BACKEND_CPU);
@@ -150,6 +163,34 @@ void QWEN::load(model_context* ctx, model_progress_callback progress_callback, v
       layer.ffn[1] = ml->get_tensor(layers_i + ".mlp.w2.weight", {n_embd, n_ff}, backend);
       layer.ffn[2] = ml->get_tensor(layers_i + ".mlp.c_proj.weight", {n_ff, n_embd}, backend);
     }
+  } else {  // qwen2 bin
+    model.others[0] = ml->get_tensor("model.embed_tokens.weight", {n_embd, n_vocab}, NE_BACKEND_CPU);
+    model.others[1] = ml->get_tensor("model.norm.weight", {n_embd}, NE_BACKEND_CPU);
+    model.others[2] = ml->get_tensor("lm_head.weight", {n_embd, n_vocab}, NE_BACKEND_CPU);
+
+    for (uint32_t i = 0; i < n_layer; ++i) {
+      const ne_backend backend = static_cast<int>(i) < i_gpu_start ? NE_BACKEND_CPU : MODEL_BACKEND_OFFLOAD;
+      auto& layer = model.layers[i];
+      std::string layers_i = "model.layers." + std::to_string(i);
+
+      // norm: cur = ln_1_g*cur + ln_1_b
+      layer.norm[0] = ml->get_tensor(layers_i + ".input_layernorm.weight", {n_embd}, backend);
+      layer.norm[1] = ml->get_tensor(layers_i + ".post_attention_layernorm.weight", {n_embd}, backend);
+
+      // qkv GEMM + out proj GEMM
+      layer.attn[0] = ml->get_tensor(layers_i + ".self_attn.q_proj.weight", {n_embd, n_embd}, backend);
+      layer.attn[1] = ml->get_tensor(layers_i + ".self_attn.q_proj.bias", {n_embd}, backend);
+      layer.attn[2] = ml->get_tensor(layers_i + ".self_attn.k_proj.weight", {n_embd, n_embd}, backend);
+      layer.attn[3] = ml->get_tensor(layers_i + ".self_attn.k_proj.bias", {n_embd}, backend);
+      layer.attn[4] = ml->get_tensor(layers_i + ".self_attn.v_proj.weight", {n_embd, n_embd}, backend);
+      layer.attn[5] = ml->get_tensor(layers_i + ".self_attn.v_proj.bias", {n_embd}, backend);
+      layer.attn[6] = ml->get_tensor(layers_i + ".self_attn.o_proj.weight", {n_embd, n_embd}, backend);
+
+      // ffn GEMM
+      layer.ffn[0] = ml->get_tensor(layers_i + ".mlp.up_proj.weight", {n_embd, n_ff}, backend);
+      layer.ffn[1] = ml->get_tensor(layers_i + ".mlp.gate_proj.weight", {n_embd, n_ff}, backend);
+      layer.ffn[2] = ml->get_tensor(layers_i + ".mlp.down_proj.weight", {n_ff, n_embd}, backend);
+    }
   }
 
   // print memory requirements
@@ -180,7 +221,7 @@ class qwen_quant_layer : public quant_layer_base {
  public:
   quant_params_internal get_layer_config(std::string layername, std::vector<int64_t> ne, ne_type type) override {
     bool quantize = layername.rfind("weight") == layername.size() - 6;  // ends with 'weight'?
-    if (layername == "transformer.wte.weight") {
+    if (layername == "transformer.wte.weight" || layername == "model.embed_tokens.weight") {
       // special layer process, can be loaded by config file
       return quant_params_internal();  // return q4_0 to cover the usage of getrow
     }
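
For orientation, the load path in this diff picks a weight-naming scheme by probing for a format-specific tensor name via ml->verify_tensor: "token_embd.weight" indicates a GGUF file, "transformer.wte.weight" a Qwen1 .bin file, and anything else is treated as a Qwen2 .bin file. Below is a minimal, self-contained sketch of that detection idea; detect_format and has_tensor are illustrative stand-ins and are not part of the patch or of the neural-speed API.

// Illustrative sketch only: mirrors the tensor-name probing above, with
// has_tensor standing in for model_loader::verify_tensor().
#include <iostream>
#include <set>
#include <string>

enum class qwen_weight_format { gguf, qwen1_bin, qwen2_bin };

// True if the checkpoint contains a tensor with exactly this name.
static bool has_tensor(const std::set<std::string>& names, const std::string& name) {
  return names.count(name) != 0;
}

static qwen_weight_format detect_format(const std::set<std::string>& names) {
  if (has_tensor(names, "token_embd.weight")) return qwen_weight_format::gguf;            // GGUF naming
  if (has_tensor(names, "transformer.wte.weight")) return qwen_weight_format::qwen1_bin;  // Qwen1 .bin naming
  return qwen_weight_format::qwen2_bin;  // otherwise assume Qwen2 .bin naming (model.embed_tokens.weight, ...)
}

int main() {
  const std::set<std::string> names = {"model.embed_tokens.weight", "model.norm.weight", "lm_head.weight"};
  std::cout << (detect_format(names) == qwen_weight_format::qwen2_bin) << "\n";  // prints 1
}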