@@ -9775,18 +9775,6 @@ struct llm_build_qwen3vl : public llm_graph_context {
97759775 } else {
97769776 // Text input: main_embed = inpL, deepstack = zero
97779777 main_embed = inpL;
9778- ds0 = ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, n_embd_main, n_tokens);
9779- ds1 = ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, n_embd_main, n_tokens);
9780- ds2 = ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, n_embd_main, n_tokens);
9781-
9782-
9783- ds0 = ggml_scale(ctx0, ds0, 0.0f);
9784- ds1 = ggml_scale(ctx0, ds1, 0.0f);
9785- ds2 = ggml_scale(ctx0, ds2, 0.0f);
9786-
9787- ggml_build_forward_expand(gf, ds0);
9788- ggml_build_forward_expand(gf, ds1);
9789- ggml_build_forward_expand(gf, ds2);
97909778 }
97919779
97929780 inpL = main_embed;
@@ -9896,6 +9884,16 @@ struct llm_build_qwen3vl : public llm_graph_context {
98969884 cur = build_cvec(cur, il);
98979885 cb(cur, "l_out", il);
98989886
9887+ if (ubatch.embd) {
9888+ switch (il) {
9889+ case 0: cur = ggml_add(ctx0, cur, ds0); break;
9890+ case 1: cur = ggml_add(ctx0, cur, ds1); break;
9891+ case 2: cur = ggml_add(ctx0, cur, ds2); break;
9892+ default: break;
9893+ }
9894+ cb(cur, "l_out_with_deepstack", il);
9895+ }
9896+
98999897 // input for next layer
99009898 inpL = cur;
99019899 }
@@ -9940,7 +9938,7 @@ struct llm_build_qwen3vlmoe : public llm_graph_context {
99409938 if (ubatch.embd) {
99419939 // Image input: split 4*n_embd
99429940 main_embed = ggml_view_2d(ctx0, inpL, n_embd_main, n_tokens, inpL->nb[1], 0);
9943- ds0 = ggml_view_2d(ctx0, inpL, n_embd_main, n_tokens, inpL->nb[1], n_embd_main * sizeof(float));
9941+ ds0 = ggml_view_2d(ctx0, inpL, n_embd_main, n_tokens, inpL->nb[1], n_embd_main * sizeof(float));
99449942 ds1 = ggml_view_2d(ctx0, inpL, n_embd_main, n_tokens, inpL->nb[1], 2 * n_embd_main * sizeof(float));
99459943 ds2 = ggml_view_2d(ctx0, inpL, n_embd_main, n_tokens, inpL->nb[1], 3 * n_embd_main * sizeof(float));
99469944
@@ -9952,18 +9950,6 @@ struct llm_build_qwen3vlmoe : public llm_graph_context {
99529950 } else {
99539951 // Text input: main_embed = inpL, deepstack = zero
99549952 main_embed = inpL;
9955- ds0 = ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, n_embd_main, n_tokens);
9956- ds1 = ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, n_embd_main, n_tokens);
9957- ds2 = ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, n_embd_main, n_tokens);
9958-
9959-
9960- ds0 = ggml_scale(ctx0, ds0, 0.0f);
9961- ds1 = ggml_scale(ctx0, ds1, 0.0f);
9962- ds2 = ggml_scale(ctx0, ds2, 0.0f);
9963-
9964- ggml_build_forward_expand(gf, ds0);
9965- ggml_build_forward_expand(gf, ds1);
9966- ggml_build_forward_expand(gf, ds2);
99679953 }
99689954
99699955
@@ -10066,6 +10052,16 @@ struct llm_build_qwen3vlmoe : public llm_graph_context {
1006610052
1006710053 cur = build_cvec(cur, il);
1006810054 cb(cur, "l_out", il);
10055+
10056+ if (ubatch.embd) {
10057+ switch (il) {
10058+ case 0: cur = ggml_add(ctx0, cur, ds0); break;
10059+ case 1: cur = ggml_add(ctx0, cur, ds1); break;
10060+ case 2: cur = ggml_add(ctx0, cur, ds2); break;
10061+ default: break;
10062+ }
10063+ cb(cur, "l_out_with_deepstack", il);
10064+ }
1006910065
1007010066 // input for next layer
1007110067 inpL = cur;
0 commit comments