Skip to content

Commit ea677dc

Browse files
committed
fix vit pos embed, deepstack and mrope-interleaved!
love from qwen team!
1 parent ab45b1a commit ea677dc

File tree

6 files changed

+347
-40
lines changed

6 files changed

+347
-40
lines changed

ggml/src/ggml-cpu/ops.cpp

Lines changed: 25 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -5487,6 +5487,12 @@ static void ggml_mrope_cache_init(
54875487
int sec_e = sections[2] + sec_w;
54885488
GGML_ASSERT(sect_dims <= ne0);
54895489

5490+
// Qwen3VL: interleaved mrope, currently judged by the number of sections
5491+
bool is_interleaved_mrope = false;
5492+
if (sections[0] == 24 && sections[1] == 20 && sections[2] == 20) {
5493+
is_interleaved_mrope = true;
5494+
}
5495+
54905496
for (int64_t i0 = 0; i0 < ne0; i0 += 2) {
54915497
const float ff = freq_factors ? freq_factors[i0/2] : 1.0f;
54925498

@@ -5510,14 +5516,25 @@ static void ggml_mrope_cache_init(
55105516

55115517
float theta = theta_t;
55125518

5513-
if (sector >= sections[0] && sector < sec_w) {
5514-
theta = theta_h;
5515-
}
5516-
else if (sector >= sec_w && sector < sec_w + sections[2]) {
5517-
theta = theta_w;
5518-
}
5519-
else if (sector >= sec_w + sections[2]) {
5520-
theta = theta_e;
5519+
if (is_interleaved_mrope) {
5520+
// thwthwthw...ttt
5521+
if (sector % 3 == 1 && sector < 3 * sections[1]) {
5522+
theta = theta_h;
5523+
} else if (sector % 3 == 2 && sector < 3 * sections[2]) {
5524+
theta = theta_w;
5525+
} else {
5526+
theta = theta_e;
5527+
}
5528+
} else {
5529+
if (sector >= sections[0] && sector < sec_w) {
5530+
theta = theta_h;
5531+
}
5532+
else if (sector >= sec_w && sector < sec_w + sections[2]) {
5533+
theta = theta_w;
5534+
}
5535+
else if (sector >= sec_w + sections[2]) {
5536+
theta = theta_e;
5537+
}
55215538
}
55225539

55235540
rope_yarn(

ggml/src/ggml-cuda/rope.cu

Lines changed: 23 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -151,18 +151,30 @@ static __global__ void rope_multi(
151151
const int sec_w = sections.v[1] + sections.v[0];
152152
const int sector = (i0 / 2) % sect_dims;
153153

154+
bool is_interleaved_mrope = (sections.v[0] == 24 && sections.v[1] == 20 && sections.v[2] == 20);
155+
154156
float theta_base = 0.0;
155-
if (sector < sections.v[0]) {
156-
theta_base = pos[channel_x]*powf(theta_scale, i0/2.0f);
157-
}
158-
else if (sector >= sections.v[0] && sector < sec_w) {
159-
theta_base = pos[channel_x + ne2 * 1]*powf(theta_scale, i0/2.0f);
160-
}
161-
else if (sector >= sec_w && sector < sec_w + sections.v[2]) {
162-
theta_base = pos[channel_x + ne2 * 2]*powf(theta_scale, i0/2.0f);
163-
}
164-
else if (sector >= sec_w + sections.v[2]) {
165-
theta_base = pos[channel_x + ne2 * 3]*powf(theta_scale, i0/2.0f);
157+
if (is_interleaved_mrope) {
158+
if (sector % 3 == 1 && sector < 3 * sections.v[1]) { // h
159+
theta_base = pos[channel_x + ne2 * 1]*powf(theta_scale, i0/2.0f);
160+
} else if (sector % 3 == 2 && sector < 3 * sections.v[2]) { // w
161+
theta_base = pos[channel_x + ne2 * 2]*powf(theta_scale, i0/2.0f);
162+
} else { // t
163+
theta_base = pos[channel_x]*powf(theta_scale, i0/2.0f);
164+
}
165+
} else {
166+
if (sector < sections.v[0]) {
167+
theta_base = pos[channel_x]*powf(theta_scale, i0/2.0f);
168+
}
169+
else if (sector >= sections.v[0] && sector < sec_w) {
170+
theta_base = pos[channel_x + ne2 * 1]*powf(theta_scale, i0/2.0f);
171+
}
172+
else if (sector >= sec_w && sector < sec_w + sections.v[2]) {
173+
theta_base = pos[channel_x + ne2 * 2]*powf(theta_scale, i0/2.0f);
174+
}
175+
else if (sector >= sec_w + sections.v[2]) {
176+
theta_base = pos[channel_x + ne2 * 3]*powf(theta_scale, i0/2.0f);
177+
}
166178
}
167179

168180
const float freq_factor = has_ff ? freq_factors[i0/2] : 1.0f;

src/llama-graph.cpp

Lines changed: 49 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1141,6 +1141,55 @@ ggml_tensor * llm_graph_context::build_inp_embd(ggml_tensor * tok_embd) const {
11411141
return cur;
11421142
}
11431143

1144+
// input embeddings with optional lora for qwen3vl series model
1145+
ggml_tensor * llm_graph_context::build_qwen3vl_inp_embd(ggml_tensor * tok_embd) const {
1146+
const int64_t n_embd_full = hparams.n_embd; // main + 3 deepstack layers
1147+
const int64_t n_embd_main = n_embd_full / 4;
1148+
1149+
auto inp = std::make_unique<llm_graph_input_embd>();
1150+
ggml_tensor * cur = nullptr;
1151+
1152+
if (ubatch.token) {
1153+
// Pure text input: expand to 4*n_embd with zero deepstack
1154+
inp->tokens = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, ubatch.n_tokens);
1155+
ggml_set_input(inp->tokens);
1156+
res->t_tokens = inp->tokens;
1157+
1158+
// Get main embedding from token IDs
1159+
cur = ggml_get_rows(ctx0, tok_embd, inp->tokens);
1160+
1161+
// Apply LoRA if needed
1162+
for (const auto & lora : *loras) {
1163+
llama_adapter_lora_weight * lw = lora.first->get_weight(tok_embd);
1164+
if (lw == nullptr) continue;
1165+
1166+
const float adapter_scale = lora.second;
1167+
const float scale = lw->get_scale(lora.first->alpha, adapter_scale);
1168+
ggml_tensor * inpL_delta = ggml_scale(ctx0, ggml_mul_mat(
1169+
ctx0, lw->b,
1170+
ggml_get_rows(ctx0, lw->a, inp->tokens)
1171+
), scale);
1172+
cur = ggml_add(ctx0, cur, inpL_delta);
1173+
}
1174+
} else {
1175+
// Custom embedding input (e.g., from image): assume already 4*n_embd
1176+
inp->embd = ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, n_embd_full, ubatch.n_tokens);
1177+
ggml_set_input(inp->embd);
1178+
cur = inp->embd;
1179+
}
1180+
1181+
// Apply embedding scale if needed (e.g., Granite)
1182+
if (hparams.f_embedding_scale != 0.0f) {
1183+
cur = ggml_scale(ctx0, cur, hparams.f_embedding_scale);
1184+
}
1185+
1186+
// Register to graph and input system
1187+
cb(cur, "inp_embd_qwen3vl", -1);
1188+
res->add_input(std::move(inp));
1189+
1190+
return cur;
1191+
}
1192+
11441193
ggml_tensor * llm_graph_context::build_inp_pos() const {
11451194
auto inp = std::make_unique<llm_graph_input_pos>(hparams.n_pos_per_embd());
11461195

src/llama-graph.h

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -684,6 +684,8 @@ struct llm_graph_context {
684684
ggml_tensor * build_inp_pos_bucket_dec() const;
685685
ggml_tensor * build_pos_bias(ggml_tensor * pos_bucket, ggml_tensor * attn_rel_b) const;
686686

687+
ggml_tensor * build_qwen3vl_inp_embd(ggml_tensor * tok_embd) const;
688+
687689
//
688690
// attention
689691
//

0 commit comments

Comments (0)