Commit 85107b6

fix conflicts

1 parent b913e89 commit 85107b6

File tree

13 files changed: +304 -212 lines

convert_hf_to_gguf.py

Lines changed: 0 additions & 29 deletions

@@ -3538,25 +3538,16 @@ def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
        return super().modify_tensors(data_torch, name, bid)


-<<<<<<< HEAD
@ModelBase.register("Qwen3VLForConditionalGeneration", "Qwen3VLMoeForConditionalGeneration")
class Qwen3VLVisionModel(MmprojModel):
-=======
-@ModelBase.register("Qwen3VLMoeForConditionalGeneration")
-class Qwen3VLMoeVisionModel(MmprojModel):
->>>>>>> remote-JJJYmmm/qwen3vl-1022
    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)
        assert self.has_vision_encoder
        assert self.hparams_vision is not None

        # Compute image_size if not present
        if "image_size" not in self.hparams_vision:
-<<<<<<< HEAD
            # For Qwen3VL/Qwen3VLMoe, compute from num_position_embeddings
-=======
-            # For Qwen3VLMoe, compute from num_position_embeddings
->>>>>>> remote-JJJYmmm/qwen3vl-1022
            num_pos = self.hparams_vision.get("num_position_embeddings", 2304)
            patch_size = self.hparams_vision.get("patch_size", 16)
            # num_position_embeddings = (image_size / patch_size) ** 2

@@ -3601,15 +3592,10 @@ def set_gguf_parameters(self):
        self.gguf_writer.add_vision_deepstack_layers(self.deepstack_layers)

    def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
-<<<<<<< HEAD
        # Skip text model tensors - they go in the text model file
        if name.startswith("model.language_model.") or name.startswith("lm_head."):
            return []

-=======
-        del bid # unused
-
->>>>>>> remote-JJJYmmm/qwen3vl-1022
        if name.startswith("model.visual."):
            name = name.replace("model.visual.", "visual.", 1)


@@ -3666,13 +3652,8 @@ def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
            ]

        if name == "visual.patch_embed.proj.bias":
-<<<<<<< HEAD
            # Include the bias - it's used by the C++ code
            return [(gguf.TENSOR_NAMES[gguf.MODEL_TENSOR.V_ENC_EMBD_PATCH] + ".bias", data_torch)]
-=======
-            # Skip bias for Qwen3VL - the C++ code expects it to be null
-            return []
->>>>>>> remote-JJJYmmm/qwen3vl-1022

        if name.startswith("visual."):
            if ".qkv." in name:

@@ -3695,12 +3676,8 @@ def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:

            return [(self.map_tensor_name(name), data_torch)]

-<<<<<<< HEAD
        # Fall back to parent class for other tensors
        return super().modify_tensors(data_torch, name, bid)
-=======
-        return []
->>>>>>> remote-JJJYmmm/qwen3vl-1022

@ModelBase.register("InternVisionModel")
class InternVisionModel(MmprojModel):

@@ -4026,7 +4003,6 @@ def set_vocab(self):
        super().set_vocab()


-<<<<<<< HEAD
@ModelBase.register("Qwen3VLForConditionalGeneration")
class Qwen3VLTextModel(Qwen3Model):
    model_arch = gguf.MODEL_ARCH.QWEN3VL

@@ -4056,8 +4032,6 @@ def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
        return super().modify_tensors(data_torch, name, bid)


-=======
->>>>>>> remote-JJJYmmm/qwen3vl-1022
@ModelBase.register("Qwen3VLMoeForConditionalGeneration")
class Qwen3VLMoeTextModel(Qwen3MoeModel):
    model_arch = gguf.MODEL_ARCH.QWEN3VLMOE

@@ -4079,16 +4053,13 @@ def set_gguf_parameters(self):

        logger.info(f"MRoPE sections: {mrope_section[:4]}")

-<<<<<<< HEAD
    def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
        # Skip vision tensors - they go in the mmproj file
        if name.startswith("model.visual."):
            return []

        return super().modify_tensors(data_torch, name, bid)
-=======
->>>>>>> remote-JJJYmmm/qwen3vl-1022

@ModelBase.register("GPT2LMHeadModel")
class GPT2Model(TextModel):
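
Note on the resolution above: the HEAD side is kept throughout, so a single Qwen3VLVisionModel handles both Qwen3VL and Qwen3VLMoe, the patch-embed bias is exported for the C++ side, and tensors are routed between the mmproj and text-model outputs rather than dropped. A minimal Python sketch of that routing convention (route_tensor is a hypothetical helper, not part of the script):

# Hypothetical helper mirroring the split resolved above: vision tensors go to
# the mmproj file, everything else stays with the text model.
def route_tensor(name: str) -> str:
    if name.startswith("model.visual."):
        return "mmproj"   # converted by Qwen3VLVisionModel.modify_tensors
    if name.startswith("model.language_model.") or name.startswith("lm_head."):
        return "text"     # skipped by the vision converter, kept by the text converters
    return "text"         # remaining tensors fall back to the parent class

assert route_tensor("model.visual.patch_embed.proj.bias") == "mmproj"
assert route_tensor("lm_head.weight") == "text"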

ggml/src/ggml-cpu/ops.cpp

Lines changed: 0 additions & 11 deletions

@@ -5516,16 +5516,6 @@ static void ggml_mrope_cache_init(

        float theta = theta_t;

-<<<<<<< HEAD
-        if (sector >= sections[0] && sector < sec_w) {
-            theta = theta_h;
-        }
-        else if (sector >= sec_w && sector < sec_w + sections[2]) {
-            theta = theta_w;
-        }
-        else if (sector >= sec_w + sections[2]) {
-            theta = theta_e;
-=======
        if (is_interleaved_mrope) {
            // thwthwthw...ttt
            if (sector % 3 == 1 && sector < 3 * sections[1]) {

@@ -5545,7 +5535,6 @@ static void ggml_mrope_cache_init(
            else if (sector >= sec_w + sections[2]) {
                theta = theta_e;
            }
->>>>>>> remote-JJJYmmm/qwen3vl-1022
        }

        rope_yarn(
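
Note: the resolution keeps the incoming interleaved M-RoPE branch (comment "thwthwthw...ttt") and drops the HEAD-only contiguous sector mapping. A rough Python sketch of how interleaved sectors map to axes, assuming sections holds the [t, h, w, e] section sizes as in the surrounding code and that the w axis follows the same modular pattern shown for h (illustration only, not the ggml implementation):

def interleaved_axis(sector: int, sections: list[int]) -> str:
    # Pattern "thwthwthw...ttt": h and w sectors are interleaved with t in
    # groups of three; sectors beyond those groups are treated as temporal (t).
    if sector % 3 == 1 and sector < 3 * sections[1]:
        return "h"
    if sector % 3 == 2 and sector < 3 * sections[2]:
        return "w"
    return "t"

# e.g. sections = [24, 20, 20, 0] yields t, h, w, t, h, w, ... for the first sectors
print("".join(interleaved_axis(s, [24, 20, 20, 0]) for s in range(9)))  # thwthwthw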

gguf-py/gguf/constants.py

Lines changed: 0 additions & 9 deletions

@@ -352,10 +352,7 @@ class MODEL_ARCH(IntEnum):
    QWEN2VL = auto()
    QWEN3 = auto()
    QWEN3MOE = auto()
-<<<<<<< HEAD
    QWEN3VL = auto()
-=======
->>>>>>> remote-JJJYmmm/qwen3vl-1022
    QWEN3VLMOE = auto()
    PHI2 = auto()
    PHI3 = auto()

@@ -706,10 +703,7 @@ class MODEL_TENSOR(IntEnum):
    MODEL_ARCH.QWEN2VL: "qwen2vl",
    MODEL_ARCH.QWEN3: "qwen3",
    MODEL_ARCH.QWEN3MOE: "qwen3moe",
-<<<<<<< HEAD
    MODEL_ARCH.QWEN3VL: "qwen3vl",
-=======
->>>>>>> remote-JJJYmmm/qwen3vl-1022
    MODEL_ARCH.QWEN3VLMOE: "qwen3vlmoe",
    MODEL_ARCH.PHI2: "phi2",
    MODEL_ARCH.PHI3: "phi3",

@@ -1518,7 +1512,6 @@ class MODEL_TENSOR(IntEnum):
        MODEL_TENSOR.FFN_DOWN_EXP,
        MODEL_TENSOR.FFN_UP_EXP,
    ],
-<<<<<<< HEAD
    MODEL_ARCH.QWEN3VL: [
        MODEL_TENSOR.TOKEN_EMBD,
        MODEL_TENSOR.OUTPUT_NORM,

@@ -1536,8 +1529,6 @@ class MODEL_TENSOR(IntEnum):
        MODEL_TENSOR.FFN_DOWN,
        MODEL_TENSOR.FFN_UP,
    ],
-=======
->>>>>>> remote-JJJYmmm/qwen3vl-1022
    MODEL_ARCH.QWEN3VLMOE: [
        MODEL_TENSOR.TOKEN_EMBD,
        MODEL_TENSOR.OUTPUT_NORM,

include/llama.h

Lines changed: 1 addition & 1 deletion

@@ -232,7 +232,7 @@ extern "C" {

        llama_token * token;
        float * embd;
-        llama_pos * pos; // first `n_tokens` elements are always linearly increasing position for traditional llm
+        llama_pos * pos;
        int32_t * n_seq_id;
        llama_seq_id ** seq_id;
        int8_t * logits; // TODO: rename this to "output"

src/llama-arch.cpp

Lines changed: 0 additions & 6 deletions

@@ -32,10 +32,7 @@ static const std::map<llm_arch, const char *> LLM_ARCH_NAMES = {
    { LLM_ARCH_QWEN2VL, "qwen2vl" },
    { LLM_ARCH_QWEN3, "qwen3" },
    { LLM_ARCH_QWEN3MOE, "qwen3moe" },
-<<<<<<< HEAD
    { LLM_ARCH_QWEN3_VL, "qwen3vl" },
-=======
->>>>>>> remote-JJJYmmm/qwen3vl-1022
    { LLM_ARCH_QWEN3_VL_MOE, "qwen3vlmoe" },
    { LLM_ARCH_PHI2, "phi2" },
    { LLM_ARCH_PHI3, "phi3" },

@@ -785,7 +782,6 @@ static const std::map<llm_arch, std::map<llm_tensor, const char *>> LLM_TENSOR_NAMES = {
        },
    },
    {
-<<<<<<< HEAD
        LLM_ARCH_QWEN3_VL,
        {
            { LLM_TENSOR_TOKEN_EMBD, "token_embd" },

@@ -805,8 +801,6 @@ static const std::map<llm_arch, std::map<llm_tensor, const char *>> LLM_TENSOR_NAMES = {
        },
    },
    {
-=======
->>>>>>> remote-JJJYmmm/qwen3vl-1022
        LLM_ARCH_QWEN3_VL_MOE,
        {
            { LLM_TENSOR_TOKEN_EMBD, "token_embd" },

src/llama-arch.h

Lines changed: 0 additions & 3 deletions

@@ -36,10 +36,7 @@ enum llm_arch {
    LLM_ARCH_QWEN2VL,
    LLM_ARCH_QWEN3,
    LLM_ARCH_QWEN3MOE,
-<<<<<<< HEAD
    LLM_ARCH_QWEN3_VL,
-=======
->>>>>>> remote-JJJYmmm/qwen3vl-1022
    LLM_ARCH_QWEN3_VL_MOE,
    LLM_ARCH_PHI2,
    LLM_ARCH_PHI3,

src/llama-batch.cpp

Lines changed: 19 additions & 3 deletions

@@ -259,7 +259,23 @@ bool llama_batch_allocr::init(
        const llama_pos p0 = memory ? memory->seq_pos_max(s) : -1;

        if (p0 >= 0) {
-            if (seq_pos_min(s) != p0 + 1) {
+            bool ok = true;
+
+            if (batch.token) {
+                if (seq_pos_min(s) != p0 + 1) {
+                    ok = false;
+                }
+            } else {
+                assert(batch.embd);
+
+                // for embeddings (typically used as vision input), we allow them to have repeating positions
+                // ref: https://github.com/ggml-org/llama.cpp/issues/13694#issuecomment-2983871762
+                if (seq_pos_min(s) != p0 && seq_pos_min(s) != p0 + 1) {
+                    ok = false;
+                }
+            }
+
+            if (!ok) {
                LLAMA_LOG_ERROR(
                    "%s: the tokens of sequence %d in the input batch have inconsistent sequence positions:\n"
                    " - the last position stored in the memory module of the context (i.e. the KV cache) for sequence %d is X = %d\n"

@@ -639,7 +655,7 @@ llama_ubatch llama_batch_allocr::ubatch_add(const std::vector<int32_t> & idxs, u

    auto udata = std::make_shared<llama_ubatch::data_t>();

-    const int32_t n_pos_cur = batch.embd ? (n_pos_per_embd + 1) : 1;
+    const int32_t n_pos_cur = batch.embd ? n_pos_per_embd : 1;

    const int64_t n_embd_all = batch.embd ? (int64_t) n_tokens*n_embd : 0;
    const int64_t n_pos_all = (int64_t) n_tokens*n_pos_cur;

@@ -665,7 +681,7 @@ llama_ubatch llama_batch_allocr::ubatch_add(const std::vector<int32_t> & idxs, u
        }

        for (int j = 0; j < n_pos_cur; ++j) {
-            udata->pos[j * n_tokens + i] = batch.pos[j * batch.n_tokens + idxs[i]];
+            udata->pos[j*n_tokens + i] = batch.pos[j*batch.n_tokens + idxs[i]];
        }

        udata->n_seq_id[i] = batch.n_seq_id[idxs[i]];
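
Note: the new check only relaxes position continuity for embedding batches. Token batches must still start at exactly p0 + 1, while embedding inputs (typically vision patches with M-RoPE positions) may also start at p0, i.e. repeat the last stored position. A small Python sketch of the accepted cases, where p0 is the last position in the KV cache for the sequence (helper name is illustrative):

def positions_ok(seq_pos_min: int, p0: int, is_embedding: bool) -> bool:
    if p0 < 0:                # nothing stored for this sequence yet, no check
        return True
    if not is_embedding:      # token batch: must continue right after p0
        return seq_pos_min == p0 + 1
    # embedding batch: repeating the last position is also accepted
    return seq_pos_min in (p0, p0 + 1)

assert positions_ok(11, 10, is_embedding=False)
assert positions_ok(10, 10, is_embedding=True)
assert not positions_ok(10, 10, is_embedding=False)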

src/llama-graph.cpp

Lines changed: 1 addition & 7 deletions

@@ -54,13 +54,7 @@ void llm_graph_input_pos::set_input(const llama_ubatch * ubatch) {
            }
            ggml_backend_tensor_set(pos, pos_data.data(), 0, pos_data.size()*ggml_element_size(pos));
        } else {
-            llama_pos * pos_ptr = ubatch->pos;
-            // Normally, ubatch->pos stores linearly increasing position
-            // However, some multi-modal models requires special position embedding (e.g. M-Rope in qwen2vl and qwen2.5vl)
-            // But linearly increasing position is still needed for proper causal attention masking
-            // So we store both of them: the first n_tokens elements are not changed, while model-specific positions are appended after that.
-            if (ubatch->embd && n_pos_per_embd > 1) pos_ptr += n_tokens; // use mrope positions
-            ggml_backend_tensor_set(pos, pos_ptr, 0, n_tokens * n_pos_per_embd * ggml_element_size(pos));
+            ggml_backend_tensor_set(pos, ubatch->pos, 0, n_tokens*n_pos_per_embd*ggml_element_size(pos));
        }
    }
}
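
Note: together with the llama-batch.cpp change, this drops the extra row of linearly increasing positions that used to be prepended for M-RoPE embedding batches; ubatch->pos now holds exactly n_pos_per_embd rows of n_tokens positions each, and the graph input copies them as-is. A sketch of what such a buffer might contain for a 2x2 grid of image-patch embeddings, assuming four M-RoPE components (t, h, w, e) as suggested by the mrope_section logging above (values are purely illustrative):

n_tokens = 4          # a 2x2 grid of image patches
n_pos_per_embd = 4    # assumed M-RoPE components: t, h, w, e
t0 = 10               # patches of one image may share the same temporal position

# row-major layout matching udata->pos[j*n_tokens + i]: component j of token i
pos = [
    t0, t0, t0, t0,   # t: repeated across the image
    0,  0,  1,  1,    # h: patch row index
    0,  1,  0,  1,    # w: patch column index
    0,  0,  0,  0,    # e: unused here
]
assert len(pos) == n_tokens * n_pos_per_embd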

src/llama-graph.h

Lines changed: 0 additions & 1 deletion

@@ -687,7 +687,6 @@ struct llm_graph_context {
    ggml_tensor * build_inp_pos_bucket_enc() const;
    ggml_tensor * build_inp_pos_bucket_dec() const;
    ggml_tensor * build_pos_bias(ggml_tensor * pos_bucket, ggml_tensor * attn_rel_b) const;
-
    ggml_tensor * build_qwen3vl_inp_embd(ggml_tensor * tok_embd) const;

    //
