modelscope · Jintao-Huang · Feb 18, 2025 · Feb 18, 2025 · Feb 18, 2025 · Feb 18, 2025
diff --git a/docs/source/Instruction/命令行参数.md b/docs/source/Instruction/命令行参数.md
@@ -551,8 +551,8 @@ App参数继承于[部署参数](#部署参数), [Web-UI参数](#Web-UI参数)
 - INIT_TTS: 默认为False
 - INIT_AUDIO: 默认为False
 
-### ovis1_6
-- MAX_PARTITION: 参考[这里](https://github.com/AIDC-AI/Ovis/blob/d248e34d755a95d24315c40e2489750a869c5dbc/ovis/model/modeling_ovis.py#L312)
+### ovis1_6, ovis2
+- MAX_PARTITION: 默认为9，参考[这里](https://github.com/AIDC-AI/Ovis/blob/d248e34d755a95d24315c40e2489750a869c5dbc/ovis/model/modeling_ovis.py#L312)
 
 ### mplug_owl3, mplug_owl3_241101
 - MAX_NUM_FRAMES: 默认为16，参考[这里](https://modelscope.cn/models/iic/mPLUG-Owl3-7B-240728)

diff --git a/docs/source_en/Instruction/Command-line-parameters.md b/docs/source_en/Instruction/Command-line-parameters.md
@@ -560,8 +560,8 @@ For the meaning of the arguments, please refer to [here](https://modelscope.cn/m
 - INIT_TTS: Default is False
 - INIT_AUDIO: Default is False
 
-### ovis1_6
-- MAX_PARTITION: Refer to [here](https://github.com/AIDC-AI/Ovis/blob/d248e34d755a95d24315c40e2489750a869c5dbc/ovis/model/modeling_ovis.py#L312)
+### ovis1_6, ovis2
+- MAX_PARTITION: Default is 9, refer to [here](https://github.com/AIDC-AI/Ovis/blob/d248e34d755a95d24315c40e2489750a869c5dbc/ovis/model/modeling_ovis.py#L312)
 
 ### mplug_owl3, mplug_owl3_241101
 - MAX_NUM_FRAMES: Default is 16, refer to [here](https://modelscope.cn/models/iic/mPLUG-Owl3-7B-240728)

diff --git a/swift/llm/model/model/qwen.py b/swift/llm/model/model/qwen.py
@@ -651,6 +651,9 @@ def get_model_tokenizer_qwen2_audio(*args, **kwargs):
 
 def get_model_tokenizer_ovis(*args, **kwargs):
     model, tokenizer = get_model_tokenizer_with_flash_attn(*args, **kwargs)
     if model is not None:
+        # Cast the vision submodules only when a model was actually loaded; model may be None here.
+        model.visual_tokenizer.to(model.dtype)
+        model.vte.to(model.dtype)
         model.generation_config.cache_implementation = None
         func_list = ['generate', 'forward', 'get_input_embeddings']

diff --git a/swift/llm/template/template/qwen.py b/swift/llm/template/template/qwen.py
@@ -372,6 +372,7 @@ def _encode(self, inputs: StdTemplateInputs) -> Dict[str, Any]:
 
     def _post_encode(self, model, inputs: Dict[str, Any]) -> Dict[str, Any]:
         padding_side = self.padding_side if self.is_training else 'left'
+        self.model.config.multimodal_max_length = self.max_length
         _, inputs_embeds, labels, attention_mask = self.model.merge_multimodal(
             text_input_ids=inputs['input_ids'],
             text_attention_masks=torch.ones_like(inputs['input_ids']),  # not use, only compat
@@ -407,7 +408,7 @@ def _data_collator(self, batch: List[Dict[str, Any]], *, padding_to: Optional[in
     ))
 
 register_template(
-    Qwen2_5TemplateMeta(
+    QwenTemplateMeta(
         MLLMTemplateType.ovis2,
         template_cls=Ovis1_6Template,
         placeholder_tokens=['<|image_pad|>', '<|video_pad|>'],

diff --git a/tests/test_align/test_template/test_vision.py b/tests/test_align/test_template/test_vision.py
@@ -8,7 +8,7 @@
 
 def _infer_model(pt_engine, system=None, messages=None, images=None):
     seed_everything(42)
-    request_config = RequestConfig(max_tokens=128, temperature=0)
+    request_config = RequestConfig(max_tokens=128, temperature=0, repetition_penalty=1)
     if messages is None:
         messages = []
         if system is not None:
@@ -269,8 +269,10 @@ def test_ovis1_6_llama3():
 
 def test_ovis2():
     pt_engine = PtEngine('AIDC-AI/Ovis2-2B')
-    response = _infer_model(pt_engine)
-    assert response == '这是一张插画风格的图像，展示了一只可爱的猫咪。这只猫有灰白相间的毛发，大大的蓝色眼睛和竖立的耳朵，显得非常可爱和迷人。背景模糊，突出了猫咪的细节和表情。'
+    response = _infer_model(pt_engine, messages=[{'role': 'user', 'content': 'Describe the image.'}])
+    assert response[:200] == (
+        'The image showcases a charming digital illustration of a young kitten. The kitten has striking blue '
+        'eyes and a mix of gray, white, and black fur, with distinctive black stripes on its head. Its ears a')
 
 
 def test_paligemma():
@@ -452,10 +454,10 @@ def test_ui_tars():
     # test_llava()
     # test_ovis1_6()
     # test_ovis1_6_llama3()
-    # test_ovis2()
+    test_ovis2()
     # test_yi_vl()
     # test_deepseek_vl()
-    test_deepseek_janus()
+    # test_deepseek_janus()
     # test_deepseek_vl2()
     # test_qwen_vl()
     # test_glm4v()