Commit eba4b00

0807-fix-qwen2vl
1 parent 96a0afb commit eba4b00

12 files changed: +456, -404 lines
lightllm/models/qwen2_5_vl/layer_weights/pre_and_post_layer_weight.py

Lines changed: 27 additions & 0 deletions
@@ -0,0 +1,27 @@
+import torch
+import numpy as np
+from lightllm.utils.envs_utils import get_env_start_args
+from transformers.configuration_utils import PretrainedConfig
+from lightllm.models.qwen2.layer_weights.pre_and_post_layer_weight import Qwen2PreAndPostLayerWeight
+from lightllm.models.qwen2_5_vl.qwen2_5_visual import Qwen2_5VLTransformer
+
+
+def build_visual_model(args, data_type: torch.dtype):
+    if args.disable_extra_process_for_multimodal:
+        kvargs = {
+            "weight_dir": args.model_dir,
+            "data_type": args.data_type,
+            "quant_type": args.vit_quant_type,
+            "quant_cfg": args.vit_quant_cfg,
+            "max_batch_size": args.visual_infer_batch_size,
+        }
+        model_cfg, _ = PretrainedConfig.get_config_dict(kvargs["weight_dir"])
+        return Qwen2_5VLTransformer(kvargs=kvargs, **model_cfg["vision_config"]).eval().to(dtype=data_type)
+    return None
+
+
+class Qwen2_5VLPreAndPostLayerWeight(Qwen2PreAndPostLayerWeight):
+    def __init__(self, data_type, network_config, mode):
+        super().__init__(data_type, network_config, mode)
+        self.visual_model = build_visual_model(get_env_start_args(), data_type)
+        return

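A minimal usage sketch for the new helper (not part of the commit): the args object below is a hypothetical stand-in for lightllm's parsed start args, and the model path is a placeholder.

# Sketch only: when --disable_extra_process_for_multimodal is set, the ViT is built
# inside the main process; otherwise build_visual_model returns None and the separate
# visual process keeps handling image encoding.
import torch
from types import SimpleNamespace

args = SimpleNamespace(
    disable_extra_process_for_multimodal=True,
    model_dir="/path/to/Qwen2.5-VL",          # placeholder weight directory
    data_type="bf16",
    vit_quant_type=None,
    vit_quant_cfg=None,
    visual_infer_batch_size=1,
)
visual_model = build_visual_model(args, data_type=torch.bfloat16)
# -> Qwen2_5VLTransformer in eval mode, or None when the flag is off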
lightllm/models/qwen2_5_vl/qwen2_5_visual.py

Lines changed: 29 additions & 8 deletions
@@ -16,7 +16,7 @@
 from torch.nn import LayerNorm
 from transformers.activations import ACT2FN
 import math
-from lightllm.models.qwen2_vl.vision_process import get_image, Qwen2VLImageProcessor
+from lightllm.models.qwen2_vl.vision_process import resize_image, Qwen2VLImageProcessor
 from transformers import AutoProcessor
 from safetensors import safe_open
 from transformers.utils import TensorType
@@ -212,9 +212,10 @@ def forward(self, x: torch.Tensor) -> torch.Tensor:
         return x


-class Qwen2_5_VisionTransformerPretrainedModel(nn.Module):
+class Qwen2_5VLTransformer(nn.Module):
     def __init__(
         self,
+        weight_dir,
         depth=32,
         hidden_size=3584,
         hidden_act="silu",
@@ -278,6 +279,11 @@ def __init__(

         self.gradient_checkpointing = False

+        processor_config_path = os.path.join(weight_dir, "preprocessor_config.json")
+        with open(processor_config_path, "r") as f:
+            processor_config_dict = json.load(f)
+        self.processor = Qwen2VLImageProcessor(**processor_config_dict)
+
         self.device = self.get_device()
         self.dtype = self.get_dtype()

@@ -416,12 +422,27 @@ def forward(self, hidden_states: torch.Tensor, grid_thw: torch.Tensor) -> torch.

         return hidden_states

-    def load_model(self, weight_dir):
+    def load_image(self, img: List[ImageItem]):
+        pixel_values = None
+        if isinstance(img, ImageItem):
+            image_data = read_shm(get_shm_name_data(img.uuid))
+            image_data = Image.open(BytesIO(image_data))
+            image_data = resize_image(image_data)
+            image_inputs = self.processor.preprocess(images=image_data, return_tensors="pt")
+            pixel_values = image_inputs["pixel_values"].to(dtype=torch.bfloat16)
+            image_grid_thw = image_inputs["image_grid_thw"]
+        elif isinstance(img, dict):
+            image_data = read_shm(get_shm_name_data(img["uuid"]))
+            image_data = Image.open(BytesIO(image_data))
+            image_data = resize_image(image_data)
+            image_inputs = self.processor.preprocess(images=image_data, return_tensors="pt")
+            pixel_values = image_inputs["pixel_values"].to(dtype=torch.bfloat16)
+            image_grid_thw = image_inputs["image_grid_thw"]
+        else:
+            raise Exception("Unsupport input types: {} for {}".format(type(img), img))
+        return pixel_values.to(dtype=self.get_dtype()), image_grid_thw

-        processor_config_path = os.path.join(weight_dir, "preprocessor_config.json")
-        with open(processor_config_path, "r") as f:
-            processor_config_dict = json.load(f)
-        self.processor = Qwen2VLImageProcessor(**processor_config_dict)
+    def load_model(self, weight_dir):

         bin_weight_files = [file_ for file_ in os.listdir(weight_dir) if file_.endswith(".bin")]
         if bin_weight_files:
@@ -455,7 +476,7 @@ def encode(self, images: List[ImageItem]):
             uuids.append(img.uuid)
             image_data = read_shm(get_shm_name_data(img.uuid))
             image_data = Image.open(BytesIO(image_data))
-            image_data = get_image(image_data)
+            image_data = resize_image(image_data)
             image_inputs = self.processor.preprocess(images=image_data, return_tensors="pt")
             pixel_values = image_inputs["pixel_values"].to(dtype=torch.bfloat16)
             image_grid_thw = image_inputs["image_grid_thw"]
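Rough usage of the new load_image path (not part of the commit): it assumes vit is an initialized Qwen2_5VLTransformer and img is an ImageItem whose raw bytes were already written to shared memory under img.uuid.

# Sketch only: load_image reads the image bytes from shared memory, resizes them,
# runs the Qwen2VLImageProcessor, and returns the patch tensor plus its (t, h, w) grid.
pixel_values, image_grid_thw = vit.load_image(img)
# forward(hidden_states, grid_thw) then turns the patches into vision embeddings;
# device placement is left to the caller in this sketch.
image_embeds = vit(pixel_values.to(vit.device), image_grid_thw)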
lightllm/models/qwen2_vl/layer_weights/pre_and_post_layer_weight.py

Lines changed: 27 additions & 0 deletions
@@ -0,0 +1,27 @@
+import torch
+import numpy as np
+from lightllm.utils.envs_utils import get_env_start_args
+from transformers.configuration_utils import PretrainedConfig
+from lightllm.models.qwen2.layer_weights.pre_and_post_layer_weight import Qwen2PreAndPostLayerWeight
+from lightllm.models.qwen2_vl.qwen2_visual import Qwen2VLTransformer
+
+
+def build_visual_model(args, data_type: torch.dtype):
+    if args.disable_extra_process_for_multimodal:
+        kvargs = {
+            "weight_dir": args.model_dir,
+            "data_type": args.data_type,
+            "quant_type": args.vit_quant_type,
+            "quant_cfg": args.vit_quant_cfg,
+            "max_batch_size": args.visual_infer_batch_size,
+        }
+        model_cfg, _ = PretrainedConfig.get_config_dict(kvargs["weight_dir"])
+        return Qwen2VLTransformer(kvargs=kvargs, **model_cfg["vision_config"]).eval().to(dtype=data_type)
+    return None
+
+
+class Qwen2VLPreAndPostLayerWeight(Qwen2PreAndPostLayerWeight):
+    def __init__(self, data_type, network_config, mode):
+        super().__init__(data_type, network_config, mode)
+        self.visual_model = build_visual_model(get_env_start_args(), data_type)
+        return

lightllm/models/qwen2_vl/model.py

Lines changed: 35 additions & 1 deletion
@@ -16,6 +16,8 @@
 from lightllm.common.build_utils import repair_config
 from lightllm.models.registry import ModelRegistry
 from lightllm.models.qwen2_vl.infer_struct import Qwen2VLInferStateInfo
+from lightllm.models.qwen2_vl.layer_weights.pre_and_post_layer_weight import Qwen2VLPreAndPostLayerWeight
+from lightllm.models.qwen2_5_vl.layer_weights.pre_and_post_layer_weight import Qwen2_5VLPreAndPostLayerWeight
 from lightllm.models.qwen2_vl.layer_infer.transformer_layer_infer import Qwen2VLTransformerLayerInfer

 import torch
@@ -93,12 +95,44 @@ def encode(self, prompt, multimodal_params: MultimodalParams = None, **kwargs):
         return input_ids


-@ModelRegistry(["qwen2_vl", "qwen2_5_vl"], is_multimodal=True)
+@ModelRegistry(["qwen2_vl"], is_multimodal=True)
 class Qwen2VLTpPartModel(Qwen2TpPartModel):

     pre_layer_infer_class = LlamaMultimodalPreLayerInfer
     transformer_layer_infer_class = Qwen2VLTransformerLayerInfer

+    pre_and_post_weight_class = Qwen2VLPreAndPostLayerWeight
+
+    infer_state_class = Qwen2VLInferStateInfo
+
+    def __init__(self, kvargs):
+        super().__init__(kvargs)
+        return
+
+    def _init_inferstate_cls(self):
+        if get_env_start_args().enable_fa3:
+            self.infer_state_class = Qwen2VLFlashAttentionStateInfo
+
+    def _init_config(self):
+        with open(os.path.join(self.weight_dir_, "config.json"), "r") as json_file:
+            self.config = json.load(json_file)
+        # rename keys
+        repair_config(self.config, same_names=["num_attention_heads", "n_head"])
+        repair_config(self.config, same_names=["hidden_size", "n_embd", "n_embed"])
+        repair_config(self.config, same_names=["num_hidden_layers", "n_layer"])
+        if self.finetune_config:
+            self.config["vocab_size"] = self.finetune_config.vocab_size
+        return
+
+
+@ModelRegistry(["qwen2_5_vl"], is_multimodal=True)
+class Qwen2_5VLTpPartModel(Qwen2TpPartModel):
+
+    pre_layer_infer_class = LlamaMultimodalPreLayerInfer
+    transformer_layer_infer_class = Qwen2VLTransformerLayerInfer
+
+    pre_and_post_weight_class = Qwen2_5VLPreAndPostLayerWeight
+
     infer_state_class = Qwen2VLInferStateInfo

     def __init__(self, kvargs):

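The split registration means each model type now binds its own pre_and_post_weight_class (and therefore its own in-process ViT builder). Below is a self-contained sketch of the decorator-registry idea being relied on here; it is an illustration only, not lightllm's actual ModelRegistry implementation.

# Toy registry illustrating why the combined ["qwen2_vl", "qwen2_5_vl"] registration
# was split: each model_type must resolve to a class carrying its own weight class.
_MODEL_REGISTRY = {}

def register(model_types):
    def deco(cls):
        for name in model_types:
            _MODEL_REGISTRY[name] = cls
        return cls
    return deco

@register(["qwen2_vl"])
class ToyQwen2VL:  # stands in for Qwen2VLTpPartModel
    pre_and_post_weight_class = "Qwen2VLPreAndPostLayerWeight"

@register(["qwen2_5_vl"])
class ToyQwen2_5VL:  # stands in for Qwen2_5VLTpPartModel
    pre_and_post_weight_class = "Qwen2_5VLPreAndPostLayerWeight"

# config.json's model_type picks the model class, which picks the weight class:
assert _MODEL_REGISTRY["qwen2_5_vl"].pre_and_post_weight_class == "Qwen2_5VLPreAndPostLayerWeight"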