[Model] Support SigLIP encoder and alternative decoders for LLaVA models #7153

Merged (19 commits) on Aug 6, 2024. Changes shown from all commits.
3 changes: 3 additions & 0 deletions requirements-test.txt
@@ -20,6 +20,9 @@ sentence-transformers # required for embedding
compressed-tensors==0.4.0 # required for compressed-tensors
timm # required for internvl test

# TODO: Add this after fully implementing llava(mantis)
# git+https://github.com/TIGER-AI-Lab/Mantis.git # required for llava(mantis) test

# Benchmarking
aiohttp

36 changes: 31 additions & 5 deletions tests/models/test_llava.py
@@ -1,10 +1,11 @@
from typing import List, Optional, Tuple, Type

import pytest
from transformers import AutoTokenizer
from transformers import AutoConfig, AutoTokenizer

from vllm.multimodal.utils import rescale_image_size
from vllm.sequence import SampleLogprobs
from vllm.utils import STR_DTYPE_TO_TORCH_DTYPE

from ..conftest import IMAGE_ASSETS, HfRunner, VllmRunner, _ImageAssets
from .utils import check_logprobs_close
@@ -18,9 +19,11 @@
"USER: <image>\nWhat is the season?\nASSISTANT:",
})

IMAGE_TOKEN_ID = 32000

models = ["llava-hf/llava-1.5-7b-hf"]
models = [
"llava-hf/llava-1.5-7b-hf",
# TODO: Get this model to produce meaningful output in vLLM
# "TIGER-Lab/Mantis-8B-siglip-llama3",
]


def vllm_to_hf_output(vllm_output: Tuple[List[int], str,
@@ -29,12 +32,15 @@ def vllm_to_hf_output(vllm_output: Tuple[List[int], str,
"""Sanitize vllm output to be comparable with hf output."""
output_ids, output_str, out_logprobs = vllm_output

config = AutoConfig.from_pretrained(model)
image_token_id = config.image_token_index

tokenizer = AutoTokenizer.from_pretrained(model)
eos_token_id = tokenizer.eos_token_id

hf_output_ids = [
token_id for idx, token_id in enumerate(output_ids)
if token_id != IMAGE_TOKEN_ID or output_ids[idx - 1] != IMAGE_TOKEN_ID
if token_id != image_token_id or output_ids[idx - 1] != image_token_id
]

assert output_str[0] == " "
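
For reference, a toy standalone run of the filter above (made-up output IDs; in the updated test the image token ID comes from config.image_token_index rather than a hard-coded constant):

# Consecutive image-placeholder tokens emitted by vLLM are collapsed to a
# single occurrence so the sequence lines up with HF's output.
image_token_id = 32000
output_ids = [1, 32000, 32000, 32000, 306, 626, 263]
hf_output_ids = [
    token_id for idx, token_id in enumerate(output_ids)
    if token_id != image_token_id or output_ids[idx - 1] != image_token_id
]
assert hf_output_ids == [1, 32000, 306, 626, 263]
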
@@ -67,6 +73,17 @@ def run_test(
Note, the text input is also adjusted to abide by vllm contract.
The text output is sanitized to be able to compare with hf.
"""
# NOTE: For local use; this isn't tested in CI yet (see TODO above)
if model.startswith("TIGER-Lab/Mantis"):
from mantis.models.mllava import MLlavaProcessor

torch_dtype = STR_DTYPE_TO_TORCH_DTYPE[dtype]
mantis_processor = MLlavaProcessor.from_pretrained(
model, torch_dtype=torch_dtype)
assert isinstance(mantis_processor, MLlavaProcessor)
else:
mantis_processor = None

images = [asset.pil_image for asset in image_assets]

inputs_per_image = [(
@@ -94,6 +111,15 @@
]

with hf_runner(model, dtype=dtype, is_vision_model=True) as hf_model:
if mantis_processor is not None:

def process(*args, **kwargs):
output = mantis_processor(*args, **kwargs)
output["pixel_values"] = output["pixel_values"].to(torch_dtype)
return output

hf_model.processor = process

hf_outputs_per_image = [
hf_model.generate_greedy_logprobs_limit(prompts,
max_tokens,
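
The wrapper above exists because HF image processors typically return float32 pixel_values, while the test may run the model in half precision; casting inside the processor keeps the HF runner's inputs consistent with the model dtype. A minimal sketch of the same wrap-and-cast pattern against the stock HF LLaVA processor (illustrative only; the Mantis processor is substituted the same way when running locally):

import torch
from transformers import AutoProcessor

processor = AutoProcessor.from_pretrained("llava-hf/llava-1.5-7b-hf")

def process(*args, torch_dtype=torch.half, **kwargs):
    # Delegate to the underlying processor, then cast the image tensor.
    output = processor(*args, **kwargs)
    output["pixel_values"] = output["pixel_values"].to(torch_dtype)
    return output

# Example call (prompt/image are placeholders):
# batch = process(text=prompt, images=image, return_tensors="pt")
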
9 changes: 5 additions & 4 deletions tests/models/test_llava_next.py
@@ -1,7 +1,7 @@
from typing import List, Optional, Tuple, Type, overload

import pytest
from transformers import AutoTokenizer
from transformers import AutoConfig, AutoTokenizer

from vllm.multimodal.utils import rescale_image_size
from vllm.sequence import SampleLogprobs
@@ -23,8 +23,6 @@
f"{_PREFACE} USER: <image>\nWhat is the season? ASSISTANT:",
})

IMAGE_TOKEN_ID = 32000

models = ["llava-hf/llava-v1.6-vicuna-7b-hf"]


@@ -34,12 +32,15 @@ def vllm_to_hf_output(vllm_output: Tuple[List[int], str,
"""Sanitize vllm output to be comparable with hf output."""
output_ids, output_str, out_logprobs = vllm_output

config = AutoConfig.from_pretrained(model)
image_token_id = config.image_token_index

tokenizer = AutoTokenizer.from_pretrained(model)
eos_token_id = tokenizer.eos_token_id

hf_output_ids = [
token_id for idx, token_id in enumerate(output_ids)
if token_id != IMAGE_TOKEN_ID or output_ids[idx - 1] != IMAGE_TOKEN_ID
if token_id != image_token_id or output_ids[idx - 1] != image_token_id
]

assert output_str[0] == " "
9 changes: 5 additions & 4 deletions tests/models/test_paligemma.py
@@ -2,7 +2,7 @@
from typing import List, Optional, Tuple, Type

import pytest
from transformers import AutoTokenizer
from transformers import AutoConfig, AutoTokenizer

from vllm.multimodal.utils import rescale_image_size
from vllm.sequence import SampleLogprobs
@@ -20,8 +20,6 @@
"What is in the picture?",
})

IMAGE_TOKEN_ID = 257152

models = ["google/paligemma-3b-mix-224"]

# ROCm Triton FA can run into compilation issues with these models due to,
@@ -37,12 +35,15 @@ def vllm_to_hf_output(vllm_output: Tuple[List[int], str,
"""Sanitize vllm output to be comparable with hf output."""
output_ids, output_str, out_logprobs = vllm_output

config = AutoConfig.from_pretrained(model)
image_token_id = config.image_token_index

tokenizer = AutoTokenizer.from_pretrained(model)
eos_token_id = tokenizer.eos_token_id

hf_output_ids = [
token_id for idx, token_id in enumerate(output_ids)
if token_id != IMAGE_TOKEN_ID or output_ids[idx - 1] != IMAGE_TOKEN_ID
if token_id != image_token_id or output_ids[idx - 1] != image_token_id
]

hf_output_str = output_str
2 changes: 1 addition & 1 deletion tests/models/test_registry.py
@@ -6,4 +6,4 @@
@pytest.mark.parametrize("model_cls", _MODELS)
def test_registry_imports(model_cls):
# Ensure all model classes can be imported successfully
ModelRegistry.load_model_cls(model_cls)
ModelRegistry.resolve_model_cls([model_cls])
38 changes: 28 additions & 10 deletions vllm/model_executor/model_loader/loader.py
@@ -16,7 +16,7 @@
import torch
from huggingface_hub import HfApi, hf_hub_download
from torch import nn
from transformers import AutoModelForCausalLM
from transformers import AutoModelForCausalLM, PretrainedConfig

from vllm.config import (CacheConfig, DeviceConfig, LoadConfig, LoadFormat,
LoRAConfig, ModelConfig, MultiModalConfig,
@@ -143,6 +143,22 @@ def _get_model_initialization_kwargs(
return extra_kwargs


def build_model(model_class: Type[nn.Module], hf_config: PretrainedConfig,
cache_config: Optional[CacheConfig],
quant_config: Optional[QuantizationConfig], *,
lora_config: Optional[LoRAConfig],
multimodal_config: Optional[MultiModalConfig],
scheduler_config: Optional[SchedulerConfig]) -> nn.Module:
extra_kwargs = _get_model_initialization_kwargs(model_class, lora_config,
multimodal_config,
scheduler_config)

return model_class(config=hf_config,
cache_config=cache_config,
quant_config=quant_config,
**extra_kwargs)


def _initialize_model(
model_config: ModelConfig,
load_config: LoadConfig,
@@ -151,15 +167,17 @@ def _initialize_model(
cache_config: CacheConfig,
scheduler_config: Optional[SchedulerConfig] = None) -> nn.Module:
"""Initialize a model with the given configurations."""
model_class = get_model_architecture(model_config)[0]
quant_config = _get_quantization_config(model_config, load_config)

return model_class(config=model_config.hf_config,
cache_config=cache_config,
quant_config=quant_config,
**_get_model_initialization_kwargs(
model_class, lora_config, multimodal_config,
scheduler_config))
model_class, _ = get_model_architecture(model_config)

return build_model(
model_class,
model_config.hf_config,
quant_config=_get_quantization_config(model_config, load_config),
lora_config=lora_config,
multimodal_config=multimodal_config,
cache_config=cache_config,
scheduler_config=scheduler_config,
)


class BaseModelLoader(ABC):
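
build_model is the piece of _initialize_model that composite (e.g. vision-language) models can now reuse to construct their text backbone through the same code path as top-level models. A rough sketch of such a caller (hypothetical function name; the helper this PR actually adds for this purpose is init_vllm_registered_model in vllm/model_executor/models/utils.py, which is not shown in this diff):

from vllm.model_executor.model_loader.loader import build_model
from vllm.model_executor.models import ModelRegistry

def init_text_backbone(text_config, cache_config, quant_config):
    # Resolve the decoder class named in the sub-config, then build it with
    # the same arguments the top-level loader would pass.
    model_cls, _ = ModelRegistry.resolve_model_cls(text_config.architectures)
    return build_model(model_cls,
                       text_config,
                       cache_config,
                       quant_config,
                       lora_config=None,
                       multimodal_config=None,
                       scheduler_config=None)
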
8 changes: 1 addition & 7 deletions vllm/model_executor/model_loader/utils.py
@@ -28,13 +28,7 @@ def get_model_architecture(
and "MixtralForCausalLM" in architectures):
architectures = ["QuantMixtralForCausalLM"]

for arch in architectures:
model_cls = ModelRegistry.load_model_cls(arch)
if model_cls is not None:
return (model_cls, arch)
raise ValueError(
f"Model architectures {architectures} are not supported for now. "
f"Supported architectures: {ModelRegistry.get_supported_archs()}")
return ModelRegistry.resolve_model_cls(architectures)


def get_architecture_class_name(model_config: ModelConfig) -> str:
16 changes: 14 additions & 2 deletions vllm/model_executor/models/__init__.py
@@ -1,6 +1,6 @@
import functools
import importlib
from typing import Dict, List, Optional, Type
from typing import Dict, List, Optional, Tuple, Type

import torch.nn as nn

@@ -126,7 +126,7 @@ def _get_model(model_arch: str):
return getattr(module, model_cls_name, None)

@staticmethod
def load_model_cls(model_arch: str) -> Optional[Type[nn.Module]]:
def _try_load_model_cls(model_arch: str) -> Optional[Type[nn.Module]]:
if model_arch in _OOT_MODELS:
return _OOT_MODELS[model_arch]
if model_arch not in _MODELS:
@@ -143,6 +143,18 @@ def load_model_cls(model_arch: str) -> Optional[Type[nn.Module]]:

return ModelRegistry._get_model(model_arch)

@staticmethod
def resolve_model_cls(
architectures: List[str]) -> Tuple[Type[nn.Module], str]:
for arch in architectures:
model_cls = ModelRegistry._try_load_model_cls(arch)
if model_cls is not None:
return (model_cls, arch)

raise ValueError(
f"Model architectures {architectures} are not supported for now. "
f"Supported architectures: {ModelRegistry.get_supported_archs()}")

@staticmethod
def get_supported_archs() -> List[str]:
return list(_MODELS.keys())
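
Callers get the resolved class together with the architecture name that matched, and the "unsupported architecture" error is now raised in one place instead of being duplicated in the loader. Illustrative usage:

from vllm.model_executor.models import ModelRegistry

model_cls, arch = ModelRegistry.resolve_model_cls(
    ["LlavaForConditionalGeneration"])
assert arch == "LlavaForConditionalGeneration"
# An unknown architecture list raises ValueError listing the supported archs.
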
24 changes: 22 additions & 2 deletions vllm/model_executor/models/clip.py
@@ -1,6 +1,6 @@
"""Minimal implementation of CLIPVisionModel intended to be only used
within a vision language model."""
from typing import Optional
from typing import Iterable, Optional, Tuple

import torch
import torch.nn as nn
@@ -14,6 +14,7 @@
from vllm.model_executor.layers.linear import (ColumnParallelLinear,
RowParallelLinear)
from vllm.model_executor.layers.quantization import QuantizationConfig
from vllm.model_executor.model_loader.weight_utils import default_weight_loader
from vllm.multimodal.image import (cached_get_tokenizer,
repeat_and_pad_image_tokens)
from vllm.sequence import SequenceData
@@ -32,7 +33,7 @@ def get_clip_num_patches(*, image_size: int, patch_size: int) -> int:

def get_clip_image_feature_size(hf_config: CLIPVisionConfig) -> int:
return get_clip_num_patches(image_size=hf_config.image_size,
patch_size=hf_config.patch_size)
patch_size=hf_config.patch_size) + 1


def get_max_clip_image_tokens(hf_config: CLIPVisionConfig) -> int:
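
The extra + 1 accounts for the class (CLS) embedding that CLIPVisionModel prepends to the patch embeddings, so the feature size is the patch count plus one. A quick sanity check of the arithmetic for the CLIP ViT-L/14 tower at 336x336 used by the LLaVA-1.5 models above (not part of this diff):

image_size, patch_size = 336, 14
num_patches = (image_size // patch_size) ** 2  # 24 * 24 = 576 patches
feature_size = num_patches + 1                 # + CLS token = 577
assert feature_size == 577
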
@@ -291,3 +292,22 @@ def forward(self, pixel_values: Optional[torch.Tensor] = None):
@property
def device(self):
return next(self.parameters()).device

def load_weights(self, weights: Iterable[Tuple[str, torch.Tensor]]):
params_dict = dict(self.named_parameters())
layer_count = len(self.vision_model.encoder.layers)

for name, loaded_weight in weights:
# post_layernorm is not needed in CLIPVisionModel
if "vision_model.post_layernorm" in name:
continue
# omit layers when num_hidden_layers_override is set
if "vision_model.encoder.layers." in name:
layer_idx = int(name.split(".")[3])
if layer_idx >= layer_count:
continue

param = params_dict[name]
weight_loader = getattr(param, "weight_loader",
default_weight_loader)
weight_loader(param, loaded_weight)
24 changes: 7 additions & 17 deletions vllm/model_executor/models/internvl.py
@@ -18,7 +18,6 @@
from vllm.inputs import INPUT_REGISTRY, InputContext, LLMInputs
from vllm.model_executor.layers.quantization import QuantizationConfig
from vllm.model_executor.model_loader.weight_utils import default_weight_loader
from vllm.model_executor.models import ModelRegistry
from vllm.model_executor.models.intern_vit import InternVisionModel
from vllm.model_executor.sampling_metadata import SamplingMetadata
from vllm.multimodal import MULTIMODAL_REGISTRY
@@ -29,7 +28,8 @@
from .clip import (dummy_image_for_clip, dummy_seq_data_for_clip,
get_clip_num_patches)
from .interfaces import SupportsVision
from .utils import merge_vision_embeddings
from .utils import (filter_weights, init_vllm_registered_model,
merge_vision_embeddings)

IMG_START = '<img>'
IMG_END = '</img>'
@@ -283,10 +283,8 @@ def __init__(self,
self.vision_model = InternVisionModel(
config.vision_config, num_hidden_layers_override=num_hidden_layers)

llm_class = ModelRegistry.load_model_cls(
config.text_config.architectures[0])
self.language_model = llm_class(config.text_config, cache_config,
quant_config)
self.language_model = init_vllm_registered_model(
config.text_config, cache_config, quant_config)

vit_hidden_size = config.vision_config.hidden_size
llm_hidden_size = config.text_config.hidden_size
@@ -415,24 +413,16 @@ def sample(
) -> Optional[SamplerOutput]:
return self.language_model.sample(logits, sampling_metadata)

def _filter_weights(self, weights: Iterable[Tuple[str, torch.Tensor]],
prefix: str):
for name, loaded_weight in weights:
name = name.split(".")
if prefix == name.pop(0):
name = ".".join(name)
yield name, loaded_weight

def load_weights(self, weights: Iterable[Tuple[str, torch.Tensor]]):
# prepare weight iterators for components
vit_weights, mlp_weights, llm_weights = itertools.tee(weights, 3)

# load vision encoder
vit_weights = self._filter_weights(vit_weights, "vision_model")
vit_weights = filter_weights(vit_weights, "vision_model")
self.vision_model.load_weights(vit_weights)

# load mlp projector
mlp_weights = self._filter_weights(mlp_weights, "mlp1")
mlp_weights = filter_weights(mlp_weights, "mlp1")
mlp_params_dict = dict(self.mlp1.named_parameters())
for name, loaded_weight in mlp_weights:
param = mlp_params_dict[name]
@@ -441,5 +431,5 @@ def load_weights(self, weights: Iterable[Tuple[str, torch.Tensor]]):
weight_loader(param, loaded_weight)

# load llm backbone
llm_weights = self._filter_weights(llm_weights, "language_model")
llm_weights = filter_weights(llm_weights, "language_model")
self.language_model.load_weights(llm_weights)
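
filter_weights and init_vllm_registered_model come from vllm/model_executor/models/utils.py, which is not shown in this diff; presumably they generalize the code removed above so that other composite models (LLaVA with a SigLIP encoder, for instance) can share it. A rough sketch under that assumption:

from typing import Iterable, Tuple

import torch

from vllm.model_executor.model_loader.loader import build_model
from vllm.model_executor.models import ModelRegistry


def filter_weights(weights: Iterable[Tuple[str, torch.Tensor]], prefix: str):
    # Yield only the weights belonging to `prefix`, with the prefix stripped,
    # mirroring the _filter_weights method removed above.
    for name, loaded_weight in weights:
        parts = name.split(".")
        if prefix == parts.pop(0):
            yield ".".join(parts), loaded_weight


def init_vllm_registered_model(hf_config, cache_config, quant_config):
    # Resolve the architecture named in the sub-config and build it the same
    # way the top-level loader does (see build_model in loader.py above).
    model_cls, _ = ModelRegistry.resolve_model_cls(hf_config.architectures)
    return build_model(model_cls, hf_config, cache_config, quant_config,
                       lora_config=None, multimodal_config=None,
                       scheduler_config=None)
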