# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project

import math
from collections.abc import Mapping

import torch
import torch.nn as nn
from transformers.activations import GELUActivation

from vllm.config import VllmConfig
from vllm.config.multimodal import BaseDummyOptions
from vllm.multimodal import MULTIMODAL_REGISTRY
from vllm.multimodal.inputs import MultiModalDataDict

from .llava_next import (
    LlavaDummyInputsBuilder,
    LlavaNextMultiModalProcessor,
    LlavaNextProcessingInfo,
)
from .llava_onevision import LlavaOnevisionForConditionalGeneration
from .utils import WeightsMapper


class BeeProcessingInfo(LlavaNextProcessingInfo):
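    # Use the checkpoint's own HF config and processor from the context
    # rather than the LlavaNext-specific classes requested by the parent.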
    def get_hf_config(self):
        return self.ctx.get_hf_config()

    def get_hf_processor(self, **kwargs: object):
        return self.ctx.get_hf_processor(**kwargs)

    def _get_num_unpadded_features(
        self,
        *,
        original_height: int,
        original_width: int,
        npatches: int,
        num_patch_height: int,
        num_patch_width: int,
    ) -> tuple[int, int]:
        """Override to use correct max_num_patches from vision_aspect_ratio."""
        current_height = npatches * num_patch_height
        current_width = npatches * num_patch_width

        aspect_ratio = original_width / original_height
        current_aspect_ratio = current_width / current_height

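        # Undo the letterbox padding added along the shorter axis during
        # anyres resizing, keeping only the feature rows/columns that
        # actually cover the image.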
        if aspect_ratio > current_aspect_ratio:
            new_height = int(
                round(original_height * (current_width / original_width), 7)
            )
            padding = (current_height - new_height) // 2
            current_height = current_height - (2 * padding)
        else:
            new_width = int(
                round(original_width * (current_height / original_height), 7)
            )
            padding = (current_width - new_width) // 2
            current_width = current_width - (2 * padding)

        unpadded_features = current_height * current_width
        newline_features = current_height

        # Get max_num_patches from vision_aspect_ratio config
        hf_config = self.get_hf_config()
        vision_aspect_ratio = getattr(hf_config, "vision_aspect_ratio", "anyres_max_9")
        max_num_patches = int(vision_aspect_ratio.replace("anyres_max_", ""))

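        # If the unpadded grid exceeds the budget implied by max_num_patches,
        # shrink both dimensions proportionally.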
        ratio = math.sqrt(
            current_height * current_width / (max_num_patches * npatches**2)
        )
        if ratio > 1.1:
            height_factor = int(current_height // ratio)
            width_factor = int(current_width // ratio)
            unpadded_features = height_factor * width_factor
            newline_features = height_factor

        return (unpadded_features, newline_features)


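# Builds the placeholder text and images that vLLM feeds through the model
# when profiling multimodal memory usage.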
class BeeDummyInputsBuilder(LlavaDummyInputsBuilder[BeeProcessingInfo]):
    def get_dummy_text(self, mm_counts: Mapping[str, int]) -> str:
        num_images = mm_counts.get("image", 0)
        image_token = "<image>"

        return image_token * num_images

    def get_dummy_mm_data(
        self,
        seq_len: int,
        mm_counts: Mapping[str, int],
        mm_options: Mapping[str, BaseDummyOptions] | None = None,
    ) -> MultiModalDataDict:
        num_images = mm_counts.get("image", 0)

        target_width, target_height = self.info.get_image_size_with_most_features()

        image_overrides = mm_options.get("image") if mm_options else None

        return {
            "image": self._get_dummy_images(
                width=target_width,
                height=target_height,
                num_images=num_images,
                overrides=image_overrides,
            ),
        }


class BeeMultiModalProjector(nn.Module):
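    """Project vision features into the text embedding space.

    LayerNorm on the vision hidden states, followed by a two-layer MLP (GELU)
    whose hidden width is 4x the text hidden size.
    """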
    def __init__(self, config):
        super().__init__()
        self.pre_norm = nn.LayerNorm(config.vision_config.hidden_size, eps=1e-06)
        self.linear_1 = nn.Linear(
            config.vision_config.hidden_size,
            config.text_config.hidden_size * 4,
            bias=True,
        )
        self.act = GELUActivation()
        self.linear_2 = nn.Linear(
            config.text_config.hidden_size * 4,
            config.text_config.hidden_size,
            bias=True,
        )

    def forward(self, image_feature: torch.Tensor) -> torch.Tensor:
        image_feature = self.pre_norm(image_feature)
        hidden_states = self.linear_1(image_feature)
        hidden_states = self.act(hidden_states)
        hidden_states = self.linear_2(hidden_states)

        return hidden_states


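# Register Bee with the multimodal registry: it reuses the LlavaNext
# multimodal processor, paired with Bee-specific processing info and
# dummy-input builder.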
@MULTIMODAL_REGISTRY.register_processor(
    LlavaNextMultiModalProcessor,
    info=BeeProcessingInfo,
    dummy_inputs=BeeDummyInputsBuilder,
)
class BeeForConditionalGeneration(LlavaOnevisionForConditionalGeneration):
    hf_to_vllm_mapper = WeightsMapper(
        orig_to_new_prefix={
            # Mapping for the new names in checkpoints saved with
            # transformers v4.55+
            "model.language_model.": "language_model.model.",
            "model.vision_tower.": "vision_tower.",
            "model.multi_modal_projector.": "multi_modal_projector.",
            "model.image_newline": "image_newline",
            "lm_head.": "language_model.lm_head.",
        }
    )

    def __init__(self, *, vllm_config: VllmConfig, prefix: str = "") -> None:
        super().__init__(vllm_config=vllm_config, prefix=prefix)
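        # Swap the projector created by the LlavaOnevision parent for Bee's
        # variant (pre-LayerNorm plus a 4x-wide hidden layer).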
        config = vllm_config.model_config.hf_config
        self.multi_modal_projector = BeeMultiModalProjector(config)