@@ -19,7 +19,8 @@
 from vllm.model_executor.model_loader.weight_utils import default_weight_loader
 from vllm.model_executor.models.interfaces import SupportsQuant

-from .vision import VisionEncoderInfo, resolve_visual_encoder_outputs
+from .vision import (VisionEncoderInfo, VisionFeatureSelectStrategy,
+                     resolve_visual_encoder_outputs)


 class CLIPEncoderInfo(VisionEncoderInfo[CLIPVisionConfig]):
@@ -308,24 +309,29 @@ def __init__(
     def forward(
         self,
         pixel_values: torch.Tensor,
-        feature_sample_layers: Optional[list[int]] = None,
+        *,
+        select_layers: Optional[list[int]] = None,
+        feature_select_strategy: Optional[VisionFeatureSelectStrategy] = None,
     ) -> torch.Tensor:

         hidden_states = self.embeddings(pixel_values)
         hidden_states = self.pre_layrnorm(hidden_states)

-        return_all_hidden_states = feature_sample_layers is not None
-
         # Produces either the last layer output or all of the hidden states,
-        # depending on if we have feature_sample_layers or not
+        # depending on if we have select_layers or not
         encoder_outputs = self.encoder(
             inputs_embeds=hidden_states,
-            return_all_hidden_states=return_all_hidden_states)
+            return_all_hidden_states=select_layers is not None,
+        )

         # Handle post-norm (if applicable) and stacks feature layers if needed
         encoder_outputs = resolve_visual_encoder_outputs(
-            encoder_outputs, feature_sample_layers, self.post_layernorm,
-            self.config.num_hidden_layers)
+            encoder_outputs,
+            self.post_layernorm,
+            select_layers=select_layers,
+            max_possible_layers=self.config.num_hidden_layers,
+            feature_select_strategy=feature_select_strategy,
+        )

         return encoder_outputs

@@ -355,9 +361,14 @@ def __init__(
     def forward(
         self,
         pixel_values: torch.Tensor,
-        feature_sample_layers: Optional[list[int]] = None,
+        select_layers: Optional[list[int]] = None,
+        feature_select_strategy: Optional[VisionFeatureSelectStrategy] = None,
    ) -> torch.Tensor:
-        return self.vision_model(pixel_values, feature_sample_layers)
+        return self.vision_model(
+            pixel_values,
+            select_layers=select_layers,
+            feature_select_strategy=feature_select_strategy,
+        )

     @property
     def device(self):
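
For reference, a minimal call-site sketch of the new keyword-only interface (not part of this diff). The helper name `encode_image` and its arguments are illustrative only; it assumes a CLIPVisionModel instance constructed elsewhere, and it passes None for the strategy, which is the same as omitting the argument.

from typing import Optional

import torch


def encode_image(
    vision_model,                      # an already-constructed vllm CLIPVisionModel (assumed)
    pixel_values: torch.Tensor,
    select_layers: Optional[list[int]] = None,
) -> torch.Tensor:
    # Old call site (before this change):
    #   return vision_model(pixel_values, feature_sample_layers=select_layers)
    #
    # New call site: the layers to sample and the optional feature-select
    # strategy are passed by keyword and forwarded down to
    # resolve_visual_encoder_outputs().
    return vision_model(
        pixel_values,
        select_layers=select_layers,       # hidden states to keep; None -> final layer only
        feature_select_strategy=None,      # optionally a VisionFeatureSelectStrategy value
    )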