@@ -19,7 +19,8 @@
 from vllm.model_executor.model_loader.weight_utils import default_weight_loader
 from vllm.model_executor.models.interfaces import SupportsQuant
 
-from .vision import VisionEncoderInfo, resolve_visual_encoder_outputs
+from .vision import (VisionEncoderInfo, VisionFeatureSelectStrategy,
+                     resolve_visual_encoder_outputs)
 
 
 class CLIPEncoderInfo(VisionEncoderInfo[CLIPVisionConfig]):
@@ -308,24 +309,29 @@ def __init__(
     def forward(
         self,
         pixel_values: torch.Tensor,
-        feature_sample_layers: Optional[list[int]] = None,
+        *,
+        select_layers: Optional[list[int]] = None,
+        feature_select_strategy: Optional[VisionFeatureSelectStrategy] = None,
     ) -> torch.Tensor:
 
         hidden_states = self.embeddings(pixel_values)
         hidden_states = self.pre_layrnorm(hidden_states)
 
-        return_all_hidden_states = feature_sample_layers is not None
-
         # Produces either the last layer output or all of the hidden states,
-        # depending on if we have feature_sample_layers or not
+        # depending on if we have select_layers or not
         encoder_outputs = self.encoder(
             inputs_embeds=hidden_states,
-            return_all_hidden_states=return_all_hidden_states)
+            return_all_hidden_states=select_layers is not None,
+        )
 
         # Handle post-norm (if applicable) and stacks feature layers if needed
         encoder_outputs = resolve_visual_encoder_outputs(
-            encoder_outputs, feature_sample_layers, self.post_layernorm,
-            self.config.num_hidden_layers)
+            encoder_outputs,
+            self.post_layernorm,
+            select_layers=select_layers,
+            max_possible_layers=self.config.num_hidden_layers,
+            feature_select_strategy=feature_select_strategy,
+        )
 
         return encoder_outputs
 
@@ -355,9 +361,14 @@ def __init__(
     def forward(
         self,
         pixel_values: torch.Tensor,
-        feature_sample_layers: Optional[list[int]] = None,
+        select_layers: Optional[list[int]] = None,
+        feature_select_strategy: Optional[VisionFeatureSelectStrategy] = None,
     ) -> torch.Tensor:
-        return self.vision_model(pixel_values, feature_sample_layers)
+        return self.vision_model(
+            pixel_values,
+            select_layers=select_layers,
+            feature_select_strategy=feature_select_strategy,
+        )
 
     @property
     def device(self):
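
Below is a minimal call-site sketch (not part of this commit) of how code that previously passed `feature_sample_layers` might adapt to the renamed arguments. The wrapper function, the example layer indices, and the idea of leaving the strategy unset are assumptions for illustration; only the `select_layers` and `feature_select_strategy` parameter names come from the diff above.

```python
# Hypothetical call-site sketch; `vision_tower` is assumed to be a vLLM
# CLIPVisionModel constructed elsewhere. Layer indices in the comments are
# illustrative, not taken from this commit.
from typing import Optional

import torch


def encode_image(
    vision_tower,                       # assumed: a vLLM CLIPVisionModel
    pixel_values: torch.Tensor,
    select_layers: Optional[list[int]] = None,
    feature_select_strategy=None,       # a VisionFeatureSelectStrategy, if any
) -> torch.Tensor:
    # The inner CLIPVisionTransformer.forward now takes these options as
    # keyword-only arguments, so the old positional `feature_sample_layers`
    # style no longer applies; passing them by keyword keeps call sites
    # consistent.
    return vision_tower(
        pixel_values,
        select_layers=select_layers,            # e.g. [-2] or [3, 7, 15, 26]
        feature_select_strategy=feature_select_strategy,
    )
```

Callers that previously stacked several hidden states via `feature_sample_layers` would pass the same indices through `select_layers`; leaving `feature_select_strategy` as `None` presumably keeps the prior behaviour, given the defaults shown in the diff.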