|
40 | 40 | ConfigFormat, get_config, get_hf_image_processor_config, |
41 | 41 | get_hf_text_config, get_pooling_config, |
42 | 42 | get_sentence_transformer_tokenizer_config, is_encoder_decoder, |
43 | | - maybe_override_with_speculators_target_model, try_get_generation_config, |
44 | | - try_get_safetensors_metadata, try_get_tokenizer_config, uses_mrope) |
| 43 | + is_interleaved, maybe_override_with_speculators_target_model, |
| 44 | + try_get_generation_config, try_get_safetensors_metadata, |
| 45 | + try_get_tokenizer_config, uses_mrope) |
45 | 46 | from vllm.transformers_utils.s3_utils import S3Model |
46 | 47 | from vllm.transformers_utils.utils import is_s3, maybe_model_redirect |
47 | 48 | # yapf conflicts with isort for this block |
@@ -714,53 +715,31 @@ def _task_to_convert(task: TaskOption) -> ConvertType: |
714 | 715 | revision=self.revision, |
715 | 716 | ) |
716 | 717 |
|
717 | | - # Workaround for Gemma 2 which uses interleaved sliding window |
718 | | - # attention, but it's not specified in its config. |
719 | | - # TODO: remove this when Gemma 2 config updated in HuggingFace. |
720 | | - if self.hf_text_config.model_type == "gemma2": |
721 | | - self.hf_text_config.sliding_window_pattern = 2 |
722 | | - |
723 | | - # TODO: remove this when Gemma 3n config updated in HuggingFace. |
724 | | - if self.hf_text_config.model_type == "gemma3n_text": |
725 | | - # 4 sliding window attention followed by 1 full attention |
726 | | - self.hf_text_config.sliding_window_pattern = "LLLLG" |
727 | | - |
728 | | - sliding_window = getattr(self.hf_text_config, "sliding_window", None) |
729 | | - sliding_window_pattern = getattr(self.hf_text_config, |
730 | | - "sliding_window_pattern", None) |
731 | | - has_interleaved_attention = sliding_window_pattern is not None or ( |
732 | | - isinstance(sliding_window, list)) |
733 | | - |
734 | | - if not self.disable_sliding_window and has_interleaved_attention: |
735 | | - if not envs.VLLM_USE_V1 and (backend := envs.VLLM_ATTENTION_BACKEND |
736 | | - ) in ("XFORMERS", "FLASHINFER"): |
737 | | - sliding_window_len_min = get_min_sliding_window( |
738 | | - self.hf_text_config.sliding_window) |
739 | | - |
740 | | - logger.warning_once( |
741 | | - "%s has interleaved attention, which is currently not supported by the %s backend. Disabling sliding window and capping the max length to the sliding window size (%d).", # noqa: E501 |
742 | | - self.hf_text_config.model_type, |
743 | | - backend, |
744 | | - sliding_window_len_min, |
745 | | - ) |
746 | | - self.disable_sliding_window = True |
747 | | - else: |
748 | | - # for a model with interleaved attention, |
749 | | - # the scheduler and the model treat it as full attention |
750 | | - # (i.e., not dropping any tokens outside the window). |
751 | | - # only the attention layer itself is aware of the sliding |
752 | | - # window, and use the window size to compute the attention. |
753 | | - self.hf_text_config.interleaved_sliding_window = sliding_window |
754 | | - |
755 | | - if hasattr(self.hf_text_config, "sliding_window"): |
756 | | - delattr(self.hf_text_config, "sliding_window") |
757 | | - |
758 | | - sliding_window = None |
| 718 | + # Interleaved attention is not supported by some backends in V0 |
| 719 | + if (not self.disable_sliding_window |
| 720 | + and is_interleaved(self.hf_text_config) |
| 721 | + and not envs.VLLM_USE_V1 |
| 722 | + and (backend := envs.VLLM_ATTENTION_BACKEND) |
| 723 | + in ("XFORMERS", "FLASHINFER")): |
| 724 | + logger.warning_once( |
| 725 | + "%s has interleaved attention, which is currently not " |
| 726 | + "supported by the %s backend. Disabling sliding window and " |
| 727 | + "capping the max length to the sliding window size (%d).", |
| 728 | + self.hf_text_config.model_type, |
| 729 | + backend, |
| 730 | + self.hf_text_config.sliding_window, |
| 731 | + ) |
| 732 | + self.disable_sliding_window = True |
759 | 733 |
|
760 | 734 | self.original_max_model_len = self.max_model_len |
761 | 735 | self.max_model_len = self.get_and_verify_max_len(self.max_model_len) |
762 | 736 | self.multimodal_config = self._init_multimodal_config() |
763 | 737 |
|
| 738 | + if self.disable_sliding_window: |
| 739 | + # Set after get_and_verify_max_len to ensure that max_model_len |
| 740 | + # can be correctly capped to sliding window size |
| 741 | + self.hf_text_config.sliding_window = None |
| 742 | + |
764 | 743 | if not self.skip_tokenizer_init: |
765 | 744 | self._verify_tokenizer_mode() |
766 | 745 |
|
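Note on the `is_interleaved` helper imported at the top of this diff: it replaces the ad-hoc `sliding_window_pattern` / list-valued `sliding_window` checks deleted above. The real implementation lives in `vllm.transformers_utils.config` and is not shown here; the following is only a rough sketch of the kind of check it might perform, assuming an HF text config that exposes `layer_types`, `sliding_window_pattern`, or a per-layer `sliding_window` list.

```python
# Hypothetical sketch only; the actual is_interleaved() in
# vllm.transformers_utils.config may inspect different config fields.
from transformers import PretrainedConfig


def is_interleaved_sketch(config: PretrainedConfig) -> bool:
    # Newer HF configs mark per-layer attention types explicitly.
    layer_types = getattr(config, "layer_types", None)
    if layer_types is not None:
        return ("sliding_attention" in layer_types
                and "full_attention" in layer_types)
    # Older configs describe interleaving via a repeating pattern or a
    # per-layer list of window sizes.
    if getattr(config, "sliding_window_pattern", None) is not None:
        return True
    return isinstance(getattr(config, "sliding_window", None), list)
```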
@@ -1322,27 +1301,10 @@ def verify_with_parallel_config( |
1322 | 1301 | if self.use_async_output_proc: |
1323 | 1302 | self.use_async_output_proc = False |
1324 | 1303 |
|
1325 | | - def get_hf_config_sliding_window( |
1326 | | - self) -> Union[Optional[int], list[Optional[int]]]: |
1327 | | - """Get the sliding window size, or None if disabled.""" |
1328 | | - |
1329 | | - # Some models, like Qwen2 and Qwen1.5, use `use_sliding_window` in |
1330 | | - # addition to sliding window size. We check if that field is present |
1331 | | - # and if it's False, return None. |
1332 | | - if (hasattr(self.hf_text_config, "use_sliding_window") |
1333 | | - and not self.hf_text_config.use_sliding_window): |
1334 | | - return None |
| 1304 | + def get_sliding_window(self) -> Optional[int]: |
| 1305 | + """Get the sliding window size from the HF text config if present.""" |
1335 | 1306 | return getattr(self.hf_text_config, "sliding_window", None) |
1336 | 1307 |
|
1337 | | - def get_sliding_window(self) -> Optional[Union[int, list[Optional[int]]]]: |
1338 | | - """Get the sliding window size, or None if disabled. |
1339 | | - """ |
1340 | | - # If user disables sliding window, return None. |
1341 | | - if self.disable_sliding_window: |
1342 | | - return None |
1343 | | - # Otherwise get the value from the hf config. |
1344 | | - return self.get_hf_config_sliding_window() |
1345 | | - |
1346 | 1308 | def get_vocab_size(self) -> int: |
1347 | 1309 | return getattr(self.hf_text_config, "vocab_size", 0) |
1348 | 1310 |
|
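The one-line `get_sliding_window` works because the constructor now clears `hf_text_config.sliding_window` whenever `disable_sliding_window` is set (see the block added after `get_and_verify_max_len` above), so a plain attribute read is enough. A small illustration with a stand-in config object (hypothetical, for clarity only):

```python
from types import SimpleNamespace

# Stand-ins for ModelConfig.hf_text_config after __post_init__ has run.
cfg_enabled = SimpleNamespace(sliding_window=4096)
cfg_disabled = SimpleNamespace(sliding_window=None)  # cleared when disabled


def get_sliding_window(hf_text_config):
    """Mirror of the new one-liner: read the attribute if present."""
    return getattr(hf_text_config, "sliding_window", None)


assert get_sliding_window(cfg_enabled) == 4096
assert get_sliding_window(cfg_disabled) is None
```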
@@ -1762,7 +1724,7 @@ def get_and_verify_max_len(self, max_model_len: int): |
1762 | 1724 | tokenizer_config=tokenizer_config, |
1763 | 1725 | max_model_len=max_model_len, |
1764 | 1726 | disable_sliding_window=self.disable_sliding_window, |
1765 | | - sliding_window_len=self.get_hf_config_sliding_window(), |
| 1727 | + sliding_window=self.get_sliding_window(), |
1766 | 1728 | spec_target_max_model_len=self.spec_target_max_model_len, |
1767 | 1729 | encoder_config=self.encoder_config) |
1768 | 1730 | logger.info("Using max model len %s", max_model_len) |
@@ -3305,7 +3267,7 @@ def _get_and_verify_max_len( |
3305 | 3267 | tokenizer_config: Optional[dict], |
3306 | 3268 | max_model_len: Optional[int], |
3307 | 3269 | disable_sliding_window: bool, |
3308 | | - sliding_window_len: Optional[Union[int, list[Optional[int]]]], |
| 3270 | + sliding_window: Optional[int], |
3309 | 3271 | spec_target_max_model_len: Optional[int] = None, |
3310 | 3272 | encoder_config: Optional[Any] = None, |
3311 | 3273 | ) -> int: |
@@ -3344,13 +3306,10 @@ def _get_and_verify_max_len( |
3344 | 3306 |
|
3345 | 3307 | # If sliding window is manually disabled, max_length should be less |
3346 | 3308 | # than the sliding window length in the model config. |
3347 | | - if disable_sliding_window and sliding_window_len is not None: |
3348 | | - |
3349 | | - sliding_window_len_min = get_min_sliding_window(sliding_window_len) |
3350 | | - max_len_key = "sliding_window" \ |
3351 | | - if sliding_window_len_min < derived_max_model_len else max_len_key |
3352 | | - derived_max_model_len = min(derived_max_model_len, |
3353 | | - sliding_window_len_min) |
| 3309 | + if (disable_sliding_window and sliding_window is not None |
| 3310 | + and sliding_window < derived_max_model_len): |
| 3311 | + max_len_key = "sliding_window" |
| 3312 | + derived_max_model_len = sliding_window |
3354 | 3313 |
|
3355 | 3314 | # Consider model_max_length in tokenizer_config |
3356 | 3315 | if tokenizer_config: |
@@ -3451,14 +3410,6 @@ def _get_and_verify_max_len( |
3451 | 3410 | return int(max_model_len) |
3452 | 3411 |
|
3453 | 3412 |
|
3454 | | -def get_min_sliding_window( |
3455 | | - sliding_window: Union[int, list[Optional[int]]]) -> int: |
3456 | | - if isinstance(sliding_window, list): |
3457 | | - return min(s for s in sliding_window if s is not None) |
3458 | | - |
3459 | | - return sliding_window |
3460 | | - |
3461 | | - |
3462 | 3413 | def get_served_model_name(model: str, |
3463 | 3414 | served_model_name: Optional[Union[str, list[str]]]): |
3464 | 3415 | """ |
|