40 | 40 | ConfigFormat, get_config, get_hf_image_processor_config, |
41 | 41 | get_hf_text_config, get_pooling_config, |
42 | 42 | get_sentence_transformer_tokenizer_config, is_encoder_decoder, |
43 | | - maybe_override_with_speculators_target_model, try_get_generation_config, |
44 | | - try_get_safetensors_metadata, try_get_tokenizer_config, uses_mrope) |
| 43 | + is_interleaved, maybe_override_with_speculators_target_model, |
| 44 | + try_get_generation_config, try_get_safetensors_metadata, |
| 45 | + try_get_tokenizer_config, uses_mrope) |
45 | 46 | from vllm.transformers_utils.s3_utils import S3Model |
46 | 47 | from vllm.transformers_utils.utils import is_s3, maybe_model_redirect |
47 | 48 | # yapf conflicts with isort for this block |
@@ -714,53 +715,31 @@ def _task_to_convert(task: TaskOption) -> ConvertType: |
714 | 715 | revision=self.revision, |
715 | 716 | ) |
716 | 717 |
717 | | - # Workaround for Gemma 2 which uses interleaved sliding window |
718 | | - # attention, but it's not specified in its config. |
719 | | - # TODO: remove this when Gemma 2 config updated in HuggingFace. |
720 | | - if self.hf_text_config.model_type == "gemma2": |
721 | | - self.hf_text_config.sliding_window_pattern = 2 |
722 | | - |
723 | | - # TODO: remove this when Gemma 3n config updated in HuggingFace. |
724 | | - if self.hf_text_config.model_type == "gemma3n_text": |
725 | | - # 4 sliding window attention followed by 1 full attention |
726 | | - self.hf_text_config.sliding_window_pattern = "LLLLG" |
727 | | - |
728 | | - sliding_window = getattr(self.hf_text_config, "sliding_window", None) |
729 | | - sliding_window_pattern = getattr(self.hf_text_config, |
730 | | - "sliding_window_pattern", None) |
731 | | - has_interleaved_attention = sliding_window_pattern is not None or ( |
732 | | - isinstance(sliding_window, list)) |
733 | | - |
734 | | - if not self.disable_sliding_window and has_interleaved_attention: |
735 | | - if not envs.VLLM_USE_V1 and (backend := envs.VLLM_ATTENTION_BACKEND |
736 | | - ) in ("XFORMERS", "FLASHINFER"): |
737 | | - sliding_window_len_min = get_min_sliding_window( |
738 | | - self.hf_text_config.sliding_window) |
739 | | - |
740 | | - logger.warning_once( |
741 | | - "%s has interleaved attention, which is currently not supported by the %s backend. Disabling sliding window and capping the max length to the sliding window size (%d).", # noqa: E501 |
742 | | - self.hf_text_config.model_type, |
743 | | - backend, |
744 | | - sliding_window_len_min, |
745 | | - ) |
746 | | - self.disable_sliding_window = True |
747 | | - else: |
748 | | - # for a model with interleaved attention, |
749 | | - # the scheduler and the model treat it as full attention |
750 | | - # (i.e., not dropping any tokens outside the window). |
751 | | - # only the attention layer itself is aware of the sliding |
752 | | - # window, and use the window size to compute the attention. |
753 | | - self.hf_text_config.interleaved_sliding_window = sliding_window |
754 | | - |
755 | | - if hasattr(self.hf_text_config, "sliding_window"): |
756 | | - delattr(self.hf_text_config, "sliding_window") |
757 | | - |
758 | | - sliding_window = None |
| 718 | + # Interleaved attention is not supported by some backends in V0 |
| 719 | + if (not self.disable_sliding_window |
| 720 | + and is_interleaved(self.hf_text_config) |
| 721 | + and not envs.VLLM_USE_V1 |
| 722 | + and (backend := envs.VLLM_ATTENTION_BACKEND) |
| 723 | + in ("XFORMERS", "FLASHINFER")): |
| 724 | + logger.warning_once( |
| 725 | + "%s has interleaved attention, which is currently not " |
| 726 | + "supported by the %s backend. Disabling sliding window and " |
| 727 | + "capping the max length to the sliding window size (%d).", |
| 728 | + self.hf_text_config.model_type, |
| 729 | + backend, |
| 730 | + self.hf_text_config.sliding_window, |
| 731 | + ) |
| 732 | + self.disable_sliding_window = True |
759 | 733 |
760 | 734 | self.original_max_model_len = self.max_model_len |
761 | 735 | self.max_model_len = self.get_and_verify_max_len(self.max_model_len) |
762 | 736 | self.multimodal_config = self._init_multimodal_config() |
763 | 737 |
| 738 | + if self.disable_sliding_window: |
| 739 | + # Set after get_and_verify_max_len to ensure that max_model_len |
| 740 | + # can be correctly capped to sliding window size |
| 741 | + self.hf_text_config.sliding_window = None |
| 742 | + |
764 | 743 | if not self.skip_tokenizer_init: |
765 | 744 | self._verify_tokenizer_mode() |
766 | 745 |
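Note (annotation, not part of the diff): the new is_interleaved() helper imported from vllm.transformers_utils.config replaces the ad-hoc detection deleted in this hunk. Below is a minimal sketch of what such a check can look like, reconstructed from the removed inline logic (a sliding_window_pattern attribute, or a per-layer list of window sizes); the actual helper's implementation may differ.

# Hedged sketch only: mirrors the inline check removed above, not
# necessarily the real is_interleaved() in vllm.transformers_utils.config.
def is_interleaved_sketch(hf_text_config) -> bool:
    sliding_window = getattr(hf_text_config, "sliding_window", None)
    pattern = getattr(hf_text_config, "sliding_window_pattern", None)
    # Interleaved (sliding + full) attention if a layer pattern is declared
    # or the window size is specified per layer as a list.
    return pattern is not None or isinstance(sliding_window, list)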
@@ -1322,27 +1301,10 @@ def verify_with_parallel_config( |
1322 | 1301 | if self.use_async_output_proc: |
1323 | 1302 | self.use_async_output_proc = False |
1324 | 1303 |
1325 | | - def get_hf_config_sliding_window( |
1326 | | - self) -> Union[Optional[int], list[Optional[int]]]: |
1327 | | - """Get the sliding window size, or None if disabled.""" |
1328 | | - |
1329 | | - # Some models, like Qwen2 and Qwen1.5, use `use_sliding_window` in |
1330 | | - # addition to sliding window size. We check if that field is present |
1331 | | - # and if it's False, return None. |
1332 | | - if (hasattr(self.hf_text_config, "use_sliding_window") |
1333 | | - and not self.hf_text_config.use_sliding_window): |
1334 | | - return None |
| 1304 | + def get_sliding_window(self) -> Optional[int]: |
| 1305 | + """Get the sliding window size from the HF text config if present.""" |
1335 | 1306 | return getattr(self.hf_text_config, "sliding_window", None) |
1336 | 1307 |
1337 | | - def get_sliding_window(self) -> Optional[Union[int, list[Optional[int]]]]: |
1338 | | - """Get the sliding window size, or None if disabled. |
1339 | | - """ |
1340 | | - # If user disables sliding window, return None. |
1341 | | - if self.disable_sliding_window: |
1342 | | - return None |
1343 | | - # Otherwise get the value from the hf config. |
1344 | | - return self.get_hf_config_sliding_window() |
1345 | | - |
1346 | 1308 | def get_vocab_size(self) -> int: |
1347 | 1309 | return getattr(self.hf_text_config, "vocab_size", 0) |
1348 | 1310 |
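Note (annotation, not part of the diff): the accessor now simply reflects the HF text config; the disabled case is handled once during config initialization by clearing hf_text_config.sliding_window (see the first hunk) rather than inside the getter. A self-contained sketch, using a stand-in namespace for the HF text config:

from types import SimpleNamespace
from typing import Optional

# Stand-in for hf_text_config; the attribute name matches the HF convention.
hf_text_config = SimpleNamespace(sliding_window=4096)

def get_sliding_window(config) -> Optional[int]:
    # Mirrors the simplified accessor: a plain int (or None), with no
    # disable_sliding_window branch and no per-layer list handling.
    return getattr(config, "sliding_window", None)

assert get_sliding_window(hf_text_config) == 4096
hf_text_config.sliding_window = None  # what init does when sliding window is disabled
assert get_sliding_window(hf_text_config) is None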
@@ -1775,7 +1737,7 @@ def get_and_verify_max_len(self, max_model_len: int): |
1775 | 1737 | tokenizer_config=tokenizer_config, |
1776 | 1738 | max_model_len=max_model_len, |
1777 | 1739 | disable_sliding_window=self.disable_sliding_window, |
1778 | | - sliding_window_len=self.get_hf_config_sliding_window(), |
| 1740 | + sliding_window=self.get_sliding_window(), |
1779 | 1741 | spec_target_max_model_len=self.spec_target_max_model_len, |
1780 | 1742 | encoder_config=self.encoder_config) |
1781 | 1743 | logger.info("Using max model len %s", max_model_len) |
@@ -3318,7 +3280,7 @@ def _get_and_verify_max_len( |
3318 | 3280 | tokenizer_config: Optional[dict], |
3319 | 3281 | max_model_len: Optional[int], |
3320 | 3282 | disable_sliding_window: bool, |
3321 | | - sliding_window_len: Optional[Union[int, list[Optional[int]]]], |
| 3283 | + sliding_window: Optional[int], |
3322 | 3284 | spec_target_max_model_len: Optional[int] = None, |
3323 | 3285 | encoder_config: Optional[Any] = None, |
3324 | 3286 | ) -> int: |
@@ -3357,13 +3319,10 @@ def _get_and_verify_max_len( |
3357 | 3319 |
3358 | 3320 | # If sliding window is manually disabled, max_length should be less |
3359 | 3321 | # than the sliding window length in the model config. |
3360 | | - if disable_sliding_window and sliding_window_len is not None: |
3361 | | - |
3362 | | - sliding_window_len_min = get_min_sliding_window(sliding_window_len) |
3363 | | - max_len_key = "sliding_window" \ |
3364 | | - if sliding_window_len_min < derived_max_model_len else max_len_key |
3365 | | - derived_max_model_len = min(derived_max_model_len, |
3366 | | - sliding_window_len_min) |
| 3322 | + if (disable_sliding_window and sliding_window is not None |
| 3323 | + and sliding_window < derived_max_model_len): |
| 3324 | + max_len_key = "sliding_window" |
| 3325 | + derived_max_model_len = sliding_window |
3367 | 3326 |
3368 | 3327 | # Consider model_max_length in tokenizer_config |
3369 | 3328 | if tokenizer_config: |
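Note (annotation, not part of the diff): with get_min_sliding_window() and the per-layer list handling gone, the cap reduces to a single comparison against an int. An illustration with made-up numbers of how the simplified branch behaves:

# Hypothetical values, for illustration only.
derived_max_model_len = 8192            # e.g. derived from max_position_embeddings
max_len_key = "max_position_embeddings"
disable_sliding_window = True
sliding_window = 4096

if (disable_sliding_window and sliding_window is not None
        and sliding_window < derived_max_model_len):
    max_len_key = "sliding_window"
    derived_max_model_len = sliding_window

assert (max_len_key, derived_max_model_len) == ("sliding_window", 4096)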
@@ -3464,14 +3423,6 @@ def _get_and_verify_max_len( |
3464 | 3423 | return int(max_model_len) |
3465 | 3424 |
3466 | 3425 |
3467 | | -def get_min_sliding_window( |
3468 | | - sliding_window: Union[int, list[Optional[int]]]) -> int: |
3469 | | - if isinstance(sliding_window, list): |
3470 | | - return min(s for s in sliding_window if s is not None) |
3471 | | - |
3472 | | - return sliding_window |
3473 | | - |
3474 | | - |
3475 | 3426 | def get_served_model_name(model: str, |
3476 | 3427 | served_model_name: Optional[Union[str, list[str]]]): |
3477 | 3428 | """ |