
Commit daf472f

Merge branch 'main' into pali-mm-update
2 parents 175eb2a + 73e0225 commit daf472f


31 files changed, +357 -76 lines


tests/models/encoder_decoder/vision_language/test_mllama.py

Lines changed: 3 additions & 2 deletions
@@ -479,8 +479,9 @@ def test_regression(vllm_runner, image_assets, model, dtype, max_tokens,

     # Regression tests for https://github.com/vllm-project/vllm/issues/10648

-    # Number of image tags is greater than the number of images provided
-    prompt = "<|begin_of_text|><|image|><|image|> Compare the two images"  # noqa: E501
+    # Number of groups of image tokens is greater than the number of images
+    # provided (the whitespace between the tags is necessary)
+    prompt = "<|begin_of_text|><|image|> <|image|> Compare the two images"  # noqa: E501
     image = stop_sign
     with pytest.raises(ValueError):
         vllm_model.generate_greedy_logprobs([prompt],
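For context, a hedged sketch of the behaviour this test asserts, written outside the real test harness: the `llm` wrapper and the keyword arguments below are assumptions, since the original call is truncated in this diff.

import pytest

def check_image_token_mismatch(llm, stop_sign_image):
    # Two groups of image tokens but only one image: vLLM is expected to
    # reject the request with a ValueError (hypothetical wrapper/arguments).
    prompt = "<|begin_of_text|><|image|> <|image|> Compare the two images"
    with pytest.raises(ValueError):
        llm.generate_greedy_logprobs([prompt],
                                     max_tokens=8,
                                     num_logprobs=5,
                                     images=[stop_sign_image])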

vllm/config.py

Lines changed: 10 additions & 6 deletions
@@ -1039,6 +1039,11 @@ def supported_runner_types(self) -> Set[RunnerType]:
     def runner_type(self) -> RunnerType:
         return _TASK_RUNNER[self.task]

+    @property
+    def is_v1_compatible(self) -> bool:
+        architectures = getattr(self.hf_config, "architectures", [])
+        return ModelRegistry.is_v1_compatible(architectures)
+

 class CacheConfig:
     """Configuration for the KV cache.
@@ -1978,13 +1983,12 @@ def maybe_create_spec_config(
             if num_speculative_tokens is None:
                 # Default to max value defined in draft model config.
                 num_speculative_tokens = n_predict
-            elif num_speculative_tokens > n_predict:
-                # Verify provided value doesn't exceed the maximum
-                # supported by the draft model.
+            elif num_speculative_tokens > n_predict and \
+                    num_speculative_tokens % n_predict != 0:
+                # Ensure divisibility for MTP module reuse.
                 raise ValueError(
-                    "This speculative model supports a maximum of "
-                    f"num_speculative_tokens={n_predict}, but "
-                    f"{num_speculative_tokens=} was provided.")
+                    f"{num_speculative_tokens=} must be divisible by "
+                    f"{n_predict=}")

         speculative_draft_tensor_parallel_size = \
             SpeculativeConfig._verify_and_get_draft_model_tensor_parallel_size(
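A minimal standalone sketch of the relaxed check (the helper name is illustrative, not part of the diff): values above n_predict are no longer rejected outright, only those that are not an exact multiple, which is what allows the draft model's MTP modules to be reused cyclically.

def validate_num_speculative_tokens(num_speculative_tokens: int,
                                    n_predict: int) -> None:
    # Mirrors the new condition in SpeculativeConfig.maybe_create_spec_config.
    if num_speculative_tokens > n_predict and \
            num_speculative_tokens % n_predict != 0:
        raise ValueError(f"{num_speculative_tokens=} must be divisible by "
                         f"{n_predict=}")

validate_num_speculative_tokens(2, 1)  # accepted: exact multiple of n_predict
validate_num_speculative_tokens(4, 2)  # accepted: the MTP modules are reused twice
try:
    validate_num_speculative_tokens(3, 2)
except ValueError as e:
    print(e)  # num_speculative_tokens=3 must be divisible by n_predict=2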

vllm/model_executor/layers/fused_moe/layer.py

Lines changed: 20 additions & 0 deletions
@@ -737,3 +737,23 @@ def _load_fp8_scale(self, param: torch.nn.Parameter,
         # If we are in the row parallel case (down_proj)
         else:
             param_data[expert_id] = loaded_weight
+
+    def extra_repr(self) -> str:
+
+        s = (
+            f"global_num_experts={self.global_num_experts}, "
+            f"local_num_experts={self.local_num_experts}, "
+            f"top_k={self.top_k}, "
+            f"intermediate_size_per_partition={self.intermediate_size_per_partition}, "  # noqa: E501
+            f"tp_size={self.tp_size},\n"
+            f"ep_size={self.ep_size}, "
+            f"reduce_results={self.reduce_results}, "
+            f"renormalize={self.renormalize}, "
+            f"use_grouped_topk={self.use_grouped_topk}")
+
+        if self.use_grouped_topk:
+            s += f", num_expert_group={self.num_expert_group}, topk_group={self.topk_group}"  # noqa: E501
+
+        s += f", scoring_func='{self.scoring_func}', activation='{self.activation}'"  # noqa: E501
+
+        return s
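extra_repr is the standard torch.nn.Module hook whose return value repr() places inside the module's parentheses, so print(model) will now show each FusedMoE layer's expert configuration. A minimal sketch of the mechanism with a toy module (not the FusedMoE class itself):

import torch.nn as nn

class ToyMoE(nn.Module):
    def __init__(self, global_num_experts: int, top_k: int):
        super().__init__()
        self.global_num_experts = global_num_experts
        self.top_k = top_k

    def extra_repr(self) -> str:
        # nn.Module.__repr__ inserts this string between the parentheses.
        return f"global_num_experts={self.global_num_experts}, top_k={self.top_k}"

print(ToyMoE(64, 6))  # -> ToyMoE(global_num_experts=64, top_k=6)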

vllm/model_executor/models/__init__.py

Lines changed: 5 additions & 2 deletions
@@ -1,8 +1,9 @@
 # SPDX-License-Identifier: Apache-2.0

 from .interfaces import (HasInnerState, SupportsLoRA, SupportsMultiModal,
-                         SupportsPP, has_inner_state, supports_lora,
-                         supports_multimodal, supports_pp)
+                         SupportsPP, SupportsV0Only, has_inner_state,
+                         supports_lora, supports_multimodal, supports_pp,
+                         supports_v0_only)
 from .interfaces_base import (VllmModelForPooling, VllmModelForTextGeneration,
                               is_pooling_model, is_text_generation_model)
 from .registry import ModelRegistry
@@ -21,4 +22,6 @@
     "supports_multimodal",
     "SupportsPP",
     "supports_pp",
+    "SupportsV0Only",
+    "supports_v0_only",
 ]
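A hedged sketch of how the newly exported marker might be used; it assumes supports_v0_only, like the other supports_* helpers, accepts either a model class or an instance, and the toy classes are illustrative only.

import torch.nn as nn

from vllm.model_executor.models import SupportsV0Only, supports_v0_only

class ToyEncoderDecoder(nn.Module, SupportsV0Only):
    """Stand-in for a model that has not been ported to the V1 engine."""

class ToyDecoderOnly(nn.Module):
    """Stand-in for a model with no V0-only restriction."""

print(supports_v0_only(ToyEncoderDecoder))  # expected: True
print(supports_v0_only(ToyDecoderOnly))     # expected: False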

vllm/model_executor/models/bamba.py

Lines changed: 3 additions & 2 deletions
@@ -32,7 +32,8 @@
 from vllm.sequence import IntermediateTensors
 from vllm.utils import LayerBlockType

-from .interfaces import HasInnerState, IsHybrid, SupportsLoRA, SupportsPP
+from .interfaces import (HasInnerState, IsHybrid, SupportsLoRA, SupportsPP,
+                         SupportsV0Only)
 from .utils import (is_pp_missing_parameter,
                     make_empty_intermediate_tensors_factory, make_layers,
                     maybe_prefix)
@@ -366,7 +367,7 @@ def forward(


 class BambaForCausalLM(nn.Module, HasInnerState, SupportsLoRA, SupportsPP,
-                       IsHybrid):
+                       IsHybrid, SupportsV0Only):
     packed_modules_mapping = {
         "qkv_proj": [
             "q_proj",

vllm/model_executor/models/bart.py

Lines changed: 2 additions & 1 deletion
@@ -43,6 +43,7 @@
 from vllm.model_executor.sampling_metadata import SamplingMetadata
 from vllm.sequence import IntermediateTensors

+from .interfaces import SupportsV0Only
 from .utils import maybe_prefix

 logger = logging.get_logger(__name__)
@@ -776,7 +777,7 @@ def forward(self, input_ids: torch.Tensor, positions: torch.Tensor,
         return decoder_outputs


-class BartForConditionalGeneration(nn.Module):
+class BartForConditionalGeneration(nn.Module, SupportsV0Only):
     base_model_prefix = "model"

     def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""):

vllm/model_executor/models/bert.py

Lines changed: 2 additions & 2 deletions
@@ -26,7 +26,7 @@
 from vllm.transformers_utils.config import (
     get_cross_encoder_activation_function)

-from .interfaces import SupportsCrossEncoding
+from .interfaces import SupportsCrossEncoding, SupportsV0Only
 from .utils import WeightsMapper, maybe_prefix


@@ -385,7 +385,7 @@ def load_weights(self, weights: Iterable[Tuple[str,
         return loaded_params


-class BertEmbeddingModel(nn.Module):
+class BertEmbeddingModel(nn.Module, SupportsV0Only):
     """A model that uses Bert to provide embedding functionalities.

    This class encapsulates the BertModel and provides an interface for

vllm/model_executor/models/deepseek_mtp.py

Lines changed: 9 additions & 5 deletions
@@ -87,7 +87,7 @@ def forward(
             hidden_states=hidden_states,
             residual=None)
         hidden_states = residual + hidden_states
-        return self.shared_head(hidden_states)
+        return hidden_states


 class DeepSeekMultiTokenPredictor(nn.Module):
@@ -121,12 +121,13 @@ def forward(
         inputs_embeds: Optional[torch.Tensor] = None,
         spec_step_idx: int = 0,
     ) -> torch.Tensor:
-        return self.layers[str(self.mtp_start_layer_idx + spec_step_idx)](
+        current_step_idx = (spec_step_idx % self.num_mtp_layers)
+        return self.layers[str(self.mtp_start_layer_idx + current_step_idx)](
             input_ids,
             positions,
             previous_hidden_states,
             inputs_embeds,
-            spec_step_idx,
+            current_step_idx,
         )

     def compute_logits(
@@ -135,9 +136,12 @@ def compute_logits(
         sampling_metadata: SamplingMetadata,
         spec_step_idx: int = 0,
     ) -> torch.Tensor:
-        mtp_layer = self.layers[str(self.mtp_start_layer_idx + spec_step_idx)]
+        current_step_idx = (spec_step_idx % self.num_mtp_layers)
+        mtp_layer = self.layers[str(self.mtp_start_layer_idx +
+                                    current_step_idx)]
         logits = self.logits_processor(mtp_layer.shared_head.head,
-                                       hidden_states, sampling_metadata)
+                                       mtp_layer.shared_head(hidden_states),
+                                       sampling_metadata)
         return logits

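A minimal illustration of the new index arithmetic in DeepSeekMultiTokenPredictor: with the modulo, speculative steps beyond the number of MTP layers wrap around and reuse the existing layers instead of indexing past the end of self.layers. The layer index 61 below is hypothetical.

num_mtp_layers = 1        # draft model ships a single MTP module
mtp_start_layer_idx = 61  # hypothetical index of that module

for spec_step_idx in range(4):
    current_step_idx = spec_step_idx % num_mtp_layers
    layer_key = str(mtp_start_layer_idx + current_step_idx)
    print(spec_step_idx, "->", layer_key)  # every step resolves to layer "61"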
vllm/model_executor/models/florence2.py

Lines changed: 2 additions & 2 deletions
@@ -29,7 +29,7 @@
 from vllm.multimodal.profiling import BaseDummyInputsBuilder, ProcessorInputs
 from vllm.sequence import IntermediateTensors

-from .interfaces import SupportsMultiModal
+from .interfaces import SupportsMultiModal, SupportsV0Only
 from .utils import AutoWeightsLoader, flatten_bn, merge_multimodal_embeddings


@@ -651,7 +651,7 @@ def forward(
         return decoder_outputs


-class Florence2LanguageForConditionalGeneration(nn.Module):
+class Florence2LanguageForConditionalGeneration(nn.Module, SupportsV0Only):

     def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""):
         super().__init__()

vllm/model_executor/models/gritlm.py

Lines changed: 3 additions & 1 deletion
@@ -19,6 +19,8 @@
                                            PoolingSequenceGroupOutput)
 from vllm.transformers_utils.tokenizer import cached_tokenizer_from_config

+from .interfaces import SupportsV0Only
+
 logger = init_logger(__name__)


@@ -177,7 +179,7 @@ def forward(
         return PoolerOutput(outputs=pooled_outputs)


-class GritLM(LlamaForCausalLM):
+class GritLM(LlamaForCausalLM, SupportsV0Only):
     """This class implements the embedding model for parasail-ai/GritLM-7B-vllm.

     The class inherits from LlamaForCausalLM and provides a custom pooling
