vllm/model_executor/models/bart.py (7 changes: 4 additions & 3 deletions)

@@ -44,7 +44,7 @@
 from vllm.model_executor.sampling_metadata import SamplingMetadata
 from vllm.sequence import IntermediateTensors

-from .interfaces import SupportsV0Only
+from .interfaces import SupportsQuant, SupportsV0Only
 from .utils import maybe_prefix

 logger = logging.get_logger(__name__)
@@ -697,7 +697,7 @@ def forward(
         return hidden_states


-class BartModel(nn.Module):
+class BartModel(nn.Module, SupportsQuant):
     _tied_weights_keys = [
         "encoder.embed_tokens.weight", "decoder.embed_tokens.weight"
     ]
@@ -763,7 +763,8 @@ def forward(self, input_ids: torch.Tensor, positions: torch.Tensor,
         return decoder_outputs


-class BartForConditionalGeneration(nn.Module, SupportsV0Only):
+class BartForConditionalGeneration(nn.Module, SupportsV0Only, SupportsQuant):
+    packed_modules_mapping = {"qkv_proj": ["q_proj", "k_proj", "v_proj"]}
     base_model_prefix = "model"

     def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""):

vllm/model_executor/models/bloom.py (7 changes: 5 additions & 2 deletions)

@@ -42,7 +42,7 @@
 from vllm.model_executor.sampling_metadata import SamplingMetadata
 from vllm.sequence import IntermediateTensors

-from .interfaces import SupportsPP
+from .interfaces import SupportsPP, SupportsQuant
 from .utils import (is_pp_missing_parameter,
                     make_empty_intermediate_tensors_factory, make_layers,
                     maybe_prefix)
@@ -279,7 +279,10 @@ def forward(
         return hidden_states


-class BloomForCausalLM(nn.Module, SupportsPP):
+class BloomForCausalLM(nn.Module, SupportsPP, SupportsQuant):
+    packed_modules_mapping = {
+        "query_key_value": ["q_proj", "k_proj", "v_proj"]
+    }

     def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""):
         super().__init__()

vllm/model_executor/models/chatglm.py (5 changes: 3 additions & 2 deletions)

@@ -27,7 +27,7 @@
 from vllm.sequence import IntermediateTensors
 from vllm.transformers_utils.configs import ChatGLMConfig

-from .interfaces import SupportsLoRA, SupportsPP
+from .interfaces import SupportsLoRA, SupportsPP, SupportsQuant
 from .utils import (AutoWeightsLoader, WeightsMapper, is_pp_missing_parameter,
                     make_empty_intermediate_tensors_factory, make_layers,
                     maybe_prefix)
@@ -449,7 +449,8 @@ def load_weights(self, weights: Iterable[Tuple[str, torch.Tensor]]):
         return loader.load_weights(weights, mapper=self.hf_to_vllm_mapper)


-class ChatGLMForCausalLM(ChatGLMBaseModel, SupportsLoRA, SupportsPP):
+class ChatGLMForCausalLM(ChatGLMBaseModel, SupportsLoRA, SupportsPP,
+                         SupportsQuant):
     packed_modules_mapping = {
         "query_key_value": ["query_key_value"],
         "dense_h_to_4h": ["dense_h_to_4h"]

vllm/model_executor/models/deepseek.py (8 changes: 6 additions & 2 deletions)

@@ -50,7 +50,7 @@
 from vllm.model_executor.sampling_metadata import SamplingMetadata
 from vllm.sequence import IntermediateTensors

-from .interfaces import SupportsPP
+from .interfaces import SupportsPP, SupportsQuant
 from .utils import (extract_layer_index, is_pp_missing_parameter,
                     make_empty_intermediate_tensors_factory, make_layers,
                     maybe_prefix)
@@ -386,7 +386,11 @@ def forward(
         return hidden_states


-class DeepseekForCausalLM(nn.Module, SupportsPP):
+class DeepseekForCausalLM(nn.Module, SupportsPP, SupportsQuant):
+    packed_modules_mapping = {
+        "qkv_proj": ["q_proj", "k_proj", "v_proj"],
+        "gate_up_proj": ["gate_proj", "up_proj"]
+    }

     def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""):
         super().__init__()

vllm/model_executor/models/falcon.py (4 changes: 2 additions & 2 deletions)

@@ -48,7 +48,7 @@
 from vllm.sequence import IntermediateTensors
 from vllm.transformers_utils.configs import RWConfig

-from .interfaces import SupportsPP
+from .interfaces import SupportsPP, SupportsQuant
 from .utils import (is_pp_missing_parameter,
                     make_empty_intermediate_tensors_factory, make_layers,
                     maybe_prefix)
@@ -396,7 +396,7 @@ def forward(
         return hidden_states


-class FalconForCausalLM(nn.Module, SupportsPP):
+class FalconForCausalLM(nn.Module, SupportsPP, SupportsQuant):
     packed_modules_mapping = {
         "query_key_value": ["query_key_value"],
     }

vllm/model_executor/models/gemma.py (4 changes: 2 additions & 2 deletions)

@@ -42,7 +42,7 @@
 from vllm.model_executor.sampling_metadata import SamplingMetadata
 from vllm.sequence import IntermediateTensors

-from .interfaces import SupportsLoRA, SupportsPP
+from .interfaces import SupportsLoRA, SupportsPP, SupportsQuant
 from .utils import (is_pp_missing_parameter,
                     make_empty_intermediate_tensors_factory, make_layers,
                     maybe_prefix)
@@ -320,7 +320,7 @@ def forward(
         return hidden_states


-class GemmaForCausalLM(nn.Module, SupportsLoRA, SupportsPP):
+class GemmaForCausalLM(nn.Module, SupportsLoRA, SupportsPP, SupportsQuant):
     packed_modules_mapping = {
         "qkv_proj": [
             "q_proj",

vllm/model_executor/models/gpt2.py (5 changes: 3 additions & 2 deletions)

@@ -42,7 +42,7 @@
 from vllm.model_executor.sampling_metadata import SamplingMetadata
 from vllm.sequence import IntermediateTensors

-from .interfaces import SupportsPP
+from .interfaces import SupportsPP, SupportsQuant
 from .utils import (is_pp_missing_parameter,
                     make_empty_intermediate_tensors_factory, make_layers,
                     maybe_prefix)
@@ -236,7 +236,8 @@ def forward(
         return hidden_states


-class GPT2LMHeadModel(nn.Module, SupportsPP):
+class GPT2LMHeadModel(nn.Module, SupportsPP, SupportsQuant):
+    packed_modules_mapping = {"c_attn": ["q_proj", "k_proj", "v_proj"]}

     def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""):
         super().__init__()

vllm/model_executor/models/gpt_bigcode.py (5 changes: 3 additions & 2 deletions)

@@ -42,7 +42,7 @@
 from vllm.model_executor.sampling_metadata import SamplingMetadata
 from vllm.sequence import IntermediateTensors

-from .interfaces import SupportsLoRA, SupportsPP
+from .interfaces import SupportsLoRA, SupportsPP, SupportsQuant
 from .utils import (is_pp_missing_parameter,
                     make_empty_intermediate_tensors_factory, make_layers)

@@ -245,7 +245,8 @@ def forward(
         return hidden_states


-class GPTBigCodeForCausalLM(nn.Module, SupportsLoRA, SupportsPP):
+class GPTBigCodeForCausalLM(nn.Module, SupportsLoRA, SupportsPP,
+                            SupportsQuant):
     packed_modules_mapping = {"c_attn": ["c_attn"]}

     # LoRA specific attributes

vllm/model_executor/models/gpt_j.py (8 changes: 6 additions & 2 deletions)

@@ -42,7 +42,7 @@
 from vllm.model_executor.sampling_metadata import SamplingMetadata
 from vllm.sequence import IntermediateTensors

-from .interfaces import SupportsPP
+from .interfaces import SupportsPP, SupportsQuant
 from .utils import (is_pp_missing_parameter,
                     make_empty_intermediate_tensors_factory, make_layers,
                     maybe_prefix)
@@ -229,7 +229,11 @@ def forward(
         return hidden_states


-class GPTJForCausalLM(nn.Module, SupportsPP):
+class GPTJForCausalLM(nn.Module, SupportsPP, SupportsQuant):
+    packed_modules_mapping = {
+        "qkv_proj": ["q_proj", "k_proj", "v_proj"],
+        "gate_up_proj": ["gate_proj", "up_proj"]
+    }

     def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""):
         super().__init__()

vllm/model_executor/models/gpt_neox.py (7 changes: 5 additions & 2 deletions)

@@ -41,7 +41,7 @@
 from vllm.model_executor.sampling_metadata import SamplingMetadata
 from vllm.sequence import IntermediateTensors

-from .interfaces import SupportsPP
+from .interfaces import SupportsPP, SupportsQuant
 from .utils import (is_pp_missing_parameter,
                     make_empty_intermediate_tensors_factory, make_layers,
                     maybe_prefix)
@@ -242,7 +242,10 @@ def forward(
         return hidden_states


-class GPTNeoXForCausalLM(nn.Module, SupportsPP):
+class GPTNeoXForCausalLM(nn.Module, SupportsPP, SupportsQuant):
+    packed_modules_mapping = {
+        "query_key_value": ["q_proj", "k_proj", "v_proj"]
+    }

     def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""):
         super().__init__()

vllm/model_executor/models/grok1.py (4 changes: 2 additions & 2 deletions)

@@ -47,7 +47,7 @@
 from vllm.model_executor.sampling_metadata import SamplingMetadata
 from vllm.sequence import IntermediateTensors

-from .interfaces import SupportsLoRA, SupportsPP
+from .interfaces import SupportsLoRA, SupportsPP, SupportsQuant
 from .utils import (is_pp_missing_parameter,
                     make_empty_intermediate_tensors_factory, make_layers,
                     maybe_prefix)
@@ -371,7 +371,7 @@ def forward(
         return hidden_states


-class Grok1ForCausalLM(nn.Module, SupportsLoRA, SupportsPP):
+class Grok1ForCausalLM(nn.Module, SupportsLoRA, SupportsPP, SupportsQuant):
     fall_back_to_pt_during_load = False

     packed_modules_mapping = {

vllm/model_executor/models/internlm2.py (4 changes: 2 additions & 2 deletions)

@@ -31,7 +31,7 @@
 from vllm.model_executor.sampling_metadata import SamplingMetadata
 from vllm.sequence import IntermediateTensors, PoolerOutput

-from .interfaces import SupportsLoRA, SupportsPP
+from .interfaces import SupportsLoRA, SupportsPP, SupportsQuant
 from .utils import (is_pp_missing_parameter,
                     make_empty_intermediate_tensors_factory, make_layers,
                     maybe_prefix)
@@ -307,7 +307,7 @@ def forward(
         return hidden_states


-class InternLM2ForCausalLM(nn.Module, SupportsPP, SupportsLoRA):
+class InternLM2ForCausalLM(nn.Module, SupportsPP, SupportsLoRA, SupportsQuant):
     packed_modules_mapping = {
         "wqkv": ["wqkv"],
         "gate_up_proj": ["w1", "w3"],

vllm/model_executor/models/jais.py (5 changes: 3 additions & 2 deletions)

@@ -44,7 +44,7 @@
 from vllm.sequence import IntermediateTensors
 from vllm.transformers_utils.configs import JAISConfig

-from .interfaces import SupportsPP
+from .interfaces import SupportsPP, SupportsQuant
 from .utils import (is_pp_missing_parameter,
                     make_empty_intermediate_tensors_factory, make_layers,
                     maybe_prefix)
@@ -285,7 +285,8 @@ def forward(
         return hidden_states


-class JAISLMHeadModel(nn.Module, SupportsPP):
+class JAISLMHeadModel(nn.Module, SupportsPP, SupportsQuant):
+    packed_modules_mapping = {"c_attn": ["c_attn"]}

     def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""):
         super().__init__()

vllm/model_executor/models/llama.py (10 changes: 7 additions & 3 deletions)

@@ -48,7 +48,7 @@
 from vllm.model_executor.sampling_metadata import SamplingMetadata
 from vllm.sequence import IntermediateTensors

-from .interfaces import SupportsLoRA, SupportsPP
+from .interfaces import SupportsLoRA, SupportsPP, SupportsQuant
 from .utils import (AutoWeightsLoader, PPMissingLayer, extract_layer_index,
                     is_pp_missing_parameter,
                     make_empty_intermediate_tensors_factory, make_layers,
@@ -286,7 +286,11 @@ def forward(


 @support_torch_compile
-class LlamaModel(nn.Module):
+class LlamaModel(nn.Module, SupportsQuant):
+    packed_modules_mapping = {
+        "qkv_proj": ["q_proj", "k_proj", "v_proj"],
+        "gate_up_proj": ["gate_proj", "up_proj"]
+    }

     def __init__(self,
                  *,
@@ -433,7 +437,7 @@ def load_weights(self, weights: Iterable[Tuple[str,
         return loaded_params


-class LlamaForCausalLM(nn.Module, SupportsLoRA, SupportsPP):
+class LlamaForCausalLM(nn.Module, SupportsLoRA, SupportsPP, SupportsQuant):
     packed_modules_mapping = {
         "qkv_proj": ["q_proj", "k_proj", "v_proj"],
         "gate_up_proj": ["gate_proj", "up_proj"]

vllm/model_executor/models/mamba.py (5 changes: 4 additions & 1 deletion)

@@ -20,6 +20,7 @@
 from vllm.model_executor.model_loader.weight_utils import default_weight_loader
 from vllm.model_executor.models.interfaces import (HasInnerState,
                                                    IsAttentionFree, SupportsPP,
+                                                   SupportsQuant,
                                                    SupportsV0Only)
 from vllm.model_executor.models.mamba_cache import (MambaCacheManager,
                                                     MambaCacheParams)
@@ -156,7 +157,9 @@ def forward(


 class MambaForCausalLM(nn.Module, HasInnerState, IsAttentionFree, SupportsPP,
-                       SupportsV0Only):
+                       SupportsQuant, SupportsV0Only):
+    # Mamba doesn't use the packed_modules_mapping pattern as it doesn't use
+    # stacked parameters in the same way as attention-based models

     def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""):
         config = vllm_config.model_config.hf_config

vllm/model_executor/models/minicpm.py (4 changes: 2 additions & 2 deletions)

@@ -54,7 +54,7 @@
 from vllm.platforms import current_platform
 from vllm.sequence import IntermediateTensors

-from .interfaces import SupportsLoRA, SupportsPP
+from .interfaces import SupportsLoRA, SupportsPP, SupportsQuant
 from .utils import (AutoWeightsLoader, is_pp_missing_parameter,
                     make_empty_intermediate_tensors_factory, make_layers,
                     maybe_prefix)
@@ -497,7 +497,7 @@ def load_weights(self, weights: Iterable[Tuple[str,
         return loaded_params


-class MiniCPMForCausalLM(nn.Module, SupportsLoRA, SupportsPP):
+class MiniCPMForCausalLM(nn.Module, SupportsLoRA, SupportsPP, SupportsQuant):
     packed_modules_mapping = {
         "qkv_proj": [
             "q_proj",

vllm/model_executor/models/mixtral.py (7 changes: 4 additions & 3 deletions)

@@ -48,7 +48,7 @@
 from vllm.model_executor.sampling_metadata import SamplingMetadata
 from vllm.sequence import IntermediateTensors

-from .interfaces import SupportsLoRA, SupportsPP
+from .interfaces import SupportsLoRA, SupportsPP, SupportsQuant
 from .utils import (is_pp_missing_parameter,
                     make_empty_intermediate_tensors_factory, make_layers,
                     maybe_prefix)
@@ -246,7 +246,8 @@ def forward(


 @support_torch_compile
-class MixtralModel(nn.Module):
+class MixtralModel(nn.Module, SupportsQuant):
+    packed_modules_mapping = {"qkv_proj": ["q_proj", "k_proj", "v_proj"]}

     def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""):
         super().__init__()
@@ -310,7 +311,7 @@ def forward(
         return hidden_states


-class MixtralForCausalLM(nn.Module, SupportsLoRA, SupportsPP):
+class MixtralForCausalLM(nn.Module, SupportsLoRA, SupportsPP, SupportsQuant):
     fall_back_to_pt_during_load = False

     packed_modules_mapping = {

vllm/model_executor/models/mpt.py (5 changes: 3 additions & 2 deletions)

@@ -26,7 +26,7 @@
 from vllm.sequence import IntermediateTensors
 from vllm.transformers_utils.configs.mpt import MPTConfig

-from .interfaces import SupportsPP
+from .interfaces import SupportsPP, SupportsQuant
 from .utils import (is_pp_missing_parameter,
                     make_empty_intermediate_tensors_factory, make_layers,
                     maybe_prefix)
@@ -267,7 +267,8 @@ def forward(
         return hidden_states


-class MPTForCausalLM(nn.Module, SupportsPP):
+class MPTForCausalLM(nn.Module, SupportsPP, SupportsQuant):
+    packed_modules_mapping = {"Wqkv": ["q_proj", "k_proj", "v_proj"]}

     def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""):
         super().__init__()

vllm/model_executor/models/olmo.py (8 changes: 6 additions & 2 deletions)

@@ -46,7 +46,7 @@
 from vllm.model_executor.sampling_metadata import SamplingMetadata
 from vllm.sequence import IntermediateTensors

-from .interfaces import SupportsPP
+from .interfaces import SupportsPP, SupportsQuant
 from .utils import (is_pp_missing_parameter,
                     make_empty_intermediate_tensors_factory, make_layers,
                     maybe_prefix)
@@ -286,7 +286,11 @@ def forward(
         return hidden_states


-class OlmoForCausalLM(nn.Module, SupportsPP):
+class OlmoForCausalLM(nn.Module, SupportsPP, SupportsQuant):
+    packed_modules_mapping = {
+        "qkv_proj": ["q_proj", "k_proj", "v_proj"],
+        "gate_up_proj": ["gate_proj", "up_proj"]
+    }
     """
     Extremely barebones HF model wrapper.
     """
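
For context on the pattern this diff applies to each model: SupportsQuant marks a model class as able to receive vLLM's quantization config, and packed_modules_mapping tells quantization-aware weight loading which separately stored checkpoint projections are fused into a single packed module at runtime, so per-projection scales and zero-points can be regrouped to match. Below is a minimal sketch of that opt-in shape; SupportsQuant is reduced here to a stand-in mixin (the real interface in vllm/model_executor/models/interfaces.py also wires the quant config into the model), and ToyForCausalLM is a hypothetical example class, not part of vLLM.

# Minimal sketch under the assumptions stated above; not vLLM's
# actual SupportsQuant implementation.
from typing import ClassVar

import torch.nn as nn


class SupportsQuant:
    """Stand-in marker mixin: declares how separate checkpoint projections
    map to fused ("packed") runtime modules so quantization parameters can
    be packed the same way the weights are."""
    packed_modules_mapping: ClassVar[dict[str, list[str]]] = {}


class ToyForCausalLM(nn.Module, SupportsQuant):
    # Checkpoints store q/k/v and gate/up weights separately; vLLM fuses
    # each group into one GEMM at runtime, so their quantization scales
    # must be packed the same way.
    packed_modules_mapping = {
        "qkv_proj": ["q_proj", "k_proj", "v_proj"],
        "gate_up_proj": ["gate_proj", "up_proj"],
    }

With such a mapping in place, a quantized checkpoint that stores q_proj, k_proj, and v_proj separately can be loaded into the fused qkv_proj module without the quantization scheme losing track of which slice each scale belongs to.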