2 changes: 1 addition & 1 deletion tools/pre_commit/mypy.py
@@ -30,6 +30,7 @@
     "vllm/entrypoints",
     "vllm/inputs",
     "vllm/logging_utils",
+    "vllm/lora",
     "vllm/multimodal",
     "vllm/platforms",
     "vllm/transformers_utils",
@@ -47,7 +48,6 @@
     "vllm/engine",
     "vllm/executor",
     "vllm/inputs",
-    "vllm/lora",
     "vllm/model_executor",
     "vllm/plugins",
     "vllm/worker",
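Moving "vllm/lora" from the lower list into the upper one promotes the package into the set of directories the mypy pre-commit hook checks; the remaining files in this PR add the asserts and ignores needed to make that check pass. As a rough, purely illustrative sketch of how a hook like this can drive mypy over a directory allow-list (the list contents and invocation below are assumptions, not the real tools/pre_commit/mypy.py):

    # Hypothetical sketch of a directory-scoped mypy hook; not vLLM's implementation.
    import subprocess
    import sys

    # Directories type-checked on every commit (now including vllm/lora).
    CHECKED_DIRS = [
        "vllm/entrypoints",
        "vllm/inputs",
        "vllm/logging_utils",
        "vllm/lora",
        "vllm/multimodal",
    ]

    def main() -> int:
        # Fail the commit if mypy reports errors in any checked directory.
        result = subprocess.run(["mypy", *CHECKED_DIRS])
        return result.returncode

    if __name__ == "__main__":
        sys.exit(main())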
1 change: 1 addition & 0 deletions vllm/lora/layers/base_linear.py
@@ -152,6 +152,7 @@ def set_lora(
     def apply(
         self, x: torch.Tensor, bias: Optional[torch.Tensor] = None
     ) -> torch.Tensor:
+        assert self.base_layer.quant_method is not None
         output = self.base_layer.quant_method.apply(self.base_layer, x, bias)

         # In transformers backend, x and output have extra batch dimension like
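The base layer's quant_method attribute is typed as Optional, so calling .apply() on it directly trips mypy's strict-optional check; the new assert narrows it to non-None at the call site. The same one-line fix is applied in the column-parallel and row-parallel layers below, and the same narrowing idea covers the tensorizer_dir and max_cpu_loras asserts further down. A minimal, self-contained sketch of the pattern (class names here are illustrative, not the vLLM ones):

    from typing import Optional

    class QuantMethod:
        def apply(self, x: float) -> float:
            return x * 2.0

    class BaseLayer:
        # Stays None until the layer is fully initialized.
        quant_method: Optional[QuantMethod] = None

    def forward(layer: BaseLayer, x: float) -> float:
        # Without the assert, mypy reports:
        #   Item "None" of "Optional[QuantMethod]" has no attribute "apply"
        assert layer.quant_method is not None
        # After the assert, the type is narrowed to QuantMethod.
        return layer.quant_method.apply(x)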
1 change: 1 addition & 0 deletions vllm/lora/layers/column_parallel_linear.py
@@ -35,6 +35,7 @@ def _mcp_apply(x, bias, layer: "ColumnParallelLinearWithLoRA"):
     if layer.lora_bias_stacked is not None:
         assert layer.n_slices == len(layer.lora_bias_stacked)

+    assert layer.base_layer.quant_method is not None
     output = layer.base_layer.quant_method.apply(layer.base_layer, x, bias)

     x = x.view(-1, x.shape[-1])
1 change: 1 addition & 0 deletions vllm/lora/layers/row_parallel_linear.py
@@ -136,6 +136,7 @@ def slice_bias(self, bias: torch.Tensor) -> torch.Tensor:
     def apply(
         self, x: torch.Tensor, bias: Optional[torch.Tensor] = None
     ) -> torch.Tensor:
+        assert self.base_layer.quant_method is not None
         output = self.base_layer.quant_method.apply(self.base_layer, x)

         x = x.view(-1, x.shape[-1])
17 changes: 10 additions & 7 deletions vllm/lora/models.py
@@ -252,6 +252,7 @@ def check_unexpected_modules(modules: dict):
             from tensorizer import TensorDeserializer

             tensorizer_config = TensorizerConfig(**tensorizer_config_dict)
+            assert tensorizer_config.tensorizer_dir is not None
             lora_tensor_path = os.path.join(
                 tensorizer_config.tensorizer_dir, "adapter_model.tensors"
             )
@@ -356,6 +357,7 @@ def __init__(
             vocab_size: the vocab size of the model.
             lora_config: the LoRA configuration.
         """
+        assert isinstance(model, nn.Module)
         self.model: SupportsLoRA = model
         self._registered_adapters: dict[int, LoRAModel] = {}
         # Dict instead of a set for compatibility with LRUCache.
@@ -394,13 +396,14 @@ def __init__(
         # Dict instead of a set for compatibility with LRUCache.
         self._last_mapping: Optional[LoRAMapping] = None
         self._create_lora_modules()
-        self.model.lora_manager = self
+        setattr(self.model, "lora_manager", self)

     def __len__(self) -> int:
         return len(self._registered_adapters)

     @property
     def capacity(self) -> int:
+        assert self.lora_config.max_cpu_loras is not None
         return self.lora_config.max_cpu_loras

     @property
@@ -503,7 +506,7 @@ def _parent_module(module_name: str) -> str:
             # - given an input 'x' return ''
             return module_name.rpartition(".")[0]

-        for module_name, module in self.model.named_modules(remove_duplicate=False):
+        for module_name, module in self.model.named_modules(remove_duplicate=False):  # type: ignore[attr-defined]
             if isinstance(module, PPMissingLayer):
                 continue
             if not self._match_target_modules(module_name):
@@ -527,7 +530,7 @@ def _parent_module(module_name: str) -> str:
                     self.lora_slots,
                     self.lora_config,
                     packed_moduled_lst,
-                    self.model.config,
+                    self.model.config,  # type: ignore[attr-defined]
                 ),
             )

@@ -540,7 +543,7 @@ def _parent_module(module_name: str) -> str:
                     f"{parent_module}.{logits_processor_module_name}"
                 )

-            logits_processor_module = self.model.get_submodule(
+            logits_processor_module = self.model.get_submodule(  # type: ignore[attr-defined]
                 logits_processor_module_name
             )

@@ -552,7 +555,7 @@ def _parent_module(module_name: str) -> str:
                     module,
                     self.lora_slots,
                     self.lora_config,
-                    self.model.config,
+                    self.model.config,  # type: ignore[attr-defined]
                 ),
             )

@@ -580,7 +583,7 @@ def create_dummy_lora(
     ) -> LoRAModel:
         """Create zero-initialized LoRAModel for warmup."""
         model = LoRAModel(lora_id, rank, {})
-        for module_name, module in self.model.named_modules():
+        for module_name, module in self.model.named_modules():  # type: ignore[attr-defined]
             bias_enabled = self.lora_config.bias_enabled
             if (
                 not self._match_target_modules(module_name)
@@ -663,7 +666,7 @@ def _filter_unsupported_mm_module(self, module_name: str) -> bool:
         be filtered out.
         """
         if self.supports_mm:
-            module_mapping: MultiModelKeys = self.model.get_mm_mapping()
+            module_mapping: MultiModelKeys = self.model.get_mm_mapping()  # type: ignore[attr-defined]
             prefix_lst = module_mapping.connector + module_mapping.tower_model
             return any([module_name.startswith(prefix) for prefix in prefix_lst])
         return False
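Most of the churn in models.py comes from self.model being annotated as SupportsLoRA, a protocol that does not declare the nn.Module surface (config, named_modules, get_submodule, ...). Hence the assert isinstance(model, nn.Module) at construction, setattr for the lora_manager back-reference, and targeted # type: ignore[attr-defined] comments at the remaining call sites. A condensed sketch of why mypy complains and how each device silences it (the protocol and manager below are simplified stand-ins, not the real vLLM definitions):

    from typing import Protocol

    import torch.nn as nn

    class SupportsLoRA(Protocol):
        # The protocol promises only LoRA-specific members, not nn.Module ones.
        packed_modules_mapping: dict[str, list[str]]

    class Manager:
        def __init__(self, model: SupportsLoRA) -> None:
            # LoRA-capable models are nn.Modules in practice; the assert checks
            # that at runtime and lets mypy narrow the type accordingly.
            assert isinstance(model, nn.Module)
            self.model: SupportsLoRA = model
            # The protocol declares no `lora_manager` attribute, so a plain
            # `self.model.lora_manager = self` is an attr-defined error;
            # setattr keeps the dynamic assignment without widening the type.
            setattr(self.model, "lora_manager", self)

        def walk(self) -> None:
            # named_modules() comes from nn.Module, which the protocol does not
            # declare, hence the targeted ignore at the call site.
            for name, _ in self.model.named_modules():  # type: ignore[attr-defined]
                print(name)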
1 change: 1 addition & 0 deletions vllm/lora/peft_helper.py
@@ -91,6 +91,7 @@ def from_local_dir(
             tensorizer_args = tensorizer_config._construct_tensorizer_args()
             from tensorizer.stream_io import open_stream

+            assert tensorizer_config.tensorizer_dir is not None
             lora_config_path = os.path.join(
                 tensorizer_config.tensorizer_dir, "adapter_config.json"
             )
6 changes: 4 additions & 2 deletions vllm/lora/utils.py
@@ -132,11 +132,13 @@ def parse_fine_tuned_lora_name(
     # mapping correctly.
     if name.startswith("base_model.model."):
         name = name.replace("base_model.model.", "")
-        name = weights_mapper._map_name(name) if weights_mapper else name
+        mapped_name = weights_mapper._map_name(name) if weights_mapper else name
+        name = mapped_name if mapped_name is not None else name
         # recover the prefix `base_model.model.`
         name = "base_model.model." + name
     else:
-        name = weights_mapper._map_name(name) if weights_mapper else name
+        mapped_name = weights_mapper._map_name(name) if weights_mapper else name
+        name = mapped_name if mapped_name is not None else name

     # In some situations, we may not start with `base_model.model.`.
     # If we don't (e.g., ibm-granite/granite-speech-3.3-8b),
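The weights mapper's _map_name can return None, so feeding its result straight back into name made name Optional for the rest of the function and broke the later string operations under mypy. The change keeps the mapped value in its own variable and falls back to the original name when the mapper returns None. A reduced sketch of the same pattern (the mapper below is a stand-in, not vLLM's WeightsMapper):

    from typing import Callable, Optional

    # Stand-in for a mapper whose lookup may return None.
    def map_name(name: str) -> Optional[str]:
        table = {"q_proj": "attn.q_proj"}
        return table.get(name)

    def normalize(name: str, mapper: Optional[Callable[[str], Optional[str]]]) -> str:
        # Assigning the Optional result directly into `name` would make `name`
        # Optional[str] from here on; keep it separate and fall back instead.
        mapped_name = mapper(name) if mapper else name
        name = mapped_name if mapped_name is not None else name
        return "base_model.model." + name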
5 changes: 4 additions & 1 deletion vllm/lora/worker_manager.py
@@ -7,6 +7,7 @@
 import torch

 from vllm.config import VllmConfig
+from vllm.config.lora import LoRAConfig
 from vllm.logger import init_logger
 from vllm.lora.models import (
     LoRAModel,
@@ -46,7 +47,9 @@ def __init__(
             vllm_config.scheduler_config.max_num_batched_tokens
         )
         self.vocab_size = vllm_config.model_config.get_vocab_size()
-        self.lora_config = vllm_config.lora_config
+        lora_config = vllm_config.lora_config
+        assert lora_config is not None
+        self.lora_config: LoRAConfig = lora_config

         # Use get_text_config() in case of multimodal models
         text_config = vllm_config.model_config.hf_config.get_text_config()
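vllm_config.lora_config is Optional, so assigning it directly left self.lora_config Optional as well, and every downstream use would need its own None check. The worker LoRA manager is only built when LoRA is enabled, so the value is narrowed once with an assert and the attribute is annotated with the concrete LoRAConfig (hence the new import above). A small sketch of the same attribute-narrowing pattern (the config classes below are simplified stand-ins):

    from dataclasses import dataclass
    from typing import Optional

    @dataclass
    class LoRAConfig:
        max_loras: int = 1

    @dataclass
    class VllmConfig:
        # Only populated when LoRA is enabled.
        lora_config: Optional[LoRAConfig] = None

    class WorkerLoRAManager:
        def __init__(self, vllm_config: VllmConfig) -> None:
            lora_config = vllm_config.lora_config
            # Encodes the expectation that this manager only exists with LoRA on.
            assert lora_config is not None
            # Annotate the attribute with the narrowed, non-Optional type.
            self.lora_config: LoRAConfig = lora_config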