diff --git a/tools/pre_commit/mypy.py b/tools/pre_commit/mypy.py
index 22ee08535bdd..b28e2b5bf9c7 100755
--- a/tools/pre_commit/mypy.py
+++ b/tools/pre_commit/mypy.py
@@ -30,6 +30,7 @@
     "vllm/entrypoints",
     "vllm/inputs",
     "vllm/logging_utils",
+    "vllm/lora",
     "vllm/multimodal",
     "vllm/platforms",
     "vllm/transformers_utils",
@@ -47,7 +48,6 @@
     "vllm/engine",
     "vllm/executor",
     "vllm/inputs",
-    "vllm/lora",
     "vllm/model_executor",
     "vllm/plugins",
     "vllm/worker",
diff --git a/vllm/lora/layers/base_linear.py b/vllm/lora/layers/base_linear.py
index d2f017c19ccd..eba1e17bc403 100644
--- a/vllm/lora/layers/base_linear.py
+++ b/vllm/lora/layers/base_linear.py
@@ -152,6 +152,7 @@ def set_lora(
     def apply(
         self, x: torch.Tensor, bias: Optional[torch.Tensor] = None
     ) -> torch.Tensor:
+        assert self.base_layer.quant_method is not None
         output = self.base_layer.quant_method.apply(self.base_layer, x, bias)
 
         # In transformers backend, x and output have extra batch dimension like
diff --git a/vllm/lora/layers/column_parallel_linear.py b/vllm/lora/layers/column_parallel_linear.py
index 011d38157456..43081449b901 100644
--- a/vllm/lora/layers/column_parallel_linear.py
+++ b/vllm/lora/layers/column_parallel_linear.py
@@ -35,6 +35,7 @@ def _mcp_apply(x, bias, layer: "ColumnParallelLinearWithLoRA"):
     if layer.lora_bias_stacked is not None:
         assert layer.n_slices == len(layer.lora_bias_stacked)
 
+    assert layer.base_layer.quant_method is not None
     output = layer.base_layer.quant_method.apply(layer.base_layer, x, bias)
 
     x = x.view(-1, x.shape[-1])
diff --git a/vllm/lora/layers/row_parallel_linear.py b/vllm/lora/layers/row_parallel_linear.py
index 738371f22a36..074ee7d07d42 100644
--- a/vllm/lora/layers/row_parallel_linear.py
+++ b/vllm/lora/layers/row_parallel_linear.py
@@ -136,6 +136,7 @@ def slice_bias(self, bias: torch.Tensor) -> torch.Tensor:
     def apply(
         self, x: torch.Tensor, bias: Optional[torch.Tensor] = None
     ) -> torch.Tensor:
+        assert self.base_layer.quant_method is not None
         output = self.base_layer.quant_method.apply(self.base_layer, x)
 
         x = x.view(-1, x.shape[-1])
diff --git a/vllm/lora/models.py b/vllm/lora/models.py
index 771c8608f4a8..e5e8e9c1037e 100644
--- a/vllm/lora/models.py
+++ b/vllm/lora/models.py
@@ -252,6 +252,7 @@ def check_unexpected_modules(modules: dict):
             from tensorizer import TensorDeserializer
 
             tensorizer_config = TensorizerConfig(**tensorizer_config_dict)
+            assert tensorizer_config.tensorizer_dir is not None
             lora_tensor_path = os.path.join(
                 tensorizer_config.tensorizer_dir, "adapter_model.tensors"
             )
@@ -356,6 +357,7 @@ def __init__(
             vocab_size: the vocab size of the model.
             lora_config: the LoRA configuration.
         """
+        assert isinstance(model, nn.Module)
         self.model: SupportsLoRA = model
         self._registered_adapters: dict[int, LoRAModel] = {}
         # Dict instead of a set for compatibility with LRUCache.
@@ -394,13 +396,14 @@ def __init__(
         # Dict instead of a set for compatibility with LRUCache.
         self._last_mapping: Optional[LoRAMapping] = None
         self._create_lora_modules()
-        self.model.lora_manager = self
+        setattr(self.model, "lora_manager", self)
 
     def __len__(self) -> int:
         return len(self._registered_adapters)
 
     @property
     def capacity(self) -> int:
+        assert self.lora_config.max_cpu_loras is not None
         return self.lora_config.max_cpu_loras
 
     @property
@@ -503,7 +506,7 @@ def _parent_module(module_name: str) -> str:
            # - given an input 'x' return ''
            return module_name.rpartition(".")[0]
 
-        for module_name, module in self.model.named_modules(remove_duplicate=False):
+        for module_name, module in self.model.named_modules(remove_duplicate=False):  # type: ignore[attr-defined]
             if isinstance(module, PPMissingLayer):
                 continue
             if not self._match_target_modules(module_name):
@@ -527,7 +530,7 @@ def _parent_module(module_name: str) -> str:
                     self.lora_slots,
                     self.lora_config,
                     packed_moduled_lst,
-                    self.model.config,
+                    self.model.config,  # type: ignore[attr-defined]
                 ),
             )
 
@@ -540,7 +543,7 @@ def _parent_module(module_name: str) -> str:
                         f"{parent_module}.{logits_processor_module_name}"
                     )
 
-                logits_processor_module = self.model.get_submodule(
+                logits_processor_module = self.model.get_submodule(  # type: ignore[attr-defined]
                     logits_processor_module_name
                 )
 
@@ -552,7 +555,7 @@ def _parent_module(module_name: str) -> str:
                         module,
                         self.lora_slots,
                         self.lora_config,
-                        self.model.config,
+                        self.model.config,  # type: ignore[attr-defined]
                     ),
                 )
 
@@ -580,7 +583,7 @@ def create_dummy_lora(
     ) -> LoRAModel:
         """Create zero-initialized LoRAModel for warmup."""
         model = LoRAModel(lora_id, rank, {})
-        for module_name, module in self.model.named_modules():
+        for module_name, module in self.model.named_modules():  # type: ignore[attr-defined]
             bias_enabled = self.lora_config.bias_enabled
             if (
                 not self._match_target_modules(module_name)
@@ -663,7 +666,7 @@ def _filter_unsupported_mm_module(self, module_name: str) -> bool:
         be filtered out.
         """
         if self.supports_mm:
-            module_mapping: MultiModelKeys = self.model.get_mm_mapping()
+            module_mapping: MultiModelKeys = self.model.get_mm_mapping()  # type: ignore[attr-defined]
             prefix_lst = module_mapping.connector + module_mapping.tower_model
             return any([module_name.startswith(prefix) for prefix in prefix_lst])
         return False
diff --git a/vllm/lora/peft_helper.py b/vllm/lora/peft_helper.py
index 48412eab92d8..a9cf77770147 100644
--- a/vllm/lora/peft_helper.py
+++ b/vllm/lora/peft_helper.py
@@ -91,6 +91,7 @@ def from_local_dir(
             tensorizer_args = tensorizer_config._construct_tensorizer_args()
             from tensorizer.stream_io import open_stream
 
+            assert tensorizer_config.tensorizer_dir is not None
             lora_config_path = os.path.join(
                 tensorizer_config.tensorizer_dir, "adapter_config.json"
             )
diff --git a/vllm/lora/utils.py b/vllm/lora/utils.py
index 5e55d44ce8d9..df86333f7de3 100644
--- a/vllm/lora/utils.py
+++ b/vllm/lora/utils.py
@@ -132,11 +132,13 @@ def parse_fine_tuned_lora_name(
     # mapping correctly.
     if name.startswith("base_model.model."):
         name = name.replace("base_model.model.", "")
-        name = weights_mapper._map_name(name) if weights_mapper else name
+        mapped_name = weights_mapper._map_name(name) if weights_mapper else name
+        name = mapped_name if mapped_name is not None else name
         # recover the prefix `base_model.model.`
         name = "base_model.model." + name
     else:
-        name = weights_mapper._map_name(name) if weights_mapper else name
+        mapped_name = weights_mapper._map_name(name) if weights_mapper else name
+        name = mapped_name if mapped_name is not None else name
 
     # In some situations, we may not start with `base_model.model.`.
     # If we don't (e.g., ibm-granite/granite-speech-3.3-8b),
diff --git a/vllm/lora/worker_manager.py b/vllm/lora/worker_manager.py
index 3ca819fb732c..771c33acf27d 100644
--- a/vllm/lora/worker_manager.py
+++ b/vllm/lora/worker_manager.py
@@ -7,6 +7,7 @@
 import torch
 
 from vllm.config import VllmConfig
+from vllm.config.lora import LoRAConfig
 from vllm.logger import init_logger
 from vllm.lora.models import (
     LoRAModel,
@@ -46,7 +47,9 @@ def __init__(
             vllm_config.scheduler_config.max_num_batched_tokens
         )
         self.vocab_size = vllm_config.model_config.get_vocab_size()
-        self.lora_config = vllm_config.lora_config
+        lora_config = vllm_config.lora_config
+        assert lora_config is not None
+        self.lora_config: LoRAConfig = lora_config
 
         # Use get_text_config() in case of multimodal models
         text_config = vllm_config.model_config.hf_config.get_text_config()
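Context for the recurring pattern above: most of these changes rely on assert-based narrowing of Optional values, which lets mypy's strict checks pass without changing the happy-path behavior. Below is a minimal, self-contained sketch of that idea; the names (QuantMethod, BaseLayer, forward) are hypothetical stand-ins for illustration, not vLLM's actual classes.

from typing import Optional


class QuantMethod:
    def apply(self, x: float) -> float:
        # Stand-in for the real quantized-linear apply() call.
        return x


class BaseLayer:
    # Declared Optional, so mypy will not allow calling .apply() on it
    # until the None case has been ruled out.
    quant_method: Optional[QuantMethod] = None


def forward(layer: BaseLayer, x: float) -> float:
    # Without this assert, mypy reports an error along the lines of:
    #   Item "None" of "Optional[QuantMethod]" has no attribute "apply"
    # The assert narrows the type to QuantMethod for the call below.
    assert layer.quant_method is not None
    return layer.quant_method.apply(x)

The same narrowing explains the mapped_name change in vllm/lora/utils.py: assigning the possibly-None result to a separate variable and falling back to the original name keeps the variable's type as str for the rest of the function.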