diff --git a/benchmarks/kernels/benchmark_lora.py b/benchmarks/kernels/benchmark_lora.py
index ecde8fbaa15b..1deb0026a6e5 100644
--- a/benchmarks/kernels/benchmark_lora.py
+++ b/benchmarks/kernels/benchmark_lora.py
@@ -89,7 +89,7 @@ def make_prompt_lora_mapping(num_prompts: int, num_active_loras: int,
                              sort_by_lora_id: bool,
                              device: str) -> torch.Tensor:
     """
-    All prompts are mapped to a Lora ID in range [0, num_active_loras).
+    All prompts are mapped to a LoRA ID in range [0, num_active_loras).
     where 0 refers to first lora, 1 refers to second lora and so on.
     """
     assert num_active_loras > 0
diff --git a/docs/source/features/lora.md b/docs/source/features/lora.md
index fb5a7a0d519c..dff7e916fb46 100644
--- a/docs/source/features/lora.md
+++ b/docs/source/features/lora.md
@@ -170,7 +170,7 @@ Now, you can specify a base_model_name alongside the name and path using JSON fo
 To provide the backward compatibility support, you can still use the old key-value format (name=path), but the `base_model_name` will remain unspecified in that case.
 
-## Lora model lineage in model card
+## LoRA model lineage in model card
 
 The new format of `--lora-modules` is mainly to support the display of parent model information in the model card. Here's an explanation of how your current response supports this:
diff --git a/tests/core/test_scheduler.py b/tests/core/test_scheduler.py
index dcc97ebaa7c5..66bc5257f081 100644
--- a/tests/core/test_scheduler.py
+++ b/tests/core/test_scheduler.py
@@ -491,7 +491,7 @@ def test_prefill_schedule_max_lora():
                             lora_path="abc"))
         scheduler.add_seq_group(seq_group)
     # Add two more requests to verify lora is prioritized.
-    # 0: Lora, 1: Lora, 2: regular, 3: regular
+    # 0: LoRA, 1: LoRA, 2: regular, 3: regular
    # In the first iteration, index 0, 2 is scheduled.
     # If a request is not scheduled because it hits max lora, it is
     # prioritized. Verify that.
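(Context for the docs/source/features/lora.md hunk above: with the newer `--lora-modules` syntax each module is given as a JSON object, along the lines of `--lora-modules '{"name": "sql-lora", "path": "/path/to/sql-lora", "base_model_name": "meta-llama/Llama-2-7b-hf"}'`. The `name`, `path`, and `base_model_name` keys come from the documentation text itself; the adapter name, path, and base model shown here are illustrative placeholders.)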
diff --git a/tests/entrypoints/openai/test_cli_args.py b/tests/entrypoints/openai/test_cli_args.py
index 2f065ec1070e..e0285b5e5566 100644
--- a/tests/entrypoints/openai/test_cli_args.py
+++ b/tests/entrypoints/openai/test_cli_args.py
@@ -26,7 +26,7 @@ def serve_parser():
     return make_arg_parser(parser)
 
 
-### Tests for Lora module parsing
+### Tests for LoRA module parsing
 def test_valid_key_value_format(serve_parser):
     # Test old format: name=path
     args = serve_parser.parse_args([
diff --git a/tests/entrypoints/openai/test_serving_models.py b/tests/entrypoints/openai/test_serving_models.py
index 55900163eef5..e8f3c2f8b39e 100644
--- a/tests/entrypoints/openai/test_serving_models.py
+++ b/tests/entrypoints/openai/test_serving_models.py
@@ -8,8 +8,8 @@
 from vllm.config import ModelConfig
 from vllm.engine.protocol import EngineClient
 from vllm.entrypoints.openai.protocol import (ErrorResponse,
-                                              LoadLoraAdapterRequest,
-                                              UnloadLoraAdapterRequest)
+                                              LoadLoRAAdapterRequest,
+                                              UnloadLoRAAdapterRequest)
 from vllm.entrypoints.openai.serving_models import (BaseModelPath,
                                                     OpenAIServingModels)
 from vllm.lora.request import LoRARequest
@@ -51,7 +51,7 @@ async def test_serving_model_name():
 @pytest.mark.asyncio
 async def test_load_lora_adapter_success():
     serving_models = await _async_serving_models_init()
-    request = LoadLoraAdapterRequest(lora_name="adapter",
+    request = LoadLoRAAdapterRequest(lora_name="adapter",
                                      lora_path="/path/to/adapter2")
     response = await serving_models.load_lora_adapter(request)
     assert response == LORA_LOADING_SUCCESS_MESSAGE.format(lora_name='adapter')
@@ -62,7 +62,7 @@ async def test_load_lora_adapter_missing_fields():
     serving_models = await _async_serving_models_init()
-    request = LoadLoraAdapterRequest(lora_name="", lora_path="")
+    request = LoadLoRAAdapterRequest(lora_name="", lora_path="")
     response = await serving_models.load_lora_adapter(request)
     assert isinstance(response, ErrorResponse)
     assert response.type == "InvalidUserInput"
@@ -72,14 +72,14 @@ async def test_load_lora_adapter_duplicate():
     serving_models = await _async_serving_models_init()
-    request = LoadLoraAdapterRequest(lora_name="adapter1",
+    request = LoadLoRAAdapterRequest(lora_name="adapter1",
                                      lora_path="/path/to/adapter1")
     response = await serving_models.load_lora_adapter(request)
     assert response == LORA_LOADING_SUCCESS_MESSAGE.format(
         lora_name='adapter1')
     assert len(serving_models.lora_requests) == 1
 
-    request = LoadLoraAdapterRequest(lora_name="adapter1",
+    request = LoadLoRAAdapterRequest(lora_name="adapter1",
                                      lora_path="/path/to/adapter1")
     response = await serving_models.load_lora_adapter(request)
     assert isinstance(response, ErrorResponse)
@@ -91,12 +91,12 @@ async def test_unload_lora_adapter_success():
     serving_models = await _async_serving_models_init()
-    request = LoadLoraAdapterRequest(lora_name="adapter1",
+    request = LoadLoRAAdapterRequest(lora_name="adapter1",
                                      lora_path="/path/to/adapter1")
     response = await serving_models.load_lora_adapter(request)
     assert len(serving_models.lora_requests) == 1
 
-    request = UnloadLoraAdapterRequest(lora_name="adapter1")
+    request = UnloadLoRAAdapterRequest(lora_name="adapter1")
     response = await serving_models.unload_lora_adapter(request)
     assert response == LORA_UNLOADING_SUCCESS_MESSAGE.format(
         lora_name='adapter1')
@@ -106,7 +106,7 @@ async def test_unload_lora_adapter_success():
 @pytest.mark.asyncio
 async def test_unload_lora_adapter_missing_fields():
     serving_models = await _async_serving_models_init()
-    request = UnloadLoraAdapterRequest(lora_name="", lora_int_id=None)
+    request = UnloadLoRAAdapterRequest(lora_name="", lora_int_id=None)
     response = await serving_models.unload_lora_adapter(request)
     assert isinstance(response, ErrorResponse)
     assert response.type == "InvalidUserInput"
@@ -116,7 +116,7 @@ async def test_unload_lora_adapter_missing_fields():
 @pytest.mark.asyncio
 async def test_unload_lora_adapter_not_found():
     serving_models = await _async_serving_models_init()
-    request = UnloadLoraAdapterRequest(lora_name="nonexistent_adapter")
+    request = UnloadLoRAAdapterRequest(lora_name="nonexistent_adapter")
     response = await serving_models.unload_lora_adapter(request)
     assert isinstance(response, ErrorResponse)
     assert response.type == "NotFoundError"
diff --git a/tests/lora/test_layers.py b/tests/lora/test_layers.py
index 0838ca02c9b7..61699e7052c9 100644
--- a/tests/lora/test_layers.py
+++ b/tests/lora/test_layers.py
@@ -14,16 +14,16 @@
 from vllm.lora.fully_sharded_layers import (
     ColumnParallelLinearWithShardedLoRA,
     MergedColumnParallelLinearWithShardedLoRA,
-    MergedQKVParallelLinearWithShardedLora, QKVParallelLinearWithShardedLora,
+    MergedQKVParallelLinearWithShardedLoRA, QKVParallelLinearWithShardedLoRA,
     RowParallelLinearWithShardedLoRA)
 # yapf conflicts with isort for this block
 # yapf: disable
 from vllm.lora.layers import (BaseLayerWithLoRA, ColumnParallelLinearWithLoRA,
-                              LinearScalingRotaryEmbeddingWithLora,
+                              LinearScalingRotaryEmbeddingWithLoRA,
                               LogitsProcessorWithLoRA, LoRAMapping,
                               MergedColumnParallelLinearWithLoRA,
-                              MergedQKVParallelLinearWithLora,
-                              QKVParallelLinearWithLora,
+                              MergedQKVParallelLinearWithLoRA,
+                              QKVParallelLinearWithLoRA,
                               ReplicatedLinearWithLoRA,
                               RowParallelLinearWithLoRA,
                               VocabParallelEmbeddingWithLoRA)
@@ -866,9 +866,9 @@ def create_column_parallel_packed_layer():
                                       bias=False,
                                       params_dtype=torch.float16)
             linear.weight.data = torch.rand_like(linear.weight.data)
-            lora_linear = (MergedQKVParallelLinearWithLora(linear)
+            lora_linear = (MergedQKVParallelLinearWithLoRA(linear)
                            if not fully_shard else
-                           MergedQKVParallelLinearWithShardedLora(linear))
+                           MergedQKVParallelLinearWithShardedLoRA(linear))
         else:
             linear = QKVParallelLinear(4096,
                                        64,
@@ -876,9 +876,9 @@ def create_column_parallel_packed_layer():
                                        bias=False,
                                        params_dtype=torch.float16)
             linear.weight.data = torch.rand_like(linear.weight.data)
-            lora_linear = QKVParallelLinearWithLora(
+            lora_linear = QKVParallelLinearWithLoRA(
                 linear
-            ) if not fully_shard else QKVParallelLinearWithShardedLora(linear)
+            ) if not fully_shard else QKVParallelLinearWithShardedLoRA(linear)
 
     @dataclass
     class FakeConfig:
@@ -1024,7 +1024,7 @@ def test_rotary_embedding_long_context(dist_init, num_loras, device,
         base,
         is_neox_style,
     )
-    lora_rope = LinearScalingRotaryEmbeddingWithLora(rope)
+    lora_rope = LinearScalingRotaryEmbeddingWithLoRA(rope)
     lora_rope.set_mapping(punica_wrapper)
     lora_rope.create_lora_weights(max_loras, lora_config)
     linear_rope = get_rope(head_size, rotary_dim, max_position, base,
diff --git a/tests/lora/test_long_context.py b/tests/lora/test_long_context.py
index 62005de73ddb..0a94298c9f77 100644
--- a/tests/lora/test_long_context.py
+++ b/tests/lora/test_long_context.py
@@ -8,7 +8,7 @@
 import vllm
 from vllm import SamplingParams
-from vllm.lora.layers import LinearScalingRotaryEmbeddingWithLora
+from vllm.lora.layers import LinearScalingRotaryEmbeddingWithLoRA
 from vllm.lora.request import LoRARequest
 from vllm.model_executor.layers.rotary_embedding import (
     LinearScalingRotaryEmbedding)
@@ -151,7 +151,7 @@ def test_rotary_emb_replaced(dist_init):
         if "rotary_emb" in module_name:
             if "base_layer" not in module_name:
                 rotary_emb_count += 1
-                assert isinstance(module, LinearScalingRotaryEmbeddingWithLora)
+                assert isinstance(module, LinearScalingRotaryEmbeddingWithLoRA)
             else:
                 assert isinstance(module, LinearScalingRotaryEmbedding)
     # Llama 2 has 32 layers.
diff --git a/vllm/engine/llm_engine.py b/vllm/engine/llm_engine.py
index 3ce9a0461368..1690017f924c 100644
--- a/vllm/engine/llm_engine.py
+++ b/vllm/engine/llm_engine.py
@@ -1629,7 +1629,7 @@ def _get_stats(self,
             max_tokens_requests: List[int] = []
             finished_reason_requests: List[str] = []
 
-            # Lora requests
+            # LoRA requests
             running_lora_adapters = dict(
                 collectionsCounter([
                     running_request.lora_request.lora_name
diff --git a/vllm/entrypoints/openai/api_server.py b/vllm/entrypoints/openai/api_server.py
index 9995951b3f3d..1b65484c446a 100644
--- a/vllm/entrypoints/openai/api_server.py
+++ b/vllm/entrypoints/openai/api_server.py
@@ -53,7 +53,7 @@
                                               EmbeddingResponse,
                                               EmbeddingResponseData,
                                               ErrorResponse,
-                                              LoadLoraAdapterRequest,
+                                              LoadLoRAAdapterRequest,
                                               PoolingChatRequest,
                                               PoolingCompletionRequest,
                                               PoolingRequest, PoolingResponse,
@@ -63,7 +63,7 @@
                                               TokenizeResponse,
                                               TranscriptionRequest,
                                               TranscriptionResponse,
-                                              UnloadLoraAdapterRequest)
+                                              UnloadLoRAAdapterRequest)
 from vllm.entrypoints.openai.reasoning_parsers import ReasoningParserManager
 # yapf: enable
 from vllm.entrypoints.openai.serving_chat import OpenAIServingChat
@@ -690,12 +690,12 @@ async def stop_profile(raw_request: Request):
 
     if envs.VLLM_ALLOW_RUNTIME_LORA_UPDATING:
         logger.warning(
-            "Lora dynamic loading & unloading is enabled in the API server. "
+            "LoRA dynamic loading & unloading is enabled in the API server. "
             "This should ONLY be used for local development!")
 
     @router.post("/v1/load_lora_adapter",
                  dependencies=[Depends(validate_json_request)])
-    async def load_lora_adapter(request: LoadLoraAdapterRequest,
+    async def load_lora_adapter(request: LoadLoRAAdapterRequest,
                                 raw_request: Request):
         handler = models(raw_request)
         response = await handler.load_lora_adapter(request)
@@ -707,7 +707,7 @@ async def load_lora_adapter(request: LoadLoraAdapterRequest,
 
     @router.post("/v1/unload_lora_adapter",
                  dependencies=[Depends(validate_json_request)])
-    async def unload_lora_adapter(request: UnloadLoraAdapterRequest,
+    async def unload_lora_adapter(request: UnloadLoRAAdapterRequest,
                                   raw_request: Request):
         handler = models(raw_request)
         response = await handler.unload_lora_adapter(request)
diff --git a/vllm/entrypoints/openai/protocol.py b/vllm/entrypoints/openai/protocol.py
index cd2902f934bf..31214211cfc4 100644
--- a/vllm/entrypoints/openai/protocol.py
+++ b/vllm/entrypoints/openai/protocol.py
@@ -1431,12 +1431,12 @@ class DetokenizeResponse(OpenAIBaseModel):
     prompt: str
 
 
-class LoadLoraAdapterRequest(BaseModel):
+class LoadLoRAAdapterRequest(BaseModel):
     lora_name: str
     lora_path: str
 
 
-class UnloadLoraAdapterRequest(BaseModel):
+class UnloadLoRAAdapterRequest(BaseModel):
     lora_name: str
     lora_int_id: Optional[int] = Field(default=None)
diff --git a/vllm/entrypoints/openai/serving_models.py b/vllm/entrypoints/openai/serving_models.py
index 6ade4ece6d03..0f4a174a8c15 100644
--- a/vllm/entrypoints/openai/serving_models.py
+++ b/vllm/entrypoints/openai/serving_models.py
@@ -9,10 +9,10 @@
 from vllm.config import ModelConfig
 from vllm.engine.protocol import EngineClient
 from vllm.entrypoints.openai.protocol import (ErrorResponse,
-                                              LoadLoraAdapterRequest,
+                                              LoadLoRAAdapterRequest,
                                               ModelCard, ModelList,
                                               ModelPermission,
-                                              UnloadLoraAdapterRequest)
+                                              UnloadLoRAAdapterRequest)
 from vllm.logger import init_logger
 from vllm.lora.request import LoRARequest
 from vllm.prompt_adapter.request import PromptAdapterRequest
@@ -88,7 +88,7 @@ async def init_static_loras(self):
         if self.static_lora_modules is None:
             return
         for lora in self.static_lora_modules:
-            load_request = LoadLoraAdapterRequest(lora_path=lora.path,
+            load_request = LoadLoRAAdapterRequest(lora_path=lora.path,
                                                   lora_name=lora.name)
             load_result = await self.load_lora_adapter(
                 request=load_request, base_model_name=lora.base_model_name)
@@ -140,7 +140,7 @@ async def show_available_models(self) -> ModelList:
 
     async def load_lora_adapter(
             self,
-            request: LoadLoraAdapterRequest,
+            request: LoadLoRAAdapterRequest,
             base_model_name: Optional[str] = None
     ) -> Union[ErrorResponse, str]:
         error_check_ret = await self._check_load_lora_adapter_request(request)
@@ -177,7 +177,7 @@ async def load_lora_adapter(
 
     async def unload_lora_adapter(
             self,
-            request: UnloadLoraAdapterRequest) -> Union[ErrorResponse, str]:
+            request: UnloadLoRAAdapterRequest) -> Union[ErrorResponse, str]:
         error_check_ret = await self._check_unload_lora_adapter_request(request
                                                                         )
         if error_check_ret is not None:
@@ -192,7 +192,7 @@ async def unload_lora_adapter(
         return f"Success: LoRA adapter '{lora_name}' removed successfully."
     async def _check_load_lora_adapter_request(
-            self, request: LoadLoraAdapterRequest) -> Optional[ErrorResponse]:
+            self, request: LoadLoRAAdapterRequest) -> Optional[ErrorResponse]:
         # Check if both 'lora_name' and 'lora_path' are provided
         if not request.lora_name or not request.lora_path:
             return create_error_response(
@@ -214,7 +214,7 @@ async def _check_load_lora_adapter_request(
 
     async def _check_unload_lora_adapter_request(
             self,
-            request: UnloadLoraAdapterRequest) -> Optional[ErrorResponse]:
+            request: UnloadLoRAAdapterRequest) -> Optional[ErrorResponse]:
         # Check if either 'lora_name' or 'lora_int_id' is provided
         if not request.lora_name and not request.lora_int_id:
             return create_error_response(
diff --git a/vllm/lora/fully_sharded_layers.py b/vllm/lora/fully_sharded_layers.py
index 3d6620817b4b..41e1ec94145d 100644
--- a/vllm/lora/fully_sharded_layers.py
+++ b/vllm/lora/fully_sharded_layers.py
@@ -13,8 +13,8 @@
 from vllm.distributed.parallel_state import get_tensor_model_parallel_rank
 from vllm.lora.layers import (ColumnParallelLinearWithLoRA,
                               MergedColumnParallelLinearWithLoRA,
-                              MergedQKVParallelLinearWithLora,
-                              QKVParallelLinearWithLora,
+                              MergedQKVParallelLinearWithLoRA,
+                              QKVParallelLinearWithLoRA,
                               RowParallelLinearWithLoRA)
 
 if TYPE_CHECKING:
@@ -167,9 +167,9 @@ def can_replace_layer(
         )
 
 
-class QKVParallelLinearWithShardedLora(QKVParallelLinearWithLora):
+class QKVParallelLinearWithShardedLoRA(QKVParallelLinearWithLoRA):
     """
-    Differs from QKVParallelLinearWithLora by slicing the
+    Differs from QKVParallelLinearWithLoRA by slicing the
     LoRA A's also.
 
     Based on S-LoRA, slicing happens along the rank dim.
@@ -202,9 +202,9 @@ def can_replace_layer(cls, source_layer: nn.Module,
         )
 
 
-class MergedQKVParallelLinearWithShardedLora(MergedQKVParallelLinearWithLora):
+class MergedQKVParallelLinearWithShardedLoRA(MergedQKVParallelLinearWithLoRA):
     """
-    Differs from MergedQKVParallelLinearWithLora by slicing the
+    Differs from MergedQKVParallelLinearWithLoRA by slicing the
     LoRA A's also.
 
     Based on S-LoRA, slicing happens along the rank dim.
diff --git a/vllm/lora/layers.py b/vllm/lora/layers.py
index 7f68dae9717c..6c48173c201b 100644
--- a/vllm/lora/layers.py
+++ b/vllm/lora/layers.py
@@ -363,7 +363,7 @@ def set_lora(
         embeddings_tensor: Optional[torch.Tensor],
         lora_bias: Optional[torch.Tensor] = None,
     ):
-        # Except for QKVParallelLinearWithLora and
+        # Except for QKVParallelLinearWithLoRA and
         # MergedColumnParallelLinearWithLoRA, all other linear LoRA layers
         # store weights in a tuple of size 1. These two layers will
         # override this function.
@@ -686,7 +686,7 @@ def can_replace_layer(
                 and len(packed_modules_list) == 2)
 
 
-class QKVParallelLinearWithLora(ColumnParallelLinearWithLoRA):
+class QKVParallelLinearWithLoRA(ColumnParallelLinearWithLoRA):
     """
     ColumnParallelLinear layer that is specifically designed for
     qkv_proj. Certain models, such as chatglm3 and baichuan-7b,
@@ -754,7 +754,7 @@ def can_replace_layer(cls, source_layer: nn.Module,
                    packed_modules_list) == 1
 
 
-class MergedQKVParallelLinearWithLora(MergedColumnParallelLinearWithLoRA):
+class MergedQKVParallelLinearWithLoRA(MergedColumnParallelLinearWithLoRA):
     """MergedColumnParallelLinear layer that is composed of 3 sublayers (slices)
     packed together in qkv proj fashion
     (q_proj + k_proj + v_proj -> qkv_proj).
@@ -1120,7 +1120,7 @@ def can_replace_layer(
         return False
 
 
-class LinearScalingRotaryEmbeddingWithLora(BaseLayerWithLoRA):
+class LinearScalingRotaryEmbeddingWithLoRA(BaseLayerWithLoRA):
     """Implements RoPE-scaled embeddings with linear scaling for
     multiple LoRA adapters with a specialized kernel.
diff --git a/vllm/lora/models.py b/vllm/lora/models.py
index 774c3876e774..e1294884ac2a 100644
--- a/vllm/lora/models.py
+++ b/vllm/lora/models.py
@@ -20,7 +20,7 @@
 from vllm.config import LoRAConfig
 from vllm.logger import init_logger
 from vllm.lora.layers import (BaseLayerWithLoRA,
-                              LinearScalingRotaryEmbeddingWithLora,
+                              LinearScalingRotaryEmbeddingWithLoRA,
                               LoRAMapping)
 from vllm.lora.lora import LoRALayerWeights, PackedLoRALayerWeights
 from vllm.lora.peft_helper import PEFTHelper
@@ -201,7 +201,7 @@ def from_local_checkpoint(
             expected_lora_modules: Name of modules that are expected to be
                 replaced by lora.
             peft_helper: Loaded lora configuration information.
-            lora_model_id: Lora model id. If not given, automatically set by
+            lora_model_id: LoRA model id. If not given, automatically set by
                 a global counter.
            device: Device where the lora model is loaded.
             dtype: dtype of the lora model weights.
@@ -480,9 +480,9 @@ def _create_lora_modules(self):
                     from_layer(module, self.lora_slots, self.lora_config,
                                packed_moduled_lst, self.model.config))
 
-            # LinearScalingRotaryEmbeddingWithLora is used to handle
+            # LinearScalingRotaryEmbeddingWithLoRA is used to handle
             # long context lora. Register relevant metadata.
-            if isinstance(new_module, LinearScalingRotaryEmbeddingWithLora):
+            if isinstance(new_module, LinearScalingRotaryEmbeddingWithLoRA):
                 self.long_lora_context = LongContextLoRAContext(
                     new_module.scaling_factors, new_module.rotary_dim)
                 self.scaling_factor_to_offset = \
@@ -527,7 +527,7 @@ def create_dummy_lora(
             bias_enabled = self.lora_config.bias_enabled
             if (not self._match_target_modules(module_name)
                     or not isinstance(module, BaseLayerWithLoRA)
-                    or isinstance(module, LinearScalingRotaryEmbeddingWithLora)
+                    or isinstance(module, LinearScalingRotaryEmbeddingWithLoRA)
                     or self._filter_unsupported_mm_module(module_name)):
                 continue
             parts = module_name.split(".")
diff --git a/vllm/lora/peft_helper.py b/vllm/lora/peft_helper.py
index 9496ab5a75c0..f6944368b36e 100644
--- a/vllm/lora/peft_helper.py
+++ b/vllm/lora/peft_helper.py
@@ -42,7 +42,7 @@ class PEFTHelper:
 
     def _validate_features(self) -> List[str]:
         """
-        Check if there are any unsupported Lora features.
+        Check if there are any unsupported LoRA features.
         """
         error_msg = []
 
         if self.modules_to_save:
diff --git a/vllm/lora/punica_wrapper/punica_base.py b/vllm/lora/punica_wrapper/punica_base.py
index dad98f8e2122..94fa3f27ab60 100644
--- a/vllm/lora/punica_wrapper/punica_base.py
+++ b/vllm/lora/punica_wrapper/punica_base.py
@@ -314,7 +314,7 @@ def embeddings_indices(self) -> torch.Tensor:
     def long_lora_indices(self) -> torch.Tensor:
         """
         This property provides access to the indices used for long context
-        lora, specifically for LinearScalingRotaryEmbeddingWithLora.
+        lora, specifically for LinearScalingRotaryEmbeddingWithLoRA.
""" long_lora_len = self.indices_len[4] return self._long_lora_indices[:long_lora_len] diff --git a/vllm/lora/utils.py b/vllm/lora/utils.py index 361dac5b3313..63b465fdf743 100644 --- a/vllm/lora/utils.py +++ b/vllm/lora/utils.py @@ -15,17 +15,17 @@ from vllm.lora.fully_sharded_layers import ( ColumnParallelLinearWithShardedLoRA, MergedColumnParallelLinearWithShardedLoRA, - MergedQKVParallelLinearWithShardedLora, QKVParallelLinearWithShardedLora, + MergedQKVParallelLinearWithShardedLoRA, QKVParallelLinearWithShardedLoRA, RowParallelLinearWithShardedLoRA) # being imported for _all_lora_classes below # yapf conflicts with isort for this block # yapf: disable from vllm.lora.layers import (BaseLayerWithLoRA, ColumnParallelLinearWithLoRA, - LinearScalingRotaryEmbeddingWithLora, + LinearScalingRotaryEmbeddingWithLoRA, LogitsProcessorWithLoRA, MergedColumnParallelLinearWithLoRA, - MergedQKVParallelLinearWithLora, - QKVParallelLinearWithLora, + MergedQKVParallelLinearWithLoRA, + QKVParallelLinearWithLoRA, ReplicatedLinearWithLoRA, RowParallelLinearWithLoRA, VocabParallelEmbeddingWithLoRA) @@ -41,17 +41,17 @@ VocabParallelEmbeddingWithLoRA, ColumnParallelLinearWithLoRA, MergedColumnParallelLinearWithLoRA, - QKVParallelLinearWithLora, - MergedQKVParallelLinearWithLora, + QKVParallelLinearWithLoRA, + MergedQKVParallelLinearWithLoRA, RowParallelLinearWithLoRA, ReplicatedLinearWithLoRA, LogitsProcessorWithLoRA, ColumnParallelLinearWithShardedLoRA, - QKVParallelLinearWithShardedLora, + QKVParallelLinearWithShardedLoRA, MergedColumnParallelLinearWithShardedLoRA, - MergedQKVParallelLinearWithShardedLora, + MergedQKVParallelLinearWithShardedLoRA, RowParallelLinearWithShardedLoRA, - LinearScalingRotaryEmbeddingWithLora, + LinearScalingRotaryEmbeddingWithLoRA, } diff --git a/vllm/spec_decode/proposer_worker_base.py b/vllm/spec_decode/proposer_worker_base.py index 2bebf80fadae..2829d631b49e 100644 --- a/vllm/spec_decode/proposer_worker_base.py +++ b/vllm/spec_decode/proposer_worker_base.py @@ -6,10 +6,10 @@ from vllm.model_executor.layers.sampler import SamplerOutput from vllm.sequence import ExecuteModelRequest from vllm.spec_decode.interfaces import SpeculativeProposer -from vllm.worker.worker_base import LoraNotSupportedWorkerBase +from vllm.worker.worker_base import LoRANotSupportedWorkerBase -class ProposerWorkerBase(LoraNotSupportedWorkerBase, SpeculativeProposer): +class ProposerWorkerBase(LoRANotSupportedWorkerBase, SpeculativeProposer): """Interface for proposer workers""" @abstractmethod diff --git a/vllm/spec_decode/spec_decode_worker.py b/vllm/spec_decode/spec_decode_worker.py index 8af71842224b..871a3aee6306 100644 --- a/vllm/spec_decode/spec_decode_worker.py +++ b/vllm/spec_decode/spec_decode_worker.py @@ -47,7 +47,7 @@ get_sampled_token_logprobs, nvtx_range, split_batch_by_proposal_len) from vllm.utils import resolve_obj_by_qualname -from vllm.worker.worker_base import LoraNotSupportedWorkerBase, WorkerBase +from vllm.worker.worker_base import LoRANotSupportedWorkerBase, WorkerBase logger = init_logger(__name__) @@ -118,7 +118,7 @@ def create_spec_worker(*args, **kwargs) -> "SpecDecodeWorker": # Reminder: Please update docs/source/features/compatibility_matrix.md # If the feature combo become valid -class SpecDecodeWorker(LoraNotSupportedWorkerBase): +class SpecDecodeWorker(LoRANotSupportedWorkerBase): """Worker which implements speculative decoding. 
     Speculative decoding reduces decoding per-token latency by using a proposal
diff --git a/vllm/transformers_utils/configs/arctic.py b/vllm/transformers_utils/configs/arctic.py
index 6625ccf0f2a8..5ab70c0e4136 100644
--- a/vllm/transformers_utils/configs/arctic.py
+++ b/vllm/transformers_utils/configs/arctic.py
@@ -21,7 +21,7 @@
 
 @dataclass
-class ArcticLoraConfig:
+class ArcticLoRAConfig:
     lora_r: int = 64
     lora_alpha: float = 16
     shard_base_weights: bool = False
diff --git a/vllm/worker/neuron_worker.py b/vllm/worker/neuron_worker.py
index 95e7acd025f0..df651e05a7bb 100644
--- a/vllm/worker/neuron_worker.py
+++ b/vllm/worker/neuron_worker.py
@@ -13,11 +13,11 @@
 from vllm.sequence import ExecuteModelRequest
 from vllm.worker.neuron_model_runner import NeuronModelRunner
 from vllm.worker.worker_base import (LocalOrDistributedWorkerBase,
-                                     LoraNotSupportedWorkerBase, WorkerBase,
+                                     LoRANotSupportedWorkerBase, WorkerBase,
                                      WorkerInput)
 
 
-class NeuronWorker(LoraNotSupportedWorkerBase, LocalOrDistributedWorkerBase):
+class NeuronWorker(LoRANotSupportedWorkerBase, LocalOrDistributedWorkerBase):
     """A worker class that executes the model on a group of neuron cores.
     """
diff --git a/vllm/worker/openvino_worker.py b/vllm/worker/openvino_worker.py
index 1ad66e6f3be7..fad91270ea2a 100644
--- a/vllm/worker/openvino_worker.py
+++ b/vllm/worker/openvino_worker.py
@@ -24,7 +24,7 @@
 from vllm.sequence import ExecuteModelRequest, SequenceGroupMetadata
 from vllm.utils import bind_kv_cache
 from vllm.worker.openvino_model_runner import OpenVINOModelRunner
-from vllm.worker.worker_base import LoraNotSupportedWorkerBase, WorkerBase
+from vllm.worker.worker_base import LoRANotSupportedWorkerBase, WorkerBase
 
 logger = init_logger(__name__)
 
@@ -203,7 +203,7 @@ def get_cache_block_size(
         return dtype_size * total
 
 
-class OpenVINOWorker(LoraNotSupportedWorkerBase):
+class OpenVINOWorker(LoRANotSupportedWorkerBase):
     """A worker class that executes the model on OpenVINO backend.
 
     Each worker is associated with a single OpenVINO device. The worker is
diff --git a/vllm/worker/tpu_worker.py b/vllm/worker/tpu_worker.py
index 12f10169f2db..7903e81943c2 100644
--- a/vllm/worker/tpu_worker.py
+++ b/vllm/worker/tpu_worker.py
@@ -17,13 +17,13 @@
 from vllm.utils import STR_DTYPE_TO_TORCH_DTYPE, bind_kv_cache, get_dtype_size
 from vllm.worker.tpu_model_runner import ExecutionMode, TPUModelRunner
 from vllm.worker.worker_base import (LocalOrDistributedWorkerBase,
-                                     LoraNotSupportedWorkerBase, WorkerBase,
+                                     LoRANotSupportedWorkerBase, WorkerBase,
                                      WorkerInput)
 
 logger = init_logger(__name__)
 
 
-class TPUWorker(LoraNotSupportedWorkerBase, LocalOrDistributedWorkerBase):
+class TPUWorker(LoRANotSupportedWorkerBase, LocalOrDistributedWorkerBase):
 
     def __init__(
         self,
diff --git a/vllm/worker/worker_base.py b/vllm/worker/worker_base.py
index 445c0d3285bf..7cc1562a5bce 100644
--- a/vllm/worker/worker_base.py
+++ b/vllm/worker/worker_base.py
@@ -189,7 +189,7 @@ def __getattr__(self, attr):
         return getattr(self.worker, attr)
 
 
-class LoraNotSupportedWorkerBase(WorkerBase):
+class LoRANotSupportedWorkerBase(WorkerBase):
     """Partial implementation of WorkerBase that raises exceptions when LoRA
     methods are invoked.
""" diff --git a/vllm/worker/xpu_worker.py b/vllm/worker/xpu_worker.py index 047c0bbbc355..3aea0d7419d0 100644 --- a/vllm/worker/xpu_worker.py +++ b/vllm/worker/xpu_worker.py @@ -18,13 +18,13 @@ from vllm.platforms import current_platform from vllm.worker.cache_engine import CacheEngine from vllm.worker.worker import Worker -from vllm.worker.worker_base import LoraNotSupportedWorkerBase, WorkerBase +from vllm.worker.worker_base import LoRANotSupportedWorkerBase, WorkerBase from vllm.worker.xpu_model_runner import XPUModelRunner logger = init_logger(__name__) -class XPUWorker(LoraNotSupportedWorkerBase, Worker): +class XPUWorker(LoRANotSupportedWorkerBase, Worker): """A worker class that executes (a partition of) the model on a GPU. Each worker is associated with a single XPU device. The worker is