2 changes: 1 addition & 1 deletion benchmarks/kernels/benchmark_lora.py
@@ -89,7 +89,7 @@ def make_prompt_lora_mapping(num_prompts: int, num_active_loras: int,
sort_by_lora_id: bool,
device: str) -> torch.Tensor:
"""
All prompts are mapped to a Lora ID in range [0, num_active_loras).
All prompts are mapped to a LoRA ID in range [0, num_active_loras).
where 0 refers to first lora, 1 refers to second lora and so on.
"""
assert num_active_loras > 0
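The docstring above describes a prompt-to-adapter mapping for the LoRA benchmark. A minimal sketch of what such a helper could look like (the round-robin assignment and the sort step are illustrative assumptions, not necessarily the benchmark's actual logic):

```python
import torch

def make_prompt_lora_mapping_sketch(num_prompts: int, num_active_loras: int,
                                    sort_by_lora_id: bool,
                                    device: str) -> torch.Tensor:
    # Assign each prompt a LoRA ID in [0, num_active_loras), round-robin.
    assert num_active_loras > 0
    lora_ids = torch.arange(num_prompts, dtype=torch.long) % num_active_loras
    if sort_by_lora_id:
        # Grouping prompts by LoRA ID keeps requests for the same adapter
        # contiguous, a friendlier layout for batched LoRA kernels.
        lora_ids, _ = torch.sort(lora_ids)
    return lora_ids.to(device=device)
```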
2 changes: 1 addition & 1 deletion docs/source/features/lora.md
@@ -170,7 +170,7 @@ Now, you can specify a base_model_name alongside the name and path using JSON fo

To provide the backward compatibility support, you can still use the old key-value format (name=path), but the `base_model_name` will remain unspecified in that case.

## Lora model lineage in model card
## LoRA model lineage in model card

The new format of `--lora-modules` is mainly to support the display of parent model information in the model card. Here's an explanation of how your current response supports this:

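The docs change above keeps both `--lora-modules` forms. A minimal Python sketch of how the two forms could be assembled; the adapter name, path, and base model below are placeholder values, not ones taken from the docs:

```python
import json

# Placeholder adapter metadata.
adapter = {
    "name": "sql-lora",
    "path": "/path/to/sql-lora",
    "base_model_name": "meta-llama/Llama-2-7b-hf",
}

# New JSON form: base_model_name travels with the adapter, so the model card
# can display the parent model.
new_style_arg = json.dumps(adapter)

# Old key-value form: still accepted for backward compatibility, but
# base_model_name stays unspecified.
old_style_arg = f"{adapter['name']}={adapter['path']}"

print("--lora-modules", new_style_arg)
print("--lora-modules", old_style_arg)
```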
2 changes: 1 addition & 1 deletion tests/core/test_scheduler.py
@@ -491,7 +491,7 @@ def test_prefill_schedule_max_lora():
lora_path="abc"))
scheduler.add_seq_group(seq_group)
# Add two more requests to verify lora is prioritized.
# 0: Lora, 1: Lora, 2: regular, 3: regular
# 0: LoRA, 1: LoRA, 2: regular, 3: regular
# In the first iteration, index 0, 2 is scheduled.
# If a request is not scheduled because it hits max lora, it is
# prioritized. Verify that.
2 changes: 1 addition & 1 deletion tests/entrypoints/openai/test_cli_args.py
@@ -26,7 +26,7 @@ def serve_parser():
return make_arg_parser(parser)


### Tests for Lora module parsing
### Tests for LoRA module parsing
def test_valid_key_value_format(serve_parser):
# Test old format: name=path
args = serve_parser.parse_args([
20 changes: 10 additions & 10 deletions tests/entrypoints/openai/test_serving_models.py
@@ -8,8 +8,8 @@
from vllm.config import ModelConfig
from vllm.engine.protocol import EngineClient
from vllm.entrypoints.openai.protocol import (ErrorResponse,
LoadLoraAdapterRequest,
UnloadLoraAdapterRequest)
LoadLoRAAdapterRequest,
UnloadLoRAAdapterRequest)
from vllm.entrypoints.openai.serving_models import (BaseModelPath,
OpenAIServingModels)
from vllm.lora.request import LoRARequest
@@ -51,7 +51,7 @@ async def test_serving_model_name():
@pytest.mark.asyncio
async def test_load_lora_adapter_success():
serving_models = await _async_serving_models_init()
request = LoadLoraAdapterRequest(lora_name="adapter",
request = LoadLoRAAdapterRequest(lora_name="adapter",
lora_path="/path/to/adapter2")
response = await serving_models.load_lora_adapter(request)
assert response == LORA_LOADING_SUCCESS_MESSAGE.format(lora_name='adapter')
@@ -62,7 +62,7 @@ async def test_load_lora_adapter_success():
@pytest.mark.asyncio
async def test_load_lora_adapter_missing_fields():
serving_models = await _async_serving_models_init()
request = LoadLoraAdapterRequest(lora_name="", lora_path="")
request = LoadLoRAAdapterRequest(lora_name="", lora_path="")
response = await serving_models.load_lora_adapter(request)
assert isinstance(response, ErrorResponse)
assert response.type == "InvalidUserInput"
@@ -72,14 +72,14 @@ async def test_load_lora_adapter_missing_fields():
@pytest.mark.asyncio
async def test_load_lora_adapter_duplicate():
serving_models = await _async_serving_models_init()
request = LoadLoraAdapterRequest(lora_name="adapter1",
request = LoadLoRAAdapterRequest(lora_name="adapter1",
lora_path="/path/to/adapter1")
response = await serving_models.load_lora_adapter(request)
assert response == LORA_LOADING_SUCCESS_MESSAGE.format(
lora_name='adapter1')
assert len(serving_models.lora_requests) == 1

request = LoadLoraAdapterRequest(lora_name="adapter1",
request = LoadLoRAAdapterRequest(lora_name="adapter1",
lora_path="/path/to/adapter1")
response = await serving_models.load_lora_adapter(request)
assert isinstance(response, ErrorResponse)
@@ -91,12 +91,12 @@ async def test_load_lora_adapter_duplicate():
@pytest.mark.asyncio
async def test_unload_lora_adapter_success():
serving_models = await _async_serving_models_init()
request = LoadLoraAdapterRequest(lora_name="adapter1",
request = LoadLoRAAdapterRequest(lora_name="adapter1",
lora_path="/path/to/adapter1")
response = await serving_models.load_lora_adapter(request)
assert len(serving_models.lora_requests) == 1

request = UnloadLoraAdapterRequest(lora_name="adapter1")
request = UnloadLoRAAdapterRequest(lora_name="adapter1")
response = await serving_models.unload_lora_adapter(request)
assert response == LORA_UNLOADING_SUCCESS_MESSAGE.format(
lora_name='adapter1')
@@ -106,7 +106,7 @@ async def test_unload_lora_adapter_success():
@pytest.mark.asyncio
async def test_unload_lora_adapter_missing_fields():
serving_models = await _async_serving_models_init()
request = UnloadLoraAdapterRequest(lora_name="", lora_int_id=None)
request = UnloadLoRAAdapterRequest(lora_name="", lora_int_id=None)
response = await serving_models.unload_lora_adapter(request)
assert isinstance(response, ErrorResponse)
assert response.type == "InvalidUserInput"
@@ -116,7 +116,7 @@ async def test_unload_lora_adapter_missing_fields():
@pytest.mark.asyncio
async def test_unload_lora_adapter_not_found():
serving_models = await _async_serving_models_init()
request = UnloadLoraAdapterRequest(lora_name="nonexistent_adapter")
request = UnloadLoRAAdapterRequest(lora_name="nonexistent_adapter")
response = await serving_models.unload_lora_adapter(request)
assert isinstance(response, ErrorResponse)
assert response.type == "NotFoundError"
18 changes: 9 additions & 9 deletions tests/lora/test_layers.py
@@ -14,16 +14,16 @@
from vllm.lora.fully_sharded_layers import (
ColumnParallelLinearWithShardedLoRA,
MergedColumnParallelLinearWithShardedLoRA,
MergedQKVParallelLinearWithShardedLora, QKVParallelLinearWithShardedLora,
MergedQKVParallelLinearWithShardedLoRA, QKVParallelLinearWithShardedLoRA,
RowParallelLinearWithShardedLoRA)
# yapf conflicts with isort for this block
# yapf: disable
from vllm.lora.layers import (BaseLayerWithLoRA, ColumnParallelLinearWithLoRA,
LinearScalingRotaryEmbeddingWithLora,
LinearScalingRotaryEmbeddingWithLoRA,
LogitsProcessorWithLoRA, LoRAMapping,
MergedColumnParallelLinearWithLoRA,
MergedQKVParallelLinearWithLora,
QKVParallelLinearWithLora,
MergedQKVParallelLinearWithLoRA,
QKVParallelLinearWithLoRA,
ReplicatedLinearWithLoRA,
RowParallelLinearWithLoRA,
VocabParallelEmbeddingWithLoRA)
@@ -866,19 +866,19 @@ def create_column_parallel_packed_layer():
bias=False,
params_dtype=torch.float16)
linear.weight.data = torch.rand_like(linear.weight.data)
lora_linear = (MergedQKVParallelLinearWithLora(linear)
lora_linear = (MergedQKVParallelLinearWithLoRA(linear)
if not fully_shard else
MergedQKVParallelLinearWithShardedLora(linear))
MergedQKVParallelLinearWithShardedLoRA(linear))
else:
linear = QKVParallelLinear(4096,
64,
32,
bias=False,
params_dtype=torch.float16)
linear.weight.data = torch.rand_like(linear.weight.data)
lora_linear = QKVParallelLinearWithLora(
lora_linear = QKVParallelLinearWithLoRA(
linear
) if not fully_shard else QKVParallelLinearWithShardedLora(linear)
) if not fully_shard else QKVParallelLinearWithShardedLoRA(linear)

@dataclass
class FakeConfig:
@@ -1024,7 +1024,7 @@ def test_rotary_embedding_long_context(dist_init, num_loras, device,
base,
is_neox_style,
)
lora_rope = LinearScalingRotaryEmbeddingWithLora(rope)
lora_rope = LinearScalingRotaryEmbeddingWithLoRA(rope)
lora_rope.set_mapping(punica_wrapper)
lora_rope.create_lora_weights(max_loras, lora_config)
linear_rope = get_rope(head_size, rotary_dim, max_position, base,
4 changes: 2 additions & 2 deletions tests/lora/test_long_context.py
@@ -8,7 +8,7 @@

import vllm
from vllm import SamplingParams
from vllm.lora.layers import LinearScalingRotaryEmbeddingWithLora
from vllm.lora.layers import LinearScalingRotaryEmbeddingWithLoRA
from vllm.lora.request import LoRARequest
from vllm.model_executor.layers.rotary_embedding import (
LinearScalingRotaryEmbedding)
@@ -151,7 +151,7 @@ def test_rotary_emb_replaced(dist_init):
if "rotary_emb" in module_name:
if "base_layer" not in module_name:
rotary_emb_count += 1
assert isinstance(module, LinearScalingRotaryEmbeddingWithLora)
assert isinstance(module, LinearScalingRotaryEmbeddingWithLoRA)
else:
assert isinstance(module, LinearScalingRotaryEmbedding)
# Llama 2 has 32 layers.
2 changes: 1 addition & 1 deletion vllm/engine/llm_engine.py
@@ -1629,7 +1629,7 @@ def _get_stats(self,
max_tokens_requests: List[int] = []
finished_reason_requests: List[str] = []

# Lora requests
# LoRA requests
running_lora_adapters = dict(
collectionsCounter([
running_request.lora_request.lora_name
10 changes: 5 additions & 5 deletions vllm/entrypoints/openai/api_server.py
@@ -53,7 +53,7 @@
EmbeddingResponse,
EmbeddingResponseData,
ErrorResponse,
LoadLoraAdapterRequest,
LoadLoRAAdapterRequest,
PoolingChatRequest,
PoolingCompletionRequest,
PoolingRequest, PoolingResponse,
@@ -63,7 +63,7 @@
TokenizeResponse,
TranscriptionRequest,
TranscriptionResponse,
UnloadLoraAdapterRequest)
UnloadLoRAAdapterRequest)
from vllm.entrypoints.openai.reasoning_parsers import ReasoningParserManager
# yapf: enable
from vllm.entrypoints.openai.serving_chat import OpenAIServingChat
@@ -690,12 +690,12 @@ async def stop_profile(raw_request: Request):

if envs.VLLM_ALLOW_RUNTIME_LORA_UPDATING:
logger.warning(
"Lora dynamic loading & unloading is enabled in the API server. "
"LoRA dynamic loading & unloading is enabled in the API server. "
"This should ONLY be used for local development!")

@router.post("/v1/load_lora_adapter",
dependencies=[Depends(validate_json_request)])
async def load_lora_adapter(request: LoadLoraAdapterRequest,
async def load_lora_adapter(request: LoadLoRAAdapterRequest,
raw_request: Request):
handler = models(raw_request)
response = await handler.load_lora_adapter(request)
@@ -707,7 +707,7 @@ async def load_lora_adapter(request: LoadLoraAdapterRequest,

@router.post("/v1/unload_lora_adapter",
dependencies=[Depends(validate_json_request)])
async def unload_lora_adapter(request: UnloadLoraAdapterRequest,
async def unload_lora_adapter(request: UnloadLoRAAdapterRequest,
raw_request: Request):
handler = models(raw_request)
response = await handler.unload_lora_adapter(request)
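Since the two endpoints above only change in their request type names, their wire format stays the same. A minimal client sketch, assuming a local dev server on localhost:8000 started with `VLLM_ALLOW_RUNTIME_LORA_UPDATING` enabled and a placeholder adapter name and path:

```python
import requests

BASE_URL = "http://localhost:8000"  # assumed local dev server

# Load an adapter by name and path (fields from LoadLoRAAdapterRequest).
load_resp = requests.post(
    f"{BASE_URL}/v1/load_lora_adapter",
    json={"lora_name": "adapter1", "lora_path": "/path/to/adapter1"},
)
print(load_resp.status_code, load_resp.text)

# Unload it again by name (field from UnloadLoRAAdapterRequest).
unload_resp = requests.post(
    f"{BASE_URL}/v1/unload_lora_adapter",
    json={"lora_name": "adapter1"},
)
print(unload_resp.status_code, unload_resp.text)
```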
4 changes: 2 additions & 2 deletions vllm/entrypoints/openai/protocol.py
@@ -1431,12 +1431,12 @@ class DetokenizeResponse(OpenAIBaseModel):
prompt: str


class LoadLoraAdapterRequest(BaseModel):
class LoadLoRAAdapterRequest(BaseModel):
lora_name: str
lora_path: str


class UnloadLoraAdapterRequest(BaseModel):
class UnloadLoRAAdapterRequest(BaseModel):
lora_name: str
lora_int_id: Optional[int] = Field(default=None)

14 changes: 7 additions & 7 deletions vllm/entrypoints/openai/serving_models.py
@@ -9,10 +9,10 @@
from vllm.config import ModelConfig
from vllm.engine.protocol import EngineClient
from vllm.entrypoints.openai.protocol import (ErrorResponse,
LoadLoraAdapterRequest,
LoadLoRAAdapterRequest,
ModelCard, ModelList,
ModelPermission,
UnloadLoraAdapterRequest)
UnloadLoRAAdapterRequest)
from vllm.logger import init_logger
from vllm.lora.request import LoRARequest
from vllm.prompt_adapter.request import PromptAdapterRequest
@@ -88,7 +88,7 @@ async def init_static_loras(self):
if self.static_lora_modules is None:
return
for lora in self.static_lora_modules:
load_request = LoadLoraAdapterRequest(lora_path=lora.path,
load_request = LoadLoRAAdapterRequest(lora_path=lora.path,
lora_name=lora.name)
load_result = await self.load_lora_adapter(
request=load_request, base_model_name=lora.base_model_name)
@@ -140,7 +140,7 @@ async def show_available_models(self) -> ModelList:

async def load_lora_adapter(
self,
request: LoadLoraAdapterRequest,
request: LoadLoRAAdapterRequest,
base_model_name: Optional[str] = None
) -> Union[ErrorResponse, str]:
error_check_ret = await self._check_load_lora_adapter_request(request)
@@ -177,7 +177,7 @@ async def load_lora_adapter(

async def unload_lora_adapter(
self,
request: UnloadLoraAdapterRequest) -> Union[ErrorResponse, str]:
request: UnloadLoRAAdapterRequest) -> Union[ErrorResponse, str]:
error_check_ret = await self._check_unload_lora_adapter_request(request
)
if error_check_ret is not None:
@@ -192,7 +192,7 @@ async def unload_lora_adapter(
return f"Success: LoRA adapter '{lora_name}' removed successfully."

async def _check_load_lora_adapter_request(
self, request: LoadLoraAdapterRequest) -> Optional[ErrorResponse]:
self, request: LoadLoRAAdapterRequest) -> Optional[ErrorResponse]:
# Check if both 'lora_name' and 'lora_path' are provided
if not request.lora_name or not request.lora_path:
return create_error_response(
@@ -214,7 +214,7 @@ async def _check_load_lora_adapter_request(

async def _check_unload_lora_adapter_request(
self,
request: UnloadLoraAdapterRequest) -> Optional[ErrorResponse]:
request: UnloadLoRAAdapterRequest) -> Optional[ErrorResponse]:
# Check if either 'lora_name' or 'lora_int_id' is provided
if not request.lora_name and not request.lora_int_id:
return create_error_response(
12 changes: 6 additions & 6 deletions vllm/lora/fully_sharded_layers.py
@@ -13,8 +13,8 @@
from vllm.distributed.parallel_state import get_tensor_model_parallel_rank
from vllm.lora.layers import (ColumnParallelLinearWithLoRA,
MergedColumnParallelLinearWithLoRA,
MergedQKVParallelLinearWithLora,
QKVParallelLinearWithLora,
MergedQKVParallelLinearWithLoRA,
QKVParallelLinearWithLoRA,
RowParallelLinearWithLoRA)

if TYPE_CHECKING:
@@ -167,9 +167,9 @@ def can_replace_layer(
)


class QKVParallelLinearWithShardedLora(QKVParallelLinearWithLora):
class QKVParallelLinearWithShardedLoRA(QKVParallelLinearWithLoRA):
"""
Differs from QKVParallelLinearWithLora by slicing the
Differs from QKVParallelLinearWithLoRA by slicing the
LoRA A's also.

Based on S-LoRA, slicing happens along the rank dim.
@@ -202,9 +202,9 @@ def can_replace_layer(cls, source_layer: nn.Module,
)


class MergedQKVParallelLinearWithShardedLora(MergedQKVParallelLinearWithLora):
class MergedQKVParallelLinearWithShardedLoRA(MergedQKVParallelLinearWithLoRA):
"""
Differs from MergedQKVParallelLinearWithLora by slicing the
Differs from MergedQKVParallelLinearWithLoRA by slicing the
LoRA A's also.

Based on S-LoRA, slicing happens along the rank dim.
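The docstrings above describe the fully sharded variants as slicing the LoRA A matrices along the rank dimension as well (S-LoRA style). A toy sketch of that idea; the (rank, hidden) layout and the even split are assumptions for illustration, not vLLM's internal layout:

```python
import torch

def shard_lora_a_along_rank(lora_a: torch.Tensor, tp_rank: int,
                            tp_size: int) -> torch.Tensor:
    # Each tensor-parallel rank keeps only its slice of LoRA A, so A is
    # sharded rather than replicated across ranks.
    num_rows = lora_a.shape[0]  # assumed shape: (lora_rank, hidden_in)
    assert num_rows % tp_size == 0
    shard = num_rows // tp_size
    return lora_a[tp_rank * shard:(tp_rank + 1) * shard]

# Example: a rank-16 adapter split across 4 ranks leaves 4 rows per shard.
lora_a = torch.randn(16, 4096)
print(shard_lora_a_along_rank(lora_a, tp_rank=0, tp_size=4).shape)  # [4, 4096]
```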
8 changes: 4 additions & 4 deletions vllm/lora/layers.py
@@ -363,7 +363,7 @@ def set_lora(
embeddings_tensor: Optional[torch.Tensor],
lora_bias: Optional[torch.Tensor] = None,
):
# Except for QKVParallelLinearWithLora and
# Except for QKVParallelLinearWithLoRA and
# MergedColumnParallelLinearWithLoRA, all other linear LoRA layers
# store weights in a tuple of size 1. These two layers will
# override this function.
@@ -686,7 +686,7 @@ def can_replace_layer(
and len(packed_modules_list) == 2)


class QKVParallelLinearWithLora(ColumnParallelLinearWithLoRA):
class QKVParallelLinearWithLoRA(ColumnParallelLinearWithLoRA):
"""
ColumnParallelLinear layer that is specifically designed for
qkv_proj. Certain models, such as chatglm3 and baichuan-7b,
@@ -754,7 +754,7 @@ def can_replace_layer(cls, source_layer: nn.Module,
packed_modules_list) == 1


class MergedQKVParallelLinearWithLora(MergedColumnParallelLinearWithLoRA):
class MergedQKVParallelLinearWithLoRA(MergedColumnParallelLinearWithLoRA):
"""MergedColumnParallelLinear layer that is composed of 3 sublayers (slices)
packed together in qkv proj fashion
(q_proj + k_proj + v_proj -> qkv_proj).
@@ -1120,7 +1120,7 @@ def can_replace_layer(
return False


class LinearScalingRotaryEmbeddingWithLora(BaseLayerWithLoRA):
class LinearScalingRotaryEmbeddingWithLoRA(BaseLayerWithLoRA):
"""Implements RoPE-scaled embeddings with linear scaling for
multiple LoRA adapters with a specialized kernel.

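The `MergedQKVParallelLinearWithLoRA` docstring above describes three LoRA slices packed together in qkv-proj fashion. A toy sketch of that packing, with made-up shapes and a plain matmul in place of the specialized kernel:

```python
import torch

def apply_merged_qkv_lora(x, qkv_out, loras, slice_sizes, scaling=1.0):
    # Add a separate (A, B) LoRA delta to each q/k/v slice of the packed output.
    offset = 0
    for (lora_a, lora_b), size in zip(loras, slice_sizes):
        # lora_a: (r, hidden_in), lora_b: (size, r)  -- assumed layouts
        delta = (x @ lora_a.T) @ lora_b.T * scaling
        qkv_out[:, offset:offset + size] += delta
        offset += size
    return qkv_out

hidden, r = 64, 8
sizes = [32, 16, 16]  # made-up q, k, v slice widths
x = torch.randn(2, hidden)
qkv = torch.randn(2, sum(sizes))
loras = [(torch.randn(r, hidden), torch.randn(s, r)) for s in sizes]
print(apply_merged_qkv_lora(x, qkv, loras, sizes).shape)  # torch.Size([2, 64])
```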