# SPDX-License-Identifier: Apache-2.0
import pytest

import vllm
from vllm.lora.request import LoRARequest

# This file contains tests to ensure that LoRA works correctly on the TPU
# backend. We use a series of custom trained adapters for Qwen2.5-3B-Instruct
# for this. The adapters are:
# Username6568/Qwen2.5-3B-Instruct-1_plus_1_equals_x_adapter, where x ranges
# from 1 to 4.

# These adapters are trained using a standard huggingface peft training script,
# where all the inputs are "What is 1+1? \n" and all the outputs are "x". We run
# 100 training iterations with a training batch size of 100.
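

# The helper below is a rough, illustrative sketch of the kind of peft training
# script described above (an assumption; the actual script used to produce the
# Username6568 adapters is not part of this repo). It is never invoked by the
# tests, its imports are deferred so peft/transformers/datasets are not needed
# to run them, and the output paths are made up.
def _train_adapter_sketch(target: str = "1") -> None:
    from datasets import Dataset
    from peft import LoraConfig, get_peft_model
    from transformers import (AutoModelForCausalLM, AutoTokenizer, Trainer,
                              TrainingArguments)

    tokenizer = AutoTokenizer.from_pretrained("Qwen/Qwen2.5-3B-Instruct")
    model = get_peft_model(
        AutoModelForCausalLM.from_pretrained("Qwen/Qwen2.5-3B-Instruct"),
        LoraConfig(r=8,
                   lora_alpha=16,
                   target_modules=["q_proj", "v_proj"],
                   task_type="CAUSAL_LM"))

    def tokenize(example):
        # Every example is the same prompt/answer pair described above.
        tokens = tokenizer(example["text"], max_length=32, truncation=True)
        tokens["labels"] = tokens["input_ids"].copy()
        return tokens

    dataset = Dataset.from_dict({
        "text": [f"What is 1+1? \n{target}"] * 100
    }).map(tokenize, remove_columns=["text"])

    # 100 steps at a batch size of 100, matching the description above.
    Trainer(model=model,
            args=TrainingArguments(output_dir="./lora_train_out",
                                   per_device_train_batch_size=100,
                                   max_steps=100),
            train_dataset=dataset).train()
    model.save_pretrained(f"./1_plus_1_equals_{target}_adapter")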


@pytest.fixture(scope="function", autouse=True)
def use_v1_only(monkeypatch: pytest.MonkeyPatch):
    """
    Since Multi-LoRA is only supported on the v1 TPU backend, set VLLM_USE_V1=1
    for all tests in this file
    """
    with monkeypatch.context() as m:
        m.setenv("VLLM_USE_V1", "1")
        yield


def setup_vllm(num_loras: int) -> vllm.LLM:
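    # Build a small engine with LoRA support enabled. max_loras bounds how many
    # adapters can be active in a single batch, and max_lora_rank must be at
    # least the rank of the adapters being served.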
    return vllm.LLM(model="Qwen/Qwen2.5-3B-Instruct",
                    num_scheduler_steps=1,
                    max_model_len=256,
                    max_seq_len_to_capture=256,
                    max_num_seqs=8,
                    enable_lora=True,
                    max_loras=num_loras,
                    max_lora_rank=8)


def test_single_lora():
    """
    This test ensures we can run a single LoRA adapter on the TPU backend.
    We run "Username6568/Qwen2.5-3B-Instruct-1_plus_1_equals_1_adapter" which
    will force Qwen2.5-3B-Instruct to claim 1+1=1.
    """

    llm = setup_vllm(1)

    prompt = "What is 1+1? \n"

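    # LoRARequest takes a human-readable adapter name, a unique positive
    # integer ID that vLLM uses to track the adapter, and the Hugging Face
    # repo (or local path) containing the adapter weights.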
    lora_request = LoRARequest(
        "lora_adapter_1", 1,
        "Username6568/Qwen2.5-3B-Instruct-1_plus_1_equals_1_adapter")
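    # Greedy decoding (temperature=0) keeps the generated answer deterministic.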
    output = llm.generate(prompt,
                          sampling_params=vllm.SamplingParams(max_tokens=256,
                                                              temperature=0),
                          lora_request=lora_request)[0].outputs[0].text

    answer = output.strip()[0]

    assert answer.isdigit()
    assert int(answer) == 1


def test_lora_hotswapping():
    """
    This test ensures we can run multiple LoRA adapters on the TPU backend, even
    if we only have space to store 1.

    We run "Username6568/Qwen2.5-3B-Instruct-1_plus_1_equals_x_adapter" which
    will force Qwen2.5-3B-Instruct to claim 1+1=x, for a range of x.
    """

    lora_name_template = \
        "Username6568/Qwen2.5-3B-Instruct-1_plus_1_equals_{}_adapter"
    lora_requests = [
        LoRARequest(f"lora_adapter_{i}", i, lora_name_template.format(i))
        for i in range(1, 5)
    ]

    llm = setup_vllm(1)

    prompt = "What is 1+1? \n"

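    # Only a single LoRA slot is available (max_loras=1), so each request below
    # forces the previously loaded adapter to be swapped out for the next one.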
    for i, req in enumerate(lora_requests):
        output = llm.generate(prompt,
                              sampling_params=vllm.SamplingParams(
                                  max_tokens=256, temperature=0),
                              lora_request=req)[0].outputs[0].text
        answer = output.strip()[0]

        assert answer.isdigit()
        assert int(answer) == i + 1


def test_multi_lora():
    """
    This test ensures we can run multiple LoRA adapters on the TPU backend, when
    we have enough space to store all of them.

    We run "Username6568/Qwen2.5-3B-Instruct-1_plus_1_equals_x_adapter" which
    will force Qwen2.5-3B-Instruct to claim 1+1=x, for a range of x.
    """
    lora_name_template = \
        "Username6568/Qwen2.5-3B-Instruct-1_plus_1_equals_{}_adapter"
    lora_requests = [
        LoRARequest(f"lora_adapter_{i}", i, lora_name_template.format(i))
        for i in range(1, 5)
    ]

    llm = setup_vllm(4)

    prompt = "What is 1+1? \n"

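    # With max_loras=4 all four adapters fit at once, so no swapping is needed
    # between requests.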
    for i, req in enumerate(lora_requests):
        output = llm.generate(prompt,
                              sampling_params=vllm.SamplingParams(
                                  max_tokens=256, temperature=0),
                              lora_request=req)[0].outputs[0].text

        answer = output.strip()[0]

        assert answer.isdigit()
        assert int(answer) == i + 1