4 changes: 1 addition & 3 deletions .buildkite/test-pipeline.yaml
@@ -289,7 +289,7 @@ steps:
source_file_dependencies:
- vllm/lora
- tests/lora
command: pytest -v -s lora --shard-id=$$BUILDKITE_PARALLEL_JOB --num-shards=$$BUILDKITE_PARALLEL_JOB_COUNT --ignore=lora/test_chatglm3_tp.py --ignore=lora/test_llama_tp.py --ignore=lora/test_minicpmv_tp.py --ignore=lora/test_transfomers_model.py
command: pytest -v -s lora --shard-id=$$BUILDKITE_PARALLEL_JOB --num-shards=$$BUILDKITE_PARALLEL_JOB_COUNT --ignore=lora/test_chatglm3_tp.py --ignore=lora/test_llama_tp.py
parallelism: 4

- label: PyTorch Fullgraph Smoke Test # 9min
@@ -602,8 +602,6 @@ steps:
# requires multi-GPU testing for validation.
- pytest -v -s -x lora/test_chatglm3_tp.py
- pytest -v -s -x lora/test_llama_tp.py
- pytest -v -s -x lora/test_minicpmv_tp.py
- pytest -v -s -x lora/test_transfomers_model.py


- label: Weight Loading Multiple GPU Test # 33min
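For context on the pipeline step above: the LoRA suite is fanned out across four parallel Buildkite jobs via the --shard-id/--num-shards flags (pytest-shard style), while the --ignore entries keep the listed files out of this sharded run. Below is a minimal, illustrative sketch of the hash-modulo assignment such sharding typically performs; it is not pytest-shard's or vLLM's actual implementation, and the test ids are hypothetical placeholders.

# Illustrative only: deterministic hash-modulo assignment of collected tests
# to shards, the general idea behind --shard-id/--num-shards splitting.
from zlib import crc32

def shard_tests(test_ids: list[str], shard_id: int, num_shards: int) -> list[str]:
    # Keep a test only when its stable hash lands in this shard's bucket.
    return [t for t in test_ids if crc32(t.encode()) % num_shards == shard_id]

# Hypothetical test ids, split as Buildkite job 0 of 4 would see them.
tests = ["lora/test_layers.py::test_embeddings", "lora/test_quant_model.py::test_load"]
print(shard_tests(tests, shard_id=0, num_shards=4))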
23 changes: 0 additions & 23 deletions tests/lora/conftest.py
@@ -2,7 +2,6 @@

import tempfile
from collections import OrderedDict
from typing import TypedDict
from unittest.mock import MagicMock, patch

import pytest
@@ -26,28 +25,6 @@
from vllm.platforms import current_platform


class ContextIDInfo(TypedDict):
lora_id: int
context_length: str


class ContextInfo(TypedDict):
lora: str
context_length: str


LONG_LORA_INFOS: list[ContextIDInfo] = [{
"lora_id": 1,
"context_length": "16k",
}, {
"lora_id": 2,
"context_length": "16k",
}, {
"lora_id": 3,
"context_length": "32k",
}]


@pytest.fixture()
def should_do_global_cleanup_after_test(request) -> bool:
"""Allow subdirectories to skip global cleanup by overriding this fixture.
2 changes: 1 addition & 1 deletion tests/lora/test_layers.py
@@ -59,7 +59,7 @@
# prefill stage(True) or decode stage(False)
STAGES = [True, False]

NUM_RANDOM_SEEDS = 10
NUM_RANDOM_SEEDS = 6

VOCAB_PARALLEL_EMBEDDING_TEST_NUM_RANDOM_SEEDS = 128

17 changes: 0 additions & 17 deletions tests/lora/test_llama_tp.py
@@ -153,20 +153,3 @@ def test_llama_lora_tp4_fully_sharded_loras(sql_lora_files):
enable_chunked_prefill=True,
)
generate_and_test(llm, sql_lora_files)


@multi_gpu_test(num_gpus=4)
@create_new_process_for_each_test()
def test_llama_lora_tp4_fully_sharded_enable_bias(sql_lora_files):

llm = vllm.LLM(
MODEL_PATH,
enable_lora=True,
max_num_seqs=16,
max_loras=4,
tensor_parallel_size=4,
fully_sharded_loras=True,
enable_lora_bias=True,
enable_chunked_prefill=True,
)
generate_and_test(llm, sql_lora_files)
1 change: 0 additions & 1 deletion tests/lora/test_minicpmv_tp.py
@@ -58,7 +58,6 @@ def do_sample(llm: vllm.LLM, lora_path: str, lora_id: int) -> list[str]:
@pytest.mark.xfail(
current_platform.is_rocm(),
reason="MiniCPM-V dependency xformers incompatible with ROCm")
@create_new_process_for_each_test()
def test_minicpmv_lora(minicpmv_lora_files):
llm = vllm.LLM(
MODEL_PATH,
8 changes: 7 additions & 1 deletion tests/lora/test_transfomers_model.py
@@ -1,7 +1,10 @@
# SPDX-License-Identifier: Apache-2.0

import pytest

import vllm
from vllm.lora.request import LoRARequest
from vllm.platforms import current_platform

from ..utils import create_new_process_for_each_test, multi_gpu_test

@@ -44,7 +47,6 @@ def do_sample(llm: vllm.LLM, lora_path: str, lora_id: int) -> list[str]:
return generated_texts


@create_new_process_for_each_test()
def test_ilama_lora(ilama_lora_files):
llm = vllm.LLM(MODEL_PATH,
max_model_len=1024,
@@ -63,6 +65,8 @@ def test_ilama_lora(ilama_lora_files):
assert output2[i] == EXPECTED_LORA_OUTPUT[i]


@pytest.mark.skipif(current_platform.is_cuda_alike(),
reason="Skipping to avoid redundant model tests")
@multi_gpu_test(num_gpus=4)
@create_new_process_for_each_test()
def test_ilama_lora_tp4(ilama_lora_files):
@@ -84,6 +88,8 @@ def test_ilama_lora_tp4(ilama_lora_files):
assert output2[i] == EXPECTED_LORA_OUTPUT[i]


@pytest.mark.skipif(current_platform.is_cuda_alike(),
reason="Skipping to avoid redundant model tests")
@multi_gpu_test(num_gpus=4)
@create_new_process_for_each_test()
def test_ilama_lora_tp4_fully_sharded_loras(ilama_lora_files):
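The gates added above use the standard pytest.mark.skipif pattern: the platform predicate is evaluated once at collection time, so the TP4 variants are skipped wholesale on CUDA-alike platforms instead of failing mid-run. A minimal self-contained sketch of the same pattern, assuming only the vllm.platforms interface already shown in the diff (the test body is a placeholder):

# Minimal sketch of the skip gate used in the tests above; placeholder body.
import pytest

from vllm.platforms import current_platform

@pytest.mark.skipif(current_platform.is_cuda_alike(),
                    reason="Skipping to avoid redundant model tests")
def test_placeholder_tp4():
    # Runs only on platforms that are not CUDA-alike.
    assert True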