diff --git a/.buildkite/test-pipeline.yaml b/.buildkite/test-pipeline.yaml
index 99358d557991..e2b452d8a05f 100644
--- a/.buildkite/test-pipeline.yaml
+++ b/.buildkite/test-pipeline.yaml
@@ -289,7 +289,7 @@ steps:
   source_file_dependencies:
   - vllm/lora
   - tests/lora
-  command: pytest -v -s lora --shard-id=$$BUILDKITE_PARALLEL_JOB --num-shards=$$BUILDKITE_PARALLEL_JOB_COUNT --ignore=lora/test_chatglm3_tp.py --ignore=lora/test_llama_tp.py --ignore=lora/test_minicpmv_tp.py --ignore=lora/test_transfomers_model.py
+  command: pytest -v -s lora --shard-id=$$BUILDKITE_PARALLEL_JOB --num-shards=$$BUILDKITE_PARALLEL_JOB_COUNT --ignore=lora/test_chatglm3_tp.py --ignore=lora/test_llama_tp.py
   parallelism: 4
 
 - label: PyTorch Fullgraph Smoke Test # 9min
@@ -602,8 +602,6 @@ steps:
     # requires multi-GPU testing for validation.
     - pytest -v -s -x lora/test_chatglm3_tp.py
     - pytest -v -s -x lora/test_llama_tp.py
-    - pytest -v -s -x lora/test_minicpmv_tp.py
-    - pytest -v -s -x lora/test_transfomers_model.py
 
 - label: Weight Loading Multiple GPU Test # 33min
diff --git a/tests/lora/conftest.py b/tests/lora/conftest.py
index 523bebe06ee5..91733fde1307 100644
--- a/tests/lora/conftest.py
+++ b/tests/lora/conftest.py
@@ -2,7 +2,6 @@
 
 import tempfile
 from collections import OrderedDict
-from typing import TypedDict
 from unittest.mock import MagicMock, patch
 
 import pytest
@@ -26,28 +25,6 @@
 from vllm.platforms import current_platform
 
 
-class ContextIDInfo(TypedDict):
-    lora_id: int
-    context_length: str
-
-
-class ContextInfo(TypedDict):
-    lora: str
-    context_length: str
-
-
-LONG_LORA_INFOS: list[ContextIDInfo] = [{
-    "lora_id": 1,
-    "context_length": "16k",
-}, {
-    "lora_id": 2,
-    "context_length": "16k",
-}, {
-    "lora_id": 3,
-    "context_length": "32k",
-}]
-
-
 @pytest.fixture()
 def should_do_global_cleanup_after_test(request) -> bool:
     """Allow subdirectories to skip global cleanup by overriding this fixture.
diff --git a/tests/lora/test_layers.py b/tests/lora/test_layers.py
index 99d60b332e65..f85725fe4230 100644
--- a/tests/lora/test_layers.py
+++ b/tests/lora/test_layers.py
@@ -59,7 +59,7 @@
 
 # prefill stage(True) or decode stage(False)
 STAGES = [True, False]
-NUM_RANDOM_SEEDS = 10
+NUM_RANDOM_SEEDS = 6
 
 VOCAB_PARALLEL_EMBEDDING_TEST_NUM_RANDOM_SEEDS = 128
 
diff --git a/tests/lora/test_llama_tp.py b/tests/lora/test_llama_tp.py
index 9f20e47c2f94..31abac87d19d 100644
--- a/tests/lora/test_llama_tp.py
+++ b/tests/lora/test_llama_tp.py
@@ -153,20 +153,3 @@ def test_llama_lora_tp4_fully_sharded_loras(sql_lora_files):
         enable_chunked_prefill=True,
     )
     generate_and_test(llm, sql_lora_files)
-
-
-@multi_gpu_test(num_gpus=4)
-@create_new_process_for_each_test()
-def test_llama_lora_tp4_fully_sharded_enable_bias(sql_lora_files):
-
-    llm = vllm.LLM(
-        MODEL_PATH,
-        enable_lora=True,
-        max_num_seqs=16,
-        max_loras=4,
-        tensor_parallel_size=4,
-        fully_sharded_loras=True,
-        enable_lora_bias=True,
-        enable_chunked_prefill=True,
-    )
-    generate_and_test(llm, sql_lora_files)
diff --git a/tests/lora/test_minicpmv_tp.py b/tests/lora/test_minicpmv_tp.py
index 00e6fe7c61de..0b223e5011ff 100644
--- a/tests/lora/test_minicpmv_tp.py
+++ b/tests/lora/test_minicpmv_tp.py
@@ -58,7 +58,6 @@ def do_sample(llm: vllm.LLM, lora_path: str, lora_id: int) -> list[str]:
 @pytest.mark.xfail(
     current_platform.is_rocm(),
     reason="MiniCPM-V dependency xformers incompatible with ROCm")
-@create_new_process_for_each_test()
 def test_minicpmv_lora(minicpmv_lora_files):
     llm = vllm.LLM(
         MODEL_PATH,
diff --git a/tests/lora/test_transfomers_model.py b/tests/lora/test_transfomers_model.py
index 87db0b4bbde0..b50e210ed082 100644
--- a/tests/lora/test_transfomers_model.py
+++ b/tests/lora/test_transfomers_model.py
@@ -1,7 +1,10 @@
 # SPDX-License-Identifier: Apache-2.0
 
+import pytest
+
 import vllm
 from vllm.lora.request import LoRARequest
+from vllm.platforms import current_platform
 
 from ..utils import create_new_process_for_each_test, multi_gpu_test
 
@@ -44,7 +47,6 @@ def do_sample(llm: vllm.LLM, lora_path: str, lora_id: int) -> list[str]:
     return generated_texts
 
 
-@create_new_process_for_each_test()
 def test_ilama_lora(ilama_lora_files):
     llm = vllm.LLM(MODEL_PATH,
                    max_model_len=1024,
@@ -63,6 +65,8 @@
         assert output2[i] == EXPECTED_LORA_OUTPUT[i]
 
 
+@pytest.mark.skipif(current_platform.is_cuda_alike(),
+                    reason="Skipping to avoid redundant model tests")
 @multi_gpu_test(num_gpus=4)
 @create_new_process_for_each_test()
 def test_ilama_lora_tp4(ilama_lora_files):
@@ -84,6 +88,8 @@
         assert output2[i] == EXPECTED_LORA_OUTPUT[i]
 
 
+@pytest.mark.skipif(current_platform.is_cuda_alike(),
+                    reason="Skipping to avoid redundant model tests")
 @multi_gpu_test(num_gpus=4)
 @create_new_process_for_each_test()
 def test_ilama_lora_tp4_fully_sharded_loras(ilama_lora_files):