Commit 1b2826a

ci: remove not slow filter (#2944)
Signed-off-by: alec-flowers <aflowers@nvidia.com>
1 parent: 2eced09 | commit: 1b2826a

13 files changed (+151, -221 lines)


.github/workflows/container-validation-backends.yml

Lines changed: 1 addition & 1 deletion
@@ -22,7 +22,7 @@ jobs:
             pytest_marks: "e2e and vllm and gpu_1 and not slow"
           - framework: sglang
             target: runtime
-            pytest_marks: "e2e and sglang and gpu_1"
+            pytest_marks: "e2e and sglang and gpu_1 and not slow"
 
     # Do not cancel main branch runs
     concurrency:
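For context, the pytest_marks expression is handed to pytest's -m selector, so an sglang test now also has to avoid the slow marker to run in this CI lane. A minimal illustration with hypothetical test names (the markers themselves are the ones registered in pyproject.toml below):

import pytest

# Selected by "e2e and sglang and gpu_1 and not slow"
@pytest.mark.e2e
@pytest.mark.sglang
@pytest.mark.gpu_1
def test_sglang_short():
    assert True

# Deselected by the same expression because of the slow marker
@pytest.mark.e2e
@pytest.mark.sglang
@pytest.mark.gpu_1
@pytest.mark.slow
def test_sglang_long_running():
    assert True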

pyproject.toml

Lines changed: 2 additions & 1 deletion
@@ -178,7 +178,8 @@ markers = [
     "sglang: marks tests as requiring sglang",
     "slow: marks tests as known to be slow",
     "h100: marks tests to run on H100",
-    "kvbm: marks tests for KV behavior and model determinism"
+    "kvbm: marks tests for KV behavior and model determinism",
+    "model: model id used by a test or parameter"
 ]
 
 # Linting/formatting
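The new "model" marker is what the updated conftest hooks read at collection time. A hedged sketch of how a test might declare its model, either on the test itself or per parameter (hypothetical test names, model ids taken from the list that used to live in conftest.py):

import pytest

@pytest.mark.model("Qwen/Qwen3-0.6B")
def test_single_model():
    ...

@pytest.mark.parametrize(
    "model_id",
    [
        pytest.param(
            "deepseek-ai/DeepSeek-R1-Distill-Llama-8B",
            marks=pytest.mark.model("deepseek-ai/DeepSeek-R1-Distill-Llama-8B"),
        ),
    ],
)
def test_per_parameter(model_id):
    ...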

tests/conftest.py

Lines changed: 47 additions & 46 deletions
@@ -20,26 +20,25 @@
 
 import pytest
 
+from tests.utils.constants import TEST_MODELS
 from tests.utils.managed_process import ManagedProcess
 
-# Custom format inspired by your example
+
+def pytest_configure(config):
+    # Defining model marker to avoid `'model' not found in `markers` configuration option`
+    # error when pyproject.toml is not available in the container
+    config.addinivalue_line("markers", "model: model id used by a test or parameter")
+
+
 LOG_FORMAT = "[TEST] %(asctime)s %(levelname)s %(name)s: %(message)s"
 DATE_FORMAT = "%Y-%m-%dT%H:%M:%S"
 
-# Configure logging
 logging.basicConfig(
     level=logging.INFO,
     format=LOG_FORMAT,
     datefmt=DATE_FORMAT,  # ISO 8601 UTC format
 )
 
-# List of models used in tests
-TEST_MODELS = [
-    "Qwen/Qwen3-0.6B",
-    "deepseek-ai/DeepSeek-R1-Distill-Llama-8B",
-    "llava-hf/llava-1.5-7b-hf",
-]
-
 
 def download_models(model_list=None, ignore_weights=False):
     """Download models - can be called directly or via fixture
@@ -107,16 +106,34 @@ def download_models(model_list=None, ignore_weights=False):
 
 
 @pytest.fixture(scope="session")
-def predownload_models():
-    """Fixture wrapper around download_models for all TEST_MODELS"""
-    download_models()
+def predownload_models(pytestconfig):
+    """Fixture wrapper around download_models for models used in collected tests"""
+    # Get models from pytest config if available, otherwise fall back to TEST_MODELS
+    models = getattr(pytestconfig, "models_to_download", None)
+    if models:
+        logging.info(
+            f"Downloading {len(models)} models needed for collected tests\nModels: {models}"
+        )
+        download_models(model_list=list(models))
+    else:
+        # Fallback to original behavior if extraction failed
+        download_models()
     yield
 
 
 @pytest.fixture(scope="session")
-def predownload_tokenizers():
-    """Fixture wrapper around download_models for all TEST_MODELS"""
-    download_models(ignore_weights=True)
+def predownload_tokenizers(pytestconfig):
+    """Fixture wrapper around download_models for tokenizers used in collected tests"""
+    # Get models from pytest config if available, otherwise fall back to TEST_MODELS
+    models = getattr(pytestconfig, "models_to_download", None)
+    if models:
+        logging.info(
+            f"Downloading tokenizers for {len(models)} models needed for collected tests\nModels: {models}"
+        )
+        download_models(model_list=list(models), ignore_weights=True)
+    else:
+        # Fallback to original behavior if extraction failed
+        download_models(ignore_weights=True)
     yield
 
 
@@ -135,42 +152,26 @@ def logger(request):
     logger.removeHandler(handler)
 
 
+@pytest.hookimpl(trylast=True)
 def pytest_collection_modifyitems(config, items):
     """
     This function is called to modify the list of tests to run.
-    It is used to skip tests that are not supported on all environments.
     """
-
-    # Tests marked with trtllm requires specific environment with tensorrtllm
-    # installed. Hence, we skip them if the user did not explicitly ask for them.
-    if config.getoption("-m") and "trtllm_marker" in config.getoption("-m"):
-        return
-    skip_trtllm = pytest.mark.skip(reason="need -m trtllm_marker to run")
+    # Collect models via explicit pytest mark from final filtered items only
+    models_to_download = set()
     for item in items:
-        if "trtllm_marker" in item.keywords:
-            item.add_marker(skip_trtllm)
-
-        # Auto-inject predownload_models fixture for serve tests only (not router tests)
-        # Skip items that don't have fixturenames (like MypyFileItem)
-        if hasattr(item, "fixturenames"):
-            # Guard clause: skip if already has the fixtures
-            if (
-                "predownload_models" in item.fixturenames
-                or "predownload_tokenizers" in item.fixturenames
-            ):
-                continue
-
-            # Guard clause: skip if marked with skip_model_download
-            if item.get_closest_marker("skip_model_download"):
-                continue
-
-            # Add appropriate fixture based on test path
-            if "serve" in str(item.path):
-                item.fixturenames = list(item.fixturenames)
-                item.fixturenames.append("predownload_models")
-            elif "router" in str(item.path):
-                item.fixturenames = list(item.fixturenames)
-                item.fixturenames.append("predownload_tokenizers")
+        # Only collect from items that are not skipped
+        if any(
+            getattr(m, "name", "") == "skip" for m in getattr(item, "own_markers", [])
+        ):
+            continue
+        model_mark = item.get_closest_marker("model")
+        if model_mark and model_mark.args:
+            models_to_download.add(model_mark.args[0])
+
+    # Store models to download in pytest config for fixtures to access
+    if models_to_download:
+        config.models_to_download = models_to_download
 
 
 class EtcdServer(ManagedProcess):
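Taken together, the hook and the session-scoped fixtures mean a test only has to declare its model and request the fixture. A hypothetical example, assuming the marker and fixture names introduced above:

import pytest

@pytest.mark.e2e
@pytest.mark.gpu_1
@pytest.mark.model("Qwen/Qwen3-0.6B")
def test_with_predownloaded_model(predownload_models):
    # At collection time, pytest_collection_modifyitems records "Qwen/Qwen3-0.6B"
    # in config.models_to_download (items already marked skip are ignored).
    # The session-scoped predownload_models fixture then downloads only the
    # models that collected tests actually declared before this body runs.
    ...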

tests/fault_tolerance/test_request_cancellation.py

Lines changed: 10 additions & 48 deletions
@@ -9,8 +9,8 @@
 
 import pytest
 import requests
-from huggingface_hub import snapshot_download
 
+from tests.utils.constants import FAULT_TOLERANCE_MODEL_NAME
 from tests.utils.engine_process import FRONTEND_PORT
 from tests.utils.managed_process import ManagedProcess
 from tests.utils.payloads import check_health_generate, check_models_api
@@ -56,7 +56,7 @@ def __init__(self, request, is_prefill: bool = False):
             "-m",
             "dynamo.vllm",
             "--model",
-            "deepseek-ai/DeepSeek-R1-Distill-Llama-8B",
+            FAULT_TOLERANCE_MODEL_NAME,
             "--enforce-eager",
             "--gpu-memory-utilization",
             "0.45",
@@ -137,47 +137,12 @@ def is_ready(self, response) -> bool:
         return False
 
 
-def download_model() -> None:
-    """
-    Download the DeepSeek-R1-Distill-Llama-8B model from HuggingFace Hub if not already cached.
-    """
-    model_id = "deepseek-ai/DeepSeek-R1-Distill-Llama-8B"
-    logger.info(f"Caching model {model_id}...")
-
-    max_retries = 5
-    retry_delay = 30  # seconds
-
-    for attempt in range(max_retries):
-        try:
-            # Download the model to the default cache directory
-            # This will skip download if the model is already cached
-            snapshot_download(
-                repo_id="deepseek-ai/DeepSeek-R1-Distill-Llama-8B",
-                repo_type="model",
-                local_files_only=False,
-            )
-            logger.info(f"Model {model_id} is ready for use")
-            return  # Success, exit the function
-        except Exception as e:
-            if attempt < max_retries - 1:  # Not the last attempt
-                logger.warning(
-                    f"Failed to download model {model_id} (attempt {attempt + 1}/{max_retries}): {e}"
-                )
-                logger.info(f"Retrying in {retry_delay} seconds...")
-                time.sleep(retry_delay)
-            else:  # Last attempt failed
-                logger.error(
-                    f"Failed to download model {model_id} after {max_retries} attempts: {e}"
-                )
-                raise
-
-
 def send_completion_request(
     prompt: str, max_tokens: int, timeout: int = 120
 ) -> requests.Response:
     """Send a completion request to the frontend"""
     payload = {
-        "model": "deepseek-ai/DeepSeek-R1-Distill-Llama-8B",
+        "model": FAULT_TOLERANCE_MODEL_NAME,
         "prompt": prompt,
         "max_tokens": max_tokens,
     }
@@ -211,7 +176,7 @@ def send_chat_completion_request(
 ) -> requests.Response:
     """Send a chat completion request to the frontend"""
     payload = {
-        "model": "deepseek-ai/DeepSeek-R1-Distill-Llama-8B",
+        "model": FAULT_TOLERANCE_MODEL_NAME,
         "messages": [{"role": "user", "content": prompt}],
         "max_tokens": max_tokens,
         "stream": stream,
@@ -383,8 +348,8 @@ def verify_request_cancelled(
 @pytest.mark.vllm
 @pytest.mark.gpu_1
 @pytest.mark.e2e
-@pytest.mark.slow
-def test_request_cancellation_vllm(request, runtime_services):
+@pytest.mark.model(FAULT_TOLERANCE_MODEL_NAME)
+def test_request_cancellation_vllm(request, runtime_services, predownload_models):
     """
     End-to-end test for request cancellation functionality.
 
@@ -395,8 +360,6 @@ def test_request_cancellation_vllm(request, runtime_services):
     2. Chat completion request (non-streaming)
     3. Chat completion request (streaming)
     """
-    # Step 0: Download the model from HuggingFace if not already cached
-    download_model()
 
     # Step 1: Start the frontend
     with DynamoFrontendProcess(request) as frontend:
@@ -446,17 +409,17 @@ def test_request_cancellation_vllm(request, runtime_services):
 @pytest.mark.vllm
 @pytest.mark.gpu_1
 @pytest.mark.e2e
-@pytest.mark.slow
-def test_request_cancellation_vllm_decode(request, runtime_services):
+@pytest.mark.model(FAULT_TOLERANCE_MODEL_NAME)
+def test_request_cancellation_vllm_decode(
+    request, runtime_services, predownload_models
+):
     """
     End-to-end test for request cancellation functionality with remote prefill.
 
     This test verifies that when a request is cancelled by the client,
     the system properly handles the cancellation and cleans up resources
     on the decode worker side in a disaggregated setup.
     """
-    # Step 0: Download the model from HuggingFace if not already cached
-    download_model()
 
     # Step 1: Start the frontend
     with DynamoFrontendProcess(request) as frontend:
@@ -501,7 +464,6 @@ def test_request_cancellation_vllm_decode(request, runtime_services):
 @pytest.mark.vllm
 @pytest.mark.gpu_1
 @pytest.mark.e2e
-@pytest.mark.slow
 def test_request_cancellation_vllm_prefill(request, runtime_services):
     """
     End-to-end test for request cancellation on remote prefill.

tests/fault_tolerance/test_request_migration.py

Lines changed: 5 additions & 42 deletions
@@ -10,8 +10,8 @@
 
 import pytest
 import requests
-from huggingface_hub import snapshot_download
 
+from tests.utils.constants import FAULT_TOLERANCE_MODEL_NAME
 from tests.utils.engine_process import FRONTEND_PORT
 from tests.utils.managed_process import ManagedProcess, terminate_process_tree
 from tests.utils.payloads import check_models_api
@@ -54,7 +54,7 @@ def __init__(self, request, worker_id: str):
             "-m",
             "dynamo.vllm",
             "--model",
-            "deepseek-ai/DeepSeek-R1-Distill-Llama-8B",
+            FAULT_TOLERANCE_MODEL_NAME,
             "--enforce-eager",
             "--gpu-memory-utilization",
             "0.45",
@@ -117,47 +117,12 @@ def is_ready(self, response) -> bool:
         return False
 
 
-def download_model() -> None:
-    """
-    Download the DeepSeek-R1-Distill-Llama-8B model from HuggingFace Hub if not already cached.
-    """
-    model_id = "deepseek-ai/DeepSeek-R1-Distill-Llama-8B"
-    logger.info(f"Caching model {model_id}...")
-
-    max_retries = 5
-    retry_delay = 30  # seconds
-
-    for attempt in range(max_retries):
-        try:
-            # Download the model to the default cache directory
-            # This will skip download if the model is already cached
-            snapshot_download(
-                repo_id="deepseek-ai/DeepSeek-R1-Distill-Llama-8B",
-                repo_type="model",
-                local_files_only=False,
-            )
-            logger.info(f"Model {model_id} is ready for use")
-            return  # Success, exit the function
-        except Exception as e:
-            if attempt < max_retries - 1:  # Not the last attempt
-                logger.warning(
-                    f"Failed to download model {model_id} (attempt {attempt + 1}/{max_retries}): {e}"
-                )
-                logger.info(f"Retrying in {retry_delay} seconds...")
-                time.sleep(retry_delay)
-            else:  # Last attempt failed
-                logger.error(
-                    f"Failed to download model {model_id} after {max_retries} attempts: {e}"
-                )
-                raise
-
-
 def send_completion_request(
     prompt: str, max_tokens: int, timeout: int = 120
 ) -> requests.Response:
     """Send a completion request to the frontend"""
     payload = {
-        "model": "deepseek-ai/DeepSeek-R1-Distill-Llama-8B",
+        "model": FAULT_TOLERANCE_MODEL_NAME,
         "prompt": prompt,
         "max_tokens": max_tokens,
     }
@@ -324,17 +289,15 @@ def verify_migration_occurred(frontend_process: DynamoFrontendProcess) -> None:
 @pytest.mark.vllm
 @pytest.mark.gpu_1
 @pytest.mark.e2e
-@pytest.mark.slow
-def test_request_migration_vllm(request, runtime_services):
+@pytest.mark.model(FAULT_TOLERANCE_MODEL_NAME)
+def test_request_migration_vllm(request, runtime_services, predownload_models):
     """
     End-to-end test for worker fault tolerance with migration support.
 
     This test verifies that when a worker is killed during request processing,
     the system can handle the failure gracefully and migrate the request to
     another worker.
     """
-    # Step 0: Download the model from HuggingFace if not already cached
-    download_model()
 
     # Step 1: Start the frontend
     with DynamoFrontendProcess(request) as frontend:
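With the slow marker dropped from these tests, they now match the marker expressions shown in the workflow change at the top. A quick way to reproduce that selection locally, sketched with pytest's Python entry point (marker expression copied from the workflow, test path assumed):

# Equivalent to: pytest -m "e2e and vllm and gpu_1 and not slow" tests/fault_tolerance
import pytest

if __name__ == "__main__":
    raise SystemExit(
        pytest.main(["-m", "e2e and vllm and gpu_1 and not slow", "tests/fault_tolerance"])
    )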
