From ece12652ae2b84765a834af11e79182f4c77d1ec Mon Sep 17 00:00:00 2001 From: Kourosh Hakhamaneshi Date: Wed, 18 Jun 2025 15:48:18 -0700 Subject: [PATCH 1/7] wip Signed-off-by: Kourosh Hakhamaneshi --- .../serve/deployments/llm/llm_server.py | 38 ++++++++----------- .../serve/deployments/llm/vllm/vllm_engine.py | 4 +- .../prefill_decode_disagg.py | 18 +-------- .../serve/deployments/routers/router.py | 6 --- 4 files changed, 19 insertions(+), 47 deletions(-) diff --git a/python/ray/llm/_internal/serve/deployments/llm/llm_server.py b/python/ray/llm/_internal/serve/deployments/llm/llm_server.py index 9e594563b456..e6c34df89b96 100644 --- a/python/ray/llm/_internal/serve/deployments/llm/llm_server.py +++ b/python/ray/llm/_internal/serve/deployments/llm/llm_server.py @@ -5,7 +5,6 @@ # Third-party imports from ray import serve -from ray._common.utils import import_attr # Local imports from ray.llm._internal.serve.configs.constants import ( @@ -13,7 +12,6 @@ DEFAULT_HEALTH_CHECK_TIMEOUT_S, ENGINE_START_TIMEOUT_S, MODEL_RESPONSE_BATCH_TIMEOUT_MS, - RAYLLM_VLLM_ENGINE_CLS_ENV, ) from ray.llm._internal.serve.configs.openai_api_models import ( ChatCompletionLogProb, @@ -438,9 +436,14 @@ async def __init__( await super().__init__(llm_config) self._engine_cls = engine_cls or self._default_engine_cls - self.engine = self._get_engine_class(self._llm_config) + self.engine: Optional[LLMEngine] = None + if self._engine_cls is not None: + self.engine = self._engine_cls(self._llm_config) + await asyncio.wait_for(self._start_engine(), timeout=ENGINE_START_TIMEOUT_S) + # TODO (Kourosh): I think we can completely remove image retriever. + # It was missed to get removed. self.image_retriever = ( image_retriever_cls() if image_retriever_cls @@ -466,25 +469,10 @@ async def __init__( self.response_postprocessor = ResponsePostprocessor() - @property - def _get_engine_class(self) -> Type[LLMEngine]: - """Helper to load the engine class from the environment variable. - - This is used for testing or escape-hatch for patching purposes. - If env variable is not set, it will fallback to the default engine class. - """ - engine_cls_path = os.environ.get(RAYLLM_VLLM_ENGINE_CLS_ENV) - if engine_cls_path: - try: - return import_attr(engine_cls_path) - except AttributeError: - logger.warning( - f"Failed to import engine class {engine_cls_path}. " - f"Using the default engine class {self._engine_cls}." - ) - return self._engine_cls - async def _start_engine(self): + if self.engine is None: + raise ValueError("Engine is not set") + await self.engine.start() # Push telemetry reports for the model in the current deployment. @@ -616,7 +604,13 @@ async def check_health(self) -> None: Check the health of the replica. Does not return anything. Raise error when the engine is dead and needs to be restarted. """ - return await self.engine.check_health() + if self.engine is None: + return + try: + return await self.engine.check_health() + except Exception as e: + logger.error("Engine health check failed in LLMServer.check_health: %s", e) + raise e async def embeddings(self, request: EmbeddingRequest) -> LLMEmbeddingsResponse: """Runs an embeddings request to the vllm engine, and return the response. 
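[Editor's sketch] The check_health change in the llm_server.py hunk above is easier to read in isolation. The following is a minimal condensation of the new behavior, with class and logger names simplified for illustration (they are not the real identifiers): a replica whose server has no engine reports healthy by returning immediately, while an engine failure is logged at error level and re-raised so Serve restarts the replica.

    import logging
    from typing import Optional

    logger = logging.getLogger(__name__)


    class ServerHealthSketch:
        """Condensed stand-in for LLMServer; only the health-check path is shown."""

        def __init__(self, engine: Optional[object] = None):
            # None models an engine-less server such as the P/D proxy.
            self.engine = engine

        async def check_health(self) -> None:
            if self.engine is None:
                # Nothing to check: a replica without an engine reports healthy.
                return
            try:
                return await self.engine.check_health()
            except Exception as e:
                logger.error("Engine health check failed in check_health: %s", e)
                raise
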
diff --git a/python/ray/llm/_internal/serve/deployments/llm/vllm/vllm_engine.py b/python/ray/llm/_internal/serve/deployments/llm/vllm/vllm_engine.py index f58b2a400e79..35dc7c61761a 100644 --- a/python/ray/llm/_internal/serve/deployments/llm/vllm/vllm_engine.py +++ b/python/ray/llm/_internal/serve/deployments/llm/vllm/vllm_engine.py @@ -816,9 +816,9 @@ async def check_health(self) -> None: raise RuntimeError(f"{type(self.engine)} does not support health check.") try: - return await asyncio.wait_for(self.engine.check_health(), timeout=15) + await asyncio.wait_for(self.engine.check_health(), timeout=15) except BaseException as e: - logger.exception("Healthcheck failed. The replica will be restarted") + logger.error("Healthcheck failed. The replica will be restarted") raise e from None @staticmethod diff --git a/python/ray/llm/_internal/serve/deployments/prefill_decode_disagg/prefill_decode_disagg.py b/python/ray/llm/_internal/serve/deployments/prefill_decode_disagg/prefill_decode_disagg.py index ba7fc4684b23..399ddbba584b 100644 --- a/python/ray/llm/_internal/serve/deployments/prefill_decode_disagg/prefill_decode_disagg.py +++ b/python/ray/llm/_internal/serve/deployments/prefill_decode_disagg/prefill_decode_disagg.py @@ -1,6 +1,5 @@ """Using Ray Serve to deploy LLM models with P/D disaggregation. """ -import asyncio import logging import uuid from typing import Any, AsyncGenerator, Dict, Union @@ -63,6 +62,7 @@ def parse_configs_and_cast_type(config: Union[str, LLMConfig]) -> LLMConfig: class PDProxyServer(LLMServer): + _default_engine_cls = None """ Proxy between P/D LLM servers. @@ -83,14 +83,6 @@ async def __init__( prefill_server: DeploymentHandle, decode_server: DeploymentHandle, ): - class FakeEngine: - """Provide a fake engine such that proxy don't really start any engine.""" - - def __init__(self, *args, **kwargs): - pass - - async def start(self, *args, **kwargs): - pass # We pass `llm_config` here to let super() extract the model_id, such that /v1/models # endpoint can work correctly. @@ -98,7 +90,6 @@ async def start(self, *args, **kwargs): # API, instead of passing it in as an argument. await super().__init__( llm_config, - engine_cls=FakeEngine, ) self.prefill_server = prefill_server @@ -160,13 +151,6 @@ async def _predict( ): yield chunk - async def check_health(self) -> None: - """Check the health of the llm engine.""" - await asyncio.gather( - self.prefill_server.check_health.remote(), - self.decode_server.check_health.remote(), - ) - @classmethod def as_deployment(cls) -> serve.Deployment: """Turns PDProxyServer into a Ray Serve deployment.""" diff --git a/python/ray/llm/_internal/serve/deployments/routers/router.py b/python/ray/llm/_internal/serve/deployments/routers/router.py index b25276611d94..e488f269605c 100644 --- a/python/ray/llm/_internal/serve/deployments/routers/router.py +++ b/python/ray/llm/_internal/serve/deployments/routers/router.py @@ -232,12 +232,6 @@ async def _setup_handle_and_config_maps( async def check_health(self): await self._init_completed.wait() - await asyncio.gather( - *[ - handle.check_health.remote() - for handle in self._default_serve_handles.values() - ] - ) def _get_configured_serve_handle(self, model_id: str): """Gets a ServeHandle to a model deployment. 
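[Editor's sketch] Taken together, the PATCH 1 hunks shift health checking from fan-out RPCs (the router and the P/D proxy calling check_health.remote() on downstream handles) to each replica checking only its own engine. A rough sketch of the resulting proxy shape, with constructor plumbing and OpenAI routes elided and only the names visible in the diff kept, is:

    from typing import Optional, Type


    class LLMServerSketch:
        """Stand-in for LLMServer after PATCH 1; engine wiring elided."""

        # Real servers point this at an engine class; None means "no engine".
        _default_engine_cls: Optional[Type] = None

        async def check_health(self) -> None:
            # Each replica checks only its own engine (see the llm_server.py hunk);
            # with no engine configured this is a no-op.
            return None


    class PDProxyServerSketch(LLMServerSketch):
        """Proxy between prefill and decode servers.

        With _default_engine_cls = None, no engine is constructed, so the FakeEngine
        stub is unnecessary and the inherited check_health returns immediately.
        """

        _default_engine_cls = None

        def __init__(self, prefill_server, decode_server):
            self.prefill_server = prefill_server
            self.decode_server = decode_server

The practical effect is that an unhealthy prefill, decode, or model replica is detected and restarted by its own deployment's health check rather than being surfaced through the proxy or the router, which is why the gather-based overrides in prefill_decode_disagg.py and router.py could be deleted.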
From ba3eeb3397a090340ef92c4ec91201df22dc2129 Mon Sep 17 00:00:00 2001 From: Kourosh Hakhamaneshi Date: Wed, 18 Jun 2025 15:54:36 -0700 Subject: [PATCH 2/7] wip Signed-off-by: Kourosh Hakhamaneshi --- python/ray/llm/_internal/serve/configs/constants.py | 1 - .../ray/llm/_internal/serve/deployments/llm/vllm/vllm_engine.py | 2 +- 2 files changed, 1 insertion(+), 2 deletions(-) diff --git a/python/ray/llm/_internal/serve/configs/constants.py b/python/ray/llm/_internal/serve/configs/constants.py index 7d1d8452c30b..796b97a5934e 100644 --- a/python/ray/llm/_internal/serve/configs/constants.py +++ b/python/ray/llm/_internal/serve/configs/constants.py @@ -65,7 +65,6 @@ ENABLE_VERBOSE_TELEMETRY = bool(int(os.getenv("RAYLLM_ENABLE_VERBOSE_TELEMETRY", "0"))) -RAYLLM_VLLM_ENGINE_CLS_ENV = "RAYLLM_VLLM_ENGINE_CLS" # The ratio of number of router replicas to number of model replicas. Default to 2 # meaning that there are 2 router replicas for every model replica. diff --git a/python/ray/llm/_internal/serve/deployments/llm/vllm/vllm_engine.py b/python/ray/llm/_internal/serve/deployments/llm/vllm/vllm_engine.py index 35dc7c61761a..31d477c37c1c 100644 --- a/python/ray/llm/_internal/serve/deployments/llm/vllm/vllm_engine.py +++ b/python/ray/llm/_internal/serve/deployments/llm/vllm/vllm_engine.py @@ -816,7 +816,7 @@ async def check_health(self) -> None: raise RuntimeError(f"{type(self.engine)} does not support health check.") try: - await asyncio.wait_for(self.engine.check_health(), timeout=15) + await self.engine.check_health() except BaseException as e: logger.error("Healthcheck failed. The replica will be restarted") raise e from None From efe6f922b3bec927cb1b190a43106c251477771c Mon Sep 17 00:00:00 2001 From: Kourosh Hakhamaneshi Date: Wed, 18 Jun 2025 18:59:01 -0700 Subject: [PATCH 3/7] wip Signed-off-by: Kourosh Hakhamaneshi --- .../serve/deployments/llm/llm_server.py | 3 +- .../serve/deployments/llm/vllm/vllm_engine.py | 1 - .../serve/test_llm_serve_fault_tolerance.py | 97 +++++++++++++++++++ 3 files changed, 98 insertions(+), 3 deletions(-) create mode 100644 release/llm_tests/serve/test_llm_serve_fault_tolerance.py diff --git a/python/ray/llm/_internal/serve/deployments/llm/llm_server.py b/python/ray/llm/_internal/serve/deployments/llm/llm_server.py index e6c34df89b96..f727268ec131 100644 --- a/python/ray/llm/_internal/serve/deployments/llm/llm_server.py +++ b/python/ray/llm/_internal/serve/deployments/llm/llm_server.py @@ -439,8 +439,7 @@ async def __init__( self.engine: Optional[LLMEngine] = None if self._engine_cls is not None: self.engine = self._engine_cls(self._llm_config) - - await asyncio.wait_for(self._start_engine(), timeout=ENGINE_START_TIMEOUT_S) + await asyncio.wait_for(self._start_engine(), timeout=ENGINE_START_TIMEOUT_S) # TODO (Kourosh): I think we can completely remove image retriever. # It was missed to get removed. 
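[Editor's sketch] The llm_server.py hunk above moves the engine start under the same guard that constructs the engine, so a server whose engine class resolves to None (the P/D proxy after PATCH 1) never calls _start_engine and never hits its "Engine is not set" error. A condensed view of the constructor flow after this change, with unrelated setup omitted and an illustrative timeout value, looks like:

    import asyncio

    # Illustrative value; the real constant lives in serve/configs/constants.py.
    ENGINE_START_TIMEOUT_S = 900


    async def init_engine_sketch(server, engine_cls, llm_config):
        """Sketch of the engine section of LLMServer.__init__ after this hunk."""
        server.engine = None
        if engine_cls is not None:
            server.engine = engine_cls(llm_config)
            # Start is awaited only when an engine actually exists, and is bounded
            # by the start timeout so a hung engine fails the replica quickly.
            await asyncio.wait_for(server._start_engine(), timeout=ENGINE_START_TIMEOUT_S)
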
diff --git a/python/ray/llm/_internal/serve/deployments/llm/vllm/vllm_engine.py b/python/ray/llm/_internal/serve/deployments/llm/vllm/vllm_engine.py index 31d477c37c1c..25d8bd2fdf75 100644 --- a/python/ray/llm/_internal/serve/deployments/llm/vllm/vllm_engine.py +++ b/python/ray/llm/_internal/serve/deployments/llm/vllm/vllm_engine.py @@ -1,4 +1,3 @@ -import asyncio import os import re import time diff --git a/release/llm_tests/serve/test_llm_serve_fault_tolerance.py b/release/llm_tests/serve/test_llm_serve_fault_tolerance.py new file mode 100644 index 000000000000..9fedb7eff360 --- /dev/null +++ b/release/llm_tests/serve/test_llm_serve_fault_tolerance.py @@ -0,0 +1,97 @@ +import time +from typing import Literal, List + +import pytest +import ray +from ray import serve +from ray.serve.llm import LLMConfig, build_openai_app, ModelLoadingConfig + +MODEL_ID = "Qwen/Qwen2.5-0.5B-Instruct" +RAY_MODEL_ID = "qwen-0.5b" + + +def get_llm_config( + tensor_parallel_size: int = 1, +) -> LLMConfig: + """Create LLMConfig with specified parallelism parameters.""" + return LLMConfig( + model_loading_config=ModelLoadingConfig( + model_id=RAY_MODEL_ID, + model_source=MODEL_ID, + ), + deployment_config=dict( + name="llm_deployment", + num_replicas=2, + ), + engine_kwargs=dict( + tensor_parallel_size=tensor_parallel_size, + enforce_eager=True, + ), + runtime_env={"env_vars": {"VLLM_USE_V1": "1"}}, + ) + + +def find_replica_ids(deployment_name: str) -> List[str]: + actors = ray.util.list_named_actors("serve") + found_replica_ids = [] + for actor in actors: + if deployment_name in actor["name"]: + found_replica_ids.append(actor["name"]) + return found_replica_ids + + +def kill_replica(replica_id: str) -> None: + actor = ray.get_actor(replica_id, namespace="serve") + ray.kill(actor) + + +@pytest.fixture(name="app", scope="function") +def start_ray_serve( + tensor_parallel_size: int = 1, +) -> str: + """Start Ray Serve with specified parallelism parameters.""" + ray_url = "http://localhost:8000" + llm_config: LLMConfig = get_llm_config(tensor_parallel_size) + app = build_openai_app(dict(llm_configs=[llm_config])) + serve.run(app, blocking=False) + yield app + serve.shutdown() + + +def wait_for_deployment_status( + deployment_name: str, status: Literal["HEALTHY", "UNHEALTHY"], timeout_s: int = 120 +) -> None: + s = time.time() + print(f"Waiting for deployment {deployment_name} to become {status}") + while time.time() - s < timeout_s: + state = serve.status() + if state.applications["default"].deployments[deployment_name].status == status: + return + time.sleep(1) + raise TimeoutError( + f"Deployment {deployment_name} did not become " + f"{status} within {timeout_s} seconds" + ) + + +def test_recovery_from_replica_failure(app) -> None: + """Tests that the deployment recovers from replica failure.""" + + start_ray_serve(tensor_parallel_size=1) + wait_for_deployment_status("llm_deployment", "HEALTHY", timeout_s=60) + + # Kill both replicas + replica_ids = find_replica_ids("llm_deployment") + for replica_id in replica_ids: + print(f"Killing replica {replica_id}") + kill_replica(replica_id) + + # wait for deployment to get unhealthy + wait_for_deployment_status("llm_deployment", "UNHEALTHY", timeout_s=60) + + # Wait again for deployment to get healthy + wait_for_deployment_status("llm_deployment", "HEALTHY", timeout_s=60) + + +if __name__ == "__main__": + pytest.main(["-xvs", __file__]) From 7c1a5f091e0fa0590e454ce0cf778aa60eb09ca2 Mon Sep 17 00:00:00 2001 From: Kourosh Hakhamaneshi Date: Wed, 18 Jun 2025 19:48:26 -0700 
Subject: [PATCH 4/7] wip Signed-off-by: Kourosh Hakhamaneshi --- .../serve/test_llm_serve_fault_tolerance.py | 24 +++++++++---------- .../serve/test_llm_serve_integration.py | 2 ++ release/release_tests.yaml | 4 ++-- 3 files changed, 15 insertions(+), 15 deletions(-) diff --git a/release/llm_tests/serve/test_llm_serve_fault_tolerance.py b/release/llm_tests/serve/test_llm_serve_fault_tolerance.py index 9fedb7eff360..d81a234c3aa7 100644 --- a/release/llm_tests/serve/test_llm_serve_fault_tolerance.py +++ b/release/llm_tests/serve/test_llm_serve_fault_tolerance.py @@ -1,10 +1,10 @@ import time -from typing import Literal, List +from typing import Literal, List, Generator import pytest import ray from ray import serve -from ray.serve.llm import LLMConfig, build_openai_app, ModelLoadingConfig +from ray.serve.llm import LLMConfig, ModelLoadingConfig, build_llm_deployment MODEL_ID = "Qwen/Qwen2.5-0.5B-Instruct" RAY_MODEL_ID = "qwen-0.5b" @@ -20,7 +20,7 @@ def get_llm_config( model_source=MODEL_ID, ), deployment_config=dict( - name="llm_deployment", + name="test", num_replicas=2, ), engine_kwargs=dict( @@ -48,11 +48,10 @@ def kill_replica(replica_id: str) -> None: @pytest.fixture(name="app", scope="function") def start_ray_serve( tensor_parallel_size: int = 1, -) -> str: +) -> Generator: """Start Ray Serve with specified parallelism parameters.""" - ray_url = "http://localhost:8000" llm_config: LLMConfig = get_llm_config(tensor_parallel_size) - app = build_openai_app(dict(llm_configs=[llm_config])) + app = build_llm_deployment(llm_config, name_prefix="LLM:") serve.run(app, blocking=False) yield app serve.shutdown() @@ -62,8 +61,8 @@ def wait_for_deployment_status( deployment_name: str, status: Literal["HEALTHY", "UNHEALTHY"], timeout_s: int = 120 ) -> None: s = time.time() - print(f"Waiting for deployment {deployment_name} to become {status}") while time.time() - s < timeout_s: + print(f"Waiting for deployment {deployment_name} to become {status}") state = serve.status() if state.applications["default"].deployments[deployment_name].status == status: return @@ -76,21 +75,20 @@ def wait_for_deployment_status( def test_recovery_from_replica_failure(app) -> None: """Tests that the deployment recovers from replica failure.""" - - start_ray_serve(tensor_parallel_size=1) - wait_for_deployment_status("llm_deployment", "HEALTHY", timeout_s=60) + dname = "LLM:test" + wait_for_deployment_status(dname, "HEALTHY", timeout_s=60) # Kill both replicas - replica_ids = find_replica_ids("llm_deployment") + replica_ids = find_replica_ids(dname) for replica_id in replica_ids: print(f"Killing replica {replica_id}") kill_replica(replica_id) # wait for deployment to get unhealthy - wait_for_deployment_status("llm_deployment", "UNHEALTHY", timeout_s=60) + wait_for_deployment_status(dname, "UNHEALTHY", timeout_s=60) # Wait again for deployment to get healthy - wait_for_deployment_status("llm_deployment", "HEALTHY", timeout_s=60) + wait_for_deployment_status(dname, "HEALTHY", timeout_s=60) if __name__ == "__main__": diff --git a/release/llm_tests/serve/test_llm_serve_integration.py b/release/llm_tests/serve/test_llm_serve_integration.py index 97ee5c0d8fd4..c88dc8044b19 100644 --- a/release/llm_tests/serve/test_llm_serve_integration.py +++ b/release/llm_tests/serve/test_llm_serve_integration.py @@ -27,6 +27,7 @@ async def test_engine_metrics(): model="Qwen/Qwen2.5-0.5B-Instruct", dtype="auto", disable_log_stats=False, + enforce_eager=True, ) engine = AsyncLLM.from_engine_args( @@ -75,6 +76,7 @@ def remote_model_app(request): 
enable_chunked_prefill=True, enable_prefix_caching=True, trust_remote_code=remote_code, + enforce_eager=True, ), } diff --git a/release/release_tests.yaml b/release/release_tests.yaml index 9fb5e96a1eea..f0098df5010f 100644 --- a/release/release_tests.yaml +++ b/release/release_tests.yaml @@ -4288,7 +4288,7 @@ long_running: false script: pytest -vs test_llm_serve_correctness.py -- name: llm_serve_integration +- name: llm_serve_vllm_integration_tests frequency: nightly python: "3.11" group: llm-serve @@ -4307,7 +4307,7 @@ run: timeout: 3600 long_running: false - script: pytest -vs test_llm_serve_integration.py + script: pytest -vs test_llm_serve_integration.py test_llm_serve_fault_tolerance.py - name: llm_serve_llama_3dot1_8B_quantized_tp1_1p1d frequency: nightly From a58e780390fdcb0c1b0fa4614dfd8bb54f7c0256 Mon Sep 17 00:00:00 2001 From: Kourosh Hakhamaneshi Date: Thu, 19 Jun 2025 22:13:16 -0700 Subject: [PATCH 5/7] wip Signed-off-by: Kourosh Hakhamaneshi --- .../ray/llm/_internal/serve/configs/constants.py | 1 + .../_internal/serve/deployments/llm/llm_server.py | 15 ++++++++++++++- .../cpu/builders/test_application_builders.py | 1 + 3 files changed, 16 insertions(+), 1 deletion(-) diff --git a/python/ray/llm/_internal/serve/configs/constants.py b/python/ray/llm/_internal/serve/configs/constants.py index 796b97a5934e..7d1d8452c30b 100644 --- a/python/ray/llm/_internal/serve/configs/constants.py +++ b/python/ray/llm/_internal/serve/configs/constants.py @@ -65,6 +65,7 @@ ENABLE_VERBOSE_TELEMETRY = bool(int(os.getenv("RAYLLM_ENABLE_VERBOSE_TELEMETRY", "0"))) +RAYLLM_VLLM_ENGINE_CLS_ENV = "RAYLLM_VLLM_ENGINE_CLS" # The ratio of number of router replicas to number of model replicas. Default to 2 # meaning that there are 2 router replicas for every model replica. diff --git a/python/ray/llm/_internal/serve/deployments/llm/llm_server.py b/python/ray/llm/_internal/serve/deployments/llm/llm_server.py index f727268ec131..8b93bef98f48 100644 --- a/python/ray/llm/_internal/serve/deployments/llm/llm_server.py +++ b/python/ray/llm/_internal/serve/deployments/llm/llm_server.py @@ -5,6 +5,8 @@ # Third-party imports from ray import serve +from ray._common.utils import import_attr + # Local imports from ray.llm._internal.serve.configs.constants import ( @@ -12,6 +14,7 @@ DEFAULT_HEALTH_CHECK_TIMEOUT_S, ENGINE_START_TIMEOUT_S, MODEL_RESPONSE_BATCH_TIMEOUT_MS, + RAYLLM_VLLM_ENGINE_CLS_ENV, ) from ray.llm._internal.serve.configs.openai_api_models import ( ChatCompletionLogProb, @@ -435,7 +438,7 @@ async def __init__( """ await super().__init__(llm_config) - self._engine_cls = engine_cls or self._default_engine_cls + self._engine_cls = engine_cls or self._get_default_engine_class() self.engine: Optional[LLMEngine] = None if self._engine_cls is not None: self.engine = self._engine_cls(self._llm_config) @@ -467,6 +470,16 @@ async def __init__( )(lambda lora_model_id: self._load_model(lora_model_id)) self.response_postprocessor = ResponsePostprocessor() + + def _get_default_engine_class(self) -> Type[LLMEngine]: + """Helper to load the engine class from the environment variable. + This is used for testing or escape-hatch for patching purposes. + If env variable is not set, it will fallback to the default engine class. 
+ """ + engine_cls_path = os.environ.get(RAYLLM_VLLM_ENGINE_CLS_ENV) + if engine_cls_path: + return import_attr(engine_cls_path) + return self._default_engine_cls async def _start_engine(self): if self.engine is None: diff --git a/python/ray/llm/tests/serve/cpu/builders/test_application_builders.py b/python/ray/llm/tests/serve/cpu/builders/test_application_builders.py index f99f5b31493d..4a9fbf4a9dd1 100644 --- a/python/ray/llm/tests/serve/cpu/builders/test_application_builders.py +++ b/python/ray/llm/tests/serve/cpu/builders/test_application_builders.py @@ -83,6 +83,7 @@ def serve_config_separate_model_config_files(): class TestBuildOpenaiApp: + def test_build_openai_app(self, get_llm_serve_args, shutdown_ray_and_serve): """Test `build_openai_app` can build app and run it with Serve.""" From d5576a582f30f30d74d2f9e5eb386b84089fc807 Mon Sep 17 00:00:00 2001 From: Kourosh Hakhamaneshi Date: Thu, 19 Jun 2025 22:15:12 -0700 Subject: [PATCH 6/7] wip Signed-off-by: Kourosh Hakhamaneshi --- python/ray/llm/_internal/serve/deployments/llm/llm_server.py | 3 +-- .../llm/tests/serve/cpu/builders/test_application_builders.py | 1 - 2 files changed, 1 insertion(+), 3 deletions(-) diff --git a/python/ray/llm/_internal/serve/deployments/llm/llm_server.py b/python/ray/llm/_internal/serve/deployments/llm/llm_server.py index 8b93bef98f48..d1105db5afa8 100644 --- a/python/ray/llm/_internal/serve/deployments/llm/llm_server.py +++ b/python/ray/llm/_internal/serve/deployments/llm/llm_server.py @@ -7,7 +7,6 @@ from ray import serve from ray._common.utils import import_attr - # Local imports from ray.llm._internal.serve.configs.constants import ( DEFAULT_HEALTH_CHECK_PERIOD_S, @@ -470,7 +469,7 @@ async def __init__( )(lambda lora_model_id: self._load_model(lora_model_id)) self.response_postprocessor = ResponsePostprocessor() - + def _get_default_engine_class(self) -> Type[LLMEngine]: """Helper to load the engine class from the environment variable. This is used for testing or escape-hatch for patching purposes. diff --git a/python/ray/llm/tests/serve/cpu/builders/test_application_builders.py b/python/ray/llm/tests/serve/cpu/builders/test_application_builders.py index 4a9fbf4a9dd1..f99f5b31493d 100644 --- a/python/ray/llm/tests/serve/cpu/builders/test_application_builders.py +++ b/python/ray/llm/tests/serve/cpu/builders/test_application_builders.py @@ -83,7 +83,6 @@ def serve_config_separate_model_config_files(): class TestBuildOpenaiApp: - def test_build_openai_app(self, get_llm_serve_args, shutdown_ray_and_serve): """Test `build_openai_app` can build app and run it with Serve.""" From 40ab6af843ab47c5ef588e0dc91e54b3a48bc436 Mon Sep 17 00:00:00 2001 From: Kourosh Hakhamaneshi Date: Fri, 20 Jun 2025 14:22:47 -0700 Subject: [PATCH 7/7] wip Signed-off-by: Kourosh Hakhamaneshi --- .../ray/llm/tests/serve/cpu/deployments/routers/test_router.py | 2 -- 1 file changed, 2 deletions(-) diff --git a/python/ray/llm/tests/serve/cpu/deployments/routers/test_router.py b/python/ray/llm/tests/serve/cpu/deployments/routers/test_router.py index 90076a235cef..5ba14036df08 100644 --- a/python/ray/llm/tests/serve/cpu/deployments/routers/test_router.py +++ b/python/ray/llm/tests/serve/cpu/deployments/routers/test_router.py @@ -170,8 +170,6 @@ async def test_check_health(self, llm_config: LLMConfig): await router.check_health() - assert server.check_health.remote.call_count == 1 - if __name__ == "__main__": sys.exit(pytest.main(["-v", __file__]))
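[Editor's sketch] PATCH 5 restores the RAYLLM_VLLM_ENGINE_CLS escape hatch through _get_default_engine_class(), but without the old silent fallback: import_attr is called directly, so a bad path now raises instead of quietly using the default engine class. A hedged usage sketch follows; the mock module path is hypothetical, and any class implementing the LLMEngine interface (at minimum async start() and check_health()) would do.

    import os

    # Point the variable at an importable "module.ClassName" path before the LLMServer
    # deployment is constructed; import_attr resolves it when the replica initializes.
    # The path below is hypothetical and only for illustration.
    os.environ["RAYLLM_VLLM_ENGINE_CLS"] = "my_tests.mocks.MockVLLMEngine"

    # An engine_cls passed explicitly to LLMServer.__init__ still takes precedence,
    # since the constructor uses `engine_cls or self._get_default_engine_class()`,
    # and the env var in turn takes precedence over the class-level
    # _default_engine_cls attribute.

Per the release_tests.yaml change in PATCH 4, the new fault-tolerance test runs alongside the vLLM integration tests; locally it can be invoked with pytest -vs test_llm_serve_fault_tolerance.py on a node able to serve Qwen/Qwen2.5-0.5B-Instruct with vLLM.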