diff --git a/doc/BUILD.bazel b/doc/BUILD.bazel
index 8e0cf7adde4e..2d0ac0634687 100644
--- a/doc/BUILD.bazel
+++ b/doc/BUILD.bazel
@@ -239,6 +239,8 @@ py_test_run_all_subdirectory(
         "source/serve/doc_code/stable_diffusion.py",
         "source/serve/doc_code/object_detection.py",
         "source/serve/doc_code/vllm_example.py",
+        "source/serve/doc_code/llm/llm_yaml_config_example.py",
+        "source/serve/doc_code/llm/qwen_example.py",
     ],
     extra_srcs = [],
     tags = [
@@ -270,6 +272,30 @@ py_test_run_all_subdirectory(
     ],
 )
 
+# --------------------------------------------------------------------
+# Test all doc/source/llm/doc_code/serve code included in rst/md files.
+# --------------------------------------------------------------------
+
+filegroup(
+    name = "serve_llm_examples",
+    srcs = glob(["source/llm/doc_code/serve/**/*.py"]),
+    visibility = ["//doc:__subpackages__"],
+)
+
+# GPU Tests
+py_test_run_all_subdirectory(
+    size = "large",
+    include = ["source/llm/doc_code/serve/**/*.py"],
+    exclude = [],
+    extra_srcs = [],
+    data = ["source/llm/doc_code/serve/qwen/llm_config_example.yaml"],
+    tags = [
+        "exclusive",
+        "gpu",
+        "team:llm",
+    ],
+)
+
 # --------------------------------------------------------------------
 # Test all doc/source/tune/doc_code code included in rst/md files.
 # --------------------------------------------------------------------
diff --git a/doc/source/llm/doc_code/serve/qwen/llm_config_example.yaml b/doc/source/llm/doc_code/serve/qwen/llm_config_example.yaml
new file mode 100644
index 000000000000..cd5302b6f637
--- /dev/null
+++ b/doc/source/llm/doc_code/serve/qwen/llm_config_example.yaml
@@ -0,0 +1,29 @@
+# config.yaml
+applications:
+- args:
+    llm_configs:
+      - model_loading_config:
+          model_id: qwen-0.5b
+          model_source: Qwen/Qwen2.5-0.5B-Instruct
+        accelerator_type: A10G
+        deployment_config:
+          autoscaling_config:
+            min_replicas: 1
+            max_replicas: 2
+        runtime_env:
+          env_vars:
+            VLLM_USE_V1: "1"
+      - model_loading_config:
+          model_id: qwen-1.5b
+          model_source: Qwen/Qwen2.5-1.5B-Instruct
+        accelerator_type: A10G
+        deployment_config:
+          autoscaling_config:
+            min_replicas: 1
+            max_replicas: 2
+        runtime_env:
+          env_vars:
+            VLLM_USE_V1: "1"
+  import_path: ray.serve.llm:build_openai_app
+  name: llm_app
+  route_prefix: "/"
\ No newline at end of file
diff --git a/doc/source/llm/doc_code/serve/qwen/llm_yaml_config_example.py b/doc/source/llm/doc_code/serve/qwen/llm_yaml_config_example.py
new file mode 100644
index 000000000000..1f921f886716
--- /dev/null
+++ b/doc/source/llm/doc_code/serve/qwen/llm_yaml_config_example.py
@@ -0,0 +1,47 @@
+"""
+This file serves as a documentation example and CI test for YAML config deployment.
+
+Structure:
+1. Config loading: reads the YAML config and strips accelerator requirements for CI testing.
+2. Deployment: builds the app with build_openai_app and deploys it non-blocking via serve.run.
+3. Test validation: polls the deployment status until it reaches RUNNING.
+"""
+
+import time
+import os
+import yaml
+from ray import serve
+from ray.serve.schema import ApplicationStatus
+from ray.serve._private.constants import SERVE_DEFAULT_APP_NAME
+from ray.serve import llm
+
+
+config_path = os.path.join(os.path.dirname(__file__), "llm_config_example.yaml")
+with open(config_path, "r") as f:
+    config_dict = yaml.safe_load(f)
+
+llm_configs = config_dict["applications"][0]["args"]["llm_configs"]
+for config in llm_configs:
+    config.pop("accelerator_type", None)
+
+app = llm.build_openai_app({"llm_configs": llm_configs})
+serve.run(app, blocking=False)
+
+status = ApplicationStatus.NOT_STARTED
+timeout_seconds = 180
+start_time = time.time()
+
+while (
+    status != ApplicationStatus.RUNNING and time.time() - start_time < timeout_seconds
+):
+    status = serve.status().applications[SERVE_DEFAULT_APP_NAME].status
+
+    if status in [ApplicationStatus.DEPLOY_FAILED, ApplicationStatus.UNHEALTHY]:
+        raise AssertionError(f"Deployment failed with status: {status}")
+
+    time.sleep(1)
+
+if status != ApplicationStatus.RUNNING:
+    raise AssertionError(
+        f"Deployment failed to reach RUNNING status within {timeout_seconds}s. Current status: {status}"
+    )
diff --git a/doc/source/llm/doc_code/serve/qwen/qwen_example.py b/doc/source/llm/doc_code/serve/qwen/qwen_example.py
new file mode 100644
index 000000000000..791405940351
--- /dev/null
+++ b/doc/source/llm/doc_code/serve/qwen/qwen_example.py
@@ -0,0 +1,84 @@
+"""
+This file serves as a documentation example and CI test.
+
+Structure:
+1. Monkeypatch setup: Ensures serve.run is non-blocking and removes accelerator requirements for CI testing.
+2. Docs example (between __qwen_example_start/end__): Embedded in Sphinx docs via literalinclude.
+3. Test validation (deployment status polling + cleanup)
+"""
+
+import time
+from ray import serve
+from ray.serve.schema import ApplicationStatus
+from ray.serve._private.constants import SERVE_DEFAULT_APP_NAME
+from ray.serve import llm
+
+_original_serve_run = serve.run
+_original_build_openai_app = llm.build_openai_app
+
+
+def _non_blocking_serve_run(app, **kwargs):
+    """Forces blocking=False for testing"""
+    kwargs["blocking"] = False
+    return _original_serve_run(app, **kwargs)
+
+
+def _testing_build_openai_app(llm_serving_args):
+    """Removes accelerator requirements for testing"""
+    for config in llm_serving_args["llm_configs"]:
+        config.accelerator_type = None
+
+    return _original_build_openai_app(llm_serving_args)
+
+
+serve.run = _non_blocking_serve_run
+llm.build_openai_app = _testing_build_openai_app
+
+# __qwen_example_start__
+from ray import serve
+from ray.serve.llm import LLMConfig, build_openai_app
+
+llm_config = LLMConfig(
+    model_loading_config={
+        "model_id": "qwen-0.5b",
+        "model_source": "Qwen/Qwen2.5-0.5B-Instruct",
+    },
+    deployment_config={
+        "autoscaling_config": {
+            "min_replicas": 1,
+            "max_replicas": 2,
+        }
+    },
+    # Pass the desired accelerator type (e.g. A10G, L4, etc.)
+    accelerator_type="A10G",
+    # You can customize the engine arguments (e.g. vLLM engine kwargs)
+    engine_kwargs={
+        "tensor_parallel_size": 2,
+    },
+    runtime_env={"env_vars": {"VLLM_USE_V1": "1"}},
+)
+
+app = build_openai_app({"llm_configs": [llm_config]})
+serve.run(app, blocking=True)
+# __qwen_example_end__
+
+status = ApplicationStatus.NOT_STARTED
+timeout_seconds = 180
+start_time = time.time()
+
+while (
+    status != ApplicationStatus.RUNNING and time.time() - start_time < timeout_seconds
+):
+    status = serve.status().applications[SERVE_DEFAULT_APP_NAME].status
+
+    if status in [ApplicationStatus.DEPLOY_FAILED, ApplicationStatus.UNHEALTHY]:
+        raise AssertionError(f"Deployment failed with status: {status}")
+
+    time.sleep(1)
+
+if status != ApplicationStatus.RUNNING:
+    raise AssertionError(
+        f"Deployment failed to reach RUNNING status within {timeout_seconds}s. Current status: {status}"
+    )
+
+serve.shutdown()
diff --git a/doc/source/serve/llm/serving-llms.rst b/doc/source/serve/llm/serving-llms.rst
index cbba54e26e88..9d694c32a76c 100644
--- a/doc/source/serve/llm/serving-llms.rst
+++ b/doc/source/serve/llm/serving-llms.rst
@@ -68,31 +68,10 @@ Deployment through :class:`LLMRouter `
     .. tab-item:: Builder Pattern
         :sync: builder
 
-        .. code-block:: python
-
-            from ray import serve
-            from ray.serve.llm import LLMConfig, build_openai_app
-
-            llm_config = LLMConfig(
-                model_loading_config=dict(
-                    model_id="qwen-0.5b",
-                    model_source="Qwen/Qwen2.5-0.5B-Instruct",
-                ),
-                deployment_config=dict(
-                    autoscaling_config=dict(
-                        min_replicas=1, max_replicas=2,
-                    )
-                ),
-                # Pass the desired accelerator type (e.g. A10G, L4, etc.)
-                accelerator_type="A10G",
-                # You can customize the engine arguments (e.g. vLLM engine kwargs)
-                engine_kwargs=dict(
-                    tensor_parallel_size=2,
-                ),
-            )
-
-            app = build_openai_app({"llm_configs": [llm_config]})
-            serve.run(app, blocking=True)
+        .. literalinclude:: ../../llm/doc_code/serve/qwen/qwen_example.py
+            :language: python
+            :start-after: __qwen_example_start__
+            :end-before: __qwen_example_end__
 
     .. tab-item:: Bind Pattern
         :sync: bind
@@ -259,31 +238,8 @@ For production deployments, Ray Serve LLM provides utilities for config-driven d
     .. tab-item:: Inline Config
         :sync: inline
 
-        .. code-block:: yaml
-
-            # config.yaml
-            applications:
-            - args:
-                llm_configs:
-                  - model_loading_config:
-                      model_id: qwen-0.5b
-                      model_source: Qwen/Qwen2.5-0.5B-Instruct
-                    accelerator_type: A10G
-                    deployment_config:
-                      autoscaling_config:
-                        min_replicas: 1
-                        max_replicas: 2
-                  - model_loading_config:
-                      model_id: qwen-1.5b
-                      model_source: Qwen/Qwen2.5-1.5B-Instruct
-                    accelerator_type: A10G
-                    deployment_config:
-                      autoscaling_config:
-                        min_replicas: 1
-                        max_replicas: 2
-              import_path: ray.serve.llm:build_openai_app
-              name: llm_app
-              route_prefix: "/"
+        .. literalinclude:: ../../llm/doc_code/serve/qwen/llm_config_example.yaml
+            :language: yaml
 
     .. tab-item:: Standalone Config
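
For reviewers who want to exercise the new config by hand, the snippet below is a minimal client-side check. It is not part of the diff: it assumes llm_config_example.yaml has been deployed locally (for example with `serve deploy config.yaml`), that Serve is listening on its default port 8000 with route_prefix "/", and that the `openai` client package is installed; the model ID comes from the config above.

# Hedged usage sketch, not included in the PR.
# Assumes a local deployment of llm_config_example.yaml on port 8000.
from openai import OpenAI

# Ray Serve LLM exposes an OpenAI-compatible API; any placeholder API key works locally.
client = OpenAI(base_url="http://localhost:8000/v1", api_key="fake-key")

# "qwen-0.5b" is the model_id declared in llm_config_example.yaml.
response = client.chat.completions.create(
    model="qwen-0.5b",
    messages=[{"role": "user", "content": "Say hello in one sentence."}],
)
print(response.choices[0].message.content)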