26 changes: 26 additions & 0 deletions doc/BUILD.bazel
@@ -239,6 +239,8 @@ py_test_run_all_subdirectory(
        "source/serve/doc_code/stable_diffusion.py",
        "source/serve/doc_code/object_detection.py",
        "source/serve/doc_code/vllm_example.py",
        "source/serve/doc_code/llm/llm_yaml_config_example.py",
        "source/serve/doc_code/llm/qwen_example.py",
    ],
    extra_srcs = [],
    tags = [
@@ -270,6 +272,30 @@ py_test_run_all_subdirectory(
    ],
)

# --------------------------------------------------------------------
# Test all doc/source/llm/doc_code/serve code included in rst/md files.
# --------------------------------------------------------------------

# Doc code for the Serve LLM examples, exposed so other doc targets can depend on these files.
filegroup(
    name = "serve_llm_examples",
    srcs = glob(["source/llm/doc_code/serve/**/*.py"]),
    visibility = ["//doc:__subpackages__"],
)

# GPU Tests
py_test_run_all_subdirectory(
    size = "large",
    include = ["source/llm/doc_code/serve/**/*.py"],
    exclude = [],
    extra_srcs = [],
    data = ["source/llm/doc_code/serve/qwen/llm_config_example.yaml"],
    tags = [
        "exclusive",
        "gpu",
        "team:llm",
    ],
)

# --------------------------------------------------------------------
# Test all doc/source/tune/doc_code code included in rst/md files.
# --------------------------------------------------------------------
29 changes: 29 additions & 0 deletions doc/source/llm/doc_code/serve/qwen/llm_config_example.yaml
@@ -0,0 +1,29 @@
# config.yaml
applications:
- args:
    llm_configs:
    - model_loading_config:
        model_id: qwen-0.5b
        model_source: Qwen/Qwen2.5-0.5B-Instruct
      accelerator_type: A10G
      deployment_config:
        autoscaling_config:
          min_replicas: 1
          max_replicas: 2
      runtime_env:
        env_vars:
          VLLM_USE_V1: "1"
    - model_loading_config:
        model_id: qwen-1.5b
        model_source: Qwen/Qwen2.5-1.5B-Instruct
      accelerator_type: A10G
      deployment_config:
        autoscaling_config:
          min_replicas: 1
          max_replicas: 2
      runtime_env:
        env_vars:
          VLLM_USE_V1: "1"
  import_path: ray.serve.llm:build_openai_app
  name: llm_app
  route_prefix: "/"
47 changes: 47 additions & 0 deletions doc/source/llm/doc_code/serve/qwen/llm_yaml_config_example.py
@@ -0,0 +1,47 @@
"""
This file serves as a documentation example and CI test for YAML config deployment.

Structure:
1. Monkeypatch setup: Ensures serve.run is non-blocking and removes accelerator requirements for CI testing.
2. Load YAML config and convert to Python using build_openai_app
3. Test validation (deployment status polling + cleanup)
"""

import time
import os
import yaml
from ray import serve
from ray.serve.schema import ApplicationStatus
from ray.serve._private.constants import SERVE_DEFAULT_APP_NAME
from ray.serve import llm


config_path = os.path.join(os.path.dirname(__file__), "llm_config_example.yaml")
with open(config_path, "r") as f:
    config_dict = yaml.safe_load(f)

llm_configs = config_dict["applications"][0]["args"]["llm_configs"]
for config in llm_configs:
    config.pop("accelerator_type", None)

app = llm.build_openai_app({"llm_configs": llm_configs})
serve.run(app, blocking=False)

status = ApplicationStatus.NOT_STARTED
timeout_seconds = 180
start_time = time.time()

while (
    status != ApplicationStatus.RUNNING and time.time() - start_time < timeout_seconds
):
    status = serve.status().applications[SERVE_DEFAULT_APP_NAME].status

    if status in [ApplicationStatus.DEPLOY_FAILED, ApplicationStatus.UNHEALTHY]:
        raise AssertionError(f"Deployment failed with status: {status}")

    time.sleep(1)

if status != ApplicationStatus.RUNNING:
    raise AssertionError(
        f"Deployment failed to reach RUNNING status within {timeout_seconds}s. Current status: {status}"
    )
84 changes: 84 additions & 0 deletions doc/source/llm/doc_code/serve/qwen/qwen_example.py
@@ -0,0 +1,84 @@
"""
This file serves as a documentation example and CI test.

Structure:
1. Monkeypatch setup: Ensures serve.run is non-blocking and removes accelerator requirements for CI testing.
2. Docs example (between __qwen_example_start/end__): Embedded in Sphinx docs via literalinclude.
3. Test validation (deployment status polling + cleanup)
"""

import time
from ray import serve
from ray.serve.schema import ApplicationStatus
from ray.serve._private.constants import SERVE_DEFAULT_APP_NAME
from ray.serve import llm

_original_serve_run = serve.run
_original_build_openai_app = llm.build_openai_app


def _non_blocking_serve_run(app, **kwargs):
    """Forces blocking=False for testing"""
    kwargs["blocking"] = False
    return _original_serve_run(app, **kwargs)


def _testing_build_openai_app(llm_serving_args):
    """Removes accelerator requirements for testing"""
    for config in llm_serving_args["llm_configs"]:
        config.accelerator_type = None

    return _original_build_openai_app(llm_serving_args)


serve.run = _non_blocking_serve_run
llm.build_openai_app = _testing_build_openai_app

# __qwen_example_start__
from ray import serve
from ray.serve.llm import LLMConfig, build_openai_app

llm_config = LLMConfig(
    model_loading_config={
        "model_id": "qwen-0.5b",
        "model_source": "Qwen/Qwen2.5-0.5B-Instruct",
    },
    deployment_config={
        "autoscaling_config": {
            "min_replicas": 1,
            "max_replicas": 2,
        }
    },
    # Pass the desired accelerator type (e.g. A10G, L4, etc.)
    accelerator_type="A10G",
    # You can customize the engine arguments (e.g. vLLM engine kwargs)
    engine_kwargs={
        "tensor_parallel_size": 2,
    },
    runtime_env={"env_vars": {"VLLM_USE_V1": "1"}},
)

app = build_openai_app({"llm_configs": [llm_config]})
serve.run(app, blocking=True)
# __qwen_example_end__

status = ApplicationStatus.NOT_STARTED
timeout_seconds = 180
start_time = time.time()

while (
    status != ApplicationStatus.RUNNING and time.time() - start_time < timeout_seconds
):
    status = serve.status().applications[SERVE_DEFAULT_APP_NAME].status

    if status in [ApplicationStatus.DEPLOY_FAILED, ApplicationStatus.UNHEALTHY]:
        raise AssertionError(f"Deployment failed with status: {status}")

    time.sleep(1)

if status != ApplicationStatus.RUNNING:
    raise AssertionError(
        f"Deployment failed to reach RUNNING status within {timeout_seconds}s. Current status: {status}"
    )

serve.shutdown()
56 changes: 6 additions & 50 deletions doc/source/serve/llm/serving-llms.rst
@@ -68,31 +68,10 @@ Deployment through :class:`LLMRouter <ray.serve.llm.LLMRouter>`
.. tab-item:: Builder Pattern
:sync: builder

.. code-block:: python

from ray import serve
from ray.serve.llm import LLMConfig, build_openai_app

llm_config = LLMConfig(
model_loading_config=dict(
model_id="qwen-0.5b",
model_source="Qwen/Qwen2.5-0.5B-Instruct",
),
deployment_config=dict(
autoscaling_config=dict(
min_replicas=1, max_replicas=2,
)
),
# Pass the desired accelerator type (e.g. A10G, L4, etc.)
accelerator_type="A10G",
# You can customize the engine arguments (e.g. vLLM engine kwargs)
engine_kwargs=dict(
tensor_parallel_size=2,
),
)

app = build_openai_app({"llm_configs": [llm_config]})
serve.run(app, blocking=True)
.. literalinclude:: ../../llm/doc_code/serve/qwen/qwen_example.py
:language: python
:start-after: __qwen_example_start__
:end-before: __qwen_example_end__
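
Once the application reports ``RUNNING``, you can query the OpenAI-compatible endpoint it exposes. The following is a minimal sketch, assuming the default Serve address of ``http://localhost:8000`` and the ``openai`` Python client; the API key is only a placeholder for a local deployment:

.. code-block:: python

    from openai import OpenAI

    # Point the client at the locally running Serve application.
    client = OpenAI(base_url="http://localhost:8000/v1", api_key="fake-key")

    response = client.chat.completions.create(
        model="qwen-0.5b",  # model_id from the LLMConfig above
        messages=[{"role": "user", "content": "Hello!"}],
    )
    print(response.choices[0].message.content)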

.. tab-item:: Bind Pattern
:sync: bind
@@ -259,31 +238,8 @@ For production deployments, Ray Serve LLM provides utilities for config-driven deployments
.. tab-item:: Inline Config
:sync: inline

.. code-block:: yaml

# config.yaml
applications:
- args:
llm_configs:
- model_loading_config:
model_id: qwen-0.5b
model_source: Qwen/Qwen2.5-0.5B-Instruct
accelerator_type: A10G
deployment_config:
autoscaling_config:
min_replicas: 1
max_replicas: 2
- model_loading_config:
model_id: qwen-1.5b
model_source: Qwen/Qwen2.5-1.5B-Instruct
accelerator_type: A10G
deployment_config:
autoscaling_config:
min_replicas: 1
max_replicas: 2
import_path: ray.serve.llm:build_openai_app
name: llm_app
route_prefix: "/"
.. literalinclude:: ../../llm/doc_code/serve/qwen/llm_config_example.yaml
:language: yaml
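
This config file can be deployed directly with the Serve CLI, for example ``serve run config.yaml``. If you instead want to build the same application in Python, the following is a minimal sketch, assuming the config is saved as ``config.yaml`` in the working directory:

.. code-block:: python

    import yaml

    from ray import serve
    from ray.serve.llm import build_openai_app

    # Load the Serve config and pass its llm_configs to the builder.
    with open("config.yaml") as f:
        config = yaml.safe_load(f)

    llm_configs = config["applications"][0]["args"]["llm_configs"]
    app = build_openai_app({"llm_configs": llm_configs})
    serve.run(app, blocking=True)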


.. tab-item:: Standalone Config