26 changes: 26 additions & 0 deletions doc/BUILD.bazel
@@ -239,6 +239,8 @@ py_test_run_all_subdirectory(
        "source/serve/doc_code/stable_diffusion.py",
        "source/serve/doc_code/object_detection.py",
        "source/serve/doc_code/vllm_example.py",
        "source/serve/doc_code/llm/llm_yaml_config_example.py",
        "source/serve/doc_code/llm/qwen_example.py",
    ],
    extra_srcs = [],
    tags = [
@@ -270,6 +272,30 @@ py_test_run_all_subdirectory(
    ],
)

# --------------------------------------------------------------------
# Test all doc/source/llm/doc_code/serve code included in rst/md files.
# --------------------------------------------------------------------

# Doc code for the Serve LLM examples, exposed so other doc targets can depend on these files.
filegroup(
    name = "serve_llm_examples",
    srcs = glob(["source/llm/doc_code/serve/**/*.py"]),
    visibility = ["//doc:__subpackages__"],
)

# GPU Tests
py_test_run_all_subdirectory(
    size = "large",
    include = ["source/llm/doc_code/serve/**/*.py"],
    exclude = [],
    extra_srcs = [],
    data = ["source/llm/doc_code/serve/qwen/llm_config_example.yaml"],
    tags = [
        "exclusive",
        "gpu",
        "team:llm",
    ],
)

# --------------------------------------------------------------------
# Test all doc/source/tune/doc_code code included in rst/md files.
# --------------------------------------------------------------------
29 changes: 29 additions & 0 deletions doc/source/llm/doc_code/serve/qwen/llm_config_example.yaml
@@ -0,0 +1,29 @@
# config.yaml
applications:
- args:
    llm_configs:
    - model_loading_config:
        model_id: qwen-0.5b
        model_source: Qwen/Qwen2.5-0.5B-Instruct
      accelerator_type: A10G
      deployment_config:
        autoscaling_config:
          min_replicas: 1
          max_replicas: 2
      runtime_env:
        env_vars:
          VLLM_USE_V1: "1"
    - model_loading_config:
        model_id: qwen-1.5b
        model_source: Qwen/Qwen2.5-1.5B-Instruct
      accelerator_type: A10G
      deployment_config:
        autoscaling_config:
          min_replicas: 1
          max_replicas: 2
      runtime_env:
        env_vars:
          VLLM_USE_V1: "1"
  import_path: ray.serve.llm:build_openai_app
  name: llm_app
  route_prefix: "/"
47 changes: 47 additions & 0 deletions doc/source/llm/doc_code/serve/qwen/llm_yaml_config_example.py
@@ -0,0 +1,47 @@
"""
This file serves as a documentation example and CI test for YAML config deployment.

Structure:
1. Monkeypatch setup: Ensures serve.run is non-blocking and removes accelerator requirements for CI testing.
2. Load YAML config and convert to Python using build_openai_app
3. Test validation (deployment status polling + cleanup)
"""

import time
import os
import yaml
from ray import serve
from ray.serve.schema import ApplicationStatus
from ray.serve._private.constants import SERVE_DEFAULT_APP_NAME
from ray.serve import llm


config_path = os.path.join(os.path.dirname(__file__), "llm_config_example.yaml")
with open(config_path, "r") as f:
    config_dict = yaml.safe_load(f)

llm_configs = config_dict["applications"][0]["args"]["llm_configs"]
for config in llm_configs:
    config.pop("accelerator_type", None)

app = llm.build_openai_app({"llm_configs": llm_configs})
serve.run(app, blocking=False)

status = ApplicationStatus.NOT_STARTED
timeout_seconds = 180
start_time = time.time()

while (
    status != ApplicationStatus.RUNNING and time.time() - start_time < timeout_seconds
):
    status = serve.status().applications[SERVE_DEFAULT_APP_NAME].status

    if status in [ApplicationStatus.DEPLOY_FAILED, ApplicationStatus.UNHEALTHY]:
        raise AssertionError(f"Deployment failed with status: {status}")

    time.sleep(1)

if status != ApplicationStatus.RUNNING:
    raise AssertionError(
        f"Deployment failed to reach RUNNING status within {timeout_seconds}s. Current status: {status}"
    )
84 changes: 84 additions & 0 deletions doc/source/llm/doc_code/serve/qwen/qwen_example.py
@@ -0,0 +1,84 @@
"""
This file serves as a documentation example and CI test.

Structure:
1. Monkeypatch setup: Ensures serve.run is non-blocking and removes accelerator requirements for CI testing.
2. Docs example (between __qwen_example_start/end__): Embedded in Sphinx docs via literalinclude.
3. Test validation (deployment status polling + cleanup)
"""

import time
from ray import serve
from ray.serve.schema import ApplicationStatus
from ray.serve._private.constants import SERVE_DEFAULT_APP_NAME
from ray.serve import llm

_original_serve_run = serve.run
_original_build_openai_app = llm.build_openai_app


def _non_blocking_serve_run(app, **kwargs):
    """Forces blocking=False for testing"""
    kwargs["blocking"] = False
    return _original_serve_run(app, **kwargs)


def _testing_build_openai_app(llm_serving_args):
    """Removes accelerator requirements for testing"""
    for config in llm_serving_args["llm_configs"]:
        config.accelerator_type = None

    return _original_build_openai_app(llm_serving_args)


serve.run = _non_blocking_serve_run
llm.build_openai_app = _testing_build_openai_app

# __qwen_example_start__
from ray import serve
from ray.serve.llm import LLMConfig, build_openai_app

llm_config = LLMConfig(
    model_loading_config={
        "model_id": "qwen-0.5b",
        "model_source": "Qwen/Qwen2.5-0.5B-Instruct",
    },
    deployment_config={
        "autoscaling_config": {
            "min_replicas": 1,
            "max_replicas": 2,
        }
    },
    # Pass the desired accelerator type (e.g. A10G, L4, etc.)
    accelerator_type="A10G",
    # You can customize the engine arguments (e.g. vLLM engine kwargs)
    engine_kwargs={
        "tensor_parallel_size": 2,
    },
    runtime_env={"env_vars": {"VLLM_USE_V1": "1"}},
)

app = build_openai_app({"llm_configs": [llm_config]})
serve.run(app, blocking=True)
# __qwen_example_end__

status = ApplicationStatus.NOT_STARTED
timeout_seconds = 180
start_time = time.time()

while (
    status != ApplicationStatus.RUNNING and time.time() - start_time < timeout_seconds
):
    status = serve.status().applications[SERVE_DEFAULT_APP_NAME].status

    if status in [ApplicationStatus.DEPLOY_FAILED, ApplicationStatus.UNHEALTHY]:
        raise AssertionError(f"Deployment failed with status: {status}")

    time.sleep(1)

if status != ApplicationStatus.RUNNING:
    raise AssertionError(
        f"Deployment failed to reach RUNNING status within {timeout_seconds}s. Current status: {status}"
    )

serve.shutdown()
56 changes: 6 additions & 50 deletions doc/source/serve/llm/serving-llms.rst
@@ -68,31 +68,10 @@ Deployment through :class:`LLMRouter <ray.serve.llm.LLMRouter>`
.. tab-item:: Builder Pattern
:sync: builder

.. code-block:: python

from ray import serve
from ray.serve.llm import LLMConfig, build_openai_app

llm_config = LLMConfig(
model_loading_config=dict(
model_id="qwen-0.5b",
model_source="Qwen/Qwen2.5-0.5B-Instruct",
),
deployment_config=dict(
autoscaling_config=dict(
min_replicas=1, max_replicas=2,
)
),
# Pass the desired accelerator type (e.g. A10G, L4, etc.)
accelerator_type="A10G",
# You can customize the engine arguments (e.g. vLLM engine kwargs)
engine_kwargs=dict(
tensor_parallel_size=2,
),
)

app = build_openai_app({"llm_configs": [llm_config]})
serve.run(app, blocking=True)
.. literalinclude:: ../../llm/doc_code/serve/qwen/qwen_example.py
:language: python
:start-after: __qwen_example_start__
:end-before: __qwen_example_end__
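
Once the application reports ``RUNNING``, you can query the OpenAI-compatible endpoint it exposes. The following is a minimal sketch, assuming the default Serve address of ``http://localhost:8000`` and the ``openai`` Python client; the API key is only a placeholder for a local deployment:

.. code-block:: python

    from openai import OpenAI

    # Point the client at the locally running Serve application.
    client = OpenAI(base_url="http://localhost:8000/v1", api_key="fake-key")

    response = client.chat.completions.create(
        model="qwen-0.5b",  # model_id from the LLMConfig above
        messages=[{"role": "user", "content": "Hello!"}],
    )
    print(response.choices[0].message.content)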

.. tab-item:: Bind Pattern
:sync: bind
@@ -259,31 +238,8 @@ For production deployments, Ray Serve LLM provides utilities for config-driven deployments
.. tab-item:: Inline Config
:sync: inline

.. code-block:: yaml

# config.yaml
applications:
- args:
llm_configs:
- model_loading_config:
model_id: qwen-0.5b
model_source: Qwen/Qwen2.5-0.5B-Instruct
accelerator_type: A10G
deployment_config:
autoscaling_config:
min_replicas: 1
max_replicas: 2
- model_loading_config:
model_id: qwen-1.5b
model_source: Qwen/Qwen2.5-1.5B-Instruct
accelerator_type: A10G
deployment_config:
autoscaling_config:
min_replicas: 1
max_replicas: 2
import_path: ray.serve.llm:build_openai_app
name: llm_app
route_prefix: "/"
.. literalinclude:: ../../llm/doc_code/serve/qwen/llm_config_example.yaml
:language: yaml
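
This config file can be deployed directly with the Serve CLI, for example ``serve run config.yaml``. If you instead want to build the same application in Python, the following is a minimal sketch, assuming the config is saved as ``config.yaml`` in the working directory:

.. code-block:: python

    import yaml

    from ray import serve
    from ray.serve.llm import build_openai_app

    # Load the Serve config and pass its llm_configs to the builder.
    with open("config.yaml") as f:
        config = yaml.safe_load(f)

    llm_configs = config["applications"][0]["args"]["llm_configs"]
    app = build_openai_app({"llm_configs": llm_configs})
    serve.run(app, blocking=True)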


.. tab-item:: Standalone Config