Commit 4706bf1

[serve][llm] Unify and Extend Builder Configuration for LLM Deployments (#57724)
Signed-off-by: Kourosh Hakhamaneshi <kourosh@anyscale.com>
1 parent c8c446d commit 4706bf1

File tree

14 files changed (+724, -391 lines changed)

python/ray/llm/_internal/common/base_pydantic.py

Lines changed: 6 additions & 0 deletions
@@ -23,3 +23,9 @@ def parse_yaml(cls: Type[ModelT], file, **kwargs) -> ModelT:
         kwargs.setdefault("Loader", yaml.SafeLoader)
         dict_args = yaml.load(file, **kwargs)
         return cls.model_validate(dict_args)
+
+    @classmethod
+    def from_file(cls: Type[ModelT], path: str, **kwargs) -> ModelT:
+        """Load a model from a YAML file path."""
+        with open(path, "r") as f:
+            return cls.parse_yaml(f, **kwargs)
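
For reference, a minimal usage sketch of the new from_file helper, assuming a hypothetical config.yaml whose keys match the target model's schema (the file name and YAML contents are illustrative, not part of this commit):

    from ray.serve.llm import LLMConfig

    # Hypothetical YAML file holding LLMConfig fields, e.g.:
    #   model_loading_config:
    #     model_id: my-model
    # from_file opens the path and delegates to parse_yaml, which
    # safe-loads the YAML and runs pydantic validation.
    config = LLMConfig.from_file("config.yaml")

Since from_file lives on BaseModelExtended, every model that extends it (LLMConfig included) picks up the same loader, which is what lets PDServingArgs below accept plain file paths for its prefill and decode configs.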

python/ray/llm/_internal/serve/configs/server_models.py

Lines changed: 1 addition & 1 deletion
@@ -375,7 +375,7 @@ def validate_experimental_configs(cls, value: Dict[str, Any]) -> Dict[str, Any]:
 
     @model_validator(mode="after")
     def _check_log_stats_with_metrics(self):
-        # Require disable_log_stats is not set to True when log_engine_metrics is enabled.
+        """Validate that disable_log_stats isn't enabled when log_engine_metrics is enabled."""
         if self.log_engine_metrics and self.engine_kwargs.get("disable_log_stats"):
             raise ValueError(
                 "disable_log_stats cannot be set to True when log_engine_metrics is enabled. "
Lines changed: 164 additions & 0 deletions
@@ -0,0 +1,164 @@
+"""Using Ray Serve to deploy LLM models with P/D disaggregation.
+"""
+from typing import Any, Optional, Union
+
+from pydantic import Field, field_validator, model_validator
+
+from ray import serve
+from ray.llm._internal.common.base_pydantic import BaseModelExtended
+from ray.llm._internal.common.dict_utils import deep_merge_dicts
+from ray.llm._internal.serve.deployments.prefill_decode_disagg.pd import PDProxyServer
+from ray.llm._internal.serve.deployments.routers.builder_ingress import (
+    IngressClsConfig,
+    load_class,
+)
+from ray.llm._internal.serve.deployments.routers.router import (
+    make_fastapi_ingress,
+)
+from ray.serve.deployment import Application
+from ray.serve.llm import (
+    LLMConfig,
+    build_llm_deployment,
+)
+
+
+class ProxyClsConfig(BaseModelExtended):
+    proxy_cls: Union[str, type[PDProxyServer]] = Field(
+        default=PDProxyServer,
+        description="The proxy class or the class module path to use.",
+    )
+
+    proxy_extra_kwargs: Optional[dict] = Field(
+        default_factory=dict,
+        description="The kwargs to bind to the proxy deployment. This will be passed to the proxy class constructor.",
+    )
+
+    @field_validator("proxy_cls")
+    @classmethod
+    def validate_class(
+        cls, value: Union[str, type[PDProxyServer]]
+    ) -> type[PDProxyServer]:
+        if isinstance(value, str):
+            return load_class(value)
+        return value
+
+
+class PDServingArgs(BaseModelExtended):
+    """Schema for P/D serving args."""
+
+    prefill_config: Union[str, dict, LLMConfig]
+    decode_config: Union[str, dict, LLMConfig]
+    proxy_cls_config: Union[dict, ProxyClsConfig] = Field(
+        default_factory=ProxyClsConfig,
+        description="The configuration for the proxy class.",
+    )
+    proxy_deployment_config: Optional[dict] = Field(
+        default_factory=dict,
+        description="The Ray @server.deployment options for the proxy server.",
+    )
+    ingress_cls_config: Union[dict, IngressClsConfig] = Field(
+        default_factory=IngressClsConfig,
+        description="The configuration for the ingress class.",
+    )
+    ingress_deployment_config: Optional[dict] = Field(
+        default_factory=dict,
+        description="The Ray @server.deployment options for the ingress.",
+    )
+
+    @field_validator("prefill_config", "decode_config")
+    @classmethod
+    def _validate_llm_config(cls, value: Any) -> LLMConfig:
+        if isinstance(value, str):
+            return LLMConfig.from_file(value)
+        elif isinstance(value, dict):
+            return LLMConfig.model_validate(value)
+        elif isinstance(value, LLMConfig):
+            return value
+        else:
+            raise TypeError(f"Invalid LLMConfig type: {type(value)}")
+
+    @field_validator("proxy_cls_config")
+    @classmethod
+    def _validate_proxy_cls_config(
+        cls, value: Union[dict, ProxyClsConfig]
+    ) -> ProxyClsConfig:
+        if isinstance(value, dict):
+            return ProxyClsConfig.model_validate(value)
+        return value
+
+    @field_validator("ingress_cls_config")
+    @classmethod
+    def _validate_ingress_cls_config(
+        cls, value: Union[dict, IngressClsConfig]
+    ) -> IngressClsConfig:
+        if isinstance(value, dict):
+            return IngressClsConfig.model_validate(value)
+        return value
+
+    @model_validator(mode="after")
+    def _validate_model_ids(self):
+        """Validate that prefill and decode configs use the same model ID."""
+        if self.prefill_config.model_id != self.decode_config.model_id:
+            raise ValueError("P/D model id mismatch")
+        return self
+
+    @model_validator(mode="after")
+    def _validate_kv_transfer_config(self):
+        """Validate that kv_transfer_config is set for both prefill and decode configs."""
+        for config in [self.prefill_config, self.decode_config]:
+            if config.engine_kwargs.get("kv_transfer_config") is None:
+                raise ValueError(
+                    "kv_transfer_config is required for P/D disaggregation"
+                )
+        return self
+
+
+def build_pd_openai_app(pd_serving_args: dict) -> Application:
+    """Build a deployable application utilizing prefill/decode disaggregation."""
+    pd_config = PDServingArgs.model_validate(pd_serving_args)
+
+    prefill_deployment = build_llm_deployment(
+        pd_config.prefill_config, name_prefix="Prefill:"
+    )
+    decode_deployment = build_llm_deployment(
+        pd_config.decode_config, name_prefix="Decode:"
+    )
+
+    # Get the default deployment options from the PDProxyServer class based on the prefill and decode configs.
+    proxy_cls_config = pd_config.proxy_cls_config
+
+    pd_proxy_server_options = proxy_cls_config.proxy_cls.get_deployment_options(
+        pd_config.prefill_config, pd_config.decode_config
+    )
+
+    # Override if the proxy deployment config is provided.
+    if pd_config.proxy_deployment_config:
+        pd_proxy_server_options = deep_merge_dicts(
+            pd_proxy_server_options, pd_config.proxy_deployment_config
+        )
+
+    proxy_server_deployment = (
+        serve.deployment(proxy_cls_config.proxy_cls)
+        .options(**pd_proxy_server_options)
+        .bind(
+            prefill_server=prefill_deployment,
+            decode_server=decode_deployment,
+            **proxy_cls_config.proxy_extra_kwargs,
+        )
+    )
+
+    ingress_cls_config = pd_config.ingress_cls_config
+    ingress_options = ingress_cls_config.ingress_cls.get_deployment_options(
+        [pd_config.prefill_config, pd_config.decode_config]
+    )
+
+    if pd_config.ingress_deployment_config:
+        ingress_options = deep_merge_dicts(
+            ingress_options, pd_config.ingress_deployment_config
+        )
+
+    ingress_cls = make_fastapi_ingress(ingress_cls_config.ingress_cls)
+    return serve.deployment(ingress_cls, **ingress_options).bind(
+        llm_deployments=[proxy_server_deployment],
+        **ingress_cls_config.ingress_extra_kwargs,
+    )
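
The new builder accepts file paths, dicts, or LLMConfig objects for the prefill and decode configs, plus optional proxy and ingress overrides that are deep-merged over the class defaults. A usage sketch, assuming build_pd_openai_app keeps its public re-export from ray.serve.llm; all model ids, paths, and option values are illustrative, and kv_transfer_config must now be supplied explicitly since the old NixlConnector default (removed in the next diff) is gone:

    from ray import serve
    from ray.serve.llm import build_pd_openai_app

    # Illustrative NIXL-style transfer config, now required for both roles.
    kv_transfer_config = dict(kv_connector="NixlConnector", kv_role="kv_both")

    pd_app = build_pd_openai_app(
        dict(
            prefill_config=dict(
                model_loading_config=dict(model_id="my-model"),
                engine_kwargs=dict(kv_transfer_config=kv_transfer_config),
            ),
            decode_config="decode_config.yaml",  # hypothetical YAML path
            # Deep-merged over the proxy class's default deployment options.
            proxy_deployment_config=dict(max_ongoing_requests=128),
        )
    )
    serve.run(pd_app)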
Lines changed: 10 additions & 89 deletions
@@ -1,13 +1,9 @@
 """Using Ray Serve to deploy LLM models with P/D disaggregation.
 """
 import logging
-import uuid
 from typing import Any, AsyncGenerator, Dict, Union
 
-from pydantic import Field
-
-from ray import serve
-from ray.llm._internal.common.base_pydantic import BaseModelExtended
+from ray.llm._internal.serve.configs.constants import DEFAULT_MAX_ONGOING_REQUESTS
 from ray.llm._internal.serve.configs.openai_api_models import (
     ChatCompletionRequest,
     ChatCompletionResponse,
@@ -18,53 +14,15 @@
     ErrorResponse,
 )
 from ray.llm._internal.serve.deployments.llm.llm_server import LLMServer
-from ray.llm._internal.serve.deployments.routers.builder_ingress import (
-    parse_args as parse_llm_configs,
-)
-from ray.llm._internal.serve.deployments.routers.router import (
-    OpenAiIngress,
-    make_fastapi_ingress,
-)
-from ray.serve.deployment import Application
 from ray.serve.handle import DeploymentHandle
-from ray.serve.llm import (
-    LLMConfig,
-    build_llm_deployment,
-)
+from ray.serve.llm import LLMConfig
 
 logger = logging.getLogger(__name__)
 RequestType = Union[ChatCompletionRequest, CompletionRequest]
 
-
-class PDServingArgs(BaseModelExtended):
-    """Schema for P/D serving args."""
-
-    prefill_config: Union[str, LLMConfig]
-    decode_config: Union[str, LLMConfig]
-    proxy_deployment_config: Dict[str, Any] = Field(
-        default_factory=dict,
-        description="""
-        The Ray @server.deployment options for the proxy server.
-        """,
-    )
-
-    def parse_args(self) -> "PDServingArgs":
-        """Converts this LLMServingArgs object into an DeployArgs object."""
-
-        def parse_configs_and_cast_type(config: Union[str, LLMConfig]) -> LLMConfig:
-            # ray.serve.llm.__init__ imports internal LLMConfig, and extends it to external-facing LLMConfig.
-            # parse_llm_configs returns internal LLMConfig, while {prefill, decode}_configs expect external-facing LLMConfig.
-            # So the model_dump() here is to convert the type, to satisfy pydantic.
-            # TODO(lk-chen): refactor llm_config parsing to avoid this model_dump, and make llm_config more reusable.
-            config = parse_llm_configs([config])[0]
-            return LLMConfig(**config.model_dump())
-
-        return PDServingArgs(
-            # Parse string file path into LLMConfig
-            prefill_config=parse_configs_and_cast_type(self.prefill_config),
-            decode_config=parse_configs_and_cast_type(self.decode_config),
-            proxy_deployment_config=self.proxy_deployment_config,
-        )
+DEFAULT_PD_PROXY_SERVER_OPTIONS = {
+    "max_ongoing_requests": DEFAULT_MAX_ONGOING_REQUESTS,
+}
 
 
 class PDProxyServer(LLMServer):
@@ -171,45 +129,8 @@ async def completions(
     ) -> AsyncGenerator[Union[str, CompletionResponse, ErrorResponse], None]:
         return self._handle_request(request)
 
-
-def build_pd_openai_app(pd_serving_args: dict) -> Application:
-    """Build a deployable application utilizing prefill/decode disaggregation."""
-
-    pd_config = PDServingArgs.model_validate(pd_serving_args).parse_args()
-
-    model_id = pd_config.decode_config.model_id
-    assert model_id == pd_config.prefill_config.model_id, "P/D model id mismatch"
-
-    for config in [pd_config.prefill_config, pd_config.decode_config]:
-        if "kv_transfer_config" not in config.engine_kwargs:
-            config.update_engine_kwargs(
-                kv_transfer_config=dict(
-                    kv_connector="NixlConnector",
-                    kv_role="kv_both",
-                    engine_id=str(uuid.uuid4()),
-                )
-            )
-
-    prefill_deployment = build_llm_deployment(
-        pd_config.prefill_config, name_prefix="Prefill:"
-    )
-    decode_deployment = build_llm_deployment(
-        pd_config.decode_config, name_prefix="Decode:"
-    )
-
-    proxy_server_deployment = (
-        serve.deployment(PDProxyServer)
-        .options(**pd_config.proxy_deployment_config)
-        .bind(
-            prefill_server=prefill_deployment,
-            decode_server=decode_deployment,
-        )
-    )
-
-    ingress_options = OpenAiIngress.get_deployment_options(
-        [pd_config.prefill_config, pd_config.decode_config]
-    )
-    ingress_cls = make_fastapi_ingress(OpenAiIngress)
-    return serve.deployment(ingress_cls, **ingress_options).bind(
-        llm_deployments=[proxy_server_deployment]
-    )
+    @classmethod
+    def get_deployment_options(
+        cls, prefill_config: "LLMConfig", decode_config: "LLMConfig"
+    ) -> Dict[str, Any]:
+        return DEFAULT_PD_PROXY_SERVER_OPTIONS
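
Because deployment options now come from the proxy class itself rather than from the builder, a custom proxy can be swapped in through proxy_cls_config. A sketch of overriding get_deployment_options in a hypothetical subclass (the class name and option values are illustrative):

    from typing import Any, Dict

    from ray.llm._internal.serve.deployments.prefill_decode_disagg.pd import (
        PDProxyServer,
    )
    from ray.serve.llm import LLMConfig


    class ThrottledPDProxyServer(PDProxyServer):  # hypothetical subclass
        @classmethod
        def get_deployment_options(
            cls, prefill_config: LLMConfig, decode_config: LLMConfig
        ) -> Dict[str, Any]:
            # Copy the class defaults, then tighten concurrency.
            options = dict(
                super().get_deployment_options(prefill_config, decode_config)
            )
            options["max_ongoing_requests"] = 64
            return options

The builder would pick this up via proxy_cls_config, passed either as the class object or as its module path string, e.g. proxy_cls_config=dict(proxy_cls=ThrottledPDProxyServer).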
