@@ -26,6 +26,9 @@
     model_source="unsloth/Llama-3.1-8B-Instruct",
     concurrency=1, # 1 vLLM engine replica
     batch_size=32, # 32 samples per batch
+    engine_kwargs={
+        "max_model_len": 4096, # Fit into test GPU memory
+    }
 )

 # Build processor
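
This hunk appears to come from a Ray Data LLM batch-inference example. A minimal sketch of the amended config in context, assuming the ray.data.llm vLLMEngineProcessorConfig / build_llm_processor API; the preprocess/postprocess lambdas and the "prompt" column name are illustrative, not part of the diff:

from ray.data.llm import build_llm_processor, vLLMEngineProcessorConfig

config = vLLMEngineProcessorConfig(
    model_source="unsloth/Llama-3.1-8B-Instruct",
    concurrency=1,  # 1 vLLM engine replica
    batch_size=32,  # 32 samples per batch
    engine_kwargs={
        "max_model_len": 4096,  # Fit into test GPU memory
    },
)

# Build processor
processor = build_llm_processor(
    config,
    preprocess=lambda row: dict(
        messages=[{"role": "user", "content": row["prompt"]}],
        sampling_params={"temperature": 0.0, "max_tokens": 64},
    ),
    postprocess=lambda row: dict(answer=row["generated_text"]),
)

Lowering max_model_len shrinks the maximum sequence length vLLM must reserve KV-cache space for, which is why the diff's comment says it lets the engine fit into a smaller test GPU.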

@@ -54,14 +54,11 @@ def __init__(self, dp_size: int, dp_size_per_node: Optional[int] = None):
                 f"with dp_size_per_node {self.dp_size_per_node}"
             )

-    async def register(
-        self, replica_ctx: "serve.context.ReplicaContext", node_id: Optional[str] = None
-    ):
+    async def register(self, node_id: Optional[str] = None):
         """
         Register a replica and assign a rank to it.

         Args:
-            replica_ctx: The replica context.
             node_id: The node id of the replica.

         Returns:
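
Only the signature and docstring side of this change is visible here. A hypothetical sketch of a rank assigner honoring the new node-id-based signature (the counter and lock are illustrative; the real assignment logic, including any dp_size_per_node grouping, is not shown in this hunk):

import asyncio
from typing import Optional

class DPRankAssigner:
    def __init__(self, dp_size: int, dp_size_per_node: Optional[int] = None):
        self.dp_size = dp_size
        self.dp_size_per_node = dp_size_per_node
        self._lock = asyncio.Lock()  # register() may be called concurrently
        self._next_rank = 0

    async def register(self, node_id: Optional[str] = None) -> int:
        """Register a replica and assign a rank to it."""
        async with self._lock:
            if self._next_rank >= self.dp_size:
                raise RuntimeError("All DP ranks are already assigned.")
            rank = self._next_rank  # node_id could drive per-node grouping here
            self._next_rank += 1
        return rank

The point of the API change is visible in the signature alone: callers now pass only a node id, so the assigner no longer depends on Ray Serve's replica context type.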

@@ -1,7 +1,6 @@
 import logging
 import time

-from ray import serve
 from ray.experimental.collective.util import get_address_and_port
 from ray.llm._internal.serve.core.configs.llm_config import LLMConfig
 from ray.llm._internal.serve.core.server.llm_server import LLMServer
@@ -24,9 +23,8 @@ class DPServer(LLMServer):
     async def __init__(self, llm_config: LLMConfig, dp_rank_assigner: DeploymentHandle):
         self.dp_rank_assigner = dp_rank_assigner

-        replica_ctx = serve.get_replica_context()
         node_id = get_runtime_context().get_node_id()
-        self.dp_rank = await self.dp_rank_assigner.register.remote(replica_ctx, node_id)
+        self.dp_rank = await self.dp_rank_assigner.register.remote(node_id)

         logger.info(f"DP rank {self.dp_rank} registered with rank assigner")
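
With this change the replica's node is identified through the generic Ray runtime context rather than Ray Serve's replica context, which is why the module drops its ray.serve import entirely. A minimal standalone sketch, assuming the get_runtime_context in the diff is ray.get_runtime_context (its import sits outside the visible hunk):

import ray

ray.init()

# Works in any Ray worker or driver, not only inside a Serve replica.
node_id = ray.get_runtime_context().get_node_id()
print(node_id)  # hex string identifying the node this process runs on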