
Commit 89d96cc

Merge branch 'main' into padded-spec
2 parents 9a9ec7f + b4233a2, commit 89d96cc

File tree: 81 files changed (+2924, -984 lines)


.github/workflows/multi_node_test.yaml

Lines changed: 9 additions & 0 deletions

```diff
@@ -102,6 +102,15 @@ jobs:
           wait $LOG_PID || true
           kill $MONITOR_PID || true
 
+      - name: Generate summary
+        if: always()
+        run: |
+          if [ -f "/root/.cache/test_summary.md" ]; then
+            cat /root/.cache/test_summary.md >> "$GITHUB_STEP_SUMMARY"
+          else
+            echo "No summary file found." >> "$GITHUB_STEP_SUMMARY"
+          fi
+
       - name: Post process
         if: always()
         run: |
```
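The new step only consumes `/root/.cache/test_summary.md`; an earlier test step is expected to write that file. A minimal sketch of a producer, assuming nothing about the real test harness (the table contents below are hypothetical, not part of this commit):

```shell
# Hypothetical producer: write a Markdown summary for the new
# "Generate summary" step to append to $GITHUB_STEP_SUMMARY,
# which GitHub renders on the workflow run page.
mkdir -p /root/.cache
{
  echo "## Multi-node test results"
  echo ""
  echo "| Case | Status |"
  echo "| --- | --- |"
  echo "| disaggregated_prefill | pass |"
} > /root/.cache/test_summary.md
```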

docs/source/tutorials/multi_node_pd_disaggregation_mooncake.md

Lines changed: 5 additions & 5 deletions

````diff
@@ -66,16 +66,16 @@ Install the relevant dependencies. The installation of Go is not required.
 
 ```shell
 cd Mooncake
-bash dependencies.sh
+bash dependencies.sh -y
 ```
 
 Install mpi
 
 ```shell
-apt purge mpich libmpich-dev
-apt purge openmpi-bin
-apt purge openmpi-bin libopenmpi-dev
-apt install mpich libmpich-dev
+apt purge mpich libmpich-dev -y
+apt purge openmpi-bin -y
+apt purge openmpi-bin libopenmpi-dev -y
+apt install mpich libmpich-dev -y
 export CPATH=/usr/lib/aarch64-linux-gnu/mpich/include/:$CPATH
 export CPATH=/usr/lib/aarch64-linux-gnu/openmpi/lib:$CPATH
 ```
````
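After replacing Open MPI with MPICH it is worth confirming which implementation the toolchain now resolves to. A quick check, assuming the Debian/Ubuntu MPICH packages installed above (which ship `mpichversion` and the `mpicc` wrapper):

```shell
# Should print an MPICH version string, not Open MPI.
mpichversion | head -n 1
# MPICH's wrapper supports -show; it prints the underlying compile
# line and should reference the mpich include paths exported via CPATH.
mpicc -show
```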

examples/disaggregated_prefill_v1/README.md

Lines changed: 1 addition & 1 deletion

````diff
@@ -205,7 +205,7 @@ vllm serve /models/deepseek_r1_w8a8 \
 Run proxy server on the first node:
 ```shell
 cd /vllm-workspace/vllm-ascend/examples/disaggregated_prefill_v1
-python toy_proxy_server.py --host 172.19.32.175 --port 1025 --prefiller-hosts 172.19.241.49 --prefiller-port 20002 --decoder-hosts 172.19.123.51 --decoder-ports 20002
+python load_balance_proxy_server_example.py --host 172.19.32.175 --port 1025 --prefiller-hosts 172.19.241.49 --prefiller-port 20002 --decoder-hosts 172.19.123.51 --decoder-ports 20002
 ```
 
 Verification
````
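Once the load-balancing proxy is up, a smoke test can be sent through it. A sketch, assuming the proxy forwards the OpenAI-compatible `/v1/completions` route and reusing the model path from the serve command above:

```shell
# Hedged smoke test through the proxy; adjust the model path as needed.
curl -s http://172.19.32.175:1025/v1/completions \
  -H "Content-Type: application/json" \
  -d '{"model": "/models/deepseek_r1_w8a8", "prompt": "Hello", "max_tokens": 16}'
```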

examples/disaggregated_prefill_v1/gen_ranktable.py

Lines changed: 6 additions & 1 deletion

```diff
@@ -21,6 +21,10 @@
                     type=str,
                     required=False,
                     help="local device ids")
+parser.add_argument("--ranktable-path",
+                    type=str,
+                    default="./ranktable.json",
+                    help="output rank table path")
 args = parser.parse_args()
 local_host = args.local_host
 prefill_device_cnt = args.prefill_device_cnt
@@ -130,7 +134,8 @@ def get_cmd_stdout(cmd):
     }
 
 if local_rank == '0':
-    with open("ranktable.json", "w") as f:
+    os.makedirs(os.path.dirname(args.ranktable_path), exist_ok=True)
+    with open(args.ranktable_path, "w") as f:
         json.dump(ranktable, f, indent=4)
 
 print("gen ranktable.json done")
```

examples/disaggregated_prefill_v1/load_balance_proxy_layerwise_server_example.py

Lines changed: 167 additions & 63 deletions

```diff
@@ -84,17 +84,18 @@
 #
 # For more details, see the code and comments in this file.
 
-
 import argparse
 import asyncio
 import functools
 import heapq
+import json
 import os
 import sys
-import uuid
 import threading
+import uuid
 from contextlib import asynccontextmanager
-from typing import List
+from dataclasses import dataclass
+from typing import Any, List
 
 import httpx
 from fastapi import FastAPI, Request
@@ -106,6 +107,7 @@
 # Add uvloop for faster event loop if available
 try:
     import uvloop
+
     asyncio.set_event_loop_policy(uvloop.EventLoopPolicy())
 except ImportError:
     pass
@@ -324,7 +326,7 @@ async def listen_for_disconnect(request: Request) -> None:
 
 
 def with_cancellation(handler_func):
-
+
     @functools.wraps(handler_func)
     async def wrapper(*args, **kwargs):
         request = kwargs["request"]
@@ -337,9 +339,9 @@ async def wrapper(*args, **kwargs):
         if handler_task in done:
             return handler_task.result()
         return None
-
+
     return wrapper
-
+
 
 app = FastAPI(lifespan=lifespan)
 
@@ -362,7 +364,8 @@ async def send_request_to_service(client: httpx.AsyncClient,
         "remote_host": None,
         "remote_port": None,
         "aborted_request": list(aborted_requests),
-        "metaserver": f"http://{global_args.host}:{global_args.port}/v1/metaserver"
+        "metaserver":
+        f"http://{global_args.host}:{global_args.port}/v1/metaserver"
     }
     req_data["stream"] = False
    req_data["max_tokens"] = 1
@@ -455,72 +458,174 @@ def get_api_request_id(api, req_id):
     return "chatcmpl-" + req_id
 
 
+async def _handle_select_instance(api: str, req_data: Any,
+                                  request_length: int):
+    prefiller_score = proxy_state.calculate_prefill_scores(request_length)
+    logger.debug(
+        f"Request length: {request_length}, Prefiller score: {prefiller_score}"
+    )
+    request_id = await proxy_state.next_req_id()
+    # Select prefiller
+    prefiller_idx = proxy_state.select_prefiller(prefiller_score)
+    prefiller = proxy_state.prefillers[prefiller_idx]
+    result_future = asyncio.Future()  # type: ignore
+    request_id_api = get_api_request_id(api, request_id)
+    proxy_state.req_id_future[request_id_api] = result_future
+    # Send request to prefiller
+    asyncio.get_running_loop().create_task(
+        send_request_to_service(prefiller.client,
+                                prefiller_idx,
+                                api,
+                                req_data,
+                                request_id,
+                                max_retries=global_args.max_retries,
+                                base_delay=global_args.retry_delay))
+    proxy_state.release_prefiller(prefiller_idx, prefiller_score)
+
+    response = await result_future
+    del proxy_state.req_id_future[request_id_api]
+    req_data["kv_transfer_params"] = response
+
+    # Select decoder
+    decoder_score = proxy_state.calculate_decode_scores(request_length)
+    logger.debug("Decoder score: %f", decoder_score)
+    # Use the prefiller's kv_transfer_params to select decoder
+    decoder_idx = proxy_state.select_decoder(decoder_score)
+    decoder = proxy_state.decoders[decoder_idx]
+    logger.debug("Using %s %s", prefiller.url, decoder.url)
+    return InstanceInfo(request_id=request_id,
+                        prefiller_idx=prefiller_idx,
+                        prefiller_score=prefiller_score,
+                        prefiller=prefiller,
+                        decoder=decoder,
+                        decoder_idx=decoder_idx,
+                        decoder_score=decoder_score)
+
+
+@dataclass
+class InstanceInfo:
+    request_id: str
+    prefiller_idx: int
+    prefiller_score: float
+    prefiller: ServerState
+    decoder_idx: int
+    decoder_score: float
+    decoder: ServerState
+
+
 async def _handle_completions(api: str, request: Request):
     try:
         req_data = await request.json()
         req_body = await request.body()
         request_length = len(req_body)
-        prefiller_score = proxy_state.calculate_prefill_scores(request_length)
-        logger.debug(
-            f"Request length: {request_length}, Prefiller score: {prefiller_score}"
-        )
-        request_id = await proxy_state.next_req_id()
-        # Select prefiller
-        prefiller_idx = proxy_state.select_prefiller(prefiller_score)
-        prefiller = proxy_state.prefillers[prefiller_idx]
-        result_future = asyncio.Future()  # type: ignore
-        request_id_api = get_api_request_id(api, request_id)
-        proxy_state.req_id_future[request_id_api] = result_future
-        # Send request to prefiller
-        asyncio.get_running_loop().create_task(send_request_to_service(
-            prefiller.client,
-            prefiller_idx,
-            api,
-            req_data,
-            request_id,
-            max_retries=global_args.max_retries,
-            base_delay=global_args.retry_delay))
-        proxy_state.release_prefiller(prefiller_idx, prefiller_score)
-
-        response = await result_future
-        del proxy_state.req_id_future[request_id_api]
-        req_data["kv_transfer_params"] = response
-
-        # Select decoder
-        decoder_score = proxy_state.calculate_decode_scores(request_length)
-        logger.debug("Decoder score: %f", decoder_score)
-        # Use the prefiller's kv_transfer_params to select decoder
-        decoder_idx = proxy_state.select_decoder(decoder_score)
-        decoder = proxy_state.decoders[decoder_idx]
-        logger.debug("Using %s %s", prefiller.url, decoder.url)
-        # Stream response from decoder
-        released_kv = False
+        instance_info = await _handle_select_instance(api, req_data,
+                                                      request_length)
+        stream_flag = bool(req_data.get("stream", False))
+        chat_flag = "messages" in req_data
+
+        if "prompt" in req_data:
+            origin_prompt = req_data["prompt"]
+        elif chat_flag:
+            messages = req_data["messages"]
+            origin_prompt = messages[0].get("content", "")
+        else:
+            origin_prompt = ""
+        # refer to vLLM sampling_params: max_token default value
+        origin_max_tokens = req_data.get("max_tokens", 16)
+
         async def generate_stream():
-            nonlocal released_kv
+            nonlocal instance_info
+            generated_token = ""
+            released_kv = False
+            retry_count = 0
+            retry = True
+            completion_tokens = 0
             # Only one await per chunk, minimal logic in loop
             try:
-                async for chunk in stream_service_response_with_retry(
-                        decoder.client,
-                        api,
-                        req_data,
-                        request_id=request_id,
-                        max_retries=global_args.max_retries,
-                        base_delay=global_args.retry_delay):
-                    if not released_kv and chunk:
-                        proxy_state.release_prefiller_kv(
-                            prefiller_idx, prefiller_score)
-                        released_kv = True
-                    yield chunk
+                while retry:
+                    retry = False
+                    async for chunk in stream_service_response_with_retry(
+                            instance_info.decoder.client,
+                            api,
+                            req_data,
+                            request_id=instance_info.request_id,
+                            max_retries=global_args.max_retries,
+                            base_delay=global_args.retry_delay):
+                        if not released_kv and chunk:
+                            proxy_state.release_prefiller_kv(
+                                instance_info.prefiller_idx,
+                                instance_info.prefiller_score)
+                            released_kv = True
+                        chunk_str = chunk.decode("utf-8").strip()
+                        if not chunk_str:
+                            continue
+                        if chunk_str.startswith("data: "):
+                            chunk_str = chunk_str[len("data: "):]
+                        try:
+                            chunk_json = json.loads(chunk_str)
+                        except json.JSONDecodeError:
+                            # if chunk is [done], skip it.
+                            logger.warning(
+                                f"Skipping chunk: {chunk_str}")
+                            yield chunk
+                            continue
+                        choices = chunk_json.get("choices", [])
+                        if not choices:
+                            yield chunk
+                            continue
+
+                        choice = choices[0]
+                        delta = choice.get("delta") or {}
+                        message = choice.get("message") or {}
+                        content = (
+                            delta.get("content")
+                            or message.get("content")
+                            or choice.get("text")
+                            or ""
+                        )
+                        generated_token += content
+
+                        stop_reason = choice.get(
+                            "stop_reason")
+                        usage = chunk_json.get("usage", {})
+                        completion_tokens = (completion_tokens + 1) if stream_flag else \
+                            (completion_tokens + usage.get("completion_tokens"))
+                        if stop_reason == "recomputed":
+                            retry = True
+                            retry_count += 1
+                            if chat_flag:
+                                messages[0][
+                                    "content"] = origin_prompt + generated_token
+                            else:
+                                req_data[
+                                    "prompt"] = origin_prompt + generated_token
+                            req_data[
+                                "max_tokens"] = origin_max_tokens - completion_tokens + retry_count
+                            tmp_request_length = len(
+                                json.dumps(req_data).encode("utf-8"))
+                            instance_info = await _handle_select_instance(
+                                api, req_data, tmp_request_length)
+                            break
+                        if retry_count > 0 and not stream_flag:
+                            if chat_flag:
+                                choices[0]["message"][
+                                    "content"] = generated_token
+                            else:
+                                choices[0]["text"] = generated_token
+                            chunk = json.dumps(chunk_json).encode("utf-8")
+                        yield chunk
             except Exception as e:
                 logger.error(
-                    f"Error during streaming from decoder {decoder.url}: {str(e)} the aborted request {request_id} will be routing to the target prefiller when new request is ready to dispatch to it"
+                    f"Error during streaming from decoder {instance_info.decoder.url}: {str(e)} the aborted request {instance_info.request_id} will be routing to the target prefiller when new request is ready to dispatch to it"
                 )
-                proxy_state.abort_prefiller_request(prefiller_idx, request_id)
-                proxy_state.release_prefiller_kv(prefiller_idx,
-                                                 prefiller_score)
+                proxy_state.abort_prefiller_request(
+                    instance_info.prefiller_idx, instance_info.request_id)
+                proxy_state.release_prefiller_kv(instance_info.prefiller_idx,
+                                                 instance_info.prefiller_score)
 
         # After streaming done, release tokens
-        proxy_state.release_decoder(decoder_idx, decoder_score)
+        proxy_state.release_decoder(instance_info.decoder_idx,
+                                    instance_info.decoder_score)
 
         return StreamingResponse(generate_stream(),
                                  media_type="application/json")
@@ -564,13 +669,12 @@ async def metaserver(request: Request):
         result_future = proxy_state.req_id_future[request_id]
         result_future.set_result(req_data)
     except Exception as e:
-        logger.error(
-            f"Post metaserver failed with: {str(e)}"
-        )
+        logger.error(f"Post metaserver failed with: {str(e)}")
 
 
 if __name__ == '__main__':
     global global_args
     global_args = parse_args()
     import uvicorn
+
     uvicorn.run(app, host=global_args.host, port=global_args.port)
```
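The reworked `generate_stream` parses each streamed chunk, accumulates the generated text, and when a choice reports `stop_reason == "recomputed"` it re-runs instance selection with the prompt extended by the text produced so far and `max_tokens` shrunk to `origin_max_tokens - completion_tokens + retry_count` (e.g. with `max_tokens=64`, 20 tokens already emitted, and one retry, the continuation asks for 64 - 20 + 1 = 45 tokens). From the client's side the retry is transparent. A hedged streaming smoke test, reusing the host, port, and model from the README example and assuming the proxy mirrors vLLM's `/v1/chat/completions` route:

```shell
# Stream through the proxy; on a "recomputed" stop_reason the proxy
# re-selects a prefiller/decoder pair and continues from the
# accumulated text without the client noticing.
curl -sN http://172.19.32.175:1025/v1/chat/completions \
  -H "Content-Type: application/json" \
  -d '{"model": "/models/deepseek_r1_w8a8",
       "messages": [{"role": "user", "content": "Hello"}],
       "stream": true, "max_tokens": 64}'
```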
