Polish code: anonymize example IPs, remove debug prints, add license header, and reformat long lines

cyber-pioneer · cyber-pioneer · commit 44ceca57dd6f · 2025-04-29T10:30:11.000+08:00
diff --git a/examples/qwen/conf/config_qwen2.5_7b_pd_disaggregation.yaml b/examples/qwen/conf/config_qwen2.5_7b_pd_disaggregation.yaml
@@ -11,10 +11,10 @@ experiment:
     port: 10001
     use_fs_serve: false
     prefill_decode_disaggregation: true
-    prefill_num: 1
-    prefill_address: 10.1.1.122 # optional, default "auto"
+    prefill_num: 2
+    prefill_address: x.x.x.x # optional, default "auto"
     decode_num: 2
-    decode_address: 10.1.1.108 # optional, default "auto"
+    decode_address: x.x.x.x # optional, default "auto"
   runner:
     hostfile: examples/qwen/conf/hostfile.txt
     docker: fr-v2
diff --git a/examples/qwen/conf/hostfile.txt b/examples/qwen/conf/hostfile.txt
@@ -1,5 +1,5 @@
 # ip slots type=xxx[optional]
 # master node
-10.1.1.122 slots=8 type=gpu
+x.x.x.x slots=8 type=gpu
 # worker nodes
-10.1.1.108 slots=8 type=gpu
+x.x.x.x slots=8 type=gpu
diff --git a/flagscale/runner/runner_serve.py b/flagscale/runner/runner_serve.py
@@ -294,7 +294,6 @@ def _generate_run_script_serve(config, host, node_rank, cmd, background=True, wi
         vllm_path = f"{root_dir}/vllm"
     deploy_config = config.experiment.get("deploy", {})
     envs = config.experiment.get("envs", {})
-    print(f"shell file ======================== {host_run_script_file}", flush=True)
     with open(host_run_script_file, "w") as f:
         f.write("#!/bin/bash\n\n")
         f.write("set -x\n")
@@ -321,17 +320,14 @@ def _generate_run_script_serve(config, host, node_rank, cmd, background=True, wi
                 kv_related_ports = _get_multiple_free_ports(ports_num)
                 pd_proxy_port = deploy_config.get("pd_proxy_port", None)
                 if not pd_proxy_port:
-                    raise ValueError(
-                        f"PD disaggregation requires a proxy port to be set."
-                    )
+                    raise ValueError(f"PD disaggregation requires a proxy port to be set.")
 
                 engine_args = _get_engine_args(config)
                 command_items = ["vllm", "serve"]
                 command_items.append(engine_args["model"])
                 other_args = flatten_dict_to_args(engine_args, ["model", "port"])
                 command_items.extend(other_args)
                 vllm_command = " ".join(command_items)
-                # vllm_command = "nohup " + vllm_command
                 if before_start_cmd:
                     vllm_command = f"{before_start_cmd} && " + vllm_command
                 if envs_str:
@@ -386,21 +382,18 @@ def _generate_run_script_serve(config, host, node_rank, cmd, background=True, wi
                             "http_port": str(http_port),
                         },
                     }
-                    print(
+                    logger.info(
                         f"============= prefill instance {i}, p_kv_config: {p_kv_config} =============",
                         flush=True,
                     )
                     card_ids = resource_manager.get_available_card_ids(
-                        address=p_address,
-                        num=each_instance_card_num,
+                        address=p_address, num=each_instance_card_num
                     )
                     card_ids_str = ",".join(map(str, card_ids))
                     ids_env = f"export CUDA_VISIBLE_DEVICES={card_ids_str}"
 
                     p_kv_config_json = json.dumps(p_kv_config)
-                    p_instance_log_path = os.path.join(
-                        default_log_dir, f"prefill_{i}.log"
-                    )
+                    p_instance_log_path = os.path.join(default_log_dir, f"prefill_{i}.log")
 
                     if p_address != master_ip:
                         p_kv_config_formate_json = p_kv_config_json.replace('"', '\\"')
@@ -433,21 +426,18 @@ def _generate_run_script_serve(config, host, node_rank, cmd, background=True, wi
                             "http_port": str(http_port),
                         },
                     }
-                    print(
+                    logger.info(
                         f"============= decode instance {i}, d_kv_config: {d_kv_config} =============",
                         flush=True,
                     )
                     card_ids = resource_manager.get_available_card_ids(
-                        address=d_address,
-                        num=each_instance_card_num,
+                        address=d_address, num=each_instance_card_num
                     )
                     card_ids_str = ",".join(map(str, card_ids))
                     ids_env = f"export CUDA_VISIBLE_DEVICES={card_ids_str}"
 
                     d_kv_config_json = json.dumps(d_kv_config)
-                    d_instance_log_path = os.path.join(
-                        default_log_dir, f"decode_{j}.log"
-                    )
+                    d_instance_log_path = os.path.join(default_log_dir, f"decode_{j}.log")
 
                     if d_address != master_ip:
                         d_kv_config_formate_json = d_kv_config_json.replace('"', '\\"')
@@ -683,9 +673,7 @@ def _prepare(self):
         self.user_envs = self.config.experiment.get("envs", {})
         entrypoint = self.config.experiment.task.get("entrypoint", None)
         if self.inference_engine:  # pd_disagg_router
-            if self.config.experiment.get("deploy", {}).get(
-                "prefill_decode_disaggregation", False
-            ):
+            if self.config.experiment.get("deploy", {}).get("prefill_decode_disaggregation", False):
                 self.user_script = "flagscale/serve/run_pd_disagg_router.py"
             elif not self.use_fs_serve:
                 self.user_script = "flagscale/serve/run_inference_engine.py"
@@ -783,7 +771,6 @@ def _stop_each(self, host, node_rank):
         kill_process_tree(pid)
 
         ray_executable = shutil.which("ray")
-        print(ray_executable)
         if ray_executable:
             ray_path = os.path.realpath(ray_executable)
             os.system(f"{ray_path} stop")
diff --git a/flagscale/serve/run_pd_disagg_router.py b/flagscale/serve/run_pd_disagg_router.py
@@ -1,4 +1,11 @@
-import logging
+# Copyright (c) 2025, BAAI. All rights reserved.
+#
+# Adapted from https://github.com/vllm-project/vllm/blob/1ad957950ffc1552af5abda78c03d88ddb67945b/examples/online_serving/disagg_xpyd/disagg_prefill_proxy_xpyd.py. Below is the original copyright:
+#
+# SPDX-License-Identifier: Apache-2.0
+#
+
+
 import os
 import random
 import socket
@@ -8,6 +15,7 @@
 import aiohttp
 import msgpack
 import zmq
+
 from quart import Quart, make_response, request
 
 try:
@@ -29,18 +37,13 @@ class LoadManager:
     def __init__(self):
         self._lock = threading.Lock()
         # Each resource type 'P' or 'D' maps to {http_addr: {'zmq': zmq_addr, 'load': int}}
-        self._instances: dict[str, dict[str, dict[str, object]]] = {
-            "P": {},
-            "D": {},
-        }
+        self._instances: dict[str, dict[str, dict[str, object]]] = {"P": {}, "D": {}}
 
     def register(self, rtype: str, http_addr: str, zmq_addr: str):
         with self._lock:
             if http_addr not in self._instances[rtype]:
                 self._instances[rtype][http_addr] = {"zmq": zmq_addr, "load": 0}
-                logger.info(
-                    f"Registered new {rtype}-instance {http_addr} (zmq={zmq_addr})"
-                )
+                logger.info(f"Registered new {rtype}-instance {http_addr} (zmq={zmq_addr})")
             else:
                 # If zmq address changed, synchronize it
                 self._instances[rtype][http_addr]["zmq"] = zmq_addr
@@ -67,13 +70,8 @@ def get_random(self, rtype: str) -> tuple[str, str]:
 
     def get_robin_loaded(self, rtype: str) -> tuple[str, str]:
         with self._lock:
-            http_addr, info = min(
-                self._instances[rtype].items(), key=lambda kv: kv[1]["load"]
-            )
-            print(
-                f"========== whole instance status {self._instances}==========",
-                flush=True,
-            )
+            http_addr, info = min(self._instances[rtype].items(), key=lambda kv: kv[1]["load"])
+            print(f"========== whole instance status {self._instances}==========", flush=True)
         return http_addr, info["zmq"]
 
 
@@ -168,9 +166,7 @@ async def forward_request(url, data, request_id):
 async def handle_request():
     try:
         original_data = await request.get_json()
-        endpoint = (
-            request.path
-        )  # this will be '/v1/completions' or '/v1/chat/completions'
+        endpoint = request.path  # this will be '/v1/completions' or '/v1/chat/completions'
 
         # Prefill request: max_tokens=1
         prefill_request = original_data.copy()
@@ -191,9 +187,7 @@ async def handle_request():
         logger.info(f"Selected D-instance {decode_addr} via '{SCHEDULING_STRATEGY}'")
 
         # Keep original request_id composition format
-        request_id = (
-            f"___prefill_addr_{prefill_zmq}___decode_addr_{decode_zmq}_{random_uuid()}"
-        )
+        request_id = f"___prefill_addr_{prefill_zmq}___decode_addr_{decode_zmq}_{random_uuid()}"
 
         # Execute Prefill and update load
         lm.increment_load("P", prefill_addr)
@@ -235,9 +229,7 @@ def main():
         raise ValueError("No port specified in deploy config")
     if not pd_proxy_port:
         raise ValueError("No pd_proxy_port specified in deploy config")
-    print(
-        f"Starting Proxy Server...with pd_proxy_port {pd_proxy_port} and serve_port {serve_port}"
-    )
+    print(f"Starting Proxy Server...with pd_proxy_port {pd_proxy_port} and serve_port {serve_port}")
     listener = start_service_discovery("0.0.0.0", pd_proxy_port)
     app.run(host="0.0.0.0", port=serve_port)
     listener.join()