fix v1 disaggregate prefill example ranktable generation issue

machenglong2025 · machenglong2025 · commit 95082f345112 · 2025-06-16T13:58:26.000+08:00
Signed-off-by: machenglong <machenglong_yewu@cmss.chinamobile.com>
diff --git a/examples/disaggregate_prefill_v1/README.md b/examples/disaggregate_prefill_v1/README.md
@@ -0,0 +1,213 @@
+# Disaggregated Prefill-Decode Deployment Guide
+
+## Overview
+This demo document provides instructions for running a disaggregated vLLM-ascend service with separate prefill and decode stages across 4 nodes, using 16 Ascend NPUs for the two prefill nodes (P1/P2) and 16 Ascend NPUs for the two decode nodes (D1/D2).
+
+## Prerequisites
+- Ascend NPU environment with vLLM 0.9.1 installed
+- Network interfaces configured for distributed communication (e.g., eth0)
+- Model weights located at `/data01/deepseek_r1_w8a8_zhw`
+
+## Rank table generation
+The rank table is a JSON file that specifies the mapping of Ascend NPU ranks to nodes. The command below generates a rank table covering all nodes, with 16 cards for prefill and 16 cards for decode.
+
+Run the following commands on every node to generate the rank table:
+```bash
+cd vllm-ascend/examples/disaggregate_prefill_v1/
+bash gen_ranktable.sh 16 16
+```
+The rank table will be generated at `/vllm-workspace/vllm-ascend/examples/disaggregate_prefill_v1/ranktable.json`.
+
+## Start disaggregated vLLM-ascend service 
+Execution sequence:
+- The 4 configured node IPs are: 172.19.32.175 172.19.241.49 172.19.123.51 172.19.190.36
+- Start Prefill on Node 1 (P1)
+- Start Prefill on Node 2 (P2)
+- Start Decode on Node 3 (D1)
+- Start Decode on Node 4 (D2)
+- Start the proxy server on Node 1
+
+* Run prefill server P1 on the first node
+```bash
+export HCCL_IF_IP=`hostname -I|awk -F " " '{print$1}'`
+export GLOO_SOCKET_IFNAME="eth0"
+export TP_SOCKET_IFNAME="eth0"
+export HCCL_SOCKET_IFNAME="eth0"
+export DISAGGREGATED_PREFILL_RANK_TABLE_PATH=/vllm-workspace/vllm-ascend/examples/disaggregate_prefill_v1/ranktable.json
+export OMP_PROC_BIND=false
+export OMP_NUM_THREADS=100
+export VLLM_USE_V1=1
+export VLLM_VERSION=0.9.1
+vllm serve /data01/deepseek_r1_w8a8_zhw \
+  --host 0.0.0.0 \
+  --port 20002 \
+  --data-parallel-size 2 \
+  --data-parallel-size-local 1 \
+  --api-server-count 2 \
+  --data-parallel-address 172.19.32.175 \
+  --data-parallel-rpc-port 13356 \
+  --tensor-parallel-size 8 \
+  --no-enable-prefix-caching \
+  --seed 1024 \
+  --served-model-name deepseek \
+  --max-model-len 6144  \
+  --max-num-batched-tokens 6144  \
+  --trust-remote-code \
+  --enforce-eager \
+  --gpu-memory-utilization 0.9  \
+  --kv-transfer-config  \
+  '{"kv_connector": "LLMDataDistConnectorA3",
+  "kv_buffer_device": "npu",
+  "kv_role": "kv_producer",
+  "kv_parallel_size": 1,
+  "kv_port": "20001",
+  "engine_id": "0",
+  "kv_connector_module_path": "vllm_ascend.distributed.llmdatadist_connector_v1_a3"
+  }'  \
+  --additional-config \
+  '{"torchair_graph_config": {"enable": false, "enable_multistream_shared_expert": false}, "expert_tensor_parallel_size": 1}'
+```
+
+* Run prefill server P2 on the second node
+```bash
+export HCCL_IF_IP=`hostname -I|awk -F " " '{print$1}'`
+export GLOO_SOCKET_IFNAME="eth0"
+export TP_SOCKET_IFNAME="eth0"
+export HCCL_SOCKET_IFNAME="eth0"
+export DISAGGREGATED_PREFILL_RANK_TABLE_PATH=/vllm-workspace/vllm-ascend/examples/disaggregate_prefill_v1/ranktable.json
+export OMP_PROC_BIND=false
+export OMP_NUM_THREADS=100
+export VLLM_USE_V1=1
+export VLLM_VERSION=0.9.1
+vllm serve /data01/deepseek_r1_w8a8_zhw \
+  --host 0.0.0.0 \
+  --port 20002 \
+  --headless \
+  --data-parallel-size 2 \
+  --data-parallel-start-rank 1 \
+  --data-parallel-size-local 1 \
+  --data-parallel-address 172.19.32.175 \
+  --data-parallel-rpc-port 13356 \
+  --tensor-parallel-size 8 \
+  --no-enable-prefix-caching \
+  --seed 1024 \
+  --served-model-name deepseek \
+  --max-model-len 6144  \
+  --max-num-batched-tokens 6144  \
+  --trust-remote-code \
+  --enforce-eager \
+  --gpu-memory-utilization 0.9  \
+  --kv-transfer-config  \
+  '{"kv_connector": "LLMDataDistConnectorA3",
+  "kv_buffer_device": "npu",
+  "kv_role": "kv_producer",
+  "kv_parallel_size": 1,
+  "kv_port": "20001",
+  "engine_id": "0",
+  "kv_connector_module_path": "vllm_ascend.distributed.llmdatadist_connector_v1_a3"
+  }'  \
+  --additional-config \
+  '{"torchair_graph_config": {"enable": false, "enable_multistream_shared_expert": false}, "expert_tensor_parallel_size": 1}' 
+```
+
+* Run decode server D1 on the third node
+```bash
+export HCCL_IF_IP=`hostname -I|awk -F " " '{print$1}'`
+export GLOO_SOCKET_IFNAME="eth0"
+export TP_SOCKET_IFNAME="eth0"
+export HCCL_SOCKET_IFNAME="eth0"
+export DISAGGREGATED_PREFILL_RANK_TABLE_PATH=/vllm-workspace/vllm-ascend/examples/disaggregate_prefill_v1/ranktable.json
+export OMP_PROC_BIND=false
+export OMP_NUM_THREADS=100
+export VLLM_USE_V1=1
+export VLLM_VERSION=0.9.1
+vllm serve /data01/deepseek_r1_w8a8_zhw \
+  --host 0.0.0.0 \
+  --port 20002 \
+  --data-parallel-size 2 \
+  --data-parallel-size-local 1 \
+  --api-server-count 2 \
+  --data-parallel-address 172.19.123.51 \
+  --data-parallel-rpc-port 13356 \
+  --tensor-parallel-size 8 \
+  --no-enable-prefix-caching \
+  --seed 1024 \
+  --served-model-name deepseek \
+  --max-model-len 6144  \
+  --max-num-batched-tokens 6144  \
+  --trust-remote-code \
+  --enforce-eager \
+  --gpu-memory-utilization 0.9  \
+  --kv-transfer-config  \
+  '{"kv_connector": "LLMDataDistConnectorA3",
+  "kv_buffer_device": "npu",
+  "kv_role": "kv_consumer",
+  "kv_parallel_size": 1,
+  "kv_port": "20001",
+  "engine_id": "0",
+  "kv_connector_module_path": "vllm_ascend.distributed.llmdatadist_connector_v1_a3"
+  }'  \
+  --additional-config \
+  '{"torchair_graph_config": {"enable": false, "enable_multistream_shared_expert": false}, "expert_tensor_parallel_size": 1}'
+```
+
+* Run decode server D2 on the last node
+```bash
+export HCCL_IF_IP=`hostname -I|awk -F " " '{print$1}'`
+export GLOO_SOCKET_IFNAME="eth0"
+export TP_SOCKET_IFNAME="eth0"
+export HCCL_SOCKET_IFNAME="eth0"
+export DISAGGREGATED_PREFILL_RANK_TABLE_PATH=/vllm-workspace/vllm-ascend/examples/disaggregate_prefill_v1/ranktable.json
+export OMP_PROC_BIND=false
+export OMP_NUM_THREADS=100
+export VLLM_USE_V1=1
+export VLLM_VERSION=0.9.1
+vllm serve /data01/deepseek_r1_w8a8_zhw \
+  --host 0.0.0.0 \
+  --port 20002 \
+  --headless \
+  --data-parallel-size 2 \
+  --data-parallel-start-rank 1 \
+  --data-parallel-size-local 1 \
+  --data-parallel-address 172.19.123.51 \
+  --data-parallel-rpc-port 13356 \
+  --tensor-parallel-size 8 \
+  --no-enable-prefix-caching \
+  --seed 1024 \
+  --served-model-name deepseek \
+  --max-model-len 6144  \
+  --max-num-batched-tokens 6144  \
+  --trust-remote-code \
+  --enforce-eager \
+  --gpu-memory-utilization 0.9  \
+  --kv-transfer-config  \
+  '{"kv_connector": "LLMDataDistConnectorA3",
+  "kv_buffer_device": "npu",
+  "kv_role": "kv_consumer",
+  "kv_parallel_size": 1,
+  "kv_port": "20001",
+  "engine_id": "0",
+  "kv_connector_module_path": "vllm_ascend.distributed.llmdatadist_connector_v1_a3"
+  }'  \
+  --additional-config \
+  '{"torchair_graph_config": {"enable": false, "enable_multistream_shared_expert": false}, "expert_tensor_parallel_size": 1}' 
+```
+
+* Run proxy server on the first node
+```bash
+cd /vllm-workspace/vllm-ascend/examples/disaggregate_prefill_v1
+python toy_proxy_server.py --host 172.19.32.175 --port 1025 --prefiller-hosts 172.19.32.175 --prefiller-port 20002 --decoder-hosts 172.19.123.51 --decoder-ports 20002
+```
+
+* Verification
+Check service health using the proxy server endpoint:
+```bash
+curl http://localhost:1025/v1/completions \
+    -H "Content-Type: application/json" \
+    -d '{
+        "model": "deepseek",
+        "prompt": "你是谁？",
+        "max_tokens": 100,
+        "temperature": 0
+    }'
+```
diff --git a/examples/disaggregate_prefill_v1/gen_ranktable.py b/examples/disaggregate_prefill_v1/gen_ranktable.py
@@ -19,9 +19,13 @@
 
 print("enter py")
 
+hccn_tool_path = os.environ.get(
+    "HCCN_TOOL_PATH", "/usr/local/Ascend/driver/tools/hccn_tool"
+)
 master_addr = os.environ.get("MASTER_ADDR")
 master_port = os.environ.get("MASTER_PORT")
 rank = os.environ.get("RANK")
+local_rank = os.environ.get("LOCAL_RANK")
 # This variable is set by torchrun, 
 # and is different from WORLD_SIZE in gen_rank_table.sh.
 world_size = os.environ.get("WORLD_SIZE")
@@ -44,26 +48,28 @@ def get_cmd_stdout(cmd):
 chips_per_card = get_cmd_stdout("npu-smi info -l | grep \"Chip Count\"").split("\n")[0].split(":")[1].strip()
 chips_per_card = int(chips_per_card)
 
+# generate local device list for local rank 0, and gather it to all ranks
 local_device_list: list[dict[str, str]] = list()
-super_pod_id = "0"
-for card_id in range(num_cards):
-    for chip_id in range(chips_per_card):
-        device_id = card_id * chips_per_card + chip_id
-        if soc_info.is_a3:
-            device_ip = get_cmd_stdout(f"/usr/local/Ascend/driver/tools/hccn_tool -i {device_id} -vnic -g | grep ipaddr").split(":")[1].strip()
-            super_device_id = get_cmd_stdout(f"npu-smi info -t spod-info -i {card_id} -c {chip_id} | grep SDID").split(":")[1].strip()
-            super_pod_id = get_cmd_stdout(f"npu-smi info -t spod-info -i {card_id} -c {chip_id} | grep \"Super Pod ID\"").split(":")[1].strip()
-        else:
-            device_ip = get_cmd_stdout(f"/usr/local/Ascend/driver/tools/hccn_tool -i {device_id} -ip -g | grep ipaddr").split(":")[1].strip()
+if local_rank == "0":
+    super_pod_id = "0"
+    for card_id in range(num_cards):
+        for chip_id in range(chips_per_card):
+            device_id = card_id * chips_per_card + chip_id
+            if soc_info.is_a3:
+                device_ip = get_cmd_stdout(f"{hccn_tool_path} -i {device_id} -vnic -g | grep ipaddr").split(":")[1].strip()
+                super_device_id = get_cmd_stdout(f"npu-smi info -t spod-info -i {card_id} -c {chip_id} | grep SDID").split(":")[1].strip()
+                super_pod_id = get_cmd_stdout(f"npu-smi info -t spod-info -i {card_id} -c {chip_id} | grep \"Super Pod ID\"").split(":")[1].strip()
+            else:
+                device_ip = get_cmd_stdout(f"{hccn_tool_path} -i {device_id} -ip -g | grep ipaddr").split(":")[1].strip()
 
-        device_info = {
-                "server_id": local_host,
-                "device_id": str(device_id),
-                "device_ip": str(device_ip),
-            }
-        if soc_info.is_a3:
-            device_info.update({"super_pod_id": str(super_pod_id), "super_device_id": str(super_device_id)})
-        local_device_list.append(device_info)
+            device_info = {
+                    "server_id": local_host,
+                    "device_id": str(device_id),
+                    "device_ip": str(device_ip),
+                }
+            if soc_info.is_a3:
+                device_info.update({"super_pod_id": str(super_pod_id), "super_device_id": str(super_device_id)})
+            local_device_list.append(device_info)
 
 dist.init_process_group(backend=dist.Backend.GLOO)
 global_device_list = [None] * dist.get_world_size()
@@ -84,7 +90,8 @@ def get_cmd_stdout(cmd):
 }
 
 
-with open("ranktable.json", "w") as f:
-    json.dump(ranktable, f, indent=4)
+if local_rank == '0':
+    with open("ranktable.json", "w") as f:
+        json.dump(ranktable, f, indent=4)
 
-print("gen ranktable.json done")
+    print("gen ranktable.json done")
diff --git a/examples/disaggregate_prefill_v1/gen_ranktable.sh b/examples/disaggregate_prefill_v1/gen_ranktable.sh
@@ -2,8 +2,9 @@
 
 source /usr/local/Ascend/ascend-toolkit/set_env.sh
 export LD_LIBRARY_PATH=/usr/local/Ascend/ascend-toolkit/latest/opp/vendors/customize/op_api/lib/:${LD_LIBRARY_PATH}
-
+# Please modify IPs and IFRAME (the network interface name used for GLOO_SOCKET_IFNAME) to match your environment
 IPs=('1.0.0.0' '1.0.0.1')
+IFRAME=enp189s0f0
 LOCAL_HOST=`hostname -I|awk -F " " '{print$1}'`
 GPUS_PER_NODE=8
 MASTER_ADDR=${IPs[0]}
@@ -35,7 +36,7 @@ echo "NODE_RANK": $NODE_RANK
 echo "==============="
 
 if [[ -n "${GEN_RANKTABLE}" || ! -e ${PWD}/ranktable.json ]]; then
-    GLOO_SOCKET_IFNAME=enp189s0f0 torchrun \
+    GLOO_SOCKET_IFNAME=${IFRAME} torchrun \
         --nproc_per_node 1 \
         --nnodes ${NNODES} \
         --node_rank ${NODE_RANK} \