
Commit d8c7cc6

[Feat] EPD disaggregation multiple instance proxy
[Docs] Update EPD README.md

Signed-off-by: LastZhabka <sakhmoldin.mukhammadarif@gmail.com>

1 parent 44a7c30

File tree

8 files changed: +350 -391 lines changed


examples/online_serving/separated_encode/api_server/api_server_1e1pd.py

Lines changed: 0 additions & 220 deletions
This file was deleted.

examples/online_serving/separated_encode/launch_epd_serve_separated.sh renamed to examples/online_serving/separated_encode/launch_1e1pd.sh

Lines changed: 12 additions & 10 deletions
@@ -12,10 +12,12 @@ wait_for_server() {
 MODEL="/workspace/helper/Qwen2.5-VL-3B-Instruct"
 LOG_PATH=$LOG_PATH
 ENCODE_PORT=19534
+ENCODE_RANK=0
 PREFILL_DECODE_PORT=19535
+PREFILL_DECODE_RANK=1
 PROXY_PORT=10001
-GPU_E="6"
-GPU_PD="7"
+GPU_E="4"
+GPU_PD="5"
 
 START_TIME=$(date +"%Y%m%d_%H%M%S")
 
@@ -28,9 +30,8 @@ CUDA_VISIBLE_DEVICES="$GPU_E" vllm serve "$MODEL" \
     --max-num-seqs 128 \
     --instance-type "encode" \
     --connector-workers-num 8 \
-    --epd-rank 0 &
+    --epd-rank "$ENCODE_RANK" &
 
-wait_for_server $ENCODE_PORT
 
 CUDA_VISIBLE_DEVICES="$GPU_PD" vllm serve "$MODEL" \
     --gpu-memory-utilization 0.9 \
@@ -39,16 +40,17 @@ CUDA_VISIBLE_DEVICES="$GPU_PD" vllm serve "$MODEL" \
     --max-num-seqs 128 \
     --instance-type "prefill+decode" \
     --connector-workers-num 8 \
-    --epd-rank 1 &
+    --epd-rank "$PREFILL_DECODE_RANK" &
 
+wait_for_server $ENCODE_PORT
 wait_for_server $PREFILL_DECODE_PORT
 
-python examples/online_serving/separated_encode/proxy/proxy1e1pd_aiohttp.py \
+python examples/online_serving/separated_encode/proxy/proxy_aiohttp.py \
     --host "0.0.0.0" \
     --port "$PROXY_PORT" \
-    --encode-server-url "http://localhost:$ENCODE_PORT" \
-    --prefill-decode-server-url "http://localhost:$PREFILL_DECODE_PORT" \
-    --e-rank 0 \
-    --pd-rank 1 &
+    --encode-servers-urls "http://localhost:$ENCODE_PORT" \
+    --prefill-decode-servers-urls "http://localhost:$PREFILL_DECODE_PORT" \
+    --encode-servers-ranks "$ENCODE_RANK" \
+    --prefill-decode-servers-ranks "$PREFILL_DECODE_RANK" &
 
 wait_for_server $PROXY_PORT
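
Once the 1E1PD stack is up, the proxy can be smoke-tested end to end. A minimal sketch, assuming the proxy on PROXY_PORT (10001 in the script) serves the same OpenAI-compatible /v1/chat/completions endpoint that wait_for_server polls; the image URL and prompt are placeholders:

# Hypothetical smoke test against the proxy started by launch_1e1pd.sh.
# 10001 is the script's PROXY_PORT; the image URL below is a placeholder.
curl -s http://localhost:10001/v1/chat/completions \
    -H "Content-Type: application/json" \
    -d '{
        "model": "/workspace/helper/Qwen2.5-VL-3B-Instruct",
        "messages": [{
            "role": "user",
            "content": [
                {"type": "image_url", "image_url": {"url": "https://example.com/sample.jpg"}},
                {"type": "text", "text": "Describe this image."}
            ]
        }],
        "max_tokens": 64
    }'
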
Lines changed: 72 additions & 0 deletions
@@ -0,0 +1,72 @@
+#!/bin/bash
+
+
+wait_for_server() {
+    local port=$1
+    timeout 12000 bash -c "
+        until curl -s localhost:$port/v1/chat/completions > /dev/null; do
+            sleep 1
+        done" && return 0 || return 1
+}
+
+MODEL="/workspace/helper/Qwen2.5-VL-3B-Instruct"
+LOG_PATH=$LOG_PATH
+
+ENCODE_PORT=19534
+PREFILL_DECODE_PORT_F=19535
+PREFILL_DECODE_PORT_S=19536
+
+ENCODE_RANK=0
+PREFILL_DECODE_RANK_F=1
+PREFILL_DECODE_RANK_S=2
+
+GPU_E="3"
+GPU_PD_F="4"
+GPU_PD_S="5"
+
+PROXY_PORT=10001
+
+START_TIME=$(date +"%Y%m%d_%H%M%S")
+
+redis-server &
+
+CUDA_VISIBLE_DEVICES="$GPU_E" vllm serve "$MODEL" \
+    --gpu-memory-utilization 0.9 \
+    --port "$ENCODE_PORT" \
+    --enable-request-id-headers \
+    --max-num-seqs 128 \
+    --instance-type "encode" \
+    --connector-workers-num 8 \
+    --epd-rank "$ENCODE_RANK" &
+
+CUDA_VISIBLE_DEVICES="$GPU_PD_F" vllm serve "$MODEL" \
+    --gpu-memory-utilization 0.9 \
+    --port "$PREFILL_DECODE_PORT_F" \
+    --enable-request-id-headers \
+    --max-num-seqs 128 \
+    --instance-type "prefill+decode" \
+    --connector-workers-num 8 \
+    --epd-rank "$PREFILL_DECODE_RANK_F" &
+
+CUDA_VISIBLE_DEVICES="$GPU_PD_S" vllm serve "$MODEL" \
+    --gpu-memory-utilization 0.9 \
+    --port "$PREFILL_DECODE_PORT_S" \
+    --enable-request-id-headers \
+    --max-num-seqs 128 \
+    --instance-type "prefill+decode" \
+    --connector-workers-num 8 \
+    --epd-rank "$PREFILL_DECODE_RANK_S" &
+
+wait_for_server $ENCODE_PORT
+wait_for_server $PREFILL_DECODE_PORT_F
+wait_for_server $PREFILL_DECODE_PORT_S
+
+python examples/online_serving/separated_encode/proxy/proxy_aiohttp.py \
+    --host "0.0.0.0" \
+    --port "$PROXY_PORT" \
+    --encode-servers-urls "http://localhost:$ENCODE_PORT" \
+    --prefill-decode-servers-urls "http://localhost:$PREFILL_DECODE_PORT_F,http://localhost:$PREFILL_DECODE_PORT_S" \
+    --encode-servers-ranks "$ENCODE_RANK" \
+    --prefill-decode-servers-ranks "$PREFILL_DECODE_RANK_F,$PREFILL_DECODE_RANK_S" &
+
+wait_for_server $PROXY_PORT
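
The comma-separated --prefill-decode-servers-urls and --prefill-decode-servers-ranks lists are what make the proxy multi-instance; a reasonable reading of the flags is that URLs and ranks are paired positionally. A hedged sketch of extending this script to a third prefill+decode instance, where PREFILL_DECODE_PORT_T, PREFILL_DECODE_RANK_T, and GPU_PD_T are hypothetical names following the script's _F/_S convention (not part of this commit), and the proxy invocation below replaces the one already in the script:

# Hypothetical third prefill+decode instance, appended to the script above.
PREFILL_DECODE_PORT_T=19537
PREFILL_DECODE_RANK_T=3
GPU_PD_T="6"

CUDA_VISIBLE_DEVICES="$GPU_PD_T" vllm serve "$MODEL" \
    --gpu-memory-utilization 0.9 \
    --port "$PREFILL_DECODE_PORT_T" \
    --enable-request-id-headers \
    --max-num-seqs 128 \
    --instance-type "prefill+decode" \
    --connector-workers-num 8 \
    --epd-rank "$PREFILL_DECODE_RANK_T" &

wait_for_server $PREFILL_DECODE_PORT_T

# The proxy would then be launched with the URL and rank lists extended by one
# entry each, assuming positional pairing of URLs to ranks.
python examples/online_serving/separated_encode/proxy/proxy_aiohttp.py \
    --host "0.0.0.0" \
    --port "$PROXY_PORT" \
    --encode-servers-urls "http://localhost:$ENCODE_PORT" \
    --prefill-decode-servers-urls "http://localhost:$PREFILL_DECODE_PORT_F,http://localhost:$PREFILL_DECODE_PORT_S,http://localhost:$PREFILL_DECODE_PORT_T" \
    --encode-servers-ranks "$ENCODE_RANK" \
    --prefill-decode-servers-ranks "$PREFILL_DECODE_RANK_F,$PREFILL_DECODE_RANK_S,$PREFILL_DECODE_RANK_T" &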
