Skip to content
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
58 changes: 58 additions & 0 deletions examples/online_serving/separated_encode/launch_1e1pd.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,58 @@
#!/bin/bash


# Block until the server on the given port answers a health probe.
# Polls GET /health (the readiness endpoint of the vLLM OpenAI-compatible
# server); the previous target, /v1/chat/completions, only accepts POST and
# answers GET with 405, so /health is the correct liveness route.
# Arguments: $1 - local TCP port to probe
# Returns:   0 once the server responds, 1 if the 12000s timeout elapses
wait_for_server() {
  local port=$1
  timeout 12000 bash -c "
    until curl -s localhost:${port}/health > /dev/null; do
      sleep 1
    done" && return 0 || return 1
}

# ---------------------------------------------------------------------------
# Topology: 1 encode instance (rank 0) + 1 prefill+decode instance (rank 1),
# fronted by an aiohttp proxy. Redis coordinates the EPD connector.
# ---------------------------------------------------------------------------
MODEL="/workspace/helper/Qwen2.5-VL-3B-Instruct"
LOG_PATH=$LOG_PATH  # NOTE(review): no-op self-assignment; presumably consumed by the servers — confirm
ENCODE_PORT=19534
ENCODE_RANK=0
PREFILL_DECODE_PORT=19535
PREFILL_DECODE_RANK=1
PROXY_PORT=10001
GPU_E="4"   # GPU for the encode instance
GPU_PD="5"  # GPU for the prefill+decode instance
export REDIS_HOST="localhost"
export REDIS_PORT="6379"

START_TIME=$(date +"%Y%m%d_%H%M%S")  # NOTE(review): unused below — confirm before removing

# Metadata/handshake store shared by the EPD instances.
redis-server --bind "$REDIS_HOST" --port "$REDIS_PORT" &

# Encode-only vLLM instance.
CUDA_VISIBLE_DEVICES="$GPU_E" vllm serve "$MODEL" \
    --gpu-memory-utilization 0.9 \
    --port "$ENCODE_PORT" \
    --enable-request-id-headers \
    --max-num-seqs 128 \
    --instance-type "encode" \
    --connector-workers-num 8 \
    --epd-rank "$ENCODE_RANK" &

# Prefill+decode vLLM instance.
CUDA_VISIBLE_DEVICES="$GPU_PD" vllm serve "$MODEL" \
    --gpu-memory-utilization 0.9 \
    --port "$PREFILL_DECODE_PORT" \
    --enable-request-id-headers \
    --max-num-seqs 128 \
    --instance-type "prefill+decode" \
    --connector-workers-num 8 \
    --epd-rank "$PREFILL_DECODE_RANK" &

# Wait for both backends before exposing them through the proxy.
wait_for_server "$ENCODE_PORT"
wait_for_server "$PREFILL_DECODE_PORT"

# Proxy routes requests between the encode and prefill+decode backends.
python examples/online_serving/separated_encode/proxy/proxy_aiohttp.py \
    --host "0.0.0.0" \
    --port "$PROXY_PORT" \
    --encode-servers-urls "http://localhost:$ENCODE_PORT" \
    --prefill-decode-servers-urls "http://localhost:$PREFILL_DECODE_PORT" \
    --encode-servers-ranks "$ENCODE_RANK" \
    --prefill-decode-servers-ranks "$PREFILL_DECODE_RANK" &

wait_for_server "$PROXY_PORT"
75 changes: 75 additions & 0 deletions examples/online_serving/separated_encode/launch_1e2pd.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,75 @@
#!/bin/bash


# Block until the server on the given port answers a health probe.
# Polls GET /health; /v1/chat/completions is POST-only and returns 405 to a
# plain GET, so it is not a valid readiness target.
# Arguments: $1 - local TCP port to probe
# Returns:   0 once the server responds, 1 if the 12000s timeout elapses
wait_for_server() {
  local port=$1
  timeout 12000 bash -c "
    until curl -s localhost:${port}/health > /dev/null; do
      sleep 1
    done" && return 0 || return 1
}

# ---------------------------------------------------------------------------
# Topology: 1 encode instance (rank 0) + 2 prefill+decode instances
# (ranks 1 and 2), fronted by an aiohttp proxy. Redis coordinates the
# EPD connector.
# ---------------------------------------------------------------------------
MODEL="/workspace/helper/Qwen2.5-VL-3B-Instruct"
LOG_PATH=$LOG_PATH  # NOTE(review): no-op self-assignment; presumably consumed by the servers — confirm

ENCODE_PORT=19534
PREFILL_DECODE_PORT_F=19535  # first prefill+decode instance
PREFILL_DECODE_PORT_S=19536  # second prefill+decode instance

ENCODE_RANK=0
PREFILL_DECODE_RANK_F=1
PREFILL_DECODE_RANK_S=2

GPU_E="3"     # GPU for the encode instance
GPU_PD_F="4"  # GPU for the first prefill+decode instance
GPU_PD_S="5"  # GPU for the second prefill+decode instance

PROXY_PORT=10001

export REDIS_HOST="localhost"
export REDIS_PORT="6379"

START_TIME=$(date +"%Y%m%d_%H%M%S")  # NOTE(review): unused below — confirm before removing

# Metadata/handshake store shared by the EPD instances.
redis-server --bind "$REDIS_HOST" --port "$REDIS_PORT" &

# Encode-only vLLM instance.
CUDA_VISIBLE_DEVICES="$GPU_E" vllm serve "$MODEL" \
    --gpu-memory-utilization 0.9 \
    --port "$ENCODE_PORT" \
    --enable-request-id-headers \
    --max-num-seqs 128 \
    --instance-type "encode" \
    --connector-workers-num 8 \
    --epd-rank "$ENCODE_RANK" &

# First prefill+decode vLLM instance.
CUDA_VISIBLE_DEVICES="$GPU_PD_F" vllm serve "$MODEL" \
    --gpu-memory-utilization 0.9 \
    --port "$PREFILL_DECODE_PORT_F" \
    --enable-request-id-headers \
    --max-num-seqs 128 \
    --instance-type "prefill+decode" \
    --connector-workers-num 8 \
    --epd-rank "$PREFILL_DECODE_RANK_F" &

# Second prefill+decode vLLM instance.
CUDA_VISIBLE_DEVICES="$GPU_PD_S" vllm serve "$MODEL" \
    --gpu-memory-utilization 0.9 \
    --port "$PREFILL_DECODE_PORT_S" \
    --enable-request-id-headers \
    --max-num-seqs 128 \
    --instance-type "prefill+decode" \
    --connector-workers-num 8 \
    --epd-rank "$PREFILL_DECODE_RANK_S" &

# Wait for all backends before exposing them through the proxy.
wait_for_server "$ENCODE_PORT"
wait_for_server "$PREFILL_DECODE_PORT_F"
wait_for_server "$PREFILL_DECODE_PORT_S"

# Proxy routes requests across the backends; URL and rank lists are
# comma-separated with no embedded spaces.
python examples/online_serving/separated_encode/proxy/proxy_aiohttp.py \
    --host "0.0.0.0" \
    --port "$PROXY_PORT" \
    --encode-servers-urls "http://localhost:$ENCODE_PORT" \
    --prefill-decode-servers-urls "http://localhost:$PREFILL_DECODE_PORT_F,http://localhost:$PREFILL_DECODE_PORT_S" \
    --encode-servers-ranks "$ENCODE_RANK" \
    --prefill-decode-servers-ranks "$PREFILL_DECODE_RANK_F,$PREFILL_DECODE_RANK_S" &

wait_for_server "$PROXY_PORT"
75 changes: 75 additions & 0 deletions examples/online_serving/separated_encode/launch_2e1pd.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,75 @@
#!/bin/bash


# Block until the server on the given port answers a health probe.
# Polls GET /health; /v1/chat/completions is POST-only and returns 405 to a
# plain GET, so it is not a valid readiness target.
# Arguments: $1 - local TCP port to probe
# Returns:   0 once the server responds, 1 if the 12000s timeout elapses
wait_for_server() {
  local port=$1
  timeout 12000 bash -c "
    until curl -s localhost:${port}/health > /dev/null; do
      sleep 1
    done" && return 0 || return 1
}

# ---------------------------------------------------------------------------
# Topology: 2 encode instances (ranks 0, 1) + 1 prefill+decode instance
# (rank 2), fronted by an aiohttp proxy. Redis coordinates the EPD connector.
# ---------------------------------------------------------------------------
MODEL="/workspace/helper/Qwen2.5-VL-3B-Instruct"
LOG_PATH=$LOG_PATH  # NOTE(review): no-op self-assignment; presumably consumed by the servers — confirm

ENCODE_PORT_F=19534  # first encode instance
ENCODE_PORT_S=19535  # second encode instance
PREFILL_DECODE_PORT=19536

ENCODE_RANK_F=0
ENCODE_RANK_S=1
PREFILL_DECODE_RANK=2

GPU_E_F="3"  # GPU for the first encode instance
GPU_E_S="4"  # GPU for the second encode instance
GPU_PD="5"   # GPU for the prefill+decode instance

PROXY_PORT=10001

export REDIS_HOST="localhost"
export REDIS_PORT="6379"

START_TIME=$(date +"%Y%m%d_%H%M%S")  # NOTE(review): unused below — confirm before removing

# Metadata/handshake store shared by the EPD instances.
redis-server --bind "$REDIS_HOST" --port "$REDIS_PORT" &

# First encode-only vLLM instance.
CUDA_VISIBLE_DEVICES="$GPU_E_F" vllm serve "$MODEL" \
    --gpu-memory-utilization 0.9 \
    --port "$ENCODE_PORT_F" \
    --enable-request-id-headers \
    --max-num-seqs 128 \
    --instance-type "encode" \
    --connector-workers-num 8 \
    --epd-rank "$ENCODE_RANK_F" &

# Second encode-only vLLM instance.
CUDA_VISIBLE_DEVICES="$GPU_E_S" vllm serve "$MODEL" \
    --gpu-memory-utilization 0.9 \
    --port "$ENCODE_PORT_S" \
    --enable-request-id-headers \
    --max-num-seqs 128 \
    --instance-type "encode" \
    --connector-workers-num 8 \
    --epd-rank "$ENCODE_RANK_S" &

# Prefill+decode vLLM instance.
CUDA_VISIBLE_DEVICES="$GPU_PD" vllm serve "$MODEL" \
    --gpu-memory-utilization 0.9 \
    --port "$PREFILL_DECODE_PORT" \
    --enable-request-id-headers \
    --max-num-seqs 128 \
    --instance-type "prefill+decode" \
    --connector-workers-num 8 \
    --epd-rank "$PREFILL_DECODE_RANK" &

# Wait for all backends before exposing them through the proxy.
wait_for_server "$ENCODE_PORT_F"
wait_for_server "$ENCODE_PORT_S"
wait_for_server "$PREFILL_DECODE_PORT"

# Proxy routes requests across the backends. Rank/URL lists are
# comma-separated with no embedded spaces (the stray space previously in
# --encode-servers-ranks was inconsistent with the other launch scripts and
# could break integer parsing in the proxy).
python examples/online_serving/separated_encode/proxy/proxy_aiohttp.py \
    --host "0.0.0.0" \
    --port "$PROXY_PORT" \
    --encode-servers-urls "http://localhost:$ENCODE_PORT_F,http://localhost:$ENCODE_PORT_S" \
    --prefill-decode-servers-urls "http://localhost:$PREFILL_DECODE_PORT" \
    --encode-servers-ranks "$ENCODE_RANK_F,$ENCODE_RANK_S" \
    --prefill-decode-servers-ranks "$PREFILL_DECODE_RANK" &

wait_for_server "$PROXY_PORT"
54 changes: 54 additions & 0 deletions examples/online_serving/separated_encode/launch_epd_serve.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,54 @@
#!/bin/bash


# Block until the server on the given port answers a health probe.
# Polls GET /health (the readiness endpoint of the vLLM OpenAI-compatible
# server); /v1/chat/completions only accepts POST and answers GET with 405,
# so it is not a valid readiness target.
# Arguments: $1 - local TCP port to probe
# Returns:   0 once the server responds, 1 if the 12000s timeout elapses
wait_for_server() {
  local port=$1
  timeout 12000 bash -c "
    until curl -s localhost:${port}/health > /dev/null; do
      sleep 1
    done" && return 0 || return 1
}

# ---------------------------------------------------------------------------
# Topology: encode (rank 0) and prefill+decode (rank 1) instances sharing one
# GPU, fronted by an aiohttp proxy. GPU memory is split between them
# (0.2 encode + 0.7 prefill+decode), so startup is sequential: the encode
# instance must finish allocating before the second instance launches.
# ---------------------------------------------------------------------------
MODEL="/workspace/helper/Qwen2.5-VL-3B-Instruct"
LOG_PATH=$LOG_PATH  # NOTE(review): no-op self-assignment; presumably consumed by the servers — confirm
ENCODE_PORT=19534
PREFILL_DECODE_PORT=19535
PROXY_PORT=10001
GPU="5"  # single GPU shared by both instances
export REDIS_HOST="localhost"
export REDIS_PORT="6379"
START_TIME=$(date +"%Y%m%d_%H%M%S")  # NOTE(review): unused below — confirm before removing

# Metadata/handshake store shared by the EPD instances.
redis-server --bind "$REDIS_HOST" --port "$REDIS_PORT" &

# Encode-only instance: small memory fraction and batch, since it only runs
# the vision encoder.
CUDA_VISIBLE_DEVICES="$GPU" vllm serve "$MODEL" \
    --gpu-memory-utilization 0.2 \
    --port "$ENCODE_PORT" \
    --enable-request-id-headers \
    --max-num-seqs 32 \
    --instance-type "encode" \
    --connector-workers-num 8 \
    --epd-rank 0 &

# Wait before starting the co-located instance so memory fractions don't race.
wait_for_server "$ENCODE_PORT"

# Prefill+decode instance on the same GPU with the remaining memory budget.
CUDA_VISIBLE_DEVICES="$GPU" vllm serve "$MODEL" \
    --gpu-memory-utilization 0.7 \
    --port "$PREFILL_DECODE_PORT" \
    --enable-request-id-headers \
    --max-num-seqs 128 \
    --instance-type "prefill+decode" \
    --connector-workers-num 8 \
    --epd-rank 1 &

wait_for_server "$PREFILL_DECODE_PORT"

# Proxy routes requests between the encode and prefill+decode backends.
python examples/online_serving/separated_encode/proxy/proxy_aiohttp.py \
    --host "0.0.0.0" \
    --port "$PROXY_PORT" \
    --encode-servers-urls "http://localhost:$ENCODE_PORT" \
    --prefill-decode-servers-urls "http://localhost:$PREFILL_DECODE_PORT" \
    --encode-servers-ranks "0" \
    --prefill-decode-servers-ranks "1" &

wait_for_server "$PROXY_PORT"
Loading