File tree Expand file tree Collapse file tree 3 files changed +141
-0
lines changed
scripts/scheduled_integration_test Expand file tree Collapse file tree 3 files changed +141
-0
lines changed Original file line number Diff line number Diff line change 1+ #! /usr/bin/env bash
set -euxo pipefail

# Positional args (all optional): [THRESHOLD] [NUM_QUESTIONS] [START_PORT]
#   THRESHOLD      minimum accuracy the eval result must reach
#   NUM_QUESTIONS  forwarded to gsm8k_eval.py as --num-questions
#   START_PORT     port for the first server; incremented once per backend
# Env:
#   OUT_DIR        directory that receives the per-backend result JSON files
#
# The defaults must not contain whitespace: a leading space would end up in
# the output path and in the value handed to the accuracy check.
THRESHOLD=${1:-0.25}
NUM_Q=${2:-1319}
PORT=${3:-8010}
OUT_DIR=${OUT_DIR:-/tmp/vllm-scheduled}
mkdir -p "${OUT_DIR}"
10+
#######################################
# Block until http://127.0.0.1:<port>/health answers successfully.
# Globals:   SERVER_PID (read, optional) - default for the pid argument
# Arguments: $1 - port to poll
#            $2 - optional pid of the server; if that process disappears
#                 while we are polling, give up immediately instead of
#                 waiting for the timeout
# Returns:   0 once the health check succeeds; non-zero if the server
#            process exits first or `timeout` fires after 600 seconds
#######################################
wait_for_server() {
  local port=$1
  local pid=${2:-${SERVER_PID:-}}
  # The values are handed to the inner shell as positional parameters rather
  # than spliced into the script text, so no nested quoting is needed.
  timeout 600 bash -c '
    port=$1
    pid=$2
    until curl -sf "http://127.0.0.1:${port}/health" > /dev/null; do
      if [[ -n "$pid" ]] && ! kill -0 "$pid" 2> /dev/null; then
        echo "server (pid ${pid}) exited before becoming healthy" >&2
        exit 1
      fi
      sleep 1
    done' _ "$port" "$pid"
}
18+
# Model served by `vllm serve` in the loop below, and the all2all backends
# (exported one at a time as VLLM_ALL2ALL_BACKEND) it is evaluated with.
# No padding inside the quotes: these strings are used verbatim as the model
# id, the backend name and parts of the result file name.
MODEL="deepseek-ai/DeepSeek-V2-lite"
BACKENDS=("deepep_high_throughput" "deepep_low_latency")
21+
#######################################
# Stop the background server, if one is running.
# Sends SIGTERM, polls for up to ~10 seconds for the process to go away,
# escalates to SIGKILL only if it is still alive, then reaps it.
# Globals:   SERVER_PID (read) - pid of the server; may be unset or empty
# Returns:   0 always (best-effort teardown; also installed as EXIT trap)
#######################################
cleanup() {
  local pid=${SERVER_PID:-}
  local _
  if [[ -n "$pid" ]] && kill -0 "$pid" 2> /dev/null; then
    kill "$pid" 2> /dev/null || true
    for _ in {1..100}; do
      kill -0 "$pid" 2> /dev/null || break
      sleep 0.1
    done
    if kill -0 "$pid" 2> /dev/null; then
      kill -9 "$pid" 2> /dev/null || true
    fi
    # Reap the child so its pid does not linger; ignore its exit status
    # (it was terminated by a signal on purpose).
    wait "$pid" 2> /dev/null || true
  fi
}
trap cleanup EXIT
33+
# Map a model id to a filesystem-safe tag by replacing '/', ':' and spaces
# with '_'. Done with parameter expansion instead of tr: a tr set written as
# '/: \\n' also contains a literal 'n', which rewrites every 'n' in the name.
model_tag() {
  local model=$1
  printf '%s\n' "${model//[\/: ]/_}"
}

# For each all2all backend: start a vLLM server in the background, wait for
# its /health endpoint, run the GSM8K eval against it, check the accuracy
# against THRESHOLD, then stop the server and move on to the next port.
for BACK in "${BACKENDS[@]}"; do
  VLLM_DEEP_GEMM_WARMUP=skip \
  VLLM_ALL2ALL_BACKEND="$BACK" \
  vllm serve "$MODEL" \
    --enforce-eager \
    --tensor-parallel-size 2 \
    --data-parallel-size 2 \
    --enable-expert-parallel \
    --enable-eplb \
    --trust-remote-code \
    --max-model-len 2048 \
    --port "$PORT" &
  SERVER_PID=$!
  wait_for_server "$PORT"

  TAG=$(model_tag "$MODEL")
  OUT="${OUT_DIR}/${TAG}_${BACK}.json"
  python3 tests/evals/gsm8k/gsm8k_eval.py \
    --host http://127.0.0.1 \
    --port "$PORT" \
    --num-questions "${NUM_Q}" \
    --save-results "${OUT}"

  # Quoted delimiter: the Python source is passed through literally and the
  # shell values arrive via argv, so paths or names containing quotes cannot
  # alter the program text.
  python3 - "$OUT" "$THRESHOLD" "$MODEL" "$BACK" << 'PY'
import json
import sys

out_path, threshold, model, backend = sys.argv[1:5]
with open(out_path) as f:
    acc = json.load(f)["accuracy"]
print(f"{model} {backend}: accuracy {acc:.3f}")
assert acc >= float(threshold), f"{model} {backend} accuracy {acc}"
PY

  cleanup
  SERVER_PID=
  sleep 1
  PORT=$((PORT + 1))
done
Original file line number Diff line number Diff line change 1+ #! /usr/bin/env bash
set -euxo pipefail

# Positional args (all optional): [THRESHOLD] [NUM_QUESTIONS] [START_PORT]
#   THRESHOLD      minimum accuracy the eval result must reach
#   NUM_QUESTIONS  forwarded to gsm8k_eval.py as --num-questions
#   START_PORT     port for the first server; incremented once per backend
# Env:
#   OUT_DIR        directory that receives the per-backend result JSON files
#
# The defaults must not contain whitespace: a leading space would end up in
# the output path and in the value handed to the accuracy check.
THRESHOLD=${1:-0.8}
NUM_Q=${2:-1319}
PORT=${3:-8020}
OUT_DIR=${OUT_DIR:-/tmp/vllm-scheduled}
mkdir -p "${OUT_DIR}"
10+
#######################################
# Block until http://127.0.0.1:<port>/health answers successfully.
# Globals:   SERVER_PID (read, optional) - default for the pid argument
# Arguments: $1 - port to poll
#            $2 - optional pid of the server; if that process disappears
#                 while we are polling, give up immediately instead of
#                 waiting for the timeout
# Returns:   0 once the health check succeeds; non-zero if the server
#            process exits first or `timeout` fires after 600 seconds
#######################################
wait_for_server() {
  local port=$1
  local pid=${2:-${SERVER_PID:-}}
  # The values are handed to the inner shell as positional parameters rather
  # than spliced into the script text, so no nested quoting is needed.
  timeout 600 bash -c '
    port=$1
    pid=$2
    until curl -sf "http://127.0.0.1:${port}/health" > /dev/null; do
      if [[ -n "$pid" ]] && ! kill -0 "$pid" 2> /dev/null; then
        echo "server (pid ${pid}) exited before becoming healthy" >&2
        exit 1
      fi
      sleep 1
    done' _ "$port" "$pid"
}
18+
# Model served by `vllm serve` in the loop below, and the all2all backends
# (exported one at a time as VLLM_ALL2ALL_BACKEND) it is evaluated with.
# No padding inside the quotes: these strings are used verbatim as the model
# id, the backend name and parts of the result file name.
MODEL="QWen/Qwen3-30B-A3B-FP8"
BACKENDS=("deepep_high_throughput" "deepep_low_latency")
21+
#######################################
# Stop the background server, if one is running.
# Sends SIGTERM, polls for up to ~10 seconds for the process to go away,
# escalates to SIGKILL only if it is still alive, then reaps it.
# Globals:   SERVER_PID (read) - pid of the server; may be unset or empty
# Returns:   0 always (best-effort teardown; also installed as EXIT trap)
#######################################
cleanup() {
  local pid=${SERVER_PID:-}
  local _
  if [[ -n "$pid" ]] && kill -0 "$pid" 2> /dev/null; then
    kill "$pid" 2> /dev/null || true
    for _ in {1..100}; do
      kill -0 "$pid" 2> /dev/null || break
      sleep 0.1
    done
    if kill -0 "$pid" 2> /dev/null; then
      kill -9 "$pid" 2> /dev/null || true
    fi
    # Reap the child so its pid does not linger; ignore its exit status
    # (it was terminated by a signal on purpose).
    wait "$pid" 2> /dev/null || true
  fi
}
trap cleanup EXIT
33+
# Map a model id to a filesystem-safe tag by replacing '/', ':' and spaces
# with '_'. Done with parameter expansion instead of tr: a tr set written as
# '/: \\n' also contains a literal 'n', which rewrites every 'n' in the name.
model_tag() {
  local model=$1
  printf '%s\n' "${model//[\/: ]/_}"
}

# For each all2all backend: start a vLLM server in the background, wait for
# its /health endpoint, run the GSM8K eval against it, check the accuracy
# against THRESHOLD, then stop the server and move on to the next port.
for BACK in "${BACKENDS[@]}"; do
  VLLM_DEEP_GEMM_WARMUP=skip \
  VLLM_ALL2ALL_BACKEND="$BACK" \
  vllm serve "$MODEL" \
    --enforce-eager \
    --tensor-parallel-size 2 \
    --data-parallel-size 2 \
    --enable-expert-parallel \
    --trust-remote-code \
    --max-model-len 2048 \
    --port "$PORT" &
  SERVER_PID=$!
  wait_for_server "$PORT"

  TAG=$(model_tag "$MODEL")
  OUT="${OUT_DIR}/${TAG}_${BACK}.json"
  python3 tests/evals/gsm8k/gsm8k_eval.py \
    --host http://127.0.0.1 \
    --port "$PORT" \
    --num-questions "${NUM_Q}" \
    --save-results "${OUT}"

  # Quoted delimiter: the Python source is passed through literally and the
  # shell values arrive via argv, so paths or names containing quotes cannot
  # alter the program text.
  python3 - "$OUT" "$THRESHOLD" "$MODEL" "$BACK" << 'PY'
import json
import sys

out_path, threshold, model, backend = sys.argv[1:5]
with open(out_path) as f:
    acc = json.load(f)["accuracy"]
print(f"{model} {backend}: accuracy {acc:.3f}")
assert acc >= float(threshold), f"{model} {backend} accuracy {acc}"
PY

  cleanup
  SERVER_PID=
  sleep 1
  PORT=$((PORT + 1))
done
Original file line number Diff line number Diff line change @@ -1234,3 +1234,21 @@ steps:
12341234 - .buildkite/scripts/run-prime-rl-test.sh
12351235 commands :
12361236 - bash .buildkite/scripts/run-prime-rl-test.sh
1237+
# Optional H100 step that runs the DeepSeek V2-Lite accuracy script.
# Script arguments: accuracy threshold 0.25, 200 questions, first port 8010.
- label: DeepSeek V2-Lite Accuracy
  timeout_in_minutes: 60
  gpu: h100
  optional: true
  num_gpus: 4
  working_dir: "/vllm-workspace"
  commands:
  - bash .buildkite/scripts/scheduled_integration_test/deepseek_v2_lite_ep_eplb.sh 0.25 200 8010
1246+
# Optional H100 step that runs the Qwen3-30B-A3B-FP8 accuracy script.
# Script arguments: accuracy threshold 0.8, 200 questions, first port 8020.
- label: Qwen3-30B-A3B-FP8-block Accuracy
  timeout_in_minutes: 60
  gpu: h100
  optional: true
  num_gpus: 4
  working_dir: "/vllm-workspace"
  commands:
  - bash .buildkite/scripts/scheduled_integration_test/qwen30b_a3b_fp8_block_ep.sh 0.8 200 8020
You can’t perform that action at this time.
0 commit comments