Commit bad5387: example slurm script for submitting jobs
1 parent 5b45943
3 files changed: 254 additions, 0 deletions
benchmark.sh: 32 additions, 0 deletions
#!/bin/bash
#SBATCH --time=2:00:00
#SBATCH --partition=batch_short
#SBATCH --gres=gpu:8
#SBATCH --tasks=1
#SBATCH --nodes=1
#SBATCH --ntasks-per-node=1
#SBATCH --exclusive
#SBATCH --output=benchmark-slurm-output-%j.txt
#SBATCH --error=benchmark-slurm-error-%j.txt

set -eux
set -o pipefail

mkdir -p ${OUTPUT_HOST_DIR}/${SLURM_JOB_ID}

srun \
  --container-image=${CONTAINER_IMAGE} \
  --container-mounts=${CACHE_HOST_DIR}:${CACHE_CONTAINER_DIR},${OUTPUT_HOST_DIR}:${OUTPUT_CONTAINER_DIR} \
  --no-container-mount-home \
  mlperf-inf-mm-vl2l benchmark vllm \
    --settings.test.scenario=${SCENARIO} \
    --settings.test.mode=${MODE} \
    --dataset.token=${DATASET_TOKEN} \
    --vllm.model.repo_id=${MODEL_REPO_ID} \
    --vllm.cli=--async-scheduling \
    --vllm.cli=--max-model-len=32768 \
    --vllm.cli=--max-num-seqs=1024 \
    --vllm.cli=--mm-encoder-tp-mode=data \
    --vllm.cli=--limit-mm-per-prompt.video=0 \
    --vllm.cli=--tensor-parallel-size=8 \
    --settings.logging.log_output.outdir=${OUTPUT_CONTAINER_DIR}/${SLURM_JOB_ID}
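
For reference, this batch script is driven entirely by environment variables handed to it via sbatch --export, which is how the submission script at the end of this commit invokes it. A minimal manual submission might look like the sketch below; the account, partition, image reference, token, and paths are placeholders, and the batch script is assumed to be saved as benchmark.sh.

# Placeholder values; substitute your own account, partition, image, token, and paths.
container_image=registry.example.com/mlperf-inf-mm-vl2l:latest
cache_host_dir=/scratch/${USER}/.cache
output_host_dir=${PWD}/outputs
dataset_token=hf_xxx

mkdir -p "${output_host_dir}"

sbatch \
  --account=my_account \
  --partition=batch_short \
  --export=CONTAINER_IMAGE="${container_image}",CACHE_HOST_DIR="${cache_host_dir}",CACHE_CONTAINER_DIR=/root/.cache,OUTPUT_HOST_DIR="${output_host_dir}",OUTPUT_CONTAINER_DIR=/outputs,SCENARIO=offline,MODE=accuracy_only,DATASET_TOKEN="${dataset_token}",MODEL_REPO_ID=Qwen/Qwen3-VL-235B-A22B-Instruct \
  benchmark.sh

Note that the srun options --container-image, --container-mounts, and --no-container-mount-home assume the cluster provides a container plugin such as NVIDIA Pyxis with enroot; without it, srun will reject those options.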
evaluate.sh: 18 additions, 0 deletions
#!/bin/bash
#SBATCH --time=1:00:00
#SBATCH --partition=cpu_short
#SBATCH --nodes=1
#SBATCH --tasks=1
#SBATCH --ntasks-per-node=1
#SBATCH --cpus-per-task=8
#SBATCH --mem-per-cpu=16G
#SBATCH --output=evaluate-slurm-output-%j.txt
#SBATCH --error=evaluate-slurm-error-%j.txt

# BENCHMARK_JOB_ID is exported by the submission script and identifies the
# benchmark job whose output directory contains mlperf_log_accuracy.json.
srun \
  --container-image=${CONTAINER_IMAGE} \
  --container-mounts=${CACHE_HOST_DIR}:${CACHE_CONTAINER_DIR},${OUTPUT_HOST_DIR}:${OUTPUT_CONTAINER_DIR} \
  --no-container-mount-home \
  mlperf-inf-mm-vl2l evaluate \
    --dataset.token=${DATASET_TOKEN} \
    --filename=${OUTPUT_CONTAINER_DIR}/${BENCHMARK_JOB_ID}/mlperf_log_accuracy.json
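
The evaluation step only reads mlperf_log_accuracy.json from the benchmark job's output directory, so it can be chained onto the benchmark with a Slurm job dependency, which is what the submission script below automates. A stand-alone sketch of that chaining, reusing the same placeholder values as in the previous sketch and passing BENCHMARK_JOB_ID through so the evaluation can find the right output directory:

# Same placeholder values as in the previous sketch.
container_image=registry.example.com/mlperf-inf-mm-vl2l:latest
cache_host_dir=/scratch/${USER}/.cache
output_host_dir=${PWD}/outputs
dataset_token=hf_xxx

# Submit the benchmark and capture its job ID (--parsable makes sbatch print only the ID).
benchmark_job_id=$(sbatch --parsable \
  --account=my_account \
  --partition=batch_short \
  --export=CONTAINER_IMAGE="${container_image}",CACHE_HOST_DIR="${cache_host_dir}",CACHE_CONTAINER_DIR=/root/.cache,OUTPUT_HOST_DIR="${output_host_dir}",OUTPUT_CONTAINER_DIR=/outputs,SCENARIO=offline,MODE=accuracy_only,DATASET_TOKEN="${dataset_token}",MODEL_REPO_ID=Qwen/Qwen3-VL-235B-A22B-Instruct \
  benchmark.sh)

# Start the evaluation only if the benchmark job completes successfully.
sbatch \
  --dependency=afterok:"${benchmark_job_id}" \
  --account=my_account \
  --partition=cpu_short \
  --export=CONTAINER_IMAGE="${container_image}",CACHE_HOST_DIR="${cache_host_dir}",CACHE_CONTAINER_DIR=/root/.cache,OUTPUT_HOST_DIR="${output_host_dir}",OUTPUT_CONTAINER_DIR=/outputs,DATASET_TOKEN="${dataset_token}",BENCHMARK_JOB_ID="${benchmark_job_id}" \
  evaluate.sh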
Job submission script: 204 additions, 0 deletions
#!/bin/bash

set -eux
set -o pipefail

DEFAULT_CONTAINER_IMAGE=""
container_image=${DEFAULT_CONTAINER_IMAGE}

DEFAULT_DATASET_TOKEN=""
dataset_token=${DEFAULT_DATASET_TOKEN}

DEFAULT_MODEL_REPO_ID=Qwen/Qwen3-VL-235B-A22B-Instruct
model_repo_id=${DEFAULT_MODEL_REPO_ID}

DEFAULT_SCENARIO=offline
scenario=${DEFAULT_SCENARIO}

DEFAULT_MODE=accuracy_only
mode=${DEFAULT_MODE}

DEFAULT_CACHE_HOST_DIR=""
cache_host_dir=${DEFAULT_CACHE_HOST_DIR}

DEFAULT_OUTPUT_HOST_DIR=$(pwd)/outputs
output_host_dir=${DEFAULT_OUTPUT_HOST_DIR}

DEFAULT_SLURM_ACCOUNT=""
slurm_account=${DEFAULT_SLURM_ACCOUNT}

DEFAULT_BENCHMARK_SLURM_PARTITION=""
benchmark_slurm_partition=${DEFAULT_BENCHMARK_SLURM_PARTITION}

DEFAULT_EVALUATE_SLURM_PARTITION=""
evaluate_slurm_partition=${DEFAULT_EVALUATE_SLURM_PARTITION}

function _exit_with_help_msg() {
  cat <<EOF
Submit a benchmarking job (and, optionally, an evaluation job) for the VL2L benchmark.

Usage: ${BASH_SOURCE[0]}
  [-ci | --container-image] Container image to run the benchmark (default: ${DEFAULT_CONTAINER_IMAGE}).
  [-dt | --dataset-token] Access token for the Shopify Global Catalogue dataset (default: ${DEFAULT_DATASET_TOKEN}).
  [-mri | --model-repo-id] HuggingFace repo ID of the model to benchmark (default: ${DEFAULT_MODEL_REPO_ID}).
  [-s | --scenario] Benchmark scenario (default: ${DEFAULT_SCENARIO}).
  [-m | --mode] Benchmark mode (default: ${DEFAULT_MODE}).
  [-chd | --cache-host-dir] Host directory used as the .cache directory, to which HuggingFace downloads the dataset and the model checkpoint and vLLM writes compilation artifacts (default: ${DEFAULT_CACHE_HOST_DIR}).
  [-ohd | --output-host-dir] Host directory to which the benchmark and evaluation results are written (default: ${DEFAULT_OUTPUT_HOST_DIR}).
  [-sa | --slurm-account] Slurm account for submitting the benchmark and evaluation jobs (default: ${DEFAULT_SLURM_ACCOUNT}).
  [-bsp | --benchmark-slurm-partition] Slurm partition for the benchmarking job; usually a partition with GPU nodes (default: ${DEFAULT_BENCHMARK_SLURM_PARTITION}).
  [-esp | --evaluate-slurm-partition] Slurm partition for the evaluation job; usually a partition with CPU-only nodes (default: ${DEFAULT_EVALUATE_SLURM_PARTITION}).
  [-h | --help] Print this help message.
EOF
  if [ -n "$1" ]; then
    echo "$(tput bold setab 1)$1$(tput sgr0)"
  fi
  exit "$2"
}

while [[ $# -gt 0 ]]; do
  case $1 in
    -ci | --container-image)
      container_image=$2
      shift
      shift
      ;;
    -ci=* | --container-image=*)
      container_image=${1#*=}
      shift
      ;;
    -dt | --dataset-token)
      dataset_token=$2
      shift
      shift
      ;;
    -dt=* | --dataset-token=*)
      dataset_token=${1#*=}
      shift
      ;;
    -mri | --model-repo-id)
      model_repo_id=$2
      shift
      shift
      ;;
    -mri=* | --model-repo-id=*)
      model_repo_id=${1#*=}
      shift
      ;;
    -s | --scenario)
      scenario=$2
      shift
      shift
      ;;
    -s=* | --scenario=*)
      scenario=${1#*=}
      shift
      ;;
    -m | --mode)
      mode=$2
      shift
      shift
      ;;
    -m=* | --mode=*)
      mode=${1#*=}
      shift
      ;;
    -chd | --cache-host-dir)
      cache_host_dir=$2
      shift
      shift
      ;;
    -chd=* | --cache-host-dir=*)
      cache_host_dir=${1#*=}
      shift
      ;;
    -ohd | --output-host-dir)
      output_host_dir=$2
      shift
      shift
      ;;
    -ohd=* | --output-host-dir=*)
      output_host_dir=${1#*=}
      shift
      ;;
    -sa | --slurm-account)
      slurm_account=$2
      shift
      shift
      ;;
    -sa=* | --slurm-account=*)
      slurm_account=${1#*=}
      shift
      ;;
    -bsp | --benchmark-slurm-partition)
      benchmark_slurm_partition=$2
      shift
      shift
      ;;
    -bsp=* | --benchmark-slurm-partition=*)
      benchmark_slurm_partition=${1#*=}
      shift
      ;;
    -esp | --evaluate-slurm-partition)
      evaluate_slurm_partition=$2
      shift
      shift
      ;;
    -esp=* | --evaluate-slurm-partition=*)
      evaluate_slurm_partition=${1#*=}
      shift
      ;;
    -h | --help)
      _exit_with_help_msg "" 0
      ;;
    *)
      _exit_with_help_msg "[ERROR] Unknown option: $1" 1
      ;;
  esac
done

if [[ -z "${container_image}" ]]; then
  _exit_with_help_msg "[ERROR] -ci or --container-image is required." 1
fi

if [[ -z "${dataset_token}" ]]; then
  _exit_with_help_msg "[ERROR] -dt or --dataset-token is required." 1
fi

if [[ -z "${cache_host_dir}" ]]; then
  _exit_with_help_msg "[ERROR] -chd or --cache-host-dir is required." 1
fi

if [[ -z "${slurm_account}" ]]; then
  _exit_with_help_msg "[ERROR] -sa or --slurm-account is required." 1
fi

if [[ -z "${benchmark_slurm_partition}" ]]; then
  _exit_with_help_msg "[ERROR] -bsp or --benchmark-slurm-partition is required." 1
fi

if [[ -z "${evaluate_slurm_partition}" ]]; then
  _exit_with_help_msg "[ERROR] -esp or --evaluate-slurm-partition is required." 1
fi

cache_container_dir=/root/.cache
output_container_dir=/outputs

mkdir -p "${output_host_dir}"

# Submit the benchmark job; --parsable makes sbatch print only the job ID.
benchmark_job_id=$(
  sbatch --parsable \
    --export=CACHE_HOST_DIR="${cache_host_dir}",CACHE_CONTAINER_DIR="${cache_container_dir}",OUTPUT_HOST_DIR="${output_host_dir}",OUTPUT_CONTAINER_DIR="${output_container_dir}",CONTAINER_IMAGE="${container_image}",SCENARIO="${scenario}",MODE="${mode}",DATASET_TOKEN="${dataset_token}",MODEL_REPO_ID="${model_repo_id}" \
    --account="${slurm_account}" \
    --partition="${benchmark_slurm_partition}" \
    benchmark.sh
)

if [[ "${mode}" == "accuracy_only" ]]; then
  # BENCHMARK_JOB_ID lets evaluate.sh locate the benchmark job's output directory.
  sbatch \
    --dependency=afterok:"${benchmark_job_id}" \
    --export=CACHE_HOST_DIR="${cache_host_dir}",CACHE_CONTAINER_DIR="${cache_container_dir}",OUTPUT_HOST_DIR="${output_host_dir}",OUTPUT_CONTAINER_DIR="${output_container_dir}",CONTAINER_IMAGE="${container_image}",DATASET_TOKEN="${dataset_token}",BENCHMARK_JOB_ID="${benchmark_job_id}" \
    --account="${slurm_account}" \
    --partition="${evaluate_slurm_partition}" \
    evaluate.sh
fi
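
Putting it together, a typical invocation of the submission script might look like the following. Every value shown is a placeholder, and submit.sh is a hypothetical name for this file, which is not visible in this view of the commit.

# "submit.sh" is a hypothetical name for this submission script; all values are placeholders.
./submit.sh \
  --container-image=registry.example.com/mlperf-inf-mm-vl2l:latest \
  --dataset-token=hf_xxx \
  --cache-host-dir=/scratch/${USER}/.cache \
  --output-host-dir=${PWD}/outputs \
  --slurm-account=my_account \
  --benchmark-slurm-partition=batch_short \
  --evaluate-slurm-partition=cpu_short

With the default scenario (offline) and mode (accuracy_only), this queues the benchmark job and then an evaluation job that starts only after the benchmark completes successfully. Because sbatch is given benchmark.sh and evaluate.sh by relative path, the submission script should be run from the directory that contains them.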
