#!/bin/bash
# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0
# Configuration and positional arguments for this launcher.
#
# Environment (optional):
#   MULTI_ROUND  forwarded to scripts/bench.sh (default: 8)
#   MOUNT_DIR    directory mounted into the container and used as the work
#                directory (default: current directory)
#
# Positional arguments (all 21 are required, in this order):
#    1 num_ctx_servers          12 eplb_num_slots
#    2 ctx_tp_size              13 mtp_size
#    3 ctx_batch_size           14 concurrency_list
#    4 ctx_max_num_tokens       15 gen_nodes
#    5 ctx_enable_attention_dp  16 kind
#    6 num_gen_servers          17 model_path
#    7 gen_tp_size              18 served_model_name
#    8 gen_batch_size           19 image
#    9 gen_max_num_tokens       20 isl
#   10 gen_enable_attention_dp  21 osl
#   11 gen_gpu_memory_fraction
MULTI_ROUND="${MULTI_ROUND:-8}"

# set MOUNT_DIR
MOUNT_DIR="${MOUNT_DIR:-${PWD}}"
CONTAINER_NAME=disaggr-test

STREAMING=true
CTX_GPU_FRAC=0.75
CACHE_TRANSCEIVER_MAX_NUM_TOKENS=8448

# Fail fast instead of silently running with shifted or empty arguments
# (an empty isl would otherwise evaluate to 0 in the arithmetic below).
if (( $# < 21 )); then
  printf 'Error: expected 21 positional arguments, got %d\n' "$#" >&2
  printf 'Usage: %s num_ctx_servers ctx_tp_size ctx_batch_size ctx_max_num_tokens ctx_enable_attention_dp num_gen_servers gen_tp_size gen_batch_size gen_max_num_tokens gen_enable_attention_dp gen_gpu_memory_fraction eplb_num_slots mtp_size concurrency_list gen_nodes kind model_path served_model_name image isl osl\n' "${0##*/}" >&2
  exit 2
fi

num_ctx_servers=$1
ctx_tp_size=$2
ctx_batch_size=$3
ctx_max_num_tokens=$4
ctx_enable_attention_dp=$5
num_gen_servers=$6
gen_tp_size=$7
gen_batch_size=$8
gen_max_num_tokens=$9
gen_enable_attention_dp=${10}
gen_gpu_memory_fraction=${11}
eplb_num_slots=${12}
mtp_size=${13}
concurrency_list=${14}
gen_nodes=${15}
kind=${16}
model_path=${17}
served_model_name=${18}
image=${19}
isl=${20}
osl=${21}

# isl and osl feed shell arithmetic, so they must be plain non-negative integers.
if ! [[ "${isl}" =~ ^[0-9]+$ && "${osl}" =~ ^[0-9]+$ ]]; then
  printf 'Error: isl and osl must be non-negative integers (got isl=%s, osl=%s)\n' "${isl}" "${osl}" >&2
  exit 2
fi

# NOTE(review): the origin of the 203-token margin is not documented in this
# file -- confirm before changing it.
ctx_max_seq_len=$((isl + 203))
gen_max_seq_len=$((isl + osl + 203))
# Derive the log directory layout and total GPU counts for this run, and
# create the directories that later steps write into.
WORK_DIR="${MOUNT_DIR}"
LOG_DIR="${WORK_DIR}/${kind}-bm-${isl}-${osl}"
SCRIPTS_DIR="${WORK_DIR}/"
# Stored as a single string; it is expanded unquoted where it is used so that
# it splits into "bash" and the script path.
set_clock_cmd="bash ${SCRIPTS_DIR}/set_clock.sh"
mkdir -p -- "${LOG_DIR}"
echo "trying to submit job"

echo "concurrency_list: ${concurrency_list}"

ctx_gpus=$((num_ctx_servers * ctx_tp_size))
gen_gpus=$((num_gen_servers * gen_tp_size))

echo "enable_attention_dp: ${ctx_enable_attention_dp}, ${gen_enable_attention_dp}, gpu_memory_fraction: ${gen_gpu_memory_fraction}"

# PDL is enabled only when attention DP is off on the generation side; the
# run directory is then tagged "tep" instead of "dep".
enable_pdl=false
parallel_tag=dep
if [[ "${gen_enable_attention_dp}" == "false" ]]; then
  enable_pdl=true
  parallel_tag=tep
  echo "enable_pdl: ${enable_pdl}"
fi
sub_dir="${LOG_DIR}/ctx${num_ctx_servers}_gen${num_gen_servers}_${parallel_tag}${gen_tp_size}_batch${gen_batch_size}_eplb${eplb_num_slots}_mtp${mtp_size}"

full_logdir="${sub_dir}"
artifacts_dir="${full_logdir}/genai_perf_artifacts"
mkdir -p -- "${artifacts_dir}"
# Set clock
# set_clock_cmd holds "bash <script>" and must word-split into two arguments.
# Its exit status is not checked, matching the existing best-effort behaviour.
# shellcheck disable=SC2086
srun ${set_clock_cmd}

# Mount the work directory and the model directory at identical paths inside
# the container.
container_mounts="${MOUNT_DIR}:${MOUNT_DIR},${model_path}:${model_path}"

# start the container
# This step creates the named container from the image; every later srun step
# in this script refers to it via --container-name only, so there is no point
# continuing if it cannot be created.
if ! srun -l --container-image="${image}" \
    --container-name="${CONTAINER_NAME}" \
    --container-mounts="${container_mounts}" \
    --mpi=pmix \
    echo "Container up."; then
  echo "Error: failed to start container '${CONTAINER_NAME}' from image '${image}'" >&2
  exit 1
fi
# generate the yaml file
# Arguments for scripts/gen_yaml.py are collected in an array so that optional
# flags can be appended without relying on word-splitting of command
# substitutions, and so that values containing spaces stay intact.
gen_yaml_args=(
  --config "${full_logdir}/config.yaml"
  --model "${model_path}"
  --num_ctx_servers "${num_ctx_servers}"
  --ctx_tp_size "${ctx_tp_size}"
  --ctx_batch_size "${ctx_batch_size}"
  --ctx_max_num_tokens "${ctx_max_num_tokens}"
  --ctx_max_seq_len "${ctx_max_seq_len}"
  --ctx_free_gpu_memory_fraction "${CTX_GPU_FRAC}"
  --cache_transceiver_max_num_tokens "${CACHE_TRANSCEIVER_MAX_NUM_TOKENS}"
  --num_gen_servers "${num_gen_servers}"
  --gen_tp_size "${gen_tp_size}"
  --gen_batch_size "${gen_batch_size}"
  --gen_max_num_tokens "${gen_max_num_tokens}"
  --gen_max_seq_len "${gen_max_seq_len}"
  --gen_gpu_memory_fraction "${gen_gpu_memory_fraction}"
  --eplb_num_slots "${eplb_num_slots}"
)
if [[ "${gen_enable_attention_dp}" == "true" ]]; then
  gen_yaml_args+=(--gen_enable_attention_dp)
fi
if [[ "${ctx_enable_attention_dp}" == "true" ]]; then
  gen_yaml_args+=(--ctx_enable_attention_dp)
fi
# An empty mtp_size is treated as 0 rather than raising a test error.
if [[ "${mtp_size:-0}" -gt 0 ]]; then
  gen_yaml_args+=(--mtp_size "${mtp_size}")
fi

# Run the generator as a single task inside the already-created container.
if ! srun -l --container-name="${CONTAINER_NAME}" \
    --container-mounts="${container_mounts}" \
    --mpi=pmix --overlap \
    -n 1 -N 1 \
    python3 "${SCRIPTS_DIR}/scripts/gen_yaml.py" "${gen_yaml_args[@]}"; then
  echo "Error: gen_yaml.py failed; no configuration was generated in ${full_logdir}" >&2
  exit 1
fi

echo "YAML file generated."
# Resolve the allocation's node list, export the service endpoints read by the
# frontend step, and register cleanup of background steps on exit.

# Empty by default; the commented alternative sets it to the log directory.
nsys_on=""
# nsys_on=${full_logdir}

# One hostname per line from scontrol -> one array element per node.
mapfile -t nodes < <(scontrol show hostnames "${SLURM_JOB_NODELIST}")

HEAD_NODE="${nodes[0]}"
# NOTE(review): HEAD_NODE_IP is the address of the host running this script,
# while HEAD_NODE is the first node of the allocation -- presumably the same
# host; confirm. `hostname -i` can also print several addresses.
if ! HEAD_NODE_IP="$(hostname -i)"; then
  echo "Warning: 'hostname -i' failed; HEAD_NODE_IP is empty" >&2
fi
ETCD_ENDPOINTS="${HEAD_NODE_IP}:2379"
NATS_SERVER="nats://${HEAD_NODE_IP}:4222"
export HEAD_NODE HEAD_NODE_IP ETCD_ENDPOINTS NATS_SERVER

# Create a temporary file to store PIDs
if ! PID_FILE=$(mktemp); then
  echo "Error: mktemp failed; cannot track background processes" >&2
  exit 1
fi
# The handler name is resolved when the trap fires, so cleanup_and_exit may be
# defined after this line.
trap 'cleanup_and_exit' EXIT
#######################################
# Terminate every process recorded in the PID file, then delete the file.
# TERM is sent to all live processes first; they then share one grace period
# (polled, so it ends early once everything has exited) before any survivor
# receives KILL. This keeps shutdown time independent of the process count.
# Globals:   PID_FILE (read) - path of a file holding one PID per line
# Arguments: none
# Outputs:   progress messages on stdout
# Returns:   0
#######################################
cleanup_and_exit() {
  local pid_file="${PID_FILE:-}"
  local grace_seconds=2
  local polls_left
  local pid
  local -a pending=()
  local -a still_alive=()

  if [[ -z "${pid_file}" || ! -f "${pid_file}" ]]; then
    return 0
  fi

  echo "Cleaning up spawned processes..."
  # The "|| -n" clause also picks up a last line without a trailing newline.
  while IFS= read -r pid || [[ -n "${pid}" ]]; do
    if [[ -n "${pid}" ]] && kill -0 "${pid}" 2>/dev/null; then
      echo "Sending TERM to process ${pid}"
      kill -TERM "${pid}" 2>/dev/null
      pending+=("${pid}")
    fi
  done < "${pid_file}"

  # Poll in 0.1 s steps for at most grace_seconds.
  polls_left=$((grace_seconds * 10))
  while (( ${#pending[@]} > 0 && polls_left > 0 )); do
    still_alive=()
    for pid in "${pending[@]}"; do
      if kill -0 "${pid}" 2>/dev/null; then
        still_alive+=("${pid}")
      fi
    done
    pending=("${still_alive[@]}")
    if (( ${#pending[@]} == 0 )); then
      break
    fi
    sleep 0.1
    polls_left=$((polls_left - 1))
  done

  for pid in "${pending[@]}"; do
    if kill -0 "${pid}" 2>/dev/null; then
      echo "Process ${pid} still running, sending KILL"
      kill -KILL "${pid}" 2>/dev/null
    fi
  done

  rm -f -- "${pid_file}"
  return 0
}
# start the server
# Runs scripts/start_frontend.sh as a single background task on the first node
# of the allocation. The endpoint variables exported earlier are passed into
# the container via --container-env; all output goes to output_server.log.
srun -l --container-name="${CONTAINER_NAME}" \
  --container-mounts="${container_mounts}" \
  --mpi=pmix --overlap -N 1 -n 1 \
  --oversubscribe \
  --container-env ETCD_ENDPOINTS,NATS_SERVER,HEAD_NODE_IP,HEAD_NODE \
  -w "${nodes[0]}" \
  bash "${SCRIPTS_DIR}/scripts/start_frontend.sh" &> "${full_logdir}/output_server.log" &
SERVER_PID=$!
# Record the PID so the EXIT trap can terminate the step.
echo "${SERVER_PID}" >> "${PID_FILE}"

# wait for the server to start
# This is a fixed delay, not a readiness check.
sleep 10
# Read the number of prefill workers from instance_config.yaml and launch one
# background srun step per worker, each pinned to its own node.
instance_config="${full_logdir}/instance_config.yaml"
# Take the second field of the first line mentioning prefill_count.
PREFILL_COUNT=$(awk '/prefill_count:/ {print $2; exit}' "${instance_config}")
if [[ -z "${PREFILL_COUNT}" ]]; then
  echo "Error: Failed to extract prefill_count from instance_config.yaml" >&2
  exit 1
fi
echo "Prefill Count: $PREFILL_COUNT"

# All workers share one log file. Truncate it once here and let every worker
# append, so that workers do not wipe each other's output when they open it.
workers_log="${full_logdir}/output_workers.log"
: > "${workers_log}"

# start the prefill workers
prefill_pids=()
for ((i=1; i<=PREFILL_COUNT; i++)); do
  echo "Running Prefill Worker: ${i}"
  # Prefill worker i runs on the i-th node of the allocation (0-based i-1).
  node_idx=$((i-1))
  prefill_node="${nodes[node_idx]}"
  if [[ -z "${prefill_node}" ]]; then
    # Without this guard an empty node name would make -w swallow the next flag.
    echo "Error: no node available for prefill worker ${i} (allocation has ${#nodes[@]} nodes)" >&2
    exit 1
  fi
  echo "Running Prefill Nodes: ${prefill_node}"
  # NOTE(review): --ntasks is fixed at 4 regardless of ctx_tp_size -- confirm
  # this matches the intended tasks per prefill node.
  # nsys_on is deliberately left unquoted: when it is empty the argument
  # disappears instead of being passed as an empty string.
  # NOTE(review): confirm start_worker.sh expects that argument layout.
  # shellcheck disable=SC2086
  srun -l --container-name="${CONTAINER_NAME}" \
    --container-mounts="${container_mounts}" \
    --mpi=pmix --overlap -w "${prefill_node}" \
    --oversubscribe \
    --ntasks 4 \
    --nodes 1 \
    bash "${SCRIPTS_DIR}/scripts/start_worker.sh" "${full_logdir}/prefill_config.yaml" "${enable_pdl}" "${ctx_gpus}" ${nsys_on} "${served_model_name}" "${model_path}" 'prefill' &>> "${workers_log}" &
  prefill_pids+=("$!")
  # Record the PID so the EXIT trap can terminate the step.
  echo "$!" >> "${PID_FILE}"
done
# Read the number of decode workers from instance_config.yaml and launch one
# background srun step per worker. Each worker spans num_gen_nodes consecutive
# nodes, starting right after the nodes used by the prefill workers.
# Take the second field of the first line mentioning decode_count.
DECODE_COUNT=$(awk '/decode_count:/ {print $2; exit}' "${full_logdir}/instance_config.yaml")
if [[ -z "${DECODE_COUNT}" ]]; then
  echo "Error: Failed to extract decode_count from instance_config.yaml" >&2
  exit 1
fi
echo "Decode Count: $DECODE_COUNT"

# Guard the division below and the node count handed to srun.
if [[ "${num_gen_servers:-0}" -le 0 ]]; then
  echo "Error: num_gen_servers must be a positive integer (got '${num_gen_servers}')" >&2
  exit 1
fi
num_gen_nodes=$((gen_nodes/num_gen_servers))
if (( num_gen_nodes < 1 )); then
  echo "Error: gen_nodes (${gen_nodes}) gives fewer than one node per generation server (${num_gen_servers} servers)" >&2
  exit 1
fi

decode_start_idx=$PREFILL_COUNT
for ((i=1; i<=DECODE_COUNT; i++)); do
  echo "Running Decode Worker: ${i}"
  decode_node_list=()
  for ((j=0; j<num_gen_nodes; j++)); do
    node_idx=$((decode_start_idx + (i-1)*num_gen_nodes + j))
    if [[ -z "${nodes[node_idx]}" ]]; then
      echo "Error: no node available at index ${node_idx} for decode worker ${i} (allocation has ${#nodes[@]} nodes)" >&2
      exit 1
    fi
    decode_node_list+=("${nodes[node_idx]}")
  done
  # Join the node names with commas; IFS is changed only inside the subshell.
  decode_nodes_csv=$(IFS=, ; printf '%s' "${decode_node_list[*]}")
  echo "Running Decode Nodes: ${decode_nodes_csv}"
  # Output is appended (&>>) so that workers sharing output_workers.log do not
  # truncate one another's output when they open it.
  # nsys_on is deliberately left unquoted: when it is empty the argument
  # disappears instead of being passed as an empty string.
  # NOTE(review): confirm start_worker.sh expects that argument layout.
  # shellcheck disable=SC2086
  srun -l --container-name="${CONTAINER_NAME}" \
    --container-mounts="${container_mounts}" \
    --mpi=pmix \
    -w "${decode_nodes_csv}" \
    --nodes "${num_gen_nodes}" \
    --ntasks "${gen_tp_size}" \
    --oversubscribe \
    --overlap \
    bash "${SCRIPTS_DIR}/scripts/start_worker.sh" "${full_logdir}/decode_config.yaml" "${enable_pdl}" "${ctx_gpus}" ${nsys_on} "${served_model_name}" "${model_path}" 'decode' &>> "${full_logdir}/output_workers.log" &
  # Record the PID so the EXIT trap can terminate the step.
  echo "$!" >> "${PID_FILE}"
done
# Run the load generator in the foreground on the first node, then wait for
# the background steps and exit with the benchmark's status.
total_gpus=$((ctx_gpus + gen_gpus))

# start the loadgen
# The artifacts directory is added to the mounts for this step; all output of
# bench.sh is captured in bench.log.
srun -l --container-name="${CONTAINER_NAME}" \
  --container-mounts="${container_mounts},${artifacts_dir}:${artifacts_dir}" \
  --mpi=pmix --overlap -N 1 -n 1 \
  -w "${nodes[0]}" \
  bash "${SCRIPTS_DIR}/scripts/bench.sh" "${served_model_name}" "${MULTI_ROUND}" "${num_gen_servers}" "${concurrency_list}" "${STREAMING}" "${full_logdir}" "${total_gpus}" "${artifacts_dir}" "${model_path}" "${isl}" "${osl}" "${kind}" > "${full_logdir}/bench.log" 2>&1
bench_status=$?
if (( bench_status != 0 )); then
  echo "Error: bench.sh exited with status ${bench_status}; see ${full_logdir}/bench.log" >&2
fi

# Wait for all background processes to complete
# NOTE(review): this blocks until the frontend and worker steps exit on their
# own -- presumably bench.sh shuts them down; confirm.
wait

# Cleanup will be handled by the EXIT trap
# Exit with the benchmark's status so a failed run is not reported as success.
exit "${bench_status}"
0 commit comments