diff --git a/components/backends/trtllm/performance_sweeps/benchmark_disagg.slurm b/components/backends/trtllm/performance_sweeps/benchmark_disagg.slurm
index b5650738a7..153d3e31c0 100755
--- a/components/backends/trtllm/performance_sweeps/benchmark_disagg.slurm
+++ b/components/backends/trtllm/performance_sweeps/benchmark_disagg.slurm
@@ -10,29 +10,32 @@ CONTAINER_NAME=disaggr-test
 STREAMING=true
 CTX_GPU_FRAC=0.85
-CACHE_TRANSCEIVER_MAX_NUM_TOKENS=${CACHE_TRANSCEIVER_MAX_NUM_TOKENS:-4608}
 
 num_ctx_servers=$1
 ctx_tp_size=$2
-ctx_batch_size=$3
-ctx_max_num_tokens=$4
-ctx_enable_attention_dp=$5
-num_gen_servers=$6
-gen_tp_size=$7
-gen_batch_size=$8
-gen_max_num_tokens=$9
-gen_enable_attention_dp=${10}
-gen_gpu_memory_fraction=${11}
-eplb_num_slots=${12}
-mtp_size=${13}
-concurrency_list=${14}
-gen_nodes=${15}
-kind=${16}
-model_path=${17}
-served_model_name=${18}
-image=${19}
-isl=${20}
-osl=${21}
+ctx_ep_size=$3
+ctx_enable_attention_dp=$4
+ctx_batch_size=$5
+ctx_max_num_tokens=$6
+num_gen_servers=$7
+gen_tp_size=$8
+gen_ep_size=$9
+gen_batch_size=${10}
+gen_max_num_tokens=${11}
+gen_enable_attention_dp=${12}
+gen_gpu_memory_fraction=${13}
+eplb_num_slots=${14}
+mtp_size=${15}
+concurrency_list=${16}
+gen_nodes=${17}
+kind=${18}
+model_path=${19}
+served_model_name=${20}
+image=${21}
+isl=${22}
+osl=${23}
+
+# Default the KV-cache transfer buffer to one full request (ISL + OSL) plus headroom.
+CACHE_TRANSCEIVER_MAX_NUM_TOKENS=${CACHE_TRANSCEIVER_MAX_NUM_TOKENS:-$((${isl} + ${osl} + 512))}
 
 ctx_max_seq_len=$((${isl} + 203))
 gen_max_seq_len=$((${isl} + ${osl} + 203))
@@ -44,7 +47,7 @@ set_clock_cmd="bash ${SCRIPTS_DIR}/set_clock.sh"
 mkdir -p ${LOG_DIR}
 
 echo "trying to submit job"
-sub_dir=${LOG_DIR}/ctx${num_ctx_servers}_gen${num_gen_servers}_dep${gen_tp_size}_batch${gen_batch_size}_eplb${eplb_num_slots}_mtp${mtp_size}
+sub_dir=${LOG_DIR}/ctx${num_ctx_servers}_gen${num_gen_servers}_dp${gen_tp_size}_ep${gen_ep_size}_batch${gen_batch_size}_eplb${eplb_num_slots}_mtp${mtp_size}
 
 echo "concurrency_list: ${concurrency_list}"
@@ -53,11 +56,8 @@ gen_gpus=$((num_gen_servers * gen_tp_size))
 
 echo "enable_attention_dp: ${ctx_enable_attention_dp}, ${gen_enable_attention_dp}, gpu_memory_fraction: ${gen_gpu_memory_fraction}"
 
-enable_pdl=false
 if [ "${gen_enable_attention_dp}" = "false" ]; then
-    enable_pdl=true
-    echo "enable_pdl: ${enable_pdl}"
-    sub_dir=${LOG_DIR}/ctx${num_ctx_servers}_gen${num_gen_servers}_tep${gen_tp_size}_batch${gen_batch_size}_eplb${eplb_num_slots}_mtp${mtp_size}
+    sub_dir=${LOG_DIR}/ctx${num_ctx_servers}_gen${num_gen_servers}_tp${gen_tp_size}_ep${gen_ep_size}_batch${gen_batch_size}_eplb${eplb_num_slots}_mtp${mtp_size}
 fi
 
 full_logdir=${sub_dir}
@@ -84,6 +84,7 @@ srun -l --container-name=${CONTAINER_NAME} \
         --model ${model_path} \
         --num_ctx_servers ${num_ctx_servers} \
         --ctx_tp_size ${ctx_tp_size} \
+        --ctx_ep_size ${ctx_ep_size} \
        --ctx_batch_size ${ctx_batch_size} \
         --ctx_max_num_tokens ${ctx_max_num_tokens} \
         --ctx_max_seq_len ${ctx_max_seq_len} \
@@ -91,6 +92,7 @@ srun -l --container-name=${CONTAINER_NAME} \
         --cache_transceiver_max_num_tokens ${CACHE_TRANSCEIVER_MAX_NUM_TOKENS} \
         --num_gen_servers ${num_gen_servers} \
         --gen_tp_size ${gen_tp_size} \
+        --gen_ep_size ${gen_ep_size} \
         --gen_batch_size ${gen_batch_size} \
         --gen_max_num_tokens ${gen_max_num_tokens} \
         --gen_max_seq_len ${gen_max_seq_len} \
@@ -177,7 +179,7 @@ for ((i=1; i<=DECODE_COUNT; i++)); do
         --ntasks $gen_tp_size \
         --oversubscribe \
         --overlap \
-        bash ${SCRIPTS_DIR}/scripts/start_disagg_worker.sh ${full_logdir}/decode_config.yaml "${enable_pdl}" ${ctx_gpus} ${nsys_on} ${served_model_name} ${model_path} 'decode' &> ${full_logdir}/output_decode_worker_${i}.log &
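+        # Worker args: <config_yaml> <ctx_gpus> <served_model_name> <model_path> <prefill|decode>.
+        # NOTE: the old enable_pdl and nsys_on flags are gone; PDL is now exported
+        # unconditionally by start_disagg_worker.sh itself.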
+        bash ${SCRIPTS_DIR}/scripts/start_disagg_worker.sh ${full_logdir}/decode_config.yaml ${ctx_gpus} ${served_model_name} ${model_path} 'decode' &> ${full_logdir}/output_decode_worker_${i}.log &
     echo "$!" >> "$PID_FILE"
 done
@@ -200,9 +202,9 @@ for ((i=1; i<=PREFILL_COUNT; i++)); do
         --mpi=pmix --overlap -w ${nodes[node_idx]} \
         --oversubscribe \
         --overlap \
-        --ntasks 4 \
+        --ntasks $(( ctx_tp_size < 4 ? ctx_tp_size : 4 )) \
         --nodes 1 \
-        bash ${SCRIPTS_DIR}/scripts/start_disagg_worker.sh ${full_logdir}/prefill_config.yaml "${enable_pdl}" ${ctx_gpus} ${nsys_on} ${served_model_name} ${model_path} 'prefill' &> ${full_logdir}/output_prefill_worker_${i}.log &
+        bash ${SCRIPTS_DIR}/scripts/start_disagg_worker.sh ${full_logdir}/prefill_config.yaml ${ctx_gpus} ${served_model_name} ${model_path} 'prefill' &> ${full_logdir}/output_prefill_worker_${i}.log &
     prefill_pids+=($!)
     echo "$!" >> "$PID_FILE"
 done
diff --git a/components/backends/trtllm/performance_sweeps/scripts/bench b/components/backends/trtllm/performance_sweeps/scripts/bench
index f3ea022a57..2a35585f0c 160000
--- a/components/backends/trtllm/performance_sweeps/scripts/bench
+++ b/components/backends/trtllm/performance_sweeps/scripts/bench
@@ -1 +1 @@
-Subproject commit f3ea022a5780de5d0babc5fffa53634e2023d28f
+Subproject commit 2a35585f0cb2c98d18934088c867f1ba52d373b4
diff --git a/components/backends/trtllm/performance_sweeps/scripts/gen_yaml.py b/components/backends/trtllm/performance_sweeps/scripts/gen_yaml.py
index 8c256e82dd..f388f4d920 100644
--- a/components/backends/trtllm/performance_sweeps/scripts/gen_yaml.py
+++ b/components/backends/trtllm/performance_sweeps/scripts/gen_yaml.py
@@ -2,12 +2,262 @@
 # SPDX-License-Identifier: Apache-2.0
 
 import argparse
+from enum import Enum
 import os
 import re
 from typing import Any, Dict, List
 
 import yaml
 
+
+class ModelType(Enum):
+    """
+    Model type.
+    """
+
+    GPT_OSS = "gpt_oss"
+    DSR1 = "dsr1"
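+
+
+# NOTE: the model type is inferred from a substring match on the model path —
+# any path containing "r1" (case-insensitive) is treated as DeepSeek-R1;
+# everything else falls back to GPT-OSS.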
+ """ + GPT_OSS = "gpt_oss" + DSR1 = "dsr1" + +def get_model_type(model_path: str) -> str: + if "r1" in model_path.lower(): + print("Inferring DSR1-type model") + return ModelType.DSR1 + else: + print("Inferring GPT-oss-type model") + return ModelType.GPT_OSS + +def generate_dsr1_config( + config_path: str, + decode_config_path: str, + instance_config_path: str, + args: argparse.Namespace +): + gen_cuda_graph_batch_sizes = [ + 1, + 2, + 4, + 8, + 16, + 32, + 64, + 128, + 256, + 384, + 512, + 768, + 1024, + 2048, + args.gen_batch_size, + ] + + gen_moe_backend = "CUTLASS" + if args.gen_tp_size >= 16 and args.gen_enable_attention_dp: + gen_moe_backend = "WIDEEP" + if not args.gen_enable_attention_dp: + gen_moe_backend = "TRTLLM" + + prefill_config: Dict[str, Any] = { + "max_batch_size": args.ctx_batch_size, + "max_num_tokens": args.ctx_max_num_tokens, + "max_seq_len": args.ctx_max_seq_len, + "tensor_parallel_size": args.ctx_tp_size, + "moe_expert_parallel_size": args.ctx_ep_size, + "enable_attention_dp": args.ctx_enable_attention_dp, + "pipeline_parallel_size": 1, + "cuda_graph_config": None, + "print_iter_log": True, + "disable_overlap_scheduler": True, + "kv_cache_config": { + "enable_block_reuse": False, + "free_gpu_memory_fraction": args.ctx_free_gpu_memory_fraction, + "dtype": "fp8", + }, + "cache_transceiver_config": { + "max_tokens_in_buffer": args.cache_transceiver_max_num_tokens, + "backend": "DEFAULT", + }, + } + + decode_config: Dict[str, Any] = { + "tensor_parallel_size": args.gen_tp_size, + "moe_expert_parallel_size": args.gen_tp_size, + "enable_attention_dp": args.gen_enable_attention_dp, + "pipeline_parallel_size": 1, + "max_batch_size": args.gen_batch_size, + "max_num_tokens": args.gen_max_num_tokens, + "max_seq_len": args.gen_max_seq_len, + "cuda_graph_config": { + "enable_padding": True, + "batch_sizes": gen_cuda_graph_batch_sizes, + }, + "print_iter_log": True, + "kv_cache_config": { + "enable_block_reuse": False, + "free_gpu_memory_fraction": args.gen_gpu_memory_fraction, + "dtype": "fp8", + }, + "moe_config": { + "backend": gen_moe_backend, + "use_low_precision_moe_combine": True, + }, + "cache_transceiver_config": { + "max_tokens_in_buffer": args.cache_transceiver_max_num_tokens, + "backend": "DEFAULT", + }, + "stream_interval": 20, + } + + if args.gen_tp_size == 8 and not args.gen_enable_attention_dp: + decode_config["allreduce_strategy"] = "MNNVL" + + if args.eplb_num_slots > 0: + moe_load_balancer_file = os.path.join( + os.path.dirname(config_path), "moe_load_balancer.yaml" + ) + # Ensure the directory exists before writing the file + os.makedirs(os.path.dirname(moe_load_balancer_file), exist_ok=True) + moe_load_balancer_config = { + "num_slots": args.eplb_num_slots, + "layer_updates_per_iter": 1, + } + with open(moe_load_balancer_file, "w") as f: + yaml.dump( + moe_load_balancer_config, f, default_flow_style=False, sort_keys=False + ) + decode_config["moe_config"]["load_balancer"] = moe_load_balancer_file + + if args.mtp_size > 0: + prefill_config["speculative_config"] = { + "decoding_type": "MTP", + "num_nextn_predict_layers": args.mtp_size, + } + decode_config["speculative_config"] = { + "decoding_type": "MTP", + "num_nextn_predict_layers": args.mtp_size, + } + + return prefill_config, decode_config + +def generate_gpt_oss_config( + config_path: str, + decode_config_path: str, + instance_config_path: str, + args: argparse.Namespace +): + gen_cuda_graph_batch_sizes = [ + 1, + 2, + 4, + 8, + 16, + 32, + 64, + 128, + 256, + 384, + 512, + 768, + 1024, + 2048, + 
+        args.gen_batch_size,
+    ]
+
+    gen_moe_backend = "TRTLLM"
+
+    prefill_config: Dict[str, Any] = {
+        "max_batch_size": args.ctx_batch_size,
+        "max_num_tokens": args.ctx_max_num_tokens,
+        "max_seq_len": args.ctx_max_seq_len,
+        "tensor_parallel_size": args.ctx_tp_size,
+        "moe_expert_parallel_size": args.ctx_ep_size,
+        "enable_attention_dp": args.ctx_enable_attention_dp,
+        "pipeline_parallel_size": 1,
+        "print_iter_log": True,
+        "disable_overlap_scheduler": True,
+        "kv_cache_config": {
+            "enable_block_reuse": False,
+            "free_gpu_memory_fraction": args.ctx_free_gpu_memory_fraction,
+            "dtype": "fp8",
+        },
+        "cuda_graph_config": {
+            "enable_padding": True,
+            "max_batch_size": 30,
+        },
+        "num_postprocess_workers": 4,
+        "cache_transceiver_config": {
+            "max_tokens_in_buffer": args.cache_transceiver_max_num_tokens,
+            "backend": "UCX",
+        },
+        "moe_config": {
+            "backend": "TRTLLM",
+        },
+    }
+
+    decode_config: Dict[str, Any] = {
+        "allreduce_strategy": "AUTO",
+        "attention_dp_config": {
+            "enable_balance": True,
+        },
+        "disable_overlap_scheduler": False,
+        "tensor_parallel_size": args.gen_tp_size,
+        "moe_expert_parallel_size": args.gen_ep_size,
+        "enable_attention_dp": args.gen_enable_attention_dp,
+        "pipeline_parallel_size": 1,
+        "max_batch_size": args.gen_batch_size,
+        "max_num_tokens": args.gen_max_num_tokens,
+        "max_seq_len": args.gen_max_seq_len,
+        "cuda_graph_config": {
+            "enable_padding": True,
+            "batch_sizes": gen_cuda_graph_batch_sizes,
+        },
+        "print_iter_log": True,
+        "kv_cache_config": {
+            "enable_block_reuse": False,
+            "free_gpu_memory_fraction": args.gen_gpu_memory_fraction,
+            "dtype": "fp8",
+        },
+        "moe_config": {
+            "backend": gen_moe_backend,
+        },
+        "cache_transceiver_config": {
+            "max_tokens_in_buffer": args.cache_transceiver_max_num_tokens,
+            "backend": "UCX",
+        },
+        "stream_interval": 20,
+        "num_postprocess_workers": 4,
+    }
+
+    if args.eplb_num_slots > 0:
+        moe_load_balancer_file = os.path.join(
+            os.path.dirname(config_path), "moe_load_balancer.yaml"
+        )
+        # Ensure the directory exists before writing the file
+        os.makedirs(os.path.dirname(moe_load_balancer_file), exist_ok=True)
+        moe_load_balancer_config = {
+            "num_slots": args.eplb_num_slots,
+            "layer_updates_per_iter": 1,
+        }
+        with open(moe_load_balancer_file, "w") as f:
+            yaml.dump(
+                moe_load_balancer_config, f, default_flow_style=False, sort_keys=False
+            )
+        decode_config["moe_config"]["load_balancer"] = moe_load_balancer_file
+
+    if args.mtp_size > 0:
+        prefill_config["speculative_config"] = {
+            "decoding_type": "MTP",
+            "num_nextn_predict_layers": args.mtp_size,
+        }
+        decode_config["speculative_config"] = {
+            "decoding_type": "MTP",
+            "num_nextn_predict_layers": args.mtp_size,
+        }
+
+    return prefill_config, decode_config
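+
+
+# Dispatch table: gen_config_file() looks up the per-model config generator by
+# the inferred ModelType.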
+CONFIG_MAPPING = {
+    ModelType.GPT_OSS: generate_gpt_oss_config,
+    ModelType.DSR1: generate_dsr1_config,
+}
+
 
 def process_node_and_task() -> tuple[int, List[str], List[str]]:
     """
@@ -144,6 +394,7 @@ def gen_config_file(
     ctx_enable_attention_dp: bool,
     num_gen_servers: int,
     gen_tp_size: int,
+    gen_ep_size: int,
     gen_batch_size: int,
     gen_max_num_tokens: int,
     gen_max_seq_len: int,
@@ -153,7 +404,7 @@ def gen_config_file(
     mtp_size: int = 0,
     worker_start_port: int = 8001,
     server_port: int = 8000,
-    cache_transceiver_max_num_tokens: int = 4608,
+    cache_transceiver_max_num_tokens: int = 9216,
 ) -> None:
     """
     Generate configuration YAML file for disaggregated inference.
@@ -170,6 +421,7 @@ def gen_config_file(
        ctx_enable_attention_dp: Enable attention DP for context servers
        num_gen_servers: Number of generation servers
        gen_tp_size: Tensor parallel size for generation servers
+       gen_ep_size: Expert parallel size for generation servers
        gen_batch_size: Batch size for generation servers
        gen_max_num_tokens: Max number of tokens for generation servers
        gen_enable_attention_dp: Enable attention DP for generation servers
@@ -178,109 +430,15 @@ def gen_config_file(
        worker_start_port: Start port for workers
        server_port: Server port
     """
-    gen_cuda_graph_batch_sizes = [
-        1,
-        2,
-        4,
-        8,
-        16,
-        32,
-        64,
-        128,
-        256,
-        384,
-        512,
-        768,
-        1024,
-        2048,
-        gen_batch_size,
-    ]
-
-    gen_moe_backend = "CUTLASS"
-    if gen_tp_size >= 16 and gen_enable_attention_dp:
-        gen_moe_backend = "WIDEEP"
-    if not gen_enable_attention_dp:
-        gen_moe_backend = "TRTLLM"
-
-    prefill_config: Dict[str, Any] = {
-        "max_batch_size": ctx_batch_size,
-        "max_num_tokens": ctx_max_num_tokens,
-        "max_seq_len": ctx_max_seq_len,
-        "tensor_parallel_size": ctx_tp_size,
-        "moe_expert_parallel_size": ctx_tp_size,
-        "enable_attention_dp": ctx_enable_attention_dp,
-        "pipeline_parallel_size": 1,
-        "cuda_graph_config": None,
-        "print_iter_log": True,
-        "disable_overlap_scheduler": True,
-        "kv_cache_config": {
-            "enable_block_reuse": False,
-            "free_gpu_memory_fraction": ctx_free_gpu_memory_fraction,
-            "dtype": "fp8",
-        },
-        "cache_transceiver_config": {
-            "max_tokens_in_buffer": cache_transceiver_max_num_tokens,
-            "backend": "DEFAULT",
-        },
-    }
-
-    decode_config: Dict[str, Any] = {
-        "tensor_parallel_size": gen_tp_size,
-        "moe_expert_parallel_size": gen_tp_size,
-        "enable_attention_dp": gen_enable_attention_dp,
-        "pipeline_parallel_size": 1,
-        "max_batch_size": gen_batch_size,
-        "max_num_tokens": gen_max_num_tokens,
-        "max_seq_len": gen_max_seq_len,
-        "cuda_graph_config": {
-            "enable_padding": True,
-            "batch_sizes": gen_cuda_graph_batch_sizes,
-        },
-        "print_iter_log": True,
-        "kv_cache_config": {
-            "enable_block_reuse": False,
-            "free_gpu_memory_fraction": gen_gpu_memory_fraction,
-            "dtype": "fp8",
-        },
-        "moe_config": {
-            "backend": gen_moe_backend,
-            "use_low_precision_moe_combine": True,
-        },
-        "cache_transceiver_config": {
-            "max_tokens_in_buffer": cache_transceiver_max_num_tokens,
-            "backend": "DEFAULT",
-        },
-        "stream_interval": 20,
-    }
+    model_type = get_model_type(model_path)
 
-    if gen_tp_size == 8 and not gen_enable_attention_dp:
-        decode_config["allreduce_strategy"] = "MNNVL"
-
-    if eplb_num_slots > 0:
-        moe_load_balancer_file = os.path.join(
-            os.path.dirname(config_path), "moe_load_balancer.yaml"
-        )
-        # Ensure the directory exists before writing the file
-        os.makedirs(os.path.dirname(moe_load_balancer_file), exist_ok=True)
-        moe_load_balancer_config = {
-            "num_slots": eplb_num_slots,
-            "layer_updates_per_iter": 1,
-        }
-        with open(moe_load_balancer_file, "w") as f:
-            yaml.dump(
-                moe_load_balancer_config, f, default_flow_style=False, sort_keys=False
-            )
-        decode_config["moe_config"]["load_balancer"] = moe_load_balancer_file
-
-    if mtp_size > 0:
-        prefill_config["speculative_config"] = {
-            "decoding_type": "MTP",
-            "num_nextn_predict_layers": mtp_size,
-        }
-        decode_config["speculative_config"] = {
-            "decoding_type": "MTP",
-            "num_nextn_predict_layers": mtp_size,
-        }
+    prefill_config, decode_config = CONFIG_MAPPING[model_type](
+        config_path,
+        decode_config_path,
+        instance_config_path,
+        args,
+    )
 
     counts = {"prefill_count": num_ctx_servers, "decode_count": num_gen_servers}
 
@@ -309,6 +467,12 @@ def gen_config_file(
         required=True,
         help="Tensor parallel size for context servers",
     )
+    parser.add_argument(
+        "--ctx_ep_size",
+        type=int,
+        required=True,
+        help="Expert parallel size for context servers",
+    )
     parser.add_argument(
         "--ctx_batch_size",
         type=int,
@@ -351,6 +515,12 @@ def gen_config_file(
         required=True,
         help="Tensor parallel size for generation servers",
     )
+    parser.add_argument(
+        "--gen_ep_size",
+        type=int,
+        required=True,
+        help="Expert parallel size for generation servers",
+    )
     parser.add_argument(
         "--gen_batch_size",
         type=int,
diff --git a/components/backends/trtllm/performance_sweeps/scripts/start_disagg_worker.sh b/components/backends/trtllm/performance_sweeps/scripts/start_disagg_worker.sh
index 305fd157ec..d99b1b531b 100755
--- a/components/backends/trtllm/performance_sweeps/scripts/start_disagg_worker.sh
+++ b/components/backends/trtllm/performance_sweeps/scripts/start_disagg_worker.sh
@@ -3,13 +3,12 @@
 # SPDX-License-Identifier: Apache-2.0
 
 config_file=$1
-enable_pdl=$2
-ctx_gpus=$3
-model_name=$4
-model_path=$5
-disaggregation_mode=$6
+ctx_gpus=$2
+model_name=$3
+model_path=$4
+disaggregation_mode=$5
 unset UCX_TLS
-echo "config_file: ${config_file}, enable_pdl: ${enable_pdl}, ctx_gpus: ${ctx_gpus}, disaggregation_mode: ${disaggregation_mode}"
+echo "config_file: ${config_file}, ctx_gpus: ${ctx_gpus}, disaggregation_mode: ${disaggregation_mode}"
 
 # Read configuration values from the YAML config file
 if [ ! -f "${config_file}" ]; then
@@ -47,9 +46,12 @@ export TRTLLM_MOE_ENABLE_ALLTOALL_WITHOUT_ALLGATHER=1
 # "moe_backend.use_low_precision_combine: true" in recent trtllm commits, and
 # can be removed. Keeping it here in case the script is ran with older commits.
 export TRTLLM_MOE_USE_LOW_PRECISION_COMBINE=1
+# TODO: Is there ever a case where we don't want this enabled?
+export TRTLLM_ENABLE_PDL=1
 
-if [ "${enable_pdl}" = "true" ]; then
-    export TRTLLM_ENABLE_PDL=1
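+# NOTE: mirrors the substring heuristic in gen_yaml.py — a model path without
+# "r1" in it is assumed to be a gpt-oss style checkpoint.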
+if [[ "${model_path,,}" != *r1* ]]; then
+    echo "Inferred gpt-oss style model. Setting OVERRIDE_QUANT_ALGO to W4A8_MXFP4_MXFP8"
+    export OVERRIDE_QUANT_ALGO=W4A8_MXFP4_MXFP8
 fi
 
 # NOTE: Set (or unset) these depending on what cluster you're using
diff --git a/components/backends/trtllm/performance_sweeps/submit_disagg.sh b/components/backends/trtllm/performance_sweeps/submit_disagg.sh
index 5300cc9c27..0b23bc9925 100755
--- a/components/backends/trtllm/performance_sweeps/submit_disagg.sh
+++ b/components/backends/trtllm/performance_sweeps/submit_disagg.sh
@@ -77,15 +77,19 @@ usage() {
 
 # Run single task
 run_single() {
     local ctx_num=$1
-    local gen_num=$2
-    local gen_tp_size=$3
-    local gen_batch_size=$4
-    local gen_max_num_tokens=$5
-    local gen_enable_attention_dp=$6
-    local gen_gpu_memory_fraction=$7
-    local gen_mtp_size=$8
-    local gen_eplb_num_slots=$9
-    local gen_concurrency_list=${10}
+    local ctx_tp_size=$2
+    local ctx_ep_size=$3
+    local ctx_enable_attention_dp=$4
+    local gen_num=$5
+    local gen_tp_size=$6
+    local gen_ep_size=$7
+    local gen_batch_size=$8
+    local gen_max_num_tokens=$9
+    local gen_enable_attention_dp=${10}
+    local gen_gpu_memory_fraction=${11}
+    local gen_mtp_size=${12}
+    local gen_eplb_num_slots=${13}
+    local gen_concurrency_list=${14}
 
     # TODO: expose kind to the command line
     local kind="dynamo_disagg"
@@ -94,179 +98,10 @@ run_single() {
     total_nodes=$((ctx_num + gen_nodes))
     total_tasks=$((total_nodes * 4))
     set -x
-    if (( ISL == OSL )); then
-        sbatch --nodes=${total_nodes} --ntasks=${total_tasks} --ntasks-per-node=${NTASKS_PER_NODE} --segment=${total_nodes} ${slurm_args} benchmark_disagg.slurm ${ctx_num} 4 4 4608 true ${gen_num} ${gen_tp_size} ${gen_batch_size} ${gen_max_num_tokens} ${gen_enable_attention_dp} ${gen_gpu_memory_fraction} ${gen_eplb_num_slots} ${gen_mtp_size} "${gen_concurrency_list}" ${gen_nodes} ${kind} ${MODEL_PATH} ${SERVED_MODEL_NAME} ${IMAGE} ${ISL} ${OSL}
-    else
-        sbatch --nodes=${total_nodes} --ntasks=${total_tasks} --ntasks-per-node=${NTASKS_PER_NODE} --segment=${total_nodes} ${slurm_args} benchmark_disagg.slurm ${ctx_num} 4 1 8448 true ${gen_num} ${gen_tp_size} ${gen_batch_size} ${gen_max_num_tokens} ${gen_enable_attention_dp} ${gen_gpu_memory_fraction} ${gen_eplb_num_slots} ${gen_mtp_size} "${gen_concurrency_list}" ${gen_nodes} ${kind} ${MODEL_PATH} ${SERVED_MODEL_NAME} ${IMAGE} ${ISL} ${OSL}
-    fi
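+    # NOTE: ctx_batch_size (30) and ctx_max_num_tokens (20000) are hardcoded in
+    # this invocation rather than exposed as tunable parameters.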
+    sbatch --nodes=${total_nodes} --ntasks=${total_tasks} --ntasks-per-node=${NTASKS_PER_NODE} --segment=${total_nodes} ${slurm_args} benchmark_disagg.slurm ${ctx_num} ${ctx_tp_size} ${ctx_ep_size} ${ctx_enable_attention_dp} 30 20000 ${gen_num} ${gen_tp_size} ${gen_ep_size} ${gen_batch_size} ${gen_max_num_tokens} ${gen_enable_attention_dp} ${gen_gpu_memory_fraction} ${gen_eplb_num_slots} ${gen_mtp_size} "${gen_concurrency_list}" ${gen_nodes} ${kind} ${MODEL_PATH} ${SERVED_MODEL_NAME} ${IMAGE} ${ISL} ${OSL}
     set +x
 }
 
-# MTP0 Configuration (gen_mtp_size=0)
-run_4_gpus_mtp0() {
-    echo "Running 4 GPUs MTP0 combinations..."
-    if (( ISL == OSL )); then
-        run_single 1 5 4 128 128 false "0.9" 0 0 "1 2 4 8 16 32 64 128 192"
-        run_single 1 5 4 64 64 true "0.85" 0 0 "256 384"
-        run_single 1 4 4 128 128 true "0.85" 0 0 "512 768"
-        run_single 2 5 4 256 256 true "0.85" 0 0 "1024 1536"
-        run_single 1 2 4 512 512 true "0.85" 0 0 "2048 3072"
-        run_single 2 3 4 768 768 true "0.85" 0 0 "3072 4096"
-    else
-        run_single 1 5 4 16 16 false "0.9" 0 0 "1 2 4 8 16 24"
-        run_single 1 4 4 32 32 false "0.9" 0 0 "32 48"
-        run_single 2 5 4 64 64 false "0.9" 0 0 "64 96"
-        run_single 1 2 4 128 128 false "0.9" 0 0 "128 192"
-        run_single 1 1 4 64 64 true "0.8" 0 0 "256 384"
-        run_single 3 2 4 128 128 true "0.8" 0 0 "512 768"
-    fi
-}
-
-run_8_gpus_mtp0() {
-    echo "Running 8 GPUs MTP0 combinations..."
-    if (( ISL == OSL )); then
-        run_single 1 4 8 128 128 false "0.9" 0 0 "1 2 4 8 16 32 64 128 192 256"
-        run_single 1 4 8 32 32 true "0.8" 0 0 "256 384"
-        run_single 1 3 8 64 64 true "0.8" 0 0 "512 768"
-        run_single 1 2 8 128 128 true "0.8" 0 0 "1024 1536"
-        run_single 1 1 8 256 256 true "0.8" 0 0 "2048 3072"
-        run_single 1 1 8 512 512 true "0.8" 0 0 "4096 6144"
-        run_single 3 2 8 768 768 true "0.8" 0 0 "6144 8192"
-        run_single 3 2 8 1024 1024 true "0.8" 0 0 "8192 12288"
-    else
-        run_single 1 4 8 16 16 false "0.9" 0 0 "1 2 4 8 16 24"
-        run_single 1 3 8 32 32 false "0.9" 0 0 "32 48"
-        run_single 1 2 8 64 64 false "0.9" 0 0 "64 96"
-        run_single 1 1 8 128 128 false "0.9" 0 0 "128 192"
-        run_single 3 2 8 32 32 true "0.8" 0 0 "256 384"
-        run_single 5 2 8 64 64 true "0.8" 0 0 "512 768"
-        run_single 4 1 8 128 128 true "0.8" 0 0 "1024 1536"
-        run_single 5 1 8 256 256 true "0.8" 0 0 "2048 3072"
-    fi
-}
-
-run_16_gpus_mtp0() {
-    echo "Running 16 GPUs MTP0 combinations..."
-    if (( ISL == OSL )); then
-        run_single 1 1 16 64 64 true "0.75" 0 0 "16 32 64 128 256 512 1024 1536"
-        run_single 2 1 16 128 128 true "0.75" 0 256 "2048 3072"
-        run_single 2 1 16 256 256 true "0.75" 0 256 "4096 6144"
-        run_single 3 1 16 512 512 true "0.75" 0 256 "8192 12288"
-        run_single 3 1 16 768 768 true "0.75" 0 256 "12288 16384"
-        run_single 3 1 16 1024 1024 true "0.75" 0 288 "16384 20480"
-    else
-        run_single 1 1 16 8 8 true "0.8" 0 0 "16 32 64 128 192" # 5
-        run_single 2 1 16 16 16 true "0.8" 0 0 "256 384" # 6
-        run_single 3 1 16 32 32 true "0.8" 0 0 "512 768" # 7
-        run_single 6 1 16 64 64 true "0.8" 0 0 "1024 1536" # 10
-        run_single 8 1 16 128 128 true "0.8" 0 256 "2048 3072" # 12
-        run_single 10 1 16 256 256 true "0.8" 0 256 "4096 6144" # 14
-    fi
-}
-
-run_32_gpus_mtp0() {
-    echo "Running 32 GPUs MTP0 combinations..."
-    if (( ISL == OSL )); then
-        run_single 1 1 32 32 32 true "0.7" 0 0 "32 64 128 256 512 1024 1536"
-        run_single 2 1 32 64 64 true "0.7" 0 256 "2048 3072"
-        run_single 3 1 32 128 128 true "0.7" 0 288 "4096 6144"
-        run_single 4 1 32 256 256 true "0.7" 0 288 "8192 12288"
-        run_single 5 1 32 512 512 true "0.7" 0 288 "16384 20480"
-    else
-        run_single 1 1 32 4 4 true "0.7" 0 0 "32 64 128 192" # 9
-        run_single 2 1 32 8 8 true "0.7" 0 0 "256 384" # 10
-        run_single 4 1 32 16 16 true "0.7" 0 0 "512 768" # 12
-        run_single 7 1 32 32 32 true "0.7" 0 0 "1024 1536" # 15
-    fi
-}
-
-# MTP Configuration (gen_mtp_size=1,2,3)
-run_4_gpus_mtp() {
-    echo "Running 4 GPUs MTP combinations..."
-    if (( ISL == OSL )); then
-        run_single 1 5 4 32 128 false "0.9" 3 0 "1 2 4 8 16 32 48"
-        run_single 1 5 4 32 128 true "0.9" 3 0 "64 128 192"
-        run_single 1 4 4 64 256 true "0.9" 3 0 "256 384"
-        run_single 1 3 4 128 512 true "0.9" 3 0 "512 768"
-        run_single 1 2 4 256 768 true "0.9" 2 0 "1024 1536"
-        run_single 2 3 4 512 1024 true "0.9" 1 0 "2048 3072"
-        run_single 1 1 4 768 1536 true "0.9" 1 0 "3072 4096"
-    else
-        run_single 1 5 4 8 32 false "0.9" 3 0 "1 2 4 8 12"
-        run_single 1 4 4 16 64 false "0.9" 3 0 "16 24"
-        run_single 1 3 4 32 128 false "0.9" 3 0 "32 48"
-        run_single 2 3 4 16 64 true "0.8" 3 0 "64 96"
-        run_single 1 1 4 32 128 true "0.8" 3 0 "128 192"
-        run_single 2 1 4 64 256 true "0.8" 2 0 "256 384"
-        run_single 5 2 4 128 512 true "0.8" 1 0 "512 768"
-    fi
-}
-
-run_8_gpus_mtp() {
-    echo "Running 8 GPUs MTP combinations..."
-    if (( ISL == OSL )); then
-        run_single 1 4 8 32 128 false "0.9" 3 0 "1 2 4 8 16 32 48"
-        run_single 1 4 8 16 64 true "0.8" 3 0 "64 128 192"
-        run_single 1 3 8 32 128 true "0.8" 3 0 "256 384"
-        run_single 1 2 8 64 256 true "0.8" 3 0 "512 768"
-        run_single 1 1 8 128 512 true "0.8" 3 0 "1024 1536"
-        run_single 1 1 8 256 512 true "0.8" 1 0 "2048 3072"
-        run_single 3 2 8 512 1024 true "0.8" 1 0 "4096 6144"
-        run_single 3 2 8 768 1536 true "0.8" 1 0 "6144 8192"
-        run_single 3 2 8 1024 2048 true "0.8" 1 0 "8192 12288"
-    else
-        run_single 1 4 8 8 32 false "0.9" 3 0 "1 2 4 8 12"
-        run_single 1 3 8 16 64 false "0.9" 3 0 "16 24"
-        run_single 1 2 8 32 128 false "0.9" 3 0 "32 48"
-        run_single 1 1 8 8 32 true "0.8" 3 0 "64 96"
-        run_single 3 2 8 16 64 true "0.8" 3 0 "128 192"
-        run_single 5 2 8 32 128 true "0.8" 3 0 "256 384"
-        run_single 7 2 8 64 256 true "0.8" 2 0 "512 768"
-        run_single 5 1 8 128 256 true "0.8" 1 0 "1024 1536"
-        run_single 6 1 8 256 512 true "0.8" 1 0 "2048 3072"
-    fi
-}
-
-run_16_gpus_mtp() {
-    echo "Running 16 GPUs MTP combinations..."
-    if (( ISL == OSL )); then
-        run_single 1 1 16 32 128 true "0.7" 3 0 "16 32 64 128 256 512 768"
-        run_single 1 1 16 64 256 true "0.7" 3 256 "1024 1536"
-        run_single 2 1 16 128 256 true "0.7" 1 288 "2048 3072"
-        run_single 2 1 16 256 512 true "0.7" 1 288 "4096 6144"
-        run_single 3 1 16 512 1024 true "0.7" 1 288 "8192 12288"
-        run_single 3 1 16 768 1536 true "0.7" 1 288 "12288 16384"
-        run_single 3 1 16 1024 1024 true "0.75" 0 288 "16384 20480"
-    else
-        run_single 1 1 16 4 16 true "0.8" 3 0 "16 32 64 96" # 5
-        run_single 2 1 16 8 32 true "0.8" 3 0 "128 192" # 6
-        run_single 4 1 16 16 64 true "0.8" 3 0 "256 384" # 8
-        run_single 6 1 16 32 128 true "0.8" 3 0 "512 768" # 10
-        run_single 8 1 16 64 256 true "0.8" 2 256 "1024 1536" # 13
-        run_single 10 1 16 128 256 true "0.8" 1 256 "2048 3072" # 15
-        run_single 12 1 16 256 512 true "0.8" 1 256 "4096 6144" # 16
-    fi
-
-}
-
-run_32_gpus_mtp() {
-    echo "Running 32 GPUs MTP combinations..."
-    if (( ISL == OSL )); then
-        run_single 1 1 32 16 64 true "0.6" 3 0 "32 64 128 256 512 768"
-        run_single 2 1 32 32 128 true "0.6" 3 288 "1024 1536"
-        run_single 3 1 32 64 256 true "0.6" 3 288 "2048 3072"
-        run_single 3 1 32 128 256 true "0.6" 1 288 "4096 6144"
-        run_single 4 1 32 256 512 true "0.6" 1 288 "8192 12288"
-        run_single 5 1 32 512 1024 true "0.6" 1 288 "16384 20480"
-    else
-        run_single 1 1 32 1 4 true "0.7" 3 0 "32 48" # 9
-        run_single 2 1 32 2 8 true "0.7" 3 0 "64 96" # 10
-        run_single 3 1 32 4 16 true "0.7" 3 0 "128 192" # 11
-        run_single 5 1 32 8 32 true "0.7" 3 0 "256 384" # 13
-        run_single 8 1 32 16 64 true "0.7" 3 256 "512 768" # 16
-    fi
-}
-
 # Main function
 main() {
     local mtp_mode=$1
@@ -279,139 +114,75 @@ main() {
     fi
 
     case $mode in
-        "all")
-            echo "Running all GPU configurations for $mtp_mode mode..."
-            if [[ "$mtp_mode" == "mtp=off" ]]; then
-                run_4_gpus_mtp0
-                run_8_gpus_mtp0
-                run_16_gpus_mtp0
-                run_32_gpus_mtp0
-            else
-                run_4_gpus_mtp
-                run_8_gpus_mtp
-                run_16_gpus_mtp
-                run_32_gpus_mtp
-            fi
-            ;;
-        "pareto")
-            # 1k/1k
-            export ISL=1024
-            export OSL=1024
-            export CACHE_TRANSCEIVER_MAX_NUM_TOKENS=4608
-
-            if [[ "$mtp_mode" == "mtp=off" ]]; then
-                # 1k/1k mtp=off
-                run_single 1 4 8 128 128 false "0.9" 0 0 "1 2 4 8 16 32 64 141"
-                run_single 1 1 32 32 32 true "0.7" 0 0 "1075"
-                run_single 1 1 16 64 64 true "0.75" 0 0 "1075"
-                run_single 2 1 16 256 256 true "0.75" 0 0 "2048 4300"
-                run_single 1 1 8 512 512 true "0.8" 0 0 "4300"
-
-            else
-                # 1k/1k mtp=on
-                run_single 1 4 8 32 128 false "0.9" 3 0 "1 2 4 8 16 36"
-                run_single 1 1 16 64 256 true "0.7" 3 0 "512 1075"
-                run_single 2 1 16 128 256 true "0.7" 1 0 "2150"
-                run_single 1 1 32 16 64 true "0.6" 3 0 "512"
-                run_single 1 1 8 256 512 true "0.8" 1 0 "2252"
-            fi
-
-            # 8k/1k
-            export ISL=8192
-            export OSL=1024
-            export CACHE_TRANSCEIVER_MAX_NUM_TOKENS=8448
-
-            if [[ "$mtp_mode" == "mtp=off" ]]; then
-                # 8k/1k mtp=off
-                run_single 1 3 8 32 32 false "0.9" 0 0 "1 2 4 8 16 34"
-                run_single 4 1 32 16 16 true "0.7" 0 0 "256 538"
-                run_single 7 1 32 32 32 true "0.7" 0 0 "1075" # remove if need 5 cofigs
-                run_single 6 1 16 64 64 true "0.75" 0 0 "1075"
-                run_single 8 1 16 128 128 true "0.75" 0 0 "2150"
-                run_single 5 1 8 256 256 true "0.8" 0 0 "2150"
-            else
-                # 8k/1k mtp=on
-                run_single 1 3 8 16 64 false "0.9" 3 0 "1 2 4 8 18"
-                run_single 5 1 32 8 32 true "0.7" 3 0 "128 269"
-                run_single 8 1 32 16 64 true "0.7" 3 0 "538"
-                run_single 6 1 16 32 128 true "0.75" 3 0 "538" # remove if need 5 configs
-                run_single 8 1 16 64 256 true "0.75" 2 0 "1075"
-                run_single 5 1 8 128 256 true "0.8" 1 0 "1075" # remove if need 5 configs
-                run_single 6 1 8 256 512 true "0.8" 1 0 "2150"
-            fi
-            ;;
-        "4GPU")
-            echo "Running 4 GPUs combinations for $mtp_mode mode..."
-            if [[ "$mtp_mode" == "mtp=off" ]]; then
-                run_4_gpus_mtp0
-            else
-                run_4_gpus_mtp
-            fi
-            ;;
-        "8GPU")
-            echo "Running 8 GPUs combinations for $mtp_mode mode..."
-            if [[ "$mtp_mode" == "mtp=off" ]]; then
-                run_8_gpus_mtp0
-            else
-                run_8_gpus_mtp
-            fi
-            ;;
-        "16GPU")
-            echo "Running 16 GPUs combinations for $mtp_mode mode..."
-            if [[ "$mtp_mode" == "mtp=off" ]]; then
-                run_16_gpus_mtp0
-            else
-                run_16_gpus_mtp
-            fi
-            ;;
-        "32GPU")
-            echo "Running 32 GPUs combinations for $mtp_mode mode..."
-            if [[ "$mtp_mode" == "mtp=off" ]]; then
-                run_32_gpus_mtp0
-            else
-                run_32_gpus_mtp
-            fi
-            ;;
         "tep")
-            if [ $# -ne 11 ]; then
-                echo "Error: TEP mode requires 11 additional parameters (including mtp_mode)"
+            if [ $# -ne 14 ]; then
+                echo "Error: TEP mode requires 14 arguments (including mtp_mode and mode)"
                 usage
             fi
 
             local ctx_num=$3
-            local gen_num=$4
-            local gen_tp_size=$5
-            local gen_batch_size=$6
-            local gen_max_num_tokens=$7
-            local gen_gpu_memory_fraction=$8
-            local gen_mtp_size=$9
-            local gen_eplb_num_slots=${10}
-            local gen_concurrency_list=${11}
-
-            echo "Running TEP mode ($mtp_mode) with ctx_num=$ctx_num, gen_num=$gen_num, gen_tp_size=$gen_tp_size, gen_batch_size=$gen_batch_size, gen_max_num_tokens=$gen_max_num_tokens, gen_gpu_memory_fraction=$gen_gpu_memory_fraction, gen_mtp_size=$gen_mtp_size, gen_eplb_num_slots=$gen_eplb_num_slots, gen_concurrency_list=\"$gen_concurrency_list\""
+            local ctx_tp_size=$4
+            local ctx_ep_size=$5
+            local ctx_enable_attention_dp=$6
+            local gen_num=$7
+            local gen_tp_size=$8
+            local gen_batch_size=$9
+            local gen_max_num_tokens=${10}
+            local gen_gpu_memory_fraction=${11}
+            local gen_mtp_size=${12}
+            local gen_eplb_num_slots=${13}
+            local gen_concurrency_list=${14}
+
+            echo "Running TEP mode ($mtp_mode) with ctx_num=$ctx_num, gen_num=$gen_num, gen_tp_size=$gen_tp_size, gen_ep_size=$gen_tp_size, gen_batch_size=$gen_batch_size, gen_max_num_tokens=$gen_max_num_tokens, gen_gpu_memory_fraction=$gen_gpu_memory_fraction, gen_mtp_size=$gen_mtp_size, gen_eplb_num_slots=$gen_eplb_num_slots, gen_concurrency_list=\"$gen_concurrency_list\""
 
             # TEP mode: Use false to disable attention dp
-            run_single $ctx_num $gen_num $gen_tp_size $gen_batch_size $gen_max_num_tokens false $gen_gpu_memory_fraction $gen_mtp_size $gen_eplb_num_slots "$gen_concurrency_list"
+            run_single $ctx_num $ctx_tp_size $ctx_ep_size $ctx_enable_attention_dp $gen_num $gen_tp_size $gen_tp_size $gen_batch_size $gen_max_num_tokens false $gen_gpu_memory_fraction $gen_mtp_size $gen_eplb_num_slots "$gen_concurrency_list"
             ;;
         "dep")
-            if [ $# -ne 11 ]; then
-                echo "Error: DEP mode requires 11 additional parameters (including mtp_mode)"
+            if [ $# -ne 14 ]; then
+                echo "Error: DEP mode requires 14 arguments (including mtp_mode and mode)"
                 usage
             fi
 
             local ctx_num=$3
-            local gen_num=$4
-            local gen_tp_size=$5
-            local gen_batch_size=$6
-            local gen_max_num_tokens=$7
-            local gen_gpu_memory_fraction=$8
-            local gen_mtp_size=$9
-            local gen_eplb_num_slots=${10}
-            local gen_concurrency_list=${11}
-
-            echo "Running DEP mode ($mtp_mode) with ctx_num=$ctx_num, gen_num=$gen_num, gen_tp_size=$gen_tp_size, gen_batch_size=$gen_batch_size, gen_max_num_tokens=$gen_max_num_tokens, gen_gpu_memory_fraction=$gen_gpu_memory_fraction, gen_mtp_size=$gen_mtp_size, gen_eplb_num_slots=$gen_eplb_num_slots, gen_concurrency_list=\"$gen_concurrency_list\""
+            local ctx_tp_size=$4
+            local ctx_ep_size=$5
+            local ctx_enable_attention_dp=$6
+            local gen_num=$7
+            local gen_tp_size=$8
+            local gen_batch_size=$9
+            local gen_max_num_tokens=${10}
+            local gen_gpu_memory_fraction=${11}
+            local gen_mtp_size=${12}
+            local gen_eplb_num_slots=${13}
+            local gen_concurrency_list=${14}
+
+            echo "Running DEP mode ($mtp_mode) with ctx_num=$ctx_num, ctx_tp_size=$ctx_tp_size, ctx_enable_attention_dp=$ctx_enable_attention_dp, gen_num=$gen_num, gen_tp_size=$gen_tp_size, gen_ep_size=$gen_tp_size, gen_batch_size=$gen_batch_size, gen_max_num_tokens=$gen_max_num_tokens, gen_gpu_memory_fraction=$gen_gpu_memory_fraction, gen_mtp_size=$gen_mtp_size, gen_eplb_num_slots=$gen_eplb_num_slots, gen_concurrency_list=\"$gen_concurrency_list\""
 
+            # DEP mode: attention dp on and gen_ep_size equal to gen_tp_size
+            run_single $ctx_num $ctx_tp_size $ctx_ep_size $ctx_enable_attention_dp $gen_num $gen_tp_size $gen_tp_size $gen_batch_size $gen_max_num_tokens true $gen_gpu_memory_fraction $gen_mtp_size $gen_eplb_num_slots "$gen_concurrency_list"
+            ;;
+        "tp")
+            if [ $# -ne 14 ]; then
+                echo "Error: TP mode requires 14 arguments (including mtp_mode and mode)"
+                usage
+            fi
 
-            run_single $ctx_num $gen_num $gen_tp_size $gen_batch_size $gen_max_num_tokens true $gen_gpu_memory_fraction $gen_mtp_size $gen_eplb_num_slots "$gen_concurrency_list"
+            local ctx_num=$3
+            local ctx_tp_size=$4
+            local ctx_ep_size=$5
+            local ctx_enable_attention_dp=$6
+            local gen_num=$7
+            local gen_tp_size=$8
+            local gen_batch_size=$9
+            local gen_max_num_tokens=${10}
+            local gen_gpu_memory_fraction=${11}
+            local gen_mtp_size=${12}
+            local gen_eplb_num_slots=${13}
+            local gen_concurrency_list=${14}
+
+            echo "Running TP mode ($mtp_mode) with ctx_num=$ctx_num, gen_num=$gen_num, gen_tp_size=$gen_tp_size, gen_ep_size=1, gen_batch_size=$gen_batch_size, gen_max_num_tokens=$gen_max_num_tokens, gen_gpu_memory_fraction=$gen_gpu_memory_fraction, gen_mtp_size=$gen_mtp_size, gen_eplb_num_slots=$gen_eplb_num_slots, gen_concurrency_list=\"$gen_concurrency_list\""
+
+            # TP mode: expert parallelism disabled (gen_ep_size=1) and attention dp off
+            run_single $ctx_num $ctx_tp_size $ctx_ep_size $ctx_enable_attention_dp $gen_num $gen_tp_size 1 $gen_batch_size $gen_max_num_tokens false $gen_gpu_memory_fraction $gen_mtp_size $gen_eplb_num_slots "$gen_concurrency_list"
+            ;;
         *)
             echo "Error: Unknown mode '$mode'"