diff --git a/components/backends/trtllm/performance_sweeps/benchmark_disagg.slurm b/components/backends/trtllm/performance_sweeps/benchmark_disagg.slurm
index b5650738a7..153d3e31c0 100755
--- a/components/backends/trtllm/performance_sweeps/benchmark_disagg.slurm
+++ b/components/backends/trtllm/performance_sweeps/benchmark_disagg.slurm
@@ -10,29 +10,32 @@ CONTAINER_NAME=disaggr-test
 STREAMING=true
 CTX_GPU_FRAC=0.85
-CACHE_TRANSCEIVER_MAX_NUM_TOKENS=${CACHE_TRANSCEIVER_MAX_NUM_TOKENS:-4608}
 
 num_ctx_servers=$1
 ctx_tp_size=$2
-ctx_batch_size=$3
-ctx_max_num_tokens=$4
-ctx_enable_attention_dp=$5
-num_gen_servers=$6
-gen_tp_size=$7
-gen_batch_size=$8
-gen_max_num_tokens=$9
-gen_enable_attention_dp=${10}
-gen_gpu_memory_fraction=${11}
-eplb_num_slots=${12}
-mtp_size=${13}
-concurrency_list=${14}
-gen_nodes=${15}
-kind=${16}
-model_path=${17}
-served_model_name=${18}
-image=${19}
-isl=${20}
-osl=${21}
+ctx_ep_size=$3
+ctx_enable_attention_dp=$4
+ctx_batch_size=$5
+ctx_max_num_tokens=$6
+num_gen_servers=$7
+gen_tp_size=$8
+gen_ep_size=$9
+gen_batch_size=${10}
+gen_max_num_tokens=${11}
+gen_enable_attention_dp=${12}
+gen_gpu_memory_fraction=${13}
+eplb_num_slots=${14}
+mtp_size=${15}
+concurrency_list=${16}
+gen_nodes=${17}
+kind=${18}
+model_path=${19}
+served_model_name=${20}
+image=${21}
+isl=${22}
+osl=${23}
+
+# Default the KV-cache transfer buffer to one full request (ISL + OSL) plus headroom.
+CACHE_TRANSCEIVER_MAX_NUM_TOKENS=${CACHE_TRANSCEIVER_MAX_NUM_TOKENS:-$((${isl} + ${osl} + 512))}
 
 ctx_max_seq_len=$((${isl} + 203))
 gen_max_seq_len=$((${isl} + ${osl} + 203))
@@ -44,7 +47,7 @@ set_clock_cmd="bash ${SCRIPTS_DIR}/set_clock.sh"
 mkdir -p ${LOG_DIR}
 
 echo "trying to submit job"
-sub_dir=${LOG_DIR}/ctx${num_ctx_servers}_gen${num_gen_servers}_dep${gen_tp_size}_batch${gen_batch_size}_eplb${eplb_num_slots}_mtp${mtp_size}
+sub_dir=${LOG_DIR}/ctx${num_ctx_servers}_gen${num_gen_servers}_dp${gen_tp_size}_ep${gen_ep_size}_batch${gen_batch_size}_eplb${eplb_num_slots}_mtp${mtp_size}
 
 echo "concurrency_list: ${concurrency_list}"
@@ -53,11 +56,8 @@ gen_gpus=$((num_gen_servers * gen_tp_size))
 
 echo "enable_attention_dp: ${ctx_enable_attention_dp}, ${gen_enable_attention_dp}, gpu_memory_fraction: ${gen_gpu_memory_fraction}"
 
-enable_pdl=false
 if [ "${gen_enable_attention_dp}" = "false" ]; then
-    enable_pdl=true
-    echo "enable_pdl: ${enable_pdl}"
-    sub_dir=${LOG_DIR}/ctx${num_ctx_servers}_gen${num_gen_servers}_tep${gen_tp_size}_batch${gen_batch_size}_eplb${eplb_num_slots}_mtp${mtp_size}
+    sub_dir=${LOG_DIR}/ctx${num_ctx_servers}_gen${num_gen_servers}_tp${gen_tp_size}_ep${gen_ep_size}_batch${gen_batch_size}_eplb${eplb_num_slots}_mtp${mtp_size}
 fi
 
 full_logdir=${sub_dir}
@@ -84,6 +84,7 @@ srun -l --container-name=${CONTAINER_NAME} \
         --model ${model_path} \
         --num_ctx_servers ${num_ctx_servers} \
         --ctx_tp_size ${ctx_tp_size} \
+        --ctx_ep_size ${ctx_ep_size} \
        --ctx_batch_size ${ctx_batch_size} \
         --ctx_max_num_tokens ${ctx_max_num_tokens} \
         --ctx_max_seq_len ${ctx_max_seq_len} \
@@ -91,6 +92,7 @@ srun -l --container-name=${CONTAINER_NAME} \
         --cache_transceiver_max_num_tokens ${CACHE_TRANSCEIVER_MAX_NUM_TOKENS} \
         --num_gen_servers ${num_gen_servers} \
         --gen_tp_size ${gen_tp_size} \
+        --gen_ep_size ${gen_ep_size} \
         --gen_batch_size ${gen_batch_size} \
         --gen_max_num_tokens ${gen_max_num_tokens} \
         --gen_max_seq_len ${gen_max_seq_len} \
@@ -177,7 +179,7 @@ for ((i=1; i<=DECODE_COUNT; i++)); do
         --ntasks $gen_tp_size \
         --oversubscribe \
         --overlap \
-        bash ${SCRIPTS_DIR}/scripts/start_disagg_worker.sh ${full_logdir}/decode_config.yaml "${enable_pdl}" ${ctx_gpus} ${nsys_on} ${served_model_name} ${model_path} 'decode' &> ${full_logdir}/output_decode_worker_${i}.log &
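+        # Worker args: <config_yaml> <ctx_gpus> <served_model_name> <model_path> <prefill|decode>.
+        # NOTE: the old enable_pdl and nsys_on flags are gone; PDL is now exported
+        # unconditionally by start_disagg_worker.sh itself.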
+        bash ${SCRIPTS_DIR}/scripts/start_disagg_worker.sh ${full_logdir}/decode_config.yaml ${ctx_gpus} ${served_model_name} ${model_path} 'decode' &> ${full_logdir}/output_decode_worker_${i}.log &
     echo "$!" >> "$PID_FILE"
 done
@@ -200,9 +202,9 @@ for ((i=1; i<=PREFILL_COUNT; i++)); do
         --mpi=pmix --overlap -w ${nodes[node_idx]} \
         --oversubscribe \
         --overlap \
-        --ntasks 4 \
+        --ntasks $(( ctx_tp_size < 4 ? ctx_tp_size : 4 )) \
         --nodes 1 \
-        bash ${SCRIPTS_DIR}/scripts/start_disagg_worker.sh ${full_logdir}/prefill_config.yaml "${enable_pdl}" ${ctx_gpus} ${nsys_on} ${served_model_name} ${model_path} 'prefill' &> ${full_logdir}/output_prefill_worker_${i}.log &
+        bash ${SCRIPTS_DIR}/scripts/start_disagg_worker.sh ${full_logdir}/prefill_config.yaml ${ctx_gpus} ${served_model_name} ${model_path} 'prefill' &> ${full_logdir}/output_prefill_worker_${i}.log &
     prefill_pids+=($!)
     echo "$!" >> "$PID_FILE"
 done
diff --git a/components/backends/trtllm/performance_sweeps/scripts/bench b/components/backends/trtllm/performance_sweeps/scripts/bench
index f3ea022a57..2a35585f0c 160000
--- a/components/backends/trtllm/performance_sweeps/scripts/bench
+++ b/components/backends/trtllm/performance_sweeps/scripts/bench
@@ -1 +1 @@
-Subproject commit f3ea022a5780de5d0babc5fffa53634e2023d28f
+Subproject commit 2a35585f0cb2c98d18934088c867f1ba52d373b4
diff --git a/components/backends/trtllm/performance_sweeps/scripts/gen_yaml.py b/components/backends/trtllm/performance_sweeps/scripts/gen_yaml.py
index 8c256e82dd..f388f4d920 100644
--- a/components/backends/trtllm/performance_sweeps/scripts/gen_yaml.py
+++ b/components/backends/trtllm/performance_sweeps/scripts/gen_yaml.py
@@ -2,12 +2,262 @@
 # SPDX-License-Identifier: Apache-2.0
 
 import argparse
+from enum import Enum
 import os
 import re
 from typing import Any, Dict, List
 
 import yaml
 
+
+class ModelType(Enum):
+    """
+    Model type.
+    """
+
+    GPT_OSS = "gpt_oss"
+    DSR1 = "dsr1"
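+
+
+# NOTE: the model type is inferred from a substring match on the model path —
+# any path containing "r1" (case-insensitive) is treated as DeepSeek-R1;
+# everything else falls back to GPT-OSS.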
+ """ + GPT_OSS = "gpt_oss" + DSR1 = "dsr1" + +def get_model_type(model_path: str) -> str: + if "r1" in model_path.lower(): + print("Inferring DSR1-type model") + return ModelType.DSR1 + else: + print("Inferring GPT-oss-type model") + return ModelType.GPT_OSS + +def generate_dsr1_config( + config_path: str, + decode_config_path: str, + instance_config_path: str, + args: argparse.Namespace +): + gen_cuda_graph_batch_sizes = [ + 1, + 2, + 4, + 8, + 16, + 32, + 64, + 128, + 256, + 384, + 512, + 768, + 1024, + 2048, + args.gen_batch_size, + ] + + gen_moe_backend = "CUTLASS" + if args.gen_tp_size >= 16 and args.gen_enable_attention_dp: + gen_moe_backend = "WIDEEP" + if not args.gen_enable_attention_dp: + gen_moe_backend = "TRTLLM" + + prefill_config: Dict[str, Any] = { + "max_batch_size": args.ctx_batch_size, + "max_num_tokens": args.ctx_max_num_tokens, + "max_seq_len": args.ctx_max_seq_len, + "tensor_parallel_size": args.ctx_tp_size, + "moe_expert_parallel_size": args.ctx_ep_size, + "enable_attention_dp": args.ctx_enable_attention_dp, + "pipeline_parallel_size": 1, + "cuda_graph_config": None, + "print_iter_log": True, + "disable_overlap_scheduler": True, + "kv_cache_config": { + "enable_block_reuse": False, + "free_gpu_memory_fraction": args.ctx_free_gpu_memory_fraction, + "dtype": "fp8", + }, + "cache_transceiver_config": { + "max_tokens_in_buffer": args.cache_transceiver_max_num_tokens, + "backend": "DEFAULT", + }, + } + + decode_config: Dict[str, Any] = { + "tensor_parallel_size": args.gen_tp_size, + "moe_expert_parallel_size": args.gen_tp_size, + "enable_attention_dp": args.gen_enable_attention_dp, + "pipeline_parallel_size": 1, + "max_batch_size": args.gen_batch_size, + "max_num_tokens": args.gen_max_num_tokens, + "max_seq_len": args.gen_max_seq_len, + "cuda_graph_config": { + "enable_padding": True, + "batch_sizes": gen_cuda_graph_batch_sizes, + }, + "print_iter_log": True, + "kv_cache_config": { + "enable_block_reuse": False, + "free_gpu_memory_fraction": args.gen_gpu_memory_fraction, + "dtype": "fp8", + }, + "moe_config": { + "backend": gen_moe_backend, + "use_low_precision_moe_combine": True, + }, + "cache_transceiver_config": { + "max_tokens_in_buffer": args.cache_transceiver_max_num_tokens, + "backend": "DEFAULT", + }, + "stream_interval": 20, + } + + if args.gen_tp_size == 8 and not args.gen_enable_attention_dp: + decode_config["allreduce_strategy"] = "MNNVL" + + if args.eplb_num_slots > 0: + moe_load_balancer_file = os.path.join( + os.path.dirname(config_path), "moe_load_balancer.yaml" + ) + # Ensure the directory exists before writing the file + os.makedirs(os.path.dirname(moe_load_balancer_file), exist_ok=True) + moe_load_balancer_config = { + "num_slots": args.eplb_num_slots, + "layer_updates_per_iter": 1, + } + with open(moe_load_balancer_file, "w") as f: + yaml.dump( + moe_load_balancer_config, f, default_flow_style=False, sort_keys=False + ) + decode_config["moe_config"]["load_balancer"] = moe_load_balancer_file + + if args.mtp_size > 0: + prefill_config["speculative_config"] = { + "decoding_type": "MTP", + "num_nextn_predict_layers": args.mtp_size, + } + decode_config["speculative_config"] = { + "decoding_type": "MTP", + "num_nextn_predict_layers": args.mtp_size, + } + + return prefill_config, decode_config + +def generate_gpt_oss_config( + config_path: str, + decode_config_path: str, + instance_config_path: str, + args: argparse.Namespace +): + gen_cuda_graph_batch_sizes = [ + 1, + 2, + 4, + 8, + 16, + 32, + 64, + 128, + 256, + 384, + 512, + 768, + 1024, + 2048, + 
+        args.gen_batch_size,
+    ]
+
+    gen_moe_backend = "TRTLLM"
+
+    prefill_config: Dict[str, Any] = {
+        "max_batch_size": args.ctx_batch_size,
+        "max_num_tokens": args.ctx_max_num_tokens,
+        "max_seq_len": args.ctx_max_seq_len,
+        "tensor_parallel_size": args.ctx_tp_size,
+        "moe_expert_parallel_size": args.ctx_ep_size,
+        "enable_attention_dp": args.ctx_enable_attention_dp,
+        "pipeline_parallel_size": 1,
+        "print_iter_log": True,
+        "disable_overlap_scheduler": True,
+        "kv_cache_config": {
+            "enable_block_reuse": False,
+            "free_gpu_memory_fraction": args.ctx_free_gpu_memory_fraction,
+            "dtype": "fp8",
+        },
+        "cuda_graph_config": {
+            "enable_padding": True,
+            "max_batch_size": 30,
+        },
+        "num_postprocess_workers": 4,
+        "cache_transceiver_config": {
+            "max_tokens_in_buffer": args.cache_transceiver_max_num_tokens,
+            "backend": "UCX",
+        },
+        "moe_config": {
+            "backend": "TRTLLM",
+        },
+    }
+
+    decode_config: Dict[str, Any] = {
+        "allreduce_strategy": "AUTO",
+        "attention_dp_config": {
+            "enable_balance": True,
+        },
+        "disable_overlap_scheduler": False,
+        "tensor_parallel_size": args.gen_tp_size,
+        "moe_expert_parallel_size": args.gen_ep_size,
+        "enable_attention_dp": args.gen_enable_attention_dp,
+        "pipeline_parallel_size": 1,
+        "max_batch_size": args.gen_batch_size,
+        "max_num_tokens": args.gen_max_num_tokens,
+        "max_seq_len": args.gen_max_seq_len,
+        "cuda_graph_config": {
+            "enable_padding": True,
+            "batch_sizes": gen_cuda_graph_batch_sizes,
+        },
+        "print_iter_log": True,
+        "kv_cache_config": {
+            "enable_block_reuse": False,
+            "free_gpu_memory_fraction": args.gen_gpu_memory_fraction,
+            "dtype": "fp8",
+        },
+        "moe_config": {
+            "backend": gen_moe_backend,
+        },
+        "cache_transceiver_config": {
+            "max_tokens_in_buffer": args.cache_transceiver_max_num_tokens,
+            "backend": "UCX",
+        },
+        "stream_interval": 20,
+        "num_postprocess_workers": 4,
+    }
+
+    if args.eplb_num_slots > 0:
+        moe_load_balancer_file = os.path.join(
+            os.path.dirname(config_path), "moe_load_balancer.yaml"
+        )
+        # Ensure the directory exists before writing the file
+        os.makedirs(os.path.dirname(moe_load_balancer_file), exist_ok=True)
+        moe_load_balancer_config = {
+            "num_slots": args.eplb_num_slots,
+            "layer_updates_per_iter": 1,
+        }
+        with open(moe_load_balancer_file, "w") as f:
+            yaml.dump(
+                moe_load_balancer_config, f, default_flow_style=False, sort_keys=False
+            )
+        decode_config["moe_config"]["load_balancer"] = moe_load_balancer_file
+
+    if args.mtp_size > 0:
+        prefill_config["speculative_config"] = {
+            "decoding_type": "MTP",
+            "num_nextn_predict_layers": args.mtp_size,
+        }
+        decode_config["speculative_config"] = {
+            "decoding_type": "MTP",
+            "num_nextn_predict_layers": args.mtp_size,
+        }
+
+    return prefill_config, decode_config
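+
+
+# Dispatch table: gen_config_file() looks up the per-model config generator by
+# the inferred ModelType.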
+CONFIG_MAPPING = {
+    ModelType.GPT_OSS: generate_gpt_oss_config,
+    ModelType.DSR1: generate_dsr1_config,
+}
+
 
 def process_node_and_task() -> tuple[int, List[str], List[str]]:
     """
@@ -144,6 +394,7 @@ def gen_config_file(
     ctx_enable_attention_dp: bool,
     num_gen_servers: int,
     gen_tp_size: int,
+    gen_ep_size: int,
     gen_batch_size: int,
     gen_max_num_tokens: int,
     gen_max_seq_len: int,
@@ -153,7 +404,7 @@ def gen_config_file(
     mtp_size: int = 0,
     worker_start_port: int = 8001,
     server_port: int = 8000,
-    cache_transceiver_max_num_tokens: int = 4608,
+    cache_transceiver_max_num_tokens: int = 9216,
 ) -> None:
     """
     Generate configuration YAML file for disaggregated inference.
@@ -170,6 +421,7 @@ def gen_config_file(
        ctx_enable_attention_dp: Enable attention DP for context servers
        num_gen_servers: Number of generation servers
        gen_tp_size: Tensor parallel size for generation servers
+       gen_ep_size: Expert parallel size for generation servers
        gen_batch_size: Batch size for generation servers
        gen_max_num_tokens: Max number of tokens for generation servers
        gen_enable_attention_dp: Enable attention DP for generation servers
@@ -178,109 +430,15 @@ def gen_config_file(
        worker_start_port: Start port for workers
        server_port: Server port
     """
-    gen_cuda_graph_batch_sizes = [
-        1,
-        2,
-        4,
-        8,
-        16,
-        32,
-        64,
-        128,
-        256,
-        384,
-        512,
-        768,
-        1024,
-        2048,
-        gen_batch_size,
-    ]
-
-    gen_moe_backend = "CUTLASS"
-    if gen_tp_size >= 16 and gen_enable_attention_dp:
-        gen_moe_backend = "WIDEEP"
-    if not gen_enable_attention_dp:
-        gen_moe_backend = "TRTLLM"
-
-    prefill_config: Dict[str, Any] = {
-        "max_batch_size": ctx_batch_size,
-        "max_num_tokens": ctx_max_num_tokens,
-        "max_seq_len": ctx_max_seq_len,
-        "tensor_parallel_size": ctx_tp_size,
-        "moe_expert_parallel_size": ctx_tp_size,
-        "enable_attention_dp": ctx_enable_attention_dp,
-        "pipeline_parallel_size": 1,
-        "cuda_graph_config": None,
-        "print_iter_log": True,
-        "disable_overlap_scheduler": True,
-        "kv_cache_config": {
-            "enable_block_reuse": False,
-            "free_gpu_memory_fraction": ctx_free_gpu_memory_fraction,
-            "dtype": "fp8",
-        },
-        "cache_transceiver_config": {
-            "max_tokens_in_buffer": cache_transceiver_max_num_tokens,
-            "backend": "DEFAULT",
-        },
-    }
-
-    decode_config: Dict[str, Any] = {
-        "tensor_parallel_size": gen_tp_size,
-        "moe_expert_parallel_size": gen_tp_size,
-        "enable_attention_dp": gen_enable_attention_dp,
-        "pipeline_parallel_size": 1,
-        "max_batch_size": gen_batch_size,
-        "max_num_tokens": gen_max_num_tokens,
-        "max_seq_len": gen_max_seq_len,
-        "cuda_graph_config": {
-            "enable_padding": True,
-            "batch_sizes": gen_cuda_graph_batch_sizes,
-        },
-        "print_iter_log": True,
-        "kv_cache_config": {
-            "enable_block_reuse": False,
-            "free_gpu_memory_fraction": gen_gpu_memory_fraction,
-            "dtype": "fp8",
-        },
-        "moe_config": {
-            "backend": gen_moe_backend,
-            "use_low_precision_moe_combine": True,
-        },
-        "cache_transceiver_config": {
-            "max_tokens_in_buffer": cache_transceiver_max_num_tokens,
-            "backend": "DEFAULT",
-        },
-        "stream_interval": 20,
-    }
+    model_type = get_model_type(model_path)
 
-    if gen_tp_size == 8 and not gen_enable_attention_dp:
-        decode_config["allreduce_strategy"] = "MNNVL"
-
-    if eplb_num_slots > 0:
-        moe_load_balancer_file = os.path.join(
-            os.path.dirname(config_path), "moe_load_balancer.yaml"
-        )
-        # Ensure the directory exists before writing the file
-        os.makedirs(os.path.dirname(moe_load_balancer_file), exist_ok=True)
-        moe_load_balancer_config = {
-            "num_slots": eplb_num_slots,
-            "layer_updates_per_iter": 1,
-        }
-        with open(moe_load_balancer_file, "w") as f:
-            yaml.dump(
-                moe_load_balancer_config, f, default_flow_style=False, sort_keys=False
-            )
-        decode_config["moe_config"]["load_balancer"] = moe_load_balancer_file
-
-    if mtp_size > 0:
-        prefill_config["speculative_config"] = {
-            "decoding_type": "MTP",
-            "num_nextn_predict_layers": mtp_size,
-        }
-        decode_config["speculative_config"] = {
-            "decoding_type": "MTP",
-            "num_nextn_predict_layers": mtp_size,
-        }
+    prefill_config, decode_config = CONFIG_MAPPING[model_type](
+        config_path,
+        decode_config_path,
+        instance_config_path,
+        args,
+    )
 
     counts = {"prefill_count": num_ctx_servers, "decode_count": num_gen_servers}
 
@@ -309,6 +467,12 @@ def gen_config_file(
         required=True,
         help="Tensor parallel size for context servers",
     )
+    parser.add_argument(
+        "--ctx_ep_size",
+        type=int,
+        required=True,
+        help="Expert parallel size for context servers",
+    )
     parser.add_argument(
         "--ctx_batch_size",
         type=int,
@@ -351,6 +515,12 @@ def gen_config_file(
         required=True,
         help="Tensor parallel size for generation servers",
     )
+    parser.add_argument(
+        "--gen_ep_size",
+        type=int,
+        required=True,
+        help="Expert parallel size for generation servers",
+    )
     parser.add_argument(
         "--gen_batch_size",
         type=int,
diff --git a/components/backends/trtllm/performance_sweeps/scripts/start_disagg_worker.sh b/components/backends/trtllm/performance_sweeps/scripts/start_disagg_worker.sh
index 305fd157ec..d99b1b531b 100755
--- a/components/backends/trtllm/performance_sweeps/scripts/start_disagg_worker.sh
+++ b/components/backends/trtllm/performance_sweeps/scripts/start_disagg_worker.sh
@@ -3,13 +3,12 @@
 # SPDX-License-Identifier: Apache-2.0
 
 config_file=$1
-enable_pdl=$2
-ctx_gpus=$3
-model_name=$4
-model_path=$5
-disaggregation_mode=$6
+ctx_gpus=$2
+model_name=$3
+model_path=$4
+disaggregation_mode=$5
 unset UCX_TLS
-echo "config_file: ${config_file}, enable_pdl: ${enable_pdl}, ctx_gpus: ${ctx_gpus}, disaggregation_mode: ${disaggregation_mode}"
+echo "config_file: ${config_file}, ctx_gpus: ${ctx_gpus}, disaggregation_mode: ${disaggregation_mode}"
 
 # Read configuration values from the YAML config file
 if [ ! -f "${config_file}" ]; then
@@ -47,9 +46,12 @@ export TRTLLM_MOE_ENABLE_ALLTOALL_WITHOUT_ALLGATHER=1
 # "moe_backend.use_low_precision_combine: true" in recent trtllm commits, and
 # can be removed. Keeping it here in case the script is ran with older commits.
 export TRTLLM_MOE_USE_LOW_PRECISION_COMBINE=1
+# TODO: Is there ever a case where we don't want this enabled?
+export TRTLLM_ENABLE_PDL=1
 
-if [ "${enable_pdl}" = "true" ]; then
-    export TRTLLM_ENABLE_PDL=1
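+# NOTE: mirrors the substring heuristic in gen_yaml.py — a model path without
+# "r1" in it is assumed to be a gpt-oss style checkpoint.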
+if [[ "${model_path,,}" != *r1* ]]; then
+    echo "Inferred gpt-oss style model. Setting OVERRIDE_QUANT_ALGO to W4A8_MXFP4_MXFP8"
+    export OVERRIDE_QUANT_ALGO=W4A8_MXFP4_MXFP8
 fi
 
 # NOTE: Set (or unset) these depending on what cluster you're using
diff --git a/components/backends/trtllm/performance_sweeps/submit_disagg.sh b/components/backends/trtllm/performance_sweeps/submit_disagg.sh
index 5300cc9c27..0b23bc9925 100755
--- a/components/backends/trtllm/performance_sweeps/submit_disagg.sh
+++ b/components/backends/trtllm/performance_sweeps/submit_disagg.sh
@@ -77,15 +77,19 @@ usage() {
 
 # Run single task
 run_single() {
     local ctx_num=$1
-    local gen_num=$2
-    local gen_tp_size=$3
-    local gen_batch_size=$4
-    local gen_max_num_tokens=$5
-    local gen_enable_attention_dp=$6
-    local gen_gpu_memory_fraction=$7
-    local gen_mtp_size=$8
-    local gen_eplb_num_slots=$9
-    local gen_concurrency_list=${10}
+    local ctx_tp_size=$2
+    local ctx_ep_size=$3
+    local ctx_enable_attention_dp=$4
+    local gen_num=$5
+    local gen_tp_size=$6
+    local gen_ep_size=$7
+    local gen_batch_size=$8
+    local gen_max_num_tokens=$9
+    local gen_enable_attention_dp=${10}
+    local gen_gpu_memory_fraction=${11}
+    local gen_mtp_size=${12}
+    local gen_eplb_num_slots=${13}
+    local gen_concurrency_list=${14}
 
     # TODO: expose kind to the command line
     local kind="dynamo_disagg"
@@ -94,179 +98,10 @@ run_single() {
     total_nodes=$((ctx_num + gen_nodes))
     total_tasks=$((total_nodes * 4))
     set -x
-    if (( ISL == OSL )); then
-        sbatch --nodes=${total_nodes} --ntasks=${total_tasks} --ntasks-per-node=${NTASKS_PER_NODE} --segment=${total_nodes} ${slurm_args} benchmark_disagg.slurm ${ctx_num} 4 4 4608 true ${gen_num} ${gen_tp_size} ${gen_batch_size} ${gen_max_num_tokens} ${gen_enable_attention_dp} ${gen_gpu_memory_fraction} ${gen_eplb_num_slots} ${gen_mtp_size} "${gen_concurrency_list}" ${gen_nodes} ${kind} ${MODEL_PATH} ${SERVED_MODEL_NAME} ${IMAGE} ${ISL} ${OSL}
-    else
-        sbatch --nodes=${total_nodes} --ntasks=${total_tasks} --ntasks-per-node=${NTASKS_PER_NODE} --segment=${total_nodes} ${slurm_args} benchmark_disagg.slurm ${ctx_num} 4 1 8448 true ${gen_num} ${gen_tp_size} ${gen_batch_size} ${gen_max_num_tokens} ${gen_enable_attention_dp} ${gen_gpu_memory_fraction} ${gen_eplb_num_slots} ${gen_mtp_size} "${gen_concurrency_list}" ${gen_nodes} ${kind} ${MODEL_PATH} ${SERVED_MODEL_NAME} ${IMAGE} ${ISL} ${OSL}
-    fi
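+    # NOTE: ctx_batch_size (30) and ctx_max_num_tokens (20000) are hardcoded in
+    # this invocation rather than exposed as tunable parameters.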
+    sbatch --nodes=${total_nodes} --ntasks=${total_tasks} --ntasks-per-node=${NTASKS_PER_NODE} --segment=${total_nodes} ${slurm_args} benchmark_disagg.slurm ${ctx_num} ${ctx_tp_size} ${ctx_ep_size} ${ctx_enable_attention_dp} 30 20000 ${gen_num} ${gen_tp_size} ${gen_ep_size} ${gen_batch_size} ${gen_max_num_tokens} ${gen_enable_attention_dp} ${gen_gpu_memory_fraction} ${gen_eplb_num_slots} ${gen_mtp_size} "${gen_concurrency_list}" ${gen_nodes} ${kind} ${MODEL_PATH} ${SERVED_MODEL_NAME} ${IMAGE} ${ISL} ${OSL}
     set +x
 }
 
-# MTP0 Configuration (gen_mtp_size=0)
-run_4_gpus_mtp0() {
-    echo "Running 4 GPUs MTP0 combinations..."
-    if (( ISL == OSL )); then
-        run_single 1 5 4 128 128 false "0.9" 0 0 "1 2 4 8 16 32 64 128 192"
-        run_single 1 5 4 64 64 true "0.85" 0 0 "256 384"
-        run_single 1 4 4 128 128 true "0.85" 0 0 "512 768"
-        run_single 2 5 4 256 256 true "0.85" 0 0 "1024 1536"
-        run_single 1 2 4 512 512 true "0.85" 0 0 "2048 3072"
-        run_single 2 3 4 768 768 true "0.85" 0 0 "3072 4096"
-    else
-        run_single 1 5 4 16 16 false "0.9" 0 0 "1 2 4 8 16 24"
-        run_single 1 4 4 32 32 false "0.9" 0 0 "32 48"
-        run_single 2 5 4 64 64 false "0.9" 0 0 "64 96"
-        run_single 1 2 4 128 128 false "0.9" 0 0 "128 192"
-        run_single 1 1 4 64 64 true "0.8" 0 0 "256 384"
-        run_single 3 2 4 128 128 true "0.8" 0 0 "512 768"
-    fi
-}
-
-run_8_gpus_mtp0() {
-    echo "Running 8 GPUs MTP0 combinations..."
-    if (( ISL == OSL )); then
-        run_single 1 4 8 128 128 false "0.9" 0 0 "1 2 4 8 16 32 64 128 192 256"
-        run_single 1 4 8 32 32 true "0.8" 0 0 "256 384"
-        run_single 1 3 8 64 64 true "0.8" 0 0 "512 768"
-        run_single 1 2 8 128 128 true "0.8" 0 0 "1024 1536"
-        run_single 1 1 8 256 256 true "0.8" 0 0 "2048 3072"
-        run_single 1 1 8 512 512 true "0.8" 0 0 "4096 6144"
-        run_single 3 2 8 768 768 true "0.8" 0 0 "6144 8192"
-        run_single 3 2 8 1024 1024 true "0.8" 0 0 "8192 12288"
-    else
-        run_single 1 4 8 16 16 false "0.9" 0 0 "1 2 4 8 16 24"
-        run_single 1 3 8 32 32 false "0.9" 0 0 "32 48"
-        run_single 1 2 8 64 64 false "0.9" 0 0 "64 96"
-        run_single 1 1 8 128 128 false "0.9" 0 0 "128 192"
-        run_single 3 2 8 32 32 true "0.8" 0 0 "256 384"
-        run_single 5 2 8 64 64 true "0.8" 0 0 "512 768"
-        run_single 4 1 8 128 128 true "0.8" 0 0 "1024 1536"
-        run_single 5 1 8 256 256 true "0.8" 0 0 "2048 3072"
-    fi
-}
-
-run_16_gpus_mtp0() {
-    echo "Running 16 GPUs MTP0 combinations..."
-    if (( ISL == OSL )); then
-        run_single 1 1 16 64 64 true "0.75" 0 0 "16 32 64 128 256 512 1024 1536"
-        run_single 2 1 16 128 128 true "0.75" 0 256 "2048 3072"
-        run_single 2 1 16 256 256 true "0.75" 0 256 "4096 6144"
-        run_single 3 1 16 512 512 true "0.75" 0 256 "8192 12288"
-        run_single 3 1 16 768 768 true "0.75" 0 256 "12288 16384"
-        run_single 3 1 16 1024 1024 true "0.75" 0 288 "16384 20480"
-    else
-        run_single 1 1 16 8 8 true "0.8" 0 0 "16 32 64 128 192" # 5
-        run_single 2 1 16 16 16 true "0.8" 0 0 "256 384" # 6
-        run_single 3 1 16 32 32 true "0.8" 0 0 "512 768" # 7
-        run_single 6 1 16 64 64 true "0.8" 0 0 "1024 1536" # 10
-        run_single 8 1 16 128 128 true "0.8" 0 256 "2048 3072" # 12
-        run_single 10 1 16 256 256 true "0.8" 0 256 "4096 6144" # 14
-    fi
-}
-
-run_32_gpus_mtp0() {
-    echo "Running 32 GPUs MTP0 combinations..."
-    if (( ISL == OSL )); then
-        run_single 1 1 32 32 32 true "0.7" 0 0 "32 64 128 256 512 1024 1536"
-        run_single 2 1 32 64 64 true "0.7" 0 256 "2048 3072"
-        run_single 3 1 32 128 128 true "0.7" 0 288 "4096 6144"
-        run_single 4 1 32 256 256 true "0.7" 0 288 "8192 12288"
-        run_single 5 1 32 512 512 true "0.7" 0 288 "16384 20480"
-    else
-        run_single 1 1 32 4 4 true "0.7" 0 0 "32 64 128 192" # 9
-        run_single 2 1 32 8 8 true "0.7" 0 0 "256 384" # 10
-        run_single 4 1 32 16 16 true "0.7" 0 0 "512 768" # 12
-        run_single 7 1 32 32 32 true "0.7" 0 0 "1024 1536" # 15
-    fi
-}
-
-# MTP Configuration (gen_mtp_size=1,2,3)
-run_4_gpus_mtp() {
-    echo "Running 4 GPUs MTP combinations..."
-    if (( ISL == OSL )); then
-        run_single 1 5 4 32 128 false "0.9" 3 0 "1 2 4 8 16 32 48"
-        run_single 1 5 4 32 128 true "0.9" 3 0 "64 128 192"
-        run_single 1 4 4 64 256 true "0.9" 3 0 "256 384"
-        run_single 1 3 4 128 512 true "0.9" 3 0 "512 768"
-        run_single 1 2 4 256 768 true "0.9" 2 0 "1024 1536"
-        run_single 2 3 4 512 1024 true "0.9" 1 0 "2048 3072"
-        run_single 1 1 4 768 1536 true "0.9" 1 0 "3072 4096"
-    else
-        run_single 1 5 4 8 32 false "0.9" 3 0 "1 2 4 8 12"
-        run_single 1 4 4 16 64 false "0.9" 3 0 "16 24"
-        run_single 1 3 4 32 128 false "0.9" 3 0 "32 48"
-        run_single 2 3 4 16 64 true "0.8" 3 0 "64 96"
-        run_single 1 1 4 32 128 true "0.8" 3 0 "128 192"
-        run_single 2 1 4 64 256 true "0.8" 2 0 "256 384"
-        run_single 5 2 4 128 512 true "0.8" 1 0 "512 768"
-    fi
-}
-
-run_8_gpus_mtp() {
-    echo "Running 8 GPUs MTP combinations..."
-    if (( ISL == OSL )); then
-        run_single 1 4 8 32 128 false "0.9" 3 0 "1 2 4 8 16 32 48"
-        run_single 1 4 8 16 64 true "0.8" 3 0 "64 128 192"
-        run_single 1 3 8 32 128 true "0.8" 3 0 "256 384"
-        run_single 1 2 8 64 256 true "0.8" 3 0 "512 768"
-        run_single 1 1 8 128 512 true "0.8" 3 0 "1024 1536"
-        run_single 1 1 8 256 512 true "0.8" 1 0 "2048 3072"
-        run_single 3 2 8 512 1024 true "0.8" 1 0 "4096 6144"
-        run_single 3 2 8 768 1536 true "0.8" 1 0 "6144 8192"
-        run_single 3 2 8 1024 2048 true "0.8" 1 0 "8192 12288"
-    else
-        run_single 1 4 8 8 32 false "0.9" 3 0 "1 2 4 8 12"
-        run_single 1 3 8 16 64 false "0.9" 3 0 "16 24"
-        run_single 1 2 8 32 128 false "0.9" 3 0 "32 48"
-        run_single 1 1 8 8 32 true "0.8" 3 0 "64 96"
-        run_single 3 2 8 16 64 true "0.8" 3 0 "128 192"
-        run_single 5 2 8 32 128 true "0.8" 3 0 "256 384"
-        run_single 7 2 8 64 256 true "0.8" 2 0 "512 768"
-        run_single 5 1 8 128 256 true "0.8" 1 0 "1024 1536"
-        run_single 6 1 8 256 512 true "0.8" 1 0 "2048 3072"
-    fi
-}
-
-run_16_gpus_mtp() {
-    echo "Running 16 GPUs MTP combinations..."
-    if (( ISL == OSL )); then
-        run_single 1 1 16 32 128 true "0.7" 3 0 "16 32 64 128 256 512 768"
-        run_single 1 1 16 64 256 true "0.7" 3 256 "1024 1536"
-        run_single 2 1 16 128 256 true "0.7" 1 288 "2048 3072"
-        run_single 2 1 16 256 512 true "0.7" 1 288 "4096 6144"
-        run_single 3 1 16 512 1024 true "0.7" 1 288 "8192 12288"
-        run_single 3 1 16 768 1536 true "0.7" 1 288 "12288 16384"
-        run_single 3 1 16 1024 1024 true "0.75" 0 288 "16384 20480"
-    else
-        run_single 1 1 16 4 16 true "0.8" 3 0 "16 32 64 96" # 5
-        run_single 2 1 16 8 32 true "0.8" 3 0 "128 192" # 6
-        run_single 4 1 16 16 64 true "0.8" 3 0 "256 384" # 8
-        run_single 6 1 16 32 128 true "0.8" 3 0 "512 768" # 10
-        run_single 8 1 16 64 256 true "0.8" 2 256 "1024 1536" # 13
-        run_single 10 1 16 128 256 true "0.8" 1 256 "2048 3072" # 15
-        run_single 12 1 16 256 512 true "0.8" 1 256 "4096 6144" # 16
-    fi
-
-}
-
-run_32_gpus_mtp() {
-    echo "Running 32 GPUs MTP combinations..."
-    if (( ISL == OSL )); then
-        run_single 1 1 32 16 64 true "0.6" 3 0 "32 64 128 256 512 768"
-        run_single 2 1 32 32 128 true "0.6" 3 288 "1024 1536"
-        run_single 3 1 32 64 256 true "0.6" 3 288 "2048 3072"
-        run_single 3 1 32 128 256 true "0.6" 1 288 "4096 6144"
-        run_single 4 1 32 256 512 true "0.6" 1 288 "8192 12288"
-        run_single 5 1 32 512 1024 true "0.6" 1 288 "16384 20480"
-    else
-        run_single 1 1 32 1 4 true "0.7" 3 0 "32 48" # 9
-        run_single 2 1 32 2 8 true "0.7" 3 0 "64 96" # 10
-        run_single 3 1 32 4 16 true "0.7" 3 0 "128 192" # 11
-        run_single 5 1 32 8 32 true "0.7" 3 0 "256 384" # 13
-        run_single 8 1 32 16 64 true "0.7" 3 256 "512 768" # 16
-    fi
-}
-
 # Main function
 main() {
     local mtp_mode=$1
@@ -279,139 +114,75 @@ main() {
     fi
 
     case $mode in
-        "all")
-            echo "Running all GPU configurations for $mtp_mode mode..."
-            if [[ "$mtp_mode" == "mtp=off" ]]; then
-                run_4_gpus_mtp0
-                run_8_gpus_mtp0
-                run_16_gpus_mtp0
-                run_32_gpus_mtp0
-            else
-                run_4_gpus_mtp
-                run_8_gpus_mtp
-                run_16_gpus_mtp
-                run_32_gpus_mtp
-            fi
-            ;;
-        "pareto")
-            # 1k/1k
-            export ISL=1024
-            export OSL=1024
-            export CACHE_TRANSCEIVER_MAX_NUM_TOKENS=4608
-
-            if [[ "$mtp_mode" == "mtp=off" ]]; then
-                # 1k/1k mtp=off
-                run_single 1 4 8 128 128 false "0.9" 0 0 "1 2 4 8 16 32 64 141"
-                run_single 1 1 32 32 32 true "0.7" 0 0 "1075"
-                run_single 1 1 16 64 64 true "0.75" 0 0 "1075"
-                run_single 2 1 16 256 256 true "0.75" 0 0 "2048 4300"
-                run_single 1 1 8 512 512 true "0.8" 0 0 "4300"
-
-            else
-                # 1k/1k mtp=on
-                run_single 1 4 8 32 128 false "0.9" 3 0 "1 2 4 8 16 36"
-                run_single 1 1 16 64 256 true "0.7" 3 0 "512 1075"
-                run_single 2 1 16 128 256 true "0.7" 1 0 "2150"
-                run_single 1 1 32 16 64 true "0.6" 3 0 "512"
-                run_single 1 1 8 256 512 true "0.8" 1 0 "2252"
-            fi
-
-            # 8k/1k
-            export ISL=8192
-            export OSL=1024
-            export CACHE_TRANSCEIVER_MAX_NUM_TOKENS=8448
-
-            if [[ "$mtp_mode" == "mtp=off" ]]; then
-                # 8k/1k mtp=off
-                run_single 1 3 8 32 32 false "0.9" 0 0 "1 2 4 8 16 34"
-                run_single 4 1 32 16 16 true "0.7" 0 0 "256 538"
-                run_single 7 1 32 32 32 true "0.7" 0 0 "1075" # remove if need 5 cofigs
-                run_single 6 1 16 64 64 true "0.75" 0 0 "1075"
-                run_single 8 1 16 128 128 true "0.75" 0 0 "2150"
-                run_single 5 1 8 256 256 true "0.8" 0 0 "2150"
-            else
-                # 8k/1k mtp=on
-                run_single 1 3 8 16 64 false "0.9" 3 0 "1 2 4 8 18"
-                run_single 5 1 32 8 32 true "0.7" 3 0 "128 269"
-                run_single 8 1 32 16 64 true "0.7" 3 0 "538"
-                run_single 6 1 16 32 128 true "0.75" 3 0 "538" # remove if need 5 configs
-                run_single 8 1 16 64 256 true "0.75" 2 0 "1075"
-                run_single 5 1 8 128 256 true "0.8" 1 0 "1075" # remove if need 5 configs
-                run_single 6 1 8 256 512 true "0.8" 1 0 "2150"
-            fi
-            ;;
-        "4GPU")
-            echo "Running 4 GPUs combinations for $mtp_mode mode..."
-            if [[ "$mtp_mode" == "mtp=off" ]]; then
-                run_4_gpus_mtp0
-            else
-                run_4_gpus_mtp
-            fi
-            ;;
-        "8GPU")
-            echo "Running 8 GPUs combinations for $mtp_mode mode..."
-            if [[ "$mtp_mode" == "mtp=off" ]]; then
-                run_8_gpus_mtp0
-            else
-                run_8_gpus_mtp
-            fi
-            ;;
-        "16GPU")
-            echo "Running 16 GPUs combinations for $mtp_mode mode..."
-            if [[ "$mtp_mode" == "mtp=off" ]]; then
-                run_16_gpus_mtp0
-            else
-                run_16_gpus_mtp
-            fi
-            ;;
-        "32GPU")
-            echo "Running 32 GPUs combinations for $mtp_mode mode..."
-            if [[ "$mtp_mode" == "mtp=off" ]]; then
-                run_32_gpus_mtp0
-            else
-                run_32_gpus_mtp
-            fi
-            ;;
         "tep")
-            if [ $# -ne 11 ]; then
-                echo "Error: TEP mode requires 11 additional parameters (including mtp_mode)"
+            if [ $# -ne 14 ]; then
+                echo "Error: TEP mode requires 14 arguments (including mtp_mode and mode)"
                 usage
             fi
 
             local ctx_num=$3
-            local gen_num=$4
-            local gen_tp_size=$5
-            local gen_batch_size=$6
-            local gen_max_num_tokens=$7
-            local gen_gpu_memory_fraction=$8
-            local gen_mtp_size=$9
-            local gen_eplb_num_slots=${10}
-            local gen_concurrency_list=${11}
-
-            echo "Running TEP mode ($mtp_mode) with ctx_num=$ctx_num, gen_num=$gen_num, gen_tp_size=$gen_tp_size, gen_batch_size=$gen_batch_size, gen_max_num_tokens=$gen_max_num_tokens, gen_gpu_memory_fraction=$gen_gpu_memory_fraction, gen_mtp_size=$gen_mtp_size, gen_eplb_num_slots=$gen_eplb_num_slots, gen_concurrency_list=\"$gen_concurrency_list\""
+            local ctx_tp_size=$4
+            local ctx_ep_size=$5
+            local ctx_enable_attention_dp=$6
+            local gen_num=$7
+            local gen_tp_size=$8
+            local gen_batch_size=$9
+            local gen_max_num_tokens=${10}
+            local gen_gpu_memory_fraction=${11}
+            local gen_mtp_size=${12}
+            local gen_eplb_num_slots=${13}
+            local gen_concurrency_list=${14}
+
+            echo "Running TEP mode ($mtp_mode) with ctx_num=$ctx_num, gen_num=$gen_num, gen_tp_size=$gen_tp_size, gen_ep_size=$gen_tp_size, gen_batch_size=$gen_batch_size, gen_max_num_tokens=$gen_max_num_tokens, gen_gpu_memory_fraction=$gen_gpu_memory_fraction, gen_mtp_size=$gen_mtp_size, gen_eplb_num_slots=$gen_eplb_num_slots, gen_concurrency_list=\"$gen_concurrency_list\""
 
             # TEP mode: Use false to disable attention dp
-            run_single $ctx_num $gen_num $gen_tp_size $gen_batch_size $gen_max_num_tokens false $gen_gpu_memory_fraction $gen_mtp_size $gen_eplb_num_slots "$gen_concurrency_list"
+            run_single $ctx_num $ctx_tp_size $ctx_ep_size $ctx_enable_attention_dp $gen_num $gen_tp_size $gen_tp_size $gen_batch_size $gen_max_num_tokens false $gen_gpu_memory_fraction $gen_mtp_size $gen_eplb_num_slots "$gen_concurrency_list"
             ;;
         "dep")
-            if [ $# -ne 11 ]; then
-                echo "Error: DEP mode requires 11 additional parameters (including mtp_mode)"
+            if [ $# -ne 14 ]; then
+                echo "Error: DEP mode requires 14 arguments (including mtp_mode and mode)"
                 usage
             fi
 
             local ctx_num=$3
-            local gen_num=$4
-            local gen_tp_size=$5
-            local gen_batch_size=$6
-            local gen_max_num_tokens=$7
-            local gen_gpu_memory_fraction=$8
-            local gen_mtp_size=$9
-            local gen_eplb_num_slots=${10}
-            local gen_concurrency_list=${11}
-
-            echo "Running DEP mode ($mtp_mode) with ctx_num=$ctx_num, gen_num=$gen_num, gen_tp_size=$gen_tp_size, gen_batch_size=$gen_batch_size, gen_max_num_tokens=$gen_max_num_tokens, gen_gpu_memory_fraction=$gen_gpu_memory_fraction, gen_mtp_size=$gen_mtp_size, gen_eplb_num_slots=$gen_eplb_num_slots, gen_concurrency_list=\"$gen_concurrency_list\""
+            local ctx_tp_size=$4
+            local ctx_ep_size=$5
+            local ctx_enable_attention_dp=$6
+            local gen_num=$7
+            local gen_tp_size=$8
+            local gen_batch_size=$9
+            local gen_max_num_tokens=${10}
+            local gen_gpu_memory_fraction=${11}
+            local gen_mtp_size=${12}
+            local gen_eplb_num_slots=${13}
+            local gen_concurrency_list=${14}
+
+            echo "Running DEP mode ($mtp_mode) with ctx_num=$ctx_num, ctx_tp_size=$ctx_tp_size, ctx_enable_attention_dp=$ctx_enable_attention_dp, gen_num=$gen_num, gen_tp_size=$gen_tp_size, gen_ep_size=$gen_tp_size, gen_batch_size=$gen_batch_size, gen_max_num_tokens=$gen_max_num_tokens, gen_gpu_memory_fraction=$gen_gpu_memory_fraction, gen_mtp_size=$gen_mtp_size, gen_eplb_num_slots=$gen_eplb_num_slots, gen_concurrency_list=\"$gen_concurrency_list\""
 
+            # DEP mode: attention dp on and gen_ep_size equal to gen_tp_size
+            run_single $ctx_num $ctx_tp_size $ctx_ep_size $ctx_enable_attention_dp $gen_num $gen_tp_size $gen_tp_size $gen_batch_size $gen_max_num_tokens true $gen_gpu_memory_fraction $gen_mtp_size $gen_eplb_num_slots "$gen_concurrency_list"
+            ;;
+        "tp")
+            if [ $# -ne 14 ]; then
+                echo "Error: TP mode requires 14 arguments (including mtp_mode and mode)"
+                usage
+            fi
 
-            run_single $ctx_num $gen_num $gen_tp_size $gen_batch_size $gen_max_num_tokens true $gen_gpu_memory_fraction $gen_mtp_size $gen_eplb_num_slots "$gen_concurrency_list"
+            local ctx_num=$3
+            local ctx_tp_size=$4
+            local ctx_ep_size=$5
+            local ctx_enable_attention_dp=$6
+            local gen_num=$7
+            local gen_tp_size=$8
+            local gen_batch_size=$9
+            local gen_max_num_tokens=${10}
+            local gen_gpu_memory_fraction=${11}
+            local gen_mtp_size=${12}
+            local gen_eplb_num_slots=${13}
+            local gen_concurrency_list=${14}
+
+            echo "Running TP mode ($mtp_mode) with ctx_num=$ctx_num, gen_num=$gen_num, gen_tp_size=$gen_tp_size, gen_ep_size=1, gen_batch_size=$gen_batch_size, gen_max_num_tokens=$gen_max_num_tokens, gen_gpu_memory_fraction=$gen_gpu_memory_fraction, gen_mtp_size=$gen_mtp_size, gen_eplb_num_slots=$gen_eplb_num_slots, gen_concurrency_list=\"$gen_concurrency_list\""
+
+            # TP mode: expert parallelism disabled (gen_ep_size=1) and attention dp off
+            run_single $ctx_num $ctx_tp_size $ctx_ep_size $ctx_enable_attention_dp $gen_num $gen_tp_size 1 $gen_batch_size $gen_max_num_tokens false $gen_gpu_memory_fraction $gen_mtp_size $gen_eplb_num_slots "$gen_concurrency_list"
+            ;;
         *)
             echo "Error: Unknown mode '$mode'"