From d6677c9f0a78c3dd2982cad779fd3f64eec9d911 Mon Sep 17 00:00:00 2001
From: 22quinn <33176974+22quinn@users.noreply.github.com>
Date: Sun, 26 Oct 2025 22:30:03 -0700
Subject: [PATCH 1/6] [CI/Build] Test torchrun with 8 cards

Signed-off-by: 22quinn <33176974+22quinn@users.noreply.github.com>
---
 .buildkite/test-pipeline.yaml                | 19 +++++++++++++++++--
 .../offline_inference/torchrun_dp_example.py | 16 ++++++++++++----
 2 files changed, 29 insertions(+), 6 deletions(-)

diff --git a/.buildkite/test-pipeline.yaml b/.buildkite/test-pipeline.yaml
index 3f280c837be9..96f999cd3c17 100644
--- a/.buildkite/test-pipeline.yaml
+++ b/.buildkite/test-pipeline.yaml
@@ -205,6 +205,21 @@ steps:
   - VLLM_ALLOW_INSECURE_SERIALIZATION=1 RAY_DEDUP_LOGS=0 python3 rlhf_colocate.py
   - popd
 
+- label: Distributed Tests (8 GPUs) # ?min
+  timeout_in_minutes: 50
+  mirror_hardwares: [amdexperimental]
+  working_dir: "/vllm-workspace/tests"
+  num_gpus: 8
+  source_file_dependencies:
+  - vllm/distributed/
+  - tests/examples/offline_inference/torchrun_dp_example.py
+  - tests/v1/distributed
+  commands:
+  # https://github.com/NVIDIA/nccl/issues/1838
+  - export NCCL_CUMEM_HOST_ENABLE=0
+  # test with torchrun tp=2 and dp=4 with ep
+  - TP_SIZE=2 DP_SIZE=4 ENABLE_EP=1 torchrun --nproc-per-node=8 ../examples/offline_inference/torchrun_dp_example.py
+
 - label: EPLB Algorithm Test # 5min
   timeout_in_minutes: 15
   working_dir: "/vllm-workspace/tests"
@@ -392,7 +407,7 @@
       --ignore=lora/test_deepseekv2_tp.py \
       --ignore=lora/test_gptoss.py \
       --ignore=lora/test_qwen3moe_tp.py
-  
+
   parallelism: 4
 
 - label: PyTorch Compilation Unit Tests # 15min
@@ -1101,7 +1116,7 @@
   - tests/weight_loading
   commands:
   - bash weight_loading/run_model_weight_loading_test.sh -c weight_loading/models-large.txt
-  
+
 - label: NixlConnector PD accuracy tests (Distributed) # 30min
   timeout_in_minutes: 30
   working_dir: "/vllm-workspace/tests"
diff --git a/examples/offline_inference/torchrun_dp_example.py b/examples/offline_inference/torchrun_dp_example.py
index 295d1637528c..349e5ad167e6 100644
--- a/examples/offline_inference/torchrun_dp_example.py
+++ b/examples/offline_inference/torchrun_dp_example.py
@@ -11,8 +11,16 @@
 ```
 """
 
+import os
+
 from vllm import LLM, SamplingParams
 
+tp_size = int(os.getenv("TP_SIZE", "1"))
+pp_size = int(os.getenv("PP_SIZE", "1"))
+dp_size = int(os.getenv("DP_SIZE", "2"))
+enable_ep = bool(int(os.getenv("ENABLE_EP", "0")))
+
+
 # Create prompts, the same across all ranks
 prompts = [
     "Hello, my name is",
@@ -31,10 +39,10 @@
 # deterministic across ranks.
 llm = LLM(
     model="microsoft/Phi-mini-MoE-instruct",
-    tensor_parallel_size=1,
-    data_parallel_size=2,
-    pipeline_parallel_size=1,
-    enable_expert_parallel=False,
+    tensor_parallel_size=tp_size,
+    data_parallel_size=dp_size,
+    pipeline_parallel_size=pp_size,
+    enable_expert_parallel=enable_ep,
     distributed_executor_backend="external_launcher",
     max_model_len=4096,
     gpu_memory_utilization=0.6,
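Reviewer note on the launch arithmetic behind the new CI command: with `distributed_executor_backend="external_launcher"`, vLLM does not spawn worker processes itself, so torchrun must start exactly tp * pp * dp processes — which is why `TP_SIZE=2 DP_SIZE=4` is paired with `--nproc-per-node=8`. A minimal sketch of that invariant, assuming the same environment variables the patch reads plus the `WORLD_SIZE` variable torchrun exports (the assert itself is illustrative, not part of the patch):

```python
import os

# Parallelism degrees exactly as the patched example script reads them.
tp_size = int(os.getenv("TP_SIZE", "1"))
pp_size = int(os.getenv("PP_SIZE", "1"))
dp_size = int(os.getenv("DP_SIZE", "2"))

# torchrun exports WORLD_SIZE (= --nproc-per-node times the node count).
world_size = int(os.environ["WORLD_SIZE"])

# With the external launcher, every launched process hosts exactly one
# worker, so the product of the parallel degrees must match the process
# count: 2 (TP) * 1 (PP) * 4 (DP) = 8 = --nproc-per-node in the CI command.
assert tp_size * pp_size * dp_size == world_size, (
    f"torchrun launched {world_size} processes, "
    f"expected {tp_size * pp_size * dp_size}"
)
```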
From 3a8f5e8b828528f98304592c58fec2a0c5f6c838 Mon Sep 17 00:00:00 2001
From: 22quinn <33176974+22quinn@users.noreply.github.com>
Date: Sun, 26 Oct 2025 22:38:05 -0700
Subject: [PATCH 2/6] organize triggers

Signed-off-by: 22quinn <33176974+22quinn@users.noreply.github.com>
---
 .buildkite/test-pipeline.yaml | 7 +++++--
 1 file changed, 5 insertions(+), 2 deletions(-)

diff --git a/.buildkite/test-pipeline.yaml b/.buildkite/test-pipeline.yaml
index 96f999cd3c17..5025e0797d2d 100644
--- a/.buildkite/test-pipeline.yaml
+++ b/.buildkite/test-pipeline.yaml
@@ -211,9 +211,12 @@ steps:
   working_dir: "/vllm-workspace/tests"
   num_gpus: 8
   source_file_dependencies:
+  - examples/offline_inference/torchrun_dp_example.py
+  - vllm/config/parallel.py
   - vllm/distributed/
-  - tests/examples/offline_inference/torchrun_dp_example.py
-  - tests/v1/distributed
+  - vllm/v1/engine/llm_engine.py
+  - vllm/v1/executor/uniproc_executor.py
+  - vllm/v1/worker/gpu_worker.py
   commands:
   # https://github.com/NVIDIA/nccl/issues/1838
   - export NCCL_CUMEM_HOST_ENABLE=0

From 2e181e7de1bce04b566690823a05e6f403657765 Mon Sep 17 00:00:00 2001
From: 22quinn <33176974+22quinn@users.noreply.github.com>
Date: Mon, 27 Oct 2025 16:19:29 -0700
Subject: [PATCH 3/6] gpu: h100

Signed-off-by: 22quinn <33176974+22quinn@users.noreply.github.com>
---
 .buildkite/test-pipeline.yaml | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/.buildkite/test-pipeline.yaml b/.buildkite/test-pipeline.yaml
index 5025e0797d2d..9b3206dcf5d6 100644
--- a/.buildkite/test-pipeline.yaml
+++ b/.buildkite/test-pipeline.yaml
@@ -207,9 +207,9 @@ steps:
 
 - label: Distributed Tests (8 GPUs) # ?min
   timeout_in_minutes: 50
-  mirror_hardwares: [amdexperimental]
-  working_dir: "/vllm-workspace/tests"
+  gpu: h100
   num_gpus: 8
+  working_dir: "/vllm-workspace/tests"
   source_file_dependencies:

From 894c0861a9e91b8a7b0080db8842f883678a80aa Mon Sep 17 00:00:00 2001
From: 22quinn <33176974+22quinn@users.noreply.github.com>
Date: Tue, 28 Oct 2025 10:38:15 -0700
Subject: [PATCH 4/6] timeout for 8-h100 test: 10 minutes

---
 .buildkite/test-pipeline.yaml | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/.buildkite/test-pipeline.yaml b/.buildkite/test-pipeline.yaml
index a40a63c79187..fd1c66bea60a 100644
--- a/.buildkite/test-pipeline.yaml
+++ b/.buildkite/test-pipeline.yaml
@@ -205,8 +205,8 @@ steps:
   - VLLM_ALLOW_INSECURE_SERIALIZATION=1 RAY_DEDUP_LOGS=0 python3 rlhf_colocate.py
   - popd
 
-- label: Distributed Tests (8 GPUs) # ?min
-  timeout_in_minutes: 50
+- label: Distributed Tests (8 GPUs) # 4min
+  timeout_in_minutes: 10
   gpu: h100
   num_gpus: 8
   working_dir: "/vllm-workspace/tests"

From 4a0afe0072cd54f88439d2669972863880452600 Mon Sep 17 00:00:00 2001
From: 22quinn <33176974+22quinn@users.noreply.github.com>
Date: Tue, 28 Oct 2025 16:48:42 -0700
Subject: [PATCH 5/6] argparse

Signed-off-by: 22quinn <33176974+22quinn@users.noreply.github.com>
---
 .../offline_inference/torchrun_dp_example.py | 84 ++++++++++++++++---
 1 file changed, 71 insertions(+), 13 deletions(-)

diff --git a/examples/offline_inference/torchrun_dp_example.py b/examples/offline_inference/torchrun_dp_example.py
index 349e5ad167e6..eb7ed969ea4b 100644
--- a/examples/offline_inference/torchrun_dp_example.py
+++ b/examples/offline_inference/torchrun_dp_example.py
@@ -9,16 +9,74 @@
 ```bash
 $ torchrun --nproc-per-node=2 examples/offline_inference/torchrun_dp_example.py
 ```
+
+With custom parallelism settings:
+```bash
+$ torchrun --nproc-per-node=8 examples/offline_inference/torchrun_dp_example.py \
+    --tp-size=2 --pp-size=1 --dp-size=4 --enable-ep
+```
 """
 
-import os
+import argparse
 
 from vllm import LLM, SamplingParams
 
-tp_size = int(os.getenv("TP_SIZE", "1"))
-pp_size = int(os.getenv("PP_SIZE", "1"))
-dp_size = int(os.getenv("DP_SIZE", "2"))
-enable_ep = bool(int(os.getenv("ENABLE_EP", "0")))
+
+def parse_args():
+    parser = argparse.ArgumentParser(
+        description="Data-parallel inference with torchrun"
+    )
+    parser.add_argument(
+        "--tp-size",
+        type=int,
+        default=1,
+        help="Tensor parallel size (default: 1)",
+    )
+    parser.add_argument(
+        "--pp-size",
+        type=int,
+        default=1,
+        help="Pipeline parallel size (default: 1)",
+    )
+    parser.add_argument(
+        "--dp-size",
+        type=int,
+        default=2,
+        help="Data parallel size (default: 2)",
+    )
+    parser.add_argument(
+        "--enable-ep",
+        action="store_true",
+        help="Enable expert parallel (default: False)",
+    )
+    parser.add_argument(
+        "--model",
+        type=str,
+        default="microsoft/Phi-mini-MoE-instruct",
+        help="Model name or path (default: microsoft/Phi-mini-MoE-instruct)",
+    )
+    parser.add_argument(
+        "--max-model-len",
+        type=int,
+        default=4096,
+        help="Maximum model length (default: 4096)",
+    )
+    parser.add_argument(
+        "--gpu-memory-utilization",
+        type=float,
+        default=0.6,
+        help="GPU memory utilization (default: 0.6)",
+    )
+    parser.add_argument(
+        "--seed",
+        type=int,
+        default=1,
+        help="Random seed (default: 1)",
+    )
+    return parser.parse_args()
+
+
+args = parse_args()
 
 
 # Create prompts, the same across all ranks
@@ -38,15 +96,15 @@
 # all ranks have the same random seed, so that sampling can be
 # deterministic across ranks.
 llm = LLM(
-    model="microsoft/Phi-mini-MoE-instruct",
-    tensor_parallel_size=tp_size,
-    data_parallel_size=dp_size,
-    pipeline_parallel_size=pp_size,
-    enable_expert_parallel=enable_ep,
+    model=args.model,
+    tensor_parallel_size=args.tp_size,
+    data_parallel_size=args.dp_size,
+    pipeline_parallel_size=args.pp_size,
+    enable_expert_parallel=args.enable_ep,
     distributed_executor_backend="external_launcher",
-    max_model_len=4096,
-    gpu_memory_utilization=0.6,
-    seed=1,
+    max_model_len=args.max_model_len,
+    gpu_memory_utilization=args.gpu_memory_utilization,
+    seed=args.seed,
 )
 
 dp_rank = llm.llm_engine.vllm_config.parallel_config.data_parallel_rank
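The hunk ends where the script reads back its data-parallel rank. The rest of the file is not shown in this diff, so the following is a purely illustrative continuation of the pattern, not the file's actual tail: each torchrun process is one DP replica, so it takes a disjoint slice of the shared prompt list, generates, and tags its output with its rank. The `data_parallel_size` attribute is read from the same `parallel_config` object as the rank; the `sampling_params` values are assumed stand-ins.

```python
# Illustrative continuation (not part of the diff): per-rank prompt sharding.
parallel_config = llm.llm_engine.vllm_config.parallel_config
dp_rank = parallel_config.data_parallel_rank
dp_size = parallel_config.data_parallel_size

# Strided shard: rank r handles prompts r, r + dp_size, r + 2 * dp_size, ...
my_prompts = prompts[dp_rank::dp_size]

# The identical seed across ranks (seed=args.seed above) keeps sampling
# deterministic; these particular sampling values are an assumption.
sampling_params = SamplingParams(temperature=0.8, top_p=0.95, max_tokens=32)

outputs = llm.generate(my_prompts, sampling_params)
for output in outputs:
    text = output.outputs[0].text
    print(f"[DP rank {dp_rank}] {output.prompt!r} -> {text!r}")
```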
From c1b2ff09456a46e258656ebc5b0b8e0162060681 Mon Sep 17 00:00:00 2001
From: 22quinn <33176974+22quinn@users.noreply.github.com>
Date: Tue, 28 Oct 2025 16:57:18 -0700
Subject: [PATCH 6/6] arg

Signed-off-by: 22quinn <33176974+22quinn@users.noreply.github.com>
---
 .buildkite/test-pipeline.yaml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/.buildkite/test-pipeline.yaml b/.buildkite/test-pipeline.yaml
index fd1c66bea60a..c6fd0be4d8cf 100644
--- a/.buildkite/test-pipeline.yaml
+++ b/.buildkite/test-pipeline.yaml
@@ -221,7 +221,7 @@ steps:
   # https://github.com/NVIDIA/nccl/issues/1838
   - export NCCL_CUMEM_HOST_ENABLE=0
   # test with torchrun tp=2 and dp=4 with ep
-  - TP_SIZE=2 DP_SIZE=4 ENABLE_EP=1 torchrun --nproc-per-node=8 ../examples/offline_inference/torchrun_dp_example.py
+  - torchrun --nproc-per-node=8 ../examples/offline_inference/torchrun_dp_example.py --tp-size=2 --pp-size=1 --dp-size=4 --enable-ep
 
 - label: EPLB Algorithm Test # 5min
   timeout_in_minutes: 15
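For reference, applying the six patches in order leaves the new pipeline step in the following net state. This is reconstructed from the hunks above, not quoted from the repository; note how the argparse conversion in patches 5 and 6 makes the CI command self-documenting compared with the original env-var form.

```yaml
- label: Distributed Tests (8 GPUs) # 4min
  timeout_in_minutes: 10
  gpu: h100
  num_gpus: 8
  working_dir: "/vllm-workspace/tests"
  source_file_dependencies:
  - examples/offline_inference/torchrun_dp_example.py
  - vllm/config/parallel.py
  - vllm/distributed/
  - vllm/v1/engine/llm_engine.py
  - vllm/v1/executor/uniproc_executor.py
  - vllm/v1/worker/gpu_worker.py
  commands:
  # https://github.com/NVIDIA/nccl/issues/1838
  - export NCCL_CUMEM_HOST_ENABLE=0
  # test with torchrun tp=2 and dp=4 with ep
  - torchrun --nproc-per-node=8 ../examples/offline_inference/torchrun_dp_example.py --tp-size=2 --pp-size=1 --dp-size=4 --enable-ep
```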