From d6677c9f0a78c3dd2982cad779fd3f64eec9d911 Mon Sep 17 00:00:00 2001
From: 22quinn <33176974+22quinn@users.noreply.github.com>
Date: Sun, 26 Oct 2025 22:30:03 -0700
Subject: [PATCH 1/6] [CI/Build] Test torchrun with 8 cards

Signed-off-by: 22quinn <33176974+22quinn@users.noreply.github.com>
---
 .buildkite/test-pipeline.yaml                | 19 +++++++++++++++++--
 .../offline_inference/torchrun_dp_example.py | 16 ++++++++++++----
 2 files changed, 29 insertions(+), 6 deletions(-)

diff --git a/.buildkite/test-pipeline.yaml b/.buildkite/test-pipeline.yaml
index 3f280c837be9..96f999cd3c17 100644
--- a/.buildkite/test-pipeline.yaml
+++ b/.buildkite/test-pipeline.yaml
@@ -205,6 +205,21 @@ steps:
   - VLLM_ALLOW_INSECURE_SERIALIZATION=1 RAY_DEDUP_LOGS=0 python3 rlhf_colocate.py
   - popd
 
+- label: Distributed Tests (8 GPUs) # ?min
+  timeout_in_minutes: 50
+  mirror_hardwares: [amdexperimental]
+  working_dir: "/vllm-workspace/tests"
+  num_gpus: 8
+  source_file_dependencies:
+  - vllm/distributed/
+  - tests/examples/offline_inference/torchrun_dp_example.py
+  - tests/v1/distributed
+  commands:
+  # https://github.com/NVIDIA/nccl/issues/1838
+  - export NCCL_CUMEM_HOST_ENABLE=0
+  # test with torchrun tp=2 and dp=4 with ep
+  - TP_SIZE=2 DP_SIZE=4 ENABLE_EP=1 torchrun --nproc-per-node=8 ../examples/offline_inference/torchrun_dp_example.py
+
 - label: EPLB Algorithm Test # 5min
   timeout_in_minutes: 15
   working_dir: "/vllm-workspace/tests"
@@ -392,7 +407,7 @@
       --ignore=lora/test_deepseekv2_tp.py \
       --ignore=lora/test_gptoss.py \
       --ignore=lora/test_qwen3moe_tp.py
-  
+
   parallelism: 4
 
 - label: PyTorch Compilation Unit Tests # 15min
@@ -1101,7 +1116,7 @@
   - tests/weight_loading
   commands:
   - bash weight_loading/run_model_weight_loading_test.sh -c weight_loading/models-large.txt
-  
+
 - label: NixlConnector PD accuracy tests (Distributed) # 30min
   timeout_in_minutes: 30
   working_dir: "/vllm-workspace/tests"
diff --git a/examples/offline_inference/torchrun_dp_example.py b/examples/offline_inference/torchrun_dp_example.py
index 295d1637528c..349e5ad167e6 100644
--- a/examples/offline_inference/torchrun_dp_example.py
+++ b/examples/offline_inference/torchrun_dp_example.py
@@ -11,8 +11,16 @@
 ```
 """
 
+import os
+
 from vllm import LLM, SamplingParams
 
+tp_size = int(os.getenv("TP_SIZE", "1"))
+pp_size = int(os.getenv("PP_SIZE", "1"))
+dp_size = int(os.getenv("DP_SIZE", "2"))
+enable_ep = bool(int(os.getenv("ENABLE_EP", "0")))
+
+
 # Create prompts, the same across all ranks
 prompts = [
     "Hello, my name is",
@@ -31,10 +39,10 @@
 # deterministic across ranks.
 llm = LLM(
     model="microsoft/Phi-mini-MoE-instruct",
-    tensor_parallel_size=1,
-    data_parallel_size=2,
-    pipeline_parallel_size=1,
-    enable_expert_parallel=False,
+    tensor_parallel_size=tp_size,
+    data_parallel_size=dp_size,
+    pipeline_parallel_size=pp_size,
+    enable_expert_parallel=enable_ep,
     distributed_executor_backend="external_launcher",
     max_model_len=4096,
     gpu_memory_utilization=0.6,
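Reviewer note on the launch arithmetic behind the new CI command: with `distributed_executor_backend="external_launcher"`, vLLM does not spawn worker processes itself, so torchrun must start exactly tp * pp * dp processes — which is why `TP_SIZE=2 DP_SIZE=4` is paired with `--nproc-per-node=8`. A minimal sketch of that invariant, assuming the same environment variables the patch reads plus the `WORLD_SIZE` variable torchrun exports (the assert itself is illustrative, not part of the patch):

```python
import os

# Parallelism degrees exactly as the patched example script reads them.
tp_size = int(os.getenv("TP_SIZE", "1"))
pp_size = int(os.getenv("PP_SIZE", "1"))
dp_size = int(os.getenv("DP_SIZE", "2"))

# torchrun exports WORLD_SIZE (= --nproc-per-node times the node count).
world_size = int(os.environ["WORLD_SIZE"])

# With the external launcher, every launched process hosts exactly one
# worker, so the product of the parallel degrees must match the process
# count: 2 (TP) * 1 (PP) * 4 (DP) = 8 = --nproc-per-node in the CI command.
assert tp_size * pp_size * dp_size == world_size, (
    f"torchrun launched {world_size} processes, "
    f"expected {tp_size * pp_size * dp_size}"
)
```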
From 3a8f5e8b828528f98304592c58fec2a0c5f6c838 Mon Sep 17 00:00:00 2001
From: 22quinn <33176974+22quinn@users.noreply.github.com>
Date: Sun, 26 Oct 2025 22:38:05 -0700
Subject: [PATCH 2/6] organize triggers

Signed-off-by: 22quinn <33176974+22quinn@users.noreply.github.com>
---
 .buildkite/test-pipeline.yaml | 7 +++++--
 1 file changed, 5 insertions(+), 2 deletions(-)

diff --git a/.buildkite/test-pipeline.yaml b/.buildkite/test-pipeline.yaml
index 96f999cd3c17..5025e0797d2d 100644
--- a/.buildkite/test-pipeline.yaml
+++ b/.buildkite/test-pipeline.yaml
@@ -211,9 +211,12 @@ steps:
   working_dir: "/vllm-workspace/tests"
   num_gpus: 8
   source_file_dependencies:
+  - examples/offline_inference/torchrun_dp_example.py
+  - vllm/config/parallel.py
   - vllm/distributed/
-  - tests/examples/offline_inference/torchrun_dp_example.py
-  - tests/v1/distributed
+  - vllm/v1/engine/llm_engine.py
+  - vllm/v1/executor/uniproc_executor.py
+  - vllm/v1/worker/gpu_worker.py
   commands:
   # https://github.com/NVIDIA/nccl/issues/1838
   - export NCCL_CUMEM_HOST_ENABLE=0

From 2e181e7de1bce04b566690823a05e6f403657765 Mon Sep 17 00:00:00 2001
From: 22quinn <33176974+22quinn@users.noreply.github.com>
Date: Mon, 27 Oct 2025 16:19:29 -0700
Subject: [PATCH 3/6] gpu: h100

Signed-off-by: 22quinn <33176974+22quinn@users.noreply.github.com>
---
 .buildkite/test-pipeline.yaml | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/.buildkite/test-pipeline.yaml b/.buildkite/test-pipeline.yaml
index 5025e0797d2d..9b3206dcf5d6 100644
--- a/.buildkite/test-pipeline.yaml
+++ b/.buildkite/test-pipeline.yaml
@@ -207,9 +207,9 @@ steps:
 
 - label: Distributed Tests (8 GPUs) # ?min
   timeout_in_minutes: 50
-  mirror_hardwares: [amdexperimental]
-  working_dir: "/vllm-workspace/tests"
+  gpu: h100
   num_gpus: 8
+  working_dir: "/vllm-workspace/tests"
   source_file_dependencies:

From 894c0861a9e91b8a7b0080db8842f883678a80aa Mon Sep 17 00:00:00 2001
From: 22quinn <33176974+22quinn@users.noreply.github.com>
Date: Tue, 28 Oct 2025 10:38:15 -0700
Subject: [PATCH 4/6] timeout for 8-h100 test: 10 minutes

---
 .buildkite/test-pipeline.yaml | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/.buildkite/test-pipeline.yaml b/.buildkite/test-pipeline.yaml
index a40a63c79187..fd1c66bea60a 100644
--- a/.buildkite/test-pipeline.yaml
+++ b/.buildkite/test-pipeline.yaml
@@ -205,8 +205,8 @@ steps:
   - VLLM_ALLOW_INSECURE_SERIALIZATION=1 RAY_DEDUP_LOGS=0 python3 rlhf_colocate.py
   - popd
 
-- label: Distributed Tests (8 GPUs) # ?min
-  timeout_in_minutes: 50
+- label: Distributed Tests (8 GPUs) # 4min
+  timeout_in_minutes: 10
   gpu: h100
   num_gpus: 8
   working_dir: "/vllm-workspace/tests"

From 4a0afe0072cd54f88439d2669972863880452600 Mon Sep 17 00:00:00 2001
From: 22quinn <33176974+22quinn@users.noreply.github.com>
Date: Tue, 28 Oct 2025 16:48:42 -0700
Subject: [PATCH 5/6] argparse

Signed-off-by: 22quinn <33176974+22quinn@users.noreply.github.com>
---
 .../offline_inference/torchrun_dp_example.py | 84 ++++++++++++++++---
 1 file changed, 71 insertions(+), 13 deletions(-)

diff --git a/examples/offline_inference/torchrun_dp_example.py b/examples/offline_inference/torchrun_dp_example.py
index 349e5ad167e6..eb7ed969ea4b 100644
--- a/examples/offline_inference/torchrun_dp_example.py
+++ b/examples/offline_inference/torchrun_dp_example.py
@@ -9,16 +9,74 @@
 ```bash
 $ torchrun --nproc-per-node=2 examples/offline_inference/torchrun_dp_example.py
 ```
+
+With custom parallelism settings:
+```bash
+$ torchrun --nproc-per-node=8 examples/offline_inference/torchrun_dp_example.py \
+    --tp-size=2 --pp-size=1 --dp-size=4 --enable-ep
+```
 """
 
-import os
+import argparse
 
 from vllm import LLM, SamplingParams
 
-tp_size = int(os.getenv("TP_SIZE", "1"))
-pp_size = int(os.getenv("PP_SIZE", "1"))
-dp_size = int(os.getenv("DP_SIZE", "2"))
-enable_ep = bool(int(os.getenv("ENABLE_EP", "0")))
+
+def parse_args():
+    parser = argparse.ArgumentParser(
+        description="Data-parallel inference with torchrun"
+    )
+    parser.add_argument(
+        "--tp-size",
+        type=int,
+        default=1,
+        help="Tensor parallel size (default: 1)",
+    )
+    parser.add_argument(
+        "--pp-size",
+        type=int,
+        default=1,
+        help="Pipeline parallel size (default: 1)",
+    )
+    parser.add_argument(
+        "--dp-size",
+        type=int,
+        default=2,
+        help="Data parallel size (default: 2)",
+    )
+    parser.add_argument(
+        "--enable-ep",
+        action="store_true",
+        help="Enable expert parallel (default: False)",
+    )
+    parser.add_argument(
+        "--model",
+        type=str,
+        default="microsoft/Phi-mini-MoE-instruct",
+        help="Model name or path (default: microsoft/Phi-mini-MoE-instruct)",
+    )
+    parser.add_argument(
+        "--max-model-len",
+        type=int,
+        default=4096,
+        help="Maximum model length (default: 4096)",
+    )
+    parser.add_argument(
+        "--gpu-memory-utilization",
+        type=float,
+        default=0.6,
+        help="GPU memory utilization (default: 0.6)",
+    )
+    parser.add_argument(
+        "--seed",
+        type=int,
+        default=1,
+        help="Random seed (default: 1)",
+    )
+    return parser.parse_args()
+
+
+args = parse_args()
 
 
 # Create prompts, the same across all ranks
@@ -38,15 +96,15 @@
 # all ranks have the same random seed, so that sampling can be
 # deterministic across ranks.
 llm = LLM(
-    model="microsoft/Phi-mini-MoE-instruct",
-    tensor_parallel_size=tp_size,
-    data_parallel_size=dp_size,
-    pipeline_parallel_size=pp_size,
-    enable_expert_parallel=enable_ep,
+    model=args.model,
+    tensor_parallel_size=args.tp_size,
+    data_parallel_size=args.dp_size,
+    pipeline_parallel_size=args.pp_size,
+    enable_expert_parallel=args.enable_ep,
     distributed_executor_backend="external_launcher",
-    max_model_len=4096,
-    gpu_memory_utilization=0.6,
-    seed=1,
+    max_model_len=args.max_model_len,
+    gpu_memory_utilization=args.gpu_memory_utilization,
+    seed=args.seed,
 )
 
 dp_rank = llm.llm_engine.vllm_config.parallel_config.data_parallel_rank
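The hunk ends where the script reads back its data-parallel rank. The rest of the file is not shown in this diff, so the following is a purely illustrative continuation of the pattern, not the file's actual tail: each torchrun process is one DP replica, so it takes a disjoint slice of the shared prompt list, generates, and tags its output with its rank. The `data_parallel_size` attribute is read from the same `parallel_config` object as the rank; the `sampling_params` values are assumed stand-ins.

```python
# Illustrative continuation (not part of the diff): per-rank prompt sharding.
parallel_config = llm.llm_engine.vllm_config.parallel_config
dp_rank = parallel_config.data_parallel_rank
dp_size = parallel_config.data_parallel_size

# Strided shard: rank r handles prompts r, r + dp_size, r + 2 * dp_size, ...
my_prompts = prompts[dp_rank::dp_size]

# The identical seed across ranks (seed=args.seed above) keeps sampling
# deterministic; these particular sampling values are an assumption.
sampling_params = SamplingParams(temperature=0.8, top_p=0.95, max_tokens=32)

outputs = llm.generate(my_prompts, sampling_params)
for output in outputs:
    text = output.outputs[0].text
    print(f"[DP rank {dp_rank}] {output.prompt!r} -> {text!r}")
```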
From c1b2ff09456a46e258656ebc5b0b8e0162060681 Mon Sep 17 00:00:00 2001
From: 22quinn <33176974+22quinn@users.noreply.github.com>
Date: Tue, 28 Oct 2025 16:57:18 -0700
Subject: [PATCH 6/6] arg

Signed-off-by: 22quinn <33176974+22quinn@users.noreply.github.com>
---
 .buildkite/test-pipeline.yaml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/.buildkite/test-pipeline.yaml b/.buildkite/test-pipeline.yaml
index fd1c66bea60a..c6fd0be4d8cf 100644
--- a/.buildkite/test-pipeline.yaml
+++ b/.buildkite/test-pipeline.yaml
@@ -221,7 +221,7 @@ steps:
   # https://github.com/NVIDIA/nccl/issues/1838
   - export NCCL_CUMEM_HOST_ENABLE=0
   # test with torchrun tp=2 and dp=4 with ep
-  - TP_SIZE=2 DP_SIZE=4 ENABLE_EP=1 torchrun --nproc-per-node=8 ../examples/offline_inference/torchrun_dp_example.py
+  - torchrun --nproc-per-node=8 ../examples/offline_inference/torchrun_dp_example.py --tp-size=2 --pp-size=1 --dp-size=4 --enable-ep
 
 - label: EPLB Algorithm Test # 5min
   timeout_in_minutes: 15
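For reference, applying the six patches in order leaves the new pipeline step in the following net state. This is reconstructed from the hunks above, not quoted from the repository; note how the argparse conversion in patches 5 and 6 makes the CI command self-documenting compared with the original env-var form.

```yaml
- label: Distributed Tests (8 GPUs) # 4min
  timeout_in_minutes: 10
  gpu: h100
  num_gpus: 8
  working_dir: "/vllm-workspace/tests"
  source_file_dependencies:
  - examples/offline_inference/torchrun_dp_example.py
  - vllm/config/parallel.py
  - vllm/distributed/
  - vllm/v1/engine/llm_engine.py
  - vllm/v1/executor/uniproc_executor.py
  - vllm/v1/worker/gpu_worker.py
  commands:
  # https://github.com/NVIDIA/nccl/issues/1838
  - export NCCL_CUMEM_HOST_ENABLE=0
  # test with torchrun tp=2 and dp=4 with ep
  - torchrun --nproc-per-node=8 ../examples/offline_inference/torchrun_dp_example.py --tp-size=2 --pp-size=1 --dp-size=4 --enable-ep
```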