@@ -164,11 +164,20 @@ steps:
164164 - tests/v1/test_internal_lb_dp.py
165165 - tests/v1/test_hybrid_lb_dp.py
166166 - tests/v1/engine/test_engine_core_client.py
167+ - tests/distributed/test_symm_mem_allreduce.py
167168 commands :
168- # test with tp=2 and external_dp=2
169+ # test with torchrun tp=2 and external_dp=2
169170 - torchrun --nproc-per-node=4 distributed/test_torchrun_example.py
170- # test with tp=2 and pp=2
171+ # test with torchrun tp=2 and pp=2
171172 - PP_SIZE=2 torchrun --nproc-per-node=4 distributed/test_torchrun_example.py
173+ # test with torchrun tp=4 and dp=1
174+ - TP_SIZE=4 torchrun --nproc-per-node=4 distributed/test_torchrun_example_moe.py
175+ # test with torchrun tp=2, pp=2 and dp=1
176+ - PP_SIZE=2 TP_SIZE=2 torchrun --nproc-per-node=4 distributed/test_torchrun_example_moe.py
177+ # test with torchrun tp=1 and dp=4 with ep
178+ - DP_SIZE=4 ENABLE_EP=1 torchrun --nproc-per-node=4 distributed/test_torchrun_example_moe.py
179+ # test with torchrun tp=2 and dp=2 with ep
180+ - TP_SIZE=2 DP_SIZE=2 ENABLE_EP=1 torchrun --nproc-per-node=4 distributed/test_torchrun_example_moe.py
172181 # test with internal dp
173182 - python3 ../examples/offline_inference/data_parallel.py --enforce-eager
174183 - TP_SIZE=2 DP_SIZE=2 pytest -v -s v1/test_async_llm_dp.py
@@ -180,6 +189,7 @@ steps:
180189 - pytest -v -s compile/test_basic_correctness.py
181190 - pytest -v -s distributed/test_pynccl.py
182191 - pytest -v -s distributed/test_events.py
192+ - pytest -v -s distributed/test_symm_mem_allreduce.py
183193 # TODO: create a dedicated test section for multi-GPU example tests
184194 # when we have multiple distributed example tests
185195 - pushd ../examples/offline_inference
@@ -321,6 +331,8 @@ steps:
321331 - python3 offline_inference/basic/classify.py
322332 - python3 offline_inference/basic/embed.py
323333 - python3 offline_inference/basic/score.py
334+ - python3 offline_inference/spec_decode.py --test --method eagle --num_spec_tokens 3 --dataset-name hf --dataset-path philschmid/mt-bench --num-prompts 80 --temp 0 --top-p 1.0 --top-k -1 --tp 1 --enable-chunked-prefill --max-model-len 2048
335+ - python3 offline_inference/spec_decode.py --test --method eagle3 --num_spec_tokens 3 --dataset-name hf --dataset-path philschmid/mt-bench --num-prompts 80 --temp 0 --top-p 1.0 --top-k -1 --tp 1 --enable-chunked-prefill --max-model-len 2048
324336
325337- label : Platform Tests (CUDA) # 4min
326338 timeout_in_minutes : 15
@@ -875,6 +887,8 @@ steps:
875887 - tests/v1/test_external_lb_dp.py
876888 - tests/v1/entrypoints/openai/test_multi_api_servers.py
877889 - vllm/v1/engine/
890+ - vllm/v1/worker/
891+ - tests/v1/worker/test_worker_memory_snapshot.py
878892 commands :
879893 - TP_SIZE=1 DP_SIZE=2 pytest -v -s v1/test_async_llm_dp.py
880894 - TP_SIZE=1 DP_SIZE=2 pytest -v -s v1/test_external_lb_dp.py
@@ -896,6 +910,7 @@ steps:
896910 - CUDA_VISIBLE_DEVICES=0,1 pytest -v -s test_sharded_state_loader.py
897911 - CUDA_VISIBLE_DEVICES=0,1 pytest -v -s v1/shutdown
898912 - pytest -v -s models/multimodal/generation/test_maverick.py
913+ - pytest -v -s v1/worker/test_worker_memory_snapshot.py
899914
900915- label : Plugin Tests (2 GPUs) # 40min
901916 timeout_in_minutes : 60
@@ -1029,3 +1044,16 @@ steps:
10291044 num_gpus : 2
10301045 commands :
10311046 - pytest -v -s tests/distributed/test_context_parallel.py
1047+ - pytest -v -s tests/distributed/test_nccl_symm_mem_allreduce.py
1048+
1049+ ##### RL Integration Tests #####
1050+ - label : Prime-RL Integration Test # 15min
1051+ timeout_in_minutes : 30
1052+ optional : true
1053+ num_gpus : 2
1054+ working_dir : "/vllm-workspace"
1055+ source_file_dependencies :
1056+ - vllm/
1057+ - .buildkite/scripts/run-prime-rl-test.sh
1058+ commands :
1059+ - bash .buildkite/scripts/run-prime-rl-test.sh
0 commit comments