Skip to content

Commit f6b3bcb

Browse files
authored
Merge branch 'main' into v1_spec_decode_logprobs
2 parents fd2190a + 1c16084 commit f6b3bcb

File tree

217 files changed

+9508
-4257
lines changed

Some content is hidden

Large Commits have some content hidden by default. Use the searchbox below for content that may be hidden.

217 files changed

+9508
-4257
lines changed

.buildkite/release-pipeline.yaml

Lines changed: 12 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,5 @@
11
steps:
2-
# aarch64 + CUDA builds. PyTorch 2.8 aarch64 + CUDA wheel is only available on CUDA 12.9
2+
# aarch64 + CUDA builds
33
- label: "Build arm64 wheel - CUDA 12.9"
44
depends_on: ~
55
id: build-wheel-arm64-cuda-12-9
@@ -15,20 +15,21 @@ steps:
1515
env:
1616
DOCKER_BUILDKIT: "1"
1717

18-
# aarch64 build.
18+
# aarch64 build
1919
- label: "Build arm64 CPU wheel"
2020
depends_on: ~
2121
id: build-wheel-arm64-cpu
2222
agents:
2323
queue: arm64_cpu_queue_postmerge
2424
commands:
25-
- "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg USE_SCCACHE=1 --build-arg GIT_REPO_CHECK=1 --tag vllm-ci:build-image --target build --progress plain -f docker/Dockerfile.cpu ."
25+
- "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg USE_SCCACHE=1 --build-arg GIT_REPO_CHECK=1 --build-arg VLLM_BUILD_ACL=ON --tag vllm-ci:build-image --target build --progress plain -f docker/Dockerfile.cpu ."
2626
- "mkdir artifacts"
2727
- "docker run --rm -v $(pwd)/artifacts:/artifacts_host vllm-ci:build-image bash -c 'cp -r dist /artifacts_host && chmod -R a+rw /artifacts_host'"
2828
- "bash .buildkite/scripts/upload-wheels.sh"
2929
env:
3030
DOCKER_BUILDKIT: "1"
3131

32+
# x86 + CUDA builds
3233
- label: "Build wheel - CUDA 12.8"
3334
depends_on: ~
3435
id: build-wheel-cuda-12-8
@@ -42,47 +43,46 @@ steps:
4243
env:
4344
DOCKER_BUILDKIT: "1"
4445

45-
- label: "Build wheel - CUDA 12.6"
46+
- label: "Build wheel - CUDA 12.9"
4647
depends_on: ~
47-
id: build-wheel-cuda-12-6
48+
id: build-wheel-cuda-12-9
4849
agents:
4950
queue: cpu_queue_postmerge
5051
commands:
51-
- "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg USE_SCCACHE=1 --build-arg GIT_REPO_CHECK=1 --build-arg CUDA_VERSION=12.6.3 --build-arg torch_cuda_arch_list='7.0 7.5 8.0 8.9 9.0+PTX' --tag vllm-ci:build-image --target build --progress plain -f docker/Dockerfile ."
52+
- "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg USE_SCCACHE=1 --build-arg GIT_REPO_CHECK=1 --build-arg CUDA_VERSION=12.9.1 --tag vllm-ci:build-image --target build --progress plain -f docker/Dockerfile ."
5253
- "mkdir artifacts"
5354
- "docker run --rm -v $(pwd)/artifacts:/artifacts_host vllm-ci:build-image bash -c 'cp -r dist /artifacts_host && chmod -R a+rw /artifacts_host'"
5455
- "bash .buildkite/scripts/upload-wheels.sh"
5556
env:
5657
DOCKER_BUILDKIT: "1"
5758

58-
# x86 + CUDA builds
59-
- label: "Build wheel - CUDA 12.9"
59+
- label: "Build wheel - CUDA 13.0"
6060
depends_on: ~
61-
id: build-wheel-cuda-12-9
61+
id: build-wheel-cuda-13-0
6262
agents:
6363
queue: cpu_queue_postmerge
6464
commands:
65-
- "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg USE_SCCACHE=1 --build-arg GIT_REPO_CHECK=1 --build-arg CUDA_VERSION=12.9.1 --tag vllm-ci:build-image --target build --progress plain -f docker/Dockerfile ."
65+
- "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg USE_SCCACHE=1 --build-arg GIT_REPO_CHECK=1 --build-arg CUDA_VERSION=13.0.1 --build-arg BUILD_BASE_IMAGE=nvidia/cuda:13.0.1-devel-ubuntu22.04 --tag vllm-ci:build-image --target build --progress plain -f docker/Dockerfile ."
6666
- "mkdir artifacts"
6767
- "docker run --rm -v $(pwd)/artifacts:/artifacts_host vllm-ci:build-image bash -c 'cp -r dist /artifacts_host && chmod -R a+rw /artifacts_host'"
6868
- "bash .buildkite/scripts/upload-wheels.sh"
6969
env:
7070
DOCKER_BUILDKIT: "1"
7171

72+
# Build release images (CUDA 12.9)
7273
- label: "Build release image (x86)"
7374
depends_on: ~
7475
id: build-release-image-x86
7576
agents:
7677
queue: cpu_queue_postmerge
7778
commands:
7879
- "aws ecr-public get-login-password --region us-east-1 | docker login --username AWS --password-stdin public.ecr.aws/q9t5s3a7"
79-
- "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg USE_SCCACHE=1 --build-arg GIT_REPO_CHECK=1 --build-arg CUDA_VERSION=12.8.1 --build-arg FLASHINFER_AOT_COMPILE=true --build-arg INSTALL_KV_CONNECTORS=true --tag public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT-$(uname -m) --target vllm-openai --progress plain -f docker/Dockerfile ."
80+
- "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg USE_SCCACHE=1 --build-arg GIT_REPO_CHECK=1 --build-arg CUDA_VERSION=12.9.1 --build-arg FLASHINFER_AOT_COMPILE=true --build-arg INSTALL_KV_CONNECTORS=true --tag public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT-$(uname -m) --target vllm-openai --progress plain -f docker/Dockerfile ."
8081
- "docker push public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT-$(uname -m)"
8182
# re-tag to default image tag and push, just in case arm64 build fails
8283
- "docker tag public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT-$(uname -m) public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT"
8384
- "docker push public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT"
8485

85-
# PyTorch 2.8 aarch64 + CUDA wheel is only available on CUDA 12.9
8686
- label: "Build release image (arm64)"
8787
depends_on: ~
8888
id: build-release-image-arm64

.buildkite/scripts/upload-wheels.sh

Lines changed: 6 additions & 14 deletions
Original file line numberDiff line numberDiff line change
@@ -58,33 +58,25 @@ python3 .buildkite/generate_index.py --wheel "$normal_wheel"
5858
aws s3 cp "$wheel" "s3://vllm-wheels/$BUILDKITE_COMMIT/"
5959
aws s3 cp "$normal_wheel" "s3://vllm-wheels/$BUILDKITE_COMMIT/"
6060

61-
if [[ $normal_wheel == *"cu126"* ]]; then
62-
# if $normal_wheel matches cu126, do not upload the index.html
63-
echo "Skipping index files for cu126 wheels"
64-
elif [[ $normal_wheel == *"cu128"* ]]; then
65-
# if $normal_wheel matches cu128, do not upload the index.html
66-
echo "Skipping index files for cu128 wheels"
67-
else
61+
if [[ $normal_wheel == *"cu129"* ]]; then
6862
# only upload index.html for cu129 wheels (the default wheels), as they
6963
# are available on both x86 and arm64
7064
aws s3 cp index.html "s3://vllm-wheels/$BUILDKITE_COMMIT/vllm/index.html"
7165
aws s3 cp "s3://vllm-wheels/nightly/index.html" "s3://vllm-wheels/$BUILDKITE_COMMIT/index.html"
66+
else
67+
echo "Skipping index files for non-cu129 wheels"
7268
fi
7369

7470
# generate index for nightly
7571
aws s3 cp "$wheel" "s3://vllm-wheels/nightly/"
7672
aws s3 cp "$normal_wheel" "s3://vllm-wheels/nightly/"
7773

78-
if [[ $normal_wheel == *"cu126"* ]]; then
79-
# if $normal_wheel matches cu126, do not upload the index.html
80-
echo "Skipping index files for cu126 wheels"
81-
elif [[ $normal_wheel == *"cu128"* ]]; then
82-
# if $normal_wheel matches cu128, do not upload the index.html
83-
echo "Skipping index files for cu128 wheels"
84-
else
74+
if [[ $normal_wheel == *"cu129"* ]]; then
8575
# only upload index.html for cu129 wheels (the default wheels), as they
8676
# are available on both x86 and arm64
8777
aws s3 cp index.html "s3://vllm-wheels/nightly/vllm/index.html"
78+
else
79+
echo "Skipping index files for non-cu129 wheels"
8880
fi
8981

9082
aws s3 cp "$wheel" "s3://vllm-wheels/$version/"

.buildkite/test-amd.yaml

Lines changed: 66 additions & 14 deletions
Original file line numberDiff line numberDiff line change
@@ -454,8 +454,8 @@ steps:
454454
- pytest -v -s compile/test_fusion_attn.py
455455
- pytest -v -s compile/test_functionalization.py
456456
- pytest -v -s compile/test_silu_mul_quant_fusion.py
457-
- pytest -v -s compile/test_sequence_parallelism.py
458-
- pytest -v -s compile/test_async_tp.py
457+
# - pytest -v -s compile/test_sequence_parallelism.py
458+
# - pytest -v -s compile/test_async_tp.py
459459
- pytest -v -s compile/test_fusion_all_reduce.py
460460
- pytest -v -s compile/test_decorator.py
461461
- pytest -v -s compile/test_noop_elimination.py
@@ -474,8 +474,8 @@ steps:
474474
- pytest -v -s compile/test_basic_correctness.py
475475
- pytest -v -s compile/piecewise/
476476

477-
- label: PyTorch Fullgraph Test # 20min
478-
timeout_in_minutes: 30
477+
- label: PyTorch Fullgraph Test # 22min
478+
timeout_in_minutes: 35
479479
mirror_hardwares: [amdexperimental, amdproduction]
480480
agent_pool: mi325_1
481481
# grade: Blocking
@@ -485,6 +485,7 @@ steps:
485485
- tests/compile
486486
commands:
487487
- pytest -v -s compile/test_full_graph.py
488+
- pytest -v -s compile/test_fusions_e2e.py
488489

489490
- label: Kernels Core Operation Test # 48min
490491
timeout_in_minutes: 75
@@ -494,6 +495,7 @@ steps:
494495
source_file_dependencies:
495496
- csrc/
496497
- tests/kernels/core
498+
- tests/kernels/test_top_k_per_row.py
497499
commands:
498500
- pytest -v -s kernels/core kernels/test_top_k_per_row.py
499501

@@ -606,7 +608,7 @@ steps:
606608
# we can only upgrade after this is resolved
607609
# TODO(jerryzh168): resolve the above comment
608610
- uv pip install --system torchao==0.13.0
609-
- VLLM_TEST_FORCE_LOAD_FORMAT=auto pytest -v -s quantization/
611+
- VLLM_TEST_FORCE_LOAD_FORMAT=auto pytest -v -s quantization/ --ignore quantization/test_blackwell_moe.py
610612

611613
- label: LM Eval Small Models # 53min
612614
timeout_in_minutes: 75
@@ -848,6 +850,18 @@ steps:
848850
- pytest -v -s models/multimodal -m core_model --ignore models/multimodal/generation/test_whisper.py --ignore models/multimodal/processing
849851
- cd .. && VLLM_WORKER_MULTIPROC_METHOD=spawn pytest -v -s tests/models/multimodal/generation/test_whisper.py -m core_model # Otherwise, mp_method="spawn" doesn't work
850852

853+
- label: Multi-Modal Accuracy Eval (Small Models) # 50min
854+
mirror_hardwares: [amdexperimental]
855+
agent_pool: mi325_1
856+
timeout_in_minutes: 70
857+
working_dir: "/vllm-workspace/.buildkite/lm-eval-harness"
858+
source_file_dependencies:
859+
- vllm/multimodal/
860+
- vllm/inputs/
861+
- vllm/v1/core/
862+
commands:
863+
- pytest -s -v test_lm_eval_correctness.py --config-list-file=configs/models-mm-small.txt --tp-size=1
864+
851865
- label: Multi-Modal Models Test (Extended) 1
852866
mirror_hardwares: [amdexperimental]
853867
agent_pool: mi325_1
@@ -923,8 +937,8 @@ steps:
923937
# Whisper needs spawn method to avoid deadlock
924938
- VLLM_WORKER_MULTIPROC_METHOD=spawn python3 examples/offline_inference/audio_language.py --model-type whisper
925939

926-
- label: Blackwell Test # 38 min
927-
timeout_in_minutes: 60
940+
- label: Blackwell Test # 21 min
941+
timeout_in_minutes: 30
928942
working_dir: "/vllm-workspace/"
929943
gpu: b200
930944
# optional: true
@@ -937,8 +951,6 @@ steps:
937951
- vllm/model_executor/layers/fused_moe/flashinfer_cutlass_prepare_finalize.py
938952
- vllm/model_executor/layers/quantization/utils/flashinfer_utils.py
939953
- vllm/v1/attention/backends/flashinfer.py
940-
- vllm/compilation/fusion.py
941-
- vllm/compilation/fusion_attn.py
942954
commands:
943955
- nvidia-smi
944956
- python3 examples/offline_inference/basic/chat.py
@@ -955,13 +967,32 @@ steps:
955967
- pytest -v -s tests/kernels/quantization/test_nvfp4_scaled_mm.py
956968
- pytest -v -s tests/kernels/quantization/test_flashinfer_scaled_mm.py
957969
- pytest -v -s tests/kernels/quantization/test_flashinfer_nvfp4_scaled_mm.py
970+
- pytest -v -s tests/kernels/quantization/test_nvfp4_qutlass.py
971+
- pytest -v -s tests/kernels/quantization/test_mxfp4_qutlass.py
958972
- pytest -v -s tests/kernels/moe/test_nvfp4_moe.py
959973
- pytest -v -s tests/kernels/moe/test_ocp_mx_moe.py
960-
# Fusion
961-
- pytest -v -s tests/compile/test_fusion_all_reduce.py
962-
- pytest -v -s tests/compile/test_fusion_attn.py::test_attention_quant_pattern
963974
- pytest -v -s tests/kernels/moe/test_flashinfer.py
975+
976+
- label: Blackwell Fusion Tests # 30 min
977+
timeout_in_minutes: 40
978+
working_dir: "/vllm-workspace/"
979+
gpu: b200
980+
source_file_dependencies:
981+
- csrc/quantization/fp4/
982+
- vllm/model_executor/layers/quantization/utils/flashinfer_utils.py
983+
- vllm/v1/attention/backends/flashinfer.py
984+
- vllm/compilation/
985+
# changes to the files below can affect fusion pattern matching
986+
- vllm/model_executor/layers/layernorm.py
987+
- vllm/model_executor/layers/activation.py
988+
- vllm/model_executor/layers/quantization/input_quant_fp8.py
989+
commands:
990+
- nvidia-smi
991+
- pytest -v -s tests/compile/test_fusion_attn.py
964992
- pytest -v -s tests/compile/test_silu_mul_quant_fusion.py
993+
# this runner has 2 GPUs available even though num_gpus=2 is not set
994+
- pytest -v -s tests/compile/test_fusion_all_reduce.py
995+
- pytest -v -s tests/compile/test_fusions_e2e.py
965996

966997
- label: Blackwell GPT-OSS Eval
967998
timeout_in_minutes: 60
@@ -1081,6 +1112,7 @@ steps:
10811112
- pytest -v -s ./compile/test_basic_correctness.py
10821113
- pytest -v -s ./compile/test_wrapper.py
10831114
- VLLM_TEST_SAME_HOST=1 torchrun --nproc-per-node=4 distributed/test_same_node.py | grep 'Same node test passed'
1115+
- VLLM_TEST_SAME_HOST=1 VLLM_TEST_WITH_DEFAULT_DEVICE_SET=1 torchrun --nproc-per-node=4 distributed/test_same_node.py | grep 'Same node test passed'
10841116
- pytest -v -s distributed/test_sequence_parallel.py
10851117
- CUDA_VISIBLE_DEVICES=0,1 pytest -v -s v1/shutdown
10861118
- pytest -v -s v1/worker/test_worker_memory_snapshot.py
@@ -1128,6 +1160,11 @@ steps:
11281160
- pytest -v -s plugins_tests/test_io_processor_plugins.py
11291161
- pip uninstall prithvi_io_processor_plugin -y
11301162
# end io_processor plugins test
1163+
# begin stat_logger plugins test
1164+
- pip install -e ./plugins/vllm_add_dummy_stat_logger
1165+
- pytest -v -s plugins_tests/test_stats_logger_plugins.py
1166+
- pip uninstall dummy_stat_logger -y
1167+
# end stat_logger plugins test
11311168
# other tests continue here:
11321169
- pytest -v -s plugins_tests/test_scheduler_plugins.py
11331170
- pip install -e ./plugins/vllm_add_dummy_model
@@ -1172,7 +1209,6 @@ steps:
11721209
- pytest -v -s -x lora/test_llama_tp.py
11731210
- pytest -v -s -x lora/test_llm_with_multi_loras.py
11741211

1175-
11761212
- label: Weight Loading Multiple GPU Test # 33min
11771213
timeout_in_minutes: 45
11781214
mirror_hardwares: [amdexperimental]
@@ -1201,6 +1237,18 @@ steps:
12011237
commands:
12021238
- bash weight_loading/run_model_weight_loading_test.sh -c weight_loading/models-large.txt
12031239

1240+
- label: NixlConnector PD accuracy tests (Distributed) # 30min
1241+
mirror_hardwares: [amdexperimental]
1242+
agent_pool: mi325_4
1243+
timeout_in_minutes: 30
1244+
working_dir: "/vllm-workspace/tests"
1245+
num_gpus: 4
1246+
source_file_dependencies:
1247+
- vllm/distributed/kv_transfer/kv_connector/v1/nixl_connector.py
1248+
- tests/v1/kv_connector/nixl_integration/
1249+
commands:
1250+
- uv pip install --system -r /vllm-workspace/requirements/kv_connectors.txt
1251+
- bash v1/kv_connector/nixl_integration/tp_config_sweep_accuracy_test.sh
12041252

12051253
##### multi gpus test #####
12061254
##### A100 test #####
@@ -1232,12 +1280,16 @@ steps:
12321280
- pytest -s -v test_lm_eval_correctness.py --config-list-file=configs/models-large.txt --tp-size=4
12331281

12341282
##### H200 test #####
1235-
- label: Distrubted Tests (H200) # optional
1283+
- label: Distributed Tests (H200) # optional
12361284
gpu: h200
12371285
optional: true
12381286
working_dir: "/vllm-workspace/"
12391287
num_gpus: 2
12401288
commands:
1289+
- pytest -v -s tests/compile/test_async_tp.py
1290+
- pytest -v -s tests/compile/test_sequence_parallelism.py
1291+
- pytest -v -s tests/compile/test_fusion_all_reduce.py
1292+
- pytest -v -s tests/compile/test_fusions_e2e.py::test_tp2_attn_quant_allreduce_rmsnorm
12411293
- pytest -v -s tests/distributed/test_context_parallel.py
12421294
- CUDA_VISIBLE_DEVICES=1,2 VLLM_ALL2ALL_BACKEND=deepep_high_throughput VLLM_USE_DEEP_GEMM=1 VLLM_LOGGING_LEVEL=DEBUG python3 examples/offline_inference/data_parallel.py --model Qwen/Qwen1.5-MoE-A2.7B --tp-size=1 --dp-size=2 --max-model-len 2048
12431295

.buildkite/test-pipeline.yaml

Lines changed: 15 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -172,6 +172,8 @@ steps:
172172
- tests/v1/engine/test_engine_core_client.py
173173
- tests/distributed/test_symm_mem_allreduce.py
174174
commands:
175+
# https://github.com/NVIDIA/nccl/issues/1838
176+
- export NCCL_CUMEM_HOST_ENABLE=0
175177
# test with torchrun tp=2 and external_dp=2
176178
- torchrun --nproc-per-node=4 distributed/test_torchrun_example.py
177179
# test with torchrun tp=2 and pp=2
@@ -349,7 +351,8 @@ steps:
349351
- python3 offline_inference/basic/embed.py
350352
- python3 offline_inference/basic/score.py
351353
- python3 offline_inference/spec_decode.py --test --method eagle --num_spec_tokens 3 --dataset-name hf --dataset-path philschmid/mt-bench --num-prompts 80 --temp 0 --top-p 1.0 --top-k -1 --tp 1 --enable-chunked-prefill --max-model-len 2048
352-
- python3 offline_inference/spec_decode.py --test --method eagle3 --num_spec_tokens 3 --dataset-name hf --dataset-path philschmid/mt-bench --num-prompts 80 --temp 0 --top-p 1.0 --top-k -1 --tp 1 --enable-chunked-prefill --max-model-len 2048
354+
# https://github.com/vllm-project/vllm/pull/26682 uses slightly more memory in PyTorch 2.9+, causing this test to OOM on a 1xL4 GPU, so --max-model-len is lowered to 1536 here
355+
- python3 offline_inference/spec_decode.py --test --method eagle3 --num_spec_tokens 3 --dataset-name hf --dataset-path philschmid/mt-bench --num-prompts 80 --temp 0 --top-p 1.0 --top-k -1 --tp 1 --enable-chunked-prefill --max-model-len 1536
353356

354357
- label: Platform Tests (CUDA) # 4min
355358
timeout_in_minutes: 15
@@ -384,7 +387,12 @@ steps:
384387
--num-shards=$$BUILDKITE_PARALLEL_JOB_COUNT \
385388
--ignore=lora/test_chatglm3_tp.py \
386389
--ignore=lora/test_llama_tp.py \
387-
--ignore=lora/test_llm_with_multi_loras.py
390+
--ignore=lora/test_llm_with_multi_loras.py \
391+
--ignore=lora/test_olmoe_tp.py \
392+
--ignore=lora/test_deepseekv2_tp.py \
393+
--ignore=lora/test_gptoss.py \
394+
--ignore=lora/test_qwen3moe_tp.py
395+
388396
parallelism: 4
389397

390398
- label: PyTorch Compilation Unit Tests # 15min
@@ -529,7 +537,7 @@ steps:
529537
# https://github.com/pytorch/ao/issues/2919, we'll have to skip new torchao tests for now
530538
# we can only upgrade after this is resolved
531539
# TODO(jerryzh168): resolve the above comment
532-
- uv pip install --system torchao==0.13.0
540+
- uv pip install --system torchao==0.13.0 --index-url https://download.pytorch.org/whl/cu129
533541
- VLLM_TEST_FORCE_LOAD_FORMAT=auto pytest -v -s quantization/ --ignore quantization/test_blackwell_moe.py
534542

535543
- label: LM Eval Small Models # 53min
@@ -970,13 +978,16 @@ steps:
970978
- tests/v1/shutdown
971979
- tests/v1/worker/test_worker_memory_snapshot.py
972980
commands:
981+
# https://github.com/NVIDIA/nccl/issues/1838
982+
- export NCCL_CUMEM_HOST_ENABLE=0
973983
- TP_SIZE=1 DP_SIZE=2 pytest -v -s v1/distributed/test_async_llm_dp.py
974984
- TP_SIZE=1 DP_SIZE=2 pytest -v -s v1/distributed/test_external_lb_dp.py
975985
- DP_SIZE=2 pytest -v -s v1/entrypoints/openai/test_multi_api_servers.py
976986
- pytest -v -s entrypoints/llm/test_collective_rpc.py
977987
- pytest -v -s ./compile/test_basic_correctness.py
978988
- pytest -v -s ./compile/test_wrapper.py
979989
- VLLM_TEST_SAME_HOST=1 torchrun --nproc-per-node=4 distributed/test_same_node.py | grep 'Same node test passed'
990+
- VLLM_TEST_SAME_HOST=1 VLLM_TEST_WITH_DEFAULT_DEVICE_SET=1 torchrun --nproc-per-node=4 distributed/test_same_node.py | grep 'Same node test passed'
980991
- pytest -v -s distributed/test_sequence_parallel.py
981992
- CUDA_VISIBLE_DEVICES=0,1 pytest -v -s v1/shutdown
982993
- pytest -v -s v1/worker/test_worker_memory_snapshot.py
@@ -1064,6 +1075,7 @@ steps:
10641075
- pytest -v -s -x lora/test_chatglm3_tp.py
10651076
- pytest -v -s -x lora/test_llama_tp.py
10661077
- pytest -v -s -x lora/test_llm_with_multi_loras.py
1078+
- pytest -v -s -x lora/test_olmoe_tp.py
10671079

10681080

10691081
- label: Weight Loading Multiple GPU Test # 33min

0 commit comments

Comments
 (0)