Commit f44d253

Author: andy-neuma (committed)
Merge remote-tracking branch 'upstream/main'
2 parents: 2602d9d + bb103b2

947 files changed, +64427 -20466 lines


.buildkite/lm-eval-harness/configs/Minitron-4B-Base-FP8.yaml

Lines changed: 2 additions & 2 deletions

@@ -4,8 +4,8 @@ tasks:
 - name: "gsm8k"
   metrics:
   - name: "exact_match,strict-match"
-    value: 0.233
+    value: 0.231
   - name: "exact_match,flexible-extract"
-    value: 0.236
+    value: 0.22
 limit: 1000
 num_fewshot: 5
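These reference scores are consumed by the correctness test shown next, which accepts a run whose measured metric stays within a relative tolerance of the baseline. A minimal sketch of that comparison, assuming the harness compares with numpy.isclose at the RTOL = 0.05 defined in the test file below (not the repository's exact code):

# Hedged sketch: compare a freshly measured gsm8k score against the YAML
# baseline, passing when it falls within a 5% relative tolerance.
import numpy

RTOL = 0.05  # relative tolerance, as defined in test_lm_eval_correctness.py

def within_tolerance(measured: float, reference: float, rtol: float = RTOL) -> bool:
    # numpy.isclose(a, b, rtol=...) checks |a - b| <= atol + rtol * |b|
    return bool(numpy.isclose(measured, reference, rtol=rtol))

print(within_tolerance(0.225, 0.231))  # True: within 5% of the new baseline
print(within_tolerance(0.180, 0.231))  # False: a regression beyond tolerance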

.buildkite/lm-eval-harness/test_lm_eval_correctness.py

Lines changed: 5 additions & 0 deletions

@@ -13,6 +13,7 @@

 import lm_eval
 import numpy
+import pytest
 import yaml

 RTOL = 0.05
@@ -46,6 +47,10 @@ def test_lm_eval_correctness():
     eval_config = yaml.safe_load(
         Path(TEST_DATA_FILE).read_text(encoding="utf-8"))

+    if eval_config[
+            "model_name"] == "nm-testing/Meta-Llama-3-70B-Instruct-FBGEMM-nonuniform":  #noqa: E501
+        pytest.skip("FBGEMM is currently failing on main.")
+
     # Launch eval requests.
     results = launch_lm_eval(eval_config)
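For context, pytest.skip() aborts the current test and reports it as skipped rather than failed. A standalone illustration of the pattern used above (hypothetical test, not part of the repository):

import pytest

def test_skip_known_failing_model():
    # Hypothetical config stand-in; mirrors the guard added above.
    model_name = "nm-testing/Meta-Llama-3-70B-Instruct-FBGEMM-nonuniform"
    if model_name.endswith("FBGEMM-nonuniform"):
        pytest.skip("FBGEMM is currently failing on main.")
    assert model_name  # never reached when the skip fires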

.buildkite/nightly-benchmarks/scripts/run-nightly-benchmarks.sh

Lines changed: 1 addition & 1 deletion

@@ -426,7 +426,7 @@ main() {

     pip install -U transformers

-    pip install -r requirements-dev.txt
+    pip install -r requirements/dev.txt
     which genai-perf

     # check storage

.buildkite/nightly-benchmarks/scripts/run-performance-benchmarks.sh

Lines changed: 1 addition & 1 deletion

@@ -361,7 +361,7 @@ main() {
   # get the current IP address, required by benchmark_serving.py
   export VLLM_HOST_IP=$(hostname -I | awk '{print $1}')
   # turn of the reporting of the status of each request, to clean up the terminal output
-  export VLLM_LOG_LEVEL="WARNING"
+  export VLLM_LOGGING_LEVEL="WARNING"

   # prepare for benchmarking
   cd benchmarks || exit 1
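The fix renames the variable to the one vLLM actually reads, VLLM_LOGGING_LEVEL. A rough sketch of the effect, assuming the value maps onto a standard Python logging level so that per-request INFO output is suppressed during benchmarking (illustrative only, not vLLM's logging code):

import logging
import os

# Assumption: the env var's value is treated as a logging level name.
level_name = os.environ.get("VLLM_LOGGING_LEVEL", "INFO")
logging.basicConfig(level=getattr(logging, level_name.upper(), logging.INFO))

log = logging.getLogger("benchmark")
log.info("per-request status line")    # hidden when the level is WARNING
log.warning("something worth seeing")  # still printed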

.buildkite/release-pipeline.yaml

Lines changed: 1 addition & 1 deletion

@@ -82,7 +82,7 @@ steps:
       queue: cpu_queue_postmerge
     commands:
       - "aws ecr-public get-login-password --region us-east-1 | docker login --username AWS --password-stdin public.ecr.aws/q9t5s3a7"
-      - "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg GIT_REPO_CHECK=1 --tag public.ecr.aws/q9t5s3a7/vllm-cpu-release-repo:$(buildkite-agent meta-data get release-version) --progress plain -f Dockerfile.cpu ."
+      - "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg GIT_REPO_CHECK=1 --tag public.ecr.aws/q9t5s3a7/vllm-cpu-release-repo:$(buildkite-agent meta-data get release-version) --tag public.ecr.aws/q9t5s3a7/vllm-cpu-release-repo:latest --progress plain --target vllm-openai -f Dockerfile.cpu ."
       - "docker push public.ecr.aws/q9t5s3a7/vllm-cpu-release-repo:$(buildkite-agent meta-data get release-version)"
     env:
       DOCKER_BUILDKIT: "1"

.buildkite/run-amd-test.sh

Lines changed: 25 additions & 9 deletions

@@ -101,16 +101,30 @@ if [[ $commands == *" kernels "* ]]; then
     --ignore=kernels/test_permute_cols.py"
 fi

-#ignore certain Entrypoints tests
+#ignore certain Entrypoints/openai tests
 if [[ $commands == *" entrypoints/openai "* ]]; then
   commands=${commands//" entrypoints/openai "/" entrypoints/openai \
-  --ignore=entrypoints/openai/test_accuracy.py \
   --ignore=entrypoints/openai/test_audio.py \
-  --ignore=entrypoints/openai/test_encoder_decoder.py \
-  --ignore=entrypoints/openai/test_embedding.py \
-  --ignore=entrypoints/openai/test_oot_registration.py "}
+  --ignore=entrypoints/openai/test_chat.py \
+  --ignore=entrypoints/openai/test_shutdown.py \
+  --ignore=entrypoints/openai/test_completion.py \
+  --ignore=entrypoints/openai/test_sleep.py \
+  --ignore=entrypoints/openai/test_models.py \
+  --ignore=entrypoints/openai/test_prompt_validation.py "}
 fi

+#ignore certain Entrypoints/llm tests
+if [[ $commands == *" && pytest -v -s entrypoints/llm/test_guided_generate.py"* ]]; then
+  commands=${commands//" && pytest -v -s entrypoints/llm/test_guided_generate.py"/" "}
+fi
+
+# --ignore=entrypoints/openai/test_encoder_decoder.py \
+# --ignore=entrypoints/openai/test_embedding.py \
+# --ignore=entrypoints/openai/test_oot_registration.py
+# --ignore=entrypoints/openai/test_accuracy.py \
+# --ignore=entrypoints/openai/test_models.py <= Fails on MI250 but passes on MI300 as of 2025-03-13
+
+
 PARALLEL_JOB_COUNT=8
 # check if the command contains shard flag, we will run all shards in parallel because the host have 8 GPUs.
 if [[ $commands == *"--shard-id="* ]]; then
@@ -120,9 +134,10 @@ if [[ $commands == *"--shard-id="* ]]; then
     # assign shard-id for each shard
     commands_gpu=${commands//"--shard-id= "/"--shard-id=${GPU} "}
     echo "Shard ${GPU} commands:$commands_gpu"
+    echo "Render devices: $BUILDKITE_AGENT_META_DATA_RENDER_DEVICES"
     docker run \
-        --device /dev/kfd --device /dev/dri \
-        --network host \
+        --device /dev/kfd $BUILDKITE_AGENT_META_DATA_RENDER_DEVICES \
+        --network=host \
         --shm-size=16gb \
         --rm \
        -e HIP_VISIBLE_DEVICES="${GPU}" \
@@ -149,9 +164,10 @@ if [[ $commands == *"--shard-id="* ]]; then
     fi
   done
 else
+  echo "Render devices: $BUILDKITE_AGENT_META_DATA_RENDER_DEVICES"
   docker run \
-        --device /dev/kfd --device /dev/dri \
-        --network host \
+        --device /dev/kfd $BUILDKITE_AGENT_META_DATA_RENDER_DEVICES \
+        --network=host \
         --shm-size=16gb \
         --rm \
        -e HIP_VISIBLE_DEVICES=0 \
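The `${commands//pattern/replacement}` expansions above rewrite the pytest invocation in place, either appending --ignore flags or deleting a sub-command entirely. A Python mirror of that string surgery, purely illustrative (the command string and test list here are hypothetical, not the CI configuration):

# Illustrative mirror of the bash parameter expansion used above.
commands = "pytest -v -s entrypoints/openai  && pytest -v -s entrypoints/llm/test_guided_generate.py"

# Append --ignore flags to the entrypoints/openai run.
ignored = ["entrypoints/openai/test_audio.py", "entrypoints/openai/test_chat.py"]
if " entrypoints/openai " in commands:
    flags = " ".join(f"--ignore={t}" for t in ignored)
    commands = commands.replace(" entrypoints/openai ", f" entrypoints/openai {flags} ")

# Drop the guided-generate sub-command altogether.
commands = commands.replace(" && pytest -v -s entrypoints/llm/test_guided_generate.py", " ")

print(commands)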

.buildkite/run-cpu-test.sh

Lines changed: 15 additions & 9 deletions

@@ -8,24 +8,29 @@ set -ex
 CORE_RANGE=${CORE_RANGE:-48-95}
 NUMA_NODE=${NUMA_NODE:-1}

-# Try building the docker image
-numactl -C "$CORE_RANGE" -N "$NUMA_NODE" docker build -t cpu-test-"$BUILDKITE_BUILD_NUMBER" -f Dockerfile.cpu .
-numactl -C "$CORE_RANGE" -N "$NUMA_NODE" docker build --build-arg VLLM_CPU_DISABLE_AVX512="true" -t cpu-test-"$BUILDKITE_BUILD_NUMBER"-avx2 -f Dockerfile.cpu .
-
 # Setup cleanup
-remove_docker_container() { set -e; docker rm -f cpu-test-"$BUILDKITE_BUILD_NUMBER"-"$NUMA_NODE" cpu-test-"$BUILDKITE_BUILD_NUMBER"-avx2-"$NUMA_NODE" || true; }
+remove_docker_container() {
+  set -e;
+  docker rm -f cpu-test-"$BUILDKITE_BUILD_NUMBER"-"$NUMA_NODE" cpu-test-"$BUILDKITE_BUILD_NUMBER"-avx2-"$NUMA_NODE" || true;
+  docker image rm cpu-test-"$BUILDKITE_BUILD_NUMBER" cpu-test-"$BUILDKITE_BUILD_NUMBER"-avx2 || true;
+}
 trap remove_docker_container EXIT
 remove_docker_container

+# Try building the docker image
+numactl -C "$CORE_RANGE" -N "$NUMA_NODE" docker build --tag cpu-test-"$BUILDKITE_BUILD_NUMBER" --target vllm-test -f Dockerfile.cpu .
+numactl -C "$CORE_RANGE" -N "$NUMA_NODE" docker build --build-arg VLLM_CPU_DISABLE_AVX512="true" --tag cpu-test-"$BUILDKITE_BUILD_NUMBER"-avx2 --target vllm-test -f Dockerfile.cpu .
+
 # Run the image, setting --shm-size=4g for tensor parallel.
 docker run -itd --entrypoint /bin/bash -v ~/.cache/huggingface:/root/.cache/huggingface --cpuset-cpus="$CORE_RANGE" \
-  --cpuset-mems="$NUMA_NODE" --privileged=true --network host -e HF_TOKEN --env VLLM_CPU_KVCACHE_SPACE=4 --shm-size=4g --name cpu-test-"$BUILDKITE_BUILD_NUMBER"-"$NUMA_NODE" cpu-test-"$BUILDKITE_BUILD_NUMBER"
+  --cpuset-mems="$NUMA_NODE" --privileged=true -e HF_TOKEN --env VLLM_CPU_KVCACHE_SPACE=4 --shm-size=4g --name cpu-test-"$BUILDKITE_BUILD_NUMBER"-"$NUMA_NODE" cpu-test-"$BUILDKITE_BUILD_NUMBER"
 docker run -itd --entrypoint /bin/bash -v ~/.cache/huggingface:/root/.cache/huggingface --cpuset-cpus="$CORE_RANGE" \
-  --cpuset-mems="$NUMA_NODE" --privileged=true --network host -e HF_TOKEN --env VLLM_CPU_KVCACHE_SPACE=4 --shm-size=4g --name cpu-test-"$BUILDKITE_BUILD_NUMBER"-avx2-"$NUMA_NODE" cpu-test-"$BUILDKITE_BUILD_NUMBER"-avx2
+  --cpuset-mems="$NUMA_NODE" --privileged=true -e HF_TOKEN --env VLLM_CPU_KVCACHE_SPACE=4 --shm-size=4g --name cpu-test-"$BUILDKITE_BUILD_NUMBER"-avx2-"$NUMA_NODE" cpu-test-"$BUILDKITE_BUILD_NUMBER"-avx2

 function cpu_tests() {
   set -e
   export NUMA_NODE=$2
+  export BUILDKITE_BUILD_NUMBER=$3
@@ -35,7 +40,8 @@ function cpu_tests() {
   # Run basic model test
   docker exec cpu-test-"$BUILDKITE_BUILD_NUMBER"-"$NUMA_NODE" bash -c "
     set -e
-    pip install -r vllm/requirements-test.txt
+    pytest -v -s tests/kernels/test_cache.py -m cpu_model
+    pytest -v -s tests/kernels/test_mla_decode_cpu.py -m cpu_model
     pytest -v -s tests/models/decoder_only/language -m cpu_model
     pytest -v -s tests/models/embedding/language -m cpu_model
     pytest -v -s tests/models/encoder_decoder/language -m cpu_model
@@ -85,4 +91,4 @@ function cpu_tests() {

 # All of CPU tests are expected to be finished less than 40 mins.
 export -f cpu_tests
-timeout 40m bash -c "cpu_tests $CORE_RANGE $NUMA_NODE"
+timeout 40m bash -c "cpu_tests $CORE_RANGE $NUMA_NODE $BUILDKITE_BUILD_NUMBER"

.buildkite/run-gh200-test.sh

Lines changed: 2 additions & 1 deletion

@@ -14,6 +14,7 @@ DOCKER_BUILDKIT=1 docker build . \
   -t gh200-test \
   --build-arg max_jobs=66 \
   --build-arg nvcc_threads=2 \
+  --build-arg RUN_WHEEL_CHECK=false \
   --build-arg torch_cuda_arch_list="9.0+PTX" \
   --build-arg vllm_fa_cmake_gpu_arches="90-real"

@@ -23,6 +24,6 @@ trap remove_docker_container EXIT
 remove_docker_container

 # Run the image and test offline inference
-docker run -e HF_TOKEN -v /root/.cache/huggingface:/root/.cache/huggingface --name gh200-test --gpus=all --entrypoint="" gh200-test bash -c '
+docker run -e HF_TOKEN -e VLLM_WORKER_MULTIPROC_METHOD=spawn -v /root/.cache/huggingface:/root/.cache/huggingface --name gh200-test --gpus=all --entrypoint="" gh200-test bash -c '
   python3 examples/offline_inference/basic/generate.py --model meta-llama/Llama-3.2-1B
 '
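The added VLLM_WORKER_MULTIPROC_METHOD=spawn asks vLLM to start worker processes with Python's "spawn" start method (fresh interpreter processes) instead of fork. A minimal standalone illustration of that start method, assuming the variable maps to multiprocessing's start methods as its name suggests (not vLLM's worker code):

import multiprocessing as mp

def worker(tag: str) -> None:
    print(f"hello from a {tag} process")

if __name__ == "__main__":
    # "spawn" launches a brand-new interpreter for each worker, which avoids
    # inheriting state (e.g. CUDA context) that a fork()ed child would copy.
    ctx = mp.get_context("spawn")
    p = ctx.Process(target=worker, args=("spawned",))
    p.start()
    p.join()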

.buildkite/run-neuron-test.sh

Lines changed: 2 additions & 2 deletions

@@ -44,11 +44,11 @@ remove_docker_container() {
 trap remove_docker_container EXIT

 # Run the image
-docker run --rm -it --device=/dev/neuron0 --device=/dev/neuron1 --network host \
+docker run --rm -it --device=/dev/neuron0 --network bridge \
       -v "${HF_CACHE}:${HF_MOUNT}" \
       -e "HF_HOME=${HF_MOUNT}" \
       -v "${NEURON_COMPILE_CACHE_URL}:${NEURON_COMPILE_CACHE_MOUNT}" \
       -e "NEURON_COMPILE_CACHE_URL=${NEURON_COMPILE_CACHE_MOUNT}" \
       --name "${container_name}" \
       ${image_name} \
-      /bin/bash -c "python3 /workspace/vllm/examples/offline_inference/neuron.py && python3 -m pytest /workspace/vllm/tests/neuron/ -v --capture=tee-sys"
+      /bin/bash -c "python3 /workspace/vllm/examples/offline_inference/neuron.py && python3 -m pytest /workspace/vllm/tests/neuron/1_core/ -v --capture=tee-sys && python3 -m pytest /workspace/vllm/tests/neuron/2_core/ -v --capture=tee-sys"

.buildkite/run-openvino-test.sh

Lines changed: 0 additions & 16 deletions
This file was deleted.
