
Commit 8db565c

Merge remote-tracking branch 'upstream/main' into add-option-exclude-tools-when-tool-choice-is-none
Signed-off-by: okada shintarou <okada@preferred.jp>
2 parents: 5de852a + 32142b3

File tree: 841 files changed (+49,379 / −38,379 lines)


.buildkite/lm-eval-harness/run-lm-eval-gsm-vllm-baseline.sh

Lines changed: 1 addition & 1 deletion
@@ -46,6 +46,6 @@ while getopts "m:b:l:f:t:" OPT; do
 done
 
 lm_eval --model vllm \
-  --model_args "pretrained=$MODEL,tensor_parallel_size=$TP_SIZE,distributed_executor_backend=ray,trust_remote_code=true,max_model_len=4096" \
+  --model_args "pretrained=$MODEL,tensor_parallel_size=$TP_SIZE,add_bos_token=true,trust_remote_code=true,max_model_len=4096" \
   --tasks gsm8k --num_fewshot "$FEWSHOT" --limit "$LIMIT" \
   --batch_size "$BATCH_SIZE"

.buildkite/lm-eval-harness/test_lm_eval_correctness.py

Lines changed: 3 additions & 1 deletion
@@ -18,12 +18,14 @@
 
 def launch_lm_eval(eval_config, tp_size):
     trust_remote_code = eval_config.get("trust_remote_code", False)
+    max_model_len = eval_config.get("max_model_len", 4096)
     model_args = (
         f"pretrained={eval_config['model_name']},"
         f"tensor_parallel_size={tp_size},"
         f"enforce_eager=true,"
         f"add_bos_token=true,"
-        f"trust_remote_code={trust_remote_code}"
+        f"trust_remote_code={trust_remote_code},"
+        f"max_model_len={max_model_len}"
     )
     results = lm_eval.simple_evaluate(
         model="vllm",

.buildkite/scripts/hardware_ci/run-amd-test.sh

Lines changed: 1 addition & 3 deletions
@@ -107,10 +107,8 @@ fi
 
 if [[ $commands == *" kernels/attention"* ]]; then
   commands="${commands} \
-  --ignore=kernels/attention/stest_attention_selector.py \
-  --ignore=kernels/attention/test_blocksparse_attention.py \
-  --ignore=kernels/attention/test_encoder_decoder_attn.py \
   --ignore=kernels/attention/test_attention_selector.py \
+  --ignore=kernels/attention/test_encoder_decoder_attn.py \
   --ignore=kernels/attention/test_flash_attn.py \
   --ignore=kernels/attention/test_flashinfer.py \
   --ignore=kernels/attention/test_prefix_prefill.py \

.buildkite/scripts/hardware_ci/run-cpu-test.sh

Lines changed: 25 additions & 25 deletions
@@ -6,6 +6,7 @@ set -ex
 
 # allow to bind to different cores
 CORE_RANGE=${CORE_RANGE:-48-95}
+# used for TP/PP E2E test
 OMP_CORE_RANGE=${OMP_CORE_RANGE:-48-95}
 NUMA_NODE=${NUMA_NODE:-1}
 
@@ -24,8 +25,8 @@ numactl -C "$CORE_RANGE" -N "$NUMA_NODE" docker build --tag cpu-test-"$NUMA_NODE
 numactl -C "$CORE_RANGE" -N "$NUMA_NODE" docker build --build-arg VLLM_CPU_DISABLE_AVX512="true" --tag cpu-test-"$NUMA_NODE"-avx2 --target vllm-test -f docker/Dockerfile.cpu .
 
 # Run the image, setting --shm-size=4g for tensor parallel.
-docker run -itd --cpuset-cpus="$CORE_RANGE" --cpuset-mems="$NUMA_NODE" --entrypoint /bin/bash -v ~/.cache/huggingface:/root/.cache/huggingface --privileged=true -e HF_TOKEN --env VLLM_CPU_KVCACHE_SPACE=4 --env VLLM_CPU_OMP_THREADS_BIND="$OMP_CORE_RANGE" --env VLLM_CPU_CI_ENV=1 --shm-size=4g --name cpu-test-"$NUMA_NODE" cpu-test-"$NUMA_NODE"
-docker run -itd --cpuset-cpus="$CORE_RANGE" --cpuset-mems="$NUMA_NODE" --entrypoint /bin/bash -v ~/.cache/huggingface:/root/.cache/huggingface --privileged=true -e HF_TOKEN --env VLLM_CPU_KVCACHE_SPACE=4 --env VLLM_CPU_OMP_THREADS_BIND="$OMP_CORE_RANGE" --env VLLM_CPU_CI_ENV=1 --shm-size=4g --name cpu-test-"$NUMA_NODE"-avx2 cpu-test-"$NUMA_NODE"-avx2
+docker run -itd --cpuset-cpus="$CORE_RANGE" --cpuset-mems="$NUMA_NODE" --entrypoint /bin/bash -v ~/.cache/huggingface:/root/.cache/huggingface --privileged=true -e HF_TOKEN --env VLLM_CPU_KVCACHE_SPACE=4 --env VLLM_CPU_CI_ENV=1 -e E2E_OMP_THREADS="$OMP_CORE_RANGE" --shm-size=4g --name cpu-test-"$NUMA_NODE" cpu-test-"$NUMA_NODE"
+docker run -itd --cpuset-cpus="$CORE_RANGE" --cpuset-mems="$NUMA_NODE" --entrypoint /bin/bash -v ~/.cache/huggingface:/root/.cache/huggingface --privileged=true -e HF_TOKEN --env VLLM_CPU_KVCACHE_SPACE=4 --env VLLM_CPU_CI_ENV=1 -e E2E_OMP_THREADS="$OMP_CORE_RANGE" --shm-size=4g --name cpu-test-"$NUMA_NODE"-avx2 cpu-test-"$NUMA_NODE"-avx2
 
 function cpu_tests() {
   set -e
@@ -48,10 +49,16 @@ function cpu_tests() {
   # Run basic model test
   docker exec cpu-test-"$NUMA_NODE" bash -c "
     set -e
-    pytest -v -s tests/kernels/attention/test_cache.py -m cpu_model
-    pytest -v -s tests/kernels/attention/test_mla_decode_cpu.py -m cpu_model
-    pytest -v -s tests/models/language/generation -m cpu_model
-    VLLM_CPU_SGL_KERNEL=1 pytest -v -s tests/models/language/generation -m cpu_model
+    # Note: disable until supports V1
+    # pytest -v -s tests/kernels/attention/test_cache.py -m cpu_model
+    # pytest -v -s tests/kernels/attention/test_mla_decode_cpu.py -m cpu_model
+
+    # Note: disable Bart until supports V1
+    pytest -v -s tests/models/language/generation -m cpu_model \
+      --ignore=tests/models/language/generation/test_bart.py
+    VLLM_CPU_SGL_KERNEL=1 pytest -v -s tests/models/language/generation -m cpu_model \
+      --ignore=tests/models/language/generation/test_bart.py
+
     pytest -v -s tests/models/language/pooling -m cpu_model
     pytest -v -s tests/models/multimodal/generation \
       --ignore=tests/models/multimodal/generation/test_mllama.py \
@@ -62,33 +69,26 @@ function cpu_tests() {
   docker exec cpu-test-"$NUMA_NODE" bash -c "
     set -e
     pytest -s -v \
-      tests/quantization/test_compressed_tensors.py::test_compressed_tensors_w8a8_static_setup \
-      tests/quantization/test_compressed_tensors.py::test_compressed_tensors_w8a8_dynamic_per_token"
+      tests/quantization/test_compressed_tensors.py::test_compressed_tensors_w8a8_logprobs[False-10-32-neuralmagic/Llama-3.2-1B-quantized.w8a8]"
 
+  # Note: disable it until supports V1
   # Run AWQ test
-  docker exec cpu-test-"$NUMA_NODE" bash -c "
-    set -e
-    VLLM_USE_V1=0 pytest -s -v \
-      tests/quantization/test_ipex_quant.py"
-
-  # Run chunked-prefill and prefix-cache test
-  docker exec cpu-test-"$NUMA_NODE" bash -c "
-    set -e
-    pytest -s -v -k cpu_model \
-      tests/basic_correctness/test_chunked_prefill.py"
+  # docker exec cpu-test-"$NUMA_NODE" bash -c "
+  #   set -e
+  #   VLLM_USE_V1=0 pytest -s -v \
+  #     tests/quantization/test_ipex_quant.py"
 
   # online serving
-  docker exec cpu-test-"$NUMA_NODE" bash -c "
+  docker exec cpu-test-"$NUMA_NODE" bash -c '
     set -e
-    python3 -m vllm.entrypoints.openai.api_server --model facebook/opt-125m --dtype half &
-    timeout 600 bash -c 'until curl localhost:8000/v1/models; do sleep 1; done' || exit 1
-    VLLM_CPU_CI_ENV=0 python3 benchmarks/benchmark_serving.py \
+    VLLM_CPU_OMP_THREADS_BIND=$E2E_OMP_THREADS VLLM_CPU_SGL_KERNEL=1 vllm serve meta-llama/Llama-3.2-3B-Instruct -tp=2 -pp=2 &
+    timeout 600 bash -c "until curl localhost:8000/v1/models; do sleep 1; done" || exit 1
+    python3 benchmarks/benchmark_serving.py \
       --backend vllm \
       --dataset-name random \
-      --model facebook/opt-125m \
+      --model meta-llama/Llama-3.2-3B-Instruct \
       --num-prompts 20 \
-      --endpoint /v1/completions \
-      --tokenizer facebook/opt-125m"
+      --endpoint /v1/completions'
 
   # Run multi-lora tests
   docker exec cpu-test-"$NUMA_NODE" bash -c "

.buildkite/scripts/hardware_ci/run-hpu-test.sh

Lines changed: 3 additions & 5 deletions
@@ -6,19 +6,17 @@ set -exuo pipefail
 
 # Try building the docker image
 cat <<EOF | docker build -t hpu-plugin-v1-test-env -f - .
-FROM 1.22-413-pt2.7.1:latest
+FROM gaudi-base-image:latest
 
 COPY ./ /workspace/vllm
 
 WORKDIR /workspace/vllm
 
-RUN pip install -v -r requirements/hpu.txt
-RUN pip install git+https://github.com/vllm-project/vllm-gaudi.git
-
 ENV no_proxy=localhost,127.0.0.1
 ENV PT_HPU_ENABLE_LAZY_COLLECTIVES=true
 
-RUN VLLM_TARGET_DEVICE=hpu python3 setup.py install
+RUN VLLM_TARGET_DEVICE=empty pip install .
+RUN pip install git+https://github.com/vllm-project/vllm-gaudi.git
 
 # install development dependencies (for testing)
 RUN python3 -m pip install -e tests/vllm_test_utils

.buildkite/scripts/hardware_ci/run-tpu-v1-test.sh

Lines changed: 1 addition & 1 deletion
@@ -70,7 +70,7 @@ export VLLM_XLA_CACHE_PATH=
 echo "Using VLLM V1"
 
 echo "--- Hardware Information ---"
-tpu-info
+# tpu-info
 echo "--- Starting Tests ---"
 set +e
 overall_script_exit_code=0

.buildkite/scripts/hardware_ci/run-xpu-test.sh

Lines changed: 6 additions & 4 deletions
@@ -11,8 +11,8 @@ container_name="xpu_${BUILDKITE_COMMIT}_$(tr -dc A-Za-z0-9 < /dev/urandom | head
 docker build -t ${image_name} -f docker/Dockerfile.xpu .
 
 # Setup cleanup
-remove_docker_container() {
-  docker rm -f "${container_name}" || true;
+remove_docker_container() {
+  docker rm -f "${container_name}" || true;
   docker image rm -f "${image_name}" || true;
   docker system prune -f || true;
 }
@@ -26,7 +26,9 @@ docker run \
   --name "${container_name}" \
   "${image_name}" \
   sh -c '
-  VLLM_USE_V1=0 python3 examples/offline_inference/basic/generate.py --model facebook/opt-125m
-  VLLM_USE_V1=0 python3 examples/offline_inference/basic/generate.py --model facebook/opt-125m -tp 2
   VLLM_USE_V1=1 python3 examples/offline_inference/basic/generate.py --model facebook/opt-125m --block-size 64 --enforce-eager
+  VLLM_USE_V1=1 python3 examples/offline_inference/basic/generate.py --model facebook/opt-125m --block-size 64 --enforce-eager -tp 2 --distributed-executor-backend ray
+  VLLM_USE_V1=1 python3 examples/offline_inference/basic/generate.py --model facebook/opt-125m --block-size 64 --enforce-eager -tp 2 --distributed-executor-backend mp
+  cd tests
+  pytest -v -s v1/core
   '

.buildkite/scripts/tpu/docker_run_bm.sh

Lines changed: 0 additions & 10 deletions
@@ -22,16 +22,6 @@ trap remove_docker_container EXIT
 # Remove the container that might not be cleaned up in the previous run.
 remove_docker_container
 
-# Build docker image.
-# TODO: build the image outside the script and share the image with other
-# tpu test if building time is too long.
-DOCKER_BUILDKIT=1 docker build \
-  --build-arg max_jobs=16 \
-  --build-arg USE_SCCACHE=1 \
-  --build-arg GIT_REPO_CHECK=0 \
-  --tag vllm/vllm-tpu-bm \
-  --progress plain -f docker/Dockerfile.tpu .
-
 LOG_ROOT=$(mktemp -d)
 # If mktemp fails, set -e will cause the script to exit.
 echo "Results will be stored in: $LOG_ROOT"

.buildkite/test-pipeline.yaml

Lines changed: 29 additions & 20 deletions
@@ -117,16 +117,14 @@ steps:
   commands:
   - pytest -v -s core
 
-- label: Entrypoints Test # 40min
+- label: Entrypoints Test (LLM) # 40min
   mirror_hardwares: [amdexperimental]
   working_dir: "/vllm-workspace/tests"
   fast_check: true
   torch_nightly: true
   source_file_dependencies:
   - vllm/
   - tests/entrypoints/llm
-  - tests/entrypoints/openai
-  - tests/entrypoints/test_chat_utils
   - tests/entrypoints/offline_mode
   commands:
   - export VLLM_WORKER_MULTIPROC_METHOD=spawn
@@ -135,9 +133,21 @@ steps:
   - pytest -v -s entrypoints/llm/test_generate.py # it needs a clean process
   - pytest -v -s entrypoints/llm/test_generate_multiple_loras.py # it needs a clean process
   - VLLM_USE_V1=0 pytest -v -s entrypoints/llm/test_guided_generate.py # it needs a clean process
+  - VLLM_USE_V1=0 pytest -v -s entrypoints/offline_mode # Needs to avoid interference with other tests
+
+- label: Entrypoints Test (API Server) # 40min
+  mirror_hardwares: [amdexperimental]
+  working_dir: "/vllm-workspace/tests"
+  fast_check: true
+  torch_nightly: true
+  source_file_dependencies:
+  - vllm/
+  - tests/entrypoints/openai
+  - tests/entrypoints/test_chat_utils
+  commands:
+  - export VLLM_WORKER_MULTIPROC_METHOD=spawn
   - pytest -v -s entrypoints/openai --ignore=entrypoints/openai/test_chat_with_tool_reasoning.py --ignore=entrypoints/openai/test_oot_registration.py --ignore=entrypoints/openai/test_tensorizer_entrypoint.py --ignore=entrypoints/openai/correctness/
   - pytest -v -s entrypoints/test_chat_utils.py
-  - VLLM_USE_V1=0 pytest -v -s entrypoints/offline_mode # Needs to avoid interference with other tests
 
 - label: Distributed Tests (4 GPUs) # 10min
   mirror_hardwares: [amdexperimental]
@@ -149,7 +159,6 @@ steps:
   - tests/distributed/test_utils
   - tests/distributed/test_pynccl
   - tests/distributed/test_events
-  - tests/spec_decode/e2e/test_integration_dist_tp4
   - tests/compile/test_basic_correctness
   - examples/offline_inference/rlhf.py
   - examples/offline_inference/rlhf_colocate.py
@@ -172,7 +181,6 @@ steps:
   - pytest -v -s compile/test_basic_correctness.py
   - pytest -v -s distributed/test_pynccl.py
   - pytest -v -s distributed/test_events.py
-  - pytest -v -s spec_decode/e2e/test_integration_dist_tp4.py
   # TODO: create a dedicated test section for multi-GPU example tests
   # when we have multiple distributed example tests
   - pushd ../examples/offline_inference
@@ -256,6 +264,7 @@ steps:
   - pytest -v -s v1/structured_output
   - pytest -v -s v1/spec_decode
   - pytest -v -s v1/kv_connector/unit
+  - pytest -v -s v1/metrics
   - pytest -v -s v1/test_serial_utils.py
   - pytest -v -s v1/test_utils.py
   - pytest -v -s v1/test_oracle.py
@@ -264,7 +273,7 @@ steps:
   # VLLM_USE_FLASHINFER_SAMPLER or not on H100.
   - pytest -v -s v1/e2e
   # Integration test for streaming correctness (requires special branch).
-  - pip install -U git+https://github.com/robertgshaw2-neuralmagic/lm-evaluation-harness.git@streaming-api
+  - pip install -U git+https://github.com/robertgshaw2-redhat/lm-evaluation-harness.git@streaming-api
   - pytest -v -s entrypoints/openai/correctness/test_lmeval.py::test_lm_eval_accuracy_v1_engine
 
 - label: Examples Test # 25min
@@ -282,7 +291,7 @@ steps:
   - python3 offline_inference/llm_engine_example.py
   - python3 offline_inference/audio_language.py --seed 0
   - python3 offline_inference/vision_language.py --seed 0
-  - python3 offline_inference/vision_language_embedding.py --seed 0
+  - python3 offline_inference/vision_language_pooling.py --seed 0
   - python3 offline_inference/vision_language_multi_image.py --seed 0
   - VLLM_USE_V1=0 python3 others/tensorize_vllm_model.py --model facebook/opt-125m serialize --serialized-directory /tmp/ --suffix v1 && python3 others/tensorize_vllm_model.py --model facebook/opt-125m deserialize --path-to-tensors /tmp/vllm/facebook/opt-125m/v1/model.tensors
   - python3 offline_inference/encoder_decoder.py
@@ -320,17 +329,6 @@ steps:
   - pytest -v -s samplers
   - VLLM_USE_FLASHINFER_SAMPLER=1 pytest -v -s samplers
 
-- label: Speculative decoding tests # 40min
-  mirror_hardwares: [amdexperimental]
-  source_file_dependencies:
-  - vllm/spec_decode
-  - tests/spec_decode
-  - vllm/model_executor/models/eagle.py
-  commands:
-  - pytest -v -s spec_decode/e2e/test_multistep_correctness.py
-  - VLLM_ATTENTION_BACKEND=FLASH_ATTN pytest -v -s spec_decode --ignore=spec_decode/e2e/test_multistep_correctness.py --ignore=spec_decode/e2e/test_mtp_correctness.py
-  - pytest -v -s spec_decode/e2e/test_eagle_correctness.py
-
 - label: LoRA Test %N # 15min each
   mirror_hardwares: [amdexperimental, amdproduction]
   source_file_dependencies:
@@ -630,6 +628,18 @@ steps:
 # e.g. pytest -v -s models/encoder_decoder/vision_language/test_mllama.py
 # *To avoid merge conflicts, remember to REMOVE (not just comment out) them before merging the PR*
 
+- label: Transformers Nightly Models Test
+  working_dir: "/vllm-workspace/"
+  optional: true
+  commands:
+  - pip install --upgrade git+https://github.com/huggingface/transformers
+  - pytest -v -s tests/models/test_initialization.py
+  - pytest -v -s tests/models/multimodal/processing/
+  - pytest -v -s tests/models/multimodal/test_mapping.py
+  - python3 examples/offline_inference/basic/chat.py
+  - python3 examples/offline_inference/audio_language.py --model-type whisper
+  - python3 examples/offline_inference/vision_language.py --model-type qwen2_5_vl
+
 ##### 1 GPU test #####
 ##### multi gpus test #####
 
@@ -704,7 +714,6 @@ steps:
   - pytest -v -s distributed/test_sequence_parallel.py
   # this test fails consistently.
   # TODO: investigate and fix
-  # - pytest -v -s spec_decode/e2e/test_integration_dist_tp2.py
   - VLLM_USE_V1=0 CUDA_VISIBLE_DEVICES=0,1 pytest -v -s test_sharded_state_loader.py
   - VLLM_USE_V1=0 CUDA_VISIBLE_DEVICES=0,1 pytest -v -s kv_transfer/test_disagg.py
   - CUDA_VISIBLE_DEVICES=0,1 pytest -v -s v1/shutdown

.gemini/config.yaml

Lines changed: 6 additions & 0 deletions
@@ -0,0 +1,6 @@
+# https://developers.google.com/gemini-code-assist/docs/customize-gemini-behavior-github
+have_fun: false # Just review the code
+code_review:
+  comment_severity_threshold: HIGH # Reduce quantity of comments
+pull_request_opened:
+  summary: false # Don't summarize the PR in a separate comment
