Commit 5c45a37

Merge branch 'main' into dcp-gqa-fa
Signed-off-by: Jaya Yuan <yuanyongjie.yyj@antgroup.com>
2 parents a1f626d + 467a4f9 commit 5c45a37

1,770 files changed: +149,112 / −107,579 lines


.buildkite/nightly-benchmarks/scripts/convert-results-json-to-markdown.py

Lines changed: 1 addition & 1 deletion
@@ -368,7 +368,7 @@ def parse_client_command(cmd: str) -> dict[str, Any]:
     # The GPUs sometimes come in format of "GPUTYPE\nGPUTYPE\n...",
     # we want to turn it into "8xGPUTYPE"
     df["GPU"] = df["GPU"].apply(
-        lambda x: f"{len(x.split('\n'))}x{x.split('\n')[0]}"
+        lambda x: f"{len(x.splitlines())}x{x.splitlines()[0]}"
     )

     # get markdown tables
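
The new lambda avoids two pitfalls at once: a backslash such as '\n' inside an f-string expression is a SyntaxError on Python versions before 3.12, and split('\n') counts a trailing newline as an extra empty "GPU" entry, while splitlines() does not. A minimal check of the difference, using an illustrative GPU string rather than real nvidia-smi output:

python3 - <<'PYEOF'
gpu = "H100\nH100\nH100\n"                  # note the trailing newline
print(len(gpu.split('\n')))                 # 4 -- the trailing '' inflates the count
print(len(gpu.splitlines()))                # 3 -- the count the "8xGPUTYPE" label wants
print(f"{len(gpu.splitlines())}x{gpu.splitlines()[0]}")  # -> 3xH100
PYEOF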

.buildkite/nightly-benchmarks/scripts/launch-server.sh

Lines changed: 2 additions & 6 deletions
@@ -181,18 +181,14 @@ launch_vllm_server() {
     if echo "$common_params" | jq -e 'has("fp8")' >/dev/null; then
       echo "Key 'fp8' exists in common params. Use neuralmagic fp8 model for convenience."
       model=$(echo "$common_params" | jq -r '.neuralmagic_quantized_model')
-      server_command="python3 \
-        -m vllm.entrypoints.openai.api_server \
+      server_command="vllm serve $model \
         -tp $tp \
-        --model $model \
         --port $port \
         $server_args"
     else
       echo "Key 'fp8' does not exist in common params."
-      server_command="python3 \
-        -m vllm.entrypoints.openai.api_server \
+      server_command="vllm serve $model \
         -tp $tp \
-        --model $model \
         --port $port \
         $server_args"
     fi
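
Both branches now build the same vllm serve command, which replaces the bare python3 module entrypoint and takes the model as a positional argument, making the separate --model flag redundant. The two forms below are equivalent; the model name and flag values are illustrative placeholders, not taken from this diff:

# Legacy entrypoint (removed above):
python3 -m vllm.entrypoints.openai.api_server --model meta-llama/Llama-3.1-8B-Instruct -tp 2 --port 8000

# CLI form the script now emits:
vllm serve meta-llama/Llama-3.1-8B-Instruct -tp 2 --port 8000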

.buildkite/nightly-benchmarks/scripts/run-performance-benchmarks.sh

Lines changed: 1 addition & 7 deletions
@@ -365,8 +365,7 @@ run_serving_tests() {
             continue
         fi

-        server_command="$server_envs python3 \
-            -m vllm.entrypoints.openai.api_server \
+        server_command="$server_envs vllm serve \
             $server_args"

         # run the server
@@ -455,11 +454,6 @@ main() {
     fi
     check_hf_token

-    # Set to v1 to run v1 benchmark
-    if [[ "${ENGINE_VERSION:-v0}" == "v1" ]]; then
-        export VLLM_USE_V1=1
-    fi
-
     # dependencies
     (which wget && which curl) || (apt-get update && apt-get install -y wget curl)
     (which jq) || (apt-get update && apt-get -y install jq)

.buildkite/pyproject.toml

Lines changed: 0 additions & 46 deletions
This file was deleted.

.buildkite/release-pipeline.yaml

Lines changed: 12 additions & 6 deletions
@@ -76,7 +76,7 @@ steps:
       queue: arm64_cpu_queue_postmerge
     commands:
       - "aws ecr-public get-login-password --region us-east-1 | docker login --username AWS --password-stdin public.ecr.aws/q9t5s3a7"
-      - "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg USE_SCCACHE=1 --build-arg GIT_REPO_CHECK=1 --build-arg CUDA_VERSION=12.9.1 --build-arg torch_cuda_arch_list='8.7 9.0 10.0+PTX 12.0' --build-arg INSTALL_KV_CONNECTORS=true --tag public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT-$(uname -m) --target vllm-openai --progress plain -f docker/Dockerfile ."
+      - "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg USE_SCCACHE=1 --build-arg GIT_REPO_CHECK=1 --build-arg CUDA_VERSION=12.9.1 --build-arg FLASHINFER_AOT_COMPILE=true --build-arg torch_cuda_arch_list='8.7 9.0 10.0+PTX 12.0' --build-arg INSTALL_KV_CONNECTORS=true --tag public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT-$(uname -m) --target vllm-openai --progress plain -f docker/Dockerfile ."
       - "docker push public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT-$(uname -m)"

   # Add job to create multi-arch manifest
@@ -150,11 +150,16 @@ steps:
       queue: cpu_queue_postmerge
     commands:
       - "aws ecr-public get-login-password --region us-east-1 | docker login --username AWS --password-stdin public.ecr.aws/q9t5s3a7"
-      - "docker pull public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT"
-      - "docker tag public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT vllm/vllm-openai:nightly"
-      - "docker tag public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT vllm/vllm-openai:nightly-$BUILDKITE_COMMIT"
-      - "docker push vllm/vllm-openai:nightly"
-      - "docker push vllm/vllm-openai:nightly-$BUILDKITE_COMMIT"
+      - "docker pull public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT-x86_64"
+      - "docker pull public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT-aarch64"
+      - "docker tag public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT-x86_64 vllm/vllm-openai:nightly-x86_64"
+      - "docker tag public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT-aarch64 vllm/vllm-openai:nightly-aarch64"
+      - "docker push vllm/vllm-openai:nightly-x86_64"
+      - "docker push vllm/vllm-openai:nightly-aarch64"
+      - "docker manifest create vllm/vllm-openai:nightly vllm/vllm-openai:nightly-x86_64 vllm/vllm-openai:nightly-aarch64 --amend"
+      - "docker manifest create vllm/vllm-openai:nightly-$BUILDKITE_COMMIT vllm/vllm-openai:nightly-x86_64 vllm/vllm-openai:nightly-aarch64 --amend"
+      - "docker manifest push vllm/vllm-openai:nightly"
+      - "docker manifest push vllm/vllm-openai:nightly-$BUILDKITE_COMMIT"
       # Clean up old nightly builds (keep only last 14)
       - "bash .buildkite/scripts/cleanup-nightly-builds.sh"
     plugins:
@@ -163,3 +168,4 @@ steps:
       password-env: DOCKERHUB_TOKEN
 env:
   DOCKER_BUILDKIT: "1"
+  DOCKERHUB_USERNAME: "vllmbot"
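
The nightly publish step now pushes one image per architecture and stitches them into a manifest list, so docker pull vllm/vllm-openai:nightly resolves to the matching architecture automatically. A minimal sketch of the pattern with a hypothetical image name (example/app):

docker push example/app:1.0-x86_64
docker push example/app:1.0-aarch64
docker manifest create example/app:1.0 \
    example/app:1.0-x86_64 example/app:1.0-aarch64 --amend
docker manifest push example/app:1.0

The --amend flag updates an existing manifest list instead of failing, which keeps pipeline reruns idempotent.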

.buildkite/scripts/cleanup-nightly-builds.sh

Lines changed: 26 additions & 3 deletions
@@ -8,20 +8,41 @@ set -ex
 # DockerHub API endpoint for vllm/vllm-openai repository
 REPO_API_URL="https://hub.docker.com/v2/repositories/vllm/vllm-openai/tags"

-# Get DockerHub token from environment
+# Get DockerHub credentials from environment
 if [ -z "$DOCKERHUB_TOKEN" ]; then
     echo "Error: DOCKERHUB_TOKEN environment variable is not set"
     exit 1
 fi

+if [ -z "$DOCKERHUB_USERNAME" ]; then
+    echo "Error: DOCKERHUB_USERNAME environment variable is not set"
+    exit 1
+fi
+
+# Get DockerHub bearer token
+echo "Getting DockerHub bearer token..."
+set +x
+BEARER_TOKEN=$(curl -s -X POST \
+    -H "Content-Type: application/json" \
+    -d "{\"username\": \"$DOCKERHUB_USERNAME\", \"password\": \"$DOCKERHUB_TOKEN\"}" \
+    "https://hub.docker.com/v2/users/login" | jq -r '.token')
+set -x
+
+if [ -z "$BEARER_TOKEN" ] || [ "$BEARER_TOKEN" = "null" ]; then
+    echo "Error: Failed to get DockerHub bearer token"
+    exit 1
+fi
+
 # Function to get all tags from DockerHub
 get_all_tags() {
     local page=1
     local all_tags=""

     while true; do
-        local response=$(curl -s -H "Authorization: Bearer $DOCKERHUB_TOKEN" \
+        set +x
+        local response=$(curl -s -H "Authorization: Bearer $BEARER_TOKEN" \
             "$REPO_API_URL?page=$page&page_size=100")
+        set -x

         # Get both last_updated timestamp and tag name, separated by |
         local tags=$(echo "$response" | jq -r '.results[] | select(.name | startswith("nightly-")) | "\(.last_updated)|\(.name)"')
@@ -43,7 +64,9 @@ delete_tag() {
     echo "Deleting tag: $tag_name"

     local delete_url="https://hub.docker.com/v2/repositories/vllm/vllm-openai/tags/$tag_name"
-    local response=$(curl -s -X DELETE -H "Authorization: Bearer $DOCKERHUB_TOKEN" "$delete_url")
+    set +x
+    local response=$(curl -s -X DELETE -H "Authorization: Bearer $BEARER_TOKEN" "$delete_url")
+    set -x

     if echo "$response" | jq -e '.detail' > /dev/null 2>&1; then
         echo "Warning: Failed to delete tag $tag_name: $(echo "$response" | jq -r '.detail')"

.buildkite/scripts/hardware_ci/run-cpu-test.sh

Lines changed: 2 additions & 5 deletions
@@ -58,11 +58,8 @@ function cpu_tests() {
   # pytest -x -v -s tests/kernels/attention/test_cache.py -m cpu_model
   # pytest -x -v -s tests/kernels/attention/test_mla_decode_cpu.py -m cpu_model

-  # Note: disable Bart until supports V1
-  pytest -x -v -s tests/models/language/generation -m cpu_model \
-    --ignore=tests/models/language/generation/test_bart.py
-  VLLM_CPU_SGL_KERNEL=1 pytest -x -v -s tests/models/language/generation -m cpu_model \
-    --ignore=tests/models/language/generation/test_bart.py
+  pytest -x -v -s tests/models/language/generation -m cpu_model
+  VLLM_CPU_SGL_KERNEL=1 pytest -x -v -s tests/models/language/generation -m cpu_model

   pytest -x -v -s tests/models/language/pooling -m cpu_model
   pytest -x -v -s tests/models/multimodal/generation \
Lines changed: 191 additions & 0 deletions
@@ -0,0 +1,191 @@
+#!/bin/bash
+
+# This script builds the Ascend NPU docker image and runs the offline inference inside the container.
+# It serves as a sanity check for compilation and basic model usage.
+set -ex
+
+# Base ubuntu image with basic ascend development libraries and python installed
+VLLM_ASCEND_REPO="https://github.com/vllm-project/vllm-ascend.git"
+CONFIG_FILE_REMOTE_PATH="tests/e2e/vllm_interface/vllm_test.cfg"
+TEST_RUN_CONFIG_FILE="vllm_test.cfg"
+VLLM_ASCEND_TMP_DIR=
+# Get the test run configuration file from the vllm-ascend repository
+fetch_vllm_test_cfg() {
+    VLLM_ASCEND_TMP_DIR=$(mktemp -d)
+    # Ensure that the temporary directory is cleaned up when an exception occurs during configuration file retrieval
+    cleanup() {
+        rm -rf "${VLLM_ASCEND_TMP_DIR}"
+    }
+    trap cleanup EXIT
+
+    GIT_TRACE=1 git clone -v --depth 1 "${VLLM_ASCEND_REPO}" "${VLLM_ASCEND_TMP_DIR}"
+    if [ ! -f "${VLLM_ASCEND_TMP_DIR}/${CONFIG_FILE_REMOTE_PATH}" ]; then
+        echo "Error: file '${CONFIG_FILE_REMOTE_PATH}' does not exist in the repository" >&2
+        exit 1
+    fi
+
+    # If the file already exists locally, just overwrite it
+    cp "${VLLM_ASCEND_TMP_DIR}/${CONFIG_FILE_REMOTE_PATH}" "${TEST_RUN_CONFIG_FILE}"
+    echo "Copied ${CONFIG_FILE_REMOTE_PATH} to ${TEST_RUN_CONFIG_FILE}"
+
+    # The trap above is overwritten later, and by this point its cleanup duty is done,
+    # so remove the temporary directory manually and clear the trap.
+    rm -rf "${VLLM_ASCEND_TMP_DIR}"
+    trap - EXIT
+}
+
+# Loads the previously fetched test run configuration file
+# into the current script environment.
+get_config() {
+    if [ ! -f "${TEST_RUN_CONFIG_FILE}" ]; then
+        echo "Error: file '${TEST_RUN_CONFIG_FILE}' does not exist" >&2
+        exit 1
+    fi
+    source "${TEST_RUN_CONFIG_FILE}"
+    echo "Base docker image name from configuration: ${BASE_IMAGE_NAME}"
+    return 0
+}
+
+# Get the test running configuration.
+fetch_vllm_test_cfg
+get_config
+# Check if the function call was successful. If not, exit the script.
+if [ $? -ne 0 ]; then
+    exit 1
+fi
+
+image_name="npu/vllm-ci:${BUILDKITE_COMMIT}_${EPOCHSECONDS}"
+container_name="npu_${BUILDKITE_COMMIT}_$(tr -dc A-Za-z0-9 < /dev/urandom | head -c 10; echo)"
+
+# BUILDKITE_AGENT_NAME format is {hostname}-{agent_idx}-{npu_card_num}cards
+agent_idx=$(echo "${BUILDKITE_AGENT_NAME}" | awk -F'-' '{print $(NF-1)}')
+echo "agent_idx: ${agent_idx}"
+builder_name="cachebuilder${agent_idx}"
+builder_cache_dir="/mnt/docker-cache${agent_idx}"
+mkdir -p ${builder_cache_dir}
+
+# Try building the docker image
+cat <<EOF | DOCKER_BUILDKIT=1 docker build \
+    --add-host cache-service-vllm.nginx-pypi-cache.svc.cluster.local:${PYPI_CACHE_HOST} \
+    --builder ${builder_name} --cache-from type=local,src=${builder_cache_dir} \
+    --cache-to type=local,dest=${builder_cache_dir},mode=max \
+    --progress=plain --load -t ${image_name} -f - .
+FROM ${BASE_IMAGE_NAME}
+
+# Define environments
+ENV DEBIAN_FRONTEND=noninteractive
+
+RUN pip config set global.index-url http://cache-service-vllm.nginx-pypi-cache.svc.cluster.local:${PYPI_CACHE_PORT}/pypi/simple && \
+    pip config set global.trusted-host cache-service-vllm.nginx-pypi-cache.svc.cluster.local && \
+    apt-get update -y && \
+    apt-get install -y python3-pip git vim wget net-tools gcc g++ cmake libnuma-dev && \
+    rm -rf /var/cache/apt/* && \
+    rm -rf /var/lib/apt/lists/*
+
+# Install pytest up front so this docker build cache layer stays valid
+RUN --mount=type=cache,target=/root/.cache/pip \
+    pip install pytest>=6.0 modelscope
+
+WORKDIR /workspace/vllm
+
+# Install vLLM dependencies in advance. Effect: as long as common.txt remains unchanged, the docker cache layer stays valid.
+COPY requirements/common.txt /workspace/vllm/requirements/common.txt
+RUN --mount=type=cache,target=/root/.cache/pip \
+    pip install -r requirements/common.txt
+
+COPY . .
+
+# Install vLLM
+RUN --mount=type=cache,target=/root/.cache/pip \
+    VLLM_TARGET_DEVICE="empty" python3 -m pip install -v -e /workspace/vllm/ --extra-index https://download.pytorch.org/whl/cpu/ && \
+    python3 -m pip uninstall -y triton
+
+# Install vllm-ascend
+WORKDIR /workspace
+ARG VLLM_ASCEND_REPO=https://github.com/vllm-project/vllm-ascend.git
+ARG VLLM_ASCEND_TAG=main
+RUN git config --global url."https://gh-proxy.test.osinfra.cn/https://github.com/".insteadOf "https://github.com/" && \
+    git clone --depth 1 \$VLLM_ASCEND_REPO --branch \$VLLM_ASCEND_TAG /workspace/vllm-ascend
+
+# Install vllm-ascend dependencies in advance. Effect: as long as requirements.txt remains unchanged, the docker cache layer stays valid.
+RUN --mount=type=cache,target=/root/.cache/pip \
+    pip install -r /workspace/vllm-ascend/requirements.txt
+
+RUN --mount=type=cache,target=/root/.cache/pip \
+    export PIP_EXTRA_INDEX_URL=https://mirrors.huaweicloud.com/ascend/repos/pypi && \
+    source /usr/local/Ascend/ascend-toolkit/set_env.sh && \
+    source /usr/local/Ascend/nnal/atb/set_env.sh && \
+    export LD_LIBRARY_PATH=\$LD_LIBRARY_PATH:/usr/local/Ascend/ascend-toolkit/latest/`uname -i`-linux/devlib && \
+    python3 -m pip install -v -e /workspace/vllm-ascend/ --extra-index https://download.pytorch.org/whl/cpu/
+
+ENV VLLM_WORKER_MULTIPROC_METHOD=spawn
+ENV VLLM_USE_MODELSCOPE=True
+
+WORKDIR /workspace/vllm-ascend
+
+CMD ["/bin/bash"]
+
+EOF
+
+# Setup cleanup
+remove_docker_container() {
+    docker rm -f "${container_name}" || true;
+    docker image rm -f "${image_name}" || true;
+    docker system prune -f || true;
+}
+trap remove_docker_container EXIT
+
+# Generate corresponding --device args based on BUILDKITE_AGENT_NAME
+# Ascend NPU BUILDKITE_AGENT_NAME format is {hostname}-{agent_idx}-{npu_card_num}cards, and agent_idx starts from 1.
+# e.g. atlas-a2-001-1-2cards means this is the 1st agent on the atlas-a2-001 host, and it has 2 NPU cards.
+# returns --device /dev/davinci0 --device /dev/davinci1
+parse_and_gen_devices() {
+    local input="$1"
+    local index cards_num
+    if [[ "$input" =~ ([0-9]+)-([0-9]+)cards$ ]]; then
+        index="${BASH_REMATCH[1]}"
+        cards_num="${BASH_REMATCH[2]}"
+    else
+        echo "parse error" >&2
+        return 1
+    fi
+
+    local devices=""
+    local i=0
+    while (( i < cards_num )); do
+        local dev_idx=$(((index - 1)*cards_num + i ))
+        devices="$devices --device /dev/davinci${dev_idx}"
+        ((i++))
+    done
+
+    # trim leading space
+    devices="${devices#"${devices%%[![:space:]]*}"}"
+    # Output devices: printed for the caller to capture
+    printf '%s' "$devices"
+}
+
+devices=$(parse_and_gen_devices "${BUILDKITE_AGENT_NAME}") || exit 1
+
+# Run the image and execute the Out-Of-Tree (OOT) platform interface test case on Ascend NPU hardware.
+# This test checks whether the OOT platform interface is functioning properly in conjunction with
+# the hardware plugin vllm-ascend.
+model_cache_dir=/mnt/modelscope${agent_idx}
+mkdir -p ${model_cache_dir}
+docker run \
+    ${devices} \
+    --device /dev/davinci_manager \
+    --device /dev/devmm_svm \
+    --device /dev/hisi_hdc \
+    -v /usr/local/dcmi:/usr/local/dcmi \
+    -v /usr/local/bin/npu-smi:/usr/local/bin/npu-smi \
+    -v /usr/local/Ascend/driver/lib64/:/usr/local/Ascend/driver/lib64/ \
+    -v /usr/local/Ascend/driver/version.info:/usr/local/Ascend/driver/version.info \
+    -v /etc/ascend_install.info:/etc/ascend_install.info \
+    -v ${model_cache_dir}:/root/.cache/modelscope \
+    --entrypoint="" \
+    --name "${container_name}" \
+    "${image_name}" \
+    bash -c '
+        set -e
+        pytest -v -s tests/e2e/vllm_interface/
+    '
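
The device-mapping logic above gives each agent a disjoint block of NPUs: a 1-based agent index i with c cards maps to /dev/davinci((i-1)*c) through /dev/davinci((i-1)*c + c - 1). A quick check of parse_and_gen_devices with a made-up agent name:

# "atlas-a2-001-2-4cards" is a hypothetical name: 2nd agent, 4 cards.
devices=$(parse_and_gen_devices "atlas-a2-001-2-4cards")
echo "$devices"
# -> --device /dev/davinci4 --device /dev/davinci5 --device /dev/davinci6 --device /dev/davinci7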
