Skip to content

Commit ae3d612

Browse files
Merge pull request vllm-project#26 from HabanaAI/habana_main_rebase_cc466a3
Rebase habana_main up to cc466a3
2 parents a115250 + 61b7763 commit ae3d612

File tree

434 files changed

+47902
-8750
lines changed

Some content is hidden

Large Commits have some content hidden by default. Use the searchbox below for content that may be hidden.

434 files changed

+47902
-8750
lines changed

.buildkite/check-wheel-size.py

Lines changed: 36 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,36 @@
1+
import os
2+
import zipfile
3+
4+
MAX_SIZE_MB = 100
5+
6+
7+
def print_top_10_largest_files(zip_file):
    """Print the ten largest members of *zip_file*, biggest first.

    Sizes are reported uncompressed, in megabytes. *zip_file* may be a
    path or any file-like object accepted by :class:`zipfile.ZipFile`.
    """
    with zipfile.ZipFile(zip_file, 'r') as archive:
        # Pair each member with its uncompressed size, largest first.
        sizes = sorted(
            ((name, archive.getinfo(name).file_size)
             for name in archive.namelist()),
            key=lambda pair: pair[1],
            reverse=True,
        )
        for name, size in sizes[:10]:
            print(f"{name}: {size/(1024*1024)} MBs uncompressed.")
13+
14+
15+
def check_wheel_size(directory):
    """Check every ``.whl`` file under *directory* against ``MAX_SIZE_MB``.

    Walks the tree recursively, reports each wheel's size, and dumps the
    largest members of any oversized wheel via
    ``print_top_10_largest_files``.

    Returns:
        int: 1 if any wheel exceeds ``MAX_SIZE_MB`` (suitable as a process
        exit code), otherwise 0. An empty tree yields 0.

    Bug fix: the original returned 0 as soon as the *first* compliant
    wheel was seen, so any additional wheels — including oversized
    ones — were never checked.
    """
    exit_code = 0
    for root, _, files in os.walk(directory):
        for f in files:
            if not f.endswith(".whl"):
                continue
            wheel_path = os.path.join(root, f)
            wheel_size = os.path.getsize(wheel_path)
            wheel_size_mb = wheel_size / (1024 * 1024)
            if wheel_size_mb > MAX_SIZE_MB:
                print(
                    f"Wheel {wheel_path} is too large ({wheel_size_mb} MB) "
                    f"compare to the allowed size ({MAX_SIZE_MB} MB).")
                print_top_10_largest_files(wheel_path)
                # Keep scanning so every offending wheel gets reported.
                exit_code = 1
            else:
                print(f"Wheel {wheel_path} is within the allowed size "
                      f"({wheel_size_mb} MB).")
    return exit_code
32+
33+
34+
if __name__ == "__main__":
    # Exit with check_wheel_size's status so CI fails on oversized wheels.
    import sys

    raise SystemExit(check_wheel_size(sys.argv[1]))

.buildkite/download-images.sh

Lines changed: 18 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,18 @@
1+
#!/bin/bash

set -ex
set -o pipefail

# Make sure a download tool is available before fetching anything.
(which wget && which curl) || (apt-get update && apt-get install -y wget curl)

# aws s3 sync s3://air-example-data-2/vllm_opensource_llava/ images/
base_url=https://air-example-data-2.s3.us-west-2.amazonaws.com/vllm_opensource_llava

mkdir -p images
cd images
# Fetch the LLaVA test fixtures (tensors + source images), one by one.
for asset in \
    stop_sign_pixel_values.pt \
    stop_sign_image_features.pt \
    cherry_blossom_pixel_values.pt \
    cherry_blossom_image_features.pt \
    stop_sign.jpg \
    cherry_blossom.jpg; do
    wget "${base_url}/${asset}"
done

cd -

.buildkite/run-amd-test.sh

Lines changed: 35 additions & 29 deletions
Original file line numberDiff line numberDiff line change
@@ -1,38 +1,44 @@
1-
# This script build the ROCm docker image and run the API server inside the container.
2-
# It serves a sanity check for compilation and basic model usage.
1+
# This script builds the ROCm docker image and runs tests inside it.
32
set -ex
43

54
# Print ROCm version
5+
echo "--- ROCm info"
66
rocminfo
77

8-
# Try building the docker image
9-
docker build -t rocm -f Dockerfile.rocm .
8+
echo "--- Resetting GPUs"
109

11-
# Setup cleanup
12-
remove_docker_container() { docker rm -f rocm || true; }
13-
trap remove_docker_container EXIT
14-
remove_docker_container
15-
16-
# Run the image
17-
docker run --device /dev/kfd --device /dev/dri --network host --name rocm rocm python3 -m vllm.entrypoints.api_server &
18-
19-
# Wait for the server to start
20-
wait_for_server_to_start() {
21-
timeout=300
22-
counter=0
23-
24-
while [ "$(curl -s -o /dev/null -w ''%{http_code}'' localhost:8000/health)" != "200" ]; do
25-
sleep 1
26-
counter=$((counter + 1))
27-
if [ $counter -ge $timeout ]; then
28-
echo "Timeout after $timeout seconds"
29-
break
10+
echo "reset" > /opt/amdgpu/etc/gpu_state
11+
12+
while true; do
13+
sleep 3
14+
if grep -q clean /opt/amdgpu/etc/gpu_state; then
15+
echo "GPUs state is \"clean\""
16+
break
3017
fi
31-
done
18+
done
19+
20+
echo "--- Building container"
21+
sha=$(git rev-parse --short HEAD)
22+
container_name=rocm_${sha}
23+
docker build \
24+
-t ${container_name} \
25+
-f Dockerfile.rocm \
26+
--progress plain \
27+
.
28+
29+
remove_docker_container() {
30+
docker rm -f ${container_name} || docker image rm -f ${container_name} || true
3231
}
33-
wait_for_server_to_start
32+
trap remove_docker_container EXIT
33+
34+
echo "--- Running container"
35+
36+
docker run \
37+
--device /dev/kfd --device /dev/dri \
38+
--network host \
39+
--rm \
40+
-e HF_TOKEN \
41+
--name ${container_name} \
42+
${container_name} \
43+
/bin/bash -c "${@}"
3444

35-
# Test a simple prompt
36-
curl -X POST -H "Content-Type: application/json" \
37-
localhost:8000/generate \
38-
-d '{"prompt": "San Francisco is a"}'

.buildkite/run-benchmarks.sh

Lines changed: 11 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -23,8 +23,9 @@ wget https://huggingface.co/datasets/anon8231489123/ShareGPT_Vicuna_unfiltered/r
2323
# wait for server to start, timeout after 600 seconds
2424
timeout 600 bash -c 'until curl localhost:8000/v1/models; do sleep 1; done' || exit 1
2525
python3 benchmarks/benchmark_serving.py \
26-
--backend openai \
27-
--dataset ./ShareGPT_V3_unfiltered_cleaned_split.json \
26+
--backend vllm \
27+
--dataset-name sharegpt \
28+
--dataset-path ./ShareGPT_V3_unfiltered_cleaned_split.json \
2829
--model meta-llama/Llama-2-7b-chat-hf \
2930
--num-prompts 20 \
3031
--endpoint /v1/completions \
@@ -48,7 +49,14 @@ sed -n '$p' benchmark_throughput.txt >> benchmark_results.md # last line
4849
echo "### Serving Benchmarks" >> benchmark_results.md
4950
sed -n '1p' benchmark_serving.txt >> benchmark_results.md # first line
5051
echo "" >> benchmark_results.md
51-
tail -n 13 benchmark_serving.txt >> benchmark_results.md # last 13 lines
52+
echo '```' >> benchmark_results.md
53+
tail -n 20 benchmark_serving.txt >> benchmark_results.md # last 20 lines
54+
echo '```' >> benchmark_results.md
55+
56+
# if the agent binary is not found, skip uploading the results and exit 0
57+
if [ ! -f /workspace/buildkite-agent ]; then
58+
exit 0
59+
fi
5260

5361
# upload the results to buildkite
5462
/workspace/buildkite-agent annotate --style "info" --context "benchmark-results" < benchmark_results.md

.buildkite/run-cpu-test.sh

Lines changed: 14 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,14 @@
1+
# This script builds the CPU docker image and runs offline inference inside
# the container. It serves as a sanity check for compilation and basic
# model usage.
set -ex

# Build the docker image.
docker build -t cpu-test -f Dockerfile.cpu .

# Remove any leftover container now, and again when the script exits.
remove_docker_container() { docker rm -f cpu-test || true; }
trap remove_docker_container EXIT
remove_docker_container

# Launch offline inference inside the freshly built image.
docker run --network host --env VLLM_CPU_KVCACHE_SPACE=1 --name cpu-test cpu-test python3 examples/offline_inference.py

.buildkite/run-neuron-test.sh

Lines changed: 51 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,51 @@
1+
# This script builds the Neuron docker image and runs the API server inside
# the container. It serves as a sanity check for compilation and basic
# model usage.
set -e

# Authenticate against the AWS ECR registry hosting the Neuron base image.
aws ecr get-login-password --region us-west-2 | docker login --username AWS --password-stdin 763104351884.dkr.ecr.us-west-2.amazonaws.com

# Prune old images and containers to save disk space, but at most once a
# day, tracked via a timestamp file in /tmp.
stamp=/tmp/neuron-docker-build-timestamp
if [ -f "$stamp" ]; then
    previous=$(cat "$stamp")
    now=$(date +%s)
    if [ $((now - previous)) -gt 86400 ]; then
        docker system prune -f
        echo "$now" > "$stamp"
    fi
else
    date +%s > "$stamp"
fi

docker build -t neuron -f Dockerfile.neuron .

# Remove any leftover container now, and again when the script exits.
remove_docker_container() { docker rm -f neuron || true; }
trap remove_docker_container EXIT
remove_docker_container

# Start the API server in the background on both Neuron devices.
docker run --device=/dev/neuron0 --device=/dev/neuron1 --network host --name neuron neuron python3 -m vllm.entrypoints.api_server \
       --model TinyLlama/TinyLlama-1.1B-Chat-v1.0 --max-num-seqs 8 --max-model-len 128 --block-size 128 --device neuron --tensor-parallel-size 2 &

# Poll the health endpoint once per second until the server answers 200,
# giving up (but continuing the script) after 300 seconds.
wait_for_server_to_start() {
    deadline=300
    waited=0

    until [ "$(curl -s -o /dev/null -w '%{http_code}' localhost:8000/health)" = "200" ]; do
        sleep 1
        waited=$((waited + 1))
        if [ $waited -ge $deadline ]; then
            echo "Timeout after $deadline seconds"
            break
        fi
    done
}
wait_for_server_to_start

# Issue a simple generation request as a smoke test.
curl -X POST -H "Content-Type: application/json" \
    localhost:8000/generate \
    -d '{"prompt": "San Francisco is a"}'

.buildkite/test-pipeline.yaml

Lines changed: 63 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -12,67 +12,120 @@ steps:
1212
command: pytest -v -s async_engine
1313

1414
- label: Basic Correctness Test
15-
command: pytest -v -s --forked basic_correctness
15+
commands:
16+
- VLLM_ATTENTION_BACKEND=XFORMERS pytest -v -s basic_correctness/test_basic_correctness.py
17+
- VLLM_ATTENTION_BACKEND=FLASH_ATTN pytest -v -s basic_correctness/test_basic_correctness.py
18+
- VLLM_ATTENTION_BACKEND=XFORMERS pytest -v -s basic_correctness/test_chunked_prefill.py
19+
- VLLM_ATTENTION_BACKEND=FLASH_ATTN pytest -v -s basic_correctness/test_chunked_prefill.py
20+
- VLLM_TEST_ENABLE_ARTIFICIAL_PREEMPT=1 pytest -v -s basic_correctness/test_preemption.py
1621

1722
- label: Core Test
23+
mirror_hardwares: [amd]
1824
command: pytest -v -s core
1925

2026
- label: Distributed Comm Ops Test
21-
command: pytest -v -s --forked test_comm_ops.py
27+
command: pytest -v -s test_comm_ops.py
2228
working_dir: "/vllm-workspace/tests/distributed"
23-
num_gpus: 2 # only support 1 or 2 for now.
29+
num_gpus: 2
2430

25-
- label: Distributed Correctness Test
26-
command: pytest -v -s --forked test_basic_distributed_correctness.py
31+
- label: Distributed Tests
2732
working_dir: "/vllm-workspace/tests/distributed"
33+
2834
num_gpus: 2 # only support 1 or 2 for now.
35+
mirror_hardwares: [amd]
36+
37+
commands:
38+
- pytest -v -s test_pynccl_library.py
39+
- TEST_DIST_MODEL=facebook/opt-125m pytest -v -s test_basic_distributed_correctness.py
40+
- TEST_DIST_MODEL=meta-llama/Llama-2-7b-hf pytest -v -s test_basic_distributed_correctness.py
41+
- TEST_DIST_MODEL=facebook/opt-125m pytest -v -s test_chunked_prefill_distributed.py
42+
- TEST_DIST_MODEL=meta-llama/Llama-2-7b-hf pytest -v -s test_chunked_prefill_distributed.py
43+
44+
- label: Distributed Tests (Multiple Groups)
45+
working_dir: "/vllm-workspace/tests/distributed"
46+
num_gpus: 4
47+
commands:
48+
- pytest -v -s test_pynccl.py
2949

3050
- label: Engine Test
31-
command: pytest -v -s engine tokenization test_sequence.py test_config.py
51+
#mirror_hardwares: [amd]
52+
command: pytest -v -s engine tokenization test_sequence.py test_config.py test_logger.py
3253

3354
- label: Entrypoints Test
34-
command: pytest -v -s entrypoints
55+
commands:
56+
# these tests have to be separated, because each one will allocate all possible GPU memory
57+
- pytest -v -s entrypoints --ignore=entrypoints/test_server_oot_registration.py
58+
- pytest -v -s entrypoints/test_server_oot_registration.py
59+
60+
- label: Examples Test
61+
working_dir: "/vllm-workspace/examples"
62+
mirror_hardwares: [amd]
63+
commands:
64+
# install aws cli for llava_example.py
65+
- pip install awscli
66+
- python3 offline_inference.py
67+
- python3 offline_inference_with_prefix.py
68+
- python3 llm_engine_example.py
69+
- python3 llava_example.py
3570

3671
- label: Kernels Test %N
3772
command: pytest -v -s kernels --shard-id=$$BUILDKITE_PARALLEL_JOB --num-shards=$$BUILDKITE_PARALLEL_JOB_COUNT
3873
parallelism: 4
3974

4075
- label: Models Test
76+
#mirror_hardwares: [amd]
4177
commands:
42-
- pytest -v -s models --forked
43-
soft_fail: true
78+
- bash ../.buildkite/download-images.sh
79+
- pytest -v -s models --ignore=models/test_llava.py --ignore=models/test_mistral.py
80+
81+
- label: Llava Test
82+
#mirror_hardwares: [amd]
83+
commands:
84+
- bash ../.buildkite/download-images.sh
85+
- pytest -v -s models/test_llava.py
4486

4587
- label: Prefix Caching Test
88+
mirror_hardwares: [amd]
4689
commands:
4790
- pytest -v -s prefix_caching
4891

4992
- label: Samplers Test
5093
command: pytest -v -s samplers
5194

5295
- label: LogitsProcessor Test
96+
mirror_hardwares: [amd]
5397
command: pytest -v -s test_logits_processor.py
5498

5599
- label: Worker Test
100+
mirror_hardwares: [amd]
56101
command: pytest -v -s worker
57102

58103
- label: Speculative decoding tests
104+
#mirror_hardwares: [amd]
59105
command: pytest -v -s spec_decode
60106

61107
- label: LoRA Test %N
62108
command: pytest -v -s lora --shard-id=$$BUILDKITE_PARALLEL_JOB --num-shards=$$BUILDKITE_PARALLEL_JOB_COUNT
63109
parallelism: 4
64110

111+
- label: Tensorizer Test
112+
command: apt-get install curl libsodium23 && pytest -v -s tensorizer_loader
113+
65114
- label: Metrics Test
66115
command: pytest -v -s metrics
67116

117+
- label: Quantization Test
118+
command: pytest -v -s quantization
119+
68120
- label: Benchmarks
69121
working_dir: "/vllm-workspace/.buildkite"
122+
mirror_hardwares: [amd]
70123
commands:
71124
- pip install aiohttp
72125
- bash run-benchmarks.sh
73126

74127
- label: Documentation Build
75-
working_dir: "/vllm-workspace/docs"
128+
working_dir: "/vllm-workspace/test_docs/docs"
76129
no_gpu: True
77130
commands:
78131
- pip install -r requirements-docs.txt

0 commit comments

Comments
 (0)