diff --git a/.buildkite/run-neuron-test.sh b/.buildkite/run-neuron-test.sh
index 9259391aaed49..aa29c434e7cfb 100644
--- a/.buildkite/run-neuron-test.sh
+++ b/.buildkite/run-neuron-test.sh
@@ -3,6 +3,18 @@
 # This script build the Neuron docker image and run the API server inside the container.
 # It serves a sanity check for compilation and basic model usage.
 set -e
+set -v
+
+image_name="neuron/vllm-ci"
+container_name="neuron_$(tr -dc A-Za-z0-9 < /dev/urandom | head -c 10; echo)"
+
+HF_CACHE="$(realpath ~)/huggingface"
+mkdir -p "${HF_CACHE}"
+HF_MOUNT="/root/.cache/huggingface"
+
+NEURON_COMPILE_CACHE_URL="$(realpath ~)/neuron_compile_cache"
+mkdir -p "${NEURON_COMPILE_CACHE_URL}"
+NEURON_COMPILE_CACHE_MOUNT="/root/.cache/neuron_compile_cache"
 
 # Try building the docker image
 aws ecr get-login-password --region us-west-2 | docker login --username AWS --password-stdin 763104351884.dkr.ecr.us-west-2.amazonaws.com
@@ -13,41 +25,30 @@ if [ -f /tmp/neuron-docker-build-timestamp ]; then
     last_build=$(cat /tmp/neuron-docker-build-timestamp)
     current_time=$(date +%s)
     if [ $((current_time - last_build)) -gt 86400 ]; then
+        docker image prune -f
         docker system prune -f
+        rm -rf "${HF_MOUNT:?}/*"
+        rm -rf "${NEURON_COMPILE_CACHE_MOUNT:?}/*"
         echo "$current_time" > /tmp/neuron-docker-build-timestamp
     fi
 else
     date "+%s" > /tmp/neuron-docker-build-timestamp
 fi
 
-docker build -t neuron -f Dockerfile.neuron .
+docker build -t "${image_name}" -f Dockerfile.neuron .
 
 # Setup cleanup
-remove_docker_container() { docker rm -f neuron || true; }
+remove_docker_container() {
+    docker image rm -f "${image_name}" || true;
+}
 trap remove_docker_container EXIT
-remove_docker_container
 
 # Run the image
-docker run --device=/dev/neuron0 --device=/dev/neuron1 --network host --name neuron neuron python3 -m vllm.entrypoints.api_server \
-    --model TinyLlama/TinyLlama-1.1B-Chat-v1.0 --max-num-seqs 8 --max-model-len 128 --block-size 128 --device neuron --tensor-parallel-size 2 &
-
-# Wait for the server to start
-wait_for_server_to_start() {
-    timeout=300
-    counter=0
-
-    while [ "$(curl -s -o /dev/null -w '%{http_code}' localhost:8000/health)" != "200" ]; do
-        sleep 1
-        counter=$((counter + 1))
-        if [ $counter -ge $timeout ]; then
-            echo "Timeout after $timeout seconds"
-            break
-        fi
-    done
-}
-wait_for_server_to_start
-
-# Test a simple prompt
-curl -X POST -H "Content-Type: application/json" \
-    localhost:8000/generate \
-    -d '{"prompt": "San Francisco is a"}'
+docker run --rm -it --device=/dev/neuron0 --device=/dev/neuron1 --network host \
+    -v "${HF_CACHE}:${HF_MOUNT}" \
+    -e "HF_HOME=${HF_MOUNT}" \
+    -v "${NEURON_COMPILE_CACHE_URL}:${NEURON_COMPILE_CACHE_MOUNT}" \
+    -e "NEURON_COMPILE_CACHE_URL=${NEURON_COMPILE_CACHE_MOUNT}" \
+    --name "${container_name}" \
+    ${image_name} \
+    /bin/bash -c "python3 /workspace/vllm/examples/offline_inference_neuron.py"
diff --git a/Dockerfile.neuron b/Dockerfile.neuron
index 269139fe90f0b..e9cb82889decd 100644
--- a/Dockerfile.neuron
+++ b/Dockerfile.neuron
@@ -15,8 +15,8 @@ RUN apt-get update && \
     ffmpeg libsm6 libxext6 libgl1
 
 ### Mount Point ###
-# When launching the container, mount the code directory to /app
-ARG APP_MOUNT=/app
+# When launching the container, mount the code directory to /workspace
+ARG APP_MOUNT=/workspace
 VOLUME [ ${APP_MOUNT} ]
 WORKDIR ${APP_MOUNT}/vllm
 
@@ -25,6 +25,7 @@ RUN python3 -m pip install --no-cache-dir fastapi ninja tokenizers pandas
 RUN python3 -m pip install sentencepiece transformers==4.45.2 -U
 RUN python3 -m pip install transformers-neuronx --extra-index-url=https://pip.repos.neuron.amazonaws.com -U
 RUN python3 -m pip install neuronx-cc==2.16.345.0 --extra-index-url=https://pip.repos.neuron.amazonaws.com -U
+RUN python3 -m pip install pytest
 
 COPY . .
 ARG GIT_REPO_CHECK=0
@@ -42,4 +43,7 @@ RUN --mount=type=bind,source=.git,target=.git \
 # install development dependencies (for testing)
 RUN python3 -m pip install -e tests/vllm_test_utils
 
+# overwrite entrypoint to run bash script
+RUN echo "import subprocess; import sys; subprocess.check_call(sys.argv[1:])" > /usr/local/bin/dockerd-entrypoint.py
+
 CMD ["/bin/bash"]
diff --git a/examples/offline_inference_neuron.py b/examples/offline_inference_neuron.py
index 2856be7c864ea..f098c8e5fed1e 100644
--- a/examples/offline_inference_neuron.py
+++ b/examples/offline_inference_neuron.py
@@ -1,12 +1,5 @@
-import os
-
 from vllm import LLM, SamplingParams
 
-# creates XLA hlo graphs for all the context length buckets.
-os.environ['NEURON_CONTEXT_LENGTH_BUCKETS'] = "128,512,1024,2048"
-# creates XLA hlo graphs for all the token gen buckets.
-os.environ['NEURON_TOKEN_GEN_BUCKETS'] = "128,512,1024,2048"
-
 # Sample prompts.
 prompts = [
     "Hello, my name is",
@@ -26,8 +19,8 @@
     # Currently, this is a known limitation in continuous batching support
     # in transformers-neuronx.
     # TODO(liangfu): Support paged-attention in transformers-neuronx.
-    max_model_len=2048,
-    block_size=2048,
+    max_model_len=1024,
+    block_size=1024,
     # The device can be automatically detected when AWS Neuron SDK is installed.
     # The device argument can be either unspecified for automated detection,
     # or explicitly assigned.
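
For reference, a sketch of examples/offline_inference_neuron.py as it should read after the two hunks above are applied. The lines outside the hunks (the sampling params, the LLM constructor scaffolding, and the print loop) are assumed from the surrounding context of the upstream vLLM example and are not part of this diff:

from vllm import LLM, SamplingParams

# Sample prompts.
prompts = [
    "Hello, my name is",
    "The president of the United States is",
    "The capital of France is",
    "The future of AI is",
]
# Create a sampling params object (assumed values from the upstream example).
sampling_params = SamplingParams(temperature=0.8, top_p=0.95)

# Create an LLM targeting the Neuron device.
llm = LLM(
    model="TinyLlama/TinyLlama-1.1B-Chat-v1.0",
    max_num_seqs=8,
    # The max_model_len and block_size arguments are required to be the same
    # as the max sequence length when targeting the neuron device.
    # Currently, this is a known limitation in continuous batching support
    # in transformers-neuronx.
    # TODO(liangfu): Support paged-attention in transformers-neuronx.
    max_model_len=1024,
    block_size=1024,
    # The device can be automatically detected when AWS Neuron SDK is installed.
    # The device argument can be either unspecified for automated detection,
    # or explicitly assigned.
    device="neuron",
    tensor_parallel_size=2)
# Generate texts from the prompts; the returned RequestOutput objects
# contain the prompt, the generated text, and other information.
outputs = llm.generate(prompts, sampling_params)
for output in outputs:
    print(f"Prompt: {output.prompt!r}, "
          f"Generated text: {output.outputs[0].text!r}")

Note that tensor_parallel_size=2 matches the two Neuron cores the CI script exposes to the container via --device=/dev/neuron0 --device=/dev/neuron1.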