serve.yaml
envs:
  MODEL_NAME: decapoda-research/llama-65b-hf
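  # MODEL_NAME is exported into both `setup` and `run` below; it can be
  # overridden at launch time, e.g. `sky launch serve.yaml --env MODEL_NAME=<hf-model-id>`
  # (`<hf-model-id>` is a placeholder; the model must fit on the requested GPUs).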

resources:
  accelerators: A100-80GB:8
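  # A single node with 8x A100-80GB. SkyPilot exposes the per-node GPU count
  # to `run` as $SKYPILOT_NUM_GPUS_PER_NODE, which sets --tensor-parallel-size below.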

setup: |
  conda activate vllm
  if [ $? -ne 0 ]; then
    conda create -n vllm python=3.10 -y
    conda activate vllm
  fi
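  # (The guard above keeps setup idempotent: SkyPilot may re-run `setup` on an
  # existing cluster, and env creation is skipped if activation succeeds.)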

  # Clone the vLLM repo (the Gradio demo lives in its examples/) and install
  # the serving dependencies.
  git clone https://github.com/vllm-project/vllm.git || true
  pip install transformers==4.38.0
  pip install vllm==0.3.2
  pip install gradio
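  # These pins are a known-good combination for this example; newer vllm
  # releases may change or remove the api_server entrypoint used below.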

run: |
  conda activate vllm

  echo 'Starting vllm api server...'
  python -u -m vllm.entrypoints.api_server \
    --model $MODEL_NAME \
    --tensor-parallel-size $SKYPILOT_NUM_GPUS_PER_NODE \
    --tokenizer hf-internal-testing/llama-tokenizer 2>&1 | tee api_server.log &
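  # The server is backgrounded; its output is tee'd to api_server.log so the
  # loop below can poll for Uvicorn's readiness message.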
  echo 'Waiting for vllm api server to start...'
  while ! grep -q 'Uvicorn running on' api_server.log; do sleep 1; done

  echo 'Starting gradio server...'
  python vllm/examples/gradio_webserver.py
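
# Usage (a sketch, assuming SkyPilot is installed and cloud credentials are
# configured):
#   sky launch -c vllm-serve serve.yaml
# The Gradio app prints its URL in the job logs. The raw API server can also
# be queried directly on its default port 8000, e.g.:
#   curl http://<head-node-ip>:8000/generate \
#     -d '{"prompt": "San Francisco is a", "max_tokens": 32}'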