# llm/llama-2/chatbot-hf.yaml

resources:
  # Accelerator request in SkyPilot's '<name>:<count>' form.
  accelerators: 'A100:1'
  # Quoted so the trailing '+' is unambiguously part of a string value.
  # NOTE(review): presumably "at least 32 GB" per SkyPilot's resource
  # syntax -- confirm against the SkyPilot YAML spec.
  memory: '32+'
  # NOTE(review): unit is presumably GB -- confirm against the SkyPilot spec.
  disk_size: 1024
  disk_tier: best

envs:
  # Interpolated into the model path
  # meta-llama/Llama-2-${MODEL_SIZE}b-chat-hf by the run section.
  # Quoted so it is a string (environment variables are strings), not an int.
  MODEL_SIZE: '7'
  # TODO: Fill with your own huggingface token, or use --env to pass.
  # Explicit null instead of a bare empty value, so the "unset" intent is
  # unambiguous to both readers and linters.
  HF_TOKEN: null

setup: |
  # Fail fast with a clear message instead of running the whole install and
  # only then failing at the Hugging Face login step.
  if [ -z "${HF_TOKEN}" ]; then
    echo 'HF_TOKEN is not set; fill it in under envs or pass it with --env.' >&2
    exit 1
  fi

  # Reuse the 'chatbot' conda environment when it already exists; otherwise
  # create it (Python 3.9) and activate it.
  if ! conda activate chatbot; then
    conda create -n chatbot python=3.9 -y
    conda activate chatbot
  fi

  # Install dependencies
  pip install "fschat[model_worker,webui]"

  # Log in to Hugging Face. The token is read from the environment inside
  # Python rather than being spliced into the Python source by the shell, so
  # special characters in the token cannot break the command and the token
  # does not appear in the process argument list.
  python -c "import os, huggingface_hub; huggingface_hub.login(os.environ['HF_TOKEN'])"

run: |
  conda activate chatbot

  echo 'Starting controller...'
  python -u -m fastchat.serve.controller --host 127.0.0.1 > ~/controller.log 2>&1 &
  # Fixed delay before launching the worker (presumably to let the
  # controller start listening first).
  sleep 10

  echo 'Starting model worker...'
  # Start from an empty log so a readiness line left over from a previous
  # run cannot satisfy the wait loop below before tee truncates the file.
  : > model_worker.log
  # stderr is merged into stdout at the end of the command so that both
  # streams are captured by tee.
  python -u -m fastchat.serve.model_worker \
            --model-path "meta-llama/Llama-2-${MODEL_SIZE}b-chat-hf" \
            --num-gpus "${SKYPILOT_NUM_GPUS_PER_NODE}" \
            --host 127.0.0.1 \
            2>&1 | tee model_worker.log &
  # $! is the PID of the last stage of the background pipeline (tee). tee
  # exits once the worker closes its output, so this PID going away means
  # the worker has terminated.
  worker_pipeline_pid=$!

  echo 'Waiting for model worker to start...'
  until grep -q 'Uvicorn running on' model_worker.log 2>/dev/null; do
    # Abort instead of spinning forever if the worker died before it
    # printed its readiness line.
    if ! kill -0 "${worker_pipeline_pid}" 2>/dev/null; then
      echo 'Model worker exited before becoming ready; see model_worker.log' >&2
      exit 1
    fi
    sleep 1
  done

  echo 'Starting gradio server...'
  # Runs in the foreground; its output is mirrored to ~/gradio.log.
  # NOTE(review): --share presumably publishes a public Gradio link --
  # confirm this exposure is intended.
  python -u -m fastchat.serve.gradio_web_server --share | tee ~/gradio.log