docker-compose.yml

version: '3.6'

services:
  llama-gpt-api:
    # Pin the image to llama-cpp-python 0.1.78 to avoid ggml => gguf breaking changes
    image: ghcr.io/abetlen/llama-cpp-python:latest@sha256:b6d21ff8c4d9baad65e1fa741a0f8c898d68735fff3f3cd777e3f0c6a1839dd4
    restart: on-failure
    volumes:
      - './models:/models'
      - './api:/api'
    ports:
      - 3001:8000
    environment:
      MODEL: '/models/${MODEL_NAME:-llama-2-7b-chat.bin}'
      MODEL_DOWNLOAD_URL: '${MODEL_DOWNLOAD_URL:-https://huggingface.co/TheBloke/Nous-Hermes-Llama-2-7B-GGML/resolve/main/nous-hermes-llama-2-7b.ggmlv3.q4_0.bin}'
      N_GQA: '${N_GQA:-1}'
      USE_MLOCK: 1
    cap_add:
      - IPC_LOCK
    command: '/bin/sh /api/run.sh'

  llama-gpt-ui:
    # TODO: Use this image instead of building from source after the next release
    # image: 'ghcr.io/getumbrel/llama-gpt-ui:latest'
    build:
      context: ./ui
      dockerfile: Dockerfile
    ports:
      - 3000:3000
    restart: on-failure
    environment:
      - 'OPENAI_API_KEY=sk-XXXXXXXXXXXXXXXXXXXX'
      - 'OPENAI_API_HOST=http://llama-gpt-api:8000'
      - 'DEFAULT_MODEL=/models/${MODEL_NAME:-llama-2-7b-chat.bin}'
      - 'NEXT_PUBLIC_DEFAULT_SYSTEM_PROMPT=${DEFAULT_SYSTEM_PROMPT:-"You are a helpful and friendly AI assistant. Respond very concisely."}'
      - 'WAIT_HOSTS=llama-gpt-api:8000'
      - 'WAIT_TIMEOUT=${WAIT_TIMEOUT:-3600}'