diff --git a/README.md b/README.md
index c79e7f60..010fc0d6 100644
--- a/README.md
+++ b/README.md
@@ -138,33 +138,23 @@ docker compose -f production-compose.yml up -d
 
 For a complete production setup with custom images:
 
 ```shell
-# 1. Generate the production compose file
+# 1a. Generate the Production 1 compose file
 python3 ./scripts/docker-composer.py --prod \
-  -f docker/compose/docker-compose.llama-3b-gpu.yml \
-  -f docker/compose/docker-compose.llama-8b-gpu.yml \
-  -f docker/compose/docker-compose.deepseek-14b-gpu.yml \
-  --image 'nillion/nilai-api:latest=public.ecr.aws/k5d9x2g2/nilai-api:v0.2.0-alpha0' \
-  --image 'nillion/nilai-vllm:latest=public.ecr.aws/k5d9x2g2/nilai-vllm:v0.2.0-alpha0' \
-  --image 'nillion/nilai-attestation:latest=public.ecr.aws/k5d9x2g2/nilai-attestation:v0.2.0-alpha0' \
+  -f docker/compose/docker-compose.nilai-prod-1.yml \
+  --image 'nillion/nilai-api:latest=public.ecr.aws/k5d9x2g2/nilai-api:v0.2.0-alpha2' \
+  --image 'nillion/nilai-vllm:latest=public.ecr.aws/k5d9x2g2/nilai-vllm:v0.2.0-alpha2' \
+  --image 'nillion/nilai-attestation:latest=public.ecr.aws/k5d9x2g2/nilai-attestation:v0.2.0-alpha2' \
   --testnet \
   -o production-compose.yml
 
-# 1. Generate the production compose file
+# 1b. Generate the Production 2 compose file
 python3 ./scripts/docker-composer.py --prod \
-  -f docker/compose/docker-compose.llama-3b-gpu.yml \
-  --image 'nillion/nilai-api:latest=public.ecr.aws/k5d9x2g2/nilai-api:v0.2.0-alpha0' \
-  --image 'nillion/nilai-vllm:latest=public.ecr.aws/k5d9x2g2/nilai-vllm:v0.2.0-alpha0' \
-  --image 'nillion/nilai-attestation:latest=public.ecr.aws/k5d9x2g2/nilai-attestation:v0.2.0-alpha0' \
-  --testnet \
+  -f docker/compose/docker-compose.nilai-prod-2.yml \
+  --image 'nillion/nilai-api:latest=public.ecr.aws/k5d9x2g2/nilai-api:v0.2.0-alpha2' \
+  --image 'nillion/nilai-vllm:latest=public.ecr.aws/k5d9x2g2/nilai-vllm:v0.2.0-alpha2' \
+  --image 'nillion/nilai-attestation:latest=public.ecr.aws/k5d9x2g2/nilai-attestation:v0.2.0-alpha2' \
   -o production-compose.yml
-# Or:
-python3 ./scripts/docker-composer.py --prod \
-  -f docker/compose/docker-compose.llama-70b-gpu.yml \
-  --image 'nillion/nilai-api:latest=public.ecr.aws/k5d9x2g2/nilai-api:v0.2.0-alpha0' \
-  --image 'nillion/nilai-vllm:latest=public.ecr.aws/k5d9x2g2/nilai-vllm:v0.2.0-alpha0' \
-  --image 'nillion/nilai-attestation:latest=public.ecr.aws/k5d9x2g2/nilai-attestation:v0.2.0-alpha0' \
-  -o production-compose.yml
 
 # 2. Deploy using the generated file
 docker compose -f production-compose.yml up -d
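
The `--image` flag above maps a local tag to a registry image as a `local=remote` pair. A minimal sketch of those semantics (the authoritative logic lives in `scripts/docker-composer.py`; the function name and shape here are hypothetical, for illustration only):

```python
# Hypothetical sketch of the `--image local=remote` substitution; not the
# actual API of scripts/docker-composer.py.
def apply_image_overrides(compose: dict, overrides: list[str]) -> dict:
    """Rewrite each service's image according to 'local=remote' pairs."""
    mapping = dict(pair.split("=", 1) for pair in overrides)
    for service in compose.get("services", {}).values():
        if service.get("image") in mapping:
            service["image"] = mapping[service["image"]]
    return compose


compose = apply_image_overrides(
    {"services": {"api": {"image": "nillion/nilai-api:latest"}}},
    ["nillion/nilai-api:latest=public.ecr.aws/k5d9x2g2/nilai-api:v0.2.0-alpha2"],
)
assert compose["services"]["api"]["image"].endswith("v0.2.0-alpha2")
```
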
diff --git a/caddy/Caddyfile.testnet b/caddy/Caddyfile.testnet
new file mode 100644
index 00000000..1bd50c09
--- /dev/null
+++ b/caddy/Caddyfile.testnet
@@ -0,0 +1,42 @@
+(ssl_config) {
+    tls {
+        protocols tls1.2 tls1.3
+    }
+}
+
+{$NILAI_SERVER_DOMAIN} {
+    import ssl_config
+
+    handle_path /grafana/* {
+        uri strip_prefix /grafana
+        reverse_proxy grafana:3000
+    }
+
+    handle_path /nuc/* {
+        uri strip_prefix /nuc
+        reverse_proxy nilai-nuc-api:8080
+    }
+
+    handle_path /testnet/* {
+        uri strip_prefix /testnet
+        reverse_proxy testnet-nilai-nuc-api:8080
+    }
+
+    handle {
+        reverse_proxy nilai-api:8080
+    }
+}
+
+
+{$TESTNET_NILAI_SERVER_DOMAIN} {
+    import ssl_config
+
+    handle_path /grafana/* {
+        uri strip_prefix /grafana
+        reverse_proxy grafana:3000
+    }
+
+    handle {
+        reverse_proxy testnet-nilai-nuc-api:8080
+    }
+}
diff --git a/docker/compose/docker-compose.gemma-27b-gpu.yml b/docker/compose/docker-compose.gemma-27b-gpu.yml
index 754b44c3..db970525 100644
--- a/docker/compose/docker-compose.gemma-27b-gpu.yml
+++ b/docker/compose/docker-compose.gemma-27b-gpu.yml
@@ -33,6 +33,8 @@ services:
       - ETCD_PORT=2379
       - TOOL_SUPPORT=false
       - MULTIMODAL_SUPPORT=true
+      - MODEL_NUM_RETRIES=60
+      - MODEL_RETRY_TIMEOUT=20
     volumes:
       - hugging_face_models:/root/.cache/huggingface
     healthcheck:
diff --git a/docker/compose/docker-compose.nilai-prod-1.yml b/docker/compose/docker-compose.nilai-prod-1.yml
new file mode 100644
index 00000000..cb7d2abe
--- /dev/null
+++ b/docker/compose/docker-compose.nilai-prod-1.yml
@@ -0,0 +1,47 @@
+services:
+  gemma_27b_gpu:
+    image: nillion/nilai-vllm:latest
+    deploy:
+      resources:
+        reservations:
+          devices:
+            - driver: nvidia
+              count: 1
+              capabilities: [gpu]
+    ipc: host
+    ulimits:
+      memlock: -1
+      stack: 67108864
+    env_file:
+      - .env
+    restart: unless-stopped
+    depends_on:
+      etcd:
+        condition: service_healthy
+    command: >
+      --model google/gemma-3-27b-it
+      --gpu-memory-utilization 0.95
+      --max-model-len 100000
+      --max-num-batched-tokens 8192
+      --dtype bfloat16
+      --kv-cache-dtype fp8
+      --uvicorn-log-level warning
+    environment:
+      - SVC_HOST=gemma_27b_gpu
+      - SVC_PORT=8000
+      - ETCD_HOST=etcd
+      - ETCD_PORT=2379
+      - TOOL_SUPPORT=false
+      - MULTIMODAL_SUPPORT=true
+      - MODEL_NUM_RETRIES=60
+      - MODEL_RETRY_TIMEOUT=20
+    volumes:
+      - hugging_face_models:/root/.cache/huggingface
+    healthcheck:
+      test: ["CMD", "curl", "-f", "http://localhost:8000/health"]
+      interval: 30s
+      retries: 3
+      start_period: 60s
+      timeout: 10s
+volumes:
+  hugging_face_models:
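
The `MODEL_NUM_RETRIES` / `MODEL_RETRY_TIMEOUT` variables added to the gemma-27b service (both here and in the standalone compose file above) suggest a bounded wait for a slow-loading model. A hedged sketch of how settings like these are typically consumed, assuming the daemon polls the same `/health` endpoint the compose healthcheck uses; the real consumer lives in `nilai-models` and may differ:

```python
import os
import time
import urllib.error
import urllib.request

# Assumed consumption of MODEL_NUM_RETRIES / MODEL_RETRY_TIMEOUT; the actual
# retry logic lives in nilai-models, this is only illustrative.
NUM_RETRIES = int(os.getenv("MODEL_NUM_RETRIES", "60"))
RETRY_TIMEOUT = int(os.getenv("MODEL_RETRY_TIMEOUT", "20"))


def wait_for_model(url: str = "http://localhost:8000/health") -> bool:
    """Poll the model's health endpoint until it answers or retries run out."""
    for _ in range(NUM_RETRIES):
        try:
            with urllib.request.urlopen(url, timeout=RETRY_TIMEOUT) as resp:
                if resp.status == 200:
                    return True
        except (urllib.error.URLError, OSError):
            pass  # model still loading; retry after the sleep below
        time.sleep(RETRY_TIMEOUT)
    return False
```

With the values above this allows roughly 20 minutes for a cold start, which is plausible for a 27B model downloading weights on first boot.
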
diff --git a/docker/compose/docker-compose.nilai-prod-2.yml b/docker/compose/docker-compose.nilai-prod-2.yml
new file mode 100644
index 00000000..c78af7e6
--- /dev/null
+++ b/docker/compose/docker-compose.nilai-prod-2.yml
@@ -0,0 +1,90 @@
+services:
+  llama_8b_gpu:
+    image: nillion/nilai-vllm:latest
+    deploy:
+      resources:
+        reservations:
+          devices:
+            - driver: nvidia
+              count: 1
+              capabilities: [gpu]
+
+    ulimits:
+      memlock: -1
+      stack: 67108864
+    env_file:
+      - .env
+    restart: unless-stopped
+    depends_on:
+      etcd:
+        condition: service_healthy
+      gpt_20b_gpu:
+        # Llama initializes faster, so start it after the slower GPT service
+        condition: service_healthy
+    command: >
+      --model meta-llama/Llama-3.1-8B-Instruct
+      --gpu-memory-utilization 0.23
+      --max-model-len 10000
+      --max-num-batched-tokens 10000
+      --tensor-parallel-size 1
+      --dtype bfloat16
+      --kv-cache-dtype fp8
+      --enable-auto-tool-choice
+      --tool-call-parser llama3_json
+      --uvicorn-log-level warning
+    environment:
+      - SVC_HOST=llama_8b_gpu
+      - SVC_PORT=8000
+      - ETCD_HOST=etcd
+      - ETCD_PORT=2379
+      - TOOL_SUPPORT=true
+    volumes:
+      - hugging_face_models:/root/.cache/huggingface # cache models
+    healthcheck:
+      test: ["CMD", "curl", "-f", "http://localhost:8000/health"]
+      interval: 30s
+      retries: 3
+      start_period: 60s
+      timeout: 10s
+
+  gpt_20b_gpu:
+    image: nillion/nilai-vllm:latest
+    deploy:
+      resources:
+        reservations:
+          devices:
+            - driver: nvidia
+              count: 1
+              capabilities: [gpu]
+    ulimits:
+      memlock: -1
+      stack: 67108864
+    env_file:
+      - .env
+    restart: unless-stopped
+    depends_on:
+      etcd:
+        condition: service_healthy
+    command: >
+      --model openai/gpt-oss-20b
+      --gpu-memory-utilization 0.75
+      --max-model-len 100000
+      --max-num-batched-tokens 100000
+      --tensor-parallel-size 1
+      --uvicorn-log-level warning
+    environment:
+      - SVC_HOST=gpt_20b_gpu
+      - SVC_PORT=8000
+      - ETCD_HOST=etcd
+      - ETCD_PORT=2379
+      - TOOL_SUPPORT=true
+    volumes:
+      - hugging_face_models:/root/.cache/huggingface # cache models
+    healthcheck:
+      test: ["CMD", "curl", "-f", "http://localhost:8000/health"]
+      interval: 30s
+      retries: 3
+      start_period: 180s
+      timeout: 10s
+volumes:
+  hugging_face_models:
diff --git a/nilai-models/src/nilai_models/daemon.py b/nilai-models/src/nilai_models/daemon.py
index 6cbe23a9..395fa6e8 100644
--- a/nilai-models/src/nilai_models/daemon.py
+++ b/nilai-models/src/nilai_models/daemon.py
@@ -39,6 +39,7 @@ async def get_metadata():
             source=f"https://huggingface.co/{model_name}",  # Model source
             supported_features=["chat_completion"],  # Capabilities
             tool_support=SETTINGS.tool_support,  # Tool support
+            multimodal_support=SETTINGS.multimodal_support,  # Multimodal support
         )
 
     except Exception as e:
diff --git a/packages/nilai-common/src/nilai_common/config.py b/packages/nilai-common/src/nilai_common/config.py
index 4f8a2e8f..294fc7b5 100644
--- a/packages/nilai-common/src/nilai_common/config.py
+++ b/packages/nilai-common/src/nilai_common/config.py
@@ -8,6 +8,7 @@ class HostSettings(BaseModel):
     etcd_host: str = "localhost"
     etcd_port: int = 2379
     tool_support: bool = False
+    multimodal_support: bool = False
     gunicorn_workers: int = 10
     attestation_host: str = "localhost"
     attestation_port: int = 8081
@@ -18,12 +19,18 @@ class ModelSettings(BaseModel):
     timeout: int = Field(default=10, ge=1)
 
 
+def to_bool(value: str) -> bool:
+    """Convert a string to a boolean."""
+    return value.lower() in ("true", "1", "t", "y", "yes")
+
+
 SETTINGS: HostSettings = HostSettings(
     host=str(os.getenv("SVC_HOST", "localhost")),
     port=int(os.getenv("SVC_PORT", 8000)),
     etcd_host=str(os.getenv("ETCD_HOST", "localhost")),
     etcd_port=int(os.getenv("ETCD_PORT", 2379)),
-    tool_support=bool(os.getenv("TOOL_SUPPORT", False)),
+    tool_support=to_bool(os.getenv("TOOL_SUPPORT", "False")),
+    multimodal_support=to_bool(os.getenv("MULTIMODAL_SUPPORT", "False")),
     gunicorn_workers=int(os.getenv("NILAI_GUNICORN_WORKERS", 10)),
     attestation_host=str(os.getenv("ATTESTATION_HOST", "localhost")),
     attestation_port=int(os.getenv("ATTESTATION_PORT", 8081)),
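
One note on why the `to_bool` helper is needed rather than `bool()`: any non-empty string is truthy in Python, so the old `bool(os.getenv("TOOL_SUPPORT", False))` parsed `TOOL_SUPPORT=false` from a compose file as `True`. A quick demonstration:

```python
import os


# Same helper as added to nilai_common/config.py above.
def to_bool(value: str) -> bool:
    return value.lower() in ("true", "1", "t", "y", "yes")


os.environ["TOOL_SUPPORT"] = "false"

# Old parsing: bool() of any non-empty string is True, so "false"
# silently enabled tool support.
assert bool(os.getenv("TOOL_SUPPORT", False)) is True

# New parsing inspects the string's content.
assert to_bool(os.getenv("TOOL_SUPPORT", "False")) is False
```

The same helper backs the new `MULTIMODAL_SUPPORT` flag, which is wired through `HostSettings` and surfaced in model metadata by the `daemon.py` change above.
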