README.md (30 changes: 10 additions & 20 deletions)
@@ -138,33 +138,23 @@ docker compose -f production-compose.yml up -d
 For a complete production setup with custom images:
 
 ```shell
-# 1. Generate the production compose file
+# 1a. Generate the Production 1 compose file
 python3 ./scripts/docker-composer.py --prod \
-  -f docker/compose/docker-compose.llama-3b-gpu.yml \
-  -f docker/compose/docker-compose.llama-8b-gpu.yml \
-  -f docker/compose/docker-compose.deepseek-14b-gpu.yml \
-  --image 'nillion/nilai-api:latest=public.ecr.aws/k5d9x2g2/nilai-api:v0.2.0-alpha0' \
-  --image 'nillion/nilai-vllm:latest=public.ecr.aws/k5d9x2g2/nilai-vllm:v0.2.0-alpha0' \
-  --image 'nillion/nilai-attestation:latest=public.ecr.aws/k5d9x2g2/nilai-attestation:v0.2.0-alpha0' \
+  -f docker/compose/docker-compose.nilai-prod-1.yml \
+  --image 'nillion/nilai-api:latest=public.ecr.aws/k5d9x2g2/nilai-api:v0.2.0-alpha2' \
+  --image 'nillion/nilai-vllm:latest=public.ecr.aws/k5d9x2g2/nilai-vllm:v0.2.0-alpha2' \
+  --image 'nillion/nilai-attestation:latest=public.ecr.aws/k5d9x2g2/nilai-attestation:v0.2.0-alpha2' \
   --testnet \
   -o production-compose.yml
 
-# 1. Generate the production compose file
+# 1b. Generate the Production 2 compose file
 python3 ./scripts/docker-composer.py --prod \
-  -f docker/compose/docker-compose.llama-3b-gpu.yml \
-  --image 'nillion/nilai-api:latest=public.ecr.aws/k5d9x2g2/nilai-api:v0.2.0-alpha0' \
-  --image 'nillion/nilai-vllm:latest=public.ecr.aws/k5d9x2g2/nilai-vllm:v0.2.0-alpha0' \
-  --image 'nillion/nilai-attestation:latest=public.ecr.aws/k5d9x2g2/nilai-attestation:v0.2.0-alpha0' \
-  --testnet \
+  -f docker/compose/docker-compose.nilai-prod-2.yml \
+  --image 'nillion/nilai-api:latest=public.ecr.aws/k5d9x2g2/nilai-api:v0.2.0-alpha2' \
+  --image 'nillion/nilai-vllm:latest=public.ecr.aws/k5d9x2g2/nilai-vllm:v0.2.0-alpha2' \
+  --image 'nillion/nilai-attestation:latest=public.ecr.aws/k5d9x2g2/nilai-attestation:v0.2.0-alpha2' \
   -o production-compose.yml
 
-# Or:
-python3 ./scripts/docker-composer.py --prod \
-  -f docker/compose/docker-compose.llama-70b-gpu.yml \
-  --image 'nillion/nilai-api:latest=public.ecr.aws/k5d9x2g2/nilai-api:v0.2.0-alpha0' \
-  --image 'nillion/nilai-vllm:latest=public.ecr.aws/k5d9x2g2/nilai-vllm:v0.2.0-alpha0' \
-  --image 'nillion/nilai-attestation:latest=public.ecr.aws/k5d9x2g2/nilai-attestation:v0.2.0-alpha0' \
-  -o production-compose.yml
-
 # 2. Deploy using the generated file
 docker compose -f production-compose.yml up -d
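Before step 2, it can be worth sanity-checking what the composer actually produced. A minimal sketch, not part of the PR: it assumes PyYAML is installed, and the services printed depend on which compose fragments were combined.

```python
import yaml  # pip install pyyaml

# Load the file generated by docker-composer.py in step 1.
with open("production-compose.yml") as fh:
    compose = yaml.safe_load(fh)

# List the services the generated file will start.
print(sorted(compose["services"]))
```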
caddy/Caddyfile.testnet (42 changes: 42 additions & 0 deletions)
@@ -0,0 +1,42 @@
(ssl_config) {
    tls {
        protocols tls1.2 tls1.3
    }
}

{$NILAI_SERVER_DOMAIN} {
    import ssl_config

    handle_path /grafana/* {
        uri strip_prefix /grafana
        reverse_proxy grafana:3000
    }

    handle_path /nuc/* {
        uri strip_prefix /nuc
        reverse_proxy nilai-nuc-api:8080
    }

    handle_path /testnet/* {
        uri strip_prefix /testnet
        reverse_proxy testnet-nilai-nuc-api:8080
    }

    handle {
        reverse_proxy nilai-api:8080
    }
}


{$TESTNET_NILAI_SERVER_DOMAIN} {
    import ssl_config

    handle_path /grafana/* {
        uri strip_prefix /grafana
        reverse_proxy grafana:3000
    }

    handle {
        reverse_proxy testnet-nilai-nuc-api:8080
    }
}
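One detail worth flagging: Caddy's `handle_path` already strips the matched prefix, so the inner `uri strip_prefix` directives appear redundant (harmless, but worth confirming against the Caddy docs). Once the stack is up, a small probe of each routed prefix can confirm the proxying works. A sketch only: the domain comes from the environment, and the assumption that each upstream answers a plain GET on its root is hypothetical.

```python
import os
import urllib.request

# Assumes NILAI_SERVER_DOMAIN is exported in the shell, as in the Caddyfile.
domain = os.environ["NILAI_SERVER_DOMAIN"]

for prefix in ("/", "/grafana/", "/nuc/", "/testnet/"):
    url = f"https://{domain}{prefix}"
    try:
        with urllib.request.urlopen(url, timeout=10) as resp:
            print(f"{url} -> {resp.status}")
    except Exception as exc:  # smoke test only; report and continue
        print(f"{url} -> {exc}")
```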
docker/compose/docker-compose.gemma-27b-gpu.yml (2 changes: 2 additions & 0 deletions)
@@ -33,6 +33,8 @@ services:
       - ETCD_PORT=2379
       - TOOL_SUPPORT=false
       - MULTIMODAL_SUPPORT=true
+      - MODEL_NUM_RETRIES=60
+      - MODEL_RETRY_TIMEOUT=20
     volumes:
       - hugging_face_models:/root/.cache/huggingface
     healthcheck:
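The PR does not show the consumer of `MODEL_NUM_RETRIES` and `MODEL_RETRY_TIMEOUT`; presumably they bound how long a service waits for its model to come up before giving up. A sketch of the pattern these two settings typically drive — the function name and endpoint here are illustrative, not taken from this repo:

```python
import os
import time
import urllib.request

NUM_RETRIES = int(os.getenv("MODEL_NUM_RETRIES", "60"))
RETRY_TIMEOUT = int(os.getenv("MODEL_RETRY_TIMEOUT", "20"))  # seconds per attempt


def wait_for_model(url: str = "http://localhost:8000/health") -> None:
    """Illustrative only: poll the vLLM health endpoint until it responds."""
    for _attempt in range(NUM_RETRIES):
        try:
            with urllib.request.urlopen(url, timeout=RETRY_TIMEOUT):
                return  # model is up
        except Exception:
            time.sleep(RETRY_TIMEOUT)
    raise RuntimeError(f"model not healthy after {NUM_RETRIES} attempts")
```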
docker/compose/docker-compose.nilai-prod-1.yml (47 changes: 47 additions & 0 deletions)
@@ -0,0 +1,47 @@
services:
  gemma_27b_gpu:
    image: nillion/nilai-vllm:latest
    deploy:
      resources:
        reservations:
          devices:
            - driver: nvidia
              count: 1
              capabilities: [gpu]
    ipc: host
    ulimits:
      memlock: -1
      stack: 67108864
    env_file:
      - .env
    restart: unless-stopped
    depends_on:
      etcd:
        condition: service_healthy
    command: >
      --model google/gemma-3-27b-it
      --gpu-memory-utilization 0.95
      --max-model-len 100000
      --max-num-batched-tokens 8192
      --dtype bfloat16
      --kv-cache-dtype fp8
      --uvicorn-log-level warning
    environment:
      - SVC_HOST=gemma_27b_gpu
      - SVC_PORT=8000
      - ETCD_HOST=etcd
      - ETCD_PORT=2379
      - TOOL_SUPPORT=false
      - MULTIMODAL_SUPPORT=true
      - MODEL_NUM_RETRIES=60
      - MODEL_RETRY_TIMEOUT=20
    volumes:
      - hugging_face_models:/root/.cache/huggingface
    healthcheck:
      test: ["CMD", "curl", "-f", "http://localhost:8000/health"]
      interval: 30s
      retries: 3
      start_period: 60s
      timeout: 10s

volumes:
  hugging_face_models:
docker/compose/docker-compose.nilai-prod-2.yml (90 changes: 90 additions & 0 deletions)
@@ -0,0 +1,90 @@
services:
  llama_8b_gpu:
    image: nillion/nilai-vllm:latest
    deploy:
      resources:
        reservations:
          devices:
            - driver: nvidia
              count: 1
              capabilities: [gpu]

    ulimits:
      memlock: -1
      stack: 67108864
    env_file:
      - .env
    restart: unless-stopped
    depends_on:
      etcd:
        condition: service_healthy
      gpt_20b_gpu:
        # Llama takes less time to initialize
        condition: service_healthy
    command: >
      --model meta-llama/Llama-3.1-8B-Instruct
      --gpu-memory-utilization 0.23
      --max-model-len 10000
      --max-num-batched-tokens 10000
      --tensor-parallel-size 1
      --dtype bfloat16
      --kv-cache-dtype fp8
      --enable-auto-tool-choice
      --tool-call-parser llama3_json
      --uvicorn-log-level warning
    environment:
      - SVC_HOST=llama_8b_gpu
      - SVC_PORT=8000
      - ETCD_HOST=etcd
      - ETCD_PORT=2379
      - TOOL_SUPPORT=true
    volumes:
      - hugging_face_models:/root/.cache/huggingface # cache models
    healthcheck:
      test: ["CMD", "curl", "-f", "http://localhost:8000/health"]
      interval: 30s
      retries: 3
      start_period: 60s
      timeout: 10s

  gpt_20b_gpu:
    image: nillion/nilai-vllm:latest
    deploy:
      resources:
        reservations:
          devices:
            - driver: nvidia
              count: 1
              capabilities: [gpu]
    ulimits:
      memlock: -1
      stack: 67108864
    env_file:
      - .env
    restart: unless-stopped
    depends_on:
      etcd:
        condition: service_healthy
    command: >
      --model openai/gpt-oss-20b
      --gpu-memory-utilization 0.75
      --max-model-len 100000
      --max-num-batched-tokens 100000
      --tensor-parallel-size 1
      --uvicorn-log-level warning
    environment:
      - SVC_HOST=gpt_20b_gpu
      - SVC_PORT=8000
      - ETCD_HOST=etcd
      - ETCD_PORT=2379
      - TOOL_SUPPORT=true
    volumes:
      - hugging_face_models:/root/.cache/huggingface # cache models
    healthcheck:
      test: ["CMD", "curl", "-f", "http://localhost:8000/health"]
      interval: 30s
      retries: 3
      start_period: 180s
      timeout: 10s

volumes:
  hugging_face_models:
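The low `--gpu-memory-utilization` values suggest both services land on the same GPU (an assumption; the compose file only reserves one NVIDIA device each): 0.23 + 0.75 = 0.98 of VRAM, leaving about 2% headroom, and the `depends_on` ordering lets the slower 20B model claim its share before Llama starts. A trivial budget check with the fractions copied from the file above:

```python
# Fractions copied from docker-compose.nilai-prod-2.yml.
budgets = {"llama_8b_gpu": 0.23, "gpt_20b_gpu": 0.75}

total = sum(budgets.values())
assert total <= 1.0, f"GPU memory overcommitted: {total:.2f}"
print(f"combined vLLM GPU memory fraction: {total:.2f}")  # 0.98
```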
nilai-models/src/nilai_models/daemon.py (1 change: 1 addition & 0 deletions)
@@ -39,6 +39,7 @@ async def get_metadata():
             source=f"https://huggingface.co/{model_name}",  # Model source
             supported_features=["chat_completion"],  # Capabilities
             tool_support=SETTINGS.tool_support,  # Tool support
+            multimodal_support=SETTINGS.multimodal_support,  # Multimodal support
         )
 
     except Exception as e:
packages/nilai-common/src/nilai_common/config.py (9 changes: 8 additions & 1 deletion)
@@ -8,6 +8,7 @@ class HostSettings(BaseModel):
     etcd_host: str = "localhost"
     etcd_port: int = 2379
     tool_support: bool = False
+    multimodal_support: bool = False
     gunicorn_workers: int = 10
     attestation_host: str = "localhost"
     attestation_port: int = 8081
@@ -18,12 +19,18 @@ class ModelSettings(BaseModel):
     timeout: int = Field(default=10, ge=1)
 
 
+def to_bool(value: str) -> bool:
+    """Convert a string to a boolean."""
+    return value.lower() in ("true", "1", "t", "y", "yes")
+
+
 SETTINGS: HostSettings = HostSettings(
     host=str(os.getenv("SVC_HOST", "localhost")),
     port=int(os.getenv("SVC_PORT", 8000)),
     etcd_host=str(os.getenv("ETCD_HOST", "localhost")),
     etcd_port=int(os.getenv("ETCD_PORT", 2379)),
-    tool_support=bool(os.getenv("TOOL_SUPPORT", False)),
+    tool_support=to_bool(os.getenv("TOOL_SUPPORT", "False")),
+    multimodal_support=to_bool(os.getenv("MULTIMODAL_SUPPORT", "False")),
     gunicorn_workers=int(os.getenv("NILAI_GUNICORN_WORKERS", 10)),
     attestation_host=str(os.getenv("ATTESTATION_HOST", "localhost")),
     attestation_port=int(os.getenv("ATTESTATION_PORT", 8081)),
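This hunk is a behavioral fix, not just a refactor: `bool(os.getenv("TOOL_SUPPORT", False))` is `True` for any non-empty string, including `"false"` and `"0"`, so the flag could never actually be switched off from the environment. A quick demonstration, reusing the helper from the diff above:

```python
def to_bool(value: str) -> bool:
    """Same helper as in the diff above."""
    return value.lower() in ("true", "1", "t", "y", "yes")

# The old parsing treats any non-empty string as truthy:
assert bool("false") is True
assert bool("0") is True

# The new helper reads the string's meaning instead:
assert to_bool("false") is False
assert to_bool("0") is False
assert to_bool("TRUE") is True
assert to_bool("yes") is True
```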