From 2401aba0ffd84c008f9dad86b4e1b9da54223956 Mon Sep 17 00:00:00 2001 From: blefo Date: Fri, 28 Nov 2025 11:18:59 +0100 Subject: [PATCH 1/3] feat: add new Docker Compose configurations for multiple GPU services including Qwen3 and GPT models --- .../compose/docker-compose.nilai-router-1.yml | 48 ++++++++++ .../compose/docker-compose.nilai-router-2.yml | 93 ++++++++++++++++++ .../compose/docker-compose.nilai-router-3.yml | 96 +++++++++++++++++++ 3 files changed, 237 insertions(+) create mode 100644 docker/compose/docker-compose.nilai-router-1.yml create mode 100644 docker/compose/docker-compose.nilai-router-2.yml create mode 100644 docker/compose/docker-compose.nilai-router-3.yml diff --git a/docker/compose/docker-compose.nilai-router-1.yml b/docker/compose/docker-compose.nilai-router-1.yml new file mode 100644 index 00000000..229c437d --- /dev/null +++ b/docker/compose/docker-compose.nilai-router-1.yml @@ -0,0 +1,48 @@ +services: + qwen3_coder_30b_gpu: + image: nillion/nilai-vllm:latest + deploy: + resources: + reservations: + devices: + - driver: nvidia + count: 1 + capabilities: [gpu] + ipc: host + ulimits: + memlock: -1 + stack: 67108864 + env_file: + - .env + restart: unless-stopped + depends_on: + etcd: + condition: service_healthy + command: > + --model Qwen/Qwen3-Coder-30B-A3B-Instruct + --gpu-memory-utilization 0.95 + --max-model-len 100000 + --max-num-batched-tokens 8192 + --tensor-parallel-size 1 + --dtype bfloat16 + --kv-cache-dtype fp8 + --uvicorn-log-level warning + environment: + - SVC_HOST=qwen3_coder_30b_gpu + - SVC_PORT=8000 + - ETCD_HOST=etcd + - ETCD_PORT=2379 + - TOOL_SUPPORT=true + - MODEL_NUM_RETRIES=60 + - MODEL_RETRY_TIMEOUT=20 + volumes: + - hugging_face_models:/root/.cache/huggingface + healthcheck: + test: ["CMD", "curl", "-f", "http://localhost:8000/health"] + interval: 30s + retries: 3 + start_period: 180s + timeout: 10s +volumes: + hugging_face_models: + diff --git a/docker/compose/docker-compose.nilai-router-2.yml b/docker/compose/docker-compose.nilai-router-2.yml new file mode 100644 index 00000000..f48e5ab7 --- /dev/null +++ b/docker/compose/docker-compose.nilai-router-2.yml @@ -0,0 +1,93 @@ +services: + gpt_oss_20b_gpu: + image: nillion/nilai-vllm:latest + deploy: + resources: + reservations: + devices: + - driver: nvidia + count: 1 + capabilities: [gpu] + ipc: host + ulimits: + memlock: -1 + stack: 67108864 + env_file: + - .env + restart: unless-stopped + depends_on: + etcd: + condition: service_healthy + command: > + --model openai/gpt-oss-20b + --gpu-memory-utilization 0.75 + --max-model-len 100000 + --max-num-batched-tokens 100000 + --tensor-parallel-size 1 + --uvicorn-log-level warning + environment: + - SVC_HOST=gpt_oss_20b_gpu + - SVC_PORT=8000 + - ETCD_HOST=etcd + - ETCD_PORT=2379 + - TOOL_SUPPORT=true + - MODEL_NUM_RETRIES=60 + - MODEL_RETRY_TIMEOUT=20 + volumes: + - hugging_face_models:/root/.cache/huggingface + healthcheck: + test: ["CMD", "curl", "-f", "http://localhost:8000/health"] + interval: 30s + retries: 3 + start_period: 180s + timeout: 10s + + qwen3_thinking_4b_gpu: + image: nillion/nilai-vllm:latest + deploy: + resources: + reservations: + devices: + - driver: nvidia + count: 1 + capabilities: [gpu] + ipc: host + ulimits: + memlock: -1 + stack: 67108864 + env_file: + - .env + restart: unless-stopped + depends_on: + etcd: + condition: service_healthy + gpt_oss_20b_gpu: + condition: service_healthy + command: > + --model Qwen/Qwen3-4B-Thinking-2507 + --gpu-memory-utilization 0.23 + --max-model-len 10000 + --max-num-batched-tokens 10000 + --tensor-parallel-size 1 + --dtype bfloat16 + --kv-cache-dtype fp8 + --uvicorn-log-level warning + environment: + - SVC_HOST=qwen3_thinking_4b_gpu + - SVC_PORT=8000 + - ETCD_HOST=etcd + - ETCD_PORT=2379 + - TOOL_SUPPORT=true + - MODEL_NUM_RETRIES=60 + - MODEL_RETRY_TIMEOUT=20 + volumes: + - hugging_face_models:/root/.cache/huggingface + healthcheck: + test: ["CMD", "curl", "-f", "http://localhost:8000/health"] + interval: 30s + retries: 3 + start_period: 60s + timeout: 10s +volumes: + hugging_face_models: + diff --git a/docker/compose/docker-compose.nilai-router-3.yml b/docker/compose/docker-compose.nilai-router-3.yml new file mode 100644 index 00000000..08f5c061 --- /dev/null +++ b/docker/compose/docker-compose.nilai-router-3.yml @@ -0,0 +1,96 @@ +services: + arch_router_1_5b_gpu: + image: nillion/nilai-vllm:latest + deploy: + resources: + reservations: + devices: + - driver: nvidia + count: 1 + capabilities: [gpu] + ipc: host + ulimits: + memlock: -1 + stack: 67108864 + env_file: + - .env + restart: unless-stopped + depends_on: + etcd: + condition: service_healthy + command: > + --model katanemo/Arch-Router-1.5B + --gpu-memory-utilization 0.15 + --max-model-len 8000 + --max-num-batched-tokens 8000 + --tensor-parallel-size 1 + --dtype bfloat16 + --kv-cache-dtype fp8 + --uvicorn-log-level warning + environment: + - SVC_HOST=arch_router_1_5b_gpu + - SVC_PORT=8000 + - ETCD_HOST=etcd + - ETCD_PORT=2379 + - TOOL_SUPPORT=false + - MODEL_NUM_RETRIES=60 + - MODEL_RETRY_TIMEOUT=20 + volumes: + - hugging_face_models:/root/.cache/huggingface + healthcheck: + test: ["CMD", "curl", "-f", "http://localhost:8000/health"] + interval: 30s + retries: 3 + start_period: 60s + timeout: 10s + + qwen3_vl_4b_gpu: + image: nillion/nilai-vllm:latest + deploy: + resources: + reservations: + devices: + - driver: nvidia + count: 1 + capabilities: [gpu] + ipc: host + ulimits: + memlock: -1 + stack: 67108864 + env_file: + - .env + restart: unless-stopped + depends_on: + etcd: + condition: service_healthy + arch_router_1_5b_gpu: + condition: service_healthy + command: > + --model Qwen/Qwen3-VL-4B-Instruct + --gpu-memory-utilization 0.23 + --max-model-len 10000 + --max-num-batched-tokens 10000 + --tensor-parallel-size 1 + --dtype bfloat16 + --kv-cache-dtype fp8 + --uvicorn-log-level warning + environment: + - SVC_HOST=qwen3_vl_4b_gpu + - SVC_PORT=8000 + - ETCD_HOST=etcd + - ETCD_PORT=2379 + - TOOL_SUPPORT=true + - MULTIMODAL_SUPPORT=true + - MODEL_NUM_RETRIES=60 + - MODEL_RETRY_TIMEOUT=20 + volumes: + - hugging_face_models:/root/.cache/huggingface + healthcheck: + test: ["CMD", "curl", "-f", "http://localhost:8000/health"] + interval: 30s + retries: 3 + start_period: 60s + timeout: 10s +volumes: + hugging_face_models: + From d3bcc7779483ee21bd622eb224f6dee43e792527 Mon Sep 17 00:00:00 2001 From: blefo Date: Fri, 28 Nov 2025 11:46:56 +0100 Subject: [PATCH 2/3] fix: update GPU memory utilization in Docker Compose configuration for Qwen3 model --- docker/compose/docker-compose.nilai-router-3.yml | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/docker/compose/docker-compose.nilai-router-3.yml b/docker/compose/docker-compose.nilai-router-3.yml index 08f5c061..68b0fc52 100644 --- a/docker/compose/docker-compose.nilai-router-3.yml +++ b/docker/compose/docker-compose.nilai-router-3.yml @@ -67,7 +67,7 @@ services: condition: service_healthy command: > --model Qwen/Qwen3-VL-4B-Instruct - --gpu-memory-utilization 0.23 + --gpu-memory-utilization 0.8 --max-model-len 10000 --max-num-batched-tokens 10000 --tensor-parallel-size 1 @@ -92,5 +92,4 @@ services: start_period: 60s timeout: 10s volumes: - hugging_face_models: - + hugging_face_models: \ No newline at end of file From 639e69f240626124dbbd0065f0605bc3f832c17d Mon Sep 17 00:00:00 2001 From: blefo Date: Tue, 2 Dec 2025 09:12:29 +0100 Subject: [PATCH 3/3] fix: adjust GPU memory utilization to 0.20 in Docker Compose for Qwen3 model --- docker/compose/docker-compose.nilai-router-2.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docker/compose/docker-compose.nilai-router-2.yml b/docker/compose/docker-compose.nilai-router-2.yml index f48e5ab7..6ac36b71 100644 --- a/docker/compose/docker-compose.nilai-router-2.yml +++ b/docker/compose/docker-compose.nilai-router-2.yml @@ -65,7 +65,7 @@ services: condition: service_healthy command: > --model Qwen/Qwen3-4B-Thinking-2507 - --gpu-memory-utilization 0.23 + --gpu-memory-utilization 0.20 --max-model-len 10000 --max-num-batched-tokens 10000 --tensor-parallel-size 1