Merged
3 changes: 3 additions & 0 deletions .env.ci
@@ -42,3 +42,6 @@ ETCD_PORT = 2379
 # Grafana Docker Compose Config
 GF_SECURITY_ADMIN_USER = "admin"
 GF_SECURITY_ADMIN_PASSWORD = "password"
+
+# WebSearch Settings
+BRAVE_SEARCH_API = "Your API here"
3 changes: 3 additions & 0 deletions .env.sample
@@ -42,3 +42,6 @@ ETCD_PORT = 2379
 # Grafana Docker Compose Config
 GF_SECURITY_ADMIN_USER = "admin"
 GF_SECURITY_ADMIN_PASSWORD = "password"
+
+# WebSearch Settings
+BRAVE_SEARCH_API = "Your API here"
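
The new BRAVE_SEARCH_API key is only declared in these env files; nothing in this diff shows how it is consumed. A minimal sketch of how a web-search helper might read it, assuming Brave's public Search API endpoint and header (the function name, the requests dependency, and the endpoint details are illustrative, not taken from this PR):

```python
import os

import requests  # assumed to be available in the service environment


def brave_web_search(query: str) -> dict:
    # Hypothetical helper: look up `query` with the Brave Search API.
    api_key = os.environ.get("BRAVE_SEARCH_API")
    if not api_key or api_key == "Your API here":
        raise RuntimeError("BRAVE_SEARCH_API is not configured")
    resp = requests.get(
        "https://api.search.brave.com/res/v1/web/search",  # assumed endpoint
        params={"q": query},
        headers={"X-Subscription-Token": api_key, "Accept": "application/json"},
        timeout=10,
    )
    resp.raise_for_status()
    return resp.json()
```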
43 changes: 43 additions & 0 deletions docker/compose/docker-compose.gpt-120b-gpu.yml
@@ -0,0 +1,43 @@
services:
  gpt_120b_gpu:
    image: nillion/nilai-vllm:latest
    deploy:
      resources:
        reservations:
          devices:
            - driver: nvidia
              count: 1
              capabilities: [gpu]

    ulimits:
      memlock: -1
      stack: 67108864
    env_file:
      - .env
    restart: unless-stopped
    depends_on:
      etcd:
        condition: service_healthy
    command: >
      --model openai/gpt-oss-120b
      --gpu-memory-utilization 0.95
      --max-model-len 100000
      --max-num-batched-tokens 100000
      --tensor-parallel-size 1
      --uvicorn-log-level warning
    environment:
      - SVC_HOST=gpt_120b_gpu
      - SVC_PORT=8000
      - ETCD_HOST=etcd
      - ETCD_PORT=2379
      - TOOL_SUPPORT=true
    volumes:
      - hugging_face_models:/root/.cache/huggingface # cache models
    healthcheck:
      test: ["CMD", "curl", "-f", "http://localhost:8000/health"]
      interval: 30s
      retries: 3
      start_period: 60s
      timeout: 10s
volumes:
  hugging_face_models:
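
Once this service is healthy, it can be smoke-tested directly. A rough sketch, assuming the nilai-vllm image exposes vLLM's standard OpenAI-compatible routes on port 8000 (the /health path matches the compose healthcheck; /v1/chat/completions is standard vLLM behaviour but is not confirmed by this PR):

```python
import requests

BASE = "http://localhost:8000"  # assumes the container port is published locally

# Same probe the compose healthcheck runs with curl.
requests.get(f"{BASE}/health", timeout=10).raise_for_status()

# One short chat completion against the model loaded by this service.
resp = requests.post(
    f"{BASE}/v1/chat/completions",
    json={
        "model": "openai/gpt-oss-120b",
        "messages": [{"role": "user", "content": "Say hello in one sentence."}],
        "max_tokens": 32,
    },
    timeout=120,
)
resp.raise_for_status()
print(resp.json()["choices"][0]["message"]["content"])
```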
43 changes: 43 additions & 0 deletions docker/compose/docker-compose.gpt-20b-gpu.yml
@@ -0,0 +1,43 @@
services:
  gpt_20b_gpu:
    image: nillion/nilai-vllm:latest
    deploy:
      resources:
        reservations:
          devices:
            - driver: nvidia
              count: 1
              capabilities: [gpu]

    ulimits:
      memlock: -1
      stack: 67108864
    env_file:
      - .env
    restart: unless-stopped
    depends_on:
      etcd:
        condition: service_healthy
    command: >
      --model openai/gpt-oss-20b
      --gpu-memory-utilization 0.85
      --max-model-len 100000
      --max-num-batched-tokens 100000
      --tensor-parallel-size 1
      --uvicorn-log-level warning
    environment:
      - SVC_HOST=gpt_20b_gpu
      - SVC_PORT=8000
      - ETCD_HOST=etcd
      - ETCD_PORT=2379
      - TOOL_SUPPORT=true
    volumes:
      - hugging_face_models:/root/.cache/huggingface # cache models
    healthcheck:
      test: ["CMD", "curl", "-f", "http://localhost:8000/health"]
      interval: 30s
      retries: 3
      start_period: 60s
      timeout: 10s
volumes:
  hugging_face_models:
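
This file mirrors the 120b compose above: only the service name, the --model flag (openai/gpt-oss-20b), and --gpu-memory-utilization (0.85 instead of 0.95) differ, so the smoke-test sketch shown for the 120b service applies unchanged once the model name is swapped.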
2 changes: 2 additions & 0 deletions nilai-api/src/nilai_api/config/config.yaml
@@ -7,6 +7,8 @@ model_concurrent_rate_limit:
   cognitivecomputations/Dolphin3.0-Llama3.1-8B: 30
   deepseek-ai/DeepSeek-R1-Distill-Qwen-14B: 5
   hugging-quants/Meta-Llama-3.1-70B-Instruct-AWQ-INT4: 5
+  openai/gpt-oss-20b: 50
+  default: 50

user_rate_limit_minute: null
user_rate_limit_hour: null
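
The new default: 50 entry is the value the fallback lookup added to private.py (next file) resolves to for models without an explicit concurrent limit.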
7 changes: 3 additions & 4 deletions nilai-api/src/nilai_api/routers/private.py
@@ -109,10 +109,9 @@ async def chat_completion_concurrent_rate_limit(request: Request) -> Tuple[int,
     except ValueError:
         raise HTTPException(status_code=400, detail="Invalid request body")
     key = f"chat:{chat_request.model}"
-    try:
-        limit = MODEL_CONCURRENT_RATE_LIMIT[chat_request.model]
-    except KeyError:
-        raise HTTPException(status_code=400, detail="Invalid model name")
+    limit = MODEL_CONCURRENT_RATE_LIMIT.get(
+        chat_request.model, MODEL_CONCURRENT_RATE_LIMIT.get("default", 50)
+    )
Review comment (Member Author) on lines +112 to +114:

This change is the most relevant one: if MODEL_CONCURRENT_RATE_LIMIT has no entry for the requested model, the lookup falls back to the "default" entry, which should work for any model, and to 50 if even that is missing. This prevents a failure state in most cases.

     return limit, key
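
A minimal sketch of the fallback behaviour described in the comment above, using a stand-in dict in place of the real MODEL_CONCURRENT_RATE_LIMIT config (values mirror config.yaml; the helper name is illustrative):

```python
# Stand-in for the loaded config; the real mapping comes from config.yaml.
MODEL_CONCURRENT_RATE_LIMIT = {
    "openai/gpt-oss-20b": 50,
    "default": 50,
}


def concurrent_limit(model: str) -> int:
    # Known model -> its own limit; unknown model -> "default"; no "default" -> 50.
    return MODEL_CONCURRENT_RATE_LIMIT.get(
        model, MODEL_CONCURRENT_RATE_LIMIT.get("default", 50)
    )


assert concurrent_limit("openai/gpt-oss-20b") == 50
assert concurrent_limit("some/unknown-model") == 50  # previously raised a 400
```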


2 changes: 2 additions & 0 deletions nilai-api/src/nilai_api/state.py
@@ -45,6 +45,8 @@ async def models(self) -> Dict[str, ModelEndpoint]:
         return await self.discovery_service.discover_models()

     async def get_model(self, model_id: str) -> Optional[ModelEndpoint]:
+        if model_id is None or len(model_id) == 0:
+            return None
         return await self.discovery_service.get_model(model_id)
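
With this guard, an empty or missing model id short-circuits to None instead of being forwarded to the discovery service.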


2 changes: 1 addition & 1 deletion scripts/wait_for_ci_services.sh
@@ -4,7 +4,7 @@
API_HEALTH_STATUS=$(docker inspect --format='{{.State.Health.Status}}' nilai-api 2>/dev/null)
MODEL_HEALTH_STATUS=$(docker inspect --format='{{.State.Health.Status}}' nilai-llama_1b_gpu 2>/dev/null)
NUC_API_HEALTH_STATUS=$(docker inspect --format='{{.State.Health.Status}}' nilai-nuc-api 2>/dev/null)
-MAX_ATTEMPTS=20
+MAX_ATTEMPTS=30
ATTEMPT=1

while [ $ATTEMPT -le $MAX_ATTEMPTS ]; do
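
Raising MAX_ATTEMPTS from 20 to 30 gives the CI wait loop more iterations before failing, presumably to allow the newly added gpt-oss containers extra time to become healthy.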
74 changes: 1 addition & 73 deletions uv.lock

Large diffs are not rendered by default.
