2 changes: 1 addition & 1 deletion caddy/Caddyfile
@@ -4,7 +4,7 @@
}
}

https://gpu.nilai.sandbox.nilogy.xyz {
https://nilai.sandbox.nilogy.xyz {
import ssl_config
reverse_proxy api:8443
}
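
The Caddy change only renames the public host (dropping the `gpu.` prefix); requests are still proxied to `api:8443`. A quick post-deploy sanity check might look like the sketch below — the `/v1/models` path comes from the API router in this PR, while the bearer token and its header are assumptions about how auth is wired up:

```python
# Hedged smoke test for the renamed host; the token value is a placeholder.
import httpx

resp = httpx.get(
    "https://nilai.sandbox.nilogy.xyz/v1/models",
    headers={"Authorization": "Bearer <your-api-token>"},  # auth scheme assumed
    timeout=10.0,
)
resp.raise_for_status()
print(resp.json())
```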
29 changes: 16 additions & 13 deletions docker-compose.gpu.yml
@@ -32,10 +32,7 @@ services:
llama_8b_gpu:
build:
context: .
dockerfile: docker/model.Dockerfile
target: nilai
args:
MODEL_NAME: "llama_8b_gpu"
dockerfile: docker/vllm.Dockerfile
runtime: nvidia
deploy:
resources:
@@ -44,28 +41,29 @@
- driver: nvidia
count: all
capabilities: [gpu]
env_file:
- .env
depends_on:
etcd:
condition: service_healthy
command: >
--model meta-llama/Llama-3.1-8B-Instruct
--gpu-memory-utilization 0.5
--max-model-len 10000
--tensor-parallel-size 1
environment:
- SVC_HOST=llama_8b_gpu
- SVC_PORT=8000
- ETCD_HOST=etcd
- ETCD_PORT=2379
env_file:
- .env
volumes:
- hugging_face_models:/root/.cache/huggingface # cache models
networks:
- backend_net

llama_1b_gpu:
build:
context: .
dockerfile: docker/model.Dockerfile
target: nilai
args:
MODEL_NAME: "llama_1b_gpu"
dockerfile: docker/vllm.Dockerfile
runtime: nvidia
deploy:
resources:
@@ -74,16 +72,21 @@
- driver: nvidia
count: all
capabilities: [gpu]
env_file:
- .env
depends_on:
etcd:
condition: service_healthy
command: >
--model meta-llama/Llama-3.2-1B-Instruct
--gpu-memory-utilization 0.2
--max-model-len 10000
--tensor-parallel-size 1
environment:
- SVC_HOST=llama_1b_gpu
- SVC_PORT=8000
- ETCD_HOST=etcd
- ETCD_PORT=2379
env_file:
- .env
volumes:
- hugging_face_models:/root/.cache/huggingface # cache models
networks:
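
Both GPU services now build from the shared `docker/vllm.Dockerfile` and pass their model, GPU-memory, and context-length flags through `command:`, so each container runs a stock vLLM OpenAI-compatible server on port 8000. Below is a minimal sketch of how another container on `backend_net` (for example the API) could talk to one of them, mirroring the `OpenAI(base_url=...)` pattern adopted in `private.py`; the hostname, port, and model name come from the compose file, the rest is illustrative:

```python
# Sketch only: assumes it runs inside a container attached to backend_net.
from openai import OpenAI

client = OpenAI(base_url="http://llama_1b_gpu:8000/v1/", api_key="<not-needed>")

completion = client.chat.completions.create(
    model="meta-llama/Llama-3.2-1B-Instruct",  # must match the --model flag above
    messages=[{"role": "user", "content": "Say hello."}],
)
print(completion.choices[0].message.content)
```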
29 changes: 29 additions & 0 deletions docker/vllm.Dockerfile
@@ -0,0 +1,29 @@
FROM vllm/vllm-openai:latest

# # Specify model name and path during build
# ARG MODEL_NAME=llama_1b_cpu
# ARG MODEL_PATH=meta-llama/Llama-3.1-8B-Instruct

# # Set environment variables
# ENV MODEL_NAME=${MODEL_NAME}
# ENV MODEL_PATH=${MODEL_PATH}
# ENV EXEC_PATH=nilai_models.models.${MODEL_NAME}:app

COPY --link . /daemon/

WORKDIR /daemon/nilai-models/

RUN apt-get update && \
apt-get install build-essential -y && \
pip install uv && \
uv sync && \
apt-get clean && \
apt-get autoremove && \
rm -rf /var/lib/apt/lists/*

# Expose port 8000 for incoming requests
EXPOSE 8000

ENTRYPOINT ["bash", "run.sh"]

CMD [""]
1 change: 1 addition & 0 deletions nilai-api/pyproject.toml
@@ -22,6 +22,7 @@ dependencies = [
"httpx>=0.27.2",
"nilrag>=0.1.2",
"nilql>=0.0.0a3",
"openai>=1.59.9",
]


114 changes: 59 additions & 55 deletions nilai-api/src/nilai_api/routers/private.py
@@ -1,26 +1,25 @@
# Fast API and serving
import logging
import os
import asyncio
from base64 import b64encode
from typing import AsyncGenerator, Union
from typing import AsyncGenerator, Union, List
import numpy as np

import nilql
import nilrag
import httpx
from fastapi import APIRouter, Body, Depends, HTTPException
from fastapi.responses import StreamingResponse
from nilai_api.auth import get_user
from nilai_api.crypto import sign_message
from nilai_api.db import UserManager
from nilai_api.state import state
from openai import OpenAI

# Internal libraries
from nilai_common import (
AttestationResponse,
ChatRequest,
ChatResponse,
SignedChatCompletion,
Message,
ModelMetadata,
Usage,
@@ -79,7 +78,7 @@ async def get_attestation(user: dict = Depends(get_user)) -> AttestationResponse


@router.get("/v1/models", tags=["Model"])
async def get_models(user: dict = Depends(get_user)) -> list[ModelMetadata]:
async def get_models(user: dict = Depends(get_user)) -> List[ModelMetadata]:
"""
List all available models in the system.

@@ -94,21 +93,30 @@ async def get_models(user: dict = Depends(get_user)) -> list[ModelMetadata]:
"""
logger.info(f"Retrieving models for user {user['userid']} from pid {os.getpid()}")
return [endpoint.metadata for endpoint in (await state.models).values()]
# result = [Model(
# id = endpoint.metadata.id,
# created = 0,
# object = "model",
# owned_by = endpoint.metadata.author,
# data = endpoint.metadata.dict(),
# ) for endpoint in (await state.models).values()]

# return result[0]


@router.post("/v1/chat/completions", tags=["Chat"], response_model=None)
async def chat_completion(
req: ChatRequest = Body(
ChatRequest(
model="Llama-3.2-1B-Instruct",
model="meta-llama/Llama-3.2-1B-Instruct",
messages=[
Message(role="system", content="You are a helpful assistant."),
Message(role="user", content="What is your name?"),
],
)
),
user: dict = Depends(get_user),
) -> Union[ChatResponse, StreamingResponse]:
) -> Union[SignedChatCompletion, StreamingResponse]:
"""
Generate a chat completion response from the AI model.

@@ -144,7 +152,7 @@ async def chat_completion(
```python
# Generate a chat completion
request = ChatRequest(
model="Llama-3.2-1B-Instruct",
model="meta-llama/Llama-3.2-1B-Instruct",
messages=[
{"role": "system", "content": "You are a helpful assistant"},
{"role": "user", "content": "Hello, who are you?"}
@@ -157,10 +165,17 @@ async def chat_completion(
endpoint = await state.get_model(model_name)
if endpoint is None:
raise HTTPException(
status_code=400, detail="Invalid model name, check /v1/models for options"
status_code=400,
detail=f"Invalid model name {model_name}, check /v1/models for options",
)

model_url = endpoint.url
model_url = endpoint.url + "/v1/"

logger.info(
f"Chat completion request for model {model_name} from user {user['userid']} on url: {model_url}"
)

client = OpenAI(base_url=model_url, api_key="<not-needed>")

if req.nilrag:
"""
@@ -282,60 +297,49 @@ async def chat_completion(

if req.stream:
# Forwarding Streamed Responses
async def stream_response() -> AsyncGenerator[str, None]:
async def chat_completion_stream_generator() -> AsyncGenerator[str, None]:
try:
async with httpx.AsyncClient() as client:
async with client.stream(
"POST",
f"{model_url}/v1/chat/completions",
json=req.model_dump(),
timeout=None,
) as response:
response.raise_for_status() # Raise an error for invalid status codes

# Process the streamed response chunks
async for chunk in response.aiter_lines():
if chunk: # Skip empty lines
yield f"{chunk}\n"
await asyncio.sleep(
0
) # Add an await to return immediately
except httpx.HTTPStatusError as e:
raise HTTPException(
status_code=e.response.status_code,
detail=e.response.json().get("detail", str(e)),
)
except httpx.RequestError as e:
raise HTTPException(
status_code=503,
detail=f"Error connecting to model service: {str(e)}",
response = client.chat.completions.create(
model=req.model,
messages=req.messages,
stream=req.stream,
extra_body={
"stream_options": {
"include_usage": True,
# "continuous_usage_stats": True,
}
},
)

for chunk in response:
if chunk.usage is not None:
UserManager.update_token_usage(
user["userid"],
prompt_tokens=chunk.usage.prompt_tokens,
completion_tokens=chunk.usage.completion_tokens,
)
else:
data = chunk.model_dump_json(exclude_unset=True)
yield f"data: {data}\n\n"

except Exception as e:
logger.error(f"Error streaming response: {e}")
return

# Return the streaming response
return StreamingResponse(
stream_response(),
chat_completion_stream_generator(),
media_type="text/event-stream", # Ensure client interprets as Server-Sent Events
)

try:
async with httpx.AsyncClient() as client:
response = await client.post(
f"{model_url}/v1/chat/completions", json=req.model_dump(), timeout=None
)
response.raise_for_status()
model_response = ChatResponse.model_validate_json(response.content)
except httpx.HTTPStatusError as e:
# Forward the original error from the model
raise HTTPException(
status_code=e.response.status_code,
detail=e.response.json().get("detail", str(e)),
)
except httpx.RequestError as e:
# Handle connection/timeout errors
raise HTTPException(
status_code=503, detail=f"Error connecting to model service: {str(e)}"
)
response = client.chat.completions.create(
model=req.model, messages=req.messages, stream=req.stream
)

model_response = SignedChatCompletion(
**response.dict(),
signature="",
)
# Update token usage
UserManager.update_token_usage(
user["userid"],
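
With the handler now relaying vLLM's OpenAI-style output, the streaming branch emits plain server-sent events (`data: {...}` lines). A rough client-side sketch for consuming that stream follows; the public hostname and bearer token are assumptions, and the payload mirrors the defaults in `chat_completion`:

```python
# Illustrative SSE consumer for the streaming /v1/chat/completions path.
import httpx

payload = {
    "model": "meta-llama/Llama-3.2-1B-Instruct",
    "messages": [{"role": "user", "content": "Hello, who are you?"}],
    "stream": True,
}

with httpx.stream(
    "POST",
    "https://nilai.sandbox.nilogy.xyz/v1/chat/completions",
    json=payload,
    headers={"Authorization": "Bearer <your-api-token>"},  # auth scheme assumed
    timeout=None,
) as response:
    response.raise_for_status()
    for line in response.iter_lines():
        if line.startswith("data: "):
            print(line[len("data: "):])  # each line carries one completion chunk as JSON
```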
19 changes: 19 additions & 0 deletions nilai-models/README.md
@@ -0,0 +1,19 @@
# Running vLLM without docker


```shell
# For Llama 8B
uv run bash run.sh \
--model meta-llama/Llama-3.1-8B-Instruct \
--gpu-memory-utilization 0.5 \
--max-model-len 10000 \
--tensor-parallel-size 1
```

```shell
# For Llama 1B
bash run.sh --model meta-llama/Llama-3.2-1B-Instruct \
--gpu-memory-utilization 0.2 \
--max-model-len 10000 \
--tensor-parallel-size 1
```
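
Once the server is up (with or without Docker), the OpenAI-compatible endpoints are served on port 8000. A minimal check, assuming the default host/port and that the 1B model was loaded:

```python
# Sketch: list the models served by the locally running vLLM server.
from openai import OpenAI

client = OpenAI(base_url="http://localhost:8000/v1", api_key="<not-needed>")
print([model.id for model in client.models.list()])
```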
16 changes: 0 additions & 16 deletions nilai-models/gunicorn.conf.py

This file was deleted.

9 changes: 1 addition & 8 deletions nilai-models/pyproject.toml
@@ -8,15 +8,8 @@ authors = [
]
requires-python = ">=3.12"
dependencies = [
"accelerate>=1.1.1",
"cryptography>=44.0.0",
"fastapi[standard]>=0.115.5",
"gunicorn>=23.0.0",
"httpx>=0.27.2",
"nilai-common",
"torch>=2.5.1",
"transformers>=4.46.3",
"uvicorn>=0.32.1",
"vllm>=0.6.6.post1",
]

[build-system]
16 changes: 16 additions & 0 deletions nilai-models/run.sh
@@ -0,0 +1,16 @@
#!/bin/bash

echo "Starting the primary process"
# Start the primary process and put it in the background
echo "Args: $*"
python3 -m vllm.entrypoints.openai.api_server $* & #--model $1 --gpu-memory-utilization 0.5 --max-model-len 10000 --tensor-parallel-size 1 &

echo "Starting the secondary process"
# Start the helper process
uv run python3 -m nilai_models.daemon

# Wait for any process to exit
wait -n

# Exit with status of process that exited first
exit $?
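
`run.sh` backgrounds the vLLM OpenAI server, then starts the `nilai_models.daemon` helper, and exits as soon as either process dies (`wait -n`). Since vLLM can take a while to download and load weights, a readiness poll along these lines can help before sending traffic; the `/v1/models` probe targets the standard vLLM OpenAI server on the exposed port 8000, but the retry parameters are arbitrary:

```python
# Sketch: block until the vLLM OpenAI-compatible server answers on /v1/models.
import time
import httpx

def wait_for_vllm(base_url: str = "http://localhost:8000", retries: int = 60) -> None:
    for _ in range(retries):
        try:
            if httpx.get(f"{base_url}/v1/models", timeout=2.0).status_code == 200:
                return
        except httpx.HTTPError:
            pass  # server not listening yet
        time.sleep(5)
    raise RuntimeError("vLLM server did not become ready in time")

wait_for_vllm()
```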