4 changes: 2 additions & 2 deletions caddy/Caddyfile
@@ -4,7 +4,7 @@
}
}

-https://nilai.sandbox.nilogy.xyz {
+https://gpu.nilai.sandbox.nilogy.xyz {
import ssl_config
reverse_proxy api:8443
}
}
95 changes: 95 additions & 0 deletions docker-compose.gpu.yml
@@ -0,0 +1,95 @@
services:
etcd:
image: 'bitnami/etcd:latest'
environment:
- ALLOW_NONE_AUTHENTICATION=yes
- ETCD_ADVERTISE_CLIENT_URLS=http://etcd:2379
healthcheck:
test: ["CMD", "etcdctl", "endpoint", "health"]
interval: 10s
timeout: 5s
retries: 3
start_period: 5s
networks:
- backend_net

api:
build:
context: .
dockerfile: docker/api.Dockerfile
target: nilai
depends_on:
etcd:
condition: service_healthy
volumes:
- ${PWD}/db/:/app/db/ # sqlite database for users
environment:
- ETCD_HOST=etcd
- ETCD_PORT=2379
networks:
- backend_net

llama_8b_gpu:
build:
context: .
dockerfile: docker/model.Dockerfile
target: nilai
args:
MODEL_NAME: "llama_8b_gpu"
runtime: nvidia
deploy:
resources:
reservations:
devices:
- driver: nvidia
count: all
capabilities: [gpu]
env_file:
- .env
depends_on:
etcd:
condition: service_healthy
environment:
- SVC_HOST=llama_8b_gpu
- SVC_PORT=8000
- ETCD_HOST=etcd
- ETCD_PORT=2379
volumes:
- hugging_face_models:/root/.cache/huggingface # cache models
networks:
- backend_net

llama_1b_gpu:
build:
context: .
dockerfile: docker/model.Dockerfile
target: nilai
args:
MODEL_NAME: "llama_1b_gpu"
runtime: nvidia
deploy:
resources:
reservations:
devices:
- driver: nvidia
count: all
capabilities: [gpu]
env_file:
- .env
depends_on:
etcd:
condition: service_healthy
environment:
- SVC_HOST=llama_1b_gpu
- SVC_PORT=8000
- ETCD_HOST=etcd
- ETCD_PORT=2379
volumes:
- hugging_face_models:/root/.cache/huggingface # cache models
networks:
- backend_net
volumes:
hugging_face_models:

networks:
backend_net:
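Note that both model services reserve every visible GPU (count: all) and rely on vLLM's gpu_memory_utilization cap to coexist on the same card. As a hedged alternative, assuming the host exposes at least two GPUs (indices 0 and 1), the Compose spec's device_ids field can pin each service to its own device; the reservation block for llama_8b_gpu would then look like the sketch below, with llama_1b_gpu using "1" instead.

    # Sketch only (not part of this PR): pin llama_8b_gpu to GPU 0 instead of reserving all GPUs.
    deploy:
      resources:
        reservations:
          devices:
            - driver: nvidia
              device_ids: ["0"]   # use only the first GPU (assumes index 0 exists)
              capabilities: [gpu]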
4 changes: 2 additions & 2 deletions nilai-models/gunicorn.conf.py
@@ -4,10 +4,10 @@
bind = "0.0.0.0:8000"

# Set the number of workers
-workers = 2
+workers = 1

# Set the number of threads per worker
-threads = 16
+threads = 4

# Set the timeout (120 seconds)
timeout = 120
2 changes: 1 addition & 1 deletion nilai-models/pyproject.toml
@@ -12,11 +12,11 @@ dependencies = [
    "cryptography>=44.0.0",
    "fastapi[standard]>=0.115.5",
    "gunicorn>=23.0.0",
-    "llama-cpp-python>=0.3.2",
    "nilai-common",
    "torch>=2.5.1",
    "transformers>=4.46.3",
    "uvicorn>=0.32.1",
+    "vllm>=0.6.6.post1",
]

[build-system]
13 changes: 12 additions & 1 deletion nilai-models/src/nilai_models/model.py
@@ -40,7 +40,7 @@ def __init__(self, metadata: ModelMetadata, prefix="/models"):
"""
# Store the model's metadata for later retrieval
self.metadata = metadata
-        self.url = f"http://{SETTINGS["host"]}:{SETTINGS["port"]}"
+        self.url = f"http://{SETTINGS['host']}:{SETTINGS['port']}"
self.endpoint = ModelEndpoint(url=self.url, metadata=self.metadata)
# Record the start time for uptime tracking
self._uptime = time.time()
@@ -54,6 +54,8 @@ async def lifespan(app: FastAPI):
keep_alive_task = None

try:
# Load models
self.load_models()
# Initialize discovery service
discovery_service = ModelServiceDiscovery(
host=SETTINGS["etcd_host"], port=SETTINGS["etcd_port"]
@@ -102,6 +104,15 @@ async def lifespan(app: FastAPI):
self._setup_routes()
return self.app

def load_models(self):
"""
Load the model(s) required for the service.

This method should be overridden by child classes to load
the specific model(s) required for the service.
"""
pass

def get_app(self) -> FastAPI:
"""
Retrieve the FastAPI application instance for the model.
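For reference, a minimal sketch of how a concrete service is expected to plug into this new hook. The class name ToyModel and the "gpt2" checkpoint are hypothetical; the base-class API (Model, ModelMetadata, and load_models being invoked once from lifespan at startup) is taken from this PR.

# Illustrative sketch only: a child class overriding Model.load_models().
from transformers import AutoTokenizer

from nilai_common import ModelMetadata
from nilai_models.model import Model


class ToyModel(Model):  # hypothetical subclass
    def __init__(self) -> None:
        super().__init__(
            ModelMetadata(
                id="toy-model",
                name="toy-model",
                version="1.0",
                description="Minimal example model.",
                author="Example",
                license="Apache 2.0",
                source="https://example.com/toy-model",
                supported_features=["chat_completion"],
            ),
        )

    def load_models(self):
        # Heavy artifacts are loaded here, once, when the FastAPI lifespan starts,
        # rather than in __init__ (which runs at import time).
        self.tokenizer = AutoTokenizer.from_pretrained("gpt2")  # hypothetical checkpoint


app = ToyModel().get_app()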
@@ -41,6 +41,15 @@ def __init__(self):
),
)

def load_models(self):
"""
Load the model(s) required for the service.

This method is called during model initialization to load the
specific model(s) required for the service at service startup.
"""
pass


# FastAPI app instance
app = Llama1BCpu().get_app()
3 changes: 3 additions & 0 deletions nilai-models/src/nilai_models/models/llama_1b_gpu/__init__.py
@@ -0,0 +1,3 @@
from nilai_models.models.llama_1b_gpu.llama_1b_gpu import app

__all__ = ["app"]
189 changes: 189 additions & 0 deletions nilai-models/src/nilai_models/models/llama_1b_gpu/llama_1b_gpu.py
@@ -0,0 +1,189 @@
import uuid
import time
import torch
import logging
import json
import asyncio
from typing import AsyncGenerator
from vllm.engine.async_llm_engine import AsyncLLMEngine
from vllm.engine.arg_utils import AsyncEngineArgs
from fastapi import HTTPException
from transformers import AutoTokenizer
from vllm import SamplingParams, RequestOutput
from fastapi.responses import StreamingResponse
from nilai_common import (
ChatRequest,
ChatResponse,
Message,
ModelMetadata,
Usage,
Choice,
ChatCompletionChunk,
ChoiceChunk,
ChoiceChunkContent,
)
from nilai_models.model import Model


class Llama1BGpu(Model):
"""
    A specific implementation of the Model base class for the Llama 1B GPU model.
"""

def __init__(self, load=True) -> None:
if not torch.cuda.is_available():
raise ValueError("Attempted to initialize GPU model on non-GPU machine")
super().__init__(
ModelMetadata(
id="Llama-3.2-1B-Instruct", # Unique identifier
name="Llama-3.2-1B-Instruct", # Human-readable name
version="1.0", # Model version
description="Llama is a large language model trained on supervised and unsupervised data.",
author="Meta-Llama", # Model creators
license="Apache 2.0", # Usage license
source="https://huggingface.co/meta-llama/Llama-3.2-1B-Instruct", # Model source
supported_features=["chat_completion"], # Capabilities
),
)

def load_models(self):
"""
Load the model(s) required for the service.

This method is called during model initialization to load the
specific model(s) required for the service at service startup.
"""
        engine_args = AsyncEngineArgs(
            model="meta-llama/Llama-3.2-1B-Instruct",
            gpu_memory_utilization=0.3,  # cap this engine at ~30% of GPU memory so co-located services can share the card
            max_model_len=60624,  # maximum context length the engine will accept
            tensor_parallel_size=torch.cuda.device_count(),  # shard the model across all visible GPUs
        )
self.llm_engine = AsyncLLMEngine.from_engine_args(engine_args)
self.tokenizer = AutoTokenizer.from_pretrained(
"meta-llama/Llama-3.2-1B-Instruct"
)

async def chat_completion(
self,
req: ChatRequest = ChatRequest(
# Default request with sample messages for documentation
model="meta-llama/Llama-3.2-1B-Instruct",
messages=[
Message(role="system", content="You are a helpful assistant."),
Message(role="user", content="What is your name?"),
],
),
) -> StreamingResponse | ChatResponse:
"""
Generate a chat completion using the Llama model, with optional streaming.

Args:
            req (ChatRequest): The chat request containing conversation messages.
                req.stream controls whether the response is streamed as server-sent events.

Returns:
ChatResponse or StreamingResponse: Either a full response or a streaming response.
"""
if not req.messages or len(req.messages) == 0:
raise HTTPException(
status_code=400, detail="The 'messages' field is required."
)
if not req.model:
raise HTTPException(
status_code=400, detail="The 'model' field is required."
)

        # Transform incoming messages into plain role/content dicts for the chat template
conversation = [
{
"role": msg.role, # Preserve message role (system/user/assistant)
"content": msg.content, # Preserve message content
}
for msg in req.messages
]

prompt = self.tokenizer.apply_chat_template(
conversation, tokenize=False, add_generation_prompt=True
)

sampling_params = SamplingParams(
            temperature=req.temperature if req.temperature is not None else 0.7,
            top_p=req.top_p if req.top_p is not None else 0.95,
            max_tokens=req.max_tokens if req.max_tokens is not None else 1024,
)

if req.stream:

async def generate() -> AsyncGenerator[str, None]:
try:
previous_generated_len = 0
async for chunk in self.llm_engine.generate(
prompt,
sampling_params=sampling_params,
request_id=str(uuid.uuid4()),
): # Generate chunks
                        current_text = chunk.outputs[0].text

                        # Get only the newly generated text by slicing from the previous length
                        new_text = current_text[previous_generated_len:]
                        previous_generated_len = len(current_text)
                        choice_chunk = ChoiceChunk(
                            index=0,
                            delta=ChoiceChunkContent(content=new_text),
                        )  # Create a ChoiceChunk
                        completion_chunk = ChatCompletionChunk(choices=[choice_chunk])
                        yield f"data: {completion_chunk.model_dump_json()}\n\n"  # Stream the chunk
                        await asyncio.sleep(0)  # Yield to the event loop so each chunk is flushed immediately

yield "data: [DONE]\n\n"
except Exception as e:
logging.error("An error occurred: %s", str(e))
yield f"data: {json.dumps({'error': 'Internal error occurred!'})}\n\n"

# Return the streamed response with headers
return StreamingResponse(generate(), media_type="text/event-stream")
# Non-streaming (regular) chat completion
try:
request_output: RequestOutput = None # type: ignore
async for chunk in self.llm_engine.generate(
prompt, sampling_params=sampling_params, request_id=str(uuid.uuid4())
):
request_output = chunk
generation = request_output.outputs[0].text
except ValueError:
raise HTTPException(
status_code=400,
                detail="The prompt exceeds the model's maximum context length.",
)
if not generation or len(generation) == 0:
raise ValueError("The model returned no output.")

response = ChatResponse(
signature="",
id="chatcmpl-" + str(uuid.uuid4()),
object="chat.completion",
created=int(time.time()),
model=req.model,
choices=[
Choice(
index=0,
message=Message(role="assistant", content=generation),
finish_reason="complete",
logprobs=None,
)
],
            usage=Usage(
                # Whitespace splits give only approximate token counts (no tokenizer round-trip)
                prompt_tokens=len(prompt.split()),
                completion_tokens=len(generation.split()),
                total_tokens=len(prompt.split()) + len(generation.split()),
            ),
)
return response


# Create and expose the FastAPI app for this Llama model
# - Calls get_app() from the base Model class
# - Allows easy integration with ASGI servers like uvicorn
app = Llama1BGpu().get_app()
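A hedged client sketch for consuming the stream the handler above produces. The base URL and the /v1/chat/completions path are assumptions (the actual route is registered in Model._setup_routes, which is not shown in this diff), and httpx is used only as an example client library; the "data: ..." framing and the choices/delta/content chunk shape match the generate() code above.

# Illustrative client sketch only; URL, path, and httpx usage are assumptions.
import json
import httpx

BASE_URL = "http://localhost:8000"  # hypothetical service address

payload = {
    "model": "meta-llama/Llama-3.2-1B-Instruct",
    "messages": [
        {"role": "system", "content": "You are a helpful assistant."},
        {"role": "user", "content": "What is your name?"},
    ],
    "stream": True,
}

with httpx.stream("POST", f"{BASE_URL}/v1/chat/completions", json=payload, timeout=None) as resp:
    for line in resp.iter_lines():
        if not line.startswith("data: "):
            continue  # skip keep-alive blank lines between SSE events
        data = line[len("data: "):]
        if data == "[DONE]":
            break
        chunk = json.loads(data)
        # Each chunk mirrors ChatCompletionChunk: choices[0].delta.content holds the new text
        print(chunk["choices"][0]["delta"]["content"], end="", flush=True)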
@@ -52,6 +52,15 @@ def __init__(self):
),
)

def load_models(self):
"""
Load the model(s) required for the service.

This method is called during model initialization to load the
specific model(s) required for the service at service startup.
"""
pass


# Create and expose the FastAPI app for this Llama model
# - Calls get_app() from the base Model class
3 changes: 3 additions & 0 deletions nilai-models/src/nilai_models/models/llama_8b_gpu/__init__.py
@@ -0,0 +1,3 @@
from nilai_models.models.llama_8b_gpu.llama_8b_gpu import app

__all__ = ["app"]