4 changes: 2 additions & 2 deletions caddy/Caddyfile
@@ -4,7 +4,7 @@
}
}

-https://nilai.sandbox.nilogy.xyz {
+https://gpu.nilai.sandbox.nilogy.xyz {
import ssl_config
reverse_proxy api:8443
}
}
95 changes: 95 additions & 0 deletions docker-compose.gpu.yml
@@ -0,0 +1,95 @@
services:
etcd:
image: 'bitnami/etcd:latest'
environment:
- ALLOW_NONE_AUTHENTICATION=yes
- ETCD_ADVERTISE_CLIENT_URLS=http://etcd:2379
healthcheck:
test: ["CMD", "etcdctl", "endpoint", "health"]
interval: 10s
timeout: 5s
retries: 3
start_period: 5s
networks:
- backend_net

api:
build:
context: .
dockerfile: docker/api.Dockerfile
target: nilai
depends_on:
etcd:
condition: service_healthy
volumes:
- ${PWD}/db/:/app/db/ # sqlite database for users
environment:
- ETCD_HOST=etcd
- ETCD_PORT=2379
networks:
- backend_net

llama_8b_gpu:
build:
context: .
dockerfile: docker/model.Dockerfile
target: nilai
args:
MODEL_NAME: "llama_8b_gpu"
runtime: nvidia
deploy:
resources:
reservations:
devices:
- driver: nvidia
count: all
capabilities: [gpu]
env_file:
- .env
depends_on:
etcd:
condition: service_healthy
environment:
- SVC_HOST=llama_8b_gpu
- SVC_PORT=8000
- ETCD_HOST=etcd
- ETCD_PORT=2379
volumes:
- hugging_face_models:/root/.cache/huggingface # cache models
networks:
- backend_net

llama_1b_gpu:
build:
context: .
dockerfile: docker/model.Dockerfile
target: nilai
args:
MODEL_NAME: "llama_1b_gpu"
runtime: nvidia
deploy:
resources:
reservations:
devices:
- driver: nvidia
count: all
capabilities: [gpu]
env_file:
- .env
depends_on:
etcd:
condition: service_healthy
environment:
- SVC_HOST=llama_1b_gpu
- SVC_PORT=8000
- ETCD_HOST=etcd
- ETCD_PORT=2379
volumes:
- hugging_face_models:/root/.cache/huggingface # cache models
networks:
- backend_net
volumes:
hugging_face_models:

networks:
backend_net:
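Note that both model services reserve every visible GPU (count: all) and rely on vLLM's gpu_memory_utilization cap to coexist on the same card. As a hedged alternative, assuming the host exposes at least two GPUs (indices 0 and 1), the Compose spec's device_ids field can pin each service to its own device; the reservation block for llama_8b_gpu would then look like the sketch below, with llama_1b_gpu using "1" instead.

    # Sketch only (not part of this PR): pin llama_8b_gpu to GPU 0 instead of reserving all GPUs.
    deploy:
      resources:
        reservations:
          devices:
            - driver: nvidia
              device_ids: ["0"]   # use only the first GPU (assumes index 0 exists)
              capabilities: [gpu]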
4 changes: 2 additions & 2 deletions nilai-models/gunicorn.conf.py
@@ -4,10 +4,10 @@
bind = "0.0.0.0:8000"

# Set the number of workers
-workers = 2
+workers = 1

# Set the number of threads per worker
-threads = 16
+threads = 4

# Set the timeout (120 seconds)
timeout = 120
2 changes: 1 addition & 1 deletion nilai-models/pyproject.toml
@@ -12,11 +12,11 @@ dependencies = [
    "cryptography>=44.0.0",
    "fastapi[standard]>=0.115.5",
    "gunicorn>=23.0.0",
-    "llama-cpp-python>=0.3.2",
    "nilai-common",
    "torch>=2.5.1",
    "transformers>=4.46.3",
    "uvicorn>=0.32.1",
+    "vllm>=0.6.6.post1",
]

[build-system]
13 changes: 12 additions & 1 deletion nilai-models/src/nilai_models/model.py
@@ -40,7 +40,7 @@ def __init__(self, metadata: ModelMetadata, prefix="/models"):
"""
# Store the model's metadata for later retrieval
self.metadata = metadata
-        self.url = f"http://{SETTINGS["host"]}:{SETTINGS["port"]}"
+        self.url = f"http://{SETTINGS['host']}:{SETTINGS['port']}"
self.endpoint = ModelEndpoint(url=self.url, metadata=self.metadata)
# Record the start time for uptime tracking
self._uptime = time.time()
@@ -54,6 +54,8 @@ async def lifespan(app: FastAPI):
keep_alive_task = None

try:
# Load models
self.load_models()
# Initialize discovery service
discovery_service = ModelServiceDiscovery(
host=SETTINGS["etcd_host"], port=SETTINGS["etcd_port"]
@@ -102,6 +104,15 @@ async def lifespan(app: FastAPI):
self._setup_routes()
return self.app

def load_models(self):
"""
Load the model(s) required for the service.

This method should be overridden by child classes to load
the specific model(s) required for the service.
"""
pass

def get_app(self) -> FastAPI:
"""
Retrieve the FastAPI application instance for the model.
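For reference, a minimal sketch of how a concrete service is expected to plug into this new hook. The class name ToyModel and the "gpt2" checkpoint are hypothetical; the base-class API (Model, ModelMetadata, and load_models being invoked once from lifespan at startup) is taken from this PR.

# Illustrative sketch only: a child class overriding Model.load_models().
from transformers import AutoTokenizer

from nilai_common import ModelMetadata
from nilai_models.model import Model


class ToyModel(Model):  # hypothetical subclass
    def __init__(self) -> None:
        super().__init__(
            ModelMetadata(
                id="toy-model",
                name="toy-model",
                version="1.0",
                description="Minimal example model.",
                author="Example",
                license="Apache 2.0",
                source="https://example.com/toy-model",
                supported_features=["chat_completion"],
            ),
        )

    def load_models(self):
        # Heavy artifacts are loaded here, once, when the FastAPI lifespan starts,
        # rather than in __init__ (which runs at import time).
        self.tokenizer = AutoTokenizer.from_pretrained("gpt2")  # hypothetical checkpoint


app = ToyModel().get_app()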
@@ -41,6 +41,15 @@ def __init__(self):
),
)

def load_models(self):
"""
Load the model(s) required for the service.

This method is called during model initialization to load the
specific model(s) required for the service at service startup.
"""
pass


# FastAPI app instance
app = Llama1BCpu().get_app()
3 changes: 3 additions & 0 deletions nilai-models/src/nilai_models/models/llama_1b_gpu/__init__.py
@@ -0,0 +1,3 @@
from nilai_models.models.llama_1b_gpu.llama_1b_gpu import app

__all__ = ["app"]
189 changes: 189 additions & 0 deletions nilai-models/src/nilai_models/models/llama_1b_gpu/llama_1b_gpu.py
@@ -0,0 +1,189 @@
import uuid
import time
import torch
import logging
import json
import asyncio
from typing import AsyncGenerator
from vllm.engine.async_llm_engine import AsyncLLMEngine
from vllm.engine.arg_utils import AsyncEngineArgs
from fastapi import HTTPException
from transformers import AutoTokenizer
from vllm import SamplingParams, RequestOutput
from fastapi.responses import StreamingResponse
from nilai_common import (
ChatRequest,
ChatResponse,
Message,
ModelMetadata,
Usage,
Choice,
ChatCompletionChunk,
ChoiceChunk,
ChoiceChunkContent,
)
from nilai_models.model import Model


class Llama1BGpu(Model):
"""
    A specific implementation of the Model base class for the Llama 1B GPU model.
"""

def __init__(self, load=True) -> None:
if not torch.cuda.is_available():
raise ValueError("Attempted to initialize GPU model on non-GPU machine")
super().__init__(
ModelMetadata(
id="Llama-3.2-1B-Instruct", # Unique identifier
name="Llama-3.2-1B-Instruct", # Human-readable name
version="1.0", # Model version
description="Llama is a large language model trained on supervised and unsupervised data.",
author="Meta-Llama", # Model creators
license="Apache 2.0", # Usage license
source="https://huggingface.co/meta-llama/Llama-3.2-1B-Instruct", # Model source
supported_features=["chat_completion"], # Capabilities
),
)

def load_models(self):
"""
Load the model(s) required for the service.

This method is called during model initialization to load the
specific model(s) required for the service at service startup.
"""
        engine_args = AsyncEngineArgs(
            model="meta-llama/Llama-3.2-1B-Instruct",
            gpu_memory_utilization=0.3,  # cap this engine at ~30% of GPU memory so co-located services can share the card
            max_model_len=60624,  # maximum context length the engine will accept
            tensor_parallel_size=torch.cuda.device_count(),  # shard the model across all visible GPUs
        )
self.llm_engine = AsyncLLMEngine.from_engine_args(engine_args)
self.tokenizer = AutoTokenizer.from_pretrained(
"meta-llama/Llama-3.2-1B-Instruct"
)

async def chat_completion(
self,
req: ChatRequest = ChatRequest(
# Default request with sample messages for documentation
model="meta-llama/Llama-3.2-1B-Instruct",
messages=[
Message(role="system", content="You are a helpful assistant."),
Message(role="user", content="What is your name?"),
],
),
) -> StreamingResponse | ChatResponse:
"""
Generate a chat completion using the Llama model, with optional streaming.

Args:
            req (ChatRequest): The chat request containing conversation messages.
                req.stream controls whether the response is streamed as server-sent events.

Returns:
ChatResponse or StreamingResponse: Either a full response or a streaming response.
"""
if not req.messages or len(req.messages) == 0:
raise HTTPException(
status_code=400, detail="The 'messages' field is required."
)
if not req.model:
raise HTTPException(
status_code=400, detail="The 'model' field is required."
)

        # Transform incoming messages into plain role/content dicts for the chat template
conversation = [
{
"role": msg.role, # Preserve message role (system/user/assistant)
"content": msg.content, # Preserve message content
}
for msg in req.messages
]

prompt = self.tokenizer.apply_chat_template(
conversation, tokenize=False, add_generation_prompt=True
)

sampling_params = SamplingParams(
            temperature=req.temperature if req.temperature is not None else 0.7,
            top_p=req.top_p if req.top_p is not None else 0.95,
            max_tokens=req.max_tokens if req.max_tokens is not None else 1024,
)

if req.stream:

async def generate() -> AsyncGenerator[str, None]:
try:
previous_generated_len = 0
async for chunk in self.llm_engine.generate(
prompt,
sampling_params=sampling_params,
request_id=str(uuid.uuid4()),
): # Generate chunks
                        current_text = chunk.outputs[0].text

                        # Get only the newly generated text by slicing from the previous length
                        new_text = current_text[previous_generated_len:]
                        previous_generated_len = len(current_text)
                        choice_chunk = ChoiceChunk(
                            index=0,
                            delta=ChoiceChunkContent(content=new_text),
                        )  # Create a ChoiceChunk
                        completion_chunk = ChatCompletionChunk(choices=[choice_chunk])
                        yield f"data: {completion_chunk.model_dump_json()}\n\n"  # Stream the chunk
                        await asyncio.sleep(0)  # Yield to the event loop so each chunk is flushed immediately

yield "data: [DONE]\n\n"
except Exception as e:
logging.error("An error occurred: %s", str(e))
yield f"data: {json.dumps({'error': 'Internal error occurred!'})}\n\n"

# Return the streamed response with headers
return StreamingResponse(generate(), media_type="text/event-stream")
# Non-streaming (regular) chat completion
try:
request_output: RequestOutput = None # type: ignore
async for chunk in self.llm_engine.generate(
prompt, sampling_params=sampling_params, request_id=str(uuid.uuid4())
):
request_output = chunk
generation = request_output.outputs[0].text
except ValueError:
raise HTTPException(
status_code=400,
                detail="The prompt exceeds the model's maximum context length.",
)
if not generation or len(generation) == 0:
raise ValueError("The model returned no output.")

response = ChatResponse(
signature="",
id="chatcmpl-" + str(uuid.uuid4()),
object="chat.completion",
created=int(time.time()),
model=req.model,
choices=[
Choice(
index=0,
message=Message(role="assistant", content=generation),
finish_reason="complete",
logprobs=None,
)
],
            usage=Usage(
                # Whitespace splits give only approximate token counts (no tokenizer round-trip)
                prompt_tokens=len(prompt.split()),
                completion_tokens=len(generation.split()),
                total_tokens=len(prompt.split()) + len(generation.split()),
            ),
)
return response


# Create and expose the FastAPI app for this Llama model
# - Calls get_app() from the base Model class
# - Allows easy integration with ASGI servers like uvicorn
app = Llama1BGpu().get_app()
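A hedged client sketch for consuming the stream the handler above produces. The base URL and the /v1/chat/completions path are assumptions (the actual route is registered in Model._setup_routes, which is not shown in this diff), and httpx is used only as an example client library; the "data: ..." framing and the choices/delta/content chunk shape match the generate() code above.

# Illustrative client sketch only; URL, path, and httpx usage are assumptions.
import json
import httpx

BASE_URL = "http://localhost:8000"  # hypothetical service address

payload = {
    "model": "meta-llama/Llama-3.2-1B-Instruct",
    "messages": [
        {"role": "system", "content": "You are a helpful assistant."},
        {"role": "user", "content": "What is your name?"},
    ],
    "stream": True,
}

with httpx.stream("POST", f"{BASE_URL}/v1/chat/completions", json=payload, timeout=None) as resp:
    for line in resp.iter_lines():
        if not line.startswith("data: "):
            continue  # skip keep-alive blank lines between SSE events
        data = line[len("data: "):]
        if data == "[DONE]":
            break
        chunk = json.loads(data)
        # Each chunk mirrors ChatCompletionChunk: choices[0].delta.content holds the new text
        print(chunk["choices"][0]["delta"]["content"], end="", flush=True)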
@@ -52,6 +52,15 @@ def __init__(self):
),
)

def load_models(self):
"""
Load the model(s) required for the service.

This method is called during model initialization to load the
specific model(s) required for the service at service startup.
"""
pass


# Create and expose the FastAPI app for this Llama model
# - Calls get_app() from the base Model class
3 changes: 3 additions & 0 deletions nilai-models/src/nilai_models/models/llama_8b_gpu/__init__.py
@@ -0,0 +1,3 @@
from nilai_models.models.llama_8b_gpu.llama_8b_gpu import app

__all__ = ["app"]