2 changes: 1 addition & 1 deletion caddy/Caddyfile
@@ -4,7 +4,7 @@
}
}

https://gpu.nilai.sandbox.nilogy.xyz {
https://nilai.sandbox.nilogy.xyz {
import ssl_config
reverse_proxy api:8443
}
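
The Caddy change only renames the public host (dropping the `gpu.` prefix); requests are still proxied to `api:8443`. A quick post-deploy sanity check might look like the sketch below — the `/v1/models` path comes from the API router in this PR, while the bearer token and its header are assumptions about how auth is wired up:

```python
# Hedged smoke test for the renamed host; the token value is a placeholder.
import httpx

resp = httpx.get(
    "https://nilai.sandbox.nilogy.xyz/v1/models",
    headers={"Authorization": "Bearer <your-api-token>"},  # auth scheme assumed
    timeout=10.0,
)
resp.raise_for_status()
print(resp.json())
```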
29 changes: 16 additions & 13 deletions docker-compose.gpu.yml
@@ -32,10 +32,7 @@ services:
llama_8b_gpu:
build:
context: .
dockerfile: docker/model.Dockerfile
target: nilai
args:
MODEL_NAME: "llama_8b_gpu"
dockerfile: docker/vllm.Dockerfile
runtime: nvidia
deploy:
resources:
@@ -44,28 +41,29 @@
- driver: nvidia
count: all
capabilities: [gpu]
env_file:
- .env
depends_on:
etcd:
condition: service_healthy
command: >
--model meta-llama/Llama-3.1-8B-Instruct
--gpu-memory-utilization 0.5
--max-model-len 10000
--tensor-parallel-size 1
environment:
- SVC_HOST=llama_8b_gpu
- SVC_PORT=8000
- ETCD_HOST=etcd
- ETCD_PORT=2379
env_file:
- .env
volumes:
- hugging_face_models:/root/.cache/huggingface # cache models
networks:
- backend_net

llama_1b_gpu:
build:
context: .
dockerfile: docker/model.Dockerfile
target: nilai
args:
MODEL_NAME: "llama_1b_gpu"
dockerfile: docker/vllm.Dockerfile
runtime: nvidia
deploy:
resources:
@@ -74,16 +72,21 @@
- driver: nvidia
count: all
capabilities: [gpu]
env_file:
- .env
depends_on:
etcd:
condition: service_healthy
command: >
--model meta-llama/Llama-3.2-1B-Instruct
--gpu-memory-utilization 0.2
--max-model-len 10000
--tensor-parallel-size 1
environment:
- SVC_HOST=llama_1b_gpu
- SVC_PORT=8000
- ETCD_HOST=etcd
- ETCD_PORT=2379
env_file:
- .env
volumes:
- hugging_face_models:/root/.cache/huggingface # cache models
networks:
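
Both GPU services now build from the shared `docker/vllm.Dockerfile` and pass their model, GPU-memory, and context-length flags through `command:`, so each container runs a stock vLLM OpenAI-compatible server on port 8000. Below is a minimal sketch of how another container on `backend_net` (for example the API) could talk to one of them, mirroring the `OpenAI(base_url=...)` pattern adopted in `private.py`; the hostname, port, and model name come from the compose file, the rest is illustrative:

```python
# Sketch only: assumes it runs inside a container attached to backend_net.
from openai import OpenAI

client = OpenAI(base_url="http://llama_1b_gpu:8000/v1/", api_key="<not-needed>")

completion = client.chat.completions.create(
    model="meta-llama/Llama-3.2-1B-Instruct",  # must match the --model flag above
    messages=[{"role": "user", "content": "Say hello."}],
)
print(completion.choices[0].message.content)
```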
29 changes: 29 additions & 0 deletions docker/vllm.Dockerfile
@@ -0,0 +1,29 @@
FROM vllm/vllm-openai:latest

# # Specify model name and path during build
# ARG MODEL_NAME=llama_1b_cpu
# ARG MODEL_PATH=meta-llama/Llama-3.1-8B-Instruct

# # Set environment variables
# ENV MODEL_NAME=${MODEL_NAME}
# ENV MODEL_PATH=${MODEL_PATH}
# ENV EXEC_PATH=nilai_models.models.${MODEL_NAME}:app

COPY --link . /daemon/

WORKDIR /daemon/nilai-models/

RUN apt-get update && \
apt-get install build-essential -y && \
pip install uv && \
uv sync && \
apt-get clean && \
apt-get autoremove && \
rm -rf /var/lib/apt/lists/*

# Expose port 8000 for incoming requests
EXPOSE 8000

ENTRYPOINT ["bash", "run.sh"]

CMD [""]
1 change: 1 addition & 0 deletions nilai-api/pyproject.toml
@@ -22,6 +22,7 @@ dependencies = [
"httpx>=0.27.2",
"nilrag>=0.1.2",
"nilql>=0.0.0a3",
"openai>=1.59.9",
]


114 changes: 59 additions & 55 deletions nilai-api/src/nilai_api/routers/private.py
@@ -1,26 +1,25 @@
# Fast API and serving
import logging
import os
import asyncio
from base64 import b64encode
from typing import AsyncGenerator, Union
from typing import AsyncGenerator, Union, List
import numpy as np

import nilql
import nilrag
import httpx
from fastapi import APIRouter, Body, Depends, HTTPException
from fastapi.responses import StreamingResponse
from nilai_api.auth import get_user
from nilai_api.crypto import sign_message
from nilai_api.db import UserManager
from nilai_api.state import state
from openai import OpenAI

# Internal libraries
from nilai_common import (
AttestationResponse,
ChatRequest,
ChatResponse,
SignedChatCompletion,
Message,
ModelMetadata,
Usage,
@@ -79,7 +78,7 @@ async def get_attestation(user: dict = Depends(get_user)) -> AttestationResponse


@router.get("/v1/models", tags=["Model"])
async def get_models(user: dict = Depends(get_user)) -> list[ModelMetadata]:
async def get_models(user: dict = Depends(get_user)) -> List[ModelMetadata]:
"""
List all available models in the system.

@@ -94,21 +93,30 @@ async def get_models(user: dict = Depends(get_user)) -> list[ModelMetadata]:
"""
logger.info(f"Retrieving models for user {user['userid']} from pid {os.getpid()}")
return [endpoint.metadata for endpoint in (await state.models).values()]
# result = [Model(
# id = endpoint.metadata.id,
# created = 0,
# object = "model",
# owned_by = endpoint.metadata.author,
# data = endpoint.metadata.dict(),
# ) for endpoint in (await state.models).values()]

# return result[0]


@router.post("/v1/chat/completions", tags=["Chat"], response_model=None)
async def chat_completion(
req: ChatRequest = Body(
ChatRequest(
model="Llama-3.2-1B-Instruct",
model="meta-llama/Llama-3.2-1B-Instruct",
messages=[
Message(role="system", content="You are a helpful assistant."),
Message(role="user", content="What is your name?"),
],
)
),
user: dict = Depends(get_user),
) -> Union[ChatResponse, StreamingResponse]:
) -> Union[SignedChatCompletion, StreamingResponse]:
"""
Generate a chat completion response from the AI model.

@@ -144,7 +152,7 @@ async def chat_completion(
```python
# Generate a chat completion
request = ChatRequest(
model="Llama-3.2-1B-Instruct",
model="meta-llama/Llama-3.2-1B-Instruct",
messages=[
{"role": "system", "content": "You are a helpful assistant"},
{"role": "user", "content": "Hello, who are you?"}
@@ -157,10 +165,17 @@ async def chat_completion(
endpoint = await state.get_model(model_name)
if endpoint is None:
raise HTTPException(
status_code=400, detail="Invalid model name, check /v1/models for options"
status_code=400,
detail=f"Invalid model name {model_name}, check /v1/models for options",
)

model_url = endpoint.url
model_url = endpoint.url + "/v1/"

logger.info(
f"Chat completion request for model {model_name} from user {user['userid']} on url: {model_url}"
)

client = OpenAI(base_url=model_url, api_key="<not-needed>")

if req.nilrag:
"""
@@ -282,60 +297,49 @@ async def chat_completion(

if req.stream:
# Forwarding Streamed Responses
async def stream_response() -> AsyncGenerator[str, None]:
async def chat_completion_stream_generator() -> AsyncGenerator[str, None]:
try:
async with httpx.AsyncClient() as client:
async with client.stream(
"POST",
f"{model_url}/v1/chat/completions",
json=req.model_dump(),
timeout=None,
) as response:
response.raise_for_status() # Raise an error for invalid status codes

# Process the streamed response chunks
async for chunk in response.aiter_lines():
if chunk: # Skip empty lines
yield f"{chunk}\n"
await asyncio.sleep(
0
) # Add an await to return immediately
except httpx.HTTPStatusError as e:
raise HTTPException(
status_code=e.response.status_code,
detail=e.response.json().get("detail", str(e)),
)
except httpx.RequestError as e:
raise HTTPException(
status_code=503,
detail=f"Error connecting to model service: {str(e)}",
response = client.chat.completions.create(
model=req.model,
messages=req.messages,
stream=req.stream,
extra_body={
"stream_options": {
"include_usage": True,
# "continuous_usage_stats": True,
}
},
)

for chunk in response:
if chunk.usage is not None:
UserManager.update_token_usage(
user["userid"],
prompt_tokens=chunk.usage.prompt_tokens,
completion_tokens=chunk.usage.completion_tokens,
)
else:
data = chunk.model_dump_json(exclude_unset=True)
yield f"data: {data}\n\n"

except Exception as e:
logger.error(f"Error streaming response: {e}")
return

# Return the streaming response
return StreamingResponse(
stream_response(),
chat_completion_stream_generator(),
media_type="text/event-stream", # Ensure client interprets as Server-Sent Events
)

try:
async with httpx.AsyncClient() as client:
response = await client.post(
f"{model_url}/v1/chat/completions", json=req.model_dump(), timeout=None
)
response.raise_for_status()
model_response = ChatResponse.model_validate_json(response.content)
except httpx.HTTPStatusError as e:
# Forward the original error from the model
raise HTTPException(
status_code=e.response.status_code,
detail=e.response.json().get("detail", str(e)),
)
except httpx.RequestError as e:
# Handle connection/timeout errors
raise HTTPException(
status_code=503, detail=f"Error connecting to model service: {str(e)}"
)
response = client.chat.completions.create(
model=req.model, messages=req.messages, stream=req.stream
)

model_response = SignedChatCompletion(
**response.dict(),
signature="",
)
# Update token usage
UserManager.update_token_usage(
user["userid"],
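
With the handler now relaying vLLM's OpenAI-style output, the streaming branch emits plain server-sent events (`data: {...}` lines). A rough client-side sketch for consuming that stream follows; the public hostname and bearer token are assumptions, and the payload mirrors the defaults in `chat_completion`:

```python
# Illustrative SSE consumer for the streaming /v1/chat/completions path.
import httpx

payload = {
    "model": "meta-llama/Llama-3.2-1B-Instruct",
    "messages": [{"role": "user", "content": "Hello, who are you?"}],
    "stream": True,
}

with httpx.stream(
    "POST",
    "https://nilai.sandbox.nilogy.xyz/v1/chat/completions",
    json=payload,
    headers={"Authorization": "Bearer <your-api-token>"},  # auth scheme assumed
    timeout=None,
) as response:
    response.raise_for_status()
    for line in response.iter_lines():
        if line.startswith("data: "):
            print(line[len("data: "):])  # each line carries one completion chunk as JSON
```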
19 changes: 19 additions & 0 deletions nilai-models/README.md
@@ -0,0 +1,19 @@
# Running vLLM without docker


```shell
# For Llama 8B
uv run bash run.sh \
--model meta-llama/Llama-3.1-8B-Instruct \
--gpu-memory-utilization 0.5 \
--max-model-len 10000 \
--tensor-parallel-size 1
```

```shell
# For Llama 1B
bash run.sh --model meta-llama/Llama-3.2-1B-Instruct \
--gpu-memory-utilization 0.2 \
--max-model-len 10000 \
--tensor-parallel-size 1
```
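
Once the server is up (with or without Docker), the OpenAI-compatible endpoints are served on port 8000. A minimal check, assuming the default host/port and that the 1B model was loaded:

```python
# Sketch: list the models served by the locally running vLLM server.
from openai import OpenAI

client = OpenAI(base_url="http://localhost:8000/v1", api_key="<not-needed>")
print([model.id for model in client.models.list()])
```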
16 changes: 0 additions & 16 deletions nilai-models/gunicorn.conf.py

This file was deleted.

9 changes: 1 addition & 8 deletions nilai-models/pyproject.toml
@@ -8,15 +8,8 @@ authors = [
]
requires-python = ">=3.12"
dependencies = [
"accelerate>=1.1.1",
"cryptography>=44.0.0",
"fastapi[standard]>=0.115.5",
"gunicorn>=23.0.0",
"httpx>=0.27.2",
"nilai-common",
"torch>=2.5.1",
"transformers>=4.46.3",
"uvicorn>=0.32.1",
"vllm>=0.6.6.post1",
]

[build-system]
16 changes: 16 additions & 0 deletions nilai-models/run.sh
@@ -0,0 +1,16 @@
#!/bin/bash

echo "Starting the primary process"
# Start the primary process and put it in the background
echo "Args: $*"
python3 -m vllm.entrypoints.openai.api_server $* & #--model $1 --gpu-memory-utilization 0.5 --max-model-len 10000 --tensor-parallel-size 1 &

echo "Starting the secondary process"
# Start the helper process
uv run python3 -m nilai_models.daemon

# Wait for any process to exit
wait -n

# Exit with status of process that exited first
exit $?
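
`run.sh` backgrounds the vLLM OpenAI server, then starts the `nilai_models.daemon` helper, and exits as soon as either process dies (`wait -n`). Since vLLM can take a while to download and load weights, a readiness poll along these lines can help before sending traffic; the `/v1/models` probe targets the standard vLLM OpenAI server on the exposed port 8000, but the retry parameters are arbitrary:

```python
# Sketch: block until the vLLM OpenAI-compatible server answers on /v1/models.
import time
import httpx

def wait_for_vllm(base_url: str = "http://localhost:8000", retries: int = 60) -> None:
    for _ in range(retries):
        try:
            if httpx.get(f"{base_url}/v1/models", timeout=2.0).status_code == 200:
                return
        except httpx.HTTPError:
            pass  # server not listening yet
        time.sleep(5)
    raise RuntimeError("vLLM server did not become ready in time")

wait_for_vllm()
```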