diff --git a/.env.sample b/.env.sample
index d4eb325a..e0f6f9db 100644
--- a/.env.sample
+++ b/.env.sample
@@ -1 +1,5 @@
-HUGGINGFACE_API_TOKEN=""
\ No newline at end of file
+HUGGINGFACE_API_TOKEN=""
+OMP_NUM_THREADS=8
+MKL_NUM_THREADS=8
+NUMEXPR_NUM_THREADS=8
+OPENBLAS_NUM_THREADS=8
\ No newline at end of file
diff --git a/README.md b/README.md
index d8817bd1..1ee3fa90 100644
--- a/README.md
+++ b/README.md
@@ -2,6 +2,17 @@
 
 Copy the `.env.sample` to `.env` and replace the value of the `HUGGINGFACE_API_TOKEN` with the appropriate value. The token is required to download Llama 3.2 1B.
 
+For development environments:
 ```shell
-docker compose up --build web
+docker compose -f docker-compose.yml -f docker-compose.dev.yml up --build nilai
+```
+
+For production environments:
+```shell
+docker compose -f docker-compose.yml -f docker-compose.prod.yml up -d
+```
+
+To run the server directly, outside Docker:
+```shell
+uv run gunicorn -c gunicorn.conf.py nilai.__main__:app
 ```
\ No newline at end of file
diff --git a/caddy/.gitignore b/caddy/.gitignore
new file mode 100644
index 00000000..fbc7f1cd
--- /dev/null
+++ b/caddy/.gitignore
@@ -0,0 +1,2 @@
+caddy_config/
+caddy_data/
\ No newline at end of file
diff --git a/caddy/Caddyfile b/caddy/Caddyfile
new file mode 100644
index 00000000..7bdb0c8a
--- /dev/null
+++ b/caddy/Caddyfile
@@ -0,0 +1,10 @@
+(ssl_config) {
+	tls {
+		protocols tls1.2 tls1.3
+	}
+}
+
+https://nilai.sandbox.nilogy.xyz {
+	import ssl_config
+	reverse_proxy nilai:8443
+}
\ No newline at end of file
diff --git a/docker-compose.dev.yml b/docker-compose.dev.yml
new file mode 100644
index 00000000..e20e75ac
--- /dev/null
+++ b/docker-compose.dev.yml
@@ -0,0 +1,4 @@
+services:
+  nilai:
+    ports:
+      - "8080:8080"
\ No newline at end of file
diff --git a/docker-compose.prod.yml b/docker-compose.prod.yml
new file mode 100644
index 00000000..774676a6
--- /dev/null
+++ b/docker-compose.prod.yml
@@ -0,0 +1,21 @@
+services:
+  nilai:
+    networks:
+      - proxy_net
+  caddy:
+    image: caddy:latest
+    container_name: caddy
+    restart: unless-stopped
+    networks:
+      - proxy_net
+    ports:
+      - "80:80"
+      - "443:443"
+      - "443:443/udp"
+    volumes:
+      - ./caddy/Caddyfile:/etc/caddy/Caddyfile
+      - ./caddy/caddy_data:/data
+      - ./caddy/caddy_config:/config
+
+networks:
+  proxy_net:
\ No newline at end of file
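A note on the compose split: the base file no longer publishes ports; `docker-compose.dev.yml` exposes the app directly on 8080, while `docker-compose.prod.yml` keeps nilai on an internal network behind Caddy. A quick way to verify how an overlay merges with the base file before bringing anything up is to render the combined config — a minimal sketch, assuming only the files in this diff and a working `docker compose`:

```python
import subprocess

# Render the merged dev configuration without starting containers.
merged = subprocess.run(
    [
        "docker", "compose",
        "-f", "docker-compose.yml",
        "-f", "docker-compose.dev.yml",
        "config",
    ],
    capture_output=True,
    text=True,
    check=True,
)
print(merged.stdout)  # expect nilai to publish 8080 and no caddy service
```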
diff --git a/docker-compose.yml b/docker-compose.yml
index 24f88723..8cc45612 100644
--- a/docker-compose.yml
+++ b/docker-compose.yml
@@ -3,11 +3,8 @@ services:
     build:
       context: .
       dockerfile: docker/Dockerfile
-    ports:
-      - "12345:12345"
     volumes:
       - ${PWD}/db/:/app/db/ # sqlite database for users
       - hugging_face_models:/root/.cache/huggingface # cache models
-
 volumes:
   hugging_face_models:
\ No newline at end of file
diff --git a/docker/Dockerfile b/docker/Dockerfile
index 24c2076a..0b59665f 100644
--- a/docker/Dockerfile
+++ b/docker/Dockerfile
@@ -1,16 +1,18 @@
 FROM python:3.12-slim
 
 COPY --link nilai /app/nilai
-COPY pyproject.toml uv.lock .env /app/
+COPY pyproject.toml uv.lock .env gunicorn.conf.py /app/
 
 WORKDIR /app
 
-RUN pip install uv
-RUN uv sync
+RUN apt-get update && \
+    apt-get install build-essential certbot -y && \
+    apt-get clean && \
+    apt-get autoremove && \
+    rm -rf /var/lib/apt/lists/* && \
+    pip install uv && \
+    uv sync
 
-EXPOSE 12345
+EXPOSE 8080 8443
 
-# ENTRYPOINT ["uv", "run", "fastapi", "run", "nilai/main.py"]
-# CMD ["--host", "0.0.0.0", "--port", "12345"]
-
-CMD ["uv", "run", "fastapi", "run", "nilai/main.py", "--host", "0.0.0.0", "--port", "12345"]
\ No newline at end of file
+CMD ["uv", "run", "gunicorn", "-c", "gunicorn.conf.py", "nilai.__main__:app"]
\ No newline at end of file
diff --git a/docker/README.md b/docker/README.md
index e82afc29..b2d13475 100644
--- a/docker/README.md
+++ b/docker/README.md
@@ -4,7 +4,8 @@
 docker build -t nillion/nilai:latest -f docker/Dockerfile .
 
 docker run \
-  -p 12345:12345 \
+  -p 8080:8080 \
+  -p 8443:8443 \
   -v hugging_face_models:/root/.cache/huggingface \
   -v $(pwd)/users.sqlite:/app/users.sqlite \
   nillion/nilai:latest
diff --git a/gunicorn.conf.py b/gunicorn.conf.py
new file mode 100644
index 00000000..bcd2cb93
--- /dev/null
+++ b/gunicorn.conf.py
@@ -0,0 +1,16 @@
+# gunicorn.conf.py
+
+# Bind both application ports (TLS is terminated by Caddy in production)
+bind = ["0.0.0.0:8080", "0.0.0.0:8443"]
+
+# Number of worker processes
+workers = 2
+
+# Threads per worker
+threads = 16
+
+# Worker timeout in seconds
+timeout = 120
+
+# UvicornWorker serves the FastAPI app as ASGI
+worker_class = "uvicorn.workers.UvicornWorker"
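On the gunicorn settings: each worker is a separate process, so two workers load two copies of the GGUF model; also, gunicorn's `threads` setting only affects the gthread worker type, so it likely has no effect under `UvicornWorker`. A hypothetical variant of the config that sizes workers from the host (a sketch, not the file added above):

```python
# A hypothetical gunicorn.conf.py variant (not the file in this diff).
import multiprocessing

bind = ["0.0.0.0:8080", "0.0.0.0:8443"]

# Heuristic: one worker per 8 cores, at least 2; every worker holds its
# own copy of the model in memory, so scale with care.
workers = max(2, multiprocessing.cpu_count() // 8)

timeout = 120
worker_class = "uvicorn.workers.UvicornWorker"
```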
diff --git a/nilai/__main__.py b/nilai/__main__.py
new file mode 100644
index 00000000..17023698
--- /dev/null
+++ b/nilai/__main__.py
@@ -0,0 +1,21 @@
+import uvicorn
+
+from nilai.app import app  # re-exported so gunicorn can target nilai.__main__:app
+
+
+def run_uvicorn():
+    """
+    Run the app with Uvicorn for debugging.
+    """
+    uvicorn.run(
+        "nilai.app:app",  # an import string is required for reload to take effect
+        host="0.0.0.0",  # Listen on all interfaces
+        port=8080,
+        reload=True,  # Enable auto-reload for development
+        # ssl_certfile=SSL_CERTFILE,
+        # ssl_keyfile=SSL_KEYFILE,
+    )
+
+
+if __name__ == "__main__":
+    run_uvicorn()
diff --git a/nilai/main.py b/nilai/app.py
similarity index 69%
rename from nilai/main.py
rename to nilai/app.py
index 08cd5d9d..5e0af62e 100644
--- a/nilai/main.py
+++ b/nilai/app.py
@@ -35,24 +35,13 @@
             "name": "Model",
             "description": "Model information",
         },
+        {
+            "name": "Usage",
+            "description": "User token usage",
+        },
     ],
 )
 
 app.include_router(public.router)
 app.include_router(private.router, dependencies=[Depends(get_user)])
-
-
-if __name__ == "__main__":
-    import uvicorn
-
-    # Path to your SSL certificate and key files
-    # SSL_CERTFILE = "/path/to/certificate.pem"  # Replace with your certificate file path
-    # SSL_KEYFILE = "/path/to/private-key.pem"  # Replace with your private key file path
-
-    uvicorn.run(
-        app,
-        host="0.0.0.0",  # Listen on all interfaces
-        port=12345,  # Use port 8443 for HTTPS
-        # ssl_certfile=SSL_CERTFILE,
-        # ssl_keyfile=SSL_KEYFILE,
-    )
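With the server block gone, `nilai.app` is now a plain importable module, so the app can be exercised in-process. A sketch using FastAPI's `TestClient` with the API-key dependency stubbed out — note that importing `nilai.app` pulls in `nilai.state`, which downloads and loads the model:

```python
from fastapi.testclient import TestClient

from nilai.app import app  # importing this loads the model via nilai.state
from nilai.auth import get_user

# Replace API-key auth with a stub user (the userid here is made up).
app.dependency_overrides[get_user] = lambda: {
    "name": "test",
    "userid": "00000000-0000-0000-0000-000000000000",
}

client = TestClient(app)
print(client.get("/v1/models").json())
```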
diff --git a/nilai/db.py b/nilai/db.py
index 49f54441..f39d6e12 100644
--- a/nilai/db.py
+++ b/nilai/db.py
@@ -53,8 +53,8 @@ class User(Base):
     userid = Column(String(36), primary_key=True, index=True)
     name = Column(String(100), nullable=False)
     apikey = Column(String(36), unique=True, nullable=False, index=True)
-    input_tokens = Column(Integer, default=0, nullable=False)
-    generated_tokens = Column(Integer, default=0, nullable=False)
+    prompt_tokens = Column(Integer, default=0, nullable=False)
+    completion_tokens = Column(Integer, default=0, nullable=False)
 
     def __repr__(self):
         return f"<User(name={self.name}, userid={self.userid})>"
@@ -146,7 +146,7 @@ def insert_user(name: str) -> Dict[str, str]:
             raise
 
     @staticmethod
-    def check_api_key(api_key: str) -> Optional[str]:
+    def check_api_key(api_key: str) -> Optional[dict]:
         """
         Validate an API key.
 
@@ -159,33 +159,55 @@
         try:
             with get_db_session() as session:
                 user = session.query(User).filter(User.apikey == api_key).first()
-                return user.name if user else None  # type: ignore
+                return {"name": user.name, "userid": user.userid} if user else None  # type: ignore
         except SQLAlchemyError as e:
             logger.error(f"Error checking API key: {e}")
             return None
 
     @staticmethod
-    def update_token_usage(userid: str, input_tokens: int, generated_tokens: int):
+    def update_token_usage(userid: str, prompt_tokens: int, completion_tokens: int):
         """
         Update token usage for a specific user.
 
         Args:
             userid (str): User's unique ID
-            input_tokens (int): Number of input tokens
-            generated_tokens (int): Number of generated tokens
+            prompt_tokens (int): Number of prompt tokens consumed
+            completion_tokens (int): Number of completion tokens generated
         """
         try:
             with get_db_session() as session:
                 user = session.query(User).filter(User.userid == userid).first()
                 if user:
-                    user.input_tokens += input_tokens  # type: ignore
-                    user.generated_tokens += generated_tokens  # type: ignore
+                    user.prompt_tokens += prompt_tokens  # type: ignore
+                    user.completion_tokens += completion_tokens  # type: ignore
                     logger.info(f"Updated token usage for user {userid}")
                 else:
                     logger.warning(f"User {userid} not found")
         except SQLAlchemyError as e:
             logger.error(f"Error updating token usage: {e}")
 
+    @staticmethod
+    def get_token_usage(userid: str) -> Dict[str, Any] | None:
+        """
+        Get token usage for a specific user.
+
+        Args:
+            userid (str): User's unique ID
+        """
+        try:
+            with get_db_session() as session:
+                user = session.query(User).filter(User.userid == userid).first()
+                if user:
+                    return {
+                        "prompt_tokens": user.prompt_tokens,
+                        "completion_tokens": user.completion_tokens,
+                        "total_tokens": user.prompt_tokens + user.completion_tokens,
+                    }
+                else:
+                    logger.warning(f"User {userid} not found")
+        except SQLAlchemyError as e:
+            logger.error(f"Error retrieving token usage: {e}")
+
     @staticmethod
     def get_all_users() -> Optional[List[UserData]]:
         """
@@ -202,8 +228,8 @@ def get_all_users() -> Optional[List[UserData]]:
                     userid=user.userid,  # type: ignore
                     name=user.name,  # type: ignore
                     apikey=user.apikey,  # type: ignore
-                    input_tokens=user.input_tokens,  # type: ignore
-                    generated_tokens=user.generated_tokens,  # type: ignore
+                    input_tokens=user.prompt_tokens,  # type: ignore
+                    generated_tokens=user.completion_tokens,  # type: ignore
                 )
                 for user in users
             ]
@@ -227,8 +253,8 @@ def get_user_token_usage(userid: str) -> Optional[Dict[str, int]]:
                 user = session.query(User).filter(User.userid == userid).first()
                 if user:
                     return {
-                        "input_tokens": user.input_tokens,
-                        "generated_tokens": user.generated_tokens,
+                        "prompt_tokens": user.prompt_tokens,
+                        "completion_tokens": user.completion_tokens,
                     }  # type: ignore
                 return None
         except SQLAlchemyError as e:
@@ -255,6 +281,8 @@ def get_user_token_usage(userid: str) -> Optional[Dict[str, int]]:
     print(f"API key validation: {user_name}")
 
     # Update and retrieve token usage
-    UserManager.update_token_usage(bob["userid"], input_tokens=50, generated_tokens=20)
+    UserManager.update_token_usage(
+        bob["userid"], prompt_tokens=50, completion_tokens=20
+    )
     usage = UserManager.get_user_token_usage(bob["userid"])
     print(f"Bob's token usage: {usage}")
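The rename from `input_tokens`/`generated_tokens` to the OpenAI-style `prompt_tokens`/`completion_tokens` is not applied to existing SQLite files — SQLAlchemy's table creation will not rename columns in place. Something like the following one-off migration would be needed; the table name and database path are assumptions (check `User.__tablename__` and the compose volume mount):

```python
import sqlite3

# Hypothetical one-off migration; requires SQLite >= 3.25 for RENAME COLUMN.
conn = sqlite3.connect("db/users.sqlite")  # path assumed from the compose volume
with conn:
    conn.execute("ALTER TABLE users RENAME COLUMN input_tokens TO prompt_tokens")
    conn.execute(
        "ALTER TABLE users RENAME COLUMN generated_tokens TO completion_tokens"
    )
conn.close()
```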
diff --git a/nilai/routers/private.py b/nilai/routers/private.py
index 10edc751..853daa40 100644
--- a/nilai/routers/private.py
+++ b/nilai/routers/private.py
@@ -1,20 +1,17 @@
 # Fast API and serving
-import time
 from base64 import b64encode
-from typing import Any, List
-from uuid import uuid4
 
 from fastapi import APIRouter, Body, Depends, HTTPException
 
 from nilai.auth import get_user
 from nilai.crypto import sign_message
+from nilai.db import UserManager
 
 # Internal libraries
 from nilai.model import (
     AttestationResponse,
     ChatRequest,
     ChatResponse,
-    Choice,
     Message,
     Model,
     Usage,
@@ -24,6 +21,11 @@
 router = APIRouter()
 
 
+@router.get("/v1/usage", tags=["Usage"])
+async def get_usage(user: dict = Depends(get_user)) -> Usage:
+    return Usage(**UserManager.get_token_usage(user["userid"]))
+
+
 # Model Information Endpoint
 @router.get("/v1/model-info", tags=["Model"])
 async def get_model_info(user: str = Depends(get_user)) -> dict:
@@ -37,7 +39,7 @@ async def get_model_info(user: str = Depends(get_user)) -> dict:
 
 # Attestation Report Endpoint
 @router.get("/v1/attestation/report", tags=["Attestation"])
-async def get_attestation(user: str = Depends(get_user)) -> AttestationResponse:
+async def get_attestation(user: dict = Depends(get_user)) -> AttestationResponse:
     return AttestationResponse(
         verifying_key=state.verifying_key,
         cpu_attestation="...",
@@ -47,13 +49,13 @@
 
 # Available Models Endpoint
 @router.get("/v1/models", tags=["Model"])
-async def get_models(user: str = Depends(get_user)) -> dict[str, list[Model]]:
+async def get_models(user: dict = Depends(get_user)) -> dict[str, list[Model]]:
     return {"models": state.models}
 
 
 # Chat Completion Endpoint
 @router.post("/v1/chat/completions", tags=["Chat"])
-def chat_completion(
+async def chat_completion(
     req: ChatRequest = Body(
         ChatRequest(
             model=state.models[0].name,
@@ -63,17 +65,14 @@
             messages=[
             ],
         )
     ),
-    user: str = Depends(get_user),
+    user: dict = Depends(get_user),
 ) -> ChatResponse:
     if not req.messages or len(req.messages) == 0:
         raise HTTPException(status_code=400, detail="The 'messages' field is required.")
-
     if not req.model:
         raise HTTPException(status_code=400, detail="The 'model' field is required.")
 
     # Combine messages into a single prompt
-    print(req)
-    prompt = "\n".join([f"{msg.role}: {msg.content}" for msg in req.messages])
     prompt = [
@@ -83,38 +82,22 @@
         {
             "role": msg.role,
             "content": msg.content,
         }
         for msg in req.messages
     ]
 
     # Generate response
-    generated: List[Any] = state.chat_pipeline(
-        prompt, max_length=1024, num_return_sequences=1, truncation=True
-    )  # type: ignore
-    print(type(generated))
+    generated: dict = state.chat_pipeline.create_chat_completion(prompt)
 
     if not generated or len(generated) == 0:
         raise HTTPException(status_code=500, detail="The model returned no output.")
 
-    response = generated[0]["generated_text"][-1]
-    print(f"Prompt: {prompt}, Response: {response}")
-    usage = Usage(
-        prompt_tokens=sum(len(msg.content.split()) for msg in req.messages),
-        completion_tokens=len(response["content"].split()),
-        total_tokens=0,
-    )
-    usage.total_tokens = usage.prompt_tokens + usage.completion_tokens
     response = ChatResponse(
-        id=f"chat-{uuid4()}",
-        object="chat.completion",
-        created=int(time.time()),
-        model=req.model,
-        choices=[
-            Choice(
-                index=0,
-                message=Message(**response),
-                finish_reason="stop",
-                logprobs=None,
-            )
-        ],
-        usage=usage,
-        signature="",  # Will be filled later
+        signature="",
+        **generated,
     )
+    response.model = req.model
+
+    UserManager.update_token_usage(
+        user["userid"],
+        prompt_tokens=response.usage.prompt_tokens,
+        completion_tokens=response.usage.completion_tokens,
+    )
 
     # Sign the response
     response_json = response.model_dump_json()
     signature = sign_message(state.private_key, response_json)
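Two caveats on the handlers above. `Usage(**UserManager.get_token_usage(...))` raises a `TypeError` for an unknown userid, since `get_token_usage` can return `None` — worth guarding. And `chat_completion` is now `async def` but calls the blocking `create_chat_completion` directly, which stalls the event loop for the whole generation; the `Semaphore(2)` added to `AppState` below is never used. A sketch of how the two could be combined, assuming the semaphore is meant to cap concurrent generations:

```python
import asyncio
from functools import partial

from nilai.state import state


async def generate(prompt: list) -> dict:
    # Cap concurrent generations with AppState.sem and run the blocking
    # llama.cpp call in a worker thread so the event loop stays responsive.
    async with state.sem:
        loop = asyncio.get_running_loop()
        return await loop.run_in_executor(
            None, partial(state.chat_pipeline.create_chat_completion, prompt)
        )
```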
diff --git a/nilai/state.py b/nilai/state.py
index ce0ed436..4fc802ad 100644
--- a/nilai/state.py
+++ b/nilai/state.py
@@ -1,31 +1,23 @@
-import os
 import time
+from asyncio import Semaphore
 
-import torch
 from dotenv import load_dotenv
-from transformers import pipeline
+from llama_cpp import Llama
 
 from nilai.crypto import generate_key_pair
 from nilai.model import Model
 
-# Load the .env file
-load_dotenv()
-
-# # Application State Initialization
-# torch.set_num_threads(1)
-# torch.set_num_interop_threads(1)
-
 
 class AppState:
     def __init__(self):
         self.private_key, self.public_key, self.verifying_key = generate_key_pair()
-        self.chat_pipeline = pipeline(
-            "text-generation",
-            model="meta-llama/Llama-3.2-1B-Instruct",
-            model_kwargs={"torch_dtype": torch.bfloat16},
-            device_map="auto",
-            token=os.getenv("HUGGINGFACE_API_TOKEN"),
+        self.chat_pipeline = Llama.from_pretrained(
+            repo_id="bartowski/Llama-3.2-1B-Instruct-GGUF",
+            filename="Llama-3.2-1B-Instruct-Q5_K_S.gguf",
+            n_threads=16,
+            verbose=False,
         )
+        self.sem = Semaphore(2)
         self.models = [
             Model(
                 id="meta-llama/Llama-3.2-1B-Instruct",
@@ -60,4 +52,5 @@ def uptime(self):
         return ", ".join(parts)
 
 
+load_dotenv()
 state = AppState()
diff --git a/pyproject.toml b/pyproject.toml
index 57cc25b1..f589b136 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -12,6 +12,7 @@ dependencies = [
     "cryptography>=43.0.3",
     "fastapi[standard]>=0.115.5",
     "gunicorn>=23.0.0",
+    "llama-cpp-python>=0.3.2",
    "python-dotenv>=1.0.1",
     "sqlalchemy>=2.0.36",
     "torch>=2.5.1",
diff --git a/tests/model_execution_0.py b/tests/model_execution_0.py
new file mode 100644
index 00000000..f7bbe1ae
--- /dev/null
+++ b/tests/model_execution_0.py
@@ -0,0 +1,38 @@
+import os
+import time
+
+import torch
+from dotenv import load_dotenv
+from transformers import pipeline
+
+# Load the .env file
+load_dotenv()
+
+# Application State Initialization
+torch.set_num_threads(32)
+torch.set_num_interop_threads(32)
+
+
+chat_pipeline = pipeline(
+    "text-generation",
+    model="meta-llama/Llama-3.2-1B-Instruct",
+    model_kwargs={"torch_dtype": torch.bfloat16},
+    device_map="cpu",
+    token=os.getenv("HUGGINGFACE_API_TOKEN"),
+)
+
+messages = [
+    {"role": "system", "content": "You are a helpful assistant."},
+    {"role": "user", "content": "What is your name?"},
+]
+
+start = time.time()
+# Generate response
+generated = chat_pipeline(
+    messages, max_length=1024, num_return_sequences=1, truncation=True
+)  # type: ignore
+
+end = time.time()
+
+print(generated)
+print(end - start)
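This script and the three that follow compare the transformers, ONNX Runtime, and llama.cpp backends, but each times individual calls, and the first call includes warmup. A small shared helper along these lines (a sketch, not part of the diff) would make the numbers comparable — e.g. `bench(lambda: llm.create_chat_completion(messages))`:

```python
import statistics
import time


def bench(fn, runs: int = 5):
    """Time fn over several runs, reporting the first (warmup) run separately."""
    t0 = time.perf_counter()
    fn()
    warmup = time.perf_counter() - t0
    times = []
    for _ in range(runs):
        t0 = time.perf_counter()
        fn()
        times.append(time.perf_counter() - t0)
    return warmup, statistics.mean(times), statistics.stdev(times)
```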
diff --git a/tests/model_execution_1.py b/tests/model_execution_1.py
new file mode 100644
index 00000000..2ef17ab4
--- /dev/null
+++ b/tests/model_execution_1.py
@@ -0,0 +1,49 @@
+import time
+
+from onnxruntime import InferenceSession
+from optimum.onnxruntime import ORTModelForCausalLM
+from transformers import AutoTokenizer
+
+# Define the model directory and ONNX export location
+model_name = "meta-llama/Llama-3.2-1B-Instruct"
+onnx_export_dir = "./onnx_model"
+
+# Export the model
+model = ORTModelForCausalLM.from_pretrained(model_name, from_transformers=True)
+model.save_pretrained(onnx_export_dir)
+
+# Save the tokenizer for later use
+tokenizer = AutoTokenizer.from_pretrained(model_name)
+tokenizer.save_pretrained(onnx_export_dir)
+
+
+# Load the ONNX model and tokenizer
+onnx_model_path = "./onnx_model/model.onnx"
+tokenizer = AutoTokenizer.from_pretrained("./onnx_model")
+
+# Create an ONNX Runtime session
+session = InferenceSession(onnx_model_path)
+
+# Input messages
+messages = [
+    {"role": "system", "content": "You are a helpful assistant."},
+    {"role": "user", "content": "What is your name?"},
+]
+
+# Prepare input text
+input_text = "\n".join([f"{msg['role']}: {msg['content']}" for msg in messages])
+
+# Tokenize input text
+inputs = tokenizer(input_text, return_tensors="pt")
+print("START:")
+# Run inference
+start = time.time()
+onnx_inputs = {session.get_inputs()[0].name: inputs["input_ids"].numpy()}
+onnx_output = session.run(None, onnx_inputs)
+end = time.time()
+
+# Decode the output
+output_text = tokenizer.decode(onnx_output[0][0], skip_special_tokens=True)
+
+print(output_text)
+print(f"Time taken: {end - start} seconds")
diff --git a/tests/model_execution_2.py b/tests/model_execution_2.py
new file mode 100644
index 00000000..821279ae
--- /dev/null
+++ b/tests/model_execution_2.py
@@ -0,0 +1,34 @@
+import os
+import time
+
+from dotenv import load_dotenv
+from optimum.pipelines import pipeline
+
+# Load the .env file
+load_dotenv()
+
+
+chat_pipeline = pipeline(
+    "text-generation",
+    model="meta-llama/Llama-3.2-1B-Instruct",
+    accelerator="ort",
+    token=os.getenv("HUGGINGFACE_API_TOKEN"),
+)
+
+messages = [
+    {"role": "system", "content": "You are a helpful assistant."},
+    {"role": "user", "content": "What is your name?"},
+]
+
+print("start")
+for i in range(10):
+    start = time.time()
+    # Generate response
+    generated = chat_pipeline(
+        messages, max_length=1024, num_return_sequences=1, truncation=True
+    )  # type: ignore
+
+    end = time.time()
+
+    print(generated)
+    print(end - start)
diff --git a/tests/model_execution_3.py b/tests/model_execution_3.py
new file mode 100644
index 00000000..33c4ace7
--- /dev/null
+++ b/tests/model_execution_3.py
@@ -0,0 +1,25 @@
+import time
+
+from llama_cpp import Llama
+
+llm = Llama.from_pretrained(
+    repo_id="bartowski/Llama-3.2-1B-Instruct-GGUF",
+    filename="Llama-3.2-1B-Instruct-Q5_K_S.gguf",
+)
+
+
+messages = [
+    {"role": "system", "content": "You are a helpful assistant."},
+    {"role": "user", "content": "What is your name?"},
+]
+
+print("start")
+for i in range(10):
+    start = time.time()
+    # Generate response
+    generated = llm.create_chat_completion(messages)
+
+    end = time.time()
+
+    print(generated)
+    print(end - start)
diff --git a/tests/test_cryptography.py b/tests/test_cryptography.py
index 51fb4cbd..702ca540 100644
--- a/tests/test_cryptography.py
+++ b/tests/test_cryptography.py
@@ -2,7 +2,6 @@
 
 import pytest
 from cryptography.exceptions import InvalidSignature
-from cryptography.hazmat.primitives import serialization
 from cryptography.hazmat.primitives.asymmetric import ec
 
 from nilai.crypto import generate_key_pair, sign_message, verify_signature
diff --git a/tests/test_db.py b/tests/test_db.py
index 574bd26d..21d43451 100644
--- a/tests/test_db.py
+++ b/tests/test_db.py
@@ -6,8 +6,9 @@
 from sqlalchemy.pool import StaticPool
 
 import nilai.db as db
+
 # Import the classes and functions to test
-from nilai.db import Base, DatabaseConfig, User, UserManager, get_db_session
+from nilai.db import Base, UserManager
 
 
 @pytest.fixture(scope="function")
@@ -123,8 +124,8 @@ def test_update_token_usage(self, user_manager):
     def test_get_all_users(self, user_manager):
         """Test retrieving all users."""
         # Insert multiple users
-        user1 = user_manager.insert_user("User 1")
-        user2 = user_manager.insert_user("User 2")
+        _ = user_manager.insert_user("User 1")
+        _ = user_manager.insert_user("User 2")
 
         # Retrieve all users
         all_users = user_manager.get_all_users()
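The new `UserManager.get_token_usage` has no coverage in this diff. A test along these lines could slot into the same class, reusing the existing `user_manager` fixture (a sketch under that assumption):

```python
    def test_get_token_usage(self, user_manager):
        """Token counts should round-trip through the new accessor."""
        user = user_manager.insert_user("Carol")
        user_manager.update_token_usage(
            user["userid"], prompt_tokens=5, completion_tokens=7
        )
        usage = user_manager.get_token_usage(user["userid"])
        assert usage == {
            "prompt_tokens": 5,
            "completion_tokens": 7,
            "total_tokens": 12,
        }
```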
"https://files.pythonhosted.org/packages/3f/21/1c1ffc1a039ddcc459db43cc108658f32c57d271d7289a2794e401d0fdb6/diskcache-5.6.3.tar.gz", hash = "sha256:2c3a3fa2743d8535d832ec61c2054a1641f41775aa7c556758a109941e33e4fc", size = 67916 } +wheels = [ + { url = "https://files.pythonhosted.org/packages/3f/27/4570e78fc0bf5ea0ca45eb1de3818a23787af9b390c0b0a0033a1b8236f9/diskcache-5.6.3-py3-none-any.whl", hash = "sha256:5e31b2d5fbad117cc363ebaf6b689474db18a1f6438bc82358b024abd4c2ca19", size = 45550 }, +] + [[package]] name = "dnspython" version = "2.7.0" @@ -440,6 +449,18 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/31/80/3a54838c3fb461f6fec263ebf3a3a41771bd05190238de3486aae8540c36/jinja2-3.1.4-py3-none-any.whl", hash = "sha256:bc5dd2abb727a5319567b7a813e6a2e7318c39f4f487cfe6c89c6f9c7d25197d", size = 133271 }, ] +[[package]] +name = "llama-cpp-python" +version = "0.3.2" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "diskcache" }, + { name = "jinja2" }, + { name = "numpy" }, + { name = "typing-extensions" }, +] +sdist = { url = "https://files.pythonhosted.org/packages/5f/0e/ff129005a33b955088fc7e4ecb57e5500b604fb97eca55ce8688dbe59680/llama_cpp_python-0.3.2.tar.gz", hash = "sha256:8fbf246a55a999f45015ed0d48f91b4ae04ae959827fac1cd6ac6ec65aed2e2f", size = 64964148 } + [[package]] name = "markdown-it-py" version = "3.0.0" @@ -535,6 +556,7 @@ dependencies = [ { name = "cryptography" }, { name = "fastapi", extra = ["standard"] }, { name = "gunicorn" }, + { name = "llama-cpp-python" }, { name = "python-dotenv" }, { name = "sqlalchemy" }, { name = "torch" }, @@ -557,6 +579,7 @@ requires-dist = [ { name = "cryptography", specifier = ">=43.0.3" }, { name = "fastapi", extras = ["standard"], specifier = ">=0.115.5" }, { name = "gunicorn", specifier = ">=23.0.0" }, + { name = "llama-cpp-python", specifier = ">=0.3.2" }, { name = "python-dotenv", specifier = ">=1.0.1" }, { name = "sqlalchemy", specifier = ">=2.0.36" }, { name = "torch", specifier = ">=2.5.1" },