49 changes: 49 additions & 0 deletions docker/compose/docker-compose.llama-70b-gpu.yml
@@ -0,0 +1,49 @@
services:
  llama_70b_gpu:
    image: nillion/nilai-vllm:latest
    deploy:
      resources:
        reservations:
          devices:
            - driver: nvidia
              count: all
              capabilities: [gpu]
    ipc: host
    ulimits:
      memlock: -1
      stack: 67108864
    env_file:
      - .env
    restart: unless-stopped
    depends_on:
      etcd:
        condition: service_healthy
    command: >
      --model hugging-quants/Meta-Llama-3.1-70B-Instruct-AWQ-INT4
      --gpu-memory-utilization 0.95
      --max-model-len 60000
      --tensor-parallel-size 1
      --enable-auto-tool-choice
      --tool-call-parser llama3_json
      --uvicorn-log-level warning
    environment:
      - SVC_HOST=llama_70b_gpu
      - SVC_PORT=8000
      - ETCD_HOST=etcd
      - ETCD_PORT=2379
      - TOOL_SUPPORT=true
    volumes:
      - hugging_face_models:/root/.cache/huggingface # cache models
    networks:
      - backend_net
    healthcheck:
      test: ["CMD", "curl", "-f", "http://localhost:8000/health"]
      interval: 30s
      retries: 3
      start_period: 60s
      timeout: 10s

volumes:
  hugging_face_models:

networks:
  backend_net:
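For anyone smoke-testing the new service, here is a minimal sketch of a request against the container's OpenAI-compatible endpoint. It assumes port 8000 is published to localhost and that no API key is enforced in the local setup; the prompt and `max_tokens` value are illustrative only.

```python
# Minimal smoke test for the new 70B service (sketch; assumes the container's
# port 8000 is reachable on localhost and no API key is enforced locally).
from openai import OpenAI

client = OpenAI(base_url="http://localhost:8000/v1", api_key="unused")

resp = client.chat.completions.create(
    model="hugging-quants/Meta-Llama-3.1-70B-Instruct-AWQ-INT4",
    messages=[{"role": "user", "content": "Reply with one short sentence."}],
    max_tokens=32,
)
print(resp.choices[0].message.content)
```

The same vLLM server also exposes the `/health` route that the compose healthcheck above curls.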
1 change: 1 addition & 0 deletions nilai-api/src/nilai_api/config/mainnet.py
@@ -9,6 +9,7 @@
"meta-llama/Llama-3.1-8B-Instruct": 30,
"cognitivecomputations/Dolphin3.0-Llama3.1-8B": 30,
"deepseek-ai/DeepSeek-R1-Distill-Qwen-14B": 5,
"hugging-quants/Meta-Llama-3.1-70B-Instruct-AWQ-INT4": 5,
}

# It defines the number of requests allowed for each user for a given time frame.
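Assuming these per-model integers are request allowances, as the surrounding comments suggest, a hypothetical lookup helper shows how the new entry would be consumed. The names `MODEL_LIMITS` and `limit_for` are illustrative, not the project's actual API.

```python
# Hypothetical sketch: how a per-model allowance could be read from this
# mapping. Names here are illustrative, not the project's actual API.
MODEL_LIMITS = {
    "meta-llama/Llama-3.1-8B-Instruct": 30,
    "cognitivecomputations/Dolphin3.0-Llama3.1-8B": 30,
    "deepseek-ai/DeepSeek-R1-Distill-Qwen-14B": 5,
    "hugging-quants/Meta-Llama-3.1-70B-Instruct-AWQ-INT4": 5,
}

def limit_for(model: str, default: int = 1) -> int:
    # Unknown models fall back to a conservative default.
    return MODEL_LIMITS.get(model, default)

assert limit_for("hugging-quants/Meta-Llama-3.1-70B-Instruct-AWQ-INT4") == 5
```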
1 change: 1 addition & 0 deletions nilai-api/src/nilai_api/config/testnet.py
@@ -9,6 +9,7 @@
"meta-llama/Llama-3.1-8B-Instruct": 5,
"cognitivecomputations/Dolphin3.0-Llama3.1-8B": 5,
"deepseek-ai/DeepSeek-R1-Distill-Qwen-14B": 5,
"hugging-quants/Meta-Llama-3.1-70B-Instruct-AWQ-INT4": 5,
}

# It defines the number of requests allowed for each user for a given time frame.
2 changes: 1 addition & 1 deletion tests/e2e/test_http.py
@@ -412,7 +412,7 @@ def test_large_payload_handling(client):
"max_tokens": 50,
}

response = client.post("/chat/completions", json=payload)
response = client.post("/chat/completions", json=payload, timeout=30)
print(response)

# Check for appropriate handling of large payload
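The explicit `timeout=30` makes an overlong large-payload request fail fast rather than hang the suite. A standalone sketch of the same pattern, assuming a requests-style client (the fixture's actual type isn't visible in this hunk); the URL and payload are placeholders:

```python
# Sketch of the timeout pattern used above; assumes a requests-style client.
import requests

payload = {
    "model": "meta-llama/Llama-3.1-8B-Instruct",
    "messages": [{"role": "user", "content": "ping"}],
    "max_tokens": 50,
}

try:
    # Placeholder URL; raises requests.exceptions.Timeout after 30 seconds
    # instead of blocking the test run indefinitely.
    response = requests.post(
        "http://localhost:8080/v1/chat/completions",
        json=payload,
        timeout=30,
    )
except requests.exceptions.Timeout:
    raise AssertionError("chat completion exceeded the 30s budget")
```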
4 changes: 3 additions & 1 deletion tests/e2e/test_openai.py
@@ -245,7 +245,9 @@ def test_function_calling(client, model):
         assert follow_up_content, "No content in follow-up response"
         print(f"\nFollow-up response: {follow_up_content}")
         assert (
-            "22°C" in follow_up_content or "sunny" in follow_up_content.lower()
+            "22°C" in follow_up_content
+            or "sunny" in follow_up_content.lower()
+            or "weather" in follow_up_content.lower()
         ), "Follow-up should mention the weather details"
 
     else: