diff --git a/docker/compose/docker-compose.llama-70b-gpu.yml b/docker/compose/docker-compose.llama-70b-gpu.yml new file mode 100644 index 00000000..0235afbc --- /dev/null +++ b/docker/compose/docker-compose.llama-70b-gpu.yml @@ -0,0 +1,49 @@ +services: + llama_70b_gpu: + image: nillion/nilai-vllm:latest + deploy: + resources: + reservations: + devices: + - driver: nvidia + count: all + capabilities: [gpu] + ipc: host + ulimits: + memlock: -1 + stack: 67108864 + env_file: + - .env + restart: unless-stopped + depends_on: + etcd: + condition: service_healthy + command: > + --model hugging-quants/Meta-Llama-3.1-70B-Instruct-AWQ-INT4 + --gpu-memory-utilization 0.95 + --max-model-len 60000 + --tensor-parallel-size 1 + --enable-auto-tool-choice + --tool-call-parser llama3_json + --uvicorn-log-level warning + environment: + - SVC_HOST=llama_70b_gpu + - SVC_PORT=8000 + - ETCD_HOST=etcd + - ETCD_PORT=2379 + - TOOL_SUPPORT=true + volumes: + - hugging_face_models:/root/.cache/huggingface # cache models + networks: + - backend_net + healthcheck: + test: ["CMD", "curl", "-f", "http://localhost:8000/health"] + interval: 30s + retries: 3 + start_period: 60s + timeout: 10s +volumes: + hugging_face_models: + +networks: + backend_net: diff --git a/nilai-api/src/nilai_api/config/mainnet.py b/nilai-api/src/nilai_api/config/mainnet.py index 62d85a57..2a27af1a 100644 --- a/nilai-api/src/nilai_api/config/mainnet.py +++ b/nilai-api/src/nilai_api/config/mainnet.py @@ -9,6 +9,7 @@ "meta-llama/Llama-3.1-8B-Instruct": 30, "cognitivecomputations/Dolphin3.0-Llama3.1-8B": 30, "deepseek-ai/DeepSeek-R1-Distill-Qwen-14B": 5, + "hugging-quants/Meta-Llama-3.1-70B-Instruct-AWQ-INT4": 5, } # It defines the number of requests allowed for each user for a given time frame. diff --git a/nilai-api/src/nilai_api/config/testnet.py b/nilai-api/src/nilai_api/config/testnet.py index 0425e666..8efcb7df 100644 --- a/nilai-api/src/nilai_api/config/testnet.py +++ b/nilai-api/src/nilai_api/config/testnet.py @@ -9,6 +9,7 @@ "meta-llama/Llama-3.1-8B-Instruct": 5, "cognitivecomputations/Dolphin3.0-Llama3.1-8B": 5, "deepseek-ai/DeepSeek-R1-Distill-Qwen-14B": 5, + "hugging-quants/Meta-Llama-3.1-70B-Instruct-AWQ-INT4": 5, } # It defines the number of requests allowed for each user for a given time frame. diff --git a/tests/e2e/test_http.py b/tests/e2e/test_http.py index f02aa49e..d311cc63 100644 --- a/tests/e2e/test_http.py +++ b/tests/e2e/test_http.py @@ -412,7 +412,7 @@ def test_large_payload_handling(client): "max_tokens": 50, } - response = client.post("/chat/completions", json=payload) + response = client.post("/chat/completions", json=payload, timeout=30) print(response) # Check for appropriate handling of large payload diff --git a/tests/e2e/test_openai.py b/tests/e2e/test_openai.py index 61588ff0..b38e6a01 100644 --- a/tests/e2e/test_openai.py +++ b/tests/e2e/test_openai.py @@ -245,7 +245,9 @@ def test_function_calling(client, model): assert follow_up_content, "No content in follow-up response" print(f"\nFollow-up response: {follow_up_content}") assert ( - "22°C" in follow_up_content or "sunny" in follow_up_content.lower() + "22°C" in follow_up_content + or "sunny" in follow_up_content.lower() + or "weather" in follow_up_content.lower() ), "Follow-up should mention the weather details" else: