diff --git a/docker/api.Dockerfile b/docker/api.Dockerfile
index 69df609c..df79ca18 100644
--- a/docker/api.Dockerfile
+++ b/docker/api.Dockerfile
@@ -10,6 +10,8 @@ apt-get clean && \
 apt-get autoremove && \
 rm -rf /var/lib/apt/lists/* && \
 pip install uv && \
+ln -s $(which aclocal) /usr/local/bin/aclocal-1.16 && \
+ln -s $(which automake) /usr/local/bin/automake-1.16 && \
 uv sync
 
 EXPOSE 8080
diff --git a/docker/compose/docker-compose.deepseek-14b-gpu.yml b/docker/compose/docker-compose.deepseek-14b-gpu.yml
index 41eb4c00..dacb3124 100644
--- a/docker/compose/docker-compose.deepseek-14b-gpu.yml
+++ b/docker/compose/docker-compose.deepseek-14b-gpu.yml
@@ -24,6 +24,7 @@ services:
       --model deepseek-ai/DeepSeek-R1-Distill-Qwen-14B
       --gpu-memory-utilization 0.39
       --max-model-len 10000
+      --max-num-batched-tokens 10000
       --tensor-parallel-size 1
       --uvicorn-log-level warning
     environment:
diff --git a/docker/compose/docker-compose.dolphin-8b-gpu.yml b/docker/compose/docker-compose.dolphin-8b-gpu.yml
index b8b6a7a4..0f757792 100644
--- a/docker/compose/docker-compose.dolphin-8b-gpu.yml
+++ b/docker/compose/docker-compose.dolphin-8b-gpu.yml
@@ -24,6 +24,7 @@ services:
       --model cognitivecomputations/Dolphin3.0-Llama3.1-8B
       --gpu-memory-utilization 0.21
       --max-model-len 10000
+      --max-num-batched-tokens 10000
       --tensor-parallel-size 1
       --enable-auto-tool-choice
       --tool-call-parser llama3_json
diff --git a/docker/compose/docker-compose.llama-1b-cpu.yml b/docker/compose/docker-compose.llama-1b-cpu.yml
index b5c9a33d..50063464 100644
--- a/docker/compose/docker-compose.llama-1b-cpu.yml
+++ b/docker/compose/docker-compose.llama-1b-cpu.yml
@@ -10,8 +10,8 @@ services:
         condition: service_healthy
     command: >
       --model meta-llama/Llama-3.2-1B-Instruct
-      --gpu-memory-utilization 0.5
       --max-model-len 30000
+      --max-num-batched-tokens 30000
       --tensor-parallel-size 1
       --enable-auto-tool-choice
       --tool-call-parser llama3_json
@@ -23,7 +23,7 @@ services:
       - ETCD_PORT=2379
       - TOOL_SUPPORT=true
     volumes:
-      - hugging_face_models:/root/.cache/huggingface # cache models
+      - hugging_face_models:/root/.cache/huggingface
    healthcheck:
      test: ["CMD", "curl", "-f", "http://localhost:8000/health"]
      interval: 30s
diff --git a/docker/compose/docker-compose.llama-1b-gpu.ci.yml b/docker/compose/docker-compose.llama-1b-gpu.ci.yml
index 1023e1c5..82ac593b 100644
--- a/docker/compose/docker-compose.llama-1b-gpu.ci.yml
+++ b/docker/compose/docker-compose.llama-1b-gpu.ci.yml
@@ -21,6 +21,9 @@ services:
         condition: service_healthy
     command: >
       --model meta-llama/Llama-3.2-1B-Instruct
+      --gpu-memory-utilization 0.5
+      --max-model-len 30000
+      --max-num-batched-tokens 30000
       --tensor-parallel-size 1
       --enable-auto-tool-choice
       --tool-call-parser llama3_json
diff --git a/docker/compose/docker-compose.llama-1b-gpu.yml b/docker/compose/docker-compose.llama-1b-gpu.yml
index 648c7d65..0810aea4 100644
--- a/docker/compose/docker-compose.llama-1b-gpu.yml
+++ b/docker/compose/docker-compose.llama-1b-gpu.yml
@@ -23,6 +23,7 @@ services:
       --model meta-llama/Llama-3.2-1B-Instruct
       --gpu-memory-utilization 0.5
       --max-model-len 30000
+      --max-num-batched-tokens 30000
       --tensor-parallel-size 1
       --enable-auto-tool-choice
       --tool-call-parser llama3_json
diff --git a/docker/compose/docker-compose.llama-3b-gpu.yml b/docker/compose/docker-compose.llama-3b-gpu.yml
index 6c56fb10..dde581b5 100644
--- a/docker/compose/docker-compose.llama-3b-gpu.yml
+++ b/docker/compose/docker-compose.llama-3b-gpu.yml
@@ -22,6 +22,7 @@ services:
       --model meta-llama/Llama-3.2-3B-Instruct
       --gpu-memory-utilization 0.5
       --max-model-len 30000
+      --max-num-batched-tokens 30000
       --tensor-parallel-size 1
       --enable-auto-tool-choice
       --tool-call-parser llama3_json
diff --git a/docker/compose/docker-compose.llama-70b-gpu.yml b/docker/compose/docker-compose.llama-70b-gpu.yml
index 19a5c4b9..00a48afc 100644
--- a/docker/compose/docker-compose.llama-70b-gpu.yml
+++ b/docker/compose/docker-compose.llama-70b-gpu.yml
@@ -22,6 +22,7 @@ services:
       --model hugging-quants/Meta-Llama-3.1-70B-Instruct-AWQ-INT4
       --gpu-memory-utilization 0.95
       --max-model-len 60000
+      --max-num-batched-tokens 60000
       --tensor-parallel-size 1
       --enable-auto-tool-choice
       --tool-call-parser llama3_json
diff --git a/docker/compose/docker-compose.llama-8b-gpu.yml b/docker/compose/docker-compose.llama-8b-gpu.yml
index 55dbd5f5..029bc34f 100644
--- a/docker/compose/docker-compose.llama-8b-gpu.yml
+++ b/docker/compose/docker-compose.llama-8b-gpu.yml
@@ -22,6 +22,7 @@ services:
       --model meta-llama/Llama-3.1-8B-Instruct
       --gpu-memory-utilization 0.21
       --max-model-len 10000
+      --max-num-batched-tokens 10000
       --tensor-parallel-size 1
       --enable-auto-tool-choice
       --tool-call-parser llama3_json
diff --git a/docker/vllm.Dockerfile b/docker/vllm.Dockerfile
index 01ba019f..687a0bd5 100644
--- a/docker/vllm.Dockerfile
+++ b/docker/vllm.Dockerfile
@@ -1,4 +1,4 @@
-FROM vllm/vllm-openai:v0.7.3
+FROM vllm/vllm-openai:v0.10.1
 
 # # Specify model name and path during build
 # ARG MODEL_NAME=llama_1b_cpu
diff --git a/tests/e2e/test_openai.py b/tests/e2e/test_openai.py
index 03ad7dba..390eb9df 100644
--- a/tests/e2e/test_openai.py
+++ b/tests/e2e/test_openai.py
@@ -328,14 +328,16 @@ def test_function_calling(client, model):
 
     assert len(tool_calls) > 0, f"Tool calls array is empty for {model}"
 
-    # Validate the first tool call
     first_call = tool_calls[0]
-    assert first_call.function.name == "get_weather", (
-        "Function name should be get_weather"
-    )
+    first_call_dict = first_call.model_dump()
+
+    function_name = first_call_dict["function"]["name"]
+    function_args = first_call_dict["function"]["arguments"]
+
+    assert function_name == "get_weather", "Function name should be get_weather"
 
     # Parse arguments and check for location
-    args = json.loads(first_call.function.arguments)
+    args = json.loads(function_args)
     assert "location" in args, "Arguments should contain location"
     assert "paris" in args["location"].lower(), "Location should be Paris"
 
@@ -363,7 +365,7 @@
                 "type": "function",
                 "function": {
                     "name": "get_weather",
-                    "arguments": first_call.function.arguments,
+                    "arguments": function_args,
                 },
             }
         ],