diff --git a/examples/online_serving/minimax_m2.py b/examples/online_serving/minimax_m2.py
new file mode 100644
index 000000000000..7b731e2e0111
--- /dev/null
+++ b/examples/online_serving/minimax_m2.py
@@ -0,0 +1,104 @@
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+import json
+
+from openai import OpenAI
+
+"""
+Tool-calling example for MiniMax-M2 served by vLLM.
+See https://huggingface.co/MiniMaxAI/MiniMax-M2/blob/main/docs/tool_calling_guide.md
+
+Start the server with:
+
+vllm serve MiniMaxAI/MiniMax-M2 \
+    --tensor-parallel-size 4 \
+    --tool-call-parser minimax_m2 \
+    --reasoning-parser minimax_m2_append_think \
+    --enable-auto-tool-choice \
+    --port 8000
+"""
+
+client = OpenAI(base_url="http://localhost:8000/v1", api_key="dummy")
+
+
+def get_weather(location: str, unit: str):
+    return f"The weather for {location} in {unit} is 20"
+
+
+tool_functions = {"get_weather": get_weather}
+
+# Step 1: Define the tool schema (nested Chat Completions format).
+tools = [
+    {
+        "type": "function",
+        "function": {
+            "name": "get_weather",
+            "description": "Get the current weather in a given location",
+            "parameters": {
+                "type": "object",
+                "properties": {
+                    "location": {
+                        "type": "string",
+                        "description": "City and state, e.g., 'San Francisco, CA'",
+                    },
+                    "unit": {"type": "string", "enum": ["celsius", "fahrenheit"]},
+                },
+                "required": ["location", "unit"],
+            },
+        },
+    }
+]
+
+messages = [
+    {
+        "role": "user",
+        "content": "What's the weather like in San Francisco? Use celsius.",
+    }
+]
+
+# Step 2: First call, with the tools available to the model.
+response = client.chat.completions.create(
+    model=client.models.list().data[0].id,
+    messages=messages,
+    tools=tools,
+    tool_choice="auto",
+)
+
+print("=== First response ===")
+print(response)
+
+# Step 3: Extract and call the function.
+tool_call = response.choices[0].message.tool_calls[0].function
+name = tool_call.name
+args = json.loads(tool_call.arguments)
+result = tool_functions[name](**args)
+
+print(f"\nFunction called: {name}")
+print(f"Arguments: {args}")
+print(f"Result: {result}")
+
+# Step 4: Send the result back to the model.
+messages.append(
+    {"role": "assistant", "tool_calls": response.choices[0].message.tool_calls}
+)
+messages.append(
+    {
+        "role": "tool",
+        "tool_call_id": response.choices[0].message.tool_calls[0].id,
+        "content": result,
+    }
+)
+
+# Step 5: Second call, in which the model sees the tool output.
+second_response = client.chat.completions.create(
+    model=client.models.list().data[0].id,
+    messages=messages,
+)
+
+print("\n=== Second response ===")
+print(second_response.choices[0].message)
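
The example above indexes `response.choices[0].message.tool_calls[0]` unconditionally, so it crashes if the parser returns no tool call (for instance when the model answers in plain text). A minimal defensive variant of the extraction step is sketched below; it only reuses the `client`, `messages`, `tools`, and `tool_functions` names from the example and is an illustration, not part of the patch:

```python
import json

# Sketch: same first request as in the example, but tolerate a missing tool call.
response = client.chat.completions.create(
    model=client.models.list().data[0].id,
    messages=messages,
    tools=tools,
    tool_choice="auto",
)

message = response.choices[0].message
if not message.tool_calls:
    # The model answered directly instead of calling a tool.
    print("No tool call returned:", message.content)
else:
    call = message.tool_calls[0]
    args = json.loads(call.function.arguments)
    result = tool_functions[call.function.name](**args)
    print(f"{call.function.name}({args}) -> {result}")
```
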
diff --git a/examples/online_serving/openai_responses_client_with_tools.py b/examples/online_serving/openai_responses_client_with_tools.py
index 276010197b5a..6a9a9a9cea8a 100644
--- a/examples/online_serving/openai_responses_client_with_tools.py
+++ b/examples/online_serving/openai_responses_client_with_tools.py
@@ -3,9 +3,20 @@
 """
 Set up this example by starting a vLLM OpenAI-compatible server with tool call
 options enabled.
 Reasoning models can be used through the Responses API as seen here
 https://platform.openai.com/docs/api-reference/responses
 For example:
+
+NOTE: --tool-call-parser minimax_m2 produces tool-call output that does not match
+what the model actually emits, so this example uses --tool-call-parser minimax:
+
+vllm serve MiniMaxAI/MiniMax-M2 \
+    --tensor-parallel-size 4 \
+    --tool-call-parser minimax \
+    --reasoning-parser minimax_m2_append_think \
+    --enable-auto-tool-choice \
+    --chat-template examples/tool_chat_template_minimax_m1.jinja \
+    --port 8000
+
 vllm serve Qwen/Qwen3-1.7B --reasoning-parser qwen3 \
     --structured-outputs-config.backend xgrammar \
     --enable-auto-tool-choice --tool-call-parser hermes
@@ -17,51 +28,69 @@
 
 from utils import get_first_model
 
 
-def get_weather(latitude: float, longitude: float) -> str:
-    """
-    Mock function to simulate getting weather data.
-    In a real application, this would call an external weather API.
-    """
-    return f"Current temperature at ({latitude}, {longitude}) is 20°C."
+# Alternative "custom" tool example, kept commented out for reference;
+# tool_choice="required" currently breaks for custom tools (see main() below).
+# def get_horoscope(sign):
+#     return f"{sign}: Next Tuesday you will befriend a baby otter."
+#
+# tools = [
+#     {
+#         "type": "custom",
+#         "name": "get_horoscope",
+#         "description": "Get today’s horoscope for an astrological sign.",
+#         "parameters": {
+#             "type": "object",
+#             "properties": {
+#                 "sign": {
+#                     "type": "string",
+#                     "description": "Astrological sign, e.g. Aries, Taurus, Gemini, etc.",
+#                 }
+#             },
+#             "required": ["sign"],
+#         },
+#     }
+# ]
 
 
-tools = [
-    {
+def get_weather(location: str, unit: str):
+    return f"The weather for {location} in {unit} is 20"
+
+
+tools = [
+    {
         "type": "function",
         "name": "get_weather",
-        "description": "Get current temperature for provided coordinates in celsius.",
+        "description": "Get the current weather in a given location",
         "parameters": {
             "type": "object",
             "properties": {
-                "latitude": {"type": "number"},
-                "longitude": {"type": "number"},
+                "location": {"type": "string"},
+                "unit": {"type": "string", "enum": ["celsius", "fahrenheit"]},
             },
-            "required": ["latitude", "longitude"],
-            "additionalProperties": False,
-        },
-        "strict": True,
+            "required": ["location", "unit"],
+        },
     }
 ]
 
 input_messages = [
-    {"role": "user", "content": "What's the weather like in Paris today?"}
+    {"role": "user", "content": "What is the weather in Paris in Celsius today?"}
 ]
 
 
 def main():
-    base_url = "http://0.0.0.0:8000/v1"
+    base_url = "http://localhost:8000/v1"
     client = OpenAI(base_url=base_url, api_key="empty")
     model = get_first_model(client)
     response = client.responses.create(
-        model=model, input=input_messages, tools=tools, tool_choice="required"
+        model=model,
+        input=input_messages,
+        tools=tools,
+        # tool_choice="required",  # NOTE: breaks for custom tools
     )
+    print(response)
+
     for out in response.output:
         if out.type == "function_call":
             print("Function call:", out.name, out.arguments)
             tool_call = out
             args = json.loads(tool_call.arguments)
-            result = get_weather(args["latitude"], args["longitude"])
+            result = get_weather(args["location"], args["unit"])
             input_messages.append(tool_call)  # append model's function call message
             input_messages.append(
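
The hunk above is cut off right after the second `input_messages.append(`, so the diff does not show how the tool result is returned to the model. With the OpenAI Responses API the continuation would look roughly like the sketch below, which appends a `function_call_output` item and issues a follow-up `responses.create` call; the names `client`, `model`, `input_messages`, `tools`, `tool_call`, and `result` come from the example, and the exact upstream wording of this part is not visible in the diff:

```python
# Sketch: feed the tool result back and let the model finish the answer.
input_messages.append(
    {
        "type": "function_call_output",
        "call_id": tool_call.call_id,  # id of the function_call item we executed
        "output": result,              # plain string returned by get_weather()
    }
)

followup = client.responses.create(
    model=model,
    input=input_messages,
    tools=tools,
)
print(followup.output_text)
```
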
diff --git a/vllm/entrypoints/openai/serving_responses.py b/vllm/entrypoints/openai/serving_responses.py
index 62d86d790d28..1678df3ae06a 100644
--- a/vllm/entrypoints/openai/serving_responses.py
+++ b/vllm/entrypoints/openai/serving_responses.py
@@ -119,6 +119,24 @@
 logger = init_logger(__name__)
 
 
+def convert_tool_schema(tool: dict) -> dict:
+    """
+    Convert a flat tool schema:
+        {"type": "function", "name": "...", "description": "...", "parameters": {...}}
+    into the nested form:
+        {"type": "function", "function": {...}}
+    """
+    if tool.get("type") != "function":
+        raise ValueError("Expected tool['type'] == 'function'")
+
+    # Keep everything except 'type' and wrap it inside 'function'.
+    function_body = {k: v for k, v in tool.items() if k != "type"}
+
+    return {
+        "type": "function",
+        "function": function_body,
+    }
+
+
 def extract_tool_types(tools: list[Tool]) -> set[str]:
     """
@@ -520,12 +538,13 @@ async def _make_request(
         prev_response: ResponsesResponse | None,
         tokenizer: AnyTokenizer,
     ):
         if request.tools is None or (
             request.tool_choice == "none" and self.exclude_tools_when_tool_choice_none
         ):
             tool_dicts = None
         else:
-            tool_dicts = [tool.model_dump() for tool in request.tools]
+            tool_dicts = [
+                convert_tool_schema(tool.model_dump()) for tool in request.tools
+            ]
         # Construct the input messages.
         messages = self._construct_input_messages(request, prev_response)
         _, request_prompts, engine_prompts = await self._preprocess_chat(
@@ -853,7 +872,7 @@ def _make_response_output_items(
                 content=[output_text],
                 role="assistant",
                 status="completed",
-                type="message",
+                type="message",  # TODO: this could also be a function-call output item
             )
 
     outputs = []
diff --git a/vllm/entrypoints/openai/tool_parsers/utils.py b/vllm/entrypoints/openai/tool_parsers/utils.py
index 570eb447a467..2e9ee27e70df 100644
--- a/vllm/entrypoints/openai/tool_parsers/utils.py
+++ b/vllm/entrypoints/openai/tool_parsers/utils.py
@@ -224,6 +224,6 @@ def get_json_schema_from_tools(
         return tool_map[tool_name].function.parameters
     # tool_choice: "required"
     if tool_choice == "required":
-        return _get_json_schema_from_tools(tools)
+        return _get_json_schema_from_tools(tools)  # TODO: breaks for custom tools
     # tool_choice: "auto"
     return None
diff --git a/vllm/reasoning/minimax_m2_reasoning_parser.py b/vllm/reasoning/minimax_m2_reasoning_parser.py
index 0d4f6cc270a1..9e291ab0dd1d 100644
--- a/vllm/reasoning/minimax_m2_reasoning_parser.py
+++ b/vllm/reasoning/minimax_m2_reasoning_parser.py
@@ -15,6 +15,8 @@
 
 logger = init_logger(__name__)
 
+import re
+
 
 @ReasoningParserManager.register_module("minimax_m2")
 class MiniMaxM2ReasoningParser(BaseThinkingReasoningParser):
@@ -66,4 +68,13 @@ def extract_reasoning_content_streaming(
     def extract_reasoning_content(
         self, model_output: str, request: ChatCompletionRequest | ResponsesRequest
     ) -> tuple[str | None, str | None]:
-        return None, "<think>" + model_output
+        # Split on the closing think tag: text before it is reasoning content,
+        # text after it is the final response content.
+        match = re.search(r"</think>\s*", model_output, re.DOTALL)
+        if not match:
+            return model_output, ""
+
+        reasoning = model_output[: match.start()]
+        content = model_output[match.end() :]
+        return reasoning.strip(), content.strip()
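
For reference, here is a small self-contained sketch of what the `convert_tool_schema` helper added in `serving_responses.py` does to a flat Responses-style tool definition; the helper body is copied from the diff, and the sample tool dict is only illustrative:

```python
def convert_tool_schema(tool: dict) -> dict:
    """Wrap a flat {"type": "function", ...} tool into the nested function form."""
    if tool.get("type") != "function":
        raise ValueError("Expected tool['type'] == 'function'")
    function_body = {k: v for k, v in tool.items() if k != "type"}
    return {"type": "function", "function": function_body}


flat_tool = {
    "type": "function",
    "name": "get_weather",
    "description": "Get the current weather in a given location",
    "parameters": {"type": "object", "properties": {"location": {"type": "string"}}},
}

nested = convert_tool_schema(flat_tool)
assert nested["function"]["name"] == "get_weather"
assert nested["function"]["parameters"]["type"] == "object"
assert "name" not in nested  # everything except "type" moved under "function"
print(nested)
```
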