Skip to content

Commit cc96fcc

Browse files
longcwakshaym1shra
authored and committed
gemini realtime: support NON_BLOCKING tool behavior (livekit#3482)
1 parent 3707029 commit cc96fcc

File tree

2 files changed

+72
-14
lines changed

2 files changed

+72
-14
lines changed

livekit-plugins/livekit-plugins-google/livekit/plugins/google/beta/realtime/realtime_api.py

Lines changed: 41 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -10,8 +10,7 @@
1010
from dataclasses import dataclass, field
1111
from typing import Literal
1212

13-
from google import genai
14-
from google.genai import types
13+
from google.genai import Client as GenAIClient, types
1514
from google.genai.live import AsyncSession
1615
from livekit import rtc
1716
from livekit.agents import APIConnectionError, llm, utils
@@ -76,6 +75,8 @@ class _RealtimeOptions:
7675
context_window_compression: NotGivenOr[types.ContextWindowCompressionConfig] = NOT_GIVEN
7776
api_version: NotGivenOr[str] = NOT_GIVEN
7877
gemini_tools: NotGivenOr[list[_LLMTool]] = NOT_GIVEN
78+
tool_behavior: NotGivenOr[types.Behavior] = NOT_GIVEN
79+
tool_response_scheduling: NotGivenOr[types.FunctionResponseScheduling] = NOT_GIVEN
7980

8081

8182
@dataclass
@@ -136,6 +137,8 @@ def __init__(
136137
proactivity: NotGivenOr[bool] = NOT_GIVEN,
137138
realtime_input_config: NotGivenOr[types.RealtimeInputConfig] = NOT_GIVEN,
138139
context_window_compression: NotGivenOr[types.ContextWindowCompressionConfig] = NOT_GIVEN,
140+
tool_behavior: NotGivenOr[types.Behavior] = NOT_GIVEN,
141+
tool_response_scheduling: NotGivenOr[types.FunctionResponseScheduling] = NOT_GIVEN,
139142
api_version: NotGivenOr[str] = NOT_GIVEN,
140143
conn_options: APIConnectOptions = DEFAULT_API_CONNECT_OPTIONS,
141144
http_options: NotGivenOr[types.HttpOptions] = NOT_GIVEN,
@@ -174,6 +177,8 @@ def __init__(
174177
proactivity (bool, optional): Whether to enable proactive audio. Defaults to False.
175178
realtime_input_config (RealtimeInputConfig, optional): The configuration for realtime input. Defaults to None.
176179
context_window_compression (ContextWindowCompressionConfig, optional): The configuration for context window compression. Defaults to None.
180+
tool_behavior (Behavior, optional): The behavior for tool call. Default behavior is BLOCK in Gemini Realtime API.
181+
tool_response_scheduling (FunctionResponseScheduling, optional): The scheduling for tool response. Default scheduling is WHEN_IDLE.
177182
conn_options (APIConnectOptions, optional): The configuration for the API connection. Defaults to DEFAULT_API_CONNECT_OPTIONS.
178183
_gemini_tools (list[LLMTool], optional): Gemini-specific tools to use for the session. This parameter is experimental and may change.
179184
@@ -265,6 +270,7 @@ def __init__(
265270
context_window_compression=context_window_compression,
266271
api_version=api_version,
267272
gemini_tools=_gemini_tools,
273+
tool_behavior=tool_behavior,
268274
conn_options=conn_options,
269275
http_options=http_options,
270276
)
@@ -281,6 +287,8 @@ def update_options(
281287
*,
282288
voice: NotGivenOr[str] = NOT_GIVEN,
283289
temperature: NotGivenOr[float] = NOT_GIVEN,
290+
tool_behavior: NotGivenOr[types.Behavior] = NOT_GIVEN,
291+
tool_response_scheduling: NotGivenOr[types.FunctionResponseScheduling] = NOT_GIVEN,
284292
) -> None:
285293
"""
286294
Update the options for the RealtimeModel.
@@ -296,10 +304,18 @@ def update_options(
296304
if is_given(temperature):
297305
self._opts.temperature = temperature
298306

307+
if is_given(tool_behavior):
308+
self._opts.tool_behavior = tool_behavior
309+
310+
if is_given(tool_response_scheduling):
311+
self._opts.tool_response_scheduling = tool_response_scheduling
312+
299313
for sess in self._sessions:
300314
sess.update_options(
301315
voice=self._opts.voice,
302316
temperature=self._opts.temperature,
317+
tool_behavior=self._opts.tool_behavior,
318+
tool_response_scheduling=self._opts.tool_response_scheduling,
303319
)
304320

305321
async def aclose(self) -> None:
@@ -337,7 +353,7 @@ def __init__(self, realtime_model: RealtimeModel) -> None:
337353
if api_version:
338354
http_options.api_version = api_version
339355

340-
self._client = genai.Client(
356+
self._client = GenAIClient(
341357
api_key=self._opts.api_key,
342358
vertexai=self._opts.vertexai,
343359
project=self._opts.project,
@@ -381,6 +397,8 @@ def update_options(
381397
voice: NotGivenOr[str] = NOT_GIVEN,
382398
temperature: NotGivenOr[float] = NOT_GIVEN,
383399
tool_choice: NotGivenOr[llm.ToolChoice | None] = NOT_GIVEN,
400+
tool_behavior: NotGivenOr[types.Behavior] = NOT_GIVEN,
401+
tool_response_scheduling: NotGivenOr[types.FunctionResponseScheduling] = NOT_GIVEN,
384402
) -> None:
385403
should_restart = False
386404
if is_given(voice) and self._opts.voice != voice:
@@ -391,6 +409,20 @@ def update_options(
391409
self._opts.temperature = temperature if is_given(temperature) else NOT_GIVEN
392410
should_restart = True
393411

412+
if is_given(tool_behavior) and self._opts.tool_behavior != tool_behavior:
413+
self._opts.tool_behavior = tool_behavior
414+
should_restart = True
415+
416+
if (
417+
is_given(tool_response_scheduling)
418+
and self._opts.tool_response_scheduling != tool_response_scheduling
419+
):
420+
self._opts.tool_response_scheduling = tool_response_scheduling
421+
# no need to restart
422+
423+
if is_given(tool_choice):
424+
logger.warning("tool_choice is not supported by the Google Realtime API.")
425+
394426
if should_restart:
395427
self._mark_restart_needed()
396428

@@ -422,7 +454,11 @@ async def update_chat_ctx(self, chat_ctx: llm.ChatContext) -> None:
422454
).to_provider_format(format="google", inject_dummy_user_message=False)
423455
# we are not generating, and do not need to inject
424456
turns = [types.Content.model_validate(turn) for turn in turns_dict]
425-
tool_results = get_tool_results_for_realtime(append_ctx, vertexai=self._opts.vertexai)
457+
tool_results = get_tool_results_for_realtime(
458+
append_ctx,
459+
vertexai=self._opts.vertexai,
460+
tool_response_scheduling=self._opts.tool_response_scheduling,
461+
)
426462
if turns:
427463
self._send_client_event(types.LiveClientContent(turns=turns, turn_complete=False))
428464
if tool_results:
@@ -434,7 +470,7 @@ async def update_chat_ctx(self, chat_ctx: llm.ChatContext) -> None:
434470

435471
async def update_tools(self, tools: list[llm.FunctionTool | llm.RawFunctionTool]) -> None:
436472
new_declarations: list[types.FunctionDeclaration] = to_fnc_ctx(
437-
tools, use_parameters_json_schema=False
473+
tools, use_parameters_json_schema=False, tool_behavior=self._opts.tool_behavior
438474
)
439475
current_tool_names = {f.name for f in self._gemini_declarations}
440476
new_tool_names = {f.name for f in new_declarations}

livekit-plugins/livekit-plugins-google/livekit/plugins/google/utils.py

Lines changed: 31 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -16,6 +16,8 @@
1616
is_function_tool,
1717
is_raw_function_tool,
1818
)
19+
from livekit.agents.types import NOT_GIVEN, NotGivenOr
20+
from livekit.agents.utils import is_given
1921

2022
from .log import logger
2123
from .tools import _LLMTool
@@ -24,7 +26,10 @@
2426

2527

2628
def to_fnc_ctx(
27-
fncs: list[FunctionTool | RawFunctionTool], *, use_parameters_json_schema: bool = True
29+
fncs: list[FunctionTool | RawFunctionTool],
30+
*,
31+
use_parameters_json_schema: bool = True,
32+
tool_behavior: NotGivenOr[types.Behavior] = NOT_GIVEN,
2833
) -> list[types.FunctionDeclaration]:
2934
tools: list[types.FunctionDeclaration] = []
3035
for fnc in fncs:
@@ -43,10 +48,14 @@ def to_fnc_ctx(
4348
info.raw_schema.get("parameters", {})
4449
)
4550
)
51+
52+
if is_given(tool_behavior):
53+
fnc_kwargs["behavior"] = tool_behavior
54+
4655
tools.append(types.FunctionDeclaration(**fnc_kwargs))
4756

4857
elif is_function_tool(fnc):
49-
tools.append(_build_gemini_fnc(fnc))
58+
tools.append(_build_gemini_fnc(fnc, tool_behavior=tool_behavior))
5059

5160
return tools
5261

@@ -88,14 +97,20 @@ def create_tools_config(
8897

8998

9099
def get_tool_results_for_realtime(
91-
chat_ctx: llm.ChatContext, *, vertexai: bool = False
100+
chat_ctx: llm.ChatContext,
101+
*,
102+
vertexai: bool = False,
103+
tool_response_scheduling: NotGivenOr[types.FunctionResponseScheduling] = NOT_GIVEN,
92104
) -> types.LiveClientToolResponse | None:
93105
function_responses: list[types.FunctionResponse] = []
94106
for msg in chat_ctx.items:
95107
if msg.type == "function_call_output":
96108
res = types.FunctionResponse(
97109
name=msg.name,
98110
response={"output": msg.output},
111+
scheduling=tool_response_scheduling
112+
if is_given(tool_response_scheduling)
113+
else types.FunctionResponseScheduling.WHEN_IDLE,
99114
)
100115
if not vertexai:
101116
# vertexai does not support id in FunctionResponse
@@ -109,14 +124,21 @@ def get_tool_results_for_realtime(
109124
)
110125

111126

112-
def _build_gemini_fnc(function_tool: FunctionTool) -> types.FunctionDeclaration:
127+
def _build_gemini_fnc(
128+
function_tool: FunctionTool, *, tool_behavior: NotGivenOr[types.Behavior] = NOT_GIVEN
129+
) -> types.FunctionDeclaration:
113130
fnc = llm.utils.build_legacy_openai_schema(function_tool, internally_tagged=True)
114131
json_schema = _GeminiJsonSchema(fnc["parameters"]).simplify()
115-
return types.FunctionDeclaration(
116-
name=fnc["name"],
117-
description=fnc["description"],
118-
parameters=types.Schema.model_validate(json_schema) if json_schema else None,
119-
)
132+
133+
kwargs = {
134+
"name": fnc["name"],
135+
"description": fnc["description"],
136+
"parameters": types.Schema.model_validate(json_schema) if json_schema else None,
137+
}
138+
if is_given(tool_behavior):
139+
kwargs["behavior"] = tool_behavior
140+
141+
return types.FunctionDeclaration(**kwargs)
120142

121143

122144
def to_response_format(response_format: type | dict) -> types.SchemaUnion:

0 commit comments

Comments (0)