Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 4 additions & 0 deletions livekit-agents/livekit/agents/llm/realtime.py
Original file line number Diff line number Diff line change
Expand Up @@ -182,6 +182,10 @@ def generate_reply(
@abstractmethod
def commit_audio(self) -> None: ...

# Commit the user turn to the server.
# Declared abstract: each concrete realtime session must provide its own
# implementation of this method.
@abstractmethod
def commit_user_turn(self) -> None: ...

# clear the input audio buffer to the server
@abstractmethod
def clear_audio(self) -> None: ...
Expand Down
3 changes: 3 additions & 0 deletions livekit-agents/livekit/agents/voice/agent_activity.py
Original file line number Diff line number Diff line change
Expand Up @@ -976,6 +976,9 @@ def clear_user_turn(self) -> None:
self._rt_session.clear_audio()

def commit_user_turn(self, *, transcript_timeout: float, stt_flush_duration: float) -> None:
if self._rt_session is not None:
self._rt_session.commit_user_turn()
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Maybe this can be simplified to

Suggested change
self._rt_session.commit_user_turn()
self._rt_session.commit_audio()
self._rt_session.generate_reply()

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I think commit audio alone works and we don't really need any changes.

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.


Comment on lines 978 to +981
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

⚠️ Potential issue | 🟠 Major

🧩 Analysis chain

🏁 Script executed:

#!/bin/bash
# Check for commit_user_turn implementation and exception types
rg -n "def commit_user_turn" --type py -A 5

Repository: livekit/agents

Length of output: 6596


🏁 Script executed:

#!/bin/bash
# Search for llm.RealtimeError or similar exception definitions
rg -n "class.*RealtimeError|RealtimeError" --type py -A 2

Repository: livekit/agents

Length of output: 9008


🏁 Script executed:

#!/bin/bash
# Check the context around lines 978-981 to see what follows commit_user_turn
sed -n '975,990p' livekit-agents/livekit/agents/voice/agent_activity.py

Repository: livekit/agents

Length of output: 782


🏁 Script executed:

#!/bin/bash
# Search for AudioRecognition.commit_user_turn to understand the full flow
rg -n "AudioRecognition.*commit_user_turn|\.commit_user_turn" livekit-agents/livekit/agents/voice/agent_activity.py -B 2 -A 2

Repository: livekit/agents

Length of output: 494


Wrap realtime commit to ensure STT commit always runs.

The _rt_session.commit_user_turn() call should be guarded with exception handling. While current provider implementations don't raise exceptions, this ensures the STT commit at line 983 always executes even if a future provider implementation raises RealtimeError. This pattern is already established elsewhere in the codebase for other realtime operations.

Suggested fix
        if self._rt_session is not None:
-            self._rt_session.commit_user_turn()
+            try:
+                self._rt_session.commit_user_turn()
+            except llm.RealtimeError:
+                logger.exception("failed to commit user turn on realtime session")
📝 Committable suggestion

‼️ IMPORTANT
Carefully review the code before committing. Ensure that it accurately replaces the highlighted code, contains no missing lines, and has no issues with indentation. Thoroughly test & benchmark the code to ensure it meets the requirements.

Suggested change
def commit_user_turn(self, *, transcript_timeout: float, stt_flush_duration: float) -> None:
if self._rt_session is not None:
self._rt_session.commit_user_turn()
def commit_user_turn(self, *, transcript_timeout: float, stt_flush_duration: float) -> None:
if self._rt_session is not None:
try:
self._rt_session.commit_user_turn()
except llm.RealtimeError:
logger.exception("failed to commit user turn on realtime session")
🤖 Prompt for AI Agents
In `@livekit-agents/livekit/agents/voice/agent_activity.py` around lines 978 -
981, Wrap the realtime call in commit_user_turn so exceptions from
self._rt_session.commit_user_turn() (e.g., RealtimeError) are caught and do not
prevent the STT commit from running; specifically, in the commit_user_turn
method surround the call to self._rt_session.commit_user_turn() with a
try/except that catches RealtimeError (or a broad Exception if RealtimeError
isn't imported) and ignores or logs it, then ensure the subsequent STT commit
still executes unconditionally.

assert self._audio_recognition is not None
self._audio_recognition.commit_user_turn(
audio_detached=not self._session.input.audio_enabled,
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -2005,6 +2005,9 @@ def commit_audio(self) -> None:
# Does not clear anything: it only logs a warning that clear_audio is
# unsupported by this backend.
def clear_audio(self) -> None:
logger.warning("clear_audio is not supported by Nova Sonic's Realtime API")

# Does not commit anything: it only logs a warning that commit_user_turn is
# unsupported by this backend.
def commit_user_turn(self) -> None:
logger.warning("commit_user_turn is not supported by Nova Sonic's Realtime API")

# The given frame is ignored; the call only logs a warning that video input
# is unsupported by this backend.
def push_video(self, frame: rtc.VideoFrame) -> None:
logger.warning("video is not supported by Nova Sonic's Realtime API")

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -1225,10 +1225,13 @@ def _handle_go_away(self, go_away: types.LiveServerGoAway) -> None:
self._session_should_close.set()

def commit_audio(self) -> None:
pass
logger.warning("commit_audio is not supported by Gemini Realtime API.")

def clear_audio(self) -> None:
pass
logger.warning("clear_audio is not supported by Gemini Realtime API.")

# Does not commit anything: it only logs a warning that commit_user_turn is
# unsupported by this backend.
def commit_user_turn(self) -> None:
logger.warning("commit_user_turn is not supported by Gemini Realtime API.")

def _resample_audio(self, frame: rtc.AudioFrame) -> Iterator[rtc.AudioFrame]:
if self._input_resampler:
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -1292,6 +1292,23 @@ def clear_audio(self) -> None:
self.send_event(InputAudioBufferClearEvent(type="input_audio_buffer.clear"))
self._pushed_duration_s = 0

def commit_user_turn(self) -> None:
    """Commit the user turn and request a response from the server.

    Calls ``commit_audio()`` and then sends a ``response.create`` event.
    A warning is logged beforehand when turn detection is configured with
    ``interrupt_response`` or ``create_response`` enabled.
    """
    turn_detection = self._realtime_model._opts.turn_detection
    auto_response_enabled = turn_detection is not None and (
        turn_detection.interrupt_response or turn_detection.create_response
    )
    if auto_response_enabled:
        logger.warning(
            "commit_user_turn is triggered when auto response is enabled. Model behavior may be unexpected."
        )

    self.commit_audio()

    # Explicitly ask the server to generate a response for the committed turn.
    create_event = ResponseCreateEvent(
        type="response.create",
        response=RealtimeResponseCreateParams(),
    )
    self.send_event(create_event)

def generate_reply(
self, *, instructions: NotGivenOr[str] = NOT_GIVEN
) -> asyncio.Future[llm.GenerationCreatedEvent]:
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -1120,6 +1120,23 @@ def clear_audio(self) -> None:
self.send_event(InputAudioBufferClearEvent(type="input_audio_buffer.clear"))
self._pushed_duration_s = 0

def commit_user_turn(self) -> None:
    """Commit the user turn and request a response from the server.

    Calls ``commit_audio()`` and then sends a ``response.create`` event.
    A warning is logged beforehand when turn detection is configured with
    ``interrupt_response`` or ``create_response`` enabled.
    """
    opts = self._realtime_model._opts
    if opts.turn_detection is not None:
        if opts.turn_detection.interrupt_response or opts.turn_detection.create_response:
            logger.warning(
                "commit_user_turn is triggered when auto response is enabled. Model behavior may be unexpected."
            )

    self.commit_audio()
    # Explicitly ask the server to generate a response for the committed turn.
    self.send_event(ResponseCreateEvent(type="response.create", response=Response()))

def generate_reply(
self, *, instructions: NotGivenOr[str] = NOT_GIVEN
) -> asyncio.Future[llm.GenerationCreatedEvent]:
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -466,7 +466,7 @@ def push_audio(self, frame: rtc.AudioFrame) -> None:

def push_video(self, frame: rtc.VideoFrame) -> None:
"""Push video frames (not supported by Ultravox)."""
pass
logger.warning("push_video is not supported by Ultravox.")

def _send_client_event(self, event: UltravoxEvent | dict[str, Any]) -> None:
"""Send an event to the Ultravox WebSocket."""
Expand Down Expand Up @@ -1138,6 +1138,9 @@ def commit_audio(self) -> None:
# Does not clear anything: it only logs a warning that clearing audio is
# unsupported by this backend.
def clear_audio(self) -> None:
logger.warning("clear audio is not supported by Ultravox.")

# Does not commit anything: it only logs a warning that commit_user_turn is
# unsupported by this backend.
def commit_user_turn(self) -> None:
logger.warning("commit_user_turn is not supported by Ultravox.")

def _resample_audio(self, frame: rtc.AudioFrame) -> Iterator[rtc.AudioFrame]:
"""Resample audio frame to the required sample rate."""
if self._input_resampler:
Expand Down