Commit 4a178e9

Merge pull request #70 from GetStream/fix/agent-example

fix: Agent Example and TURN detection

2 parents a940bd3 + 2eacdfb

6 files changed: +1876 −1852 lines

agents-core/vision_agents/core/agents/agents.py

Lines changed: 80 additions & 5 deletions

```diff
@@ -129,6 +129,9 @@ def __init__(
         self.conversation: Optional[Conversation] = None
         self._user_conversation_handle: Optional[StreamHandle] = None
         self._agent_conversation_handle: Optional[StreamHandle] = None
+
+        # Track pending transcripts for turn-based response triggering
+        self._pending_user_transcripts: Dict[str, str] = {}
 
         # Merge plugin events BEFORE subscribing to any events
         for plugin in [stt, tts, turn_detection, vad, llm]:
@@ -673,16 +676,56 @@ async def _process_track(self, track_id: str, track_type: str, participant):
 
     async def _on_turn_event(self, event: TurnStartedEvent | TurnEndedEvent) -> None:
         """Handle turn detection events."""
+        # In realtime mode, the LLM handles turn detection, interruption, and responses itself
+        if self.realtime_mode:
+            return
+
         if isinstance(event, TurnStartedEvent):
-            # TODO: Implement TTS pause/resume functionality
-            # For now, TTS will continue playing - this should be improved
-            self.logger.info(
-                f"👉 Turn started - participant speaking {event.speaker_id} : {event.confidence}"
-            )
+            # Interrupt TTS when user starts speaking (barge-in)
+            if event.speaker_id and event.speaker_id != self.agent_user.id:
+                if self.tts:
+                    self.logger.info(
+                        f"👉 Turn started - interrupting TTS for participant {event.speaker_id}"
+                    )
+                    try:
+                        await self.tts.stop_audio()
+                    except Exception as e:
+                        self.logger.error(f"Error stopping TTS: {e}")
+                else:
+                    self.logger.info(
+                        f"👉 Turn started - participant speaking {event.speaker_id} : {event.confidence}"
+                    )
+            else:
+                # Agent itself started speaking - this is normal
+                self.logger.debug(
+                    f"👉 Turn started - agent speaking {event.speaker_id}"
+                )
         elif isinstance(event, TurnEndedEvent):
             self.logger.info(
                 f"👉 Turn ended - participant {event.speaker_id} finished (duration: {event.duration}, confidence: {event.confidence})"
             )
+
+            # When turn detection is enabled, trigger LLM response when user's turn ends
+            # This is the signal that the user has finished speaking and expects a response
+            if event.speaker_id and event.speaker_id != self.agent_user.id:
+                # Get the accumulated transcript for this speaker
+                transcript = self._pending_user_transcripts.get(event.speaker_id, "")
+
+                if transcript and transcript.strip():
+                    self.logger.info(f"🤖 Triggering LLM response after turn ended for {event.speaker_id}")
+
+                    # Create participant object if we have metadata
+                    participant = None
+                    if hasattr(event, 'custom') and event.custom:
+                        # Try to extract participant info from custom metadata
+                        participant = event.custom.get('participant')
+
+                    # Trigger LLM response with the complete transcript
+                    if self.llm:
+                        await self.simple_response(transcript, participant)
+
+                    # Clear the pending transcript for this speaker
+                    self._pending_user_transcripts[event.speaker_id] = ""
 
     async def _on_partial_transcript(
         self, event: STTPartialTranscriptEvent | RealtimePartialTranscriptEvent
@@ -737,6 +780,38 @@ async def _on_transcript(self, event: STTTranscriptEvent | RealtimeTranscriptEvent
             )
             self.conversation.complete_message(self._user_conversation_handle)
             self._user_conversation_handle = None
+
+        # In realtime mode, the LLM handles everything itself (STT, turn detection, responses)
+        # Skip our manual LLM triggering logic
+        if self.realtime_mode:
+            return
+
+        # Determine how to handle LLM triggering based on turn detection
+        if self.turn_detection is not None:
+            # With turn detection: accumulate transcripts and wait for TurnEndedEvent
+            # Store/append the transcript for this user
+            if user_id not in self._pending_user_transcripts:
+                self._pending_user_transcripts[user_id] = event.text
+            else:
+                # Append to existing transcript (user might be speaking in chunks)
+                self._pending_user_transcripts[user_id] += " " + event.text
+
+            self.logger.debug(
+                f"📝 Accumulated transcript for {user_id} (waiting for turn end): "
+                f"{self._pending_user_transcripts[user_id][:100]}..."
+            )
+        else:
+            # Without turn detection: trigger LLM immediately on transcript completion
+            # This is the traditional STT -> LLM flow
+            if self.llm:
+                self.logger.info("🤖 Triggering LLM response immediately (no turn detection)")
+
+                # Get participant from event metadata
+                participant = None
+                if hasattr(event, "user_metadata"):
+                    participant = event.user_metadata
+
+                await self.simple_response(event.text, participant)
 
     async def _on_stt_error(self, error):
         """Handle STT service errors."""
```

agents-core/vision_agents/core/turn_detection/fal_turn_detection.py

Lines changed: 7 additions & 4 deletions

```diff
@@ -311,10 +311,13 @@ async def _process_turn_prediction(
                 f"Turn completed detected for user {user_id} (confidence: {probability:.3f})"
             )
 
-            # If this user was speaking, emit turn ended
-            if self._current_speaker == user_id:
-                self._emit_turn_event(TurnEvent.TURN_ENDED, event_data)
-                self._current_speaker = None
+            # User finished speaking - emit turn ended
+            # Set them as current speaker if they weren't already (in case we missed the start)
+            if self._current_speaker != user_id:
+                self._current_speaker = user_id
+
+            self._emit_turn_event(TurnEvent.TURN_ENDED, event_data)
+            self._current_speaker = None
 
         else:
             # Turn is still in progress
```
examples/01_simple_agent_example/pyproject.toml

Lines changed: 11 additions & 20 deletions

```diff
@@ -6,31 +6,22 @@ requires-python = ">=3.13"
 # put only what this example needs
 dependencies = [
     "python-dotenv>=1.0",
-    "stream-agents-plugins-deepgram",
-    "stream-agents-plugins-elevenlabs",
-    "stream-agents-plugins-anthropic",
-    "stream-agents-plugins-getstream",
+    "vision-agents-plugins-deepgram",
+    "vision-agents-plugins-elevenlabs",
+    "vision-agents-plugins-anthropic",
+    "vision-agents-plugins-getstream",
     "getstream-plugins-common",
-    "stream-agents",
+    "vision-agents",
     "openai>=1.101.0",
-    "krisp-audio>=1.4.0; sys_platform == 'darwin' and platform_machine == 'aarch64'",
-    "krisp-audio>=1.4.0; sys_platform == 'win32'",
-    "krisp-audio>=1.4.0; sys_platform == 'linux' and platform_machine == 'x86_64'",
-    "krisp-audio>=1.4.0; sys_platform == 'linux' and platform_machine == 'aarch64'",
     "anthropic>=0.66.0",
     "google-genai>=1.33.0",
+    "fal-client>=0.5.3",
 ]
 
 [tool.uv.sources]
-krisp-audio = [
-    { path = "../../agents-core/stream_agents/core/turn_detection/krisp/krisp_audio-1.4.0-cp313-cp313-macosx_12_0_arm64.whl", marker = "sys_platform == 'darwin' and platform_machine == 'aarch64'" },
-    { path = "../../agents-core/stream_agents/core/turn_detection/krisp/krisp_audio-1.4.0-cp313-cp313-linux_aarch64.whl", marker = "sys_platform == 'linux' and platform_machine == 'aarch64'" },
-    { path = "../../agents-core/stream_agents/core/turn_detection/krisp/krisp_audio-1.4.0-cp313-cp313-linux_x86_64.whl", marker = "sys_platform == 'linux' and platform_machine == 'x86_64'" },
-    { path = "../../agents-core/stream_agents/core/turn_detection/krisp/krisp_audio-1.4.0-cp313-cp313-win_amd64.whl", marker = "sys_platform == 'win32'" }
-]
-"stream-agents-plugins-deepgram" = {path = "../../plugins/deepgram", editable=true}
-"stream-agents-plugins-elevenlabs" = {path = "../../plugins/elevenlabs", editable=true}
-"stream-agents-plugins-anthropic" = {path = "../../plugins/anthropic", editable=true}
-"stream-agents-plugins-getstream" = {path = "../../plugins/getstream", editable=true}
+"vision-agents-plugins-deepgram" = {path = "../../plugins/deepgram", editable=true}
+"vision-agents-plugins-elevenlabs" = {path = "../../plugins/elevenlabs", editable=true}
+"vision-agents-plugins-anthropic" = {path = "../../plugins/anthropic", editable=true}
+"vision-agents-plugins-getstream" = {path = "../../plugins/getstream", editable=true}
 
-"stream-agents" = {path = "../../agents-core", editable=true}
+"vision-agents" = {path = "../../agents-core", editable=true}
```

examples/01_simple_agent_example/simple_agent_example.py

Lines changed: 2 additions & 0 deletions

```diff
@@ -8,6 +8,7 @@
 from vision_agents.plugins import elevenlabs, deepgram, openai, getstream
 from vision_agents.core import agents, cli
 from vision_agents.core.events import CallSessionParticipantJoinedEvent
+from vision_agents.core.turn_detection import FalTurnDetection
 
 logging.basicConfig(
     level=logging.INFO,
@@ -37,6 +38,7 @@ async def start_agent() -> None:
         llm=llm,
         tts=elevenlabs.TTS(),
         stt=deepgram.STT(),
+        turn_detection=FalTurnDetection(buffer_duration=2.0, confidence_threshold=0.5),  # Enable turn detection with FAL
         #vad=silero.VAD(),
         # realtime version (vad, tts and stt not needed)
         # llm=openai.Realtime()
```
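With the turn detector wired in, the example now exercises the turn-detected path in `agents.py`; together with the realtime and immediate paths, the merged code has three distinct LLM triggers. A condensed routing sketch — the returned `mode` strings are mine, but the conditions mirror the branches added above:

```python
def llm_trigger(realtime_mode: bool, has_turn_detection: bool) -> str:
    """Summarizes when the merged agents.py hands a transcript to the LLM."""
    if realtime_mode:
        # Realtime LLMs (e.g. openai.Realtime()) do STT, turn detection,
        # and responses themselves; both handlers return early.
        return "handled by realtime LLM"
    if has_turn_detection:
        # Transcripts accumulate until TurnEndedEvent fires for the speaker.
        return "on TurnEndedEvent (accumulated transcript)"
    # Traditional flow: respond as soon as a final transcript arrives.
    return "on each final STT transcript"


assert llm_trigger(True, True) == "handled by realtime LLM"
assert llm_trigger(False, True) == "on TurnEndedEvent (accumulated transcript)"
assert llm_trigger(False, False) == "on each final STT transcript"
```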
