55import time
66import uuid
77from dataclasses import asdict
8- from typing import TYPE_CHECKING , Any , Dict , List , Optional
8+ from typing import TYPE_CHECKING , Any , Dict , List , Optional , TypeGuard
99from uuid import uuid4
1010
1111import getstream .models
3030 RealtimeUserSpeechTranscriptionEvent ,
3131 RealtimeAgentSpeechTranscriptionEvent ,
3232)
33- from ..llm .llm import LLM
33+ from ..llm .llm import AudioLLM , LLM , VideoLLM
3434from ..llm .realtime import Realtime
3535from ..mcp import MCPBaseServer , MCPManager
3636from ..processors .base_processor import Processor , ProcessorType , filter_processors
@@ -109,6 +109,18 @@ def default_agent_options():
109109 return AgentOptions (model_dir = _DEFAULT_MODEL_DIR )
110110
111111
def _is_audio_llm(llm: LLM | VideoLLM | AudioLLM) -> TypeGuard[AudioLLM]:
    """Return True when ``llm`` is an ``AudioLLM`` instance.

    Declared as a ``TypeGuard`` so that static type checkers narrow ``llm``
    to ``AudioLLM`` inside the branch where this returns True.

    Args:
        llm: The LLM object to inspect.

    Returns:
        The result of ``isinstance(llm, AudioLLM)``.
    """
    return isinstance(llm, AudioLLM)
114+
115+
def _is_video_llm(llm: LLM | VideoLLM | AudioLLM) -> TypeGuard[VideoLLM]:
    """Return True when ``llm`` is a ``VideoLLM`` instance.

    Declared as a ``TypeGuard`` so that static type checkers narrow ``llm``
    to ``VideoLLM`` inside the branch where this returns True.

    Args:
        llm: The LLM object to inspect.

    Returns:
        The result of ``isinstance(llm, VideoLLM)``.
    """
    return isinstance(llm, VideoLLM)
118+
119+
def _is_realtime_llm(
    llm: LLM | AudioLLM | VideoLLM | Realtime,
) -> TypeGuard[Realtime]:
    """Return True when ``llm`` is a ``Realtime`` instance.

    Declared as a ``TypeGuard`` so that static type checkers narrow ``llm``
    to ``Realtime`` inside the branch where this returns True.

    Args:
        llm: The LLM object to inspect.

    Returns:
        The result of ``isinstance(llm, Realtime)``.
    """
    return isinstance(llm, Realtime)
122+
123+
112124class Agent :
113125 """
114126 Agent class makes it easy to build your own video AI.
@@ -139,7 +151,7 @@ def __init__(
139151 # edge network for video & audio
140152 edge : "StreamEdge" ,
141153 # llm, optionally with sts/realtime capabilities
142- llm : LLM | Realtime ,
154+ llm : LLM | AudioLLM | VideoLLM ,
143155 # the agent's user info
144156 agent_user : User ,
145157 # instructions
@@ -424,7 +436,7 @@ async def _on_tts_audio_write_to_output(event: TTSAudioEvent):
424436
425437 @self .events .subscribe
426438 async def on_stt_transcript_event_create_response (event : STTTranscriptEvent ):
427- if self .llm . handles_audio :
439+ if _is_audio_llm ( self .llm ) :
428440 # There is no need to send the response to the LLM if it handles audio itself.
429441 return
430442
@@ -493,7 +505,7 @@ async def join(self, call: Call) -> "AgentSessionContextManager":
493505
494506 # Ensure Realtime providers are ready before proceeding (they manage their own connection)
495507 self .logger .info (f"🤖 Agent joining call: { call .id } " )
496- if isinstance (self .llm , Realtime ):
508+ if _is_realtime_llm (self .llm ):
497509 await self .llm .connect ()
498510
499511 with self .span ("edge.join" ):
@@ -812,12 +824,12 @@ async def on_video_track_added(event: TrackAddedEvent):
812824 f"🎥 Track re-added: { track_type_name } ({ track_id } ), switching to it"
813825 )
814826
815- if self .llm . handles_video :
827+ if _is_video_llm ( self .llm ) :
816828 # Get the existing forwarder and switch to this track
817829 _ , _ , forwarder = self ._active_video_tracks [track_id ]
818830 track = self .edge .add_track_subscriber (track_id )
819831 if track and forwarder :
820- await self .llm ._watch_video_track (
832+ await self .llm .watch_video_track (
821833 track , shared_forwarder = forwarder
822834 )
823835 self ._current_video_track_id = track_id
@@ -846,7 +858,7 @@ async def on_video_track_removed(event: TrackRemovedEvent):
846858 self ._active_video_tracks .pop (track_id , None )
847859
848860 # If this was the active track, switch to any other available track
849- if self .llm . handles_video and track_id == self ._current_video_track_id :
861+ if _is_video_llm ( self .llm ) and track_id == self ._current_video_track_id :
850862 self .logger .info (
851863 "🎥 Active video track removed, switching to next available"
852864 )
@@ -872,7 +884,7 @@ async def _reply_to_audio(
872884 )
873885
874886 # when in Realtime mode call the Realtime directly (non-blocking)
875- if self .llm . handles_audio :
887+ if _is_audio_llm ( self .llm ) :
876888 # TODO: this behaviour should be easy to change in the agent class
877889 asyncio .create_task (
878890 self .llm .simple_audio_response (pcm_data , participant )
@@ -908,9 +920,9 @@ async def _switch_to_next_available_track(self) -> None:
908920
909921 # Get the track and forwarder
910922 track = self .edge .add_track_subscriber (track_id )
911- if track and forwarder and isinstance (self .llm , Realtime ):
923+ if track and forwarder and _is_video_llm (self .llm ):
912924 # Send to Realtime provider
913- await self .llm ._watch_video_track (track , shared_forwarder = forwarder )
925+ await self .llm .watch_video_track (track , shared_forwarder = forwarder )
914926 self ._current_video_track_id = track_id
915927 return
916928 else :
@@ -973,7 +985,7 @@ async def recv(self):
973985 # If Realtime provider supports video, switch to this new track
974986 track_type_name = TrackType .Name (track_type )
975987
976- if self .llm . handles_video :
988+ if _is_video_llm ( self .llm ) :
977989 if self ._video_track :
978990 # We have a video publisher (e.g., YOLO processor)
979991 # Create a separate forwarder for the PROCESSED video track
@@ -989,22 +1001,20 @@ async def recv(self):
9891001 await processed_forwarder .start ()
9901002 self ._video_forwarders .append (processed_forwarder )
9911003
992- if isinstance (self .llm , Realtime ):
993- # Send PROCESSED frames with the processed forwarder
994- await self .llm ._watch_video_track (
995- self ._video_track , shared_forwarder = processed_forwarder
996- )
997- self ._current_video_track_id = track_id
1004+ # Send PROCESSED frames with the processed forwarder
1005+ await self .llm .watch_video_track (
1006+ self ._video_track , shared_forwarder = processed_forwarder
1007+ )
1008+ self ._current_video_track_id = track_id
9981009 else :
9991010 # No video publisher, send raw frames - switch to this new track
10001011 self .logger .info (
10011012 f"🎥 Switching to { track_type_name } track: { track_id } "
10021013 )
1003- if isinstance (self .llm , Realtime ):
1004- await self .llm ._watch_video_track (
1005- track , shared_forwarder = raw_forwarder
1006- )
1007- self ._current_video_track_id = track_id
1014+ await self .llm .watch_video_track (
1015+ track , shared_forwarder = raw_forwarder
1016+ )
1017+ self ._current_video_track_id = track_id
10081018
10091019 has_image_processors = len (self .image_processors ) > 0
10101020
@@ -1096,7 +1106,7 @@ async def recv(self):
10961106 async def _on_turn_event (self , event : TurnStartedEvent | TurnEndedEvent ) -> None :
10971107 """Handle turn detection events."""
10981108 # Skip the turn event handling if the model doesn't require TTS or STT audio itself.
1099- if not (self .llm . needs_tts and self . llm . needs_stt ):
1109+ if _is_audio_llm (self .llm ):
11001110 return
11011111
11021112 if isinstance (event , TurnStartedEvent ):
@@ -1167,7 +1177,7 @@ def publish_audio(self) -> bool:
11671177 Returns:
11681178 True if TTS is configured, when in Realtime mode, or if there are audio publishers.
11691179 """
1170- if self .tts is not None or self .llm . handles_audio :
1180+ if self .tts is not None or _is_audio_llm ( self .llm ) :
11711181 return True
11721182 # Also publish audio if there are audio publishers (e.g., HeyGen avatar)
11731183 if self .audio_publishers :
@@ -1204,7 +1214,7 @@ def _needs_audio_or_video_input(self) -> bool:
12041214 # Video input needed for:
12051215 # - Video processors (for frame analysis)
12061216 # - Realtime mode with video (multimodal LLMs)
1207- needs_video = len (self .video_processors ) > 0 or self .llm . handles_video
1217+ needs_video = len (self .video_processors ) > 0 or _is_video_llm ( self .llm )
12081218
12091219 return needs_audio or needs_video
12101220
@@ -1255,7 +1265,7 @@ def image_processors(self) -> List[Any]:
12551265
12561266 def _validate_configuration (self ):
12571267 """Validate the agent configuration."""
1258- if self .llm . handles_audio :
1268+ if _is_audio_llm ( self .llm ) :
12591269 # Realtime mode - should not have separate STT/TTS
12601270 if self .stt or self .tts :
12611271 self .logger .warning (
@@ -1292,8 +1302,8 @@ def _prepare_rtc(self):
12921302
12931303 # Set up audio track if TTS is available
12941304 if self .publish_audio :
1295- if self .llm . handles_audio :
1296- self ._audio_track = self .llm .output_track
1305+ if _is_audio_llm ( self .llm ) :
1306+ self ._audio_track = self .llm .output_audio_track
12971307 self .logger .info ("🎵 Using Realtime provider output track for audio" )
12981308 elif self .audio_publishers :
12991309 # Get the first audio publisher to create the track
0 commit comments