Commit 0453985

Iteration №2: use separate ABCs instead of flags

1 parent 4c3b8b7 commit 0453985

11 files changed: +133 −73 lines changed

DEVELOPMENT.md

Lines changed: 1 addition & 1 deletion

@@ -105,7 +105,7 @@ To see how the agent work open up agents.py
 
 **Video**
 
-* The agent receives the video track, and calls agent.llm._watch_video_track
+* The agent receives the video track, and calls agent.llm.watch_video_track
 * The LLM uses the VideoForwarder to write the video to a websocket or webrtc connection
 * The STS writes the reply on agent.llm.audio_track and the RealtimeTranscriptEvent / RealtimePartialTranscriptEvent
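For orientation, the hand-off described above reduces to a capability check followed by the new public hook. A minimal sketch; `hand_off_video`, `edge_track`, and `forwarder` are hypothetical stand-ins for objects the Agent already holds:

```python
from vision_agents.core.llm.llm import VideoLLM


async def hand_off_video(agent, edge_track, forwarder) -> None:
    # Only video-capable LLMs expose watch_video_track; plain LLMs are skipped.
    if isinstance(agent.llm, VideoLLM):
        # The LLM consumes frames via the shared forwarder and streams them
        # to its websocket or WebRTC connection.
        await agent.llm.watch_video_track(edge_track, shared_forwarder=forwarder)
```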

agents-core/vision_agents/core/agents/agents.py

Lines changed: 39 additions & 29 deletions

@@ -5,7 +5,7 @@
 import time
 import uuid
 from dataclasses import asdict
-from typing import TYPE_CHECKING, Any, Dict, List, Optional
+from typing import TYPE_CHECKING, Any, Dict, List, Optional, TypeGuard
 from uuid import uuid4
 
 import getstream.models
@@ -30,7 +30,7 @@
     RealtimeUserSpeechTranscriptionEvent,
     RealtimeAgentSpeechTranscriptionEvent,
 )
-from ..llm.llm import LLM
+from ..llm.llm import AudioLLM, LLM, VideoLLM
 from ..llm.realtime import Realtime
 from ..mcp import MCPBaseServer, MCPManager
 from ..processors.base_processor import Processor, ProcessorType, filter_processors
@@ -109,6 +109,18 @@ def default_agent_options():
     return AgentOptions(model_dir=_DEFAULT_MODEL_DIR)
 
 
+def _is_audio_llm(llm: LLM | VideoLLM | AudioLLM) -> TypeGuard[AudioLLM]:
+    return isinstance(llm, AudioLLM)
+
+
+def _is_video_llm(llm: LLM | VideoLLM | AudioLLM) -> TypeGuard[VideoLLM]:
+    return isinstance(llm, VideoLLM)
+
+
+def _is_realtime_llm(llm: LLM | AudioLLM | VideoLLM | Realtime) -> TypeGuard[Realtime]:
+    return isinstance(llm, Realtime)
+
+
 class Agent:
     """
     Agent class makes it easy to build your own video AI.
@@ -139,7 +151,7 @@ def __init__(
         # edge network for video & audio
         edge: "StreamEdge",
         # llm, optionally with sts/realtime capabilities
-        llm: LLM | Realtime,
+        llm: LLM | AudioLLM | VideoLLM,
         # the agent's user info
         agent_user: User,
         # instructions
@@ -424,7 +436,7 @@ async def _on_tts_audio_write_to_output(event: TTSAudioEvent):
 
         @self.events.subscribe
         async def on_stt_transcript_event_create_response(event: STTTranscriptEvent):
-            if self.llm.handles_audio:
+            if _is_audio_llm(self.llm):
                 # There is no need to send the response to the LLM if it handles audio itself.
                 return
 
@@ -493,7 +505,7 @@ async def join(self, call: Call) -> "AgentSessionContextManager":
 
         # Ensure Realtime providers are ready before proceeding (they manage their own connection)
         self.logger.info(f"🤖 Agent joining call: {call.id}")
-        if isinstance(self.llm, Realtime):
+        if _is_realtime_llm(self.llm):
            await self.llm.connect()
 
        with self.span("edge.join"):
@@ -812,12 +824,12 @@ async def on_video_track_added(event: TrackAddedEvent):
                    f"🎥 Track re-added: {track_type_name} ({track_id}), switching to it"
                )
 
-                if self.llm.handles_video:
+                if _is_video_llm(self.llm):
                    # Get the existing forwarder and switch to this track
                    _, _, forwarder = self._active_video_tracks[track_id]
                    track = self.edge.add_track_subscriber(track_id)
                    if track and forwarder:
-                        await self.llm._watch_video_track(
+                        await self.llm.watch_video_track(
                            track, shared_forwarder=forwarder
                        )
                        self._current_video_track_id = track_id
@@ -846,7 +858,7 @@ async def on_video_track_removed(event: TrackRemovedEvent):
            self._active_video_tracks.pop(track_id, None)
 
            # If this was the active track, switch to any other available track
-            if self.llm.handles_video and track_id == self._current_video_track_id:
+            if _is_video_llm(self.llm) and track_id == self._current_video_track_id:
                self.logger.info(
                    "🎥 Active video track removed, switching to next available"
                )
@@ -872,7 +884,7 @@ async def _reply_to_audio(
        )
 
        # when in Realtime mode call the Realtime directly (non-blocking)
-        if self.llm.handles_audio:
+        if _is_audio_llm(self.llm):
            # TODO: this behaviour should be easy to change in the agent class
            asyncio.create_task(
                self.llm.simple_audio_response(pcm_data, participant)
@@ -908,9 +920,9 @@ async def _switch_to_next_available_track(self) -> None:
 
            # Get the track and forwarder
            track = self.edge.add_track_subscriber(track_id)
-            if track and forwarder and isinstance(self.llm, Realtime):
+            if track and forwarder and _is_video_llm(self.llm):
                # Send to Realtime provider
-                await self.llm._watch_video_track(track, shared_forwarder=forwarder)
+                await self.llm.watch_video_track(track, shared_forwarder=forwarder)
                self._current_video_track_id = track_id
                return
            else:
@@ -973,7 +985,7 @@ async def recv(self):
            # If Realtime provider supports video, switch to this new track
            track_type_name = TrackType.Name(track_type)
 
-            if self.llm.handles_video:
+            if _is_video_llm(self.llm):
                if self._video_track:
                    # We have a video publisher (e.g., YOLO processor)
                    # Create a separate forwarder for the PROCESSED video track
@@ -989,22 +1001,20 @@ async def recv(self):
                    await processed_forwarder.start()
                    self._video_forwarders.append(processed_forwarder)
 
-                    if isinstance(self.llm, Realtime):
-                        # Send PROCESSED frames with the processed forwarder
-                        await self.llm._watch_video_track(
-                            self._video_track, shared_forwarder=processed_forwarder
-                        )
-                        self._current_video_track_id = track_id
+                    # Send PROCESSED frames with the processed forwarder
+                    await self.llm.watch_video_track(
+                        self._video_track, shared_forwarder=processed_forwarder
+                    )
+                    self._current_video_track_id = track_id
                else:
                    # No video publisher, send raw frames - switch to this new track
                    self.logger.info(
                        f"🎥 Switching to {track_type_name} track: {track_id}"
                    )
-                    if isinstance(self.llm, Realtime):
-                        await self.llm._watch_video_track(
-                            track, shared_forwarder=raw_forwarder
-                        )
-                        self._current_video_track_id = track_id
+                    await self.llm.watch_video_track(
+                        track, shared_forwarder=raw_forwarder
+                    )
+                    self._current_video_track_id = track_id
 
        has_image_processors = len(self.image_processors) > 0
 
@@ -1096,7 +1106,7 @@ async def recv(self):
    async def _on_turn_event(self, event: TurnStartedEvent | TurnEndedEvent) -> None:
        """Handle turn detection events."""
        # Skip the turn event handling if the model doesn't require TTS or SST audio itself.
-        if not (self.llm.needs_tts and self.llm.needs_stt):
+        if _is_audio_llm(self.llm):
            return
 
        if isinstance(event, TurnStartedEvent):
@@ -1167,7 +1177,7 @@ def publish_audio(self) -> bool:
        Returns:
            True if TTS is configured, when in Realtime mode, or if there are audio publishers.
        """
-        if self.tts is not None or self.llm.handles_audio:
+        if self.tts is not None or _is_audio_llm(self.llm):
            return True
        # Also publish audio if there are audio publishers (e.g., HeyGen avatar)
        if self.audio_publishers:
@@ -1204,7 +1214,7 @@ def _needs_audio_or_video_input(self) -> bool:
        # Video input needed for:
        # - Video processors (for frame analysis)
        # - Realtime mode with video (multimodal LLMs)
-        needs_video = len(self.video_processors) > 0 or self.llm.handles_video
+        needs_video = len(self.video_processors) > 0 or _is_video_llm(self.llm)
 
        return needs_audio or needs_video
 
@@ -1255,7 +1265,7 @@ def image_processors(self) -> List[Any]:
 
    def _validate_configuration(self):
        """Validate the agent configuration."""
-        if self.llm.handles_audio:
+        if _is_audio_llm(self.llm):
            # Realtime mode - should not have separate STT/TTS
            if self.stt or self.tts:
                self.logger.warning(
@@ -1292,8 +1302,8 @@ def _prepare_rtc(self):
 
        # Set up audio track if TTS is available
        if self.publish_audio:
-            if self.llm.handles_audio:
-                self._audio_track = self.llm.output_track
+            if _is_audio_llm(self.llm):
+                self._audio_track = self.llm.output_audio_track
                self.logger.info("🎵 Using Realtime provider output track for audio")
            elif self.audio_publishers:
                # Get the first audio publisher to create the track
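Why `TypeGuard` helpers rather than inline `isinstance` checks: a `TypeGuard`-annotated predicate gives the capability check a single readable name while still letting static checkers narrow the `LLM | AudioLLM | VideoLLM` union at each call site. A self-contained sketch of the pattern, with toy classes rather than the real ones:

```python
from typing import TypeGuard


class Model: ...


class AudioModel(Model):
    async def respond_to_audio(self) -> None: ...


def is_audio_model(m: Model) -> TypeGuard[AudioModel]:
    return isinstance(m, AudioModel)


async def handle(m: Model) -> None:
    if is_audio_model(m):
        # Within this branch, checkers treat `m` as AudioModel, so the
        # audio-only method type-checks without a cast.
        await m.respond_to_audio()
```
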
agents-core/vision_agents/core/llm/__init__.py

Lines changed: 10 additions & 2 deletions

@@ -1,5 +1,13 @@
-from .llm import LLM
+from .llm import LLM, AudioLLM, VideoLLM, OmniLLM
 from .realtime import Realtime
 from .function_registry import FunctionRegistry, function_registry
 
-__all__ = ["LLM", "Realtime", "FunctionRegistry", "function_registry"]
+__all__ = [
+    "LLM",
+    "AudioLLM",
+    "VideoLLM",
+    "OmniLLM",
+    "Realtime",
+    "FunctionRegistry",
+    "function_registry",
+]
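With the re-exports above, downstream code can depend on the capability ABCs from the package root. A hypothetical consumer (`describe_capabilities` is illustrative, not part of this commit):

```python
from vision_agents.core.llm import LLM, AudioLLM, VideoLLM


def describe_capabilities(llm: LLM) -> str:
    # The ABCs replace the old needs_stt/needs_tts/handles_* flag set.
    audio = "speech-to-speech" if isinstance(llm, AudioLLM) else "needs STT/TTS"
    video = "video-aware" if isinstance(llm, VideoLLM) else "audio/text only"
    return f"{audio}, {video}"
```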

agents-core/vision_agents/core/llm/llm.py

Lines changed: 42 additions & 7 deletions

@@ -15,6 +15,7 @@
     Generic,
 )
 
+import aiortc
 from vision_agents.core.llm import events
 from vision_agents.core.llm.events import ToolStartEvent, ToolEndEvent
 
@@ -23,11 +24,13 @@
     from vision_agents.core.agents.conversation import Conversation
 
 from getstream.video.rtc.pb.stream.video.sfu.models.models_pb2 import Participant
+from getstream.video.rtc import AudioStreamTrack, PcmData
 from vision_agents.core.processors import Processor
 from vision_agents.core.utils.utils import parse_instructions
 from vision_agents.core.events.manager import EventManager
 from .function_registry import FunctionRegistry
 from .llm_types import ToolSchema, NormalizedToolCallItem
+from ..utils.video_forwarder import VideoForwarder
 
 T = TypeVar("T")
 
@@ -44,13 +47,6 @@ def __init__(self, original: T, text: str, exception: Optional[Exception] = None
 
 
 class LLM(abc.ABC):
-    # Instruct the Agent that this model requires STT and TTS services, and it doesn't handle audio and video
-    # on its own.
-    needs_stt: bool = True
-    needs_tts: bool = True
-    handles_audio: bool = False
-    handles_video: bool = False
-
     before_response_listener: BeforeCb
     after_response_listener: AfterCb
     agent: Optional["Agent"]
@@ -407,3 +403,42 @@ def _sanitize_tool_output(self, value: Any, max_chars: int = 60_000) -> str:
         """
         s = value if isinstance(value, str) else json.dumps(value)
         return (s[:max_chars] + "…") if len(s) > max_chars else s
+
+
+class AudioLLM(LLM, metaclass=abc.ABCMeta):
+    """
+    A base class for LLMs capable of processing speech-to-speech audio.
+    These models do not require TTS and STT services to run.
+    """
+
+    @abc.abstractmethod
+    async def simple_audio_response(
+        self, pcm: PcmData, participant: Optional[Participant] = None
+    ): ...
+
+    @property
+    @abc.abstractmethod
+    def output_audio_track(self) -> AudioStreamTrack: ...
+
+
+class VideoLLM(LLM, metaclass=abc.ABCMeta):
+    """
+    A base class for LLMs capable of processing video.
+
+    These models will receive the video track from the `Agent` to analyze it.
+    """
+
+    @abc.abstractmethod
+    async def watch_video_track(
+        self,
+        track: aiortc.mediastreams.MediaStreamTrack,
+        shared_forwarder: Optional[VideoForwarder] = None,
+    ) -> None: ...
+
+
+class OmniLLM(AudioLLM, VideoLLM, metaclass=abc.ABCMeta):
+    """
+    A base class for LLMs capable of both video and speech-to-speech audio processing.
+    """
+
+    ...
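To make the contract concrete, a hypothetical minimal provider against the new `AudioLLM` ABC might look like the following. Any abstract members inherited from `LLM` itself are elided, the response body is a placeholder, and the track parameters mirror the defaults being removed from `Realtime` below:

```python
from typing import Optional

from getstream.video.rtc import AudioStreamTrack, PcmData
from getstream.video.rtc.pb.stream.video.sfu.models.models_pb2 import Participant

from vision_agents.core.llm.llm import AudioLLM


class MyAudioProvider(AudioLLM):
    def __init__(self) -> None:
        super().__init__()
        # 48 kHz stereo s16, as the old Realtime base class defaulted to.
        self._track = AudioStreamTrack(sample_rate=48000, channels=2, format="s16")

    async def simple_audio_response(
        self, pcm: PcmData, participant: Optional[Participant] = None
    ) -> None:
        # A real provider would stream `pcm` to its speech-to-speech model
        # and write the generated reply onto `self._track`.
        ...

    @property
    def output_audio_track(self) -> AudioStreamTrack:
        return self._track
```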

agents-core/vision_agents/core/llm/realtime.py

Lines changed: 2 additions & 20 deletions

@@ -1,11 +1,9 @@
 from __future__ import annotations
 
 from typing import (
-    Any,
     Optional,
 )
 
-from getstream.video.rtc.audio_track import AudioStreamTrack
 from getstream.video.rtc.track_util import PcmData
 from vision_agents.core.edge.types import Participant
 
@@ -14,14 +12,13 @@
 import logging
 import uuid
 
-
-from . import events, LLM
+from . import events, OmniLLM
 
 
 logger = logging.getLogger(__name__)
 
 
-class Realtime(LLM, abc.ABC):
+class Realtime(OmniLLM):
     """
     Realtime is an abstract base class for LLMs that can receive audio and video
@@ -42,13 +39,6 @@ class Realtime(LLM, abc.ABC):
     fps: int = 1
     session_id: str  # UUID to identify this session
 
-    # Instruct the Agent that this model can handle audio and video
-    # without additional STT and TTS services.
-    handles_audio: bool = True
-    handles_video: bool = True
-    needs_stt = False
-    needs_tts = False
-
     def __init__(
         self,
         fps: int = 1,  # the number of video frames per second to send (for implementations that support setting fps)
@@ -59,10 +49,6 @@ def __init__(
         self.provider_name = "realtime_base"
         self.session_id = str(uuid.uuid4())
         self.fps = fps
-        # The most common style output track (webrtc)
-        self.output_track: AudioStreamTrack = AudioStreamTrack(
-            sample_rate=48000, channels=2, format="s16"
-        )
         # Store current participant for user speech transcription events
         self._current_participant: Optional[Participant] = None
 
@@ -74,10 +60,6 @@ async def simple_audio_response(
         self, pcm: PcmData, participant: Optional[Participant] = None
     ): ...
 
-    async def _watch_video_track(self, track: Any, **kwargs) -> None:
-        """Optionally overridden by providers that support video input."""
-        return None
-
     async def _stop_watching_video_track(self) -> None:
         """Optionally overridden by providers that support video input."""
         return None
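Since `Realtime` now derives its audio and video capabilities from `OmniLLM` rather than class-level flags, the default `output_track` constructed here moves to concrete providers, which satisfy `AudioLLM.output_audio_track` themselves. A sketch of how a provider could recreate the removed default; `MyRealtimeProvider` is hypothetical and its other abstract members (e.g. `watch_video_track`, `simple_audio_response`) are elided:

```python
from getstream.video.rtc import AudioStreamTrack

from vision_agents.core.llm.realtime import Realtime


class MyRealtimeProvider(Realtime):
    def __init__(self, fps: int = 1) -> None:
        super().__init__(fps=fps)
        # Same 48 kHz stereo s16 defaults the base class used to build in __init__.
        self._output_track = AudioStreamTrack(
            sample_rate=48000, channels=2, format="s16"
        )

    @property
    def output_audio_track(self) -> AudioStreamTrack:
        return self._output_track
```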
