
Commit c09e360

ensure agent user is initialized before joining the call

1 parent 1025a42 commit c09e360

6 files changed: +244 -185 lines changed

agents-core/vision_agents/core/agents/agents.py

Lines changed: 88 additions & 64 deletions
@@ -50,6 +50,7 @@ def _log_task_exception(task: asyncio.Task):
     except Exception:
         logger.exception("Error in background task")
 
+
 class Agent:
     """
     Agent class makes it easy to build your own video AI.
@@ -102,6 +103,7 @@ def __init__(
         self.instructions = instructions
         self.edge = edge
         self.agent_user = agent_user
+        self._agent_user_initialized = False
 
         # only needed in case we spin threads
         self._root_span = trace.get_current_span()
@@ -124,15 +126,19 @@ def __init__(
         self._call_context_token: CallContextToken | None = None
 
         # Initialize MCP manager if servers are provided
-        self.mcp_manager = MCPManager(self.mcp_servers, self.llm, self.logger) if self.mcp_servers else None
+        self.mcp_manager = (
+            MCPManager(self.mcp_servers, self.llm, self.logger)
+            if self.mcp_servers
+            else None
+        )
 
         # we sync the user talking and the agent responses to the conversation
         # because we want to support streaming responses and can have delta updates for both
         # user and agent we keep an handle for both
         self.conversation: Optional[Conversation] = None
         self._user_conversation_handle: Optional[StreamHandle] = None
         self._agent_conversation_handle: Optional[StreamHandle] = None
-
+
         # Track pending transcripts for turn-based response triggering
         self._pending_user_transcripts: Dict[str, str] = {}
 
@@ -153,7 +159,7 @@ def __init__(
         self._current_frame = None
         self._interval_task = None
         self._callback_executed = False
-        self._track_tasks : Dict[str, asyncio.Task] = {}
+        self._track_tasks: Dict[str, asyncio.Task] = {}
         self._connection: Optional[Connection] = None
         self._audio_track: Optional[aiortc.AudioStreamTrack] = None
         self._video_track: Optional[VideoStreamTrack] = None
@@ -194,8 +200,9 @@ def subscribe(self, function):
         """
         return self.events.subscribe(function)
 
-
     async def join(self, call: Call) -> "AgentSessionContextManager":
+        await self.create_user()
+
         # TODO: validation. join can only be called once
         with self.tracer.start_as_current_span("join"):
             if self._is_running:
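The create_user() call at the top of join() is the behavioral change in this commit: joining now guarantees the agent user exists before the call session starts. A minimal caller sketch, assuming an Agent wired up as in this repo's examples (the constructor arguments and the User type here are illustrative, not the exact API):

    # Hypothetical usage: no separate create_user() call is needed anymore.
    agent = Agent(edge=edge, agent_user=User(id=""), llm=llm, instructions="...")
    session = await agent.join(call)  # create_user() runs first, then the join span
    # agent.agent_user.id is now populated, e.g. "agent-<uuid4>"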
@@ -311,9 +318,9 @@ async def close(self):
 
         for processor in self.processors:
             processor.close()
-
+
         # Stop all video forwarders
-        if hasattr(self, '_video_forwarders'):
+        if hasattr(self, "_video_forwarders"):
             for forwarder in self._video_forwarders:
                 try:
                     await forwarder.stop()
@@ -382,16 +389,18 @@ def clear_call_logging_context(self) -> None:
             clear_call_context(self._call_context_token)
             self._call_context_token = None
 
-    async def create_user(self):
-        """Create the agent user in the edge provider, if required.
+    async def create_user(self) -> None:
+        """Create the agent user in the edge provider, if required."""
+
+        if self._agent_user_initialized:
+            return None
 
-        Returns:
-            Provider-specific user creation response.
-        """
         with self.tracer.start_as_current_span("edge.create_user"):
-            if self.agent_user.id == "":
-                self.agent_user.id = str(uuid4())
-            return await self.edge.create_user(self.agent_user)
+            if not self.agent_user.id:
+                self.agent_user.id = f"agent-{uuid4()}"
+            await self.edge.create_user(self.agent_user)
+
+        return None
 
     async def _handle_output_text_delta(self, event: LLMResponseChunkEvent):
         """Handle partial LLM response text deltas."""
@@ -499,23 +508,30 @@ async def _on_agent_say(self, event: events.AgentSayEvent):
             )
             self.logger.error(f"Error in agent say: {e}")
 
-    async def say(self, text: str, user_id: Optional[str] = None, metadata: Optional[Dict[str, Any]] = None):
+    async def say(
+        self,
+        text: str,
+        user_id: Optional[str] = None,
+        metadata: Optional[Dict[str, Any]] = None,
+    ):
         """
         Make the agent say something using TTS.
-
+
         This is a convenience method that sends an AgentSayEvent to trigger TTS synthesis.
-
+
         Args:
             text: The text for the agent to say
             user_id: Optional user ID for the speech
             metadata: Optional metadata to include with the speech
         """
-        self.events.send(events.AgentSayEvent(
-            plugin_name="agent",
-            text=text,
-            user_id=user_id or self.agent_user.id,
-            metadata=metadata
-        ))
+        self.events.send(
+            events.AgentSayEvent(
+                plugin_name="agent",
+                text=text,
+                user_id=user_id or self.agent_user.id,
+                metadata=metadata,
+            )
+        )
 
     def _setup_turn_detection(self):
         if self.turn_detection:
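Because say() is only a thin wrapper that emits AgentSayEvent, speech can also be observed through the event bus. A short usage sketch; the decorator form of subscribe() and the handler signature are assumptions extrapolated from the subscribe() method shown earlier:

    await agent.say("Thanks for joining!", metadata={"source": "greeting"})

    @agent.subscribe
    async def on_agent_say(event: events.AgentSayEvent):
        # user_id falls back to agent.agent_user.id when the caller omits it
        print(f"{event.user_id} says: {event.text}")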
@@ -571,12 +587,11 @@ async def _reply_to_audio(
                     continue
                 await processor.process_audio(audio_bytes, participant.user_id)
 
-
         # when in Realtime mode call the Realtime directly (non-blocking)
         if self.realtime_mode and isinstance(self.llm, Realtime):
             # TODO: this behaviour should be easy to change in the agent class
             asyncio.create_task(self.llm.simple_audio_response(pcm_data))
-            #task.add_done_callback(lambda t: print(f"Task (send_audio_pcm) error: {t.exception()}"))
+            # task.add_done_callback(lambda t: print(f"Task (send_audio_pcm) error: {t.exception()}"))
         # Process audio through STT
         elif self.stt:
             self.logger.debug(f"🎵 Processing audio from {participant}")
@@ -591,14 +606,12 @@ async def _process_track(self, track_id: str, track_type: int, participant):
        # subscribe to the video track
        track = self.edge.add_track_subscriber(track_id)
        if not track:
-            self.logger.error(
-                f"Failed to subscribe to {track_id}"
-            )
+            self.logger.error(f"Failed to subscribe to {track_id}")
            return
 
        # Import VideoForwarder
        from ..utils.video_forwarder import VideoForwarder
-
+
        # Create a SHARED VideoForwarder for the RAW incoming track
        # This prevents multiple recv() calls competing on the same track
        raw_forwarder = VideoForwarder(
@@ -609,9 +622,9 @@ async def _process_track(self, track_id: str, track_type: int, participant):
         )
         await raw_forwarder.start()
         self.logger.info("🎥 Created raw VideoForwarder for track %s", track_id)
-
+
         # Track forwarders for cleanup
-        if not hasattr(self, '_video_forwarders'):
+        if not hasattr(self, "_video_forwarders"):
             self._video_forwarders = []
         self._video_forwarders.append(raw_forwarder)
 
@@ -620,7 +633,9 @@ async def _process_track(self, track_id: str, track_type: int, participant):
         if self._video_track:
             # We have a video publisher (e.g., YOLO processor)
             # Create a separate forwarder for the PROCESSED video track
-            self.logger.info("🎥 Forwarding PROCESSED video frames to Realtime provider")
+            self.logger.info(
+                "🎥 Forwarding PROCESSED video frames to Realtime provider"
+            )
             processed_forwarder = VideoForwarder(
                 self._video_track,  # type: ignore[arg-type]
                 max_buffer=30,
629644
)
630645
await processed_forwarder.start()
631646
self._video_forwarders.append(processed_forwarder)
632-
647+
633648
if isinstance(self.llm, Realtime):
634649
# Send PROCESSED frames with the processed forwarder
635-
await self.llm._watch_video_track(self._video_track, shared_forwarder=processed_forwarder)
650+
await self.llm._watch_video_track(
651+
self._video_track, shared_forwarder=processed_forwarder
652+
)
636653
else:
637654
# No video publisher, send raw frames
638655
self.logger.info("🎥 Forwarding RAW video frames to Realtime provider")
639656
if isinstance(self.llm, Realtime):
640-
await self.llm._watch_video_track(track, shared_forwarder=raw_forwarder)
641-
657+
await self.llm._watch_video_track(
658+
track, shared_forwarder=raw_forwarder
659+
)
642660

643661
hasImageProcessers = len(self.image_processors) > 0
644662

645663
# video processors - pass the raw forwarder (they process incoming frames)
646664
for processor in self.video_processors:
647665
try:
648-
await processor.process_video(track, participant.user_id, shared_forwarder=raw_forwarder)
666+
await processor.process_video(
667+
track, participant.user_id, shared_forwarder=raw_forwarder
668+
)
649669
except Exception as e:
650670
self.logger.error(
651671
f"Error in video processor {type(processor).__name__}: {e}"
@@ -654,13 +674,15 @@ async def _process_track(self, track_id: str, track_type: int, participant):
         # Use raw forwarder for image processors - only if there are image processors
         if not hasImageProcessers:
             # No image processors, just keep the connection alive
-            self.logger.info("No image processors, video processing handled by video processors only")
+            self.logger.info(
+                "No image processors, video processing handled by video processors only"
+            )
             return
-
+
         # Initialize error tracking counters
         timeout_errors = 0
         consecutive_errors = 0
-
+
         while True:
             try:
                 # Use the raw forwarder instead of competing for track.recv()
672694
consecutive_errors = 0
673695

674696
if hasImageProcessers:
675-
676697
img = video_frame.to_image()
677698

678699
for processor in self.image_processors:
@@ -683,7 +704,6 @@ async def _process_track(self, track_id: str, track_type: int, participant):
                                     f"Error in image processor {type(processor).__name__}: {e}"
                                 )
 
-
                 else:
                     self.logger.warning("🎥VDP: Received empty frame")
                     consecutive_errors += 1
@@ -698,14 +718,16 @@ async def _process_track(self, track_id: str, track_type: int, participant):
                 await asyncio.sleep(backoff_delay)
 
         # Cleanup and logging
-        self.logger.info(f"🎥VDP: Video processing loop ended for track {track_id} - timeouts: {timeout_errors}, consecutive_errors: {consecutive_errors}")
+        self.logger.info(
+            f"🎥VDP: Video processing loop ended for track {track_id} - timeouts: {timeout_errors}, consecutive_errors: {consecutive_errors}"
+        )
 
     async def _on_turn_event(self, event: TurnStartedEvent | TurnEndedEvent) -> None:
         """Handle turn detection events."""
         # In realtime mode, the LLM handles turn detection, interruption, and responses itself
         if self.realtime_mode:
             return
-
+
         if isinstance(event, TurnStartedEvent):
             # Interrupt TTS when user starts speaking (barge-in)
             if event.speaker_id and event.speaker_id != self.agent_user.id:
@@ -730,26 +752,28 @@ async def _on_turn_event(self, event: TurnStartedEvent | TurnEndedEvent) -> None
             self.logger.info(
                 f"👉 Turn ended - participant {event.speaker_id} finished (duration: {event.duration}, confidence: {event.confidence})"
             )
-
+
             # When turn detection is enabled, trigger LLM response when user's turn ends
             # This is the signal that the user has finished speaking and expects a response
             if event.speaker_id and event.speaker_id != self.agent_user.id:
                 # Get the accumulated transcript for this speaker
                 transcript = self._pending_user_transcripts.get(event.speaker_id, "")
-
+
                 if transcript and transcript.strip():
-                    self.logger.info(f"🤖 Triggering LLM response after turn ended for {event.speaker_id}")
-
+                    self.logger.info(
+                        f"🤖 Triggering LLM response after turn ended for {event.speaker_id}"
+                    )
+
                     # Create participant object if we have metadata
                     participant = None
-                    if hasattr(event, 'custom') and event.custom:
+                    if hasattr(event, "custom") and event.custom:
                         # Try to extract participant info from custom metadata
-                        participant = event.custom.get('participant')
-
+                        participant = event.custom.get("participant")
+
                     # Trigger LLM response with the complete transcript
                     if self.llm:
                         await self.simple_response(transcript, participant)
-
+
                     # Clear the pending transcript for this speaker
                     self._pending_user_transcripts[event.speaker_id] = ""
 
@@ -806,12 +830,12 @@ async def _on_transcript(self, event: STTTranscriptEvent | RealtimeTranscriptEve
             )
             self.conversation.complete_message(self._user_conversation_handle)
             self._user_conversation_handle = None
-
+
         # In realtime mode, the LLM handles everything itself (STT, turn detection, responses)
         # Skip our manual LLM triggering logic
         if self.realtime_mode:
             return
-
+
         # Determine how to handle LLM triggering based on turn detection
         if self.turn_detection is not None:
             # With turn detection: accumulate transcripts and wait for TurnEndedEvent
@@ -821,7 +845,7 @@ async def _on_transcript(self, event: STTTranscriptEvent | RealtimeTranscriptEve
             else:
                 # Append to existing transcript (user might be speaking in chunks)
                 self._pending_user_transcripts[user_id] += " " + event.text
-
+
             self.logger.debug(
                 f"📝 Accumulated transcript for {user_id} (waiting for turn end): "
                 f"{self._pending_user_transcripts[user_id][:100]}..."
830854
# Without turn detection: trigger LLM immediately on transcript completion
831855
# This is the traditional STT -> LLM flow
832856
if self.llm:
833-
self.logger.info("🤖 Triggering LLM response immediately (no turn detection)")
834-
857+
self.logger.info(
858+
"🤖 Triggering LLM response immediately (no turn detection)"
859+
)
860+
835861
# Get participant from event metadata
836862
participant = None
837863
if hasattr(event, "user_metadata"):
838864
participant = event.user_metadata
839-
865+
840866
await self.simple_response(event.text, participant)
841867

842868
async def _on_stt_error(self, error):
843869
"""Handle STT service errors."""
844870
self.logger.error(f"❌ STT Error: {error}")
845871

846-
847-
848872
@property
849873
def realtime_mode(self) -> bool:
850874
"""Check if the agent is in Realtime mode.
@@ -869,8 +893,7 @@ def publish_audio(self) -> bool:
 
     @property
     def publish_video(self) -> bool:
-        """Whether the agent should publish an outbound video track.
-        """
+        """Whether the agent should publish an outbound video track."""
         return len(self.video_publishers) > 0
 
     def _needs_audio_or_video_input(self) -> bool:
@@ -1000,7 +1023,9 @@ def _prepare_rtc(self):
         else:
             framerate = 48000
             stereo = True  # Default to stereo for WebRTC
-        self._audio_track = self.edge.create_audio_track(framerate=framerate, stereo=stereo)
+        self._audio_track = self.edge.create_audio_track(
+            framerate=framerate, stereo=stereo
+        )
         if self.tts:
             self.tts.set_output_track(self._audio_track)
 
@@ -1012,7 +1037,6 @@ def _prepare_rtc(self):
             self._video_track = video_publisher.publish_video_track()
             self.logger.info("🎥 Video track initialized from video publisher")
 
-
     def _truncate_for_logging(self, obj, max_length=200):
         """Truncate object string representation for logging to prevent spam."""
         obj_str = str(obj)
