Skip to content

Commit

Permalink
Merge pull request #4 from speechmatics/v0.0.4
Browse files Browse the repository at this point in the history
Add new v1 protocol messages
  • Loading branch information
dumitrugutu authored Nov 12, 2024
2 parents 84076ef + c931be5 commit f777fd0
Show file tree
Hide file tree
Showing 5 changed files with 100 additions and 25 deletions.
20 changes: 20 additions & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,26 @@ All notable changes to this project will be documented in this file.

The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/).

## [0.0.4] - 2024-11-12

### Added

- `ResponseStarted`: Indicates the start of TTS audio streaming from the server.
The message contains the textual content of the utterance to be spoken.
- `ResponseInterrupted`: Indicates an interruption in the TTS audio stream from the server.
The message contains the textual content up to the point where the utterance was stopped.
- `ResponseCompleted`: Indicates the completion of TTS audio transmission from the server.
- `ConversationEnding`: Indicates the session will continue in one-sided mode during TTS playback of the final words.
The message includes the textual content of the utterance just spoken.
- `AddAudio`: Implicit name for all inbound binary messages.
The client confirms receipt by sending a `ClientMessageType.AudioReceived` message.
- `AudioReceived`: Response to `ServerMessageType.AddAudio`, indicating that audio has been added successfully.
- Deprecation warning for `audio` (replaced by `AddAudio`) and `prompt` (replaced by `Response*`) messages

### Removed

- Unused `EndOfTranscript` server message

## [0.0.3] - 2024-10-23

### Changed
Expand Down
2 changes: 1 addition & 1 deletion VERSION
Original file line number Diff line number Diff line change
@@ -1 +1 @@
0.0.3
0.0.4
8 changes: 2 additions & 6 deletions speechmatics_flow/cli.py
Original file line number Diff line number Diff line change
Expand Up @@ -174,21 +174,17 @@ def prompt_handler(message):
if print_json:
print(json.dumps(message))
return
new_response = message["prompt"]["response"]
new_response = message["content"]
new_plaintext_response = new_response.replace("<sb> ", "").replace("</sb> ", "")
if new_plaintext_response:
sys.stdout.write(f"{escape_seq}{new_plaintext_response}\n")
transcripts.user_transcript += new_plaintext_response

def end_of_transcript_handler(_):
print("\n", file=sys.stderr)

api.add_event_handler(ServerMessageType.prompt, prompt_handler)
api.add_event_handler(ServerMessageType.ResponseStarted, prompt_handler)
api.add_event_handler(ServerMessageType.AddTranscript, transcript_handler)
api.add_event_handler(
ServerMessageType.AddPartialTranscript, partial_transcript_handler
)
api.add_event_handler(ServerMessageType.EndOfTranscript, end_of_transcript_handler)


# pylint: disable=too-many-branches
Expand Down
60 changes: 48 additions & 12 deletions speechmatics_flow/client.py
Original file line number Diff line number Diff line change
Expand Up @@ -64,7 +64,8 @@ def __init__(
self.event_handlers = {x: [] for x in ServerMessageType}
self.middlewares = {x: [] for x in ClientMessageType}

self.seq_no = 0
self.client_seq_no = 0
self.server_seq_no = 0
self.session_running = False
self.conversation_ended_wait_timeout = 5
self._session_needs_closing = False
Expand Down Expand Up @@ -135,11 +136,27 @@ def _end_of_audio(self):
:py:attr:`models.ClientMessageType.AudioEnded`
message.
"""
msg = {"message": ClientMessageType.AudioEnded, "last_seq_no": self.seq_no}
msg = {
"message": ClientMessageType.AudioEnded,
"last_seq_no": self.client_seq_no,
}
self._call_middleware(ClientMessageType.AudioEnded, msg, False)
LOGGER.debug(msg)
return msg

@json_utf8
def _audio_received(self):
"""Constructs an :py:attr:`models.ClientMessageType.AudioReceived` message."""
self.server_seq_no += 1
msg = {
"message": ClientMessageType.AudioReceived,
"seq_no": self.server_seq_no,
"buffering": 0.01, # 10ms
}
self._call_middleware(ClientMessageType.AudioReceived, msg, False)
LOGGER.debug(msg)
return msg

async def _wait_for_conversation_ended(self):
"""
Waits for :py:attr:`models.ClientMessageType.ConversationEnded`
Expand All @@ -164,12 +181,15 @@ async def _consumer(self, message, from_cli: False):
handler.
"""
if isinstance(message, (bytes, bytearray)):
# Send ack as soon as we receive audio
await self.websocket.send(self._audio_received())
# add an audio message to local buffer only when running from cli
if from_cli:
await self._audio_buffer.put(message)
# Flow service does not send message_type with binary data,
# so we need to set it here for event_handler to work
message_type = ServerMessageType.audio
# Implicit name for all inbound binary messages.
# We must set it manually so that event handlers subscribed
# to `ServerMessageType.AddAudio` messages are invoked.
message_type = ServerMessageType.AddAudio
else:
LOGGER.debug(message)
message = json.loads(message)
Expand Down Expand Up @@ -225,7 +245,7 @@ async def _read_from_microphone(self):
# audio_chunk size is 128 * 2 = 256 bytes which is about 8ms
audio_chunk = stream.read(num_frames=128, exception_on_overflow=False)

self.seq_no += 1
self.client_seq_no += 1
self._call_middleware(ClientMessageType.AddAudio, audio_chunk, True)
await self.websocket.send(audio_chunk)
# send audio at a constant rate
Expand Down Expand Up @@ -265,7 +285,7 @@ async def _stream_producer(self, stream, audio_chunk_size):
timeout=self.connection_settings.semaphore_timeout_seconds,
)

self.seq_no += 1
self.client_seq_no += 1
self._call_middleware(ClientMessageType.AddAudio, audio_chunk, True)
yield audio_chunk

Expand Down Expand Up @@ -342,13 +362,13 @@ def add_event_handler(self, event_name, event_handler):
the handler will be added for every event.
For example, a simple handler that just LOGGER.debugs out the
:py:attr:`models.ServerMessageType.audio`
:py:attr:`models.ServerMessageType.ConversationStarted`
messages received:
>>> client = WebsocketClient(
ConnectionSettings(url="wss://localhost:9000"))
>>> handler = lambda msg: LOGGER.debug(msg)
>>> client.add_event_handler(ServerMessageType.audio, handler)
>>> client.add_event_handler(ServerMessageType.ConversationStarted, handler)
:param event_name: The name of the message for which a handler is
being added. Refer to
Expand All @@ -362,15 +382,30 @@ def add_event_handler(self, event_name, event_handler):
:raises ValueError: If the given event name is not valid.
"""
# TODO: Remove when no longer supported
if event_name in [ServerMessageType.audio, ServerMessageType.prompt]:
LOGGER.warning(
f"DeprecationWarning: '{event_name}' is deprecated and will be removed in future versions."
)

if event_name == "all":
# Iterate through event handlers, excluding deprecated ServerMessageType.audio.
for name in self.event_handlers.keys():
if (
name == ServerMessageType.audio
): # TODO: Remove when no longer supported
continue
self.event_handlers[name].append(event_handler)
elif event_name not in self.event_handlers:
raise ValueError(
f"Unknown event name: {event_name!r}, expected to be "
f"'all' or one of {list(self.event_handlers.keys())}."
f"Unknown event name: '{event_name}'. Expected 'all' or one of {list(self.event_handlers.keys())}."
)
else:
# Map deprecated ServerMessageType.audio to ServerMessageType.AddAudio for compatibility.
if (
event_name == ServerMessageType.audio
): # TODO: Remove when no longer supported
event_name = ServerMessageType.AddAudio
self.event_handlers[event_name].append(event_handler)

def add_middleware(self, event_name, middleware):
Expand Down Expand Up @@ -472,7 +507,8 @@ async def run(
:raises Exception: Can raise any exception returned by the
consumer/producer tasks.
"""
self.seq_no = 0
self.client_seq_no = 0
self.server_seq_no = 0
self.conversation_config = conversation_config
self.audio_settings = audio_settings

Expand Down
35 changes: 29 additions & 6 deletions speechmatics_flow/models.py
Original file line number Diff line number Diff line change
Expand Up @@ -84,9 +84,13 @@ class ClientMessageType(str, Enum):
"""Initiates a conversation job based on configuration set previously."""

AddAudio = "AddAudio"
"""Adds more audio data to the recognition job. The server confirms
"""Implicit name for all outbound binary messages. The server confirms
receipt by sending a :py:attr:`ServerMessageType.AudioAdded` message."""

AudioReceived = "AudioReceived"
"""Client response to :py:attr:`ServerMessageType.AddAudio`, indicating
that audio has been added successfully."""

AudioEnded = "AudioEnded"
"""Indicates audio input has finished."""

Expand All @@ -111,20 +115,39 @@ class ServerMessageType(str, Enum):
AddTranscript = "AddTranscript"
"""Indicates the final transcript of a part of the audio."""

ResponseStarted = "ResponseStarted"
"""
Indicates the start of TTS audio streaming from the server.
The message contains the textual content of the utterance to be spoken.
"""

ResponseCompleted = "ResponseCompleted"
"""Indicates the completion of TTS audio transmission from the server.
The message includes the textual content of the utterance just spoken.
"""

ResponseInterrupted = "ResponseInterrupted"
"""Indicates an interruption in the TTS audio stream from the server.
The message contains the textual content up to the point where the utterance was stopped.
"""

AddAudio = "AddAudio"
"""Implicit name for all inbound binary messages. The client confirms
receipt by sending a :py:attr:`ClientMessageType.AudioReceived` message."""

audio = "audio"
"""Message contains binary data"""

prompt = "prompt"
"""Message contains text data"""

ConversationEnding = "ConversationEnding"
"""Indicates the session will continue in one-sided mode
during TTS playback of the final words."""

ConversationEnded = "ConversationEnded"
"""Message indicates the session ended."""

EndOfTranscript = "EndOfTranscript"
"""Server response to :py:attr:`ClientMessageType.EndOfStream`,
after the server has finished sending all :py:attr:`AddTranscript`
messages."""

Info = "Info"
"""Indicates a generic info message."""

Expand Down

0 comments on commit f777fd0

Please sign in to comment.