Skip to content

Commit

Permalink
Merge pull request #4 from speechmatics/v0.0.4
Browse files Browse the repository at this point in the history
Add new v1 protocol messages
  • Loading branch information
dumitrugutu authored Nov 12, 2024
2 parents 84076ef + c931be5 commit f777fd0
Show file tree
Hide file tree
Showing 5 changed files with 100 additions and 25 deletions.
20 changes: 20 additions & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,26 @@ All notable changes to this project will be documented in this file.

The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/).

## [0.0.4] - 2024-11-12

### Added

- `ResponseStarted`: Indicates the start of TTS audio streaming from the server.
The message contains the textual content of the utterance to be spoken.
- `ResponseInterrupted`: Indicates an interruption in the TTS audio stream from the server.
The message contains the textual content up to the point where the utterance was stopped.
- `ResponseCompleted`: Indicates the completion of TTS audio transmission from the server.
- `ConversationEnding`: Indicates the session will continue in one-sided mode during TTS playback of the final words.
The message includes the textual content of the utterance just spoken.
- `AddAudio`: Implicit name for all inbound binary messages.
The client confirms receipt by sending a `ClientMessageType.AudioReceived` message.
- `AudioReceived`: Response to `ServerMessageType.AddAudio`, indicating that audio has been added successfully.
- Deprecation warning for `audio` (replaced by `AddAudio`) and `prompt` (replaced by `Response*`) messages

### Removed

- Unused `EndOfTranscript` server message

## [0.0.3] - 2024-10-23

### Changed
Expand Down
2 changes: 1 addition & 1 deletion VERSION
Original file line number Diff line number Diff line change
@@ -1 +1 @@
0.0.3
0.0.4
8 changes: 2 additions & 6 deletions speechmatics_flow/cli.py
Original file line number Diff line number Diff line change
Expand Up @@ -174,21 +174,17 @@ def prompt_handler(message):
if print_json:
print(json.dumps(message))
return
new_response = message["prompt"]["response"]
new_response = message["content"]
new_plaintext_response = new_response.replace("<sb> ", "").replace("</sb> ", "")
if new_plaintext_response:
sys.stdout.write(f"{escape_seq}{new_plaintext_response}\n")
transcripts.user_transcript += new_plaintext_response

def end_of_transcript_handler(_):
print("\n", file=sys.stderr)

api.add_event_handler(ServerMessageType.prompt, prompt_handler)
api.add_event_handler(ServerMessageType.ResponseStarted, prompt_handler)
api.add_event_handler(ServerMessageType.AddTranscript, transcript_handler)
api.add_event_handler(
ServerMessageType.AddPartialTranscript, partial_transcript_handler
)
api.add_event_handler(ServerMessageType.EndOfTranscript, end_of_transcript_handler)


# pylint: disable=too-many-branches
Expand Down
60 changes: 48 additions & 12 deletions speechmatics_flow/client.py
Original file line number Diff line number Diff line change
Expand Up @@ -64,7 +64,8 @@ def __init__(
self.event_handlers = {x: [] for x in ServerMessageType}
self.middlewares = {x: [] for x in ClientMessageType}

self.seq_no = 0
self.client_seq_no = 0
self.server_seq_no = 0
self.session_running = False
self.conversation_ended_wait_timeout = 5
self._session_needs_closing = False
Expand Down Expand Up @@ -135,11 +136,27 @@ def _end_of_audio(self):
:py:attr:`models.ClientMessageType.AudioEnded`
message.
"""
msg = {"message": ClientMessageType.AudioEnded, "last_seq_no": self.seq_no}
msg = {
"message": ClientMessageType.AudioEnded,
"last_seq_no": self.client_seq_no,
}
self._call_middleware(ClientMessageType.AudioEnded, msg, False)
LOGGER.debug(msg)
return msg

@json_utf8
def _audio_received(self):
"""Constructs an :py:attr:`models.ClientMessageType.AudioReceived` message."""
self.server_seq_no += 1
msg = {
"message": ClientMessageType.AudioReceived,
"seq_no": self.server_seq_no,
"buffering": 0.01, # 10ms
}
self._call_middleware(ClientMessageType.AudioReceived, msg, False)
LOGGER.debug(msg)
return msg

async def _wait_for_conversation_ended(self):
"""
Waits for :py:attr:`models.ClientMessageType.ConversationEnded`
Expand All @@ -164,12 +181,15 @@ async def _consumer(self, message, from_cli: False):
handler.
"""
if isinstance(message, (bytes, bytearray)):
# Send ack as soon as we receive audio
await self.websocket.send(self._audio_received())
# add an audio message to local buffer only when running from cli
if from_cli:
await self._audio_buffer.put(message)
# Flow service does not send message_type with binary data,
# so we need to set it here for event_handler to work
message_type = ServerMessageType.audio
# Implicit name for all inbound binary messages.
# We must set it manually so that event handlers subscribed
# to `ServerMessageType.AddAudio` messages are invoked.
message_type = ServerMessageType.AddAudio
else:
LOGGER.debug(message)
message = json.loads(message)
Expand Down Expand Up @@ -225,7 +245,7 @@ async def _read_from_microphone(self):
# audio_chunk size is 128 * 2 = 256 bytes which is about 8ms
audio_chunk = stream.read(num_frames=128, exception_on_overflow=False)

self.seq_no += 1
self.client_seq_no += 1
self._call_middleware(ClientMessageType.AddAudio, audio_chunk, True)
await self.websocket.send(audio_chunk)
# send audio at a constant rate
Expand Down Expand Up @@ -265,7 +285,7 @@ async def _stream_producer(self, stream, audio_chunk_size):
timeout=self.connection_settings.semaphore_timeout_seconds,
)

self.seq_no += 1
self.client_seq_no += 1
self._call_middleware(ClientMessageType.AddAudio, audio_chunk, True)
yield audio_chunk

Expand Down Expand Up @@ -342,13 +362,13 @@ def add_event_handler(self, event_name, event_handler):
the handler will be added for every event.
For example, a simple handler that just LOGGER.debugs out the
:py:attr:`models.ServerMessageType.audio`
:py:attr:`models.ServerMessageType.ConversationStarted`
messages received:
>>> client = WebsocketClient(
ConnectionSettings(url="wss://localhost:9000"))
>>> handler = lambda msg: LOGGER.debug(msg)
>>> client.add_event_handler(ServerMessageType.audio, handler)
>>> client.add_event_handler(ServerMessageType.ConversationStarted, handler)
:param event_name: The name of the message for which a handler is
being added. Refer to
Expand All @@ -362,15 +382,30 @@ def add_event_handler(self, event_name, event_handler):
:raises ValueError: If the given event name is not valid.
"""
# TODO: Remove when no longer supported
if event_name in [ServerMessageType.audio, ServerMessageType.prompt]:
LOGGER.warning(
f"DeprecationWarning: '{event_name}' is deprecated and will be removed in future versions."
)

if event_name == "all":
# Iterate through event handlers, excluding deprecated ServerMessageType.audio.
for name in self.event_handlers.keys():
if (
name == ServerMessageType.audio
): # TODO: Remove when no longer supported
continue
self.event_handlers[name].append(event_handler)
elif event_name not in self.event_handlers:
raise ValueError(
f"Unknown event name: {event_name!r}, expected to be "
f"'all' or one of {list(self.event_handlers.keys())}."
f"Unknown event name: '{event_name}'. Expected 'all' or one of {list(self.event_handlers.keys())}."
)
else:
# Map deprecated ServerMessageType.audio to ServerMessageType.AddAudio for compatibility.
if (
event_name == ServerMessageType.audio
): # TODO: Remove when no longer supported
event_name = ServerMessageType.AddAudio
self.event_handlers[event_name].append(event_handler)

def add_middleware(self, event_name, middleware):
Expand Down Expand Up @@ -472,7 +507,8 @@ async def run(
:raises Exception: Can raise any exception returned by the
consumer/producer tasks.
"""
self.seq_no = 0
self.client_seq_no = 0
self.server_seq_no = 0
self.conversation_config = conversation_config
self.audio_settings = audio_settings

Expand Down
35 changes: 29 additions & 6 deletions speechmatics_flow/models.py
Original file line number Diff line number Diff line change
Expand Up @@ -84,9 +84,13 @@ class ClientMessageType(str, Enum):
"""Initiates a conversation job based on configuration set previously."""

AddAudio = "AddAudio"
"""Adds more audio data to the recognition job. The server confirms
"""Implicit name for all outbound binary messages. The server confirms
receipt by sending a :py:attr:`ServerMessageType.AudioAdded` message."""

AudioReceived = "AudioReceived"
"""Client response to :py:attr:`ServerMessageType.AddAudio`, indicating
that audio has been added successfully."""

AudioEnded = "AudioEnded"
"""Indicates audio input has finished."""

Expand All @@ -111,20 +115,39 @@ class ServerMessageType(str, Enum):
AddTranscript = "AddTranscript"
"""Indicates the final transcript of a part of the audio."""

ResponseStarted = "ResponseStarted"
"""
Indicates the start of TTS audio streaming from the server.
The message contains the textual content of the utterance to be spoken.
"""

ResponseCompleted = "ResponseCompleted"
"""Indicates the completion of TTS audio transmission from the server.
The message includes the textual content of the utterance just spoken.
"""

ResponseInterrupted = "ResponseInterrupted"
"""Indicates an interruption in the TTS audio stream from the server.
The message contains the textual content up to the point where the utterance was stopped.
"""

AddAudio = "AddAudio"
"""Implicit name for all inbound binary messages. The client confirms
receipt by sending a :py:attr:`ClientMessageType.AudioReceived` message."""

audio = "audio"
"""Message contains binary data"""

prompt = "prompt"
"""Message contains text data"""

ConversationEnding = "ConversationEnding"
"""Indicates the session will continue in one-sided mode
during TTS playback of the final words."""

ConversationEnded = "ConversationEnded"
"""Message indicates the session ended."""

EndOfTranscript = "EndOfTranscript"
"""Server response to :py:attr:`ClientMessageType.EndOfStream`,
after the server has finished sending all :py:attr:`AddTranscript`
messages."""

Info = "Info"
"""Indicates a generic info message."""

Expand Down

0 comments on commit f777fd0

Please sign in to comment.