diff --git a/.gitignore b/.gitignore
index f4ac18d..99722df 100644
--- a/.gitignore
+++ b/.gitignore
@@ -8,3 +8,4 @@ build
.venv
*.local
.env
+site
diff --git a/MANIFEST.in b/MANIFEST.in
new file mode 100644
index 0000000..5c3753f
--- /dev/null
+++ b/MANIFEST.in
@@ -0,0 +1,4 @@
+include README.md
+include LICENSE
+include src/uhlive/py.typed
+
diff --git a/README.md b/README.md
index 5460816..973089d 100644
--- a/README.md
+++ b/README.md
@@ -3,7 +3,7 @@
The Uh!live Python SDK provides convenient access to the Uh!live API from
applications written in the Python language.
-Read the [documentation for the Conversation API](https://docs.allo-media.net/live-api/) and [for the Recognition API (vocal bot toolkit)](https://docs.allo-media.net/stream-api-bots/).
+Read the [full documentation](https://python-uhlive-sdk.netlify.app/).
## Requirements
@@ -11,6 +11,12 @@ Read the [documentation for the Conversation API](https://docs.allo-media.net/li
Install with `pip install .[examples]` to install the the library and all the dependencies necessary to run the examples.
+### Installation from Pypi
+
+```
+pip install uhlive
+```
+
### Audio files
To play with the examples, you should have a raw audio file.
diff --git a/docs/auth.md b/docs/auth.md
new file mode 100644
index 0000000..df3df22
--- /dev/null
+++ b/docs/auth.md
@@ -0,0 +1,6 @@
+# uhlive.auth
+
+::: uhlive.auth
+ options:
+ show_source: false
+
diff --git a/docs/conversation_api.md b/docs/conversation_api.md
new file mode 100644
index 0000000..5cc9b85
--- /dev/null
+++ b/docs/conversation_api.md
@@ -0,0 +1,5 @@
+# uhlive.stream.conversation
+
+::: uhlive.stream.conversation
+ options:
+ show_source: false
diff --git a/docs/css/mkdocstrings.css b/docs/css/mkdocstrings.css
new file mode 100644
index 0000000..c2d342a
--- /dev/null
+++ b/docs/css/mkdocstrings.css
@@ -0,0 +1,41 @@
+/* Indentation. */
+div.doc-contents:not(.first) {
+ padding-left: 25px;
+ border-left: .15rem solid var(--secondary);
+}
+
+/*code {
+ font-size: inherit;
+}
+*/
+h2.doc-heading {
+ font-size: 1.75rem;
+/* font-weight: 600;*/
+}
+
+h3.doc-heading {
+/* font-weight: 600;*/
+ font-size: 1.2rem;
+}
+
+/* Mark external links as such. */
+a.external::after,
+a.autorefs-external::after {
+ /* https://primer.style/octicons/arrow-up-right-24 */
+ mask-image: url('data:image/svg+xml,');
+ -webkit-mask-image: url('data:image/svg+xml,');
+ content: ' ';
+
+ display: inline-block;
+ vertical-align: middle;
+ position: relative;
+
+ height: 1em;
+ width: 1em;
+ background-color: var(--md-typeset-a-color);
+}
+
+a.external:hover::after,
+a.autorefs-external:hover::after {
+ background-color: var(--md-accent-fg-color);
+}
diff --git a/docs/index.md b/docs/index.md
new file mode 100644
index 0000000..0b8e5c5
--- /dev/null
+++ b/docs/index.md
@@ -0,0 +1,105 @@
+# Welcome to the Uh!live Python SDK
+
+The Uh!live Python SDK is a library to access our live Automated Speech Recognition online APIs.
+It provides [I/O Free](https://sans-io.readthedocs.io/index.html) Python abstractions over the underlying protocols and workflows to hide the complexity.
+
+By providing an I/O Free implementation, we let developers choose whatever websocket transport library and paradigm — synchronous or asynchronous (asyncio) — they like most.
+
+## Access to the API
+
+In order to have access to our online APIs, your company needs to register for an account. Depending on the plan, you may get two kinds of credentials:
+
+* Either a `client_id` and `client_secret`;
+* or a `client_id`, `user_id` and `user_password`.
+
+In all cases, those credentials are used to retrieve a one time access token from our SSO.
+
+You are free to use whatever HTTP client library you like.
+
+Here is a synchronous example using `requests`:
+
+```python
+from uhlive.auth import build_authentication_request
+import requests
+
+uhlive_client = "…"
+uhlive_secret = "…"
+# user_id = "…"
+# user_password = "…"
+
+auth_url, auth_params = build_authentication_request(uhlive_client, uhlive_secret)
+# or auth_url, auth_params = build_authentication_request(uhlive_client, user_id=user_id, user_pwd=user_password)
+login = requests.post(auth_url, data=auth_params)
+login.raise_for_status()
+uhlive_token = login.json()["access_token"]
+```
+
+Here is an asynchronous example using `aiohttp`:
+
+```python
+import asyncio
+from uhlive.auth import build_authentication_request
+from aiohttp import ClientSession
+
+uhlive_client = "…"
+uhlive_secret = "…"
+# user_id = "…"
+# user_password = "…"
+
+
+async def main(uhlive_client, uhlive_secret):
+ async with ClientSession() as session:
+ auth_url, auth_params = build_authentication_request(
+ uhlive_client, uhlive_secret
+ )
+ async with session.post(auth_url, data=auth_params) as login:
+ login.raise_for_status()
+ body = await login.json()
+ uhlive_token = body["access_token"]
+ # continue with Stream API of your choice
+ # ...
+
+asyncio.run(main(uhlive_client, uhlive_secret))
+```
+
+Then this one time token allows you to connect to any subscribed API within 5 minutes.
+
+* [Auth API reference](auth.md)
+
+
+## Conversation API to analyze human to human interactions.
+
+Also known as the human to human (H2H) stream API.
+
+* [High level overview](https://docs.allo-media.net/stream-h2h/overview/#high-level-overview-and-concepts)
+* [Python SDK API documentation](conversation_api.md)
+
+## Recognition and interpretation API for voice bots.
+
+Also known as the human to bot (H2B) stream API.
+
+* [High level overview](https://docs.allo-media.net/stream-h2b/#real-time-stream-api-for-voicebots)
+* [Python SDK API documentation](recognition_api.md)
+
+
+## Changelog
+
+### v1.3.1
+
+Full API documentation.
+
+### v1.3.0
+
+* Support for `SegmentNormalized`
+* SSO
+* Concurrent test runner `test_runner_async.py` in `examples/recognition`
+
+### v1.2.0
+
+* Improved streamer
+* Improved test_runner.py
+* Forbid sharing connection between conversations
+
+### v1.1.0
+
+* Support for passing codec parameter
diff --git a/docs/recognition_api.md b/docs/recognition_api.md
new file mode 100644
index 0000000..c2093fd
--- /dev/null
+++ b/docs/recognition_api.md
@@ -0,0 +1,5 @@
+# uhlive.stream.recognition
+
+::: uhlive.stream.recognition
+ options:
+ show_source: false
diff --git a/examples/recognition/async_bot_lib.py b/examples/recognition/async_bot_lib.py
index d85f58a..7d4c88e 100644
--- a/examples/recognition/async_bot_lib.py
+++ b/examples/recognition/async_bot_lib.py
@@ -77,7 +77,6 @@ def callback(indata, frame_count, time_info, status):
class Bot:
-
TTF_CACHE: Dict[str, bytes] = {}
def __init__(self, google_ttf_key):
diff --git a/examples/recognition/basic_sync.py b/examples/recognition/basic_sync.py
index be81e35..6744f1d 100644
--- a/examples/recognition/basic_sync.py
+++ b/examples/recognition/basic_sync.py
@@ -84,7 +84,6 @@ def play(self, filename, codec="linear"):
def main(socket: ws.WebSocket, client: Recognizer, stream: AudioStreamer):
-
# Shortcuts
send = socket.send
diff --git a/examples/recognition/desktop-bot_async.py b/examples/recognition/desktop-bot_async.py
index a6a0c2b..fc52ae5 100644
--- a/examples/recognition/desktop-bot_async.py
+++ b/examples/recognition/desktop-bot_async.py
@@ -116,7 +116,6 @@ async def demo_multi(self):
await say("je vous passe le services des abonnés")
async def scenario(self):
-
# Shortcuts
say = self.say
diff --git a/examples/recognition/desktop-bot_sync.py b/examples/recognition/desktop-bot_sync.py
index 08ff518..71fa565 100644
--- a/examples/recognition/desktop-bot_sync.py
+++ b/examples/recognition/desktop-bot_sync.py
@@ -13,9 +13,9 @@ def set_defaults(self):
speech_language="fr",
no_input_timeout=5000,
recognition_timeout=20000,
- speech_complete_timeout=800,
- speech_incomplete_timeout=1200,
- speech_nomatch_timeout=3000,
+ speech_complete_timeout=1000,
+ speech_incomplete_timeout=2000,
+ speech_nomatch_timeout=4000,
)
# Define grammars up front
@@ -66,8 +66,17 @@ def demo_address(self):
recognition_mode="hotword",
)
addr["zipcode"] = nlu.value
- formatted = f"j'ai compris {addr['number'] or ''} {addr['street'] or ''} {addr['zipcode'] or ''} {addr['city'] or ''}"
- say(formatted)
+ say("J'ai compris")
+ if addr["number"]:
+ say(f"numéro : {addr['number']}")
+ if addr["street"]:
+ say(f"voie : {addr['street']}")
+ if addr["zipcode"]:
+ say(f"code postal : {addr['zipcode']}")
+ if addr["city"]:
+ say(f"ville : {addr['city']}")
+ if addr["complement"]:
+ say(f"complément d'adresse : {addr['complement']}")
confirm = self.confirm(
"Est-ce correct?",
)
@@ -160,7 +169,6 @@ def demo_date(self):
say("J'ai compris, mais ce n'est pas une date valide")
def scenario(self):
-
# Scenario
self.set_defaults()
self.wait_activation()
diff --git a/examples/recognition/fixtures/fr_address.test b/examples/recognition/fixtures/fr_address.test
index 674a07d..2bfc7d9 100644
--- a/examples/recognition/fixtures/fr_address.test
+++ b/examples/recognition/fixtures/fr_address.test
@@ -31,3 +31,4 @@ number = "37"
street = "rue du docteur leroy"
zipcode = "72000"
city = "le mans"
+complement = ""
diff --git a/examples/recognition/sync_bot_lib.py b/examples/recognition/sync_bot_lib.py
index a50ffa3..bd784ec 100644
--- a/examples/recognition/sync_bot_lib.py
+++ b/examples/recognition/sync_bot_lib.py
@@ -27,7 +27,6 @@
class Bot:
-
TTF_CACHE: Dict[str, bytes] = {}
def __init__(self, google_ttf_key):
@@ -108,7 +107,6 @@ def confirm(self, text: str) -> bool:
return res.value
def run(self, uhlive_client: str, uhlive_secret: str):
-
auth_url, auth_params = build_authentication_request(
uhlive_client, uhlive_secret
)
diff --git a/examples/recognition/transcribe.py b/examples/recognition/transcribe.py
index 0c922ce..17c0222 100644
--- a/examples/recognition/transcribe.py
+++ b/examples/recognition/transcribe.py
@@ -26,7 +26,6 @@ def main(
codec: str,
filepath: str,
):
-
# Shortcuts
send = socket.send
diff --git a/mkdocs.yml b/mkdocs.yml
new file mode 100644
index 0000000..f520721
--- /dev/null
+++ b/mkdocs.yml
@@ -0,0 +1,32 @@
+site_name: Uh!live Python SDK
+
+theme:
+ name: "mkdocs"
+
+extra_css:
+ - css/mkdocstrings.css
+
+plugins:
+ - search
+ - mkdocstrings:
+ default_handler: python
+ handlers:
+ python:
+ paths: [src]
+ options:
+ show_signature_annotations: true
+ group_by_category: true
+ show_category_heading: false
+ inherited_members: true
+ members_order: source
+ docstring_section_style: "list"
+ signature_crossrefs: true
+ separate_signature: true
+ line_length: 110
+ merge_init_into_class: true
+
+nav:
+ - Home: index.md
+ - Auth: auth.md
+ - H2H API: conversation_api.md
+ - H2B API: recognition_api.md
diff --git a/requirements.txt b/requirements.txt
new file mode 100644
index 0000000..2a1db3f
--- /dev/null
+++ b/requirements.txt
@@ -0,0 +1,54 @@
+aiohttp==3.8.4
+aiosignal==1.3.1
+async-timeout==4.0.2
+attrs==23.1.0
+Babel==2.14.0
+black==23.12.1
+cachetools==5.3.2
+certifi==2023.5.7
+cffi==1.15.1
+chardet==5.2.0
+charset-normalizer==3.1.0
+click==8.1.7
+colorama==0.4.6
+distlib==0.3.8
+filelock==3.13.1
+frozenlist==1.3.3
+ghp-import==2.1.0
+griffe==0.38.1
+idna==3.4
+Jinja2==3.1.2
+Markdown==3.5.1
+MarkupSafe==2.1.3
+mergedeep==1.3.4
+mkdocs==1.5.3
+mkdocs-autorefs==0.5.0
+mkdocs-material==9.5.3
+mkdocs-material-extensions==1.3.1
+mkdocstrings==0.24.0
+mkdocstrings-python==1.7.5
+multidict==6.0.4
+mypy-extensions==1.0.0
+packaging==23.2
+paginate==0.5.6
+pathspec==0.12.1
+platformdirs==4.1.0
+pluggy==1.3.0
+pycparser==2.21
+Pygments==2.17.2
+pymdown-extensions==10.7
+pyproject-api==1.6.1
+python-dateutil==2.8.2
+PyYAML==6.0.1
+pyyaml_env_tag==0.1
+regex==2023.12.25
+requests==2.31.0
+six==1.16.0
+sounddevice==0.4.6
+toml==0.10.2
+tox==4.11.4
+urllib3==2.0.3
+virtualenv==20.25.0
+watchdog==3.0.0
+websocket-client==1.6.1
+yarl==1.9.2
diff --git a/setup.py b/setup.py
index df855ff..593c250 100644
--- a/setup.py
+++ b/setup.py
@@ -5,11 +5,11 @@
setup(
name="uhlive",
- version="1.3.0",
+ version="1.3.1",
url="https://github.com/uhlive/python-sdk",
author="Allo-Media",
author_email="support@allo-media.fr",
- description="Python bindings for the Uh!ive API",
+ description="Python bindings for the Uh!ive APIs",
long_description=long_description,
long_description_content_type="text/markdown",
license="MIT",
diff --git a/src/uhlive/py.typed b/src/uhlive/py.typed
new file mode 100644
index 0000000..e69de29
diff --git a/src/uhlive/stream/conversation/__init__.py b/src/uhlive/stream/conversation/__init__.py
index 8980f3e..75b050e 100644
--- a/src/uhlive/stream/conversation/__init__.py
+++ b/src/uhlive/stream/conversation/__init__.py
@@ -1,21 +1,133 @@
+"""
+The Stream Conversation SDK API for human to human interactions.
+
+This API is used to consume a real-time audio stream and get enriched transcription events.
+
+The protocol is messages based and uses websockets as transport. You are free to use whatever websocket client library you like to communicate
+with the API, and use our SDK to encode/decode the messages.
+
+## Quickstart
+
+First retrieve a one time access token with the [Auth API](auth.md).
+
+Then use that token to build an authenticated URL, open a websocket connection to it with the websocket client library
+of your choice and instantiate a [`Conversation`][uhlive.stream.conversation.Conversation] to join a conversation, generate
+audio stream messages and decode transcription and enrichment events.
+
+As the API is asynchronous, streaming the audio and reading the returned events should be done in two different threads/tasks.
+
+```python
+from uhlive.stream.conversation import *
+
+stream_h2h_url = build_conversation_url(token)
+
+# The subscription identifier was given to you with your other credentials
+# the conversation id can be any string you like. If a conversation by that name already exists in your subscription identifier domain
+# it will join it as a new speaker, otherwise it will create it and join the speaker in.
+# The speaker id helps you identify who is speaking.
+conversation = Conversation("subscription_identifier", "a_conversation_id", "a_speaker_id")
+```
+
+Now you can connect and interact with the API:
+
+Synchronous example:
+
+```python
+import websocket as ws
+
+socket = ws.create_connection(stream_h2h_url, timeout=10)
+socket.send(
+ conversation.join(
+ model="fr",
+ interim_results=False,
+ rescoring=True,
+ origin=int(time.time() * 1000),
+ country="fr",
+ )
+)
+# check we didn't get an error on join
+reply = conversation.receive(socket.recv())
+assert isinstance(reply, Ok)
+
+```
+
+Asynchronous example:
+
+```python
+from aiohttp import ClientSession
+
+async def main(uhlive_client, uhlive_secret):
+ async with ClientSession() as session:
+ async with session.ws_connect(stream_h2h_url) as socket:
+ await socket.send_str(
+ conversation.join(
+ model="fr",
+ interim_results=False,
+ rescoring=True,
+ origin=int(time.time() * 1000),
+ country="fr",
+ )
+ )
+ # check we didn't get an error on join
+ msg = await socket.receive()
+ reply = conversation.receive(msg.data)
+ assert isinstance(reply, Ok)
+```
+
+As you can see, the I/O is cleanly decoupled from the protocol handling: the `Conversation` object is only used
+to create the messages to send to the API and to decode the received messages as `Event` objects.
+
+See the [complete examples in the source distribution](https://github.com/uhlive/python-sdk/tree/main/examples/conversation).
+"""
+
+
import os
from urllib.parse import urljoin
-from .client import Conversation, ProtocolError # noqa
-from .events import ( # noqa
+from .client import Conversation, ProtocolError
+from .events import (
EntityFound,
+ EntityReference,
Event,
Ok,
RelationFound,
SegmentDecoded,
+ SpeakerJoined,
SpeakerLeft,
SpeechDecoded,
+ Tag,
+ TagsFound,
Unknown,
+ Word,
WordsDecoded,
)
SERVER = os.getenv("UHLIVE_API_URL", "wss://api.uh.live")
-def build_conversation_url(token):
+def build_conversation_url(token: str) -> str:
+ """
+ Make an authenticated URL to connect to the Conversation Service.
+ """
return urljoin(SERVER, "socket/websocket") + f"?jwt={token}&vsn=2.0.0"
+
+
+__all__ = [
+ "build_conversation_url",
+ "Conversation",
+ "ProtocolError",
+ "SpeakerJoined",
+ "Word",
+ "EntityFound",
+ "Event",
+ "Ok",
+ "EntityReference",
+ "RelationFound",
+ "SegmentDecoded",
+ "SpeakerLeft",
+ "SpeechDecoded",
+ "Unknown",
+ "WordsDecoded",
+ "Tag",
+ "TagsFound",
+]
diff --git a/src/uhlive/stream/conversation/client.py b/src/uhlive/stream/conversation/client.py
index 8bcf436..4c9effa 100644
--- a/src/uhlive/stream/conversation/client.py
+++ b/src/uhlive/stream/conversation/client.py
@@ -1,6 +1,12 @@
+"""
+Object oriented abstraction over the Conversation API protocol and workflow.
+"""
+
+
import json
from array import array
-from typing import Any, Callable, Dict, Type, TypeVar
+from enum import Enum
+from typing import Any, Dict, Union
from .events import Event, Ok, SpeakerLeft
@@ -34,67 +40,84 @@
class ProtocolError(RuntimeError):
+ """Exception raised when a [Conversation][uhlive.stream.conversation.Conversation] method is not available in the current state."""
+
pass
-class State:
- """Protocol state.
+class State(Enum):
+ """Protocol state."""
- Protocol states implement and document the available commands.
- User code should not use the State directly but use a `Conversation`
- object instead, and call the prococol methods on it.
- """
+ Idle = "Idle State"
+ Joined = "Joined State"
- def __init__(self, context: "Conversation") -> None:
- self.context = context
- def command(self, name: str, payload: Dict[str, Any] = {}) -> str:
- message = [
- S_JOIN_REF,
- self.context.request_id,
- self.context.topic,
- name,
- payload,
- ]
- return json.dumps(
- message, ensure_ascii=False, indent=None, separators=(",", ":")
- )
+class Conversation:
+ """To join a conversation on the API, you need a `Conversation` object.
+
+ You can only have one `Conversation` per connection (socket) otherwise you risk
+ unexpected behavior (and exceptions!).
+ """
+ def __init__(self, identifier: str, conversation_id: str, speaker: str) -> None:
+ """Create a `Conversation`.
+
+ Args:
+ identifier: is the identifier you got when you subscribed to the service;
+ conversation_id: is the conversation you wish to join,
+ speaker: is your alias in the conversation, to identify you and your events
+ """
+ self._state: State = State.Idle
+ self.identifier = identifier
+ self.topic = f"conversation:{self.identifier}@{conversation_id}"
+ self.topic_bin = self.topic.encode("utf-8")
+ self.topic_len = len(self.topic_bin)
+ self.speaker = speaker
+ self._request_id = int(S_JOIN_REF) - 1
-class IdleState(State):
def join(
self,
- model="fr",
- country="fr",
+ model: str = "fr",
+ country: str = "fr",
readonly: bool = False,
- interim_results=True,
- rescoring=True,
- origin=0,
- audio_codec="linear",
+ interim_results: bool = True,
+ rescoring: bool = True,
+ origin: int = 0,
+ audio_codec: str = "linear",
) -> str:
"""Join the conversation.
- * ``readonly``: if you are not going to stream audio, set it to ``True``.
- * ``speaker``: your alias in the conversation, to identify you and your events.
- * ``model``: (if ``readonly`` is ``False``) the ASR language model to be use to recognize
+ Args:
+ readonly: if you are not going to stream audio, set it to `True`.
+        model: (if `readonly` is `False`) the ASR language model to be used to recognize
the audio you will stream.
- * ``country``: the iso 2 letter country code of the place where the speaker is.
- * ``interim_results``: (``readonly`` = ``False`` only) should the ASR trigger interim result events?
- * ``rescoring``: (``readonly`` = ``False`` only) should the ASR refine the final segment
+ country: the iso 2 letter country code of the place where the speaker is.
+ interim_results: (`readonly` = `False` only) should the ASR trigger interim result events?
+ rescoring: (`readonly` = `False` only) should the ASR refine the final segment
with a bigger Language Model?
May give slightly degraded results for very short segments.
- * ``codec``: the speech audio codec of the audio data:
- - ``"linear"``: (default) linear 16 bit SLE raw PCM audio at 8khz;
- - ``"g711a"``: G711 a-law audio at 8khz;
- - ``"g711u"``: G711 μ-law audio at 8khz.
+ origin: The UNIX time, in milliseconds, to which the event timeline origin is set.
+ audio_codec: the speech audio codec of the audio data:
+
+ - `"linear"`: (default) linear 16 bit SLE raw PCM audio at 8khz;
+ - `"g711a"`: G711 a-law audio at 8khz;
+ - `"g711u"`: G711 μ-law audio at 8khz.
+
+ Returns:
+ The text websocket message to send to the server.
+
+ Raises:
+ ProtocolError: if still in a previously joined conversation.
"""
+ if self._state != State.Idle:
+ raise ProtocolError("Can't join twice!")
if not readonly and not model:
raise ProtocolError("If readonly is False, you must specify a model!")
return self.command(
"phx_join",
{
"readonly": readonly,
- "speaker": self.context.speaker,
+ "speaker": self.speaker,
"model": model,
"country": country,
"interim_results": interim_results,
@@ -104,78 +127,77 @@ def join(
},
)
-
-class JoinedState(State):
def leave(self) -> str:
"""Leave the current conversation.
It's a good idea to leave a conversation and continue to consume messages
- until you receive a SpeakerLeft event for your speaker, before you
+ until you receive a [`SpeakerLeft`][uhlive.stream.conversation.SpeakerLeft] event for your speaker, before you
close the connection. Otherwise, you may miss parts of the transcription.
+
+ Returns:
+ The text websocket message to send to the server.
+
+ Raises:
+            ProtocolError: if not currently in a conversation.
"""
+ if self._state != State.Joined:
+ raise ProtocolError("No conversation to leave!")
return self.command("phx_leave", {})
def send_audio_chunk(self, chunk: bytes) -> bytes:
- """Send an audio chunk (when streaming.)."""
- ref = self.context.request_id.encode("ascii")
- message = array("B", [0, 1, len(ref), self.context.topic_len, 11, B_JOIN_REF])
+ """Build an audio chunk for streaming.
+
+ Returns:
+ The binary websocket message to send to the server.
+ Raises:
+            ProtocolError: if not currently in a conversation.
+ """
+ if self._state != State.Joined:
+ raise ProtocolError("Not in a conversation!")
+ ref = self.request_id.encode("ascii")
+ message = array("B", [0, 1, len(ref), self.topic_len, 11, B_JOIN_REF])
message.extend(ref)
- message.extend(self.context.topic_bin)
+ message.extend(self.topic_bin)
message.extend(b"audio_chunk")
message.extend(chunk)
return message.tobytes()
-
-S = TypeVar("S", bound=State)
-
-
-class Conversation:
- """To join a conversation on the API, you need a `Conversation` object.
-
- You can only have one `Conversation` per connection (socket) otherwise you risk
- unexpected behavior (and exceptions!).
- """
-
- def __init__(self, identifier: str, conversation_id: str, speaker: str) -> None:
- """Create a `Conversation`.
- - `identifier` is the identifier you got when you subscribed to the service;
- - `conversation_id` is the conversation you wish to join,
- - `speaker` is your alias in the conversation, to identify you and your events
- """
- self._state: State = IdleState(self)
- self.identifier = identifier
- self.topic = f"conversation:{self.identifier}@{conversation_id}"
- self.topic_bin = self.topic.encode("utf-8")
- self.topic_len = len(self.topic_bin)
- self.speaker = speaker
- self._request_id = int(S_JOIN_REF) - 1
-
- def __getattr__(self, name: str) -> Callable:
- meth = getattr(self._state, name, None)
- if meth is None:
- raise ProtocolError(f"no method '{name}' in this state!")
- return meth
-
- def transition(self, state: Type[S]) -> None:
- self._state = state(self)
-
@property
def request_id(self) -> str:
self._request_id += 1
return str(self._request_id)
- def receive(self, data: str) -> Event:
- """Decode received text frame"""
+ def receive(self, data: Union[str, bytes]) -> Event:
+ """Decode received websocket message.
+
+ The server only sends text messages.
+
+ Returns:
+ The appropriate [Event][uhlive.stream.conversation.Event] subclass instance.
+ """
event = Event.from_message(json.loads(data))
assert (
event.topic == self.topic
), "Topic mismatch! Are you trying to mix several conversations on the same socket? This is not supported."
if isinstance(event, Ok) and event.ref == event.join_ref:
- self.transition(JoinedState)
+ self._state = State.Joined
elif isinstance(event, SpeakerLeft) and event.speaker == self.speaker:
- self.transition(IdleState)
+ self._state = State.Idle
return event
@property
def left(self):
- return isinstance(self._state, IdleState)
+ """Did the server confirm we left the conversation?"""
+ return self._state == State.Idle
+
+ def command(self, name: str, payload: Dict[str, Any] = {}) -> str:
+ message = [
+ S_JOIN_REF,
+ self.request_id,
+ self.topic,
+ name,
+ payload,
+ ]
+ return json.dumps(
+ message, ensure_ascii=False, indent=None, separators=(",", ":")
+ )
diff --git a/src/uhlive/stream/conversation/events.py b/src/uhlive/stream/conversation/events.py
index 81f9a35..62cfe2b 100644
--- a/src/uhlive/stream/conversation/events.py
+++ b/src/uhlive/stream/conversation/events.py
@@ -1,7 +1,7 @@
"""Event definitions."""
import re
-from typing import List
+from typing import Any, List
from .error import UhliveError
from .human_datetime import human_datetime
@@ -14,27 +14,28 @@ class Word(dict):
"""Timestamped word."""
@property
- def start(self):
+ def start(self) -> int:
"""Start time as Unix timestamp in millisecond, according to audio timeline."""
return self["start"]
@property
- def end(self):
+ def end(self) -> int:
"""End time as Unix timestamp in millisecond, according to audio timeline."""
return self["end"]
@property
- def length(self):
+ def length(self) -> int:
"""Word length in millisecond, according to audio timeline."""
return self["length"]
@property
- def word(self):
+ def word(self) -> str:
"""Transcript token string for this word."""
return self["word"]
@property
- def confidence(self):
+ def confidence(self) -> float:
+ """ASR confidence for this word."""
return self["confidence"]
@@ -47,23 +48,24 @@ def __init__(self, join_ref, ref, topic, event, payload) -> None:
self._topic = topic
self._payload = payload
- def __repr__(self):
+ def __repr__(self) -> str:
return f"{self.__class__.__name__}(payload={self._payload})"
@property
- def topic(self):
+ def topic(self) -> str:
+ """The conversation identifier"""
return self._topic
@property
- def join_ref(self):
+ def join_ref(self) -> str:
return self._join_ref
@property
- def ref(self):
+ def ref(self) -> str:
return self._ref
@property
- def speaker(self):
+ def speaker(self) -> str:
"""The speaker whose speech triggered this event.
All events are relative to a speaker."""
@@ -87,15 +89,19 @@ def from_message(message):
class Ok(Event):
+ """API asynchronous command aknowledgements."""
+
pass
class Unknown(Event):
+ """The server emitted an event unkown to this SDK. Time to upgrade!"""
+
def __init__(self, join_ref, ref, topic, event, payload):
self._name = event
super().__init__(join_ref, ref, topic, event, payload)
- def __repr__(self):
+ def __repr__(self) -> str:
return f"Unknown [{self._name}](payload={self._payload})"
@@ -103,17 +109,17 @@ class TimeScopedEvent(Event):
"""Base class for events that are anchored to the audio time line."""
@property
- def start(self):
+ def start(self) -> int:
"""Start time as Unix timestamp in millisecond, according to audio timeline."""
return self._payload["start"]
@property
- def end(self):
+ def end(self) -> int:
"""End time as Unix timestamp in millisecond, according to audio timeline."""
return self._payload["end"]
@property
- def length(self):
+ def length(self) -> int:
"""Event length in millisecond, according to audio timeline."""
return self._payload["length"]
@@ -122,12 +128,12 @@ class SpeechDecoded(TimeScopedEvent):
"""The base class of all transcription events."""
@property
- def transcript(self):
+ def transcript(self) -> str:
"""Get the transcript of the whole segment as a string"""
return self._payload["transcript"]
@property
- def lang(self):
+ def lang(self) -> str:
"""Natural Language of the speech.
As ISO 639-1 code.
@@ -135,7 +141,7 @@ def lang(self):
return self._payload["lang"]
@property
- def country(self):
+ def country(self) -> str:
"""Country location of speaker.
As ISO 3166-1 code.
@@ -143,20 +149,21 @@ def country(self):
return self._payload["country"]
@property
- def utterance_id(self):
+ def utterance_id(self) -> str:
"""The Utterance id identifies the speech utterance this event transcribes."""
return self._payload["utterance_id"]
@property
- def words(self):
- """Get the transcript of the whole segment as a list of timestamped words."""
+ def words(self) -> List[Word]:
+ """Get the transcript of the whole segment as a list of timestamped [words][uhlive.stream.conversation.Word]."""
return [Word(w) for w in self._payload["words"]]
- def __str__(self):
+ def __str__(self) -> str:
return f"[{self.speaker} — {human_datetime(self.start)}] {self.transcript}"
@property
- def confidence(self):
+ def confidence(self) -> float:
+ """The ASR confidence for this segment."""
return self._payload["confidence"]
@@ -175,25 +182,30 @@ class SegmentDecoded(SpeechDecoded):
class SegmentNormalized(SpeechDecoded):
"""Normalized final segment event."""
- def __str__(self):
+ def __str__(self) -> str:
return f"[{self.speaker} — Formatted] {self.transcript}"
class SpeakerJoined(Event):
+ """A new speaker joined the conversation (after us)."""
+
@property
- def timestamp(self):
+ def timestamp(self) -> int:
+ """The UNIX time when the speaker joined the conversation."""
return self._payload["timestamp"]
@property
- def interim_results(self):
+ def interim_results(self) -> bool:
+ """Are interim results activated for this speaker?"""
return self._payload["interim_results"]
@property
- def rescoring(self):
+ def rescoring(self) -> bool:
+ """Is rescoring enabled for this speaker?"""
return self._payload["rescoring"]
@property
- def lang(self):
+ def lang(self) -> str:
"""Natural Language of the speech.
As ISO 639-1 code.
@@ -201,7 +213,7 @@ def lang(self):
return self._payload["lang"]
@property
- def country(self):
+ def country(self) -> str:
"""Country location of speaker.
As ISO 3166-1 code.
@@ -213,7 +225,8 @@ class SpeakerLeft(Event):
"""Event emitted by the associated speaker when they left the conversation."""
@property
- def timestamp(self):
+ def timestamp(self) -> int:
+ """UNIX time when the speaker left the conversation."""
return self._payload["timestamp"]
@@ -227,11 +240,12 @@ def __init__(self, join_ref, ref, topic, event, payload):
super().__init__(join_ref, ref, topic, event, payload)
@property
- def entity_name(self):
+ def entity_name(self) -> str:
+ """The name of the named entity found."""
return self._name
@property
- def lang(self):
+ def lang(self) -> str:
"""Natural Language of the interpretation.
As ISO 639-1 code.
@@ -239,7 +253,7 @@ def lang(self):
return self._payload["lang"]
@property
- def country(self):
+ def country(self) -> str:
"""Country location of speaker.
As ISO 3166-1 code.
@@ -247,17 +261,17 @@ def country(self):
return self._payload["country"]
@property
- def canonical(self):
+ def canonical(self) -> str:
"""The well formatted form of the entity in the language (string)."""
return self._payload["annotation"].get("canonical")
@property
- def original(self):
+ def original(self) -> str:
"""The transcript excerpt that was interpreted, as string."""
return self._payload["annotation"]["original"]
@property
- def value(self):
+ def value(self) -> Any:
"""The interpreted value in machine understandable form.
The exact type depends on the entity.
@@ -265,10 +279,11 @@ def value(self):
return self._payload["annotation"].get("value")
@property
- def confidence(self):
+ def confidence(self) -> float:
+ """The confidence of the interpretation."""
return self._payload["confidence"]
- def __repr__(self):
+ def __repr__(self) -> str:
return " ".join(
(
" - ",
@@ -280,8 +295,12 @@ def __repr__(self):
class Tag:
+ """A tag represents a behavioral feature found in the conversation."""
+
uuid: str
+ """The unique id of the Tag."""
label: str
+ """The human readable name of the Tag."""
def __init__(self, uuid: str, label: str) -> None:
self.uuid = uuid
@@ -292,8 +311,10 @@ def __repr__(self) -> str:
class TagsFound(TimeScopedEvent):
+ """One or more tags were found on this time range."""
+
@property
- def lang(self):
+ def lang(self) -> str:
"""Natural Language of the interpretation.
As ISO 639-1 code.
@@ -301,7 +322,7 @@ def lang(self):
return self._payload["lang"]
@property
- def country(self):
+ def country(self) -> str:
"""Country location of speaker.
As ISO 3166-1 code.
@@ -309,11 +330,13 @@ def country(self):
return self._payload["country"]
@property
- def confidence(self):
+ def confidence(self) -> float:
+ """Tagger confidence."""
return self._payload["confidence"]
@property
def tags(self) -> List[Tag]:
+ """The [tags][uhlive.stream.conversation.Tag] that were found on this time range"""
return [Tag(t["uuid"], t["label"]) for t in self._payload["annotation"]["tags"]]
def __repr__(self):
@@ -321,11 +344,14 @@ def __repr__(self):
class EntityReference:
- """Reference to unique Entity in conversation."""
+ """Reference to a unique previously found Entity in the conversation."""
kind: str
+ """The name of the `Entity` referenced."""
speaker: str
+ """The speaker identifier."""
start: int
+ """The UNIX start time of the referenced `Entity`."""
def __init__(self, entity_name: str, speaker: str, start: int) -> None:
self.kind = entity_name
@@ -337,7 +363,10 @@ def __repr__(self) -> str:
class RelationFound(TimeScopedEvent):
- """The class for all Relation events."""
+ """The class for all Relation events.
+
+ Relations express a semantic relationship between two or more entities.
+ """
def __init__(self, join_ref, ref, topic, event, payload):
self._name = RELATION_NAME.match(event).group(
@@ -346,11 +375,12 @@ def __init__(self, join_ref, ref, topic, event, payload):
super().__init__(join_ref, ref, topic, event, payload)
@property
- def relation_name(self):
+ def relation_name(self) -> str:
+ """The type of the relation."""
return self._name
@property
- def lang(self):
+ def lang(self) -> str:
"""Natural Language of the interpretation.
As ISO 639-1 code.
@@ -358,21 +388,25 @@ def lang(self):
return self._payload["lang"]
@property
- def confidence(self):
+ def confidence(self) -> float:
+ """The confidence on the discovered relationship."""
return self._payload["confidence"]
@property
- def members(self):
+ def members(self) -> List[EntityReference]:
+ """[References to the Entities][uhlive.stream.conversation.EntityReference] involved in this relationship."""
m = []
speaker = self.speaker
print(self._payload)
for ref in self._payload["members"]:
- kind = ENTITY_NAME.match(ref["entity"]).group(1) if ref["entity"] else None
+ kind = (
+ ENTITY_NAME.match(ref["entity"]).group(1) if ref["entity"] else None # type: ignore
+ )
if kind is not None:
m.append(EntityReference(kind, speaker, ref["start"]))
return m
- def __repr__(self):
+ def __repr__(self) -> str:
return f"{self.__class__.__name__} <{self._name}> for {self.members} [confidence: {self.confidence:.2f}]"
diff --git a/src/uhlive/stream/conversation/human_datetime.py b/src/uhlive/stream/conversation/human_datetime.py
index c87fdc4..b0771f0 100644
--- a/src/uhlive/stream/conversation/human_datetime.py
+++ b/src/uhlive/stream/conversation/human_datetime.py
@@ -1,9 +1,14 @@
+"""
+Display helpers.
+"""
+
from datetime import datetime
local_tz = datetime.now().astimezone().tzinfo
def human_datetime(timestamp):
+ """Human readable representation of unix timestamp date."""
utc_dt = datetime.fromtimestamp(timestamp / 1000.0, local_tz)
return utc_dt.isoformat(sep=" ")
diff --git a/src/uhlive/stream/recognition/__init__.py b/src/uhlive/stream/recognition/__init__.py
index 90deffd..ad983ce 100644
--- a/src/uhlive/stream/recognition/__init__.py
+++ b/src/uhlive/stream/recognition/__init__.py
@@ -1,11 +1,136 @@
+"""
+The stream recognition API SDK for voice bots.
+
+Stream for voicebots, or Stream Human to Bots, or Stream H2B is a set of APIs enabling clients to build
+interaction between a human end-user and a bot, for example to create Interactive Voice Response (IVR)
+on the phone, or a voicebot within an app.
+
+For an overview of the concepts, protocols and workflow, see the
+[higher level documentation](https://docs.allo-media.net/stream-h2b/#real-time-stream-api-for-voicebots) and
+more specifically the [Websocket H2B protocol reference](https://docs.allo-media.net/stream-h2b/protocols/websocket/#websocket-for-voicebots).
+
+The protocol is message-based and uses websockets as transport. You are free to use whatever websocket client library you like to communicate
+with the API, and use our SDK to encode/decode the messages.
+
+## Quickstart
+
+First retrieve a one time access token with the [Auth API](auth.md).
+
+Then use that token to build an authenticated URL, open a websocket connection to it with the websocket client library
+of your choice and instantiate a [`Recognizer`][uhlive.stream.recognition.Recognizer] to make requests, generate
+audio stream messages and decode responses.
+
+As the API is asynchronous, streaming the audio and reading the returned events should be done in two different threads/tasks.
+
+```python
+from uhlive.stream.recognition import *
+
+stream_h2b_url, stream_h2b_headers = build_connection_request(token)
+recognizer = Recognizer()
+```
+
+Now you can connect and interact with the API:
+
+Synchronous example:
+
+```python
+import websocket as ws
+
+socket = ws.create_connection(stream_h2b_url, header=stream_h2b_headers)
+socket.send(recognizer.open())
+# Check if successful
+reply = recognizer.receive(socket.recv())
+assert isinstance(reply, Opened), f"Expected Opened, got {reply}"
+# start streaming the user's voice in another thread
+streamer_thread_handle = stream_mic(socket, recognizer)
+```
+
+Asynchronous example:
+
+```python
+from aiohttp import ClientSession
+
+ async with ClientSession() as session:
+ async with session.ws_connect(stream_h2b_url, header=stream_h2b_headers) as socket:
+
+ # Open a session
+ # Commands are sent as text frames
+ await socket.send_str(recognizer.open())
+ # Check if successful
+ msg = await socket.receive()
+ reply = recognizer.receive(msg.data)
+ assert isinstance(reply, Opened), f"Expected Opened, got {reply}"
+ # start streaming the user's voice in another task
+ streamer_task_handle = asyncio.create_task(stream(socket, recognizer))
+```
+
+As you can see, the I/O is cleanly decoupled from the protocol handling: the `Recognizer` object is only used
+to create the messages to send to the API and to decode the received messages as `Event` objects.
+
+See the [complete examples in the source distribution](https://github.com/uhlive/python-sdk/tree/main/examples/recognition).
+
+"""
+
import os
+from typing import Tuple
from urllib.parse import urljoin
-from .client import ProtocolError, Recognizer # noqa
-from .events import * # noqa
+from .client import ProtocolError, Recognizer
+from .events import (
+ Closed,
+ CompletionCause,
+ DefaultParams,
+ Event,
+ GrammarDefined,
+ InputTimersStarted,
+ Interpretation,
+ InvalidParamValue,
+ MethodFailed,
+ MethodNotAllowed,
+ MethodNotValid,
+ MissingParam,
+ Opened,
+ ParamsSet,
+ RecognitionComplete,
+ RecognitionInProgress,
+ RecogResult,
+ StartOfInput,
+ Stopped,
+ Transcript,
+)
SERVER = os.getenv("UHLIVE_API_URL", "wss://api.uh.live")
-def build_connection_request(token):
+def build_connection_request(token) -> Tuple[str, dict]:
+ """
+ Make an authenticated URL and header to connect to the H2B Service.
+ """
return urljoin(SERVER, "/bots"), {"Authorization": f"bearer {token}"}
+
+
+__all__ = [
+ "ProtocolError",
+ "Recognizer",
+ "build_connection_request",
+ "Event",
+ "CompletionCause",
+ "Transcript",
+ "Interpretation",
+ "RecogResult",
+ "Opened",
+ "ParamsSet",
+ "DefaultParams",
+ "GrammarDefined",
+ "RecognitionInProgress",
+ "InputTimersStarted",
+ "Stopped",
+ "Closed",
+ "StartOfInput",
+ "RecognitionComplete",
+ "MethodNotValid",
+ "MethodFailed",
+ "InvalidParamValue",
+ "MissingParam",
+ "MethodNotAllowed",
+]
diff --git a/src/uhlive/stream/recognition/client.py b/src/uhlive/stream/recognition/client.py
index bae6e56..4ab1c80 100644
--- a/src/uhlive/stream/recognition/client.py
+++ b/src/uhlive/stream/recognition/client.py
@@ -1,8 +1,9 @@
"""
-I/O free Connection state machine.
+Object oriented abstraction over the H2B API protocol and workflow.
"""
import json
-from typing import Any, Callable, Dict, Type, TypeVar
+from enum import Enum
+from typing import Any, Dict, Union
from .events import (
Closed,
@@ -20,85 +21,127 @@ def serialize(cmd: Dict[str, Any]) -> str:
class ProtocolError(RuntimeError):
+ """Exception raised when a [Recognizer][uhlive.stream.recognition.Recognizer] method is not available in the current state."""
+
pass
-class State:
- """Protocol state.
+class State(Enum):
+ """Protocol state."""
- Protocol states implement and document the available commands.
- User code should not use the State directly but use a `Recognizer`
- object instead, and call the prococol methods on it.
- """
+ NoSession = "Out of Session State"
+ IdleSession = "Idle Session State"
+ Recognition = "On-going Recognition State"
- def __init__(self, context: "Recognizer") -> None:
- self.context = context
- def command(self, name: str, headers: Dict[str, Any] = {}, body: str = "") -> str:
- return serialize(
- {
- "command": name,
- "request_id": self.context.request_id,
- "channel_id": self.context.channel_id,
- "headers": headers,
- "body": body,
- }
- )
+class Recognizer:
+ """The connection state machine.
+
+ Use this class to decode received frames as `Event`s or to
+ make command frames by calling the appropriate methods.
+ If you call a method that is not appropriate in the current protocol
+ state, a `ProtocolError` is raised.
+ """
+ def __init__(self) -> None:
+ self._state: State = State.NoSession
+ self._request_id = 0
+ self._channel_id = ""
+
+ # Workflow methods
-class NoSessionState(State):
def open(
self, custom_id: str = "", channel_id: str = "", audio_codec: str = "linear"
) -> str:
- """Open a new Samosa session.
+ """Open a new H2B session.
- ``custom_id`` is any reference of yours that you want to appear in the logs
- and invoice reports.
+ Args:
+ custom_id: is any reference of yours that you want to appear in the logs
+ and invoice reports.
+ channel_id: when provided, it'll be used as a prefix for
+ the actual channel ID generated by the server.
+ audio_codec: the speech audio codec of the audio data:
- If a ``channel_id`` is provided, it'll be used as a prefix for
- the actual channel ID generated by the server.
+ - `"linear"`: (default) linear 16 bit SLE raw PCM audio at 8khz;
+ - `"g711a"`: G711 a-law audio at 8khz;
+ - `"g711u"`: G711 μ-law audio at 8khz.
- ``codec`` is the speech audio codec of the audio data:
- - ``"linear"``: (default) linear 16 bit SLE raw PCM audio at 8khz;
- - ``"g711a"``: G711 a-law audio at 8khz;
- - ``"g711u"``: G711 μ-law audio at 8khz.
+ Returns:
+ A websocket text message to send to the server.
+ Raises:
+ ProtocolError: if a session is already open.
"""
+ if self._state != State.NoSession:
+ raise ProtocolError("Session already opened!")
return serialize(
{
"command": "OPEN",
- "request_id": self.context.request_id,
+ "request_id": self.request_id,
"channel_id": channel_id,
"headers": {"custom_id": custom_id, "audio_codec": audio_codec},
"body": "",
}
)
-
-class IdleSessionState(State):
def send_audio_chunk(self, chunk: bytes) -> bytes:
- """Build an audio chunk frame for streaming."""
+ """Build an audio chunk frame for streaming.
+
+ Returns:
+ A websocket binary message to send to the server.
+
+ Raises:
+ ProtocolError: if no session opened.
+ """
+ if self._state == State.NoSession:
+ raise ProtocolError("You must open a session first!")
return chunk
def set_params(self, **params: Any) -> str:
"""Set default ASR parameters for the session.
- See https://docs.allo-media.net/live-api/robots-humans-protocol/#set-session-defaults
+ See [the parameter list](https://docs.allo-media.net/stream-h2b/protocols/websocket/#set-session-defaults)
+ and [the parameter visual explanations](https://docs.allo-media.net/stream-h2b/input/#resource-headers)
for an explanation of the different parameters available.
- """
+ Returns:
+ A websocket text message to send to the server.
+
+ Raises:
+ ProtocolError: if no session opened.
+ """
+ if self._state != State.IdleSession:
+ raise ProtocolError(f"Method not available in this state ({self._state})!")
return self.command("SET-PARAMS", params)
def get_params(self) -> str:
- """Retrieve the default values for the ASR parameters."""
+ """Retrieve the default values for the ASR parameters.
+
+ Returns:
+ A websocket text message to send to the server.
+
+ Raises:
+ ProtocolError: if no session opened.
+ """
+ if self._state != State.IdleSession:
+ raise ProtocolError(f"Method not available in this state ({self._state})!")
return self.command("GET-PARAMS")
def define_grammar(self, builtin: str, alias: str) -> str:
"""Define a grammar alias for a parameterized builtin.
- `builtin`: the builtin URI to alias, including the query string, but without the "builtin:" prefix
- `alias`: the alias, without the "session:" prefix.
+ Args:
+ builtin: the builtin URI to alias, including the query string, but without the "builtin:" prefix
+ alias: the alias, without the "session:" prefix.
+
+ Returns:
+ A websocket text message to send to the server.
+
+ Raises:
+ ProtocolError: if no session opened.
"""
+ if self._state != State.IdleSession:
+ raise ProtocolError(f"Method not available in this state ({self._state})!")
return self.command(
"DEFINE-GRAMMAR",
{"content_id": alias, "content_type": "text/uri-list"},
@@ -117,13 +160,19 @@ def recognize(
This method takes grammar URIs as positional arguments, including the `builtin:` or
`session:` prefixes to make the difference between builtin grammars and custom aliases.
- Keywords arguments are also accepted:
- - `start_timers`: default True
- - `recognition_mode`: default is "normal"
- - any other ASR parameter (no client side defaults).
+ Keyword Args:
+ start_timers: default True
+ recognition_mode: default is "normal"
+ **params: any other [ASR parameter](https://docs.allo-media.net/stream-h2b/protocols/websocket/#start-recognition) (no client side defaults).
- See https://docs.allo-media.net/live-api/robots-humans-protocol/#start-recognition
+ Returns:
+ A websocket text message to send to the server.
+
+ Raises:
+ ProtocolError: if no session opened.
"""
+ if self._state != State.IdleSession:
+ raise ProtocolError(f"Method not available in this state ({self._state})!")
return self.command(
"RECOGNIZE",
headers={
@@ -136,51 +185,46 @@ def recognize(
)
def close(self) -> str:
- """Close the current session."""
- return self.command("CLOSE")
+ """Close the current session.
+ Returns:
+ A websocket text message to send to the server.
-class RecognitionState(State):
- def send_audio_chunk(self, chunk: bytes) -> bytes:
- """Build an audio chunk frame for streaming."""
- return chunk
+ Raises:
+ ProtocolError: if no session opened.
+ """
+ if self._state != State.IdleSession:
+ raise ProtocolError(f"Method not available in this state ({self._state})!")
+ return self.command("CLOSE")
def start_input_timers(self) -> str:
"""If the input timers were not started by the RECOGNIZE command,
starts them now.
+
+ Returns:
+ A websocket text message to send to the server.
+
+ Raises:
+ ProtocolError: if there is no ongoing recognition process
"""
+ if self._state != State.Recognition:
+ raise ProtocolError("Command is only valid during recognition!")
return self.command("START-INPUT-TIMERS")
def stop(self) -> str:
- """Stop ongoing recognition process"""
- return self.command("STOP")
-
-
-S = TypeVar("S", bound=State)
-
-
-class Recognizer:
- """The connection state machine.
-
- Use this class to decode received frames as `Event`s or to
- make command frames by calling the appropriate methods.
- If you call a method that is not appropriate in the current protocol
- state, a `ProtocolError` is raised.
- """
+ """Stop ongoing recognition process
- def __init__(self) -> None:
- self._state: State = NoSessionState(self)
- self._request_id = 0
- self._channel_id = ""
+ Returns:
+ A websocket text message to send to the server.
- def __getattr__(self, name: str) -> Callable:
- meth = getattr(self._state, name, None)
- if meth is None:
- raise ProtocolError(f"no method '{name}' in this state!")
- return meth
+ Raises:
+ ProtocolError: if there is no ongoing recognition process
+ """
+ if self._state != State.Recognition:
+ raise ProtocolError("Command is only valid during recognition!")
+ return self.command("STOP")
- def transition(self, state: Type[S]) -> None:
- self._state = state(self)
+ ##
@property
def request_id(self) -> int:
@@ -189,18 +233,37 @@ def request_id(self) -> int:
@property
def channel_id(self) -> str:
+ """The current session ID."""
return self._channel_id
- def receive(self, data: str) -> Event:
- """Decode received text frame"""
+ def receive(self, data: Union[str, bytes]) -> Event:
+ """Decode received text frame.
+
+ The server always replies with text frames.
+
+ Returns:
+ The appropriate `Event` subclass.
+ """
+ assert type(data) is str # to please mypy
event = deserialize(data)
if isinstance(event, RecognitionInProgress):
- self.transition(RecognitionState)
+ self._state = State.Recognition
elif isinstance(event, (RecognitionComplete, Stopped)):
- self.transition(IdleSessionState)
+ self._state = State.IdleSession
elif isinstance(event, Opened):
self._channel_id = event.channel_id
- self.transition(IdleSessionState)
+ self._state = State.IdleSession
elif isinstance(event, Closed):
- self.transition(NoSessionState)
+ self._state = State.NoSession
return event
+
+ def command(self, name: str, headers: Dict[str, Any] = {}, body: str = "") -> str:
+ return serialize(
+ {
+ "command": name,
+ "request_id": self.request_id,
+ "channel_id": self.channel_id,
+ "headers": headers,
+ "body": body,
+ }
+ )
diff --git a/src/uhlive/stream/recognition/events.py b/src/uhlive/stream/recognition/events.py
index 327e5a4..2aeb3c5 100644
--- a/src/uhlive/stream/recognition/events.py
+++ b/src/uhlive/stream/recognition/events.py
@@ -1,6 +1,7 @@
-"""Samosa Events.
+"""H2B Events.
-Data model for events returned by the Samosa server.
+Data model for events returned by the H2B server.
+See also https://docs.allo-media.net/stream-h2b/protocols/websocket/#websocket-for-voicebots.
"""
@@ -11,7 +12,10 @@
class CompletionCause(Enum):
- """The set of possible completion cause"""
+ """The set of possible completion causes.
+
+ See [all possible values](https://docs.allo-media.net/stream-h2b/protocols/websocket/#asynchronous-recognition-events).
+ """
GramDefinitionFailure = "GramDefinitionFailure"
GramLoadFailure = "GramLoadFailure"
@@ -38,18 +42,22 @@ def __init__(self, data: Dict[str, Any]) -> None:
@property
def transcript(self) -> str:
+ """The raw ASR output."""
return self._transcript
@property
def confidence(self) -> float:
+ """The ASR transcription confidence."""
return self._confidence
@property
def start(self) -> datetime:
+ """Start of speech."""
return self._start
@property
def end(self) -> datetime:
+ """End of speech."""
return self._end
def __str__(self) -> str:
@@ -66,6 +74,7 @@ def __init__(self, data: Dict[str, Any]) -> None:
@property
def confidence(self) -> float:
+ """The confidence of the interpretation."""
return self._confidence
@property
@@ -76,7 +85,10 @@ def type(self) -> str:
@property
def value(self) -> Dict[str, Any]:
"""The structured interpreted value.
- The type/schema of the value is given by the `type` attribute
+
+ The type/schema of the value is given by the `self.type` property.
+
+ See the [Grammar reference documentation](https://docs.allo-media.net/stream-h2b/grammars).
"""
return self._value
@@ -98,12 +110,12 @@ def __init__(self, data: dict) -> None:
@property
def asr(self) -> Optional[Transcript]:
- """The ASR part of the result (transcription result)"""
+ """The ASR part of the result ([transcription][uhlive.stream.recognition.Transcript] result)"""
return self._asr
@property
def nlu(self) -> Optional[Interpretation]:
- """The NLU part of the result (interpretation)"""
+ """The NLU part of the result ([interpretation][uhlive.stream.recognition.Interpretation])"""
return self._nlu
@property
@@ -134,26 +146,35 @@ def __init__(self, data: Dict[str, Any]) -> None:
@property
def request_id(self) -> int:
+ """The request ID that event responds to."""
return self._request_id
@property
def channel_id(self) -> str:
+ """The channel ID."""
return self._channel_id
@property
def headers(self) -> Dict[str, Any]:
+ """The response headers.
+
+ See also the [header description](https://docs.allo-media.net/stream-h2b/output/#headers-%26-statuses).
+ """
return self._headers
@property
def completion_cause(self) -> Optional[CompletionCause]:
+ """The response [`CompletionCause`][uhlive.stream.recognition.CompletionCause]."""
return self._completion_cause
@property
def completion_reason(self) -> Optional[str]:
+ """The completion message."""
return self._completion_reason
@property
def body(self) -> Optional[RecogResult]:
+ """The content of the Event is a [`RecogResult`][uhlive.stream.recognition.RecogResult] if it is a `RecognitionComplete` event."""
return self._body
def __str__(self) -> str:
@@ -161,10 +182,14 @@ def __str__(self) -> str:
class Opened(Event):
+ """Session opened on the server"""
+
pass
class ParamsSet(Event):
+ """The default parameters were set."""
+
pass
@@ -175,50 +200,74 @@ class DefaultParams(Event):
class GrammarDefined(Event):
+ """The `DefineGrammar` command has been processed."""
+
pass
class RecognitionInProgress(Event):
+ """The ASR recognition is started."""
+
pass
class InputTimersStarted(Event):
+ """The Input Timers are started."""
+
pass
class Stopped(Event):
+ """The ASR recognition has been stopped on the client request."""
+
pass
class Closed(Event):
+ """The session is closed."""
+
pass
class StartOfInput(Event):
+ """In normal recognition mode, this event is emitted when speech is detected."""
+
pass
class RecognitionComplete(Event):
+ """The ASR recognition is complete."""
+
pass
class MethodNotValid(Event):
+ """The server received an invalid command."""
+
pass
class MethodFailed(Event):
+ """The server was unable to complete the command."""
+
pass
class InvalidParamValue(Event):
+ """The server received a request to set an invalid value for a parameter."""
+
pass
class MissingParam(Event):
+ """The command is missing some mandatory parameter."""
+
pass
class MethodNotAllowed(Event):
+ """The command is not allowed in this state."""
+
pass
@@ -247,27 +296,3 @@ def deserialize(data: str) -> Event:
if kind in EVENT_MAP:
return EVENT_MAP[kind](jd)
raise ValueError(f"Unknown event '{kind}'")
-
-
-__all__ = [
- "Event",
- "CompletionCause",
- "Transcript",
- "Interpretation",
- "RecogResult",
- "Opened",
- "ParamsSet",
- "DefaultParams",
- "GrammarDefined",
- "RecognitionInProgress",
- "InputTimersStarted",
- "Stopped",
- "Closed",
- "StartOfInput",
- "RecognitionComplete",
- "MethodNotValid",
- "MethodFailed",
- "InvalidParamValue",
- "MissingParam",
- "MethodNotAllowed",
-]
diff --git a/tox.ini b/tox.ini
index b9f89aa..4a1e691 100644
--- a/tox.ini
+++ b/tox.ini
@@ -9,6 +9,7 @@ envlist =
py38
py39
py310
+ py311
isort
black
flake8