diff --git a/.gitignore b/.gitignore index f4ac18d..99722df 100644 --- a/.gitignore +++ b/.gitignore @@ -8,3 +8,4 @@ build .venv *.local .env +site diff --git a/MANIFEST.in b/MANIFEST.in new file mode 100644 index 0000000..5c3753f --- /dev/null +++ b/MANIFEST.in @@ -0,0 +1,4 @@ +include README.md +include LICENSE +include src/uhlive/py.typed + diff --git a/README.md b/README.md index 5460816..973089d 100644 --- a/README.md +++ b/README.md @@ -3,7 +3,7 @@ The Uh!live Python SDK provides convenient access to the Uh!live API from applications written in the Python language. -Read the [documentation for the Conversation API](https://docs.allo-media.net/live-api/) and [for the Recognition API (vocal bot toolkit)](https://docs.allo-media.net/stream-api-bots/). +Read the [full documentation](https://python-uhlive-sdk.netlify.app/). ## Requirements @@ -11,6 +11,12 @@ Read the [documentation for the Conversation API](https://docs.allo-media.net/li Install with `pip install .[examples]` to install the the library and all the dependencies necessary to run the examples. +### Installation from Pypi + +``` +pip install uhlive +``` + ### Audio files To play with the examples, you should have a raw audio file. diff --git a/docs/auth.md b/docs/auth.md new file mode 100644 index 0000000..df3df22 --- /dev/null +++ b/docs/auth.md @@ -0,0 +1,6 @@ +# uhlive.auth + +::: uhlive.auth + options: + show_source: false + diff --git a/docs/conversation_api.md b/docs/conversation_api.md new file mode 100644 index 0000000..5cc9b85 --- /dev/null +++ b/docs/conversation_api.md @@ -0,0 +1,5 @@ +# uhlive.stream.conversation + +::: uhlive.stream.conversation + options: + show_source: false diff --git a/docs/css/mkdocstrings.css b/docs/css/mkdocstrings.css new file mode 100644 index 0000000..c2d342a --- /dev/null +++ b/docs/css/mkdocstrings.css @@ -0,0 +1,41 @@ +/* Indentation. 
*/ +div.doc-contents:not(.first) { + padding-left: 25px; + border-left: .15rem solid var(--secondary); +} + +/*code { + font-size: inherit; +} +*/ +h2.doc-heading { + font-size: 1.75rem; +/* font-weight: 600;*/ +} + +h3.doc-heading { +/* font-weight: 600;*/ + font-size: 1.2rem; +} + +/* Mark external links as such. */ +a.external::after, +a.autorefs-external::after { + /* https://primer.style/octicons/arrow-up-right-24 */ + mask-image: url('data:image/svg+xml,'); + -webkit-mask-image: url('data:image/svg+xml,'); + content: ' '; + + display: inline-block; + vertical-align: middle; + position: relative; + + height: 1em; + width: 1em; + background-color: var(--md-typeset-a-color); +} + +a.external:hover::after, +a.autorefs-external:hover::after { + background-color: var(--md-accent-fg-color); +} diff --git a/docs/index.md b/docs/index.md new file mode 100644 index 0000000..0b8e5c5 --- /dev/null +++ b/docs/index.md @@ -0,0 +1,105 @@ +# Welcome to the Uh!ive Python SDK + +The Uh!ive Python SDK is a library to access our live Automated Speech Recognition online APIs. +It provides [I/O Free](https://sans-io.readthedocs.io/index.html) Python abstractions over the underlying protocols and workflows to hide the complexity. + +By providing an I/O Free implementation, we let developers choose whatever websocket transport library and paradigm — synchronous or asynchronous (asyncio) — they like most. + +## Access to the API + +In order to have access to our online APIs, your company needs to register for an account. Depending on the plan, you may get two kinds of credentials: + +* Either a `client_id` and `client_secret`; +* or a `client_id`, `user_id` and `user_password`. + +In all cases, those credentials are used to retrieve a one time access token from our SSO. + +You are free to use whatever HTTP client library you like. 
+ +Here is a synchronous example using `requests`: + +```python +from uhlive.auth import build_authentication_request +import requests + +uhlive_client = "…" +uhlive_secret = "…" +# user_id = "…" +# user_password = "…" + +auth_url, auth_params = build_authentication_request(uhlive_client, uhlive_secret) +# or auth_url, auth_params = build_authentication_request(uhlive_client, user_id=user_id, user_pwd=user_password) +login = requests.post(auth_url, data=auth_params) +login.raise_for_status() +uhlive_token = login.json()["access_token"] +``` + +Here is an asynchronous example using `aiohttp`: + +```python +import asyncio +from uhlive.auth import build_authentication_request +from aiohttp import ClientSession + +uhlive_client = "…" +uhlive_secret = "…" +# user_id = "…" +# user_password = "…" + + +async def main(uhlive_client, uhlive_secret): + async with ClientSession() as session: + auth_url, auth_params = build_authentication_request( + uhlive_client, uhlive_secret + ) + async with session.post(auth_url, data=auth_params) as login: + login.raise_for_status() + body = await login.json() + uhlive_token = body["access_token"] + # continue with Stream API of your choice + # ... + +asyncio.run(main(uhlive_client, uhlive_secret)) +``` + +Then this one time token allows you to connect to any subscribed API within 5 minutes. + +* [Auth API reference](auth.md) + + +## Conversation API to analyze human to human interactions. + +Also known as the human to human (H2H) stream API. + +* [High level overview](https://docs.allo-media.net/stream-h2h/overview/#high-level-overview-and-concepts) +* [Python SDK API documentation](conversation_api.md) + +## Recognition and interpretation API for voice bots. + +Also known as the human to bot (H2B) stream API. + +* [High level overview](https://docs.allo-media.net/stream-h2b/#real-time-stream-api-for-voicebots) +* [Python SDK API documentation](recognition_api.md) + + +## Changelog + +### v1.3.1 + +Full API documentation. 
+ +### v1.3.0 + +* Support for `SegmentNormalized` +* SSO +* Concurrent test runner `test_runner_async.py` in `examples/recognition` + +### v1.2.0 + +* Improved streamer +* Improved test_runner.py +* Forbid sharing connection between conversations + +### v1.1.0 + +* Support for passing codec parameter diff --git a/docs/recognition_api.md b/docs/recognition_api.md new file mode 100644 index 0000000..c2093fd --- /dev/null +++ b/docs/recognition_api.md @@ -0,0 +1,5 @@ +# uhlive.stream.recognition + +::: uhlive.stream.recognition + options: + show_source: false diff --git a/examples/recognition/async_bot_lib.py b/examples/recognition/async_bot_lib.py index d85f58a..7d4c88e 100644 --- a/examples/recognition/async_bot_lib.py +++ b/examples/recognition/async_bot_lib.py @@ -77,7 +77,6 @@ def callback(indata, frame_count, time_info, status): class Bot: - TTF_CACHE: Dict[str, bytes] = {} def __init__(self, google_ttf_key): diff --git a/examples/recognition/basic_sync.py b/examples/recognition/basic_sync.py index be81e35..6744f1d 100644 --- a/examples/recognition/basic_sync.py +++ b/examples/recognition/basic_sync.py @@ -84,7 +84,6 @@ def play(self, filename, codec="linear"): def main(socket: ws.WebSocket, client: Recognizer, stream: AudioStreamer): - # Shortcuts send = socket.send diff --git a/examples/recognition/desktop-bot_async.py b/examples/recognition/desktop-bot_async.py index a6a0c2b..fc52ae5 100644 --- a/examples/recognition/desktop-bot_async.py +++ b/examples/recognition/desktop-bot_async.py @@ -116,7 +116,6 @@ async def demo_multi(self): await say("je vous passe le services des abonnés") async def scenario(self): - # Shortcuts say = self.say diff --git a/examples/recognition/desktop-bot_sync.py b/examples/recognition/desktop-bot_sync.py index 08ff518..71fa565 100644 --- a/examples/recognition/desktop-bot_sync.py +++ b/examples/recognition/desktop-bot_sync.py @@ -13,9 +13,9 @@ def set_defaults(self): speech_language="fr", no_input_timeout=5000, 
recognition_timeout=20000, - speech_complete_timeout=800, - speech_incomplete_timeout=1200, - speech_nomatch_timeout=3000, + speech_complete_timeout=1000, + speech_incomplete_timeout=2000, + speech_nomatch_timeout=4000, ) # Define grammars up front @@ -66,8 +66,17 @@ def demo_address(self): recognition_mode="hotword", ) addr["zipcode"] = nlu.value - formatted = f"j'ai compris {addr['number'] or ''} {addr['street'] or ''} {addr['zipcode'] or ''} {addr['city'] or ''}" - say(formatted) + say("J'ai compris") + if addr["number"]: + say(f"numéro : {addr['number']}") + if addr["street"]: + say(f"voie : {addr['street']}") + if addr["zipcode"]: + say(f"code postal : {addr['zipcode']}") + if addr["city"]: + say(f"ville : {addr['city']}") + if addr["complement"]: + say(f"complément d'adresse : {addr['complement']}") confirm = self.confirm( "Est-ce correct?", ) @@ -160,7 +169,6 @@ def demo_date(self): say("J'ai compris, mais ce n'est pas une date valide") def scenario(self): - # Scenario self.set_defaults() self.wait_activation() diff --git a/examples/recognition/fixtures/fr_address.test b/examples/recognition/fixtures/fr_address.test index 674a07d..2bfc7d9 100644 --- a/examples/recognition/fixtures/fr_address.test +++ b/examples/recognition/fixtures/fr_address.test @@ -31,3 +31,4 @@ number = "37" street = "rue du docteur leroy" zipcode = "72000" city = "le mans" +complement = "" diff --git a/examples/recognition/sync_bot_lib.py b/examples/recognition/sync_bot_lib.py index a50ffa3..bd784ec 100644 --- a/examples/recognition/sync_bot_lib.py +++ b/examples/recognition/sync_bot_lib.py @@ -27,7 +27,6 @@ class Bot: - TTF_CACHE: Dict[str, bytes] = {} def __init__(self, google_ttf_key): @@ -108,7 +107,6 @@ def confirm(self, text: str) -> bool: return res.value def run(self, uhlive_client: str, uhlive_secret: str): - auth_url, auth_params = build_authentication_request( uhlive_client, uhlive_secret ) diff --git a/examples/recognition/transcribe.py b/examples/recognition/transcribe.py 
index 0c922ce..17c0222 100644 --- a/examples/recognition/transcribe.py +++ b/examples/recognition/transcribe.py @@ -26,7 +26,6 @@ def main( codec: str, filepath: str, ): - # Shortcuts send = socket.send diff --git a/mkdocs.yml b/mkdocs.yml new file mode 100644 index 0000000..f520721 --- /dev/null +++ b/mkdocs.yml @@ -0,0 +1,32 @@ +site_name: Uh!ive python SDK + +theme: + name: "mkdocs" + +extra_css: + - css/mkdocstrings.css + +plugins: + - search + - mkdocstrings: + default_handler: python + handlers: + python: + paths: [src] + options: + show_signature_annotations: true + group_by_category: true + show_category_heading: false + inherited_members: true + members_order: source + docstring_section_style: "list" + signature_crossrefs: true + separate_signature: true + line_length: 110 + merge_init_into_class: true + +nav: + - Home: index.md + - Auth: auth.md + - H2H API: conversation_api.md + - H2B API: recognition_api.md diff --git a/requirements.txt b/requirements.txt new file mode 100644 index 0000000..2a1db3f --- /dev/null +++ b/requirements.txt @@ -0,0 +1,54 @@ +aiohttp==3.8.4 +aiosignal==1.3.1 +async-timeout==4.0.2 +attrs==23.1.0 +Babel==2.14.0 +black==23.12.1 +cachetools==5.3.2 +certifi==2023.5.7 +cffi==1.15.1 +chardet==5.2.0 +charset-normalizer==3.1.0 +click==8.1.7 +colorama==0.4.6 +distlib==0.3.8 +filelock==3.13.1 +frozenlist==1.3.3 +ghp-import==2.1.0 +griffe==0.38.1 +idna==3.4 +Jinja2==3.1.2 +Markdown==3.5.1 +MarkupSafe==2.1.3 +mergedeep==1.3.4 +mkdocs==1.5.3 +mkdocs-autorefs==0.5.0 +mkdocs-material==9.5.3 +mkdocs-material-extensions==1.3.1 +mkdocstrings==0.24.0 +mkdocstrings-python==1.7.5 +multidict==6.0.4 +mypy-extensions==1.0.0 +packaging==23.2 +paginate==0.5.6 +pathspec==0.12.1 +platformdirs==4.1.0 +pluggy==1.3.0 +pycparser==2.21 +Pygments==2.17.2 +pymdown-extensions==10.7 +pyproject-api==1.6.1 +python-dateutil==2.8.2 +PyYAML==6.0.1 +pyyaml_env_tag==0.1 +regex==2023.12.25 +requests==2.31.0 +six==1.16.0 +sounddevice==0.4.6 +toml==0.10.2 +tox==4.11.4 
+urllib3==2.0.3 +virtualenv==20.25.0 +watchdog==3.0.0 +websocket-client==1.6.1 +yarl==1.9.2 diff --git a/setup.py b/setup.py index df855ff..593c250 100644 --- a/setup.py +++ b/setup.py @@ -5,11 +5,11 @@ setup( name="uhlive", - version="1.3.0", + version="1.3.1", url="https://github.com/uhlive/python-sdk", author="Allo-Media", author_email="support@allo-media.fr", - description="Python bindings for the Uh!ive API", + description="Python bindings for the Uh!ive APIs", long_description=long_description, long_description_content_type="text/markdown", license="MIT", diff --git a/src/uhlive/py.typed b/src/uhlive/py.typed new file mode 100644 index 0000000..e69de29 diff --git a/src/uhlive/stream/conversation/__init__.py b/src/uhlive/stream/conversation/__init__.py index 8980f3e..75b050e 100644 --- a/src/uhlive/stream/conversation/__init__.py +++ b/src/uhlive/stream/conversation/__init__.py @@ -1,21 +1,133 @@ +""" +The Stream Conversation SDK API for human to human interactions. + +This API is used to consume a real-time audio stream and get enriched transcription events. + +The protocol is messages based and uses websockets as transport. You are free to use whatever websocket client library you like to communicate +with the API, and use our SDK to encode/decode the messages. + +## Quickstart + +First retrieve a one time access token with the [Auth API](auth.md). + +Then use that token to build an authenticated URL, open a websocket connection to it with the websocket client library +of your choice and instanciate a [`Conversation`][uhlive.stream.conversation.Conversation] to join a conversation, generate +audio stream messages and decode transcription and enrichment events. + +As the API is asynchronous, streaming the audio and reading the returned events should be done in two different threads/tasks. 
+ +```python +from uhlive.stream.Conversation import * + +stream_h2h_url = build_conversation_url(token) + +# The subcripttion identifier was given to you with your other credentials +# the conversation id can be any string you like. If a conversation by that name already exists in your subscription identifier domain +# it will join it as a new speaker, otherwise it will create it and join the speaker in. +# The speaker id helps you identify who is speaking. +conversation = Conversation("subscription_identifier", "a_conversation_id", "a_speaker_id") +``` + +Now you can connect and interact with the API: + +Synchronous example: + +```python +import websocket as ws + +socket = ws.create_connection(stream_h2h_url, timeout=10) +socket.send( + conversation.join( + model="fr", + interim_results=False, + rescoring=True, + origin=int(time.time() * 1000), + country="fr", + ) +) +# check we didn't get an error on join +reply = conversation.receive(socket.recv()) +assert isinstance(reply, Ok) + +``` + +Asynchronous example: + +```python +from aiohttp import ClientSession + +async def main(uhlive_client, uhlive_secret): + async with ClientSession() as session: + async with session.ws_connect(stream_h2h_url) as socket: + await socket.send_str( + conversation.join( + model="fr", + interim_results=False, + rescoring=True, + origin=int(time.time() * 1000), + country="fr", + ) + ) + # check we didn't get an error on join + msg = await socket.receive() + reply = conversation.receive(msg.data) + assert isinstance(reply, Ok) +``` + +As you can see, the I/O is cleanly decoupled from the protocol handling: the `Conversation` object is only used +to create the messages to send to the API and to decode the received messages as `Event` objects. + +See the [complete examples in the source distribution](https://github.com/uhlive/python-sdk/tree/main/examples/conversation). 
+""" + + import os from urllib.parse import urljoin -from .client import Conversation, ProtocolError # noqa -from .events import ( # noqa +from .client import Conversation, ProtocolError +from .events import ( EntityFound, + EntityReference, Event, Ok, RelationFound, SegmentDecoded, + SpeakerJoined, SpeakerLeft, SpeechDecoded, + Tag, + TagsFound, Unknown, + Word, WordsDecoded, ) SERVER = os.getenv("UHLIVE_API_URL", "wss://api.uh.live") -def build_conversation_url(token): +def build_conversation_url(token: str) -> str: + """ + Make an authenticated URL to connect to the Conversation Service. + """ return urljoin(SERVER, "socket/websocket") + f"?jwt={token}&vsn=2.0.0" + + +__all__ = [ + "build_conversation_url", + "Conversation", + "ProtocolError", + "SpeakerJoined", + "Word", + "EntityFound", + "Event", + "Ok", + "EntityReference", + "RelationFound", + "SegmentDecoded", + "SpeakerLeft", + "SpeechDecoded", + "Unknown", + "WordsDecoded", + "Tag", + "TagsFound", +] diff --git a/src/uhlive/stream/conversation/client.py b/src/uhlive/stream/conversation/client.py index 8bcf436..4c9effa 100644 --- a/src/uhlive/stream/conversation/client.py +++ b/src/uhlive/stream/conversation/client.py @@ -1,6 +1,12 @@ +""" +Object oriented abstraction over the Conversation API protocol and workflow. +""" + + import json from array import array -from typing import Any, Callable, Dict, Type, TypeVar +from enum import Enum +from typing import Any, Dict, Union from .events import Event, Ok, SpeakerLeft @@ -34,67 +40,84 @@ class ProtocolError(RuntimeError): + """Exception raised when a [Conversation][uhlive.stream.conversation.Conversation] method is not available in the current state.""" + pass -class State: - """Protocol state. +class State(Enum): + """Protocol state.""" - Protocol states implement and document the available commands. - User code should not use the State directly but use a `Conversation` - object instead, and call the prococol methods on it. 
- """ + Idle = "Idle State" + Joined = "Joined State" - def __init__(self, context: "Conversation") -> None: - self.context = context - def command(self, name: str, payload: Dict[str, Any] = {}) -> str: - message = [ - S_JOIN_REF, - self.context.request_id, - self.context.topic, - name, - payload, - ] - return json.dumps( - message, ensure_ascii=False, indent=None, separators=(",", ":") - ) +class Conversation: + """To join a conversation on the API, you need a `Conversation` object. + + You can only have one `Conversation` per connection (socket) otherwise you risk + unexpected behavior (and exceptions!). + """ + def __init__(self, identifier: str, conversation_id: str, speaker: str) -> None: + """Create a `Conversation`. + + Args: + identifier: is the identifier you got when you subscribed to the service; + conversation_id: is the conversation you wish to join, + speaker: is your alias in the conversation, to identify you and your events + """ + self._state: State = State.Idle + self.identifier = identifier + self.topic = f"conversation:{self.identifier}@{conversation_id}" + self.topic_bin = self.topic.encode("utf-8") + self.topic_len = len(self.topic_bin) + self.speaker = speaker + self._request_id = int(S_JOIN_REF) - 1 -class IdleState(State): def join( self, - model="fr", - country="fr", + model: str = "fr", + country: str = "fr", readonly: bool = False, - interim_results=True, - rescoring=True, - origin=0, - audio_codec="linear", + interim_results: bool = True, + rescoring: bool = True, + origin: int = 0, + audio_codec: str = "linear", ) -> str: """Join the conversation. - * ``readonly``: if you are not going to stream audio, set it to ``True``. - * ``speaker``: your alias in the conversation, to identify you and your events. - * ``model``: (if ``readonly`` is ``False``) the ASR language model to be use to recognize + Args: + readonly: if you are not going to stream audio, set it to `True`. 
+ model: (if `readonly` is `False`) the ASR language model to be use to recognize the audio you will stream. - * ``country``: the iso 2 letter country code of the place where the speaker is. - * ``interim_results``: (``readonly`` = ``False`` only) should the ASR trigger interim result events? - * ``rescoring``: (``readonly`` = ``False`` only) should the ASR refine the final segment + country: the iso 2 letter country code of the place where the speaker is. + interim_results: (`readonly` = `False` only) should the ASR trigger interim result events? + rescoring: (`readonly` = `False` only) should the ASR refine the final segment with a bigger Language Model? May give slightly degraded results for very short segments. - * ``codec``: the speech audio codec of the audio data: - - ``"linear"``: (default) linear 16 bit SLE raw PCM audio at 8khz; - - ``"g711a"``: G711 a-law audio at 8khz; - - ``"g711u"``: G711 μ-law audio at 8khz. + origin: The UNIX time, in milliseconds, to which the event timeline origin is set. + audio_codec: the speech audio codec of the audio data: + + - `"linear"`: (default) linear 16 bit SLE raw PCM audio at 8khz; + - `"g711a"`: G711 a-law audio at 8khz; + - `"g711u"`: G711 μ-law audio at 8khz. + + Returns: + The text websocket message to send to the server. + + Raises: + ProtocolError: if still in a previously joined conversation. """ + if self._state != State.Idle: + raise ProtocolError("Can't join twice!") if not readonly and not model: raise ProtocolError("If readonly is False, you must specify a model!") return self.command( "phx_join", { "readonly": readonly, - "speaker": self.context.speaker, + "speaker": self.speaker, "model": model, "country": country, "interim_results": interim_results, @@ -104,78 +127,77 @@ def join( }, ) - -class JoinedState(State): def leave(self) -> str: """Leave the current conversation. 
It's a good idea to leave a conversation and continue to consume messages - until you receive a SpeakerLeft event for your speaker, before you + until you receive a [`SpeakerLeft`][uhlive.stream.conversation.SpeakerLeft] event for your speaker, before you close the connection. Otherwise, you may miss parts of the transcription. + + Returns: + The text websocket message to send to the server. + + Raises: + ProtocolError: if not currently in a converstation. """ + if self._state != State.Joined: + raise ProtocolError("No conversation to leave!") return self.command("phx_leave", {}) def send_audio_chunk(self, chunk: bytes) -> bytes: - """Send an audio chunk (when streaming.).""" - ref = self.context.request_id.encode("ascii") - message = array("B", [0, 1, len(ref), self.context.topic_len, 11, B_JOIN_REF]) + """Build an audio chunk for streaming. + + Returns: + The binary websocket message to send to the server. + Raises: + ProtocolError: if not currently in a converstation. + """ + if self._state != State.Joined: + raise ProtocolError("Not in a conversation!") + ref = self.request_id.encode("ascii") + message = array("B", [0, 1, len(ref), self.topic_len, 11, B_JOIN_REF]) message.extend(ref) - message.extend(self.context.topic_bin) + message.extend(self.topic_bin) message.extend(b"audio_chunk") message.extend(chunk) return message.tobytes() - -S = TypeVar("S", bound=State) - - -class Conversation: - """To join a conversation on the API, you need a `Conversation` object. - - You can only have one `Conversation` per connection (socket) otherwise you risk - unexpected behavior (and exceptions!). - """ - - def __init__(self, identifier: str, conversation_id: str, speaker: str) -> None: - """Create a `Conversation`. 
- - `identifier` is the identifier you got when you subscribed to the service; - - `conversation_id` is the conversation you wish to join, - - `speaker` is your alias in the conversation, to identify you and your events - """ - self._state: State = IdleState(self) - self.identifier = identifier - self.topic = f"conversation:{self.identifier}@{conversation_id}" - self.topic_bin = self.topic.encode("utf-8") - self.topic_len = len(self.topic_bin) - self.speaker = speaker - self._request_id = int(S_JOIN_REF) - 1 - - def __getattr__(self, name: str) -> Callable: - meth = getattr(self._state, name, None) - if meth is None: - raise ProtocolError(f"no method '{name}' in this state!") - return meth - - def transition(self, state: Type[S]) -> None: - self._state = state(self) - @property def request_id(self) -> str: self._request_id += 1 return str(self._request_id) - def receive(self, data: str) -> Event: - """Decode received text frame""" + def receive(self, data: Union[str, bytes]) -> Event: + """Decode received websocket message. + + The server only sends text messages. + + Returns: + The appropriate [Event][uhlive.stream.conversation.Event] subclass instance. + """ event = Event.from_message(json.loads(data)) assert ( event.topic == self.topic ), "Topic mismatch! Are you trying to mix several conversations on the same socket? This is not supported." 
if isinstance(event, Ok) and event.ref == event.join_ref: - self.transition(JoinedState) + self._state = State.Joined elif isinstance(event, SpeakerLeft) and event.speaker == self.speaker: - self.transition(IdleState) + self._state = State.Idle return event @property def left(self): - return isinstance(self._state, IdleState) + """Did the server confirm we left the conversation?""" + return self._state == State.Idle + + def command(self, name: str, payload: Dict[str, Any] = {}) -> str: + message = [ + S_JOIN_REF, + self.request_id, + self.topic, + name, + payload, + ] + return json.dumps( + message, ensure_ascii=False, indent=None, separators=(",", ":") + ) diff --git a/src/uhlive/stream/conversation/events.py b/src/uhlive/stream/conversation/events.py index 81f9a35..62cfe2b 100644 --- a/src/uhlive/stream/conversation/events.py +++ b/src/uhlive/stream/conversation/events.py @@ -1,7 +1,7 @@ """Event definitions.""" import re -from typing import List +from typing import Any, List from .error import UhliveError from .human_datetime import human_datetime @@ -14,27 +14,28 @@ class Word(dict): """Timestamped word.""" @property - def start(self): + def start(self) -> int: """Start time as Unix timestamp in millisecond, according to audio timeline.""" return self["start"] @property - def end(self): + def end(self) -> int: """End time as Unix timestamp in millisecond, according to audio timeline.""" return self["end"] @property - def length(self): + def length(self) -> int: """Word length in millisecond, according to audio timeline.""" return self["length"] @property - def word(self): + def word(self) -> str: """Transcript token string for this word.""" return self["word"] @property - def confidence(self): + def confidence(self) -> float: + """ASR confidence for this word.""" return self["confidence"] @@ -47,23 +48,24 @@ def __init__(self, join_ref, ref, topic, event, payload) -> None: self._topic = topic self._payload = payload - def __repr__(self): + def __repr__(self) -> 
str: return f"{self.__class__.__name__}(payload={self._payload})" @property - def topic(self): + def topic(self) -> str: + """The conversation identifier""" return self._topic @property - def join_ref(self): + def join_ref(self) -> str: return self._join_ref @property - def ref(self): + def ref(self) -> str: return self._ref @property - def speaker(self): + def speaker(self) -> str: """The speaker whose speech triggered this event. All events are relative to a speaker.""" @@ -87,15 +89,19 @@ def from_message(message): class Ok(Event): + """API asynchronous command aknowledgements.""" + pass class Unknown(Event): + """The server emitted an event unkown to this SDK. Time to upgrade!""" + def __init__(self, join_ref, ref, topic, event, payload): self._name = event super().__init__(join_ref, ref, topic, event, payload) - def __repr__(self): + def __repr__(self) -> str: return f"Unknown [{self._name}](payload={self._payload})" @@ -103,17 +109,17 @@ class TimeScopedEvent(Event): """Base class for events that are anchored to the audio time line.""" @property - def start(self): + def start(self) -> int: """Start time as Unix timestamp in millisecond, according to audio timeline.""" return self._payload["start"] @property - def end(self): + def end(self) -> int: """End time as Unix timestamp in millisecond, according to audio timeline.""" return self._payload["end"] @property - def length(self): + def length(self) -> int: """Event length in millisecond, according to audio timeline.""" return self._payload["length"] @@ -122,12 +128,12 @@ class SpeechDecoded(TimeScopedEvent): """The base class of all transcription events.""" @property - def transcript(self): + def transcript(self) -> str: """Get the transcript of the whole segment as a string""" return self._payload["transcript"] @property - def lang(self): + def lang(self) -> str: """Natural Language of the speech. As ISO 639-1 code. 
@@ -135,7 +141,7 @@ def lang(self): return self._payload["lang"] @property - def country(self): + def country(self) -> str: """Country location of speaker. As ISO 3166-1 code. @@ -143,20 +149,21 @@ def country(self): return self._payload["country"] @property - def utterance_id(self): + def utterance_id(self) -> str: """The Utterance id identifies the speech utterance this event transcribes.""" return self._payload["utterance_id"] @property - def words(self): - """Get the transcript of the whole segment as a list of timestamped words.""" + def words(self) -> List[Word]: + """Get the transcript of the whole segment as a list of timestamped [words][uhlive.stream.conversation.Word].""" return [Word(w) for w in self._payload["words"]] - def __str__(self): + def __str__(self) -> str: return f"[{self.speaker} — {human_datetime(self.start)}] {self.transcript}" @property - def confidence(self): + def confidence(self) -> float: + """The ASR confidence for this segment.""" return self._payload["confidence"] @@ -175,25 +182,30 @@ class SegmentDecoded(SpeechDecoded): class SegmentNormalized(SpeechDecoded): """Normalized final segment event.""" - def __str__(self): + def __str__(self) -> str: return f"[{self.speaker} — Formatted] {self.transcript}" class SpeakerJoined(Event): + """A new speaker joined the conversation (after us).""" + @property - def timestamp(self): + def timestamp(self) -> int: + """The UNIX time when the speaker joined the conversation.""" return self._payload["timestamp"] @property - def interim_results(self): + def interim_results(self) -> bool: + """Are interim results activated for this speaker?""" return self._payload["interim_results"] @property - def rescoring(self): + def rescoring(self) -> bool: + """Is rescoring enabled for this speaker?""" return self._payload["rescoring"] @property - def lang(self): + def lang(self) -> str: """Natural Language of the speech. As ISO 639-1 code. 
@@ -201,7 +213,7 @@ def lang(self): return self._payload["lang"] @property - def country(self): + def country(self) -> str: """Country location of speaker. As ISO 3166-1 code. @@ -213,7 +225,8 @@ class SpeakerLeft(Event): """Event emitted by the associated speaker when they left the conversation.""" @property - def timestamp(self): + def timestamp(self) -> int: + """UNIX time when the speaker left the conversation.""" return self._payload["timestamp"] @@ -227,11 +240,12 @@ def __init__(self, join_ref, ref, topic, event, payload): super().__init__(join_ref, ref, topic, event, payload) @property - def entity_name(self): + def entity_name(self) -> str: + """The name of the named entity found.""" return self._name @property - def lang(self): + def lang(self) -> str: """Natural Language of the interpretation. As ISO 639-1 code. @@ -239,7 +253,7 @@ def lang(self): return self._payload["lang"] @property - def country(self): + def country(self) -> str: """Country location of speaker. As ISO 3166-1 code. @@ -247,17 +261,17 @@ def country(self): return self._payload["country"] @property - def canonical(self): + def canonical(self) -> str: """The well formatted form of the entity in the language (string).""" return self._payload["annotation"].get("canonical") @property - def original(self): + def original(self) -> str: """The transcript excerpt that was interpreted, as string.""" return self._payload["annotation"]["original"] @property - def value(self): + def value(self) -> Any: """The interpreted value in machine understandable form. The exact type depends on the entity. 
@@ -265,10 +279,11 @@ def value(self): return self._payload["annotation"].get("value") @property - def confidence(self): + def confidence(self) -> float: + """The confidence of the interpretation.""" return self._payload["confidence"] - def __repr__(self): + def __repr__(self) -> str: return " ".join( ( " - ", @@ -280,8 +295,12 @@ def __repr__(self): class Tag: + """A tag represents a behavioral feature found in the conversation.""" + uuid: str + """The unique id of the Tag.""" label: str + """The human readable name of the Tag.""" def __init__(self, uuid: str, label: str) -> None: self.uuid = uuid @@ -292,8 +311,10 @@ def __repr__(self) -> str: class TagsFound(TimeScopedEvent): + """One or more tags were found on this time range.""" + @property - def lang(self): + def lang(self) -> str: """Natural Language of the interpretation. As ISO 639-1 code. @@ -301,7 +322,7 @@ def lang(self): return self._payload["lang"] @property - def country(self): + def country(self) -> str: """Country location of speaker. As ISO 3166-1 code. 
@@ -309,11 +330,13 @@ def country(self): return self._payload["country"] @property - def confidence(self): + def confidence(self) -> float: + """Tagger confidence.""" return self._payload["confidence"] @property def tags(self) -> List[Tag]: + """The [tags][uhlive.stream.conversation.Tag] that were found on this time range""" return [Tag(t["uuid"], t["label"]) for t in self._payload["annotation"]["tags"]] def __repr__(self): @@ -321,11 +344,14 @@ def __repr__(self): class EntityReference: - """Reference to unique Entity in conversation.""" + """Reference to a unique previously found Entity in the conversation.""" kind: str + """The name of the `Entity` referenced.""" speaker: str + """The speaker identifier.""" start: int + """The UNIX start time of the referenced `Entity`.""" def __init__(self, entity_name: str, speaker: str, start: int) -> None: self.kind = entity_name @@ -337,7 +363,10 @@ def __repr__(self) -> str: class RelationFound(TimeScopedEvent): - """The class for all Relation events.""" + """The class for all Relation events. + + Relations express a semantic relationship between two or more entities. + """ def __init__(self, join_ref, ref, topic, event, payload): self._name = RELATION_NAME.match(event).group( @@ -346,11 +375,12 @@ def __init__(self, join_ref, ref, topic, event, payload): super().__init__(join_ref, ref, topic, event, payload) @property - def relation_name(self): + def relation_name(self) -> str: + """The type of the relation.""" return self._name @property - def lang(self): + def lang(self) -> str: """Natural Language of the interpretation. As ISO 639-1 code. 
@@ -358,21 +388,25 @@ def lang(self): return self._payload["lang"] @property - def confidence(self): + def confidence(self) -> float: + """The confidence on the discovered relationship.""" return self._payload["confidence"] @property - def members(self): + def members(self) -> List[EntityReference]: + """[References to the Entities][uhlive.stream.conversation.EntityReference] involved in this relationship.""" m = [] speaker = self.speaker print(self._payload) for ref in self._payload["members"]: - kind = ENTITY_NAME.match(ref["entity"]).group(1) if ref["entity"] else None + kind = ( + ENTITY_NAME.match(ref["entity"]).group(1) if ref["entity"] else None # type: ignore + ) if kind is not None: m.append(EntityReference(kind, speaker, ref["start"])) return m - def __repr__(self): + def __repr__(self) -> str: return f"{self.__class__.__name__} <{self._name}> for {self.members} [confidence: {self.confidence:.2f}]" diff --git a/src/uhlive/stream/conversation/human_datetime.py b/src/uhlive/stream/conversation/human_datetime.py index c87fdc4..b0771f0 100644 --- a/src/uhlive/stream/conversation/human_datetime.py +++ b/src/uhlive/stream/conversation/human_datetime.py @@ -1,9 +1,14 @@ +""" +Display helpers. +""" + from datetime import datetime local_tz = datetime.now().astimezone().tzinfo def human_datetime(timestamp): + """Human readable representation of unix timestamp date.""" utc_dt = datetime.fromtimestamp(timestamp / 1000.0, local_tz) return utc_dt.isoformat(sep=" ") diff --git a/src/uhlive/stream/recognition/__init__.py b/src/uhlive/stream/recognition/__init__.py index 90deffd..ad983ce 100644 --- a/src/uhlive/stream/recognition/__init__.py +++ b/src/uhlive/stream/recognition/__init__.py @@ -1,11 +1,136 @@ +""" +The stream recognition API SDK for voice bots. 
+
+Stream for voicebots, or Stream Human to Bots, or Stream H2B is a set of API enabling clients to build
+interaction between a human end-user and a bot, for example to create Interactive Voice Response (IVR)
+on the phone, or a voicebot within an app.
+
+For an overview of the concepts, protocols and workflow, see the
+[higher level documentation](https://docs.allo-media.net/stream-h2b/#real-time-stream-api-for-voicebots) and
+more specifically the [Websocket H2B protocol reference](https://docs.allo-media.net/stream-h2b/protocols/websocket/#websocket-for-voicebots).
+
+The protocol is message based and uses websockets as transport. You are free to use whatever websocket client library you like to communicate
+with the API, and use our SDK to encode/decode the messages.
+
+## Quickstart
+
+First retrieve a one time access token with the [Auth API](auth.md).
+
+Then use that token to build an authenticated URL, open a websocket connection to it with the websocket client library
+of your choice and instantiate a [`Recognizer`][uhlive.stream.recognition.Recognizer] to make requests, generate
+audio stream messages and decode responses.
+
+As the API is asynchronous, streaming the audio and reading the returned events should be done in two different threads/tasks.
+
+```python
+from uhlive.stream.recognition import *
+
+stream_h2b_url, stream_h2b_headers = build_connection_request(token)
+recognizer = Recognizer()
+```
+
+Now you can connect and interact with the API:
+
+Synchronous example:
+
+```python
+import websocket as ws
+
+socket = ws.create_connection(stream_h2b_url, header=stream_h2b_headers)
+socket.send(recognizer.open())
+# Check if successful
+reply = recognizer.receive(socket.recv())
+assert isinstance(reply, Opened), f"Expected Opened, got {reply}"
+# start streaming the user's voice in another thread
+streamer_thread_handle = stream_mic(socket, recognizer)
+```
+
+Asynchronous example:
+
+```python
+from aiohttp import ClientSession
+
+    async with ClientSession() as session:
+        async with session.ws_connect(stream_h2b_url, headers=stream_h2b_headers) as socket:
+
+            # Open a session
+            # Commands are sent as text frames
+            await socket.send_str(recognizer.open())
+            # Check if successful
+            msg = await socket.receive()
+            reply = recognizer.receive(msg.data)
+            assert isinstance(reply, Opened), f"Expected Opened, got {reply}"
+            # start streaming the user's voice in another task
+            streamer_task_handle = asyncio.create_task(stream(socket, recognizer))
+```
+
+As you can see, the I/O is cleanly decoupled from the protocol handling: the `Recognizer` object is only used
+to create the messages to send to the API and to decode the received messages as `Event` objects.
+
+See the [complete examples in the source distribution](https://github.com/uhlive/python-sdk/tree/main/examples/recognition).
+ +""" + import os +from typing import Tuple from urllib.parse import urljoin -from .client import ProtocolError, Recognizer # noqa -from .events import * # noqa +from .client import ProtocolError, Recognizer +from .events import ( + Closed, + CompletionCause, + DefaultParams, + Event, + GrammarDefined, + InputTimersStarted, + Interpretation, + InvalidParamValue, + MethodFailed, + MethodNotAllowed, + MethodNotValid, + MissingParam, + Opened, + ParamsSet, + RecognitionComplete, + RecognitionInProgress, + RecogResult, + StartOfInput, + Stopped, + Transcript, +) SERVER = os.getenv("UHLIVE_API_URL", "wss://api.uh.live") -def build_connection_request(token): +def build_connection_request(token) -> Tuple[str, dict]: + """ + Make an authenticated URL and header to connect to the H2B Service. + """ return urljoin(SERVER, "/bots"), {"Authorization": f"bearer {token}"} + + +__all__ = [ + "ProtocolError", + "Recognizer", + "build_connection_request", + "Event", + "CompletionCause", + "Transcript", + "Interpretation", + "RecogResult", + "Opened", + "ParamsSet", + "DefaultParams", + "GrammarDefined", + "RecognitionInProgress", + "InputTimersStarted", + "Stopped", + "Closed", + "StartOfInput", + "RecognitionComplete", + "MethodNotValid", + "MethodFailed", + "InvalidParamValue", + "MissingParam", + "MethodNotAllowed", +] diff --git a/src/uhlive/stream/recognition/client.py b/src/uhlive/stream/recognition/client.py index bae6e56..4ab1c80 100644 --- a/src/uhlive/stream/recognition/client.py +++ b/src/uhlive/stream/recognition/client.py @@ -1,8 +1,9 @@ """ -I/O free Connection state machine. +Object oriented abstraction over the H2B API protocol and workflow. 
""" import json -from typing import Any, Callable, Dict, Type, TypeVar +from enum import Enum +from typing import Any, Dict, Union from .events import ( Closed, @@ -20,85 +21,127 @@ def serialize(cmd: Dict[str, Any]) -> str: class ProtocolError(RuntimeError): + """Exception raised when a [Recognizer][uhlive.stream.recognition.Recognizer] method is not available in the current state.""" + pass -class State: - """Protocol state. +class State(Enum): + """Protocol state.""" - Protocol states implement and document the available commands. - User code should not use the State directly but use a `Recognizer` - object instead, and call the prococol methods on it. - """ + NoSession = "Out of Session State" + IdleSession = "Idle Session State" + Recognition = "On-going Recognition State" - def __init__(self, context: "Recognizer") -> None: - self.context = context - def command(self, name: str, headers: Dict[str, Any] = {}, body: str = "") -> str: - return serialize( - { - "command": name, - "request_id": self.context.request_id, - "channel_id": self.context.channel_id, - "headers": headers, - "body": body, - } - ) +class Recognizer: + """The connection state machine. + + Use this class to decode received frames as `Event`s or to + make command frames by calling the appropriate methods. + If you call a method that is not appropriate in the current protocol + state, a `ProtocolError` is raised. + """ + def __init__(self) -> None: + self._state: State = State.NoSession + self._request_id = 0 + self._channel_id = "" + + # Workflow methods -class NoSessionState(State): def open( self, custom_id: str = "", channel_id: str = "", audio_codec: str = "linear" ) -> str: - """Open a new Samosa session. + """Open a new H2B session. - ``custom_id`` is any reference of yours that you want to appear in the logs - and invoice reports. + Args: + custom_id: is any reference of yours that you want to appear in the logs + and invoice reports. 
+ channel_id: when provided, it'll be used as a prefix for + the actual channel ID generated by the server. + audio_codec: the speech audio codec of the audio data: - If a ``channel_id`` is provided, it'll be used as a prefix for - the actual channel ID generated by the server. + - `"linear"`: (default) linear 16 bit SLE raw PCM audio at 8khz; + - `"g711a"`: G711 a-law audio at 8khz; + - `"g711u"`: G711 μ-law audio at 8khz. - ``codec`` is the speech audio codec of the audio data: - - ``"linear"``: (default) linear 16 bit SLE raw PCM audio at 8khz; - - ``"g711a"``: G711 a-law audio at 8khz; - - ``"g711u"``: G711 μ-law audio at 8khz. + Returns: + A websocket text message to send to the server. + Raises: + ProtocolError: if a session is already open. """ + if self._state != State.NoSession: + raise ProtocolError("Session already opened!") return serialize( { "command": "OPEN", - "request_id": self.context.request_id, + "request_id": self.request_id, "channel_id": channel_id, "headers": {"custom_id": custom_id, "audio_codec": audio_codec}, "body": "", } ) - -class IdleSessionState(State): def send_audio_chunk(self, chunk: bytes) -> bytes: - """Build an audio chunk frame for streaming.""" + """Build an audio chunk frame for streaming. + + Returns: + A websocket binary message to send to the server. + + Raises: + ProtocolError: if no session opened. + """ + if self._state == State.NoSession: + raise ProtocolError("You must open a session first!") return chunk def set_params(self, **params: Any) -> str: """Set default ASR parameters for the session. - See https://docs.allo-media.net/live-api/robots-humans-protocol/#set-session-defaults + See [the parameter list](https://docs.allo-media.net/stream-h2b/protocols/websocket/#set-session-defaults) + and [the parameter visual explanations](https://docs.allo-media.net/stream-h2b/input/#resource-headers) for an explanation of the different parameters available. - """ + Returns: + A websocket text message to send to the server. 
+ + Raises: + ProtocolError: if no session opened. + """ + if self._state != State.IdleSession: + raise ProtocolError(f"Method not available in this state ({self._state})!") return self.command("SET-PARAMS", params) def get_params(self) -> str: - """Retrieve the default values for the ASR parameters.""" + """Retrieve the default values for the ASR parameters. + + Returns: + A websocket text message to send to the server. + + Raises: + ProtocolError: if no session opened. + """ + if self._state != State.IdleSession: + raise ProtocolError(f"Method not available in this state ({self._state})!") return self.command("GET-PARAMS") def define_grammar(self, builtin: str, alias: str) -> str: """Define a grammar alias for a parameterized builtin. - `builtin`: the builtin URI to alias, including the query string, but without the "builtin:" prefix - `alias`: the alias, without the "session:" prefix. + Args: + builtin: the builtin URI to alias, including the query string, but without the "builtin:" prefix + alias: the alias, without the "session:" prefix. + + Returns: + A websocket text message to send to the server. + + Raises: + ProtocolError: if no session opened. """ + if self._state != State.IdleSession: + raise ProtocolError(f"Method not available in this state ({self._state})!") return self.command( "DEFINE-GRAMMAR", {"content_id": alias, "content_type": "text/uri-list"}, @@ -117,13 +160,19 @@ def recognize( This method takes grammar URIs as positional arguments, including the `builtin:` or `session:` prefixes to make the difference between builtin grammars and custom aliases. - Keywords arguments are also accepted: - - `start_timers`: default True - - `recognition_mode`: default is "normal" - - any other ASR parameter (no client side defaults). + Keyword Args: + start_timers: default True + recognition_mode: default is "normal" + **params: any other [ASR parameter](https://docs.allo-media.net/stream-h2b/protocols/websocket/#start-recognition) (no client side defaults). 
- See https://docs.allo-media.net/live-api/robots-humans-protocol/#start-recognition + Returns: + A websocket text message to send to the server. + + Raises: + ProtocolError: if no session opened. """ + if self._state != State.IdleSession: + raise ProtocolError(f"Method not available in this state ({self._state})!") return self.command( "RECOGNIZE", headers={ @@ -136,51 +185,46 @@ def recognize( ) def close(self) -> str: - """Close the current session.""" - return self.command("CLOSE") + """Close the current session. + Returns: + A websocket text message to send to the server. -class RecognitionState(State): - def send_audio_chunk(self, chunk: bytes) -> bytes: - """Build an audio chunk frame for streaming.""" - return chunk + Raises: + ProtocolError: if no session opened. + """ + if self._state != State.IdleSession: + raise ProtocolError(f"Method not available in this state ({self._state})!") + return self.command("CLOSE") def start_input_timers(self) -> str: """If the input timers were not started by the RECOGNIZE command, starts them now. + + Returns: + A websocket text message to send to the server. + + Raises: + ProtocolError: if no on-going recognition process """ + if self._state != State.Recognition: + raise ProtocolError("Command is only valid during recognition!") return self.command("START-INPUT-TIMERS") def stop(self) -> str: - """Stop ongoing recognition process""" - return self.command("STOP") - - -S = TypeVar("S", bound=State) - - -class Recognizer: - """The connection state machine. - - Use this class to decode received frames as `Event`s or to - make command frames by calling the appropriate methods. - If you call a method that is not appropriate in the current protocol - state, a `ProtocolError` is raised. - """ + """Stop ongoing recognition process - def __init__(self) -> None: - self._state: State = NoSessionState(self) - self._request_id = 0 - self._channel_id = "" + Returns: + A websocket text message to send to the server. 
- def __getattr__(self, name: str) -> Callable: - meth = getattr(self._state, name, None) - if meth is None: - raise ProtocolError(f"no method '{name}' in this state!") - return meth + Raises: + ProtocolError: if no on-going recognition process + """ + if self._state != State.Recognition: + raise ProtocolError("Command is only valid during recognition!") + return self.command("STOP") - def transition(self, state: Type[S]) -> None: - self._state = state(self) + ## @property def request_id(self) -> int: @@ -189,18 +233,37 @@ def request_id(self) -> int: @property def channel_id(self) -> str: + """The current session ID.""" return self._channel_id - def receive(self, data: str) -> Event: - """Decode received text frame""" + def receive(self, data: Union[str, bytes]) -> Event: + """Decode received text frame. + + The server always replies with text frames. + + Returns: + The appropriate `Event` subclass. + """ + assert type(data) is str # to please mypy event = deserialize(data) if isinstance(event, RecognitionInProgress): - self.transition(RecognitionState) + self._state = State.Recognition elif isinstance(event, (RecognitionComplete, Stopped)): - self.transition(IdleSessionState) + self._state = State.IdleSession elif isinstance(event, Opened): self._channel_id = event.channel_id - self.transition(IdleSessionState) + self._state = State.IdleSession elif isinstance(event, Closed): - self.transition(NoSessionState) + self._state = State.NoSession return event + + def command(self, name: str, headers: Dict[str, Any] = {}, body: str = "") -> str: + return serialize( + { + "command": name, + "request_id": self.request_id, + "channel_id": self.channel_id, + "headers": headers, + "body": body, + } + ) diff --git a/src/uhlive/stream/recognition/events.py b/src/uhlive/stream/recognition/events.py index 327e5a4..2aeb3c5 100644 --- a/src/uhlive/stream/recognition/events.py +++ b/src/uhlive/stream/recognition/events.py @@ -1,6 +1,7 @@ -"""Samosa Events. +"""H2B Events. 
-Data model for events returned by the Samosa server. +Data model for events returned by the H2B server. +See also https://docs.allo-media.net/stream-h2b/protocols/websocket/#websocket-for-voicebots. """ @@ -11,7 +12,10 @@ class CompletionCause(Enum): - """The set of possible completion cause""" + """The set of possible completion causes. + + See [all possible values](https://docs.allo-media.net/stream-h2b/protocols/websocket/#asynchronous-recognition-events). + """ GramDefinitionFailure = "GramDefinitionFailure" GramLoadFailure = "GramLoadFailure" @@ -38,18 +42,22 @@ def __init__(self, data: Dict[str, Any]) -> None: @property def transcript(self) -> str: + """The raw ASR output.""" return self._transcript @property def confidence(self) -> float: + """The ASR transcription confidence.""" return self._confidence @property def start(self) -> datetime: + """Start of speech.""" return self._start @property def end(self) -> datetime: + """End of speech.""" return self._end def __str__(self) -> str: @@ -66,6 +74,7 @@ def __init__(self, data: Dict[str, Any]) -> None: @property def confidence(self) -> float: + """The confidence of the interpretation.""" return self._confidence @property @@ -76,7 +85,10 @@ def type(self) -> str: @property def value(self) -> Dict[str, Any]: """The structured interpreted value. - The type/schema of the value is given by the `type` attribute + + The type/schema of the value is given by the `self.type` property. + + See the [Grammar reference documentaiton](https://docs.allo-media.net/stream-h2b/grammars). 
""" return self._value @@ -98,12 +110,12 @@ def __init__(self, data: dict) -> None: @property def asr(self) -> Optional[Transcript]: - """The ASR part of the result (transcription result)""" + """The ASR part of the result ([transcription][uhlive.stream.recognition.Transcript] result)""" return self._asr @property def nlu(self) -> Optional[Interpretation]: - """The NLU part of the result (interpretation)""" + """The NLU part of the result ([interpretation][uhlive.stream.recognition.Interpretation])""" return self._nlu @property @@ -134,26 +146,35 @@ def __init__(self, data: Dict[str, Any]) -> None: @property def request_id(self) -> int: + """The request ID that event responds to.""" return self._request_id @property def channel_id(self) -> str: + """The channel ID.""" return self._channel_id @property def headers(self) -> Dict[str, Any]: + """The response headers. + + See also the [header description](https://docs.allo-media.net/stream-h2b/output/#headers-%26-statuses). + """ return self._headers @property def completion_cause(self) -> Optional[CompletionCause]: + """The response [`CompletionCause`][uhlive.stream.recognition.CompletionCause].""" return self._completion_cause @property def completion_reason(self) -> Optional[str]: + """The completion message.""" return self._completion_reason @property def body(self) -> Optional[RecogResult]: + """The content of the Event is a [`RecogResult`][uhlive.stream.recognition.RecogResult] if it is a `RecognitionComplete` event.""" return self._body def __str__(self) -> str: @@ -161,10 +182,14 @@ def __str__(self) -> str: class Opened(Event): + """Session opened on the server""" + pass class ParamsSet(Event): + """The default parameters were set.""" + pass @@ -175,50 +200,74 @@ class DefaultParams(Event): class GrammarDefined(Event): + """The `DefineGrammar` command has been processed.""" + pass class RecognitionInProgress(Event): + """The ASR recognition is started.""" + pass class InputTimersStarted(Event): + """The Input 
Timers are started.""" + pass class Stopped(Event): + """The ASR recognition has been stopped on the client request.""" + pass class Closed(Event): + """The session is closed.""" + pass class StartOfInput(Event): + """In normal recognition mode, this event is emitted when speech is detected.""" + pass class RecognitionComplete(Event): + """The ASR recognition is complete.""" + pass class MethodNotValid(Event): + """The server received an invalid command.""" + pass class MethodFailed(Event): + """The server was unable to complete the command.""" + pass class InvalidParamValue(Event): + """The server received a request to set an invalid value for a parameter.""" + pass class MissingParam(Event): + """The command is missings some mandatory parameter.""" + pass class MethodNotAllowed(Event): + """The command is not allowed in this state.""" + pass @@ -247,27 +296,3 @@ def deserialize(data: str) -> Event: if kind in EVENT_MAP: return EVENT_MAP[kind](jd) raise ValueError(f"Unknown event '{kind}'") - - -__all__ = [ - "Event", - "CompletionCause", - "Transcript", - "Interpretation", - "RecogResult", - "Opened", - "ParamsSet", - "DefaultParams", - "GrammarDefined", - "RecognitionInProgress", - "InputTimersStarted", - "Stopped", - "Closed", - "StartOfInput", - "RecognitionComplete", - "MethodNotValid", - "MethodFailed", - "InvalidParamValue", - "MissingParam", - "MethodNotAllowed", -] diff --git a/tox.ini b/tox.ini index b9f89aa..4a1e691 100644 --- a/tox.ini +++ b/tox.ini @@ -9,6 +9,7 @@ envlist = py38 py39 py310 + py311 isort black flake8