feat(stt,tts): update baseclasses to match python (#103)

livekit · Oct 8, 2024 · ba41d6f · ba41d6f
1 parent 18a5da0
commit ba41d6f
Show file tree

Hide file tree

Showing 7 changed files with 217 additions and 296 deletions.
diff --git a/.changeset/small-hotels-happen.md b/.changeset/small-hotels-happen.md
@@ -0,0 +1,5 @@
+---
+"@livekit/agents": minor
+---
+
+update TTS and STT base classes to match Python
diff --git a/agents/src/stt/index.ts b/agents/src/stt/index.ts
@@ -2,5 +2,11 @@
 //
 // SPDX-License-Identifier: Apache-2.0
 
-export { STT, SpeechEvent, SpeechEventType, SpeechStream, type SpeechData } from './stt.js';
-export { StreamAdapter, StreamAdapterWrapper } from './stream_adapter.js';
+export {
+  type SpeechEvent,
+  type SpeechData,
+  type STTCapabilities,
+  SpeechEventType,
+  STT,
+  SpeechStream,
+} from './stt.js';
diff --git a/agents/src/stt/stream_adapter.ts b/agents/src/stt/stream_adapter.ts
diff --git a/agents/src/stt/stt.ts b/agents/src/stt/stt.ts
@@ -2,13 +2,14 @@
 //
 // SPDX-License-Identifier: Apache-2.0
 import type { AudioFrame } from '@livekit/rtc-node';
-import type { AudioBuffer } from '../utils.js';
+import { AsyncIterableQueue } from '../utils.js';
 
+/** Indicates start/middle/end of speech */
 export enum SpeechEventType {
   /**
    * Indicate the start of speech.
    * If the STT doesn't support this event, this will be emitted at the same time
-   * as the first INTERMIN_TRANSCRIPT.
+   * as the first INTERIM_TRANSCRIPT.
    */
   START_OF_SPEECH = 0,
   /**
@@ -27,6 +28,7 @@ export enum SpeechEventType {
   END_OF_SPEECH = 3,
 }
 
+/** SpeechData contains metadata about this {@link SpeechEvent}. */
 export interface SpeechData {
   language: string;
   text: string;
@@ -35,51 +37,116 @@ export interface SpeechData {
   confidence: number;
 }
 
-export class SpeechEvent {
+/** SpeechEvent is a packet of speech-to-text data. */
+export interface SpeechEvent {
   type: SpeechEventType;
   alternatives: SpeechData[];
+}
 
-  constructor(type: SpeechEventType, alternatives: SpeechData[] = []) {
-    this.type = type;
-    this.alternatives = alternatives;
-  }
+/**
+ * Describes the capabilities of the STT provider.
+ *
+ * @remarks
+ * At present, the framework only supports providers that have a streaming endpoint.
+ */
+export interface STTCapabilities {
+  streaming: boolean;
+  interimResults: boolean;
 }
 
-export abstract class SpeechStream implements IterableIterator<SpeechEvent> {
-  /**
-   * Push a frame to be recognised.
-   * It is recommended to push frames as soon as they are available.
-   */
-  abstract pushFrame(token: AudioFrame): void;
+/**
+ * An instance of a speech-to-text adapter.
+ *
+ * @remarks
+ * This class is abstract, and as such cannot be used directly. Instead, use a provider plugin that
+ * exports its own child STT class, which inherits this class's methods.
+ */
+export abstract class STT {
+  #capabilities: STTCapabilities;
+
+  constructor(capabilities: STTCapabilities) {
+    this.#capabilities = capabilities;
+  }
+
+  /** Returns this STT's capabilities */
+  get capabilities(): STTCapabilities {
+    return this.#capabilities;
+  }
 
   /**
-   * Close the stream.
-   *
-   * @param wait
-   *   Whether to wait for the STT to finish processing the remaining
-   *   frames before closing
+   * Returns a {@link SpeechStream} that can be used to push audio frames and receive
+   * transcriptions
    */
-  abstract close(wait: boolean): Promise<void>;
+  abstract stream(): SpeechStream;
+}
 
-  abstract next(): IteratorResult<SpeechEvent>;
+/**
+ * An instance of a speech-to-text stream, as an asynchronous iterable iterator.
+ *
+ * @example Looping through speech events
+ * ```ts
+ * for await (const event of stream) {
+ *   if (event.type === SpeechEventType.FINAL_TRANSCRIPT) {
+ *     console.log(event.alternatives[0].text)
+ *   }
+ * }
+ * ```
+ *
+ * @remarks
+ * This class is abstract, and as such cannot be used directly. Instead, use a provider plugin that
+ * exports its own child SpeechStream class, which inherits this class's methods.
+ */
+export abstract class SpeechStream implements AsyncIterableIterator<SpeechEvent> {
+  protected static readonly FLUSH_SENTINEL = Symbol('FLUSH_SENTINEL');
+  protected input = new AsyncIterableQueue<AudioFrame | typeof SpeechStream.FLUSH_SENTINEL>();
+  protected queue = new AsyncIterableQueue<SpeechEvent>();
+  protected closed = false;
 
-  [Symbol.iterator](): SpeechStream {
-    return this;
+  /** Push an audio frame to the STT */
+  pushFrame(frame: AudioFrame) {
+    if (this.input.closed) {
+      throw new Error('Input is closed');
+    }
+    if (this.closed) {
+      throw new Error('Stream is closed');
+    }
+    this.input.put(frame);
   }
-}
 
-export abstract class STT {
-  #streamingSupported: boolean;
+  /** Flush the STT, causing it to process all pending audio frames */
+  flush() {
+    if (this.input.closed) {
+      throw new Error('Input is closed');
+    }
+    if (this.closed) {
+      throw new Error('Stream is closed');
+    }
+    this.input.put(SpeechStream.FLUSH_SENTINEL);
+  }
 
-  constructor(streamingSupported: boolean) {
-    this.#streamingSupported = streamingSupported;
+  /** Mark the input as ended and forbid additional pushes */
+  endInput() {
+    if (this.input.closed) {
+      throw new Error('Input is closed');
+    }
+    if (this.closed) {
+      throw new Error('Stream is closed');
+    }
+    this.input.close();
   }
 
-  abstract recognize(buffer: AudioBuffer, language?: string): Promise<SpeechEvent>;
+  next(): Promise<IteratorResult<SpeechEvent>> {
+    return this.queue.next();
+  }
 
-  abstract stream(language: string | undefined): SpeechStream;
+  /** Close both the input and output of the STT stream */
+  close() {
+    this.input.close();
+    this.queue.close();
+    this.closed = true;
+  }
 
-  get streamingSupported(): boolean {
-    return this.#streamingSupported;
+  [Symbol.asyncIterator](): SpeechStream {
+    return this;
   }
 }
diff --git a/agents/src/tts/index.ts b/agents/src/tts/index.ts
@@ -1,23 +1,4 @@
 // SPDX-FileCopyrightText: 2024 LiveKit, Inc.
 //
 // SPDX-License-Identifier: Apache-2.0
-import { StreamAdapter, StreamAdapterWrapper } from './stream_adapter.js';
-import {
-  ChunkedStream,
-  SynthesisEvent,
-  SynthesisEventType,
-  SynthesizeStream,
-  type SynthesizedAudio,
-  TTS,
-} from './tts.js';
-
-export {
-  TTS,
-  SynthesisEvent,
-  SynthesisEventType,
-  SynthesizedAudio,
-  SynthesizeStream,
-  StreamAdapter,
-  StreamAdapterWrapper,
-  ChunkedStream,
-};
+export { type SynthesizedAudio, type TTSCapabilities, TTS, SynthesizeStream } from './tts.js';