Skip to content

Commit

Permalink
feat(stt,tts): update baseclasses to match python (#103)
Browse files Browse the repository at this point in the history
  • Loading branch information
nbsp authored Oct 8, 2024
1 parent 18a5da0 commit ba41d6f
Show file tree
Hide file tree
Showing 7 changed files with 217 additions and 296 deletions.
5 changes: 5 additions & 0 deletions .changeset/small-hotels-happen.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,5 @@
---
"@livekit/agents": minor
---

update TTS and STT baseclasses to match python
10 changes: 8 additions & 2 deletions agents/src/stt/index.ts
Original file line number Diff line number Diff line change
Expand Up @@ -2,5 +2,11 @@
//
// SPDX-License-Identifier: Apache-2.0

export { STT, SpeechEvent, SpeechEventType, SpeechStream, type SpeechData } from './stt.js';
export { StreamAdapter, StreamAdapterWrapper } from './stream_adapter.js';
export {
type SpeechEvent,
type SpeechData,
type STTCapabilities,
SpeechEventType,
STT,
SpeechStream,
} from './stt.js';
93 changes: 0 additions & 93 deletions agents/src/stt/stream_adapter.ts

This file was deleted.

129 changes: 98 additions & 31 deletions agents/src/stt/stt.ts
Original file line number Diff line number Diff line change
Expand Up @@ -2,13 +2,14 @@
//
// SPDX-License-Identifier: Apache-2.0
import type { AudioFrame } from '@livekit/rtc-node';
import type { AudioBuffer } from '../utils.js';
import { AsyncIterableQueue } from '../utils.js';

/** Indicates start/middle/end of speech */
export enum SpeechEventType {
/**
* Indicate the start of speech.
* If the STT doesn't support this event, this will be emitted at the same time
* as the first INTERMIN_TRANSCRIPT.
* as the first INTERIM_TRANSCRIPT.
*/
START_OF_SPEECH = 0,
/**
Expand All @@ -27,6 +28,7 @@ export enum SpeechEventType {
END_OF_SPEECH = 3,
}

/** SpeechData contains metadata about this {@link SpeechEvent}. */
export interface SpeechData {
language: string;
text: string;
Expand All @@ -35,51 +37,116 @@ export interface SpeechData {
confidence: number;
}

export class SpeechEvent {
/** SpeechEvent is a packet of speech-to-text data. */
export interface SpeechEvent {
type: SpeechEventType;
alternatives: SpeechData[];
}

constructor(type: SpeechEventType, alternatives: SpeechData[] = []) {
this.type = type;
this.alternatives = alternatives;
}
/**
 * Capability flags that an STT provider declares about itself.
 *
 * @remarks
 * At present, the framework only supports providers that have a streaming endpoint.
 */
export interface STTCapabilities {
  /** Streaming capability flag (see remarks: only streaming providers are supported for now). */
  streaming: boolean;
  /** Interim-results flag; presumably whether interim transcripts are emitted — confirm with the provider plugin. */
  interimResults: boolean;
}

export abstract class SpeechStream implements IterableIterator<SpeechEvent> {
/**
* Push a frame to be recognised.
* It is recommended to push frames as soon as they are available.
*/
abstract pushFrame(token: AudioFrame): void;
/**
* An instance of a speech-to-text adapter.
*
* @remarks
* This class is abstract, and as such cannot be used directly. Instead, use a provider plugin that
* exports its own child STT class, which inherits this class's methods.
*/
export abstract class STT {
#capabilities: STTCapabilities;

constructor(capabilities: STTCapabilities) {
this.#capabilities = capabilities;
}

/** Returns this STT's capabilities */
get capabilities(): STTCapabilities {
return this.#capabilities;
}

/**
* Close the stream.
*
* @param wait
* Whether to wait for the STT to finish processing the remaining
* frames before closing
* Returns a {@link SpeechStream} that can be used to push audio frames and receive
* transcriptions
*/
abstract close(wait: boolean): Promise<void>;
abstract stream(): SpeechStream;
}

abstract next(): IteratorResult<SpeechEvent>;
/**
* An instance of a speech-to-text stream, as an asynchronous iterable iterator.
*
* @example Looping through events
* ```ts
* for await (const event of stream) {
* if (event.type === SpeechEventType.FINAL_TRANSCRIPT) {
* console.log(event.alternatives[0].text)
* }
* }
* ```
*
* @remarks
* This class is abstract, and as such cannot be used directly. Instead, use a provider plugin that
* exports its own child SpeechStream class, which inherits this class's methods.
*/
export abstract class SpeechStream implements AsyncIterableIterator<SpeechEvent> {
protected static readonly FLUSH_SENTINEL = Symbol('FLUSH_SENTINEL');
protected input = new AsyncIterableQueue<AudioFrame | typeof SpeechStream.FLUSH_SENTINEL>();
protected queue = new AsyncIterableQueue<SpeechEvent>();
protected closed = false;

[Symbol.iterator](): SpeechStream {
return this;
/** Push an audio frame to the STT */
pushFrame(frame: AudioFrame) {
if (this.input.closed) {
throw new Error('Input is closed');
}
if (this.closed) {
throw new Error('Stream is closed');
}
this.input.put(frame);
}
}

export abstract class STT {
#streamingSupported: boolean;
/** Flush the STT, causing it to process all pending audio frames */
flush() {
if (this.input.closed) {
throw new Error('Input is closed');
}
if (this.closed) {
throw new Error('Stream is closed');
}
this.input.put(SpeechStream.FLUSH_SENTINEL);
}

constructor(streamingSupported: boolean) {
this.#streamingSupported = streamingSupported;
/** Mark the input as ended and forbid additional pushes */
endInput() {
if (this.input.closed) {
throw new Error('Input is closed');
}
if (this.closed) {
throw new Error('Stream is closed');
}
this.input.close();
}

abstract recognize(buffer: AudioBuffer, language?: string): Promise<SpeechEvent>;
next(): Promise<IteratorResult<SpeechEvent>> {
return this.queue.next();
}

abstract stream(language: string | undefined): SpeechStream;
/** Close both the input and output of the STT stream */
close() {
this.input.close();
this.queue.close();
this.closed = true;
}

get streamingSupported(): boolean {
return this.#streamingSupported;
[Symbol.asyncIterator](): SpeechStream {
return this;
}
}
21 changes: 1 addition & 20 deletions agents/src/tts/index.ts
Original file line number Diff line number Diff line change
@@ -1,23 +1,4 @@
// SPDX-FileCopyrightText: 2024 LiveKit, Inc.
//
// SPDX-License-Identifier: Apache-2.0
import { StreamAdapter, StreamAdapterWrapper } from './stream_adapter.js';
import {
ChunkedStream,
SynthesisEvent,
SynthesisEventType,
SynthesizeStream,
type SynthesizedAudio,
TTS,
} from './tts.js';

export {
TTS,
SynthesisEvent,
SynthesisEventType,
SynthesizedAudio,
SynthesizeStream,
StreamAdapter,
StreamAdapterWrapper,
ChunkedStream,
};
export { type SynthesizedAudio, type TTSCapabilities, TTS, SynthesizeStream } from './tts.js';
Loading

0 comments on commit ba41d6f

Please sign in to comment.