Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

feat(stt,tts): update baseclasses to match python #103

Merged
merged 4 commits into from
Oct 8, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
5 changes: 5 additions & 0 deletions .changeset/small-hotels-happen.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,5 @@
---
"@livekit/agents": minor
---

update TTS and STT baseclasses to match python
10 changes: 8 additions & 2 deletions agents/src/stt/index.ts
Original file line number Diff line number Diff line change
Expand Up @@ -2,5 +2,11 @@
//
// SPDX-License-Identifier: Apache-2.0

export { STT, SpeechEvent, SpeechEventType, SpeechStream, type SpeechData } from './stt.js';
export { StreamAdapter, StreamAdapterWrapper } from './stream_adapter.js';
export {
type SpeechEvent,
type SpeechData,
type STTCapabilities,
SpeechEventType,
STT,
SpeechStream,
} from './stt.js';
93 changes: 0 additions & 93 deletions agents/src/stt/stream_adapter.ts

This file was deleted.

129 changes: 98 additions & 31 deletions agents/src/stt/stt.ts
Original file line number Diff line number Diff line change
Expand Up @@ -2,13 +2,14 @@
//
// SPDX-License-Identifier: Apache-2.0
import type { AudioFrame } from '@livekit/rtc-node';
import type { AudioBuffer } from '../utils.js';
import { AsyncIterableQueue } from '../utils.js';

/** Indicates start/middle/end of speech */
export enum SpeechEventType {
/**
* Indicate the start of speech.
* If the STT doesn't support this event, this will be emitted at the same time
* as the first INTERMIN_TRANSCRIPT.
* as the first INTERIM_TRANSCRIPT.
*/
START_OF_SPEECH = 0,
/**
Expand All @@ -27,6 +28,7 @@ export enum SpeechEventType {
END_OF_SPEECH = 3,
}

/** SpeechData contains metadata about this {@link SpeechEvent}. */
export interface SpeechData {
language: string;
text: string;
Expand All @@ -35,51 +37,116 @@ export interface SpeechData {
confidence: number;
}

export class SpeechEvent {
/** SpeechEvent is a packet of speech-to-text data. */
export interface SpeechEvent {
type: SpeechEventType;
alternatives: SpeechData[];
}

constructor(type: SpeechEventType, alternatives: SpeechData[] = []) {
this.type = type;
this.alternatives = alternatives;
}
/**
* Describes the capabilities of the STT provider.
*
* @remarks
* At present, the framework only supports providers that have a streaming endpoint.
*/
export interface STTCapabilities {
streaming: boolean;
interimResults: boolean;
}

export abstract class SpeechStream implements IterableIterator<SpeechEvent> {
/**
* Push a frame to be recognised.
* It is recommended to push frames as soon as they are available.
*/
abstract pushFrame(token: AudioFrame): void;
/**
* An instance of a speech-to-text adapter.
*
* @remarks
* This class is abstract, and as such cannot be used directly. Instead, use a provider plugin that
* exports its own child STT class, which inherits this class's methods.
*/
export abstract class STT {
#capabilities: STTCapabilities;

constructor(capabilities: STTCapabilities) {
this.#capabilities = capabilities;
}

/** Returns this STT's capabilities */
get capabilities(): STTCapabilities {
return this.#capabilities;
}

/**
* Close the stream.
*
* @param wait
* Whether to wait for the STT to finish processing the remaining
* frames before closing
* Returns a {@link SpeechStream} that can be used to push audio frames and receive
* transcriptions
*/
abstract close(wait: boolean): Promise<void>;
abstract stream(): SpeechStream;
}

abstract next(): IteratorResult<SpeechEvent>;
/**
* An instance of a speech-to-text stream, as an asynchronous iterable iterator.
*
* @example Looping through events
* ```ts
* for await (const event of stream) {
* if (event.type === SpeechEventType.FINAL_TRANSCRIPT) {
* console.log(event.alternatives[0].text)
* }
* }
* ```
*
* @remarks
* This class is abstract, and as such cannot be used directly. Instead, use a provider plugin that
* exports its own child SpeechStream class, which inherits this class's methods.
*/
export abstract class SpeechStream implements AsyncIterableIterator<SpeechEvent> {
protected static readonly FLUSH_SENTINEL = Symbol('FLUSH_SENTINEL');
protected input = new AsyncIterableQueue<AudioFrame | typeof SpeechStream.FLUSH_SENTINEL>();
protected queue = new AsyncIterableQueue<SpeechEvent>();
protected closed = false;

[Symbol.iterator](): SpeechStream {
return this;
/** Push an audio frame to the STT */
pushFrame(frame: AudioFrame) {
if (this.input.closed) {
throw new Error('Input is closed');
}
if (this.closed) {
throw new Error('Stream is closed');
}
this.input.put(frame);
}
}

export abstract class STT {
#streamingSupported: boolean;
/** Flush the STT, causing it to process all pending audio frames */
flush() {
if (this.input.closed) {
throw new Error('Input is closed');
}
if (this.closed) {
throw new Error('Stream is closed');
}
this.input.put(SpeechStream.FLUSH_SENTINEL);
}

constructor(streamingSupported: boolean) {
this.#streamingSupported = streamingSupported;
/** Mark the input as ended and forbid additional pushes */
endInput() {
if (this.input.closed) {
throw new Error('Input is closed');
}
if (this.closed) {
throw new Error('Stream is closed');
}
this.input.close();
}

abstract recognize(buffer: AudioBuffer, language?: string): Promise<SpeechEvent>;
next(): Promise<IteratorResult<SpeechEvent>> {
return this.queue.next();
}

abstract stream(language: string | undefined): SpeechStream;
/** Close both the input and output of the STT stream */
close() {
this.input.close();
this.queue.close();
this.closed = true;
}

get streamingSupported(): boolean {
return this.#streamingSupported;
[Symbol.asyncIterator](): SpeechStream {
return this;
}
}
21 changes: 1 addition & 20 deletions agents/src/tts/index.ts
Original file line number Diff line number Diff line change
@@ -1,23 +1,4 @@
// SPDX-FileCopyrightText: 2024 LiveKit, Inc.
//
// SPDX-License-Identifier: Apache-2.0
import { StreamAdapter, StreamAdapterWrapper } from './stream_adapter.js';
import {
ChunkedStream,
SynthesisEvent,
SynthesisEventType,
SynthesizeStream,
type SynthesizedAudio,
TTS,
} from './tts.js';

export {
TTS,
SynthesisEvent,
SynthesisEventType,
SynthesizedAudio,
SynthesizeStream,
StreamAdapter,
StreamAdapterWrapper,
ChunkedStream,
};
export { type SynthesizedAudio, type TTSCapabilities, TTS, SynthesizeStream } from './tts.js';
Loading