Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 3 additions & 0 deletions agents/src/voice/agent_activity.ts
Original file line number Diff line number Diff line change
Expand Up @@ -1201,6 +1201,7 @@ export class AgentActivity implements RecognitionHooks {
audioSource,
modelSettings,
replyAbortController,
this.agentSession.options.ttsTextTransforms || null,
);
tasks.push(ttsTask);

Expand Down Expand Up @@ -1314,6 +1315,7 @@ export class AgentActivity implements RecognitionHooks {
ttsTextInput,
modelSettings,
replyAbortController,
this.agentSession.options.ttsTextTransforms || null,
);
tasks.push(ttsTask);
}
Expand Down Expand Up @@ -1700,6 +1702,7 @@ export class AgentActivity implements RecognitionHooks {
ttsTextInput,
modelSettings,
abortController,
this.agentSession.options.ttsTextTransforms || null,
);
tasks.push(ttsTask);
realtimeAudioResult = ttsStream;
Expand Down
3 changes: 3 additions & 0 deletions agents/src/voice/agent_session.ts
Original file line number Diff line number Diff line change
Expand Up @@ -49,6 +49,7 @@ import { AgentInput, AgentOutput } from './io.js';
import { RoomIO, type RoomInputOptions, type RoomOutputOptions } from './room_io/index.js';
import type { UnknownUserData } from './run_context.js';
import type { SpeechHandle } from './speech_handle.js';
import { DEFAULT_TTS_TEXT_TRANSFORMS, type TextTransformSpec } from './transcription/transforms.js';

export interface VoiceOptions {
allowInterruptions: boolean;
Expand All @@ -60,6 +61,7 @@ export interface VoiceOptions {
maxToolSteps: number;
preemptiveGeneration: boolean;
userAwayTimeout?: number | null;
ttsTextTransforms?: TextTransformSpec[] | null;
}

const defaultVoiceOptions: VoiceOptions = {
Expand All @@ -72,6 +74,7 @@ const defaultVoiceOptions: VoiceOptions = {
maxToolSteps: 3,
preemptiveGeneration: false,
userAwayTimeout: 15.0,
ttsTextTransforms: DEFAULT_TTS_TEXT_TRANSFORMS,
} as const;

export type TurnDetectionMode = 'stt' | 'vad' | 'realtime_llm' | 'manual' | _TurnDetector;
Expand Down
10 changes: 9 additions & 1 deletion agents/src/voice/generation.ts
Original file line number Diff line number Diff line change
Expand Up @@ -27,6 +27,7 @@ import type { AgentSession } from './agent_session.js';
import type { AudioOutput, LLMNode, TTSNode, TextOutput } from './io.js';
import { RunContext } from './run_context.js';
import type { SpeechHandle } from './speech_handle.js';
import { type TextTransformSpec, applyTextTransforms } from './transcription/index.js';

/** @internal */
export class _LLMGenerationData {
Expand Down Expand Up @@ -474,6 +475,7 @@ export function performTTSInference(
text: ReadableStream<string>,
modelSettings: ModelSettings,
controller: AbortController,
textTransforms?: readonly TextTransformSpec[] | null,
): [Task<void>, ReadableStream<AudioFrame>] {
const audioStream = new IdentityTransform<AudioFrame>();
const outputWriter = audioStream.writable.getWriter();
Expand All @@ -484,7 +486,13 @@ export function performTTSInference(
let ttsStream: ReadableStream<AudioFrame> | null = null;

try {
ttsStream = await node(text, modelSettings);
// Apply text transforms
let transformedText = text;
if (textTransforms && textTransforms.length > 0) {
transformedText = await applyTextTransforms(text, textTransforms);
}

ttsStream = await node(transformedText, modelSettings);
if (ttsStream === null) {
await outputWriter.close();
return;
Expand Down
1 change: 1 addition & 0 deletions agents/src/voice/index.ts
Original file line number Diff line number Diff line change
Expand Up @@ -9,3 +9,4 @@ export * from './events.js';
export * from './report.js';
export * from './room_io/index.js';
export { RunContext } from './run_context.js';
export * from './transcription/index.js';
4 changes: 4 additions & 0 deletions agents/src/voice/transcription/index.ts
Original file line number Diff line number Diff line change
Expand Up @@ -2,3 +2,7 @@
//
// SPDX-License-Identifier: Apache-2.0
export * from './_utils.js';
export * from './transforms.js';
export * from './transforms_agnostic.js';
export * from './transforms_en.js';
export * from './transforms_de.js';
140 changes: 140 additions & 0 deletions agents/src/voice/transcription/transforms.test.ts
Original file line number Diff line number Diff line change
@@ -0,0 +1,140 @@
// SPDX-FileCopyrightText: 2025 LiveKit, Inc.
//
// SPDX-License-Identifier: Apache-2.0
import { ReadableStream } from 'node:stream/web';
import { describe, expect, it } from 'vitest';
import {
DEFAULT_TTS_TEXT_TRANSFORMS,
applyTextTransforms,
getAllAvailableTransforms,
getAvailableTransforms,
} from './transforms.js';

/**
 * Wraps `text` in a ReadableStream that emits it as one chunk and then closes.
 */
function stringToStream(text: string): ReadableStream<string> {
  const stream = new ReadableStream({
    start: (sink) => {
      // Single chunk, then end-of-stream.
      sink.enqueue(text);
      sink.close();
    },
  });
  return stream;
}

/**
 * Drains `stream` and returns all of its chunks concatenated in arrival order.
 *
 * The reader lock acquired here is released once reading finishes or fails, so
 * this helper never leaves the stream locked.
 *
 * @param stream - Stream of string chunks to consume.
 * @returns The concatenation of every chunk; `''` if the stream closes without emitting.
 * @throws Rejects with the stream's error if a read fails.
 */
async function streamToString(stream: ReadableStream<string>): Promise<string> {
  const reader = stream.getReader();
  try {
    let result = '';
    while (true) {
      const { done, value } = await reader.read();
      if (done) break;
      result += value;
    }
    return result;
  } finally {
    // Release even when read() rejects so the caller can still inspect or cancel the stream.
    reader.releaseLock();
  }
}

// Core behaviour of the text-transform registry and of applyTextTransforms:
// default list, per-language lookup, error on unknown names, custom functions,
// and chaining of several transforms.
describe('Text Transforms Core', () => {
  it('should export DEFAULT_TTS_TEXT_TRANSFORMS', () => {
    expect(DEFAULT_TTS_TEXT_TRANSFORMS).toBeDefined();
    expect(DEFAULT_TTS_TEXT_TRANSFORMS).toEqual(['filter_markdown', 'filter_emoji']);
  });

  it('should list available transforms for English', () => {
    const english = getAvailableTransforms('en');
    const expectedNames = [
      'filter_markdown',
      'filter_emoji',
      'format_numbers',
      'format_dollar_amounts',
    ] as const;
    for (const name of expectedNames) {
      expect(english.has(name)).toBe(true);
    }
  });

  it('should list available transforms for German', () => {
    const german = getAvailableTransforms('de');
    const expectedNames = [
      'filter_markdown',
      'filter_emoji',
      'format_numbers_de',
      'format_euro_amounts',
    ] as const;
    for (const name of expectedNames) {
      expect(german.has(name)).toBe(true);
    }
  });

  it('should list all available transforms across all languages', () => {
    const everything = getAllAvailableTransforms();
    const expectedNames = [
      // Language-agnostic transforms
      'filter_markdown',
      'filter_emoji',
      // English transforms
      'format_numbers',
      'format_dollar_amounts',
      // German transforms
      'format_numbers_de',
      'format_euro_amounts',
    ] as const;
    for (const name of expectedNames) {
      expect(everything.has(name)).toBe(true);
    }
  });

  it('should throw error for invalid transform name', async () => {
    const input = stringToStream('test');
    // The cast deliberately bypasses the transform-name type to exercise the runtime check.
    const attempt = applyTextTransforms(input, ['invalid_transform' as any]);
    await expect(attempt).rejects.toThrow('Invalid transform');
  });

  it('should apply custom transform function', async () => {
    // User-supplied transform: upper-cases every chunk it receives.
    const upperCaseTransform = (source: ReadableStream<string>) =>
      new ReadableStream({
        async start(sink) {
          const upstream = source.getReader();
          for (let chunk = await upstream.read(); !chunk.done; chunk = await upstream.read()) {
            sink.enqueue(chunk.value.toUpperCase());
          }
          sink.close();
        },
      });

    const transformed = await applyTextTransforms(stringToStream('hello world'), [
      upperCaseTransform,
    ]);
    expect(await streamToString(transformed)).toBe('HELLO WORLD');
  });

  it('should apply multiple transforms in sequence', async () => {
    const input = stringToStream('**Price: $5** 🎉');
    const transformed = await applyTextTransforms(input, [
      'filter_markdown',
      'filter_emoji',
      'format_dollar_amounts',
    ]);
    const spoken = await streamToString(transformed);

    // Content survives and the dollar amount is spelled out...
    expect(spoken).toContain('Price:');
    expect(spoken).toContain('five dollars');
    // ...while markdown markers and emoji are stripped.
    expect(spoken).not.toContain('**');
    expect(spoken).not.toContain('🎉');
  });

  it('should find transforms across all languages without specifying language', async () => {
    // An English transform resolves without any language configuration.
    const dollars = await applyTextTransforms(stringToStream('$5'), ['format_dollar_amounts']);
    expect(await streamToString(dollars)).toBe('five dollars');

    // A German transform resolves without any language configuration.
    const euros = await applyTextTransforms(stringToStream('5€'), ['format_euro_amounts']);
    expect(await streamToString(euros)).toBe('fünf Euro');

    // Transforms from different languages can be chained in one call.
    const mixed = await applyTextTransforms(stringToStream('$5 and 5€'), [
      'format_dollar_amounts',
      'format_euro_amounts',
    ]);
    expect(await streamToString(mixed)).toBe('five dollars and fünf Euro');
  });
});
Loading