Skip to content

Commit 81e7efc

Browse files
committed
markdown and emoji filters + advanced preprocessing
1 parent 0bba403 commit 81e7efc

File tree

12 files changed

+2486
-1
lines changed

12 files changed

+2486
-1
lines changed

agents/src/voice/agent_session.ts

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -49,6 +49,7 @@ import { AgentInput, AgentOutput } from './io.js';
4949
import { RoomIO, type RoomInputOptions, type RoomOutputOptions } from './room_io/index.js';
5050
import type { UnknownUserData } from './run_context.js';
5151
import type { SpeechHandle } from './speech_handle.js';
52+
import { DEFAULT_TTS_TEXT_TRANSFORMS, type TextTransformSpec } from './transcription/transforms.js';
5253

5354
export interface VoiceOptions {
5455
allowInterruptions: boolean;
@@ -60,6 +61,7 @@ export interface VoiceOptions {
6061
maxToolSteps: number;
6162
preemptiveGeneration: boolean;
6263
userAwayTimeout?: number | null;
64+
ttsTextTransforms?: TextTransformSpec[] | null;
6365
}
6466

6567
const defaultVoiceOptions: VoiceOptions = {
@@ -72,6 +74,7 @@ const defaultVoiceOptions: VoiceOptions = {
7274
maxToolSteps: 3,
7375
preemptiveGeneration: false,
7476
userAwayTimeout: 15.0,
77+
ttsTextTransforms: DEFAULT_TTS_TEXT_TRANSFORMS,
7578
} as const;
7679

7780
export type TurnDetectionMode = 'stt' | 'vad' | 'realtime_llm' | 'manual' | _TurnDetector;

agents/src/voice/generation.ts

Lines changed: 9 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -27,6 +27,7 @@ import type { AgentSession } from './agent_session.js';
2727
import type { AudioOutput, LLMNode, TTSNode, TextOutput } from './io.js';
2828
import { RunContext } from './run_context.js';
2929
import type { SpeechHandle } from './speech_handle.js';
30+
import { type TextTransformSpec, applyTextTransforms } from './transcription/index.js';
3031

3132
/** @internal */
3233
export class _LLMGenerationData {
@@ -474,6 +475,7 @@ export function performTTSInference(
474475
text: ReadableStream<string>,
475476
modelSettings: ModelSettings,
476477
controller: AbortController,
478+
textTransforms?: readonly TextTransformSpec[] | null,
477479
): [Task<void>, ReadableStream<AudioFrame>] {
478480
const audioStream = new IdentityTransform<AudioFrame>();
479481
const outputWriter = audioStream.writable.getWriter();
@@ -484,7 +486,13 @@ export function performTTSInference(
484486
let ttsStream: ReadableStream<AudioFrame> | null = null;
485487

486488
try {
487-
ttsStream = await node(text, modelSettings);
489+
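// Run the text stream through the configured transforms before passing it to `node`; when none are configured the original stream is used unchanged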
490+
let transformedText = text;
491+
if (textTransforms && textTransforms.length > 0) {
492+
transformedText = await applyTextTransforms(text, textTransforms);
493+
}
494+
495+
ttsStream = await node(transformedText, modelSettings);
488496
if (ttsStream === null) {
489497
await outputWriter.close();
490498
return;

agents/src/voice/index.ts

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -9,3 +9,4 @@ export * from './events.js';
99
export * from './report.js';
1010
export * from './room_io/index.js';
1111
export { RunContext } from './run_context.js';
12+
export * from './transcription/index.js';

agents/src/voice/transcription/index.ts

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -2,3 +2,7 @@
22
//
33
// SPDX-License-Identifier: Apache-2.0
44
export * from './_utils.js';
5+
export * from './transforms.js';
6+
export * from './transforms_agnostic.js';
7+
export * from './transforms_en.js';
8+
export * from './transforms_de.js';
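(file path header not captured in this extract; the hunk below is a Vitest spec that imports from './transforms.js')

Lines changed: 103 additions & 0 deletions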
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,103 @@
1+
// SPDX-FileCopyrightText: 2025 LiveKit, Inc.
2+
//
3+
// SPDX-License-Identifier: Apache-2.0
4+
import { ReadableStream } from 'node:stream/web';
5+
import { describe, expect, it } from 'vitest';
6+
import {
7+
DEFAULT_TTS_TEXT_TRANSFORMS,
8+
applyTextTransforms,
9+
getAvailableTransforms,
10+
} from './transforms.js';
11+
12+
/**
13+
* Wraps `text` in a ReadableStream that enqueues it as a single chunk and then closes.
14+
*/
15+
function stringToStream(text: string): ReadableStream<string> {
16+
return new ReadableStream({
17+
start(controller) {
18+
controller.enqueue(text); // the whole input is delivered as one chunk
19+
controller.close(); // no further chunks follow
20+
},
21+
});
22+
}
23+
24+
/**
25+
* Drains `stream` and resolves with every chunk concatenated in read order.
26+
*/
27+
async function streamToString(stream: ReadableStream<string>): Promise<string> {
28+
const reader = stream.getReader(); // locks the stream to this reader; the lock is never released here
29+
let result = '';
30+
while (true) {
31+
const { done, value } = await reader.read();
32+
if (done) break; // end of stream reached
33+
result += value;
34+
}
35+
return result;
36+
}
37+
38+
describe('Text Transforms Core', () => { // covers the three imports from ./transforms.js: the default list, getAvailableTransforms, and applyTextTransforms
39+
it('should export DEFAULT_TTS_TEXT_TRANSFORMS', () => {
40+
expect(DEFAULT_TTS_TEXT_TRANSFORMS).toBeDefined();
41+
expect(DEFAULT_TTS_TEXT_TRANSFORMS).toEqual(['filter_markdown', 'filter_emoji']); // pins both the entries and their order
42+
});
43+
44+
it('should list available transforms for English', () => {
45+
const transforms = getAvailableTransforms('en');
46+
expect(transforms.has('filter_markdown')).toBe(true);
47+
expect(transforms.has('filter_emoji')).toBe(true);
48+
expect(transforms.has('format_numbers')).toBe(true);
49+
expect(transforms.has('format_dollar_amounts')).toBe(true);
50+
});
51+
52+
it('should list available transforms for German', () => {
53+
const transforms = getAvailableTransforms('de');
54+
expect(transforms.has('filter_markdown')).toBe(true);
55+
expect(transforms.has('filter_emoji')).toBe(true);
56+
expect(transforms.has('format_numbers_de')).toBe(true);
57+
expect(transforms.has('format_euro_amounts')).toBe(true);
58+
});
59+
60+
it('should throw error for invalid transform name', async () => {
61+
const stream = stringToStream('test');
62+
await expect(
63+
applyTextTransforms(stream, ['invalid_transform' as any], { language: 'en' }), // `as any` lets an unknown name past the type checker (presumably names are a literal union; confirm) so the rejection path runs
64+
).rejects.toThrow('Invalid transform'); // the returned promise must reject with an error whose message contains this text
65+
});
66+
67+
it('should apply custom transform function', async () => {
68+
const customTransform = (text: ReadableStream<string>) => { // custom transform: re-emits every chunk upper-cased
69+
return new ReadableStream({
70+
async start(controller) {
71+
const reader = text.getReader();
72+
while (true) {
73+
const { done, value } = await reader.read();
74+
if (done) {
75+
controller.close(); // propagate end-of-input to the output stream
76+
break;
77+
}
78+
controller.enqueue(value.toUpperCase());
79+
}
80+
},
81+
});
82+
};
83+
84+
const stream = stringToStream('hello world');
85+
const result = await applyTextTransforms(stream, [customTransform]); // a function is passed instead of a transform name, and no options argument is given
86+
const output = await streamToString(result);
87+
expect(output).toBe('HELLO WORLD');
88+
});
89+
90+
it('should apply multiple transforms in sequence', async () => {
91+
const stream = stringToStream('**Price: $5** 🎉');
92+
const result = await applyTextTransforms(
93+
stream,
94+
['filter_markdown', 'filter_emoji', 'format_dollar_amounts'], // three named transforms combined in one call
95+
{ language: 'en' },
96+
);
97+
const output = await streamToString(result);
98+
expect(output).toContain('Price:');
99+
expect(output).toContain('five dollars'); // '$5' is expected to come out spelled in words
100+
expect(output).not.toContain('**');
101+
expect(output).not.toContain('🎉');
102+
});
103+
});

0 commit comments

Comments
 (0)