livekit · simllll · Nov 16, 2025 · Nov 17, 2025
diff --git a/agents/src/voice/agent_activity.ts b/agents/src/voice/agent_activity.ts
@@ -1201,6 +1201,7 @@ export class AgentActivity implements RecognitionHooks {
           audioSource,
           modelSettings,
           replyAbortController,
+          this.agentSession.options.ttsTextTransforms || null,
         );
         tasks.push(ttsTask);
 
@@ -1314,6 +1315,7 @@ export class AgentActivity implements RecognitionHooks {
         ttsTextInput,
         modelSettings,
         replyAbortController,
+        this.agentSession.options.ttsTextTransforms || null,
       );
       tasks.push(ttsTask);
     }
@@ -1700,6 +1702,7 @@ export class AgentActivity implements RecognitionHooks {
                 ttsTextInput,
                 modelSettings,
                 abortController,
+                this.agentSession.options.ttsTextTransforms || null,
               );
               tasks.push(ttsTask);
               realtimeAudioResult = ttsStream;

diff --git a/agents/src/voice/agent_session.ts b/agents/src/voice/agent_session.ts
@@ -49,6 +49,7 @@ import { AgentInput, AgentOutput } from './io.js';
 import { RoomIO, type RoomInputOptions, type RoomOutputOptions } from './room_io/index.js';
 import type { UnknownUserData } from './run_context.js';
 import type { SpeechHandle } from './speech_handle.js';
+import { DEFAULT_TTS_TEXT_TRANSFORMS, type TextTransformSpec } from './transcription/transforms.js';
 
 export interface VoiceOptions {
   allowInterruptions: boolean;
@@ -60,6 +61,7 @@ export interface VoiceOptions {
   maxToolSteps: number;
   preemptiveGeneration: boolean;
   userAwayTimeout?: number | null;
+  ttsTextTransforms?: TextTransformSpec[] | null;
 }
 
 const defaultVoiceOptions: VoiceOptions = {
@@ -72,6 +74,7 @@ const defaultVoiceOptions: VoiceOptions = {
   maxToolSteps: 3,
   preemptiveGeneration: false,
   userAwayTimeout: 15.0,
+  ttsTextTransforms: DEFAULT_TTS_TEXT_TRANSFORMS,
 } as const;
 
 export type TurnDetectionMode = 'stt' | 'vad' | 'realtime_llm' | 'manual' | _TurnDetector;

diff --git a/agents/src/voice/generation.ts b/agents/src/voice/generation.ts
@@ -27,6 +27,7 @@ import type { AgentSession } from './agent_session.js';
 import type { AudioOutput, LLMNode, TTSNode, TextOutput } from './io.js';
 import { RunContext } from './run_context.js';
 import type { SpeechHandle } from './speech_handle.js';
+import { type TextTransformSpec, applyTextTransforms } from './transcription/index.js';
 
 /** @internal */
 export class _LLMGenerationData {
@@ -474,6 +475,7 @@ export function performTTSInference(
   text: ReadableStream<string>,
   modelSettings: ModelSettings,
   controller: AbortController,
+  textTransforms?: readonly TextTransformSpec[] | null,
 ): [Task<void>, ReadableStream<AudioFrame>] {
   const audioStream = new IdentityTransform<AudioFrame>();
   const outputWriter = audioStream.writable.getWriter();
@@ -484,7 +486,13 @@ export function performTTSInference(
     let ttsStream: ReadableStream<AudioFrame> | null = null;
 
     try {
-      ttsStream = await node(text, modelSettings);
+      // Apply text transforms
+      let transformedText = text;
+      if (textTransforms && textTransforms.length > 0) {
+        transformedText = await applyTextTransforms(text, textTransforms);
+      }
+
+      ttsStream = await node(transformedText, modelSettings);
       if (ttsStream === null) {
         await outputWriter.close();
         return;

diff --git a/agents/src/voice/index.ts b/agents/src/voice/index.ts
@@ -9,3 +9,4 @@ export * from './events.js';
 export * from './report.js';
 export * from './room_io/index.js';
 export { RunContext } from './run_context.js';
+export * from './transcription/index.js';
diff --git a/agents/src/voice/transcription/index.ts b/agents/src/voice/transcription/index.ts
@@ -2,3 +2,7 @@
 //
 // SPDX-License-Identifier: Apache-2.0
 export * from './_utils.js';
+export * from './transforms.js';
+export * from './transforms_agnostic.js';
+export * from './transforms_en.js';
+export * from './transforms_de.js';
diff --git a/agents/src/voice/transcription/transforms.test.ts b/agents/src/voice/transcription/transforms.test.ts
@@ -0,0 +1,140 @@
+// SPDX-FileCopyrightText: 2025 LiveKit, Inc.
+//
+// SPDX-License-Identifier: Apache-2.0
+import { ReadableStream } from 'node:stream/web';
+import { describe, expect, it } from 'vitest';
+import {
+  DEFAULT_TTS_TEXT_TRANSFORMS,
+  applyTextTransforms,
+  getAllAvailableTransforms,
+  getAvailableTransforms,
+} from './transforms.js';
+
+/** Builds a ReadableStream that emits `text` as one chunk and then closes. */
+function stringToStream(text: string): ReadableStream<string> {
+  const singleChunk = new ReadableStream<string>({
+    start: (controller) => {
+      // Everything is queued up front, so the stream can be closed immediately.
+      controller.enqueue(text);
+      controller.close();
+    },
+  });
+  return singleChunk;
+}
+
+/**
+ * Drains `stream` and returns all of its chunks concatenated in arrival order.
+ */
+async function streamToString(stream: ReadableStream<string>): Promise<string> {
+  const source = stream.getReader();
+  let collected = '';
+  let next = await source.read();
+  while (!next.done) {
+    collected += next.value;
+    next = await source.read();
+  }
+  return collected;
+}
+
+// Covers the transform registry lookups and the applyTextTransforms pipeline.
+describe('Text Transforms Core', () => {
+  it('should export DEFAULT_TTS_TEXT_TRANSFORMS', () => {
+    // The defaults are expected to be exactly the markdown and emoji filters.
+    expect(DEFAULT_TTS_TEXT_TRANSFORMS).toBeDefined();
+    expect(DEFAULT_TTS_TEXT_TRANSFORMS).toEqual(['filter_markdown', 'filter_emoji']);
+  });
+
+  it('should list available transforms for English', () => {
+    const english = getAvailableTransforms('en');
+    // Language-agnostic filters plus the English formatters.
+    expect(english.has('filter_markdown')).toBe(true);
+    expect(english.has('filter_emoji')).toBe(true);
+    expect(english.has('format_numbers')).toBe(true);
+    expect(english.has('format_dollar_amounts')).toBe(true);
+  });
+
+  it('should list available transforms for German', () => {
+    const german = getAvailableTransforms('de');
+    // Language-agnostic filters plus the German formatters.
+    expect(german.has('filter_markdown')).toBe(true);
+    expect(german.has('filter_emoji')).toBe(true);
+    expect(german.has('format_numbers_de')).toBe(true);
+    expect(german.has('format_euro_amounts')).toBe(true);
+  });
+
+  it('should list all available transforms across all languages', () => {
+    const everyTransform = getAllAvailableTransforms();
+    // Language-agnostic filters
+    expect(everyTransform.has('filter_markdown')).toBe(true);
+    expect(everyTransform.has('filter_emoji')).toBe(true);
+    // English formatters
+    expect(everyTransform.has('format_numbers')).toBe(true);
+    expect(everyTransform.has('format_dollar_amounts')).toBe(true);
+    // German formatters
+    expect(everyTransform.has('format_numbers_de')).toBe(true);
+    expect(everyTransform.has('format_euro_amounts')).toBe(true);
+  });
+
+  it('should throw error for invalid transform name', async () => {
+    // An unknown transform name must make the returned promise reject.
+    const pending = applyTextTransforms(stringToStream('test'), ['invalid_transform' as any]);
+    await expect(pending).rejects.toThrow('Invalid transform');
+  });
+
+  it('should apply custom transform function', async () => {
+    // Custom transform: re-emits every incoming chunk in upper case.
+    const upperCase = (text: ReadableStream<string>) =>
+      new ReadableStream<string>({
+        start: async (controller) => {
+          const source = text.getReader();
+          let next = await source.read();
+          while (!next.done) {
+            controller.enqueue(next.value.toUpperCase());
+            next = await source.read();
+          }
+          controller.close();
+        },
+      });
+
+    const input = stringToStream('hello world');
+    const transformed = await applyTextTransforms(input, [upperCase]);
+    const output = await streamToString(transformed);
+    expect(output).toBe('HELLO WORLD');
+  });
+
+  it('should apply multiple transforms in sequence', async () => {
+    const markdownWithEmoji = stringToStream('**Price: $5** 🎉');
+    const transformed = await applyTextTransforms(markdownWithEmoji, [
+      'filter_markdown',
+      'filter_emoji',
+      'format_dollar_amounts',
+    ]);
+    const spoken = await streamToString(transformed);
+    // The label survives, the amount is spelled out, and the markup and emoji are gone.
+    expect(spoken).toContain('Price:');
+    expect(spoken).toContain('five dollars');
+    expect(spoken).not.toContain('**');
+    expect(spoken).not.toContain('🎉');
+  });
+
+  it('should find transforms across all languages without specifying language', async () => {
+    // Runs `input` through `transforms` and returns the fully collected output.
+    const run = async (
+      input: string,
+      transforms: Parameters<typeof applyTextTransforms>[1],
+    ): Promise<string> => {
+      const transformed = await applyTextTransforms(stringToStream(input), transforms);
+      return streamToString(transformed);
+    };
+
+    // An English transform resolves without any language configuration.
+    expect(await run('$5', ['format_dollar_amounts'])).toBe('five dollars');
+
+    // A German transform resolves without any language configuration.
+    expect(await run('5€', ['format_euro_amounts'])).toBe('fünf Euro');
+
+    // Transforms from different languages can be combined in one pipeline.
+    const mixed = await run('$5 and 5€', ['format_dollar_amounts', 'format_euro_amounts']);
+    expect(mixed).toBe('five dollars and fünf Euro');
+  });
+});