diff --git a/docs/adapters/gemini.md b/docs/adapters/gemini.md index a6019672..e8a938a6 100644 --- a/docs/adapters/gemini.md +++ b/docs/adapters/gemini.md @@ -3,7 +3,7 @@ title: Gemini Adapter id: gemini-adapter --- -The Google Gemini adapter provides access to Google's Gemini models, including text generation, embeddings, and image generation with Imagen. +The Google Gemini adapter provides access to Google's Gemini models, including text generation, embeddings, image generation with Imagen, and experimental text-to-speech. ## Installation @@ -75,6 +75,10 @@ const adapter = createGeminiText(process.env.GEMINI_API_KEY!, config); - `imagen-3.0-generate-002` - Imagen 3.0 - `gemini-2.0-flash-preview-image-generation` - Gemini with image generation +### Text-to-Speech Models (Experimental) + +- `gemini-2.5-flash-preview-tts` - Gemini TTS + ## Example: Chat Completion ```typescript @@ -269,6 +273,27 @@ const result = await ai({ }); ``` +## Text-to-Speech (Experimental) + +> **Note:** Gemini TTS is experimental and may require the Live API for full functionality. + +Generate speech from text: + +```typescript +import { ai } from "@tanstack/ai"; +import { geminiTTS } from "@tanstack/ai-gemini"; + +const adapter = geminiTTS(); + +const result = await ai({ + adapter, + model: "gemini-2.5-flash-preview-tts", + text: "Hello from Gemini TTS!", +}); + +console.log(result.audio); // Base64 encoded audio +``` + ## Environment Variables Set your API key in environment variables: @@ -340,6 +365,18 @@ Creates a Gemini image generation adapter with an explicit API key. **Returns:** A Gemini image adapter instance. +### `geminiTTS(config?)` + +Creates a Gemini TTS adapter using environment variables. + +**Returns:** A Gemini TTS adapter instance. + +### `createGeminiTTS(apiKey, config?)` + +Creates a Gemini TTS adapter with an explicit API key. + +**Returns:** A Gemini TTS adapter instance. + ## Next Steps - [Getting Started](../getting-started/quick-start) - Learn the basics diff --git a/docs/adapters/openai.md b/docs/adapters/openai.md index 27b45459..7c4bf4d9 100644 --- a/docs/adapters/openai.md +++ b/docs/adapters/openai.md @@ -3,7 +3,7 @@ title: OpenAI Adapter id: openai-adapter --- -The OpenAI adapter provides access to OpenAI's models, including GPT-4o, GPT-5, embeddings, and image generation (DALL-E). +The OpenAI adapter provides access to OpenAI's models, including GPT-4o, GPT-5, embeddings, image generation (DALL-E), text-to-speech (TTS), and audio transcription (Whisper). 
## Installation @@ -77,6 +77,18 @@ const adapter = createOpenaiText(process.env.OPENAI_API_KEY!, config); - `gpt-image-1` - Latest image generation model - `dall-e-3` - DALL-E 3 +### Text-to-Speech Models + +- `tts-1` - Standard TTS (fast) +- `tts-1-hd` - High-definition TTS +- `gpt-4o-audio-preview` - GPT-4o with audio output + +### Transcription Models + +- `whisper-1` - Whisper large-v2 +- `gpt-4o-transcribe` - GPT-4o transcription +- `gpt-4o-mini-transcribe` - GPT-4o Mini transcription + ## Example: Chat Completion ```typescript @@ -267,6 +279,83 @@ const result = await ai({ }); ``` +## Text-to-Speech + +Generate speech from text: + +```typescript +import { ai } from "@tanstack/ai"; +import { openaiTTS } from "@tanstack/ai-openai"; + +const adapter = openaiTTS(); + +const result = await ai({ + adapter, + model: "tts-1", + text: "Hello, welcome to TanStack AI!", + voice: "alloy", + format: "mp3", +}); + +// result.audio contains base64-encoded audio +console.log(result.format); // "mp3" +``` + +### TTS Voices + +Available voices: `alloy`, `echo`, `fable`, `onyx`, `nova`, `shimmer`, `ash`, `ballad`, `coral`, `sage`, `verse` + +### TTS Provider Options + +```typescript +const result = await ai({ + adapter: openaiTTS(), + model: "tts-1-hd", + text: "High quality speech", + providerOptions: { + speed: 1.0, // 0.25 to 4.0 + }, +}); +``` + +## Transcription + +Transcribe audio to text: + +```typescript +import { ai } from "@tanstack/ai"; +import { openaiTranscription } from "@tanstack/ai-openai"; + +const adapter = openaiTranscription(); + +const result = await ai({ + adapter, + model: "whisper-1", + audio: audioFile, // File object or base64 string + language: "en", +}); + +console.log(result.text); // Transcribed text +``` + +### Transcription Provider Options + +```typescript +const result = await ai({ + adapter: openaiTranscription(), + model: "whisper-1", + audio: audioFile, + providerOptions: { + response_format: "verbose_json", // Get timestamps + temperature: 0, + prompt: "Technical terms: API, SDK", + }, +}); + +// Access segments with timestamps +console.log(result.segments); +``` + ## Environment Variables Set your API key in environment variables: @@ -331,6 +420,30 @@ Creates an OpenAI image generation adapter with an explicit API key. **Returns:** An OpenAI image adapter instance. +### `openaiTTS(config?)` + +Creates an OpenAI TTS adapter using environment variables. + +**Returns:** An OpenAI TTS adapter instance. + +### `createOpenaiTTS(apiKey, config?)` + +Creates an OpenAI TTS adapter with an explicit API key. + +**Returns:** An OpenAI TTS adapter instance. + +### `openaiTranscription(config?)` + +Creates an OpenAI transcription adapter using environment variables. + +**Returns:** An OpenAI transcription adapter instance. + +### `createOpenaiTranscription(apiKey, config?)` + +Creates an OpenAI transcription adapter with an explicit API key. + +**Returns:** An OpenAI transcription adapter instance. 
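+
+The TTS and transcription adapters compose naturally: the base64 audio returned by `openaiTTS` can be passed straight back to `openaiTranscription`. A minimal sketch using only the models and options documented above (the round-trip itself is just an illustration):
+
+```typescript
+import { ai } from "@tanstack/ai";
+import { openaiTTS, openaiTranscription } from "@tanstack/ai-openai";
+
+// Generate speech...
+const speech = await ai({
+  adapter: openaiTTS(),
+  model: "tts-1",
+  text: "TanStack AI round-trip test.",
+  voice: "alloy",
+  format: "mp3",
+});
+
+// ...then transcribe it back; the adapter accepts a base64 string directly
+const transcript = await ai({
+  adapter: openaiTranscription(),
+  model: "whisper-1",
+  audio: speech.audio, // base64-encoded mp3 from the TTS result
+  language: "en",
+});
+
+console.log(transcript.text);
+```
+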
+ ## Next Steps - [Getting Started](../getting-started/quick-start) - Learn the basics diff --git a/docs/config.json b/docs/config.json index 372f4000..d8a12298 100644 --- a/docs/config.json +++ b/docs/config.json @@ -69,6 +69,14 @@ { "label": "Per-Model Type Safety", "to": "guides/per-model-type-safety" + }, + { + "label": "Text-to-Speech", + "to": "guides/text-to-speech" + }, + { + "label": "Transcription", + "to": "guides/transcription" } ] }, diff --git a/docs/guides/text-to-speech.md b/docs/guides/text-to-speech.md new file mode 100644 index 00000000..5a14deaa --- /dev/null +++ b/docs/guides/text-to-speech.md @@ -0,0 +1,248 @@ +# Text-to-Speech (TTS) + +TanStack AI provides support for text-to-speech generation through dedicated TTS adapters. This guide covers how to convert text into spoken audio using OpenAI and Gemini providers. + +## Overview + +Text-to-speech (TTS) is handled by TTS adapters that follow the same tree-shakeable architecture as other adapters in TanStack AI. The TTS adapters support: + +- **OpenAI**: TTS-1, TTS-1-HD, and audio-capable GPT-4o models +- **Gemini**: Gemini 2.5 Flash TTS (experimental) + +## Basic Usage + +### OpenAI Text-to-Speech + +```typescript +import { ai } from '@tanstack/ai' +import { openaiTTS } from '@tanstack/ai-openai' + +// Create a TTS adapter (uses OPENAI_API_KEY from environment) +const adapter = openaiTTS() + +// Generate speech from text +const result = await ai({ + adapter, + model: 'tts-1', + text: 'Hello, welcome to TanStack AI!', + voice: 'alloy', +}) + +// result.audio contains base64-encoded audio data +console.log(result.format) // 'mp3' +console.log(result.contentType) // 'audio/mpeg' +``` + +### Gemini Text-to-Speech (Experimental) + +```typescript +import { ai } from '@tanstack/ai' +import { geminiTTS } from '@tanstack/ai-gemini' + +// Create a TTS adapter (uses GOOGLE_API_KEY from environment) +const adapter = geminiTTS() + +// Generate speech from text +const result = await ai({ + adapter, + model: 'gemini-2.5-flash-preview-tts', + text: 'Hello from Gemini TTS!', +}) + +console.log(result.audio) // Base64 encoded audio +``` + +## Options + +### Common Options + +All TTS adapters support these common options: + +| Option | Type | Description | +|--------|------|-------------| +| `text` | `string` | The text to convert to speech (required) | +| `voice` | `string` | The voice to use for generation | +| `format` | `string` | Output audio format (e.g., "mp3", "wav") | + +### OpenAI Voice Options + +OpenAI provides several distinct voices: + +| Voice | Description | +|-------|-------------| +| `alloy` | Neutral, balanced voice | +| `echo` | Warm, conversational voice | +| `fable` | Expressive, storytelling voice | +| `onyx` | Deep, authoritative voice | +| `nova` | Friendly, upbeat voice | +| `shimmer` | Clear, gentle voice | +| `ash` | Calm, measured voice | +| `ballad` | Melodic, flowing voice | +| `coral` | Bright, energetic voice | +| `sage` | Wise, thoughtful voice | +| `verse` | Poetic, rhythmic voice | + +### OpenAI Format Options + +| Format | Description | +|--------|-------------| +| `mp3` | MP3 audio (default) | +| `opus` | Opus audio (good for streaming) | +| `aac` | AAC audio | +| `flac` | FLAC audio (lossless) | +| `wav` | WAV audio (uncompressed) | +| `pcm` | Raw PCM audio | + +## Provider Options + +### OpenAI Provider Options + +```typescript +const result = await ai({ + adapter: openaiTTS(), + model: 'tts-1-hd', + text: 'High quality speech synthesis', + voice: 'nova', + format: 'mp3', + providerOptions: { + 
speed: 1.0, // 0.25 to 4.0 + }, +}) +``` + +| Option | Type | Description | +|--------|------|-------------| +| `speed` | `number` | Playback speed (0.25 to 4.0, default 1.0) | +| `instructions` | `string` | Voice style instructions (GPT-4o audio models only) | + +> **Note:** The `instructions` and `stream_format` options are only available with `gpt-4o-audio-preview` and `gpt-4o-mini-audio-preview` models, not with `tts-1` or `tts-1-hd`. + +## Response Format + +The TTS result includes: + +```typescript +interface TTSResult { + id: string // Unique identifier for this generation + model: string // The model used + audio: string // Base64-encoded audio data + format: string // Audio format (e.g., "mp3") + contentType: string // MIME type (e.g., "audio/mpeg") + duration?: number // Duration in seconds (if available) +} +``` + +## Playing Audio in the Browser + +```typescript +// Convert base64 to audio and play +function playAudio(result: TTSResult) { + const audioData = atob(result.audio) + const bytes = new Uint8Array(audioData.length) + for (let i = 0; i < audioData.length; i++) { + bytes[i] = audioData.charCodeAt(i) + } + + const blob = new Blob([bytes], { type: result.contentType }) + const url = URL.createObjectURL(blob) + + const audio = new Audio(url) + audio.play() + + // Clean up when done + audio.onended = () => URL.revokeObjectURL(url) +} +``` + +## Saving Audio to File (Node.js) + +```typescript +import { writeFile } from 'fs/promises' + +async function saveAudio(result: TTSResult, filename: string) { + const audioBuffer = Buffer.from(result.audio, 'base64') + await writeFile(filename, audioBuffer) + console.log(`Saved to ${filename}`) +} + +// Usage +const result = await ai({ + adapter: openaiTTS(), + model: 'tts-1', + text: 'Hello world!', +}) + +await saveAudio(result, 'output.mp3') +``` + +## Model Availability + +### OpenAI Models + +| Model | Quality | Speed | Use Case | +|-------|---------|-------|----------| +| `tts-1` | Standard | Fast | Real-time applications | +| `tts-1-hd` | High | Slower | Production audio | +| `gpt-4o-audio-preview` | Highest | Variable | Advanced voice control | +| `gpt-4o-mini-audio-preview` | High | Fast | Balanced quality/speed | + +### Gemini Models + +| Model | Status | Notes | +|-------|--------|-------| +| `gemini-2.5-flash-preview-tts` | Experimental | May require Live API for full features | + +## Error Handling + +```typescript +try { + const result = await ai({ + adapter: openaiTTS(), + model: 'tts-1', + text: 'Hello!', + }) +} catch (error) { + if (error.message.includes('exceeds maximum length')) { + console.error('Text is too long (max 4096 characters)') + } else if (error.message.includes('Speed must be between')) { + console.error('Invalid speed value') + } else { + console.error('TTS error:', error.message) + } +} +``` + +## Environment Variables + +The TTS adapters use the same environment variables as other adapters: + +- **OpenAI**: `OPENAI_API_KEY` +- **Gemini**: `GOOGLE_API_KEY` or `GEMINI_API_KEY` + +## Explicit API Keys + +For production use or when you need explicit control: + +```typescript +import { createOpenaiTTS } from '@tanstack/ai-openai' +import { createGeminiTTS } from '@tanstack/ai-gemini' + +// OpenAI +const openaiAdapter = createOpenaiTTS('your-openai-api-key') + +// Gemini +const geminiAdapter = createGeminiTTS('your-google-api-key') +``` + +## Best Practices + +1. **Text Length**: OpenAI TTS supports up to 4096 characters per request. For longer content, split into chunks. + +2. 
**Voice Selection**: Choose voices appropriate for your content—use `onyx` for authoritative content, `nova` for friendly interactions. + +3. **Format Selection**: Use `mp3` for general use, `opus` for streaming, `wav` for further processing. + +4. **Caching**: Cache generated audio to avoid regenerating the same content. + +5. **Error Handling**: Always handle errors gracefully, especially for user-facing applications. + diff --git a/docs/guides/transcription.md b/docs/guides/transcription.md new file mode 100644 index 00000000..ff55ae14 --- /dev/null +++ b/docs/guides/transcription.md @@ -0,0 +1,337 @@ +# Audio Transcription + +TanStack AI provides support for audio transcription (speech-to-text) through dedicated transcription adapters. This guide covers how to convert spoken audio into text using OpenAI's Whisper and GPT-4o transcription models. + +## Overview + +Audio transcription is handled by transcription adapters that follow the same tree-shakeable architecture as other adapters in TanStack AI. + +Currently supported: +- **OpenAI**: Whisper-1, GPT-4o-transcribe, GPT-4o-mini-transcribe + +## Basic Usage + +### OpenAI Transcription + +```typescript +import { ai } from '@tanstack/ai' +import { openaiTranscription } from '@tanstack/ai-openai' + +// Create a transcription adapter (uses OPENAI_API_KEY from environment) +const adapter = openaiTranscription() + +// Transcribe audio from a file +const audioFile = new File([audioBuffer], 'audio.mp3', { type: 'audio/mpeg' }) + +const result = await ai({ + adapter, + model: 'whisper-1', + audio: audioFile, + language: 'en', +}) + +console.log(result.text) // The transcribed text +``` + +### Using Base64 Audio + +```typescript +import { readFile } from 'fs/promises' + +// Read audio file as base64 +const audioBuffer = await readFile('recording.mp3') +const base64Audio = audioBuffer.toString('base64') + +const result = await ai({ + adapter: openaiTranscription(), + model: 'whisper-1', + audio: base64Audio, +}) + +console.log(result.text) +``` + +### Using Data URLs + +```typescript +const dataUrl = `data:audio/mpeg;base64,${base64AudioData}` + +const result = await ai({ + adapter: openaiTranscription(), + model: 'whisper-1', + audio: dataUrl, +}) +``` + +## Options + +### Common Options + +| Option | Type | Description | +|--------|------|-------------| +| `audio` | `File \| string` | Audio data (File object or base64 string) - required | +| `language` | `string` | Language code (e.g., "en", "es", "fr") | + +### Supported Languages + +Whisper supports many languages. Common codes include: + +| Code | Language | +|------|----------| +| `en` | English | +| `es` | Spanish | +| `fr` | French | +| `de` | German | +| `it` | Italian | +| `pt` | Portuguese | +| `ja` | Japanese | +| `ko` | Korean | +| `zh` | Chinese | +| `ru` | Russian | + +> **Tip:** Providing the correct language code improves accuracy and reduces latency. 
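+
+If your application only has a full BCP-47 locale (for example `navigator.language` returning `en-US`), the primary language subtag is typically what you want to pass. A small sketch, assuming `audioFile` is in scope as in the examples above — the `toWhisperLanguage` helper is illustrative, not part of the API:
+
+```typescript
+import { ai } from '@tanstack/ai'
+import { openaiTranscription } from '@tanstack/ai-openai'
+
+// Illustrative helper: reduce a BCP-47 locale ('en-US', 'pt-BR')
+// to the two-letter code shown in the table above ('en', 'pt')
+function toWhisperLanguage(locale: string): string {
+  return locale.toLowerCase().split('-')[0] ?? locale
+}
+
+const result = await ai({
+  adapter: openaiTranscription(),
+  model: 'whisper-1',
+  audio: audioFile,
+  language: toWhisperLanguage(navigator.language), // e.g. 'en-US' -> 'en'
+})
+```
+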
+ +## Provider Options + +### OpenAI Provider Options + +```typescript +const result = await ai({ + adapter: openaiTranscription(), + model: 'whisper-1', + audio: audioFile, + providerOptions: { + response_format: 'verbose_json', // Get detailed output with timestamps + temperature: 0, // Lower = more deterministic + prompt: 'Technical terms: API, SDK, CLI', // Guide transcription + }, +}) +``` + +| Option | Type | Description | +|--------|------|-------------| +| `response_format` | `string` | Output format: "json", "text", "srt", "verbose_json", "vtt" | +| `temperature` | `number` | Sampling temperature (0 to 1) | +| `prompt` | `string` | Optional text to guide transcription style | +| `include` | `string[]` | Timestamp granularity: ["word"], ["segment"], or both | + +### Response Formats + +| Format | Description | +|--------|-------------| +| `json` | Simple JSON with text | +| `text` | Plain text only | +| `srt` | SubRip subtitle format | +| `verbose_json` | Detailed JSON with timestamps and segments | +| `vtt` | WebVTT subtitle format | + +## Response Format + +The transcription result includes: + +```typescript +interface TranscriptionResult { + id: string // Unique identifier + model: string // Model used + text: string // Full transcribed text + language?: string // Detected/specified language + duration?: number // Audio duration in seconds + segments?: Array<{ // Timestamped segments + start: number // Start time in seconds + end: number // End time in seconds + text: string // Segment text + words?: Array<{ // Word-level timestamps + word: string + start: number + end: number + confidence?: number + }> + }> +} +``` + +## Complete Example + +```typescript +import { ai } from '@tanstack/ai' +import { openaiTranscription } from '@tanstack/ai-openai' +import { readFile } from 'fs/promises' + +async function transcribeAudio(filepath: string) { + const adapter = openaiTranscription() + + // Read the audio file + const audioBuffer = await readFile(filepath) + const audioFile = new File( + [audioBuffer], + filepath.split('/').pop()!, + { type: 'audio/mpeg' } + ) + + // Transcribe with detailed output + const result = await ai({ + adapter, + model: 'whisper-1', + audio: audioFile, + language: 'en', + providerOptions: { + response_format: 'verbose_json', + include: ['segment', 'word'], + }, + }) + + console.log('Full text:', result.text) + console.log('Duration:', result.duration, 'seconds') + + // Print segments with timestamps + if (result.segments) { + for (const segment of result.segments) { + console.log(`[${segment.start.toFixed(2)}s - ${segment.end.toFixed(2)}s]: ${segment.text}`) + } + } + + return result +} + +// Usage +await transcribeAudio('./meeting-recording.mp3') +``` + +## Model Availability + +### OpenAI Models + +| Model | Description | Use Case | +|-------|-------------|----------| +| `whisper-1` | Whisper large-v2 | General transcription | +| `gpt-4o-transcribe` | GPT-4o-based transcription | Higher accuracy | +| `gpt-4o-transcribe-diarize` | With speaker diarization | Multi-speaker audio | +| `gpt-4o-mini-transcribe` | Faster, lighter model | Cost-effective | + +### Supported Audio Formats + +OpenAI supports these audio formats: + +- `mp3` - MPEG Audio Layer 3 +- `mp4` - MPEG-4 Audio +- `mpeg` - MPEG Audio +- `mpga` - MPEG Audio +- `m4a` - MPEG-4 Audio +- `wav` - Waveform Audio +- `webm` - WebM Audio +- `flac` - Free Lossless Audio Codec +- `ogg` - Ogg Vorbis + +> **Note:** Maximum file size is 25 MB. 
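+
+Because oversized or unsupported files only fail after the upload, it can be worth validating them up front. A minimal sketch using the format list and 25 MB limit above, assuming `audioFile` is in scope — `assertTranscribable` is an illustrative helper, not part of the API:
+
+```typescript
+import { ai } from '@tanstack/ai'
+import { openaiTranscription } from '@tanstack/ai-openai'
+
+const SUPPORTED_EXTENSIONS = ['mp3', 'mp4', 'mpeg', 'mpga', 'm4a', 'wav', 'webm', 'flac', 'ogg']
+const MAX_BYTES = 25 * 1024 * 1024 // 25 MB
+
+// Illustrative pre-flight check using the limits described above
+function assertTranscribable(file: File): void {
+  const extension = file.name.split('.').pop()?.toLowerCase() ?? ''
+  if (!SUPPORTED_EXTENSIONS.includes(extension)) {
+    throw new Error(`Unsupported audio format: .${extension}`)
+  }
+  if (file.size > MAX_BYTES) {
+    throw new Error(`File is ${(file.size / (1024 * 1024)).toFixed(1)} MB; the limit is 25 MB`)
+  }
+}
+
+assertTranscribable(audioFile)
+
+const result = await ai({
+  adapter: openaiTranscription(),
+  model: 'whisper-1',
+  audio: audioFile,
+})
+```
+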
+ +## Browser Usage + +### Recording and Transcribing + +```typescript +async function recordAndTranscribe() { + // Request microphone access + const stream = await navigator.mediaDevices.getUserMedia({ audio: true }) + const mediaRecorder = new MediaRecorder(stream) + const chunks: Blob[] = [] + + mediaRecorder.ondataavailable = (e) => chunks.push(e.data) + + mediaRecorder.onstop = async () => { + const audioBlob = new Blob(chunks, { type: 'audio/webm' }) + const audioFile = new File([audioBlob], 'recording.webm', { type: 'audio/webm' }) + + // Send to your API endpoint for transcription + const formData = new FormData() + formData.append('audio', audioFile) + + const response = await fetch('/api/transcribe', { + method: 'POST', + body: formData, + }) + + const result = await response.json() + console.log('Transcription:', result.text) + } + + // Start recording + mediaRecorder.start() + + // Stop after 10 seconds + setTimeout(() => mediaRecorder.stop(), 10000) +} +``` + +### Server API Endpoint + +```typescript +// api/transcribe.ts +import { ai } from '@tanstack/ai' +import { openaiTranscription } from '@tanstack/ai-openai' + +export async function POST(request: Request) { + const formData = await request.formData() + const audioFile = formData.get('audio') as File + + const adapter = openaiTranscription() + + const result = await ai({ + adapter, + model: 'whisper-1', + audio: audioFile, + }) + + return Response.json(result) +} +``` + +## Error Handling + +```typescript +try { + const result = await ai({ + adapter: openaiTranscription(), + model: 'whisper-1', + audio: audioFile, + }) +} catch (error) { + if (error.message.includes('Invalid file format')) { + console.error('Unsupported audio format') + } else if (error.message.includes('File too large')) { + console.error('Audio file exceeds 25 MB limit') + } else if (error.message.includes('Audio file is too short')) { + console.error('Audio must be at least 0.1 seconds') + } else { + console.error('Transcription error:', error.message) + } +} +``` + +## Environment Variables + +The transcription adapter uses: + +- `OPENAI_API_KEY`: Your OpenAI API key + +## Explicit API Keys + +```typescript +import { createOpenaiTranscription } from '@tanstack/ai-openai' + +const adapter = createOpenaiTranscription('your-openai-api-key') +``` + +## Best Practices + +1. **Audio Quality**: Better audio quality leads to more accurate transcriptions. Reduce background noise when possible. + +2. **Language Specification**: Always specify the language if known—this improves accuracy and speed. + +3. **File Size**: Keep audio files under 25 MB. For longer recordings, split into chunks. + +4. **Format Selection**: MP3 offers a good balance of quality and size. Use WAV or FLAC for highest quality. + +5. **Prompting**: Use the `prompt` option to provide context or expected vocabulary (e.g., technical terms, names). + +6. **Timestamps**: Request `verbose_json` format and enable `include: ['word', 'segment']` when you need timing information for captions or synchronization. + diff --git a/docs/guides/video-generation.md b/docs/guides/video-generation.md new file mode 100644 index 00000000..54f61258 --- /dev/null +++ b/docs/guides/video-generation.md @@ -0,0 +1,331 @@ +# Video Generation (Experimental) + +> **⚠️ EXPERIMENTAL FEATURE WARNING** +> +> Video generation is an **experimental feature** that is subject to significant changes. Please read the caveats below carefully before using this feature. 
+> +> **Key Caveats:** +> - The API may change without notice in future versions +> - OpenAI's Sora API is in limited availability and may require organization verification +> - Video generation uses a jobs/polling architecture, which differs from other synchronous activities +> - Pricing, rate limits, and quotas may vary and are subject to change +> - Not all features described here may be available in your OpenAI account + +## Overview + +TanStack AI provides experimental support for video generation through dedicated video adapters. Unlike image generation, video generation is an **asynchronous operation** that uses a jobs/polling pattern: + +1. **Create a job** - Submit a prompt and receive a job ID +2. **Poll for status** - Check the job status until it's complete +3. **Retrieve the video** - Get the URL to download/view the generated video + +Currently supported: +- **OpenAI**: Sora-2 and Sora-2-Pro models (when available) + +## Basic Usage + +### Creating a Video Job + +```typescript +import { ai } from '@tanstack/ai' +import { openaiVideo } from '@tanstack/ai-openai' + +// Create a video adapter (uses OPENAI_API_KEY from environment) +const adapter = openaiVideo() + +// Start a video generation job +const { jobId, model } = await ai({ + adapter, + model: 'sora-2', + prompt: 'A golden retriever puppy playing in a field of sunflowers', +}) + +console.log('Job started:', jobId) +``` + +### Polling for Status + +```typescript +// Check the status of the job +const status = await ai({ + adapter, + model: 'sora-2', + jobId, + request: 'status', +}) + +console.log('Status:', status.status) // 'pending' | 'processing' | 'completed' | 'failed' +console.log('Progress:', status.progress) // 0-100 (if available) + +if (status.status === 'failed') { + console.error('Error:', status.error) +} +``` + +### Getting the Video URL + +```typescript +// Only call this after status is 'completed' +const { url, expiresAt } = await ai({ + adapter, + model: 'sora-2', + jobId, + request: 'url', +}) + +console.log('Video URL:', url) +console.log('Expires at:', expiresAt) +``` + +### Complete Example with Polling Loop + +```typescript +import { ai } from '@tanstack/ai' +import { openaiVideo } from '@tanstack/ai-openai' + +async function generateVideo(prompt: string) { + const adapter = openaiVideo() + + // 1. Create the job + const { jobId } = await ai({ + adapter, + model: 'sora-2', + prompt, + size: '1280x720', + duration: 8, // 4, 8, or 12 seconds + }) + + console.log('Job created:', jobId) + + // 2. Poll for completion + let status = 'pending' + while (status !== 'completed' && status !== 'failed') { + // Wait 5 seconds between polls + await new Promise((resolve) => setTimeout(resolve, 5000)) + + const result = await ai({ + adapter, + model: 'sora-2', + jobId, + request: 'status', + }) + + status = result.status + console.log(`Status: ${status}${result.progress ? ` (${result.progress}%)` : ''}`) + + if (result.status === 'failed') { + throw new Error(result.error || 'Video generation failed') + } + } + + // 3. 
Get the video URL + const { url } = await ai({ + adapter, + model: 'sora-2', + jobId, + request: 'url', + }) + + return url +} + +// Usage +const videoUrl = await generateVideo('A cat playing piano in a jazz bar') +console.log('Video ready:', videoUrl) +``` + +## Options + +### Job Creation Options + +| Option | Type | Description | +|--------|------|-------------| +| `prompt` | `string` | Text description of the video to generate (required) | +| `size` | `string` | Video resolution in WIDTHxHEIGHT format | +| `duration` | `number` | Video duration in seconds (maps to `seconds` parameter in API) | +| `providerOptions` | `object` | Provider-specific options | + +### Supported Sizes + +Based on [OpenAI API docs](https://platform.openai.com/docs/api-reference/videos/create): + +| Size | Description | +|------|-------------| +| `1280x720` | 720p landscape (16:9) - default | +| `720x1280` | 720p portrait (9:16) | +| `1792x1024` | Wide landscape | +| `1024x1792` | Tall portrait | + +### Supported Durations + +The API uses the `seconds` parameter. Allowed values: + +- `4` seconds +- `8` seconds (default) +- `12` seconds + +## Provider Options + +### OpenAI Provider Options + +Based on the [OpenAI Sora API](https://platform.openai.com/docs/api-reference/videos/create): + +```typescript +const { jobId } = await ai({ + adapter, + model: 'sora-2', + prompt: 'A beautiful sunset over the ocean', + size: '1280x720', // '1280x720', '720x1280', '1792x1024', '1024x1792' + duration: 8, // 4, 8, or 12 seconds + providerOptions: { + size: '1280x720', // Alternative way to specify size + seconds: 8, // Alternative way to specify duration + } +}) +``` + +## Response Types + +### VideoJobResult (from create) + +```typescript +interface VideoJobResult { + jobId: string // Unique job identifier for polling + model: string // Model used for generation +} +``` + +### VideoStatusResult (from status) + +```typescript +interface VideoStatusResult { + jobId: string + status: 'pending' | 'processing' | 'completed' | 'failed' + progress?: number // 0-100, if available + error?: string // Error message if failed +} +``` + +### VideoUrlResult (from url) + +```typescript +interface VideoUrlResult { + jobId: string + url: string // URL to download/stream the video + expiresAt?: Date // When the URL expires +} +``` + +## Model Variants + +| Model | Description | Use Case | +|-------|-------------|----------| +| `sora-2` | Faster generation, good quality | Rapid iteration, prototyping | +| `sora-2-pro` | Higher quality, slower | Production-quality output | + +## Error Handling + +Video generation can fail for various reasons. Always implement proper error handling: + +```typescript +try { + const { jobId } = await ai({ + adapter, + model: 'sora-2', + prompt: 'A scene', + }) + + // Poll for status... + const status = await ai({ + adapter, + model: 'sora-2', + jobId, + request: 'status', + }) + + if (status.status === 'failed') { + console.error('Generation failed:', status.error) + // Handle failure (e.g., retry, notify user) + } +} catch (error) { + if (error.message.includes('Video generation API is not available')) { + console.error('Sora API access may be required. Check your OpenAI account.') + } else if (error.message.includes('rate limit')) { + console.error('Rate limited. Please wait before trying again.') + } else { + console.error('Unexpected error:', error) + } +} +``` + +## Rate Limits and Quotas + +> **⚠️ Note:** Rate limits and quotas for video generation are subject to change and may vary by account tier. 
+ +Typical considerations: +- Video generation is computationally expensive +- Concurrent job limits may apply +- Monthly generation quotas may exist +- Longer/higher-quality videos consume more quota + +Check the [OpenAI documentation](https://platform.openai.com/docs) for current limits. + +## Environment Variables + +The video adapter uses the same environment variable as other OpenAI adapters: + +- `OPENAI_API_KEY`: Your OpenAI API key + +## Explicit API Keys + +For production use or when you need explicit control: + +```typescript +import { createOpenaiVideo } from '@tanstack/ai-openai' + +const adapter = createOpenaiVideo('your-openai-api-key') +``` + +## Differences from Image Generation + +| Aspect | Image Generation | Video Generation | +|--------|-----------------|------------------| +| API Type | Synchronous | Jobs/Polling | +| Return Type | `ImageGenerationResult` | `VideoJobResult` → `VideoStatusResult` → `VideoUrlResult` | +| Wait Time | Seconds | Minutes | +| Multiple Outputs | `numberOfImages` option | Not supported | +| Options Field | `prompt`, `size`, `numberOfImages` | `prompt`, `size`, `duration` | + +## Known Limitations + +> **⚠️ These limitations are subject to change as the feature evolves.** + +1. **API Availability**: The Sora API may not be available in all OpenAI accounts +2. **Generation Time**: Video generation can take several minutes +3. **URL Expiration**: Generated video URLs may expire after a certain period +4. **No Real-time Progress**: Progress updates may be limited or delayed +5. **Audio Limitations**: Audio generation support may be limited +6. **Prompt Length**: Long prompts may be truncated + +## Best Practices + +1. **Implement Timeouts**: Set reasonable timeouts for the polling loop +2. **Handle Failures Gracefully**: Have fallback behavior for failed generations +3. **Cache URLs**: Store video URLs and check expiration before re-fetching +4. **User Feedback**: Show clear progress indicators during generation +5. **Validate Prompts**: Check prompt length and content before submission +6. **Monitor Usage**: Track generation usage to avoid hitting quotas + +## Future Considerations + +This feature is experimental. Future versions may include: + +- Additional video models and providers +- Streaming progress updates +- Video editing and manipulation +- Audio track generation +- Batch video generation +- Custom style/aesthetic controls + +Stay tuned to the [TanStack AI changelog](https://github.com/TanStack/ai/blob/main/CHANGELOG.md) for updates. 
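+
+## Appendix: Polling With a Timeout
+
+The Best Practices above recommend bounding the polling loop and handling failures gracefully. A minimal sketch of both, using only the documented `status` and `url` requests — the `pollVideoJob` helper is illustrative and assumes the job was already created with `sora-2`:
+
+```typescript
+import { ai } from '@tanstack/ai'
+import { openaiVideo } from '@tanstack/ai-openai'
+
+// Illustrative helper: poll a job with a deadline and surface failures
+// instead of looping forever (Best Practices 1 and 2)
+async function pollVideoJob(jobId: string, timeoutMs = 10 * 60 * 1000) {
+  const adapter = openaiVideo()
+  const deadline = Date.now() + timeoutMs
+
+  while (Date.now() < deadline) {
+    const status = await ai({
+      adapter,
+      model: 'sora-2',
+      jobId,
+      request: 'status',
+    })
+
+    if (status.status === 'completed') {
+      // Job finished - fetch the download URL
+      return await ai({ adapter, model: 'sora-2', jobId, request: 'url' })
+    }
+    if (status.status === 'failed') {
+      throw new Error(status.error || 'Video generation failed')
+    }
+
+    // Wait 5 seconds between polls
+    await new Promise((resolve) => setTimeout(resolve, 5000))
+  }
+
+  throw new Error(`Timed out waiting for video job ${jobId}`)
+}
+```
+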
+ diff --git a/packages/typescript/ai-client/src/chat-client.ts b/packages/typescript/ai-client/src/chat-client.ts index 3b9e1787..1d1ba091 100644 --- a/packages/typescript/ai-client/src/chat-client.ts +++ b/packages/typescript/ai-client/src/chat-client.ts @@ -26,6 +26,7 @@ export class ChatClient { private clientToolsRef: { current: Map } private currentStreamId: string | null = null private currentMessageId: string | null = null + private postStreamActions: Array<() => Promise> = [] private callbacksRef: { current: { @@ -323,6 +324,9 @@ export class ChatClient { } finally { this.abortController = null this.setIsLoading(false) + + // Drain any actions that were queued while the stream was in progress + await this.drainPostStreamActions() } } @@ -394,10 +398,13 @@ export class ChatClient { result.errorText, ) - // Check if we should auto-send - if (this.shouldAutoSend()) { - await this.continueFlow() + // If stream is in progress, queue continuation check for after it ends + if (this.isLoading) { + this.queuePostStreamAction(() => this.checkForContinuation()) + return } + + await this.checkForContinuation() } /** @@ -433,18 +440,39 @@ export class ChatClient { // Add response via processor this.processor.addToolApprovalResponse(response.id, response.approved) - // Check if we should auto-send - if (this.shouldAutoSend()) { - await this.continueFlow() + // If stream is in progress, queue continuation check for after it ends + if (this.isLoading) { + this.queuePostStreamAction(() => this.checkForContinuation()) + return } + + await this.checkForContinuation() } /** - * Continue the agent flow with current messages + * Queue an action to be executed after the current stream ends */ - private async continueFlow(): Promise { - if (this.isLoading) return - await this.streamResponse() + private queuePostStreamAction(action: () => Promise): void { + this.postStreamActions.push(action) + } + + /** + * Drain and execute all queued post-stream actions + */ + private async drainPostStreamActions(): Promise { + while (this.postStreamActions.length > 0) { + const action = this.postStreamActions.shift()! + await action() + } + } + + /** + * Check if we should continue the flow and do so if needed + */ + private async checkForContinuation(): Promise { + if (this.shouldAutoSend()) { + await this.streamResponse() + } } /** diff --git a/packages/typescript/ai-gemini/src/adapters/tts.ts b/packages/typescript/ai-gemini/src/adapters/tts.ts new file mode 100644 index 00000000..1d72f8a9 --- /dev/null +++ b/packages/typescript/ai-gemini/src/adapters/tts.ts @@ -0,0 +1,192 @@ +import { BaseTTSAdapter } from '@tanstack/ai/adapters' +import { GEMINI_TTS_MODELS } from '../model-meta' +import { + createGeminiClient, + generateId, + getGeminiApiKeyFromEnv, +} from '../utils' +import type { TTSOptions, TTSResult } from '@tanstack/ai' +import type { GoogleGenAI } from '@google/genai' +import type { GeminiClientConfig } from '../utils' + +/** + * Provider-specific options for Gemini TTS + * + * @experimental Gemini TTS is an experimental feature and uses the Live API. + */ +export interface GeminiTTSProviderOptions { + /** + * Voice configuration for TTS. + * Note: Gemini TTS uses the Live API which has limited configuration options. + */ + voiceConfig?: { + prebuiltVoiceConfig?: { + voiceName?: string + } + } +} + +/** + * Configuration for Gemini TTS adapter + * + * @experimental Gemini TTS is an experimental feature. 
+ */ +export interface GeminiTTSConfig extends GeminiClientConfig {} + +/** + * Gemini Text-to-Speech Adapter + * + * Tree-shakeable adapter for Gemini TTS functionality. + * + * **IMPORTANT**: Gemini TTS uses the Live API (WebSocket-based) which requires + * different handling than traditional REST APIs. This adapter provides a + * simplified interface but may have limitations. + * + * @experimental Gemini TTS is an experimental feature and may change. + * + * Models: + * - gemini-2.5-flash-preview-tts + */ +export class GeminiTTSAdapter extends BaseTTSAdapter< + typeof GEMINI_TTS_MODELS, + GeminiTTSProviderOptions +> { + readonly name = 'gemini' as const + readonly models = GEMINI_TTS_MODELS + + private client: GoogleGenAI + + constructor(config: GeminiTTSConfig) { + super(config) + this.client = createGeminiClient(config) + } + + /** + * Generate speech from text using Gemini's TTS model. + * + * Note: Gemini's TTS functionality uses the Live API, which is WebSocket-based. + * This implementation uses the multimodal generation endpoint with audio output + * configuration, which may have different capabilities than the full Live API. + * + * @experimental This implementation is experimental and may change. + */ + async generateSpeech( + options: TTSOptions, + ): Promise { + const { model, text, providerOptions } = options + + // Use Gemini's multimodal content generation with audio output + // Note: This requires the model to support audio output + const voiceConfig = providerOptions?.voiceConfig || { + prebuiltVoiceConfig: { + voiceName: 'Kore', // Default Gemini voice + }, + } + + const response = await this.client.models.generateContent({ + model, + contents: [ + { + role: 'user', + parts: [{ text: `Please speak the following text: ${text}` }], + }, + ], + config: { + // Configure for audio output + responseModalities: ['AUDIO'], + speechConfig: { + voiceConfig, + }, + }, + }) + + // Extract audio data from response + const candidate = response.candidates?.[0] + const parts = candidate?.content?.parts + + if (!parts || parts.length === 0) { + throw new Error('No audio output received from Gemini TTS') + } + + // Look for inline data (audio) + const audioPart = parts.find((part: any) => + part.inlineData?.mimeType?.startsWith('audio/'), + ) + + if (!audioPart || !('inlineData' in audioPart)) { + throw new Error('No audio data in Gemini TTS response') + } + + const inlineData = (audioPart as any).inlineData + const audioBase64 = inlineData.data + const mimeType = inlineData.mimeType || 'audio/wav' + const format = mimeType.split('/')[1] || 'wav' + + return { + id: generateId(this.name), + model, + audio: audioBase64, + format, + contentType: mimeType, + } + } +} + +/** + * Creates a Gemini TTS adapter with explicit API key + * + * @experimental Gemini TTS is an experimental feature and may change. + * + * @param apiKey - Your Google API key + * @param config - Optional additional configuration + * @returns Configured Gemini TTS adapter instance + * + * @example + * ```typescript + * const adapter = createGeminiTTS("your-api-key"); + * + * const result = await ai({ + * adapter, + * model: 'gemini-2.5-flash-preview-tts', + * text: 'Hello, world!' + * }); + * ``` + */ +export function createGeminiTTS( + apiKey: string, + config?: Omit, +): GeminiTTSAdapter { + return new GeminiTTSAdapter({ apiKey, ...config }) +} + +/** + * Creates a Gemini TTS adapter with automatic API key detection from environment variables. + * + * @experimental Gemini TTS is an experimental feature and may change. 
+ * + * Looks for `GOOGLE_API_KEY` or `GEMINI_API_KEY` in: + * - `process.env` (Node.js) + * - `window.env` (Browser with injected env) + * + * @param config - Optional configuration (excluding apiKey which is auto-detected) + * @returns Configured Gemini TTS adapter instance + * @throws Error if GOOGLE_API_KEY or GEMINI_API_KEY is not found in environment + * + * @example + * ```typescript + * // Automatically uses GOOGLE_API_KEY from environment + * const adapter = geminiTTS(); + * + * const result = await ai({ + * adapter, + * model: 'gemini-2.5-flash-preview-tts', + * text: 'Welcome to TanStack AI!' + * }); + * ``` + */ +export function geminiTTS( + config?: Omit, +): GeminiTTSAdapter { + const apiKey = getGeminiApiKeyFromEnv() + return createGeminiTTS(apiKey, config) +} diff --git a/packages/typescript/ai-gemini/src/index.ts b/packages/typescript/ai-gemini/src/index.ts index 76a2ba23..3330af93 100644 --- a/packages/typescript/ai-gemini/src/index.ts +++ b/packages/typescript/ai-gemini/src/index.ts @@ -50,9 +50,22 @@ export type { ImagePromptLanguage, } from './image/image-provider-options' +// TTS adapter (experimental) +/** + * @experimental Gemini TTS is an experimental feature and may change. + */ +export { + GeminiTTSAdapter, + createGeminiTTS, + geminiTTS, + type GeminiTTSConfig, + type GeminiTTSProviderOptions, +} from './adapters/tts' + // Re-export models from model-meta for convenience export { GEMINI_MODELS as GeminiTextModels } from './model-meta' export { GEMINI_IMAGE_MODELS as GeminiImageModels } from './model-meta' +export { GEMINI_TTS_MODELS as GeminiTTSModels } from './model-meta' export type { GeminiModels as GeminiTextModel } from './model-meta' export type { GeminiImageModels as GeminiImageModel } from './model-meta' diff --git a/packages/typescript/ai-gemini/src/model-meta.ts b/packages/typescript/ai-gemini/src/model-meta.ts index 034cc65c..bae5cb5f 100644 --- a/packages/typescript/ai-gemini/src/model-meta.ts +++ b/packages/typescript/ai-gemini/src/model-meta.ts @@ -794,6 +794,12 @@ export const GEMINI_IMAGE_MODELS = [ export const GEMINI_EMBEDDING_MODELS = [GEMINI_EMBEDDING.name] as const +/** + * Text-to-speech models + * @experimental Gemini TTS is an experimental feature and may change. + */ +export const GEMINI_TTS_MODELS = ['gemini-2.5-flash-preview-tts'] as const + /* const GEMINI_AUDIO_MODELS = [ GEMINI_2_5_PRO_TTS.name, GEMINI_2_5_FLASH_TTS.name, diff --git a/packages/typescript/ai-openai/src/adapters/transcription.ts b/packages/typescript/ai-openai/src/adapters/transcription.ts new file mode 100644 index 00000000..7bb754e6 --- /dev/null +++ b/packages/typescript/ai-openai/src/adapters/transcription.ts @@ -0,0 +1,239 @@ +import { BaseTranscriptionAdapter } from '@tanstack/ai/adapters' +import { OPENAI_TRANSCRIPTION_MODELS } from '../model-meta' +import { + createOpenAIClient, + generateId, + getOpenAIApiKeyFromEnv, +} from '../utils' +import type { OpenAITranscriptionProviderOptions } from '../audio/transcription-provider-options' +import type { + TranscriptionOptions, + TranscriptionResult, + TranscriptionSegment, +} from '@tanstack/ai' +import type OpenAI_SDK from 'openai' +import type { OpenAIClientConfig } from '../utils' + +/** + * Configuration for OpenAI Transcription adapter + */ +export interface OpenAITranscriptionConfig extends OpenAIClientConfig {} + +/** + * OpenAI Transcription (Speech-to-Text) Adapter + * + * Tree-shakeable adapter for OpenAI audio transcription functionality. 
+ * Supports whisper-1, gpt-4o-transcribe, gpt-4o-mini-transcribe, and gpt-4o-transcribe-diarize models. + * + * Features: + * - Multiple transcription models with different capabilities + * - Language detection or specification + * - Multiple output formats: json, text, srt, verbose_json, vtt + * - Word and segment-level timestamps (with verbose_json) + * - Speaker diarization (with gpt-4o-transcribe-diarize) + */ +export class OpenAITranscriptionAdapter extends BaseTranscriptionAdapter< + typeof OPENAI_TRANSCRIPTION_MODELS, + OpenAITranscriptionProviderOptions +> { + readonly name = 'openai' as const + readonly models = OPENAI_TRANSCRIPTION_MODELS + + private client: OpenAI_SDK + + constructor(config: OpenAITranscriptionConfig) { + super(config) + this.client = createOpenAIClient(config) + } + + async transcribe( + options: TranscriptionOptions, + ): Promise { + const { model, audio, language, prompt, responseFormat, providerOptions } = + options + + // Convert audio input to File object + const file = this.prepareAudioFile(audio) + + // Build request + const request: OpenAI_SDK.Audio.TranscriptionCreateParams = { + model, + file, + language, + prompt, + response_format: this.mapResponseFormat(responseFormat), + ...providerOptions, + } + + // Call OpenAI API - use verbose_json to get timestamps when available + const useVerbose = + responseFormat === 'verbose_json' || + (!responseFormat && model !== 'whisper-1') + + if (useVerbose) { + request.response_format = 'verbose_json' + const response = (await this.client.audio.transcriptions.create( + request, + )) as OpenAI_SDK.Audio.Transcription & { + segments?: Array<{ + id: number + start: number + end: number + text: string + avg_logprob?: number + }> + words?: Array<{ + word: string + start: number + end: number + }> + duration?: number + language?: string + } + + return { + id: generateId(this.name), + model, + text: response.text, + language: response.language, + duration: response.duration, + segments: response.segments?.map( + (seg): TranscriptionSegment => ({ + id: seg.id, + start: seg.start, + end: seg.end, + text: seg.text, + confidence: seg.avg_logprob ? Math.exp(seg.avg_logprob) : undefined, + }), + ), + words: response.words?.map((w) => ({ + word: w.word, + start: w.start, + end: w.end, + })), + } + } else { + const response = await this.client.audio.transcriptions.create(request) + + return { + id: generateId(this.name), + model, + text: typeof response === 'string' ? 
response : response.text, + language, + } + } + } + + private prepareAudioFile(audio: string | File | Blob | ArrayBuffer): File { + // If already a File, return it + if (audio instanceof File) { + return audio + } + + // If Blob, convert to File + if (audio instanceof Blob) { + return new File([audio], 'audio.mp3', { + type: audio.type || 'audio/mpeg', + }) + } + + // If ArrayBuffer, convert to File + if (audio instanceof ArrayBuffer) { + return new File([audio], 'audio.mp3', { type: 'audio/mpeg' }) + } + + // If base64 string, decode and convert to File + if (typeof audio === 'string') { + // Check if it's a data URL + if (audio.startsWith('data:')) { + const parts = audio.split(',') + const header = parts[0] + const base64Data = parts[1] || '' + const mimeMatch = header?.match(/data:([^;]+)/) + const mimeType = mimeMatch?.[1] || 'audio/mpeg' + const binaryStr = atob(base64Data) + const bytes = new Uint8Array(binaryStr.length) + for (let i = 0; i < binaryStr.length; i++) { + bytes[i] = binaryStr.charCodeAt(i) + } + const extension = mimeType.split('/')[1] || 'mp3' + return new File([bytes], `audio.${extension}`, { type: mimeType }) + } + + // Assume raw base64 + const binaryStr = atob(audio) + const bytes = new Uint8Array(binaryStr.length) + for (let i = 0; i < binaryStr.length; i++) { + bytes[i] = binaryStr.charCodeAt(i) + } + return new File([bytes], 'audio.mp3', { type: 'audio/mpeg' }) + } + + throw new Error('Invalid audio input type') + } + + private mapResponseFormat( + format?: 'json' | 'text' | 'srt' | 'verbose_json' | 'vtt', + ): OpenAI_SDK.Audio.TranscriptionCreateParams['response_format'] { + if (!format) return 'json' + return format as OpenAI_SDK.Audio.TranscriptionCreateParams['response_format'] + } +} + +/** + * Creates an OpenAI Transcription adapter with explicit API key + * + * @param apiKey - Your OpenAI API key + * @param config - Optional additional configuration + * @returns Configured OpenAI Transcription adapter instance + * + * @example + * ```typescript + * const adapter = createOpenaiTranscription("sk-..."); + * + * const result = await ai({ + * adapter, + * model: 'whisper-1', + * audio: audioFile, + * language: 'en' + * }); + * ``` + */ +export function createOpenaiTranscription( + apiKey: string, + config?: Omit, +): OpenAITranscriptionAdapter { + return new OpenAITranscriptionAdapter({ apiKey, ...config }) +} + +/** + * Creates an OpenAI Transcription adapter with automatic API key detection from environment variables. 
+ * + * Looks for `OPENAI_API_KEY` in: + * - `process.env` (Node.js) + * - `window.env` (Browser with injected env) + * + * @param config - Optional configuration (excluding apiKey which is auto-detected) + * @returns Configured OpenAI Transcription adapter instance + * @throws Error if OPENAI_API_KEY is not found in environment + * + * @example + * ```typescript + * // Automatically uses OPENAI_API_KEY from environment + * const adapter = openaiTranscription(); + * + * const result = await ai({ + * adapter, + * model: 'whisper-1', + * audio: audioFile + * }); + * + * console.log(result.text) + * ``` + */ +export function openaiTranscription( + config?: Omit, +): OpenAITranscriptionAdapter { + const apiKey = getOpenAIApiKeyFromEnv() + return createOpenaiTranscription(apiKey, config) +} diff --git a/packages/typescript/ai-openai/src/adapters/tts.ts b/packages/typescript/ai-openai/src/adapters/tts.ts new file mode 100644 index 00000000..1e2a0df4 --- /dev/null +++ b/packages/typescript/ai-openai/src/adapters/tts.ts @@ -0,0 +1,169 @@ +import { BaseTTSAdapter } from '@tanstack/ai/adapters' +import { OPENAI_TTS_MODELS } from '../model-meta' +import { + createOpenAIClient, + generateId, + getOpenAIApiKeyFromEnv, +} from '../utils' +import { + validateAudioInput, + validateInstructions, + validateSpeed, +} from '../audio/audio-provider-options' +import type { + OpenAITTSFormat, + OpenAITTSProviderOptions, + OpenAITTSVoice, +} from '../audio/tts-provider-options' +import type { TTSOptions, TTSResult } from '@tanstack/ai' +import type OpenAI_SDK from 'openai' +import type { OpenAIClientConfig } from '../utils' + +/** + * Configuration for OpenAI TTS adapter + */ +export interface OpenAITTSConfig extends OpenAIClientConfig {} + +/** + * OpenAI Text-to-Speech Adapter + * + * Tree-shakeable adapter for OpenAI TTS functionality. + * Supports tts-1, tts-1-hd, and gpt-4o-audio-preview models. 
+ * + * Features: + * - Multiple voice options: alloy, ash, ballad, coral, echo, fable, onyx, nova, sage, shimmer, verse + * - Multiple output formats: mp3, opus, aac, flac, wav, pcm + * - Speed control (0.25 to 4.0) + */ +export class OpenAITTSAdapter extends BaseTTSAdapter< + typeof OPENAI_TTS_MODELS, + OpenAITTSProviderOptions +> { + readonly name = 'openai' as const + readonly models = OPENAI_TTS_MODELS + + private client: OpenAI_SDK + + constructor(config: OpenAITTSConfig) { + super(config) + this.client = createOpenAIClient(config) + } + + async generateSpeech( + options: TTSOptions, + ): Promise { + const { model, text, voice, format, speed, providerOptions } = options + + // Validate inputs using existing validators + const audioOptions = { + input: text, + model, + voice: voice as OpenAITTSVoice, + speed, + response_format: format as OpenAITTSFormat, + ...providerOptions, + } + + validateAudioInput(audioOptions) + validateSpeed(audioOptions) + validateInstructions(audioOptions) + + // Build request + const request: OpenAI_SDK.Audio.SpeechCreateParams = { + model, + input: text, + voice: voice || 'alloy', + response_format: format, + speed, + ...providerOptions, + } + + // Call OpenAI API + const response = await this.client.audio.speech.create(request) + + // Convert response to base64 + const arrayBuffer = await response.arrayBuffer() + const base64 = Buffer.from(arrayBuffer).toString('base64') + + const outputFormat = format || 'mp3' + const contentType = this.getContentType(outputFormat) + + return { + id: generateId(this.name), + model, + audio: base64, + format: outputFormat, + contentType, + } + } + + private getContentType(format: string): string { + const contentTypes: Record = { + mp3: 'audio/mpeg', + opus: 'audio/opus', + aac: 'audio/aac', + flac: 'audio/flac', + wav: 'audio/wav', + pcm: 'audio/pcm', + } + return contentTypes[format] || 'audio/mpeg' + } +} + +/** + * Creates an OpenAI TTS adapter with explicit API key + * + * @param apiKey - Your OpenAI API key + * @param config - Optional additional configuration + * @returns Configured OpenAI TTS adapter instance + * + * @example + * ```typescript + * const adapter = createOpenaiTTS("sk-..."); + * + * const result = await ai({ + * adapter, + * model: 'tts-1-hd', + * text: 'Hello, world!', + * voice: 'nova' + * }); + * ``` + */ +export function createOpenaiTTS( + apiKey: string, + config?: Omit, +): OpenAITTSAdapter { + return new OpenAITTSAdapter({ apiKey, ...config }) +} + +/** + * Creates an OpenAI TTS adapter with automatic API key detection from environment variables. 
+ * + * Looks for `OPENAI_API_KEY` in: + * - `process.env` (Node.js) + * - `window.env` (Browser with injected env) + * + * @param config - Optional configuration (excluding apiKey which is auto-detected) + * @returns Configured OpenAI TTS adapter instance + * @throws Error if OPENAI_API_KEY is not found in environment + * + * @example + * ```typescript + * // Automatically uses OPENAI_API_KEY from environment + * const adapter = openaiTTS(); + * + * const result = await ai({ + * adapter, + * model: 'tts-1', + * text: 'Welcome to TanStack AI!', + * voice: 'alloy', + * format: 'mp3' + * }); + * ``` + */ +export function openaiTTS( + config?: Omit, +): OpenAITTSAdapter { + const apiKey = getOpenAIApiKeyFromEnv() + return createOpenaiTTS(apiKey, config) +} diff --git a/packages/typescript/ai-openai/src/adapters/video.ts b/packages/typescript/ai-openai/src/adapters/video.ts new file mode 100644 index 00000000..661e96d0 --- /dev/null +++ b/packages/typescript/ai-openai/src/adapters/video.ts @@ -0,0 +1,400 @@ +import { BaseVideoAdapter } from '@tanstack/ai/adapters' +import { OPENAI_VIDEO_MODELS } from '../model-meta' +import { createOpenAIClient, getOpenAIApiKeyFromEnv } from '../utils' +import { + toApiSeconds, + validateVideoSeconds, + validateVideoSize, +} from '../video/video-provider-options' +import type { + OpenAIVideoModelProviderOptionsByName, + OpenAIVideoProviderOptions, +} from '../video/video-provider-options' +import type { + VideoGenerationOptions, + VideoJobResult, + VideoStatusResult, + VideoUrlResult, +} from '@tanstack/ai' +import type OpenAI_SDK from 'openai' +import type { OpenAIClientConfig } from '../utils' + +/** + * Configuration for OpenAI video adapter. + * + * @experimental Video generation is an experimental feature and may change. + */ +export interface OpenAIVideoConfig extends OpenAIClientConfig {} + +/** + * OpenAI Video Generation Adapter + * + * Tree-shakeable adapter for OpenAI video generation functionality using Sora-2. + * Uses a jobs/polling architecture for async video generation. + * + * @experimental Video generation is an experimental feature and may change. + * + * Features: + * - Async job-based video generation + * - Status polling for job progress + * - URL retrieval for completed videos + * - Model-specific type-safe provider options + */ +export class OpenAIVideoAdapter extends BaseVideoAdapter< + typeof OPENAI_VIDEO_MODELS, + OpenAIVideoProviderOptions +> { + readonly name = 'openai' as const + readonly models = OPENAI_VIDEO_MODELS + + // Type-only properties for type inference + declare _modelProviderOptionsByName?: OpenAIVideoModelProviderOptionsByName + + private client: OpenAI_SDK + + constructor(config: OpenAIVideoConfig) { + super(config) + this.client = createOpenAIClient(config) + } + + /** + * Create a new video generation job. + * + * API: POST /v1/videos + * Docs: https://platform.openai.com/docs/api-reference/videos/create + * + * @experimental Video generation is an experimental feature and may change. + * + * @example + * ```ts + * const { jobId } = await adapter.createVideoJob({ + * model: 'sora-2', + * prompt: 'A cat chasing a dog in a sunny park', + * size: '1280x720', + * duration: 8 // seconds: 4, 8, or 12 + * }) + * ``` + */ + async createVideoJob( + options: VideoGenerationOptions, + ): Promise { + const { model, size, duration, providerOptions } = options + + // Validate inputs + validateVideoSize(model, size) + // Duration maps to 'seconds' in the API + const seconds = duration ?? 
providerOptions?.seconds + validateVideoSeconds(model, seconds) + + // Build request + const request = this.buildRequest(options) + + try { + // POST /v1/videos + // Cast to any because the videos API may not be in SDK types yet + const client = this.client as any + const response = await client.videos.create(request) + + return { + jobId: response.id, + model, + } + } catch (error: any) { + // Fallback for when the videos API is not available + if (error.message?.includes('videos') || error.code === 'invalid_api') { + throw new Error( + `Video generation API is not available. The Sora API may require special access. ` + + `Original error: ${error.message}`, + ) + } + throw error + } + } + + /** + * Get the current status of a video generation job. + * + * API: GET /v1/videos/{video_id} + * Docs: https://platform.openai.com/docs/api-reference/videos/get + * + * @experimental Video generation is an experimental feature and may change. + * + * @example + * ```ts + * const status = await adapter.getVideoStatus(jobId) + * if (status.status === 'completed') { + * console.log('Video is ready!') + * } else if (status.status === 'processing') { + * console.log(`Progress: ${status.progress}%`) + * } + * ``` + */ + async getVideoStatus(jobId: string): Promise { + try { + // GET /v1/videos/{video_id} + const client = this.client as any + const response = await client.videos.retrieve(jobId) + + return { + jobId, + status: this.mapStatus(response.status), + progress: response.progress, + error: response.error?.message, + } + } catch (error: any) { + if (error.status === 404) { + return { + jobId, + status: 'failed', + error: 'Job not found', + } + } + throw error + } + } + + /** + * Get the URL to download/view the generated video. + * + * API: GET /v1/videos/{video_id}/content + * Docs: https://platform.openai.com/docs/api-reference/videos/content + * + * @experimental Video generation is an experimental feature and may change. + * + * @example + * ```ts + * const { url, expiresAt } = await adapter.getVideoUrl(jobId) + * console.log('Video URL:', url) + * console.log('Expires at:', expiresAt) + * ``` + */ + async getVideoUrl(jobId: string): Promise { + try { + // GET /v1/videos/{video_id}/content + // The SDK may not have a .content() method, so we try multiple approaches + const client = this.client as any + + let response: any + + // Try different possible method names + if (typeof client.videos?.content === 'function') { + response = await client.videos.content(jobId) + } else if (typeof client.videos?.getContent === 'function') { + response = await client.videos.getContent(jobId) + } else if (typeof client.videos?.download === 'function') { + response = await client.videos.download(jobId) + } else { + // Fallback: check if retrieve returns the URL directly + const videoInfo = await client.videos.retrieve(jobId) + if (videoInfo.url) { + return { + jobId, + url: videoInfo.url, + expiresAt: videoInfo.expires_at + ? new Date(videoInfo.expires_at) + : undefined, + } + } + + // Last resort: The /content endpoint returns raw binary video data, not JSON. + // We need to construct a URL that the client can use to fetch the video. + // The URL needs to include auth, so we'll create a signed URL or return + // a proxy endpoint. + + // For now, return a URL that goes through our API to proxy the request + // since the raw endpoint requires auth headers that browsers can't send. + // The video element can't add Authorization headers, so we need a workaround. 
+ + // Option 1: Return the direct URL (only works if OpenAI supports query param auth) + // Option 2: Return a blob URL after fetching (memory intensive) + // Option 3: Return a proxy URL through our server + + // Let's try fetching and returning a data URL for now + const baseUrl = this.config.baseUrl || 'https://api.openai.com/v1' + const apiKey = this.config.apiKey + + const contentResponse = await fetch( + `${baseUrl}/videos/${jobId}/content`, + { + method: 'GET', + headers: { + Authorization: `Bearer ${apiKey}`, + }, + }, + ) + + if (!contentResponse.ok) { + // Try to parse error as JSON, but it might be binary + const contentType = contentResponse.headers.get('content-type') + if (contentType?.includes('application/json')) { + const errorData = await contentResponse.json().catch(() => ({})) + throw new Error( + errorData.error?.message || + `Failed to get video content: ${contentResponse.status}`, + ) + } + throw new Error( + `Failed to get video content: ${contentResponse.status}`, + ) + } + + // The response is the raw video file - convert to base64 data URL + const videoBlob = await contentResponse.blob() + const buffer = await videoBlob.arrayBuffer() + const base64 = Buffer.from(buffer).toString('base64') + const mimeType = + contentResponse.headers.get('content-type') || 'video/mp4' + + return { + jobId, + url: `data:${mimeType};base64,${base64}`, + expiresAt: undefined, // Data URLs don't expire + } + } + + return { + jobId, + url: response.url, + expiresAt: response.expires_at + ? new Date(response.expires_at) + : undefined, + } + } catch (error: any) { + if (error.status === 404) { + throw new Error(`Video job not found: ${jobId}`) + } + if (error.status === 400) { + throw new Error( + `Video is not ready for download. Check status first. Job ID: ${jobId}`, + ) + } + throw error + } + } + + private buildRequest( + options: VideoGenerationOptions, + ): Record { + const { model, prompt, size, duration, providerOptions } = options + + const request: Record = { + model, + prompt, + } + + // Add size/resolution + // Supported: '1280x720', '720x1280', '1792x1024', '1024x1792' + if (size) { + request.size = size + } else if (providerOptions?.size) { + request.size = providerOptions.size + } + + // Add seconds (duration) + // Supported: '4', '8', or '12' - yes, the API wants strings + const seconds = duration ?? providerOptions?.seconds + if (seconds !== undefined) { + request.seconds = toApiSeconds(seconds) + } + + return request + } + + private mapStatus( + apiStatus: string, + ): 'pending' | 'processing' | 'completed' | 'failed' { + switch (apiStatus) { + case 'queued': + case 'pending': + return 'pending' + case 'processing': + case 'in_progress': + return 'processing' + case 'completed': + case 'succeeded': + return 'completed' + case 'failed': + case 'error': + case 'cancelled': + return 'failed' + default: + return 'processing' + } + } +} + +/** + * Creates an OpenAI video adapter with an explicit API key. + * + * @experimental Video generation is an experimental feature and may change. 
+ * + * @param apiKey - Your OpenAI API key + * @param config - Optional additional configuration + * @returns Configured OpenAI video adapter instance + * + * @example + * ```typescript + * const adapter = createOpenaiVideo('your-api-key'); + * + * const { jobId } = await ai({ + * adapter, + * model: 'sora-2', + * prompt: 'A beautiful sunset over the ocean' + * }); + * ``` + */ +export function createOpenaiVideo( + apiKey: string, + config?: Omit, +): OpenAIVideoAdapter { + return new OpenAIVideoAdapter({ apiKey, ...config }) +} + +/** + * Creates an OpenAI video adapter with automatic API key detection from environment variables. + * + * Looks for `OPENAI_API_KEY` in: + * - `process.env` (Node.js) + * - `window.env` (Browser with injected env) + * + * @experimental Video generation is an experimental feature and may change. + * + * @param config - Optional configuration (excluding apiKey which is auto-detected) + * @returns Configured OpenAI video adapter instance + * @throws Error if OPENAI_API_KEY is not found in environment + * + * @example + * ```typescript + * // Automatically uses OPENAI_API_KEY from environment + * const adapter = openaiVideo(); + * + * // Create a video generation job + * const { jobId } = await ai({ + * adapter, + * model: 'sora-2', + * prompt: 'A cat playing piano' + * }); + * + * // Poll for status + * const status = await ai({ + * adapter, + * model: 'sora-2', + * jobId, + * request: 'status' + * }); + * + * // Get video URL when complete + * const { url } = await ai({ + * adapter, + * model: 'sora-2', + * jobId, + * request: 'url' + * }); + * ``` + */ +export function openaiVideo( + config?: Omit, +): OpenAIVideoAdapter { + const apiKey = getOpenAIApiKeyFromEnv() + return createOpenaiVideo(apiKey, config) +} diff --git a/packages/typescript/ai-openai/src/audio/transcription-provider-options.ts b/packages/typescript/ai-openai/src/audio/transcription-provider-options.ts new file mode 100644 index 00000000..4dfe5ffb --- /dev/null +++ b/packages/typescript/ai-openai/src/audio/transcription-provider-options.ts @@ -0,0 +1,18 @@ +/** + * Provider-specific options for OpenAI Transcription + */ +export interface OpenAITranscriptionProviderOptions { + /** + * The sampling temperature, between 0 and 1. + * Higher values like 0.8 will make the output more random, + * while lower values like 0.2 will make it more focused and deterministic. + */ + temperature?: number + + /** + * The timestamp granularities to populate for this transcription. + * response_format must be set to verbose_json to use timestamp granularities. + * Either or both of these options are supported: word, or segment. + */ + timestamp_granularities?: Array<'word' | 'segment'> +} diff --git a/packages/typescript/ai-openai/src/audio/tts-provider-options.ts b/packages/typescript/ai-openai/src/audio/tts-provider-options.ts new file mode 100644 index 00000000..4368caed --- /dev/null +++ b/packages/typescript/ai-openai/src/audio/tts-provider-options.ts @@ -0,0 +1,31 @@ +/** + * OpenAI TTS voice options + */ +export type OpenAITTSVoice = + | 'alloy' + | 'ash' + | 'ballad' + | 'coral' + | 'echo' + | 'fable' + | 'onyx' + | 'nova' + | 'sage' + | 'shimmer' + | 'verse' + +/** + * OpenAI TTS output format options + */ +export type OpenAITTSFormat = 'mp3' | 'opus' | 'aac' | 'flac' | 'wav' | 'pcm' + +/** + * Provider-specific options for OpenAI TTS + */ +export interface OpenAITTSProviderOptions { + /** + * Control the voice of your generated audio with additional instructions. 
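+   * For example: `instructions: 'Speak in a calm, reassuring tone.'` (an illustrative value).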
+ * Does not work with tts-1 or tts-1-hd. + */ + instructions?: string +} diff --git a/packages/typescript/ai-openai/src/index.ts b/packages/typescript/ai-openai/src/index.ts index 8038da8b..2e0cf5d3 100644 --- a/packages/typescript/ai-openai/src/index.ts +++ b/packages/typescript/ai-openai/src/index.ts @@ -41,6 +41,45 @@ export type { OpenAIImageModelProviderOptionsByName, } from './image/image-provider-options' +// Video adapter - for video generation (experimental) +/** + * @experimental Video generation is an experimental feature and may change. + */ +export { + OpenAIVideoAdapter, + createOpenaiVideo, + openaiVideo, + type OpenAIVideoConfig, +} from './adapters/video' +export type { + OpenAIVideoProviderOptions, + OpenAIVideoModelProviderOptionsByName, + OpenAIVideoSize, + OpenAIVideoDuration, +} from './video/video-provider-options' + +// TTS adapter - for text-to-speech +export { + OpenAITTSAdapter, + createOpenaiTTS, + openaiTTS, + type OpenAITTSConfig, +} from './adapters/tts' +export type { + OpenAITTSProviderOptions, + OpenAITTSVoice, + OpenAITTSFormat, +} from './audio/tts-provider-options' + +// Transcription adapter - for speech-to-text +export { + OpenAITranscriptionAdapter, + createOpenaiTranscription, + openaiTranscription, + type OpenAITranscriptionConfig, +} from './adapters/transcription' +export type { OpenAITranscriptionProviderOptions } from './audio/transcription-provider-options' + // ============================================================================ // Legacy Exports (Deprecated - will be removed in future versions) // ============================================================================ @@ -64,7 +103,12 @@ export type { OpenAIChatModelProviderOptionsByName, OpenAIModelInputModalitiesByName, } from './model-meta' -export { OPENAI_IMAGE_MODELS } from './model-meta' +export { + OPENAI_IMAGE_MODELS, + OPENAI_TTS_MODELS, + OPENAI_TRANSCRIPTION_MODELS, + OPENAI_VIDEO_MODELS, +} from './model-meta' export type { OpenAITextMetadata, OpenAIImageMetadata, diff --git a/packages/typescript/ai-openai/src/model-meta.ts b/packages/typescript/ai-openai/src/model-meta.ts index da204c70..69ae9cba 100644 --- a/packages/typescript/ai-openai/src/model-meta.ts +++ b/packages/typescript/ai-openai/src/model-meta.ts @@ -300,7 +300,11 @@ const GPT5_CODEX = { OpenAIMetadataOptions > -/* const SORA2 = { +/** + * Sora-2 video generation model. + * @experimental Video generation is an experimental feature and may change. + */ +const SORA2 = { name: 'sora-2', pricing: { input: { @@ -321,6 +325,10 @@ const GPT5_CODEX = { OpenAIBaseOptions & OpenAIStreamingOptions & OpenAIMetadataOptions > +/** + * Sora-2-Pro video generation model (higher quality). + * @experimental Video generation is an experimental feature and may change. + */ const SORA2_PRO = { name: 'sora-2-pro', pricing: { @@ -340,7 +348,7 @@ const SORA2_PRO = { }, } as const satisfies ModelMeta< OpenAIBaseOptions & OpenAIStreamingOptions & OpenAIMetadataOptions -> */ +> const GPT_IMAGE_1 = { name: 'gpt-image-1', @@ -1691,9 +1699,30 @@ export const OPENAI_EMBEDDING_MODELS = [ GPT_4O_MINI_TRANSCRIBE.name, ] as const -// Video generation models (based on endpoints: "video") -const OPENAI_VIDEO_MODELS = [SORA2.name, SORA2_PRO.name] as const +/** + * Video generation models (based on endpoints: "video") + * @experimental Video generation is an experimental feature and may change. 
*/ +export const OPENAI_VIDEO_MODELS = [SORA2.name, SORA2_PRO.name] as const + +/** + * Text-to-speech models (based on endpoints: "speech_generation") + */ +export const OPENAI_TTS_MODELS = [ + 'tts-1', + 'tts-1-hd', + 'gpt-4o-audio-preview', +] as const + +/** + * Transcription models (based on endpoints: "transcription") + */ +export const OPENAI_TRANSCRIPTION_MODELS = [ + 'whisper-1', + 'gpt-4o-transcribe', + 'gpt-4o-mini-transcribe', + 'gpt-4o-transcribe-diarize', +] as const // const OPENAI_MODERATION_MODELS = [OMNI_MODERATION.name] as const // export type OpenAIChatModel = (typeof OPENAI_CHAT_MODELS)[number] diff --git a/packages/typescript/ai-openai/src/video/video-provider-options.ts b/packages/typescript/ai-openai/src/video/video-provider-options.ts new file mode 100644 index 00000000..b0355128 --- /dev/null +++ b/packages/typescript/ai-openai/src/video/video-provider-options.ts @@ -0,0 +1,123 @@ +/** + * OpenAI Video Generation Provider Options + * + * Based on https://platform.openai.com/docs/api-reference/videos/create + * + * @experimental Video generation is an experimental feature and may change. + */ + +/** + * Supported video sizes for OpenAI Sora video generation. + * Based on the official API documentation. + * + * @experimental Video generation is an experimental feature and may change. + */ +export type OpenAIVideoSize = + | '1280x720' // 720p landscape (16:9) + | '720x1280' // 720p portrait (9:16) + | '1792x1024' // Landscape wide + | '1024x1792' // Portrait tall + +/** + * Supported video durations (in seconds) for OpenAI Sora video generation. + * The API uses the `seconds` parameter with STRING values '4', '8', or '12'. + * Yes, really. They're strings. + * + * @experimental Video generation is an experimental feature and may change. + */ +export type OpenAIVideoSeconds = '4' | '8' | '12' + +/** + * Provider-specific options for OpenAI video generation. + * + * @experimental Video generation is an experimental feature and may change. + */ +export interface OpenAIVideoProviderOptions { + /** + * Video size in WIDTHxHEIGHT format. + * Supported: '1280x720', '720x1280', '1792x1024', '1024x1792' + */ + size?: OpenAIVideoSize + + /** + * Video duration in seconds. + * Supported values: 4, 8, or 12 seconds. + */ + seconds?: OpenAIVideoSeconds +} + +/** + * Model-specific provider options mapping. + * + * @experimental Video generation is an experimental feature and may change. + */ +export type OpenAIVideoModelProviderOptionsByName = { + 'sora-2': OpenAIVideoProviderOptions + 'sora-2-pro': OpenAIVideoProviderOptions +} + +/** + * Validate video size for a given model. + * + * @experimental Video generation is an experimental feature and may change. + */ +export function validateVideoSize( + model: string, + size?: string, +): asserts size is OpenAIVideoSize | undefined { + const validSizes: Array = [ + '1280x720', + '720x1280', + '1792x1024', + '1024x1792', + ] + + if (size && !validSizes.includes(size as OpenAIVideoSize)) { + throw new Error( + `Size "${size}" is not supported by model "${model}". Supported sizes: ${validSizes.join(', ')}`, + ) + } +} + +/** + * Validate video duration (seconds) for a given model. + * Accepts both string and number for convenience, but the API requires strings. + * + * @experimental Video generation is an experimental feature and may change. 
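+ *
+ * @example A quick illustration of the accepted values (mirrors the checks below)
+ * ```ts
+ * validateVideoSeconds('sora-2', 8)    // ok - the numbers 4, 8, and 12 are accepted
+ * validateVideoSeconds('sora-2', '8')  // ok - the string form is accepted too
+ * validateVideoSeconds('sora-2', 10)   // throws - unsupported duration
+ * ```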
+ */ +export function validateVideoSeconds( + model: string, + seconds?: number | string, +): asserts seconds is OpenAIVideoSeconds | number | undefined { + const validSeconds: Array = ['4', '8', '12'] + const validNumbers: Array = [4, 8, 12] + + if (seconds !== undefined) { + const isValid = + typeof seconds === 'string' + ? validSeconds.includes(seconds) + : validNumbers.includes(seconds) + + if (!isValid) { + throw new Error( + `Duration "${seconds}" is not supported by model "${model}". Supported durations: 4, 8, or 12 seconds`, + ) + } + } +} + +/** + * Convert duration to API format (string). + * The OpenAI Sora API inexplicably requires seconds as a string. + */ +export function toApiSeconds( + seconds: number | string | undefined, +): OpenAIVideoSeconds | undefined { + if (seconds === undefined) return undefined + return String(seconds) as OpenAIVideoSeconds +} + +/** + * @deprecated Use OpenAIVideoSeconds instead + */ +export type OpenAIVideoDuration = OpenAIVideoSeconds diff --git a/packages/typescript/ai-react/src/use-chat.ts b/packages/typescript/ai-react/src/use-chat.ts index 1064bb52..2f38c14c 100644 --- a/packages/typescript/ai-react/src/use-chat.ts +++ b/packages/typescript/ai-react/src/use-chat.ts @@ -80,14 +80,15 @@ export function useChat = any>( }, []) // Only run on mount - initialMessages are handled by ChatClient constructor // Cleanup on unmount: stop any in-flight requests + // Note: We only cleanup when client changes or component unmounts. + // DO NOT include isLoading in dependencies - that would cause the cleanup + // to run when isLoading changes, aborting continuation requests. useEffect(() => { return () => { - // Stop any active generation when component unmounts - if (isLoading) { - client.stop() - } + // Stop any active generation when component unmounts or client changes + client.stop() } - }, [client, isLoading]) + }, [client]) // Note: Callback options (onResponse, onChunk, onFinish, onError, onToolCall) // are captured at client creation time. Changes to these callbacks require diff --git a/packages/typescript/ai-solid/src/use-chat.ts b/packages/typescript/ai-solid/src/use-chat.ts index 8dd8ae1a..2a15fb37 100644 --- a/packages/typescript/ai-solid/src/use-chat.ts +++ b/packages/typescript/ai-solid/src/use-chat.ts @@ -63,14 +63,14 @@ export function useChat = any>( }) // Only run on mount - initialMessages are handled by ChatClient constructor // Cleanup on unmount: stop any in-flight requests + // Note: We use createEffect with a cleanup return to handle component unmount. + // The cleanup only runs on disposal (unmount), not on signal changes. createEffect(() => { return () => { // Stop any active generation when component unmounts - if (isLoading()) { - client().stop() - } + client().stop() } - }, [client, isLoading]) + }) // Note: Callback options (onResponse, onChunk, onFinish, onError, onToolCall) // are captured at client creation time. Changes to these callbacks require diff --git a/packages/typescript/ai-svelte/src/create-chat.svelte.ts b/packages/typescript/ai-svelte/src/create-chat.svelte.ts index 29d58301..c4081d27 100644 --- a/packages/typescript/ai-svelte/src/create-chat.svelte.ts +++ b/packages/typescript/ai-svelte/src/create-chat.svelte.ts @@ -68,6 +68,11 @@ export function createChat = any>( }, }) + // Note: Cleanup is handled by calling stop() directly when needed. + // Unlike React/Vue/Solid, Svelte 5 runes like $effect can only be used + // during component initialization, so we don't add automatic cleanup here. 
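+  // A minimal sketch of doing that manually (assumes Svelte's onDestroy hook):
+  //   import { onDestroy } from 'svelte'
+  //   onDestroy(() => chat.stop())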
+ // Users should call chat.stop() in their component's cleanup if needed. + // Define methods const sendMessage = async (content: string) => { await client.sendMessage(content) diff --git a/packages/typescript/ai-vue/src/use-chat.ts b/packages/typescript/ai-vue/src/use-chat.ts index 946ad4a9..f190d0ee 100644 --- a/packages/typescript/ai-vue/src/use-chat.ts +++ b/packages/typescript/ai-vue/src/use-chat.ts @@ -39,10 +39,9 @@ export function useChat = any>( }) // Cleanup on unmount: stop any in-flight requests + // Note: client.stop() is safe to call even if nothing is in progress onScopeDispose(() => { - if (isLoading.value) { - client.stop() - } + client.stop() }) // Note: Callback options (onResponse, onChunk, onFinish, onError, onToolCall) diff --git a/packages/typescript/ai/src/activities/index.ts b/packages/typescript/ai/src/activities/index.ts index aacbc566..66152106 100644 --- a/packages/typescript/ai/src/activities/index.ts +++ b/packages/typescript/ai/src/activities/index.ts @@ -23,6 +23,12 @@ import { kind as summarizeKindValue, } from './summarize/index' import { imageActivity, kind as imageKindValue } from './image/index' +import { videoActivity, kind as videoKindValue } from './video/index' +import { ttsActivity, kind as ttsKindValue } from './tts/index' +import { + transcriptionActivity, + kind as transcriptionKindValue, +} from './transcription/index' // Import model types for use in local type definitions import type { @@ -51,23 +57,53 @@ import type { ImageProviderOptionsForModel, ImageSizeForModel, } from './image/index' +import type { + VideoActivityOptions, + VideoActivityResult, + VideoCreateOptions, + VideoModels, + VideoProviderOptions, + VideoStatusOptions, + VideoUrlOptions, +} from './video/index' +import type { + TTSActivityOptions, + TTSActivityResult, + TTSModels, + TTSProviderOptions, +} from './tts/index' +import type { + TranscriptionActivityOptions, + TranscriptionActivityResult, + TranscriptionModels, + TranscriptionProviderOptions, +} from './transcription/index' // Import adapter types for type definitions import type { TextAdapter } from './text/adapter' import type { EmbeddingAdapter } from './embedding/adapter' import type { SummarizeAdapter } from './summarize/adapter' import type { ImageAdapter } from './image/adapter' +import type { VideoAdapter } from './video/adapter' +import type { TTSAdapter } from './tts/adapter' +import type { TranscriptionAdapter } from './transcription/adapter' // eslint-disable-next-line import/no-duplicates import type { TextActivityOptions, TextActivityResult } from './text/index' import type { z } from 'zod' + import type { ConstrainedModelMessage, EmbeddingResult, ImageGenerationResult, StreamChunk, SummarizationResult, + TTSResult, TextOptions, + TranscriptionResult, + VideoJobResult, + VideoStatusResult, + VideoUrlResult, } from '../types' // =========================== @@ -153,6 +189,66 @@ export { type ImageAdapterConfig, } from './image/adapter' +// =========================== +// Video Activity (Experimental) +// =========================== + +export { + kind as videoKind, + videoActivity, + type VideoActivityOptions, + type VideoActivityResult, + type VideoModels, + type VideoProviderOptions, + type VideoCreateOptions, + type VideoStatusOptions, + type VideoUrlOptions, +} from './video/index' + +export { + BaseVideoAdapter, + type VideoAdapter, + type VideoAdapterConfig, +} from './video/adapter' + +// =========================== +// TTS Activity +// =========================== + +export { + kind as ttsKind, + 
ttsActivity, + type TTSActivityOptions, + type TTSActivityResult, + type TTSModels, + type TTSProviderOptions, +} from './tts/index' + +export { + BaseTTSAdapter, + type TTSAdapter, + type TTSAdapterConfig, +} from './tts/adapter' + +// =========================== +// Transcription Activity +// =========================== + +export { + kind as transcriptionKind, + transcriptionActivity, + type TranscriptionActivityOptions, + type TranscriptionActivityResult, + type TranscriptionModels, + type TranscriptionProviderOptions, +} from './transcription/index' + +export { + BaseTranscriptionAdapter, + type TranscriptionAdapter, + type TranscriptionAdapterConfig, +} from './transcription/adapter' + // =========================== // Activity Handler Type // =========================== @@ -173,6 +269,9 @@ export const activityMap = new Map([ [embeddingKindValue, embeddingActivity], [summarizeKindValue, summarizeActivity], [imageKindValue, imageActivity], + [videoKindValue, videoActivity], + [ttsKindValue, ttsActivity], + [transcriptionKindValue, transcriptionActivity], ]) // =========================== @@ -185,6 +284,9 @@ export type AIAdapter = | EmbeddingAdapter, object> | SummarizeAdapter, object> | ImageAdapter, object, any, any> + | VideoAdapter, object> + | TTSAdapter, object> + | TranscriptionAdapter, object> /** Alias for backwards compatibility */ export type GenerateAdapter = AIAdapter @@ -195,9 +297,19 @@ export type AnyAdapter = | EmbeddingAdapter | SummarizeAdapter | ImageAdapter + | VideoAdapter + | TTSAdapter + | TranscriptionAdapter /** Union type of all adapter kinds */ -export type AdapterKind = 'text' | 'embedding' | 'summarize' | 'image' +export type AdapterKind = + | 'text' + | 'embedding' + | 'summarize' + | 'image' + | 'video' + | 'tts' + | 'transcription' // =========================== // Unified Options Type @@ -211,6 +323,11 @@ export type AnyAIAdapter = | (EmbeddingAdapter, object> & { kind: 'embedding' }) | (SummarizeAdapter, object> & { kind: 'summarize' }) | (ImageAdapter, object, any, any> & { kind: 'image' }) + | (VideoAdapter, object> & { kind: 'video' }) + | (TTSAdapter, object> & { kind: 'tts' }) + | (TranscriptionAdapter, object> & { + kind: 'transcription' + }) /** Infer the correct options type based on adapter kind */ export type AIOptionsFor< @@ -218,6 +335,7 @@ export type AIOptionsFor< TModel extends string, TSchema extends z.ZodType | undefined = undefined, TStream extends boolean | undefined = undefined, + TRequest extends 'create' | 'status' | 'url' = 'create', > = TAdapter extends { kind: 'text' } ? TAdapter extends TextAdapter, object, any, any, any> ? TextActivityOptions< @@ -243,7 +361,29 @@ export type AIOptionsFor< ? TAdapter extends ImageAdapter, object, any, any> ? ImageActivityOptions> : never - : never + : TAdapter extends { kind: 'video' } + ? TAdapter extends VideoAdapter, object> + ? VideoActivityOptions< + TAdapter, + TModel & VideoModels, + TRequest + > + : never + : TAdapter extends { kind: 'tts' } + ? TAdapter extends TTSAdapter, object> + ? TTSActivityOptions> + : never + : TAdapter extends { kind: 'transcription' } + ? TAdapter extends TranscriptionAdapter< + ReadonlyArray, + object + > + ? 
TranscriptionActivityOptions< + TAdapter, + TModel & TranscriptionModels + > + : never + : never // =========================== // Unified Result Type @@ -254,6 +394,7 @@ export type AIResultFor< TAdapter extends AnyAIAdapter, TSchema extends z.ZodType | undefined = undefined, TStream extends boolean | undefined = undefined, + TRequest extends 'create' | 'status' | 'url' = 'create', > = TAdapter extends { kind: 'text' } ? TextActivityResult : TAdapter extends { kind: 'embedding' } @@ -262,7 +403,13 @@ export type AIResultFor< ? SummarizeActivityResult : TAdapter extends { kind: 'image' } ? ImageActivityResult - : never + : TAdapter extends { kind: 'video' } + ? VideoActivityResult + : TAdapter extends { kind: 'tts' } + ? TTSActivityResult + : TAdapter extends { kind: 'transcription' } + ? TranscriptionActivityResult + : never // =========================== // Unified Options Type (Legacy) @@ -352,6 +499,14 @@ export type AIOptionsUnion = ImageAdapter, object, any, any>, string > + | VideoCreateOptions, object>, string> + | VideoStatusOptions, object>, string> + | VideoUrlOptions, object>, string> + | TTSActivityOptions, object>, string> + | TranscriptionActivityOptions< + TranscriptionAdapter, object>, + string + > /** * Union type for all possible ai() return types (used in implementation signature) @@ -362,6 +517,11 @@ export type AIResultUnion = | Promise | Promise | Promise + | Promise + | Promise + | Promise + | Promise + | Promise | Promise // =========================== @@ -437,6 +597,128 @@ export type AIImageOptions< providerOptions?: ImageProviderOptionsForModel } +/** + * Explicit video options for creating a job - provides clear autocomplete and required field enforcement. + * + * @experimental Video generation is an experimental feature and may change. + */ +export type AIVideoCreateOptions< + TAdapter extends VideoAdapter, object>, + TModel extends VideoModels, +> = { + /** The video adapter to use */ + adapter: TAdapter & { kind: 'video' } + /** The model name (autocompletes based on adapter) */ + model: TModel + /** Request type - create a new job */ + request?: 'create' + /** Text description of the desired video - REQUIRED */ + prompt: string + /** Video size in WIDTHxHEIGHT format (e.g., "1280x720") */ + size?: string + /** Video duration in seconds */ + duration?: number + /** Provider-specific options */ + providerOptions?: VideoProviderOptions +} + +/** + * Explicit video options for checking status. + * + * @experimental Video generation is an experimental feature and may change. + */ +export type AIVideoStatusOptions< + TAdapter extends VideoAdapter, object>, + TModel extends VideoModels, +> = { + /** The video adapter to use */ + adapter: TAdapter & { kind: 'video' } + /** The model name (autocompletes based on adapter) */ + model: TModel + /** Request type - get status */ + request: 'status' + /** Job ID to check status for - REQUIRED */ + jobId: string +} + +/** + * Explicit video options for getting the video URL. + * + * @experimental Video generation is an experimental feature and may change. + */ +export type AIVideoUrlOptions< + TAdapter extends VideoAdapter, object>, + TModel extends VideoModels, +> = { + /** The video adapter to use */ + adapter: TAdapter & { kind: 'video' } + /** The model name (autocompletes based on adapter) */ + model: TModel + /** Request type - get URL */ + request: 'url' + /** Job ID to get URL for - REQUIRED */ + jobId: string +} + +/** + * Union of all video options types. 
+ * + * @experimental Video generation is an experimental feature and may change. + */ +export type AIVideoOptions< + TAdapter extends VideoAdapter, object>, + TModel extends VideoModels, +> = + | AIVideoCreateOptions + | AIVideoStatusOptions + | AIVideoUrlOptions + +/** + * Explicit TTS options - provides clear autocomplete and required field enforcement. + */ +export type AITTSOptions< + TAdapter extends TTSAdapter, object>, + TModel extends TTSModels, +> = { + /** The TTS adapter to use */ + adapter: TAdapter & { kind: 'tts' } + /** The model name (autocompletes based on adapter) */ + model: TModel + /** The text to convert to speech - REQUIRED */ + text: string + /** The voice to use for generation */ + voice?: string + /** The output audio format */ + format?: 'mp3' | 'opus' | 'aac' | 'flac' | 'wav' | 'pcm' + /** The speed of the generated audio (0.25 to 4.0) */ + speed?: number + /** Provider-specific options */ + providerOptions?: TTSProviderOptions +} + +/** + * Explicit transcription options - provides clear autocomplete and required field enforcement. + */ +export type AITranscriptionOptions< + TAdapter extends TranscriptionAdapter, object>, + TModel extends TranscriptionModels, +> = { + /** The transcription adapter to use */ + adapter: TAdapter & { kind: 'transcription' } + /** The model name (autocompletes based on adapter) */ + model: TModel + /** The audio data to transcribe - REQUIRED */ + audio: string | File | Blob | ArrayBuffer + /** The language of the audio in ISO-639-1 format (e.g., 'en') */ + language?: string + /** An optional prompt to guide the transcription */ + prompt?: string + /** The format of the transcription output */ + responseFormat?: 'json' | 'text' | 'srt' | 'verbose_json' | 'vtt' + /** Provider-specific options */ + providerOptions?: TranscriptionProviderOptions +} + /** * Explicit text options - provides clear autocomplete and required field enforcement. * Uses NoInfer on providerOptions to prevent inference widening. @@ -513,3 +795,23 @@ export type ImageGenerateOptions< TAdapter extends ImageAdapter, object, any, any>, TModel extends ImageModels, > = ImageActivityOptions + +/** + * @deprecated Use VideoActivityOptions + * @experimental Video generation is an experimental feature and may change. + */ +export type VideoGenerateOptions< + TAdapter extends VideoAdapter, object>, + TModel extends VideoModels, + TRequest extends 'create' | 'status' | 'url' = 'create', +> = VideoActivityOptions + +/** + * @deprecated Use VideoActivityOptions + * @experimental Video generation is an experimental feature and may change. + */ +export type GenerateVideoOptions< + TAdapter extends VideoAdapter, object>, + TModel extends VideoModels, + TRequest extends 'create' | 'status' | 'url' = 'create', +> = VideoActivityOptions diff --git a/packages/typescript/ai/src/activities/transcription/adapter.ts b/packages/typescript/ai/src/activities/transcription/adapter.ts new file mode 100644 index 00000000..f81c2560 --- /dev/null +++ b/packages/typescript/ai/src/activities/transcription/adapter.ts @@ -0,0 +1,74 @@ +import type { TranscriptionOptions, TranscriptionResult } from '../../types' + +/** + * Configuration for transcription adapter instances + */ +export interface TranscriptionAdapterConfig { + apiKey?: string + baseUrl?: string + timeout?: number + maxRetries?: number + headers?: Record +} + +/** + * Base interface for audio transcription adapters. + * Provides type-safe transcription functionality with support for + * model-specific provider options. 
+ * + * Generic parameters: + * - TModels: Array of supported transcription model names + * - TProviderOptions: Base provider-specific options for transcription + */ +export interface TranscriptionAdapter< + TModels extends ReadonlyArray = ReadonlyArray, + TProviderOptions extends object = Record, +> { + /** Discriminator for adapter kind - used by ai() to determine API shape */ + readonly kind: 'transcription' + /** Adapter name identifier */ + readonly name: string + /** Supported transcription models */ + readonly models: TModels + + // Type-only properties for type inference + /** @internal Type-only property for provider options inference */ + _providerOptions?: TProviderOptions + + /** + * Transcribe audio to text + */ + transcribe: ( + options: TranscriptionOptions, + ) => Promise +} + +/** + * Abstract base class for audio transcription adapters. + * Extend this class to implement a transcription adapter for a specific provider. + */ +export abstract class BaseTranscriptionAdapter< + TModels extends ReadonlyArray = ReadonlyArray, + TProviderOptions extends object = Record, +> implements TranscriptionAdapter { + readonly kind = 'transcription' as const + abstract readonly name: string + abstract readonly models: TModels + + // Type-only properties - never assigned at runtime + declare _providerOptions?: TProviderOptions + + protected config: TranscriptionAdapterConfig + + constructor(config: TranscriptionAdapterConfig = {}) { + this.config = config + } + + abstract transcribe( + options: TranscriptionOptions, + ): Promise + + protected generateId(): string { + return `${this.name}-${Date.now()}-${Math.random().toString(36).substring(7)}` + } +} diff --git a/packages/typescript/ai/src/activities/transcription/index.ts b/packages/typescript/ai/src/activities/transcription/index.ts new file mode 100644 index 00000000..2a8a1c4d --- /dev/null +++ b/packages/typescript/ai/src/activities/transcription/index.ts @@ -0,0 +1,125 @@ +/** + * Transcription Activity + * + * Transcribes audio to text using speech-to-text models. + * This is a self-contained module with implementation, types, and JSDoc. + */ + +import type { TranscriptionAdapter } from './adapter' +import type { TranscriptionResult } from '../../types' + +// =========================== +// Activity Kind +// =========================== + +/** The adapter kind this activity handles */ +export const kind = 'transcription' as const + +// =========================== +// Type Extraction Helpers +// =========================== + +/** Extract model types from a TranscriptionAdapter */ +export type TranscriptionModels = + TAdapter extends TranscriptionAdapter ? M[number] : string + +/** + * Extract provider options from a TranscriptionAdapter. + */ +export type TranscriptionProviderOptions = + TAdapter extends TranscriptionAdapter + ? TProviderOptions + : object + +// =========================== +// Activity Options Type +// =========================== + +/** + * Options for the transcription activity. 
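+ *
+ * The `audio` field accepts a base64 string, File, Blob, or ArrayBuffer. A minimal
+ * Node sketch for loading it from disk as base64 ('speech.mp3' is a placeholder path):
+ *
+ * ```ts
+ * import { readFile } from 'node:fs/promises'
+ *
+ * const audio = (await readFile('speech.mp3')).toString('base64')
+ * ```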
+ * + * @template TAdapter - The transcription adapter type + * @template TModel - The model name type (inferred from adapter) + */ +export interface TranscriptionActivityOptions< + TAdapter extends TranscriptionAdapter, object>, + TModel extends TranscriptionModels, +> { + /** The transcription adapter to use */ + adapter: TAdapter & { kind: typeof kind } + /** The model name (autocompletes based on adapter) */ + model: TModel + /** The audio data to transcribe - can be base64 string, File, Blob, or Buffer */ + audio: string | File | Blob | ArrayBuffer + /** The language of the audio in ISO-639-1 format (e.g., 'en') */ + language?: string + /** An optional prompt to guide the transcription */ + prompt?: string + /** The format of the transcription output */ + responseFormat?: 'json' | 'text' | 'srt' | 'verbose_json' | 'vtt' + /** Provider-specific options for transcription */ + providerOptions?: TranscriptionProviderOptions +} + +// =========================== +// Activity Result Type +// =========================== + +/** Result type for the transcription activity */ +export type TranscriptionActivityResult = Promise + +// =========================== +// Activity Implementation +// =========================== + +/** + * Transcription activity - converts audio to text. + * + * Uses AI speech-to-text models to transcribe audio content. + * + * @example Transcribe an audio file + * ```ts + * import { ai } from '@tanstack/ai' + * import { openaiTranscription } from '@tanstack/ai-openai' + * + * const result = await ai({ + * adapter: openaiTranscription(), + * model: 'whisper-1', + * audio: audioFile, // File, Blob, or base64 string + * language: 'en' + * }) + * + * console.log(result.text) + * ``` + * + * @example With verbose output for timestamps + * ```ts + * const result = await ai({ + * adapter: openaiTranscription(), + * model: 'whisper-1', + * audio: audioFile, + * responseFormat: 'verbose_json' + * }) + * + * result.segments?.forEach(segment => { + * console.log(`[${segment.start}s - ${segment.end}s]: ${segment.text}`) + * }) + * ``` + */ +export async function transcriptionActivity< + TAdapter extends TranscriptionAdapter, object>, + TModel extends TranscriptionModels, +>( + options: TranscriptionActivityOptions, +): TranscriptionActivityResult { + const { adapter, ...rest } = options + + return adapter.transcribe(rest) +} + +// Re-export adapter types +export type { + TranscriptionAdapter, + TranscriptionAdapterConfig, +} from './adapter' +export { BaseTranscriptionAdapter } from './adapter' diff --git a/packages/typescript/ai/src/activities/tts/adapter.ts b/packages/typescript/ai/src/activities/tts/adapter.ts new file mode 100644 index 00000000..17c7ab74 --- /dev/null +++ b/packages/typescript/ai/src/activities/tts/adapter.ts @@ -0,0 +1,72 @@ +import type { TTSOptions, TTSResult } from '../../types' + +/** + * Configuration for TTS adapter instances + */ +export interface TTSAdapterConfig { + apiKey?: string + baseUrl?: string + timeout?: number + maxRetries?: number + headers?: Record +} + +/** + * Base interface for text-to-speech adapters. + * Provides type-safe TTS functionality with support for + * model-specific provider options. 
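+ *
+ * Providers implement this interface by extending {@link BaseTTSAdapter}. A minimal
+ * sketch (the provider name, model name, and `callProviderApi` helper are hypothetical):
+ *
+ * ```ts
+ * class MyTTSAdapter extends BaseTTSAdapter {
+ *   readonly name = 'my-provider'
+ *   readonly models = ['my-tts-1'] as const
+ *
+ *   async generateSpeech(options: TTSOptions): Promise<TTSResult> {
+ *     const audio = await callProviderApi(options) // hypothetical helper returning base64 audio
+ *     return {
+ *       id: this.generateId(),
+ *       model: options.model,
+ *       audio,
+ *       format: options.format ?? 'mp3',
+ *     }
+ *   }
+ * }
+ * ```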
+ * + * Generic parameters: + * - TModels: Array of supported TTS model names + * - TProviderOptions: Base provider-specific options for TTS generation + */ +export interface TTSAdapter< + TModels extends ReadonlyArray = ReadonlyArray, + TProviderOptions extends object = Record, +> { + /** Discriminator for adapter kind - used by ai() to determine API shape */ + readonly kind: 'tts' + /** Adapter name identifier */ + readonly name: string + /** Supported TTS models */ + readonly models: TModels + + // Type-only properties for type inference + /** @internal Type-only property for provider options inference */ + _providerOptions?: TProviderOptions + + /** + * Generate speech from text + */ + generateSpeech: (options: TTSOptions) => Promise +} + +/** + * Abstract base class for text-to-speech adapters. + * Extend this class to implement a TTS adapter for a specific provider. + */ +export abstract class BaseTTSAdapter< + TModels extends ReadonlyArray = ReadonlyArray, + TProviderOptions extends object = Record, +> implements TTSAdapter { + readonly kind = 'tts' as const + abstract readonly name: string + abstract readonly models: TModels + + // Type-only properties - never assigned at runtime + declare _providerOptions?: TProviderOptions + + protected config: TTSAdapterConfig + + constructor(config: TTSAdapterConfig = {}) { + this.config = config + } + + abstract generateSpeech( + options: TTSOptions, + ): Promise + + protected generateId(): string { + return `${this.name}-${Date.now()}-${Math.random().toString(36).substring(7)}` + } +} diff --git a/packages/typescript/ai/src/activities/tts/index.ts b/packages/typescript/ai/src/activities/tts/index.ts new file mode 100644 index 00000000..eaf72d74 --- /dev/null +++ b/packages/typescript/ai/src/activities/tts/index.ts @@ -0,0 +1,118 @@ +/** + * TTS Activity + * + * Generates speech audio from text using text-to-speech models. + * This is a self-contained module with implementation, types, and JSDoc. + */ + +import type { TTSAdapter } from './adapter' +import type { TTSResult } from '../../types' + +// =========================== +// Activity Kind +// =========================== + +/** The adapter kind this activity handles */ +export const kind = 'tts' as const + +// =========================== +// Type Extraction Helpers +// =========================== + +/** Extract model types from a TTSAdapter */ +export type TTSModels = + TAdapter extends TTSAdapter ? M[number] : string + +/** + * Extract provider options from a TTSAdapter. + */ +export type TTSProviderOptions = + TAdapter extends TTSAdapter + ? TProviderOptions + : object + +// =========================== +// Activity Options Type +// =========================== + +/** + * Options for the TTS activity. 
+ * + * @template TAdapter - The TTS adapter type + * @template TModel - The model name type (inferred from adapter) + */ +export interface TTSActivityOptions< + TAdapter extends TTSAdapter, object>, + TModel extends TTSModels, +> { + /** The TTS adapter to use */ + adapter: TAdapter & { kind: typeof kind } + /** The model name (autocompletes based on adapter) */ + model: TModel + /** The text to convert to speech */ + text: string + /** The voice to use for generation */ + voice?: string + /** The output audio format */ + format?: 'mp3' | 'opus' | 'aac' | 'flac' | 'wav' | 'pcm' + /** The speed of the generated audio (0.25 to 4.0) */ + speed?: number + /** Provider-specific options for TTS generation */ + providerOptions?: TTSProviderOptions +} + +// =========================== +// Activity Result Type +// =========================== + +/** Result type for the TTS activity */ +export type TTSActivityResult = Promise + +// =========================== +// Activity Implementation +// =========================== + +/** + * TTS activity - generates speech from text. + * + * Uses AI text-to-speech models to create audio from natural language text. + * + * @example Generate speech from text + * ```ts + * import { ai } from '@tanstack/ai' + * import { openaiTTS } from '@tanstack/ai-openai' + * + * const result = await ai({ + * adapter: openaiTTS(), + * model: 'tts-1-hd', + * text: 'Hello, welcome to TanStack AI!', + * voice: 'nova' + * }) + * + * console.log(result.audio) // base64-encoded audio + * ``` + * + * @example With format and speed options + * ```ts + * const result = await ai({ + * adapter: openaiTTS(), + * model: 'tts-1', + * text: 'This is slower speech.', + * voice: 'alloy', + * format: 'wav', + * speed: 0.8 + * }) + * ``` + */ +export async function ttsActivity< + TAdapter extends TTSAdapter, object>, + TModel extends TTSModels, +>(options: TTSActivityOptions): TTSActivityResult { + const { adapter, ...rest } = options + + return adapter.generateSpeech(rest) +} + +// Re-export adapter types +export type { TTSAdapter, TTSAdapterConfig } from './adapter' +export { BaseTTSAdapter } from './adapter' diff --git a/packages/typescript/ai/src/activities/video/adapter.ts b/packages/typescript/ai/src/activities/video/adapter.ts new file mode 100644 index 00000000..f5a55d70 --- /dev/null +++ b/packages/typescript/ai/src/activities/video/adapter.ts @@ -0,0 +1,101 @@ +import type { + VideoGenerationOptions, + VideoJobResult, + VideoStatusResult, + VideoUrlResult, +} from '../../types' + +/** + * Configuration for video adapter instances + * + * @experimental Video generation is an experimental feature and may change. + */ +export interface VideoAdapterConfig { + apiKey?: string + baseUrl?: string + timeout?: number + maxRetries?: number + headers?: Record +} + +/** + * Base interface for video generation adapters. + * Provides type-safe video generation functionality with support for + * job-based async operations (create, poll status, get URL). + * + * @experimental Video generation is an experimental feature and may change. 
+ * + * Generic parameters: + * - TModels: Array of supported video model names + * - TProviderOptions: Base provider-specific options for video generation + */ +export interface VideoAdapter< + TModels extends ReadonlyArray = ReadonlyArray, + TProviderOptions extends object = Record, +> { + /** Discriminator for adapter kind - used by ai() to determine API shape */ + readonly kind: 'video' + /** Adapter name identifier */ + readonly name: string + /** Supported video generation models */ + readonly models: TModels + + // Type-only properties for type inference + /** @internal Type-only property for provider options inference */ + _providerOptions?: TProviderOptions + + /** + * Create a new video generation job. + * Returns a job ID that can be used to poll for status and retrieve the video. + */ + createVideoJob: ( + options: VideoGenerationOptions, + ) => Promise + + /** + * Get the current status of a video generation job. + */ + getVideoStatus: (jobId: string) => Promise + + /** + * Get the URL to download/view the generated video. + * Should only be called after status is 'completed'. + */ + getVideoUrl: (jobId: string) => Promise +} + +/** + * Abstract base class for video generation adapters. + * Extend this class to implement a video adapter for a specific provider. + * + * @experimental Video generation is an experimental feature and may change. + */ +export abstract class BaseVideoAdapter< + TModels extends ReadonlyArray = ReadonlyArray, + TProviderOptions extends object = Record, +> implements VideoAdapter { + readonly kind = 'video' as const + abstract readonly name: string + abstract readonly models: TModels + + // Type-only properties - never assigned at runtime + declare _providerOptions?: TProviderOptions + + protected config: VideoAdapterConfig + + constructor(config: VideoAdapterConfig = {}) { + this.config = config + } + + abstract createVideoJob( + options: VideoGenerationOptions, + ): Promise + + abstract getVideoStatus(jobId: string): Promise + + abstract getVideoUrl(jobId: string): Promise + + protected generateId(): string { + return `${this.name}-${Date.now()}-${Math.random().toString(36).substring(7)}` + } +} diff --git a/packages/typescript/ai/src/activities/video/index.ts b/packages/typescript/ai/src/activities/video/index.ts new file mode 100644 index 00000000..3ed876ec --- /dev/null +++ b/packages/typescript/ai/src/activities/video/index.ts @@ -0,0 +1,230 @@ +/** + * Video Activity (Experimental) + * + * Generates videos from text prompts using a jobs/polling architecture. + * This is a self-contained module with implementation, types, and JSDoc. + * + * @experimental Video generation is an experimental feature and may change. + */ + +import type { VideoAdapter } from './adapter' +import type { + VideoJobResult, + VideoStatusResult, + VideoUrlResult, +} from '../../types' + +// =========================== +// Activity Kind +// =========================== + +/** The adapter kind this activity handles */ +export const kind = 'video' as const + +// =========================== +// Type Extraction Helpers +// =========================== + +/** Extract model types from a VideoAdapter */ +export type VideoModels = + TAdapter extends VideoAdapter ? M[number] : string + +/** + * Extract provider options from a VideoAdapter. + */ +export type VideoProviderOptions = + TAdapter extends VideoAdapter + ? 
TProviderOptions + : object + +// =========================== +// Activity Options Types +// =========================== + +/** + * Base options shared by all video activity operations. + */ +interface VideoActivityBaseOptions< + TAdapter extends VideoAdapter, object>, + TModel extends VideoModels, +> { + /** The video adapter to use */ + adapter: TAdapter & { kind: typeof kind } + /** The model name (autocompletes based on adapter) */ + model: TModel +} + +/** + * Options for creating a new video generation job. + * + * @experimental Video generation is an experimental feature and may change. + */ +export interface VideoCreateOptions< + TAdapter extends VideoAdapter, object>, + TModel extends VideoModels, +> extends VideoActivityBaseOptions { + /** Request type - create a new job (default if not specified) */ + request?: 'create' + /** Text description of the desired video */ + prompt: string + /** Video size in WIDTHxHEIGHT format (e.g., "1280x720") */ + size?: string + /** Video duration in seconds */ + duration?: number + /** Provider-specific options for video generation */ + providerOptions?: VideoProviderOptions +} + +/** + * Options for polling the status of a video generation job. + * + * @experimental Video generation is an experimental feature and may change. + */ +export interface VideoStatusOptions< + TAdapter extends VideoAdapter, object>, + TModel extends VideoModels, +> extends VideoActivityBaseOptions { + /** Request type - get job status */ + request: 'status' + /** The job ID to check status for */ + jobId: string +} + +/** + * Options for getting the URL of a completed video. + * + * @experimental Video generation is an experimental feature and may change. + */ +export interface VideoUrlOptions< + TAdapter extends VideoAdapter, object>, + TModel extends VideoModels, +> extends VideoActivityBaseOptions { + /** Request type - get video URL */ + request: 'url' + /** The job ID to get URL for */ + jobId: string +} + +/** + * Union type for all video activity options. + * Discriminated by the `request` field. + * + * @experimental Video generation is an experimental feature and may change. + */ +export type VideoActivityOptions< + TAdapter extends VideoAdapter, object>, + TModel extends VideoModels, + TRequest extends 'create' | 'status' | 'url' = 'create', +> = TRequest extends 'status' + ? VideoStatusOptions + : TRequest extends 'url' + ? VideoUrlOptions + : VideoCreateOptions + +// =========================== +// Activity Result Types +// =========================== + +/** + * Result type for the video activity, based on request type. + * + * @experimental Video generation is an experimental feature and may change. + */ +export type VideoActivityResult< + TRequest extends 'create' | 'status' | 'url' = 'create', +> = TRequest extends 'status' + ? Promise + : TRequest extends 'url' + ? Promise + : Promise + +// =========================== +// Activity Implementation +// =========================== + +/** + * Video activity - generates videos from text prompts using a jobs/polling pattern. + * + * Uses AI video generation models to create videos based on natural language descriptions. + * Unlike image generation, video generation is asynchronous and requires polling for completion. + * + * @experimental Video generation is an experimental feature and may change. 
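+ *
+ * @example End-to-end polling loop (a sketch combining the individual calls shown below; the 5s interval is arbitrary)
+ * ```ts
+ * import { ai } from '@tanstack/ai'
+ * import { openaiVideo } from '@tanstack/ai-openai'
+ *
+ * const adapter = openaiVideo()
+ *
+ * const { jobId } = await ai({ adapter, model: 'sora-2', prompt: 'A cat chasing a dog' })
+ *
+ * let status = await ai({ adapter, model: 'sora-2', jobId, request: 'status' })
+ * while (status.status === 'pending' || status.status === 'processing') {
+ *   await new Promise((resolve) => setTimeout(resolve, 5000))
+ *   status = await ai({ adapter, model: 'sora-2', jobId, request: 'status' })
+ * }
+ *
+ * if (status.status === 'completed') {
+ *   const { url } = await ai({ adapter, model: 'sora-2', jobId, request: 'url' })
+ *   console.log('Video URL:', url)
+ * }
+ * ```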
+ * + * @example Create a video generation job + * ```ts + * import { ai } from '@tanstack/ai' + * import { openaiVideo } from '@tanstack/ai-openai' + * + * // Start a video generation job + * const { jobId } = await ai({ + * adapter: openaiVideo(), + * model: 'sora-2', + * prompt: 'A cat chasing a dog in a sunny park' + * }) + * + * console.log('Job started:', jobId) + * ``` + * + * @example Poll for job status + * ```ts + * // Check status of the job + * const status = await ai({ + * adapter: openaiVideo(), + * model: 'sora-2', + * jobId, + * request: 'status' + * }) + * + * console.log('Status:', status.status, 'Progress:', status.progress) + * ``` + * + * @example Get the video URL when complete + * ```ts + * // Get the video URL (after status is 'completed') + * const { url } = await ai({ + * adapter: openaiVideo(), + * model: 'sora-2', + * jobId, + * request: 'url' + * }) + * + * console.log('Video URL:', url) + * ``` + */ +export async function videoActivity< + TAdapter extends VideoAdapter, object>, + TModel extends VideoModels, +>( + options: + | VideoCreateOptions + | VideoStatusOptions + | VideoUrlOptions, +): Promise { + const { adapter, request = 'create' } = options + + switch (request) { + case 'status': { + const statusOptions = options as VideoStatusOptions + return adapter.getVideoStatus(statusOptions.jobId) + } + case 'url': { + const urlOptions = options as VideoUrlOptions + return adapter.getVideoUrl(urlOptions.jobId) + } + case 'create': + default: { + const createOptions = options as VideoCreateOptions + return adapter.createVideoJob({ + model: createOptions.model, + prompt: createOptions.prompt, + size: createOptions.size, + duration: createOptions.duration, + providerOptions: createOptions.providerOptions, + }) + } + } +} + +// Re-export adapter types +export type { VideoAdapter, VideoAdapterConfig } from './adapter' +export { BaseVideoAdapter } from './adapter' diff --git a/packages/typescript/ai/src/ai.ts b/packages/typescript/ai/src/ai.ts index c05d3e30..079b14e6 100644 --- a/packages/typescript/ai/src/ai.ts +++ b/packages/typescript/ai/src/ai.ts @@ -13,22 +13,30 @@ import type { AIResultUnion, AISummarizeOptions, AITextOptions, + AIVideoCreateOptions, + AIVideoStatusOptions, + AIVideoUrlOptions, AnyAIAdapter, EmbeddingModels, ImageModels, SummarizeModels, TextModels, + VideoModels, } from './activities' import type { TextAdapter } from './activities/text/adapter' import type { EmbeddingAdapter } from './activities/embedding/adapter' import type { SummarizeAdapter } from './activities/summarize/adapter' import type { ImageAdapter } from './activities/image/adapter' +import type { VideoAdapter } from './activities/video/adapter' import type { z } from 'zod' import type { EmbeddingResult, ImageGenerationResult, StreamChunk, SummarizationResult, + VideoJobResult, + VideoStatusResult, + VideoUrlResult, } from './types' // =========================== @@ -41,6 +49,7 @@ export type GenerateAdapter = | EmbeddingAdapter, object> | SummarizeAdapter, object> | ImageAdapter, object, any, any> + | VideoAdapter, object> /** Alias for backwards compatibility */ export type AnyAdapter = GenerateAdapter @@ -54,6 +63,7 @@ type ExtractTextModels = TextModels type ExtractEmbeddingModels = EmbeddingModels type ExtractSummarizeModels = SummarizeModels type ExtractImageModels = ImageModels +type ExtractVideoModels = VideoModels // =========================== // Options/Return Type Mapping @@ -65,6 +75,7 @@ type AIOptionsFor< TSchema extends z.ZodType | undefined = undefined, 
TTextStream extends boolean = true, TSummarizeStream extends boolean = false, + TVideoRequest extends 'create' | 'status' | 'url' = 'create', > = TAdapter extends { kind: 'text' } ? AITextOptions< Extract< @@ -94,13 +105,35 @@ type AIOptionsFor< >, TModel & ExtractImageModels > - : never + : TAdapter extends { kind: 'video' } + ? TVideoRequest extends 'status' + ? AIVideoStatusOptions< + Extract, object>>, + TModel & ExtractVideoModels + > + : TVideoRequest extends 'url' + ? AIVideoUrlOptions< + Extract< + TAdapter, + VideoAdapter, object> + >, + TModel & ExtractVideoModels + > + : AIVideoCreateOptions< + Extract< + TAdapter, + VideoAdapter, object> + >, + TModel & ExtractVideoModels + > + : never type AIReturnFor< TAdapter extends AnyAIAdapter, TSchema extends z.ZodType | undefined = undefined, TTextStream extends boolean = true, TSummarizeStream extends boolean = false, + TVideoRequest extends 'create' | 'status' | 'url' = 'create', > = TAdapter extends { kind: 'text' } ? TSchema extends z.ZodType ? Promise> @@ -115,7 +148,13 @@ type AIReturnFor< : Promise : TAdapter extends { kind: 'image' } ? Promise - : never + : TAdapter extends { kind: 'video' } + ? TVideoRequest extends 'status' + ? Promise + : TVideoRequest extends 'url' + ? Promise + : Promise + : never // =========================== // AI Function @@ -130,6 +169,7 @@ type AIReturnFor< * - `'embedding'` → Embedding activity (vector generation) * - `'summarize'` → Summarize activity (text summarization) * - `'image'` → Image activity (image generation) + * - `'video'` → Video activity (video generation via jobs/polling) [experimental] * * @example Chat generation (streaming) * ```ts @@ -202,6 +242,34 @@ type AIReturnFor< * prompt: 'A serene mountain landscape' * }) * ``` + * + * @example Video generation (experimental) + * ```ts + * import { openaiVideo } from '@tanstack/ai-openai' + * + * // Create a video job + * const { jobId } = await ai({ + * adapter: openaiVideo(), + * model: 'sora-2', + * prompt: 'A cat chasing a dog' + * }) + * + * // Poll for status + * const status = await ai({ + * adapter: openaiVideo(), + * model: 'sora-2', + * jobId, + * request: 'status' + * }) + * + * // Get video URL when complete + * const { url } = await ai({ + * adapter: openaiVideo(), + * model: 'sora-2', + * jobId, + * request: 'url' + * }) + * ``` */ export function ai< TAdapter extends AnyAIAdapter, @@ -209,15 +277,17 @@ export function ai< TSchema extends z.ZodType | undefined = undefined, TTextStream extends boolean = true, TSummarizeStream extends boolean = false, + TVideoRequest extends 'create' | 'status' | 'url' = 'create', >( options: AIOptionsFor< TAdapter, TModel, TSchema, TTextStream, - TSummarizeStream + TSummarizeStream, + TVideoRequest >, -): AIReturnFor +): AIReturnFor // Implementation export function ai(options: AIOptionsUnion): AIResultUnion { @@ -240,6 +310,7 @@ export type { TextAdapter } from './activities/text/adapter' export type { EmbeddingAdapter } from './activities/embedding/adapter' export type { SummarizeAdapter } from './activities/summarize/adapter' export type { ImageAdapter } from './activities/image/adapter' +export type { VideoAdapter } from './activities/video/adapter' // Re-export type helpers export type { @@ -247,6 +318,7 @@ export type { EmbeddingModels, SummarizeModels, ImageModels, + VideoModels, } from './activities' // Re-export activity option types and legacy aliases used by the package entrypoint @@ -258,8 +330,10 @@ export type { EmbeddingGenerateOptions, SummarizeGenerateOptions, 
ImageGenerateOptions, + VideoGenerateOptions, GenerateTextOptions, GenerateEmbeddingOptions, GenerateSummarizeOptions, GenerateImageOptions, + GenerateVideoOptions, } from './activities' diff --git a/packages/typescript/ai/src/types.ts b/packages/typescript/ai/src/types.ts index e2512709..6763547a 100644 --- a/packages/typescript/ai/src/types.ts +++ b/packages/typescript/ai/src/types.ts @@ -798,6 +798,189 @@ export interface ImageGenerationResult { } } +// ============================================================================ +// Video Generation Types (Experimental) +// ============================================================================ + +/** + * Options for video generation. + * These are the common options supported across providers. + * + * @experimental Video generation is an experimental feature and may change. + */ +export interface VideoGenerationOptions< + TProviderOptions extends object = object, +> { + /** The model to use for video generation */ + model: string + /** Text description of the desired video */ + prompt: string + /** Video size in WIDTHxHEIGHT format (e.g., "1280x720") */ + size?: string + /** Video duration in seconds */ + duration?: number + /** Provider-specific options for video generation */ + providerOptions?: TProviderOptions +} + +/** + * Result of creating a video generation job. + * + * @experimental Video generation is an experimental feature and may change. + */ +export interface VideoJobResult { + /** Unique job identifier for polling status */ + jobId: string + /** Model used for generation */ + model: string +} + +/** + * Status of a video generation job. + * + * @experimental Video generation is an experimental feature and may change. + */ +export interface VideoStatusResult { + /** Job identifier */ + jobId: string + /** Current status of the job */ + status: 'pending' | 'processing' | 'completed' | 'failed' + /** Progress percentage (0-100), if available */ + progress?: number + /** Error message if status is 'failed' */ + error?: string +} + +/** + * Result containing the URL to a generated video. + * + * @experimental Video generation is an experimental feature and may change. + */ +export interface VideoUrlResult { + /** Job identifier */ + jobId: string + /** URL to the generated video */ + url: string + /** When the URL expires, if applicable */ + expiresAt?: Date +} + +// ============================================================================ +// Text-to-Speech (TTS) Types +// ============================================================================ + +/** + * Options for text-to-speech generation. + * These are the common options supported across providers. + */ +export interface TTSOptions { + /** The model to use for TTS generation */ + model: string + /** The text to convert to speech */ + text: string + /** The voice to use for generation */ + voice?: string + /** The output audio format */ + format?: 'mp3' | 'opus' | 'aac' | 'flac' | 'wav' | 'pcm' + /** The speed of the generated audio (0.25 to 4.0) */ + speed?: number + /** Provider-specific options for TTS generation */ + providerOptions?: TProviderOptions +} + +/** + * Result of text-to-speech generation. 
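+ *
+ * The `audio` field is base64-encoded. A minimal Node sketch for saving it to disk
+ * (assumes an mp3 result; 'speech.mp3' is a placeholder path):
+ *
+ * ```ts
+ * import { writeFile } from 'node:fs/promises'
+ *
+ * async function saveSpeech(result: TTSResult) {
+ *   await writeFile('speech.mp3', Buffer.from(result.audio, 'base64'))
+ * }
+ * ```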
+ */ +export interface TTSResult { + /** Unique identifier for the generation */ + id: string + /** Model used for generation */ + model: string + /** Base64-encoded audio data */ + audio: string + /** Audio format of the generated audio */ + format: string + /** Duration of the audio in seconds, if available */ + duration?: number + /** Content type of the audio (e.g., 'audio/mp3') */ + contentType?: string +} + +// ============================================================================ +// Transcription (Speech-to-Text) Types +// ============================================================================ + +/** + * Options for audio transcription. + * These are the common options supported across providers. + */ +export interface TranscriptionOptions< + TProviderOptions extends object = object, +> { + /** The model to use for transcription */ + model: string + /** The audio data to transcribe - can be base64 string, File, Blob, or Buffer */ + audio: string | File | Blob | ArrayBuffer + /** The language of the audio in ISO-639-1 format (e.g., 'en') */ + language?: string + /** An optional prompt to guide the transcription */ + prompt?: string + /** The format of the transcription output */ + responseFormat?: 'json' | 'text' | 'srt' | 'verbose_json' | 'vtt' + /** Provider-specific options for transcription */ + providerOptions?: TProviderOptions +} + +/** + * A single segment of transcribed audio with timing information. + */ +export interface TranscriptionSegment { + /** Unique identifier for the segment */ + id: number + /** Start time of the segment in seconds */ + start: number + /** End time of the segment in seconds */ + end: number + /** Transcribed text for this segment */ + text: string + /** Confidence score (0-1), if available */ + confidence?: number + /** Speaker identifier, if diarization is enabled */ + speaker?: string +} + +/** + * A single word with timing information. + */ +export interface TranscriptionWord { + /** The transcribed word */ + word: string + /** Start time in seconds */ + start: number + /** End time in seconds */ + end: number +} + +/** + * Result of audio transcription. + */ +export interface TranscriptionResult { + /** Unique identifier for the transcription */ + id: string + /** Model used for transcription */ + model: string + /** The full transcribed text */ + text: string + /** Language detected or specified */ + language?: string + /** Duration of the audio in seconds */ + duration?: number + /** Detailed segments with timing, if available */ + segments?: Array + /** Word-level timestamps, if available */ + words?: Array +} + /** * Default metadata type for adapters that don't define custom metadata. * Uses unknown for all modalities. 
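Unlike text, image, TTS, and transcription, the new video activity is job-based: a `'create'` request returns a `VideoJobResult`, `'status'` returns a `VideoStatusResult`, and `'url'` returns a `VideoUrlResult`. A minimal polling sketch of how the three request kinds compose, using the `openaiVideo()` adapter and `sora-2` model from the jsdoc example above; the attempt cap and poll interval are illustrative assumptions, not a prescribed pattern:

```typescript
// Illustrative polling loop for the experimental job-based video flow.
// The adapter, model name, attempt cap, and 5s interval are assumptions
// borrowed from the jsdoc example; adjust for your provider's limits.
import { ai } from '@tanstack/ai'
import { openaiVideo } from '@tanstack/ai-openai'

async function generateVideo(prompt: string): Promise<string> {
  const adapter = openaiVideo()

  // 1. Create the job (request defaults to 'create' → VideoJobResult)
  const { jobId } = await ai({ adapter, model: 'sora-2', prompt })

  // 2. Poll for status ('status' → VideoStatusResult)
  for (let attempt = 0; attempt < 60; attempt++) {
    const status = await ai({
      adapter,
      model: 'sora-2',
      jobId,
      request: 'status',
    })

    if (status.status === 'completed') {
      // 3. Resolve the (possibly expiring) URL ('url' → VideoUrlResult)
      const { url } = await ai({
        adapter,
        model: 'sora-2',
        jobId,
        request: 'url',
      })
      return url
    }

    if (status.status === 'failed') {
      throw new Error(status.error ?? 'Video generation failed')
    }

    await new Promise((resolve) => setTimeout(resolve, 5_000))
  }

  throw new Error(`Video job ${jobId} did not complete in time`)
}
```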
diff --git a/packages/typescript/smoke-tests/adapters/fixtures/test-audio.mp3 b/packages/typescript/smoke-tests/adapters/fixtures/test-audio.mp3 new file mode 100644 index 00000000..08466969 Binary files /dev/null and b/packages/typescript/smoke-tests/adapters/fixtures/test-audio.mp3 differ diff --git a/packages/typescript/smoke-tests/adapters/src/adapters/index.ts b/packages/typescript/smoke-tests/adapters/src/adapters/index.ts index 91562919..c239d326 100644 --- a/packages/typescript/smoke-tests/adapters/src/adapters/index.ts +++ b/packages/typescript/smoke-tests/adapters/src/adapters/index.ts @@ -7,6 +7,7 @@ import { createGeminiImage, createGeminiSummarize, createGeminiText, + createGeminiTTS, } from '@tanstack/ai-gemini' import { createOllamaEmbed, @@ -18,6 +19,8 @@ import { createOpenaiImage, createOpenaiSummarize, createOpenaiText, + createOpenaiTTS, + createOpenaiTranscription, } from '@tanstack/ai-openai' /** @@ -32,6 +35,10 @@ export interface AdapterSet { embeddingAdapter?: any /** Image adapter for image generation */ imageAdapter?: any + /** TTS adapter for text-to-speech */ + ttsAdapter?: any + /** Transcription adapter for speech-to-text */ + transcriptionAdapter?: any /** Model to use for chat */ chatModel: string /** Model to use for summarization */ @@ -40,6 +47,10 @@ export interface AdapterSet { embeddingModel: string /** Model to use for image generation */ imageModel?: string + /** Model to use for TTS */ + ttsModel?: string + /** Model to use for transcription */ + transcriptionModel?: string } /** @@ -69,6 +80,9 @@ const OPENAI_SUMMARY_MODEL = process.env.OPENAI_SUMMARY_MODEL || OPENAI_MODEL const OPENAI_EMBEDDING_MODEL = process.env.OPENAI_EMBEDDING_MODEL || 'text-embedding-3-small' const OPENAI_IMAGE_MODEL = process.env.OPENAI_IMAGE_MODEL || 'gpt-image-1' +const OPENAI_TTS_MODEL = process.env.OPENAI_TTS_MODEL || 'tts-1' +const OPENAI_TRANSCRIPTION_MODEL = + process.env.OPENAI_TRANSCRIPTION_MODEL || 'whisper-1' const GEMINI_MODEL = process.env.GEMINI_MODEL || 'gemini-2.0-flash-lite' const GEMINI_SUMMARY_MODEL = process.env.GEMINI_SUMMARY_MODEL || GEMINI_MODEL @@ -76,6 +90,8 @@ const GEMINI_EMBEDDING_MODEL = process.env.GEMINI_EMBEDDING_MODEL || 'gemini-embedding-001' const GEMINI_IMAGE_MODEL = process.env.GEMINI_IMAGE_MODEL || 'imagen-3.0-generate-002' +const GEMINI_TTS_MODEL = + process.env.GEMINI_TTS_MODEL || 'gemini-2.5-flash-preview-tts' const OLLAMA_MODEL = process.env.OLLAMA_MODEL || 'mistral:7b' const OLLAMA_SUMMARY_MODEL = process.env.OLLAMA_SUMMARY_MODEL || OLLAMA_MODEL @@ -113,10 +129,14 @@ function createOpenAIAdapters(): AdapterSet | null { summarizeAdapter: createOpenaiSummarize(apiKey), embeddingAdapter: createOpenaiEmbed(apiKey), imageAdapter: createOpenaiImage(apiKey), + ttsAdapter: createOpenaiTTS(apiKey), + transcriptionAdapter: createOpenaiTranscription(apiKey), chatModel: OPENAI_MODEL, summarizeModel: OPENAI_SUMMARY_MODEL, embeddingModel: OPENAI_EMBEDDING_MODEL, imageModel: OPENAI_IMAGE_MODEL, + ttsModel: OPENAI_TTS_MODEL, + transcriptionModel: OPENAI_TRANSCRIPTION_MODEL, } } @@ -132,10 +152,12 @@ function createGeminiAdapters(): AdapterSet | null { summarizeAdapter: createGeminiSummarize(apiKey), embeddingAdapter: createGeminiEmbed(apiKey), imageAdapter: createGeminiImage(apiKey), + ttsAdapter: createGeminiTTS(apiKey), chatModel: GEMINI_MODEL, summarizeModel: GEMINI_SUMMARY_MODEL, embeddingModel: GEMINI_EMBEDDING_MODEL, imageModel: GEMINI_IMAGE_MODEL, + ttsModel: GEMINI_TTS_MODEL, } } diff --git 
a/packages/typescript/smoke-tests/adapters/src/cli.ts b/packages/typescript/smoke-tests/adapters/src/cli.ts index d7b353c7..b723e5a6 100644 --- a/packages/typescript/smoke-tests/adapters/src/cli.ts +++ b/packages/typescript/smoke-tests/adapters/src/cli.ts @@ -14,7 +14,6 @@ config({ path: '.env' }) interface AdapterResult { adapter: string - model: string tests: Record } @@ -32,7 +31,7 @@ function displayWidth(str: string): number { // Emojis and some special characters take 2 columns // This regex matches most common emojis const emojiRegex = - /[\u{1F300}-\u{1F9FF}]|[\u{2600}-\u{26FF}]|[\u{2700}-\u{27BF}]|✅|❌|⋯|⚠️/gu + /[\u{1F300}-\u{1F9FF}]|[\u{2600}-\u{26FF}]|[\u{2700}-\u{27BF}]|✅|❌|⚠️/gu const emojiCount = (str.match(emojiRegex) || []).length // Each emoji takes ~2 display columns but counts as 1-2 in length // We need to add extra padding for emojis @@ -105,6 +104,10 @@ function hasCapability( return !!adapterSet.embeddingAdapter case 'image': return !!adapterSet.imageAdapter + case 'tts': + return !!adapterSet.ttsAdapter + case 'transcription': + return !!adapterSet.transcriptionAdapter default: return false } @@ -118,11 +121,11 @@ function formatGrid(results: AdapterResult[], testsRun: TestDefinition[]) { // Build rows with result indicators const rows = results.map((result) => [ - `${result.adapter} (${result.model})`, + result.adapter, ...testsRun.map((test) => { const outcome = result.tests[test.id] if (!outcome) return '—' - if (outcome.ignored) return '⋯' + if (outcome.ignored) return '—' return outcome.passed ? '✅' : '❌' }), ]) @@ -189,11 +192,10 @@ async function runSequential( continue } - console.log(`\n${adapterDef.name} (chat: ${adapterSet.chatModel})`) + console.log(`\n${adapterDef.name}`) const adapterResult: AdapterResult = { adapter: adapterDef.name, - model: adapterSet.chatModel, tests: {}, } @@ -203,10 +205,14 @@ async function runSequential( summarizeAdapter: adapterSet.summarizeAdapter, embeddingAdapter: adapterSet.embeddingAdapter, imageAdapter: adapterSet.imageAdapter, + ttsAdapter: adapterSet.ttsAdapter, + transcriptionAdapter: adapterSet.transcriptionAdapter, model: adapterSet.chatModel, summarizeModel: adapterSet.summarizeModel, embeddingModel: adapterSet.embeddingModel, imageModel: adapterSet.imageModel, + ttsModel: adapterSet.ttsModel, + transcriptionModel: adapterSet.transcriptionModel, } for (const test of testsToRun) { @@ -216,7 +222,7 @@ async function runSequential( if (missingCapabilities.length > 0) { console.log( - `[${adapterDef.name}] ⋯ ${test.id}: Ignored (missing: ${missingCapabilities.join(', ')})`, + `[${adapterDef.name}] — ${test.id}: Ignored (missing: ${missingCapabilities.join(', ')})`, ) adapterResult.tests[test.id] = { passed: true, ignored: true } continue @@ -255,7 +261,6 @@ async function runParallel( // Initialize result for this adapter const adapterResult: AdapterResult = { adapter: adapterDef.name, - model: adapterSet.chatModel, tests: {}, } resultsMap.set(adapterDef.id, adapterResult) @@ -266,10 +271,14 @@ async function runParallel( summarizeAdapter: adapterSet.summarizeAdapter, embeddingAdapter: adapterSet.embeddingAdapter, imageAdapter: adapterSet.imageAdapter, + ttsAdapter: adapterSet.ttsAdapter, + transcriptionAdapter: adapterSet.transcriptionAdapter, model: adapterSet.chatModel, summarizeModel: adapterSet.summarizeModel, embeddingModel: adapterSet.embeddingModel, imageModel: adapterSet.imageModel, + ttsModel: adapterSet.ttsModel, + transcriptionModel: adapterSet.transcriptionModel, } for (const test of testsToRun) { diff 
--git a/packages/typescript/smoke-tests/adapters/src/harness.ts b/packages/typescript/smoke-tests/adapters/src/harness.ts index 35e58a78..026b7393 100644 --- a/packages/typescript/smoke-tests/adapters/src/harness.ts +++ b/packages/typescript/smoke-tests/adapters/src/harness.ts @@ -55,6 +55,10 @@ export interface AdapterContext { embeddingAdapter?: any /** Image adapter for image generation */ imageAdapter?: any + /** TTS adapter for text-to-speech */ + ttsAdapter?: any + /** Transcription adapter for speech-to-text */ + transcriptionAdapter?: any /** Model for chat/text */ model: string /** Model for summarization */ @@ -63,6 +67,10 @@ export interface AdapterContext { embeddingModel?: string /** Model for image generation */ imageModel?: string + /** Model for TTS */ + ttsModel?: string + /** Model for transcription */ + transcriptionModel?: string } interface DebugEnvelope { diff --git a/packages/typescript/smoke-tests/adapters/src/tests/emb-embedding.ts b/packages/typescript/smoke-tests/adapters/src/tests/emb-embedding.ts index e94b3a79..c6d5bc04 100644 --- a/packages/typescript/smoke-tests/adapters/src/tests/emb-embedding.ts +++ b/packages/typescript/smoke-tests/adapters/src/tests/emb-embedding.ts @@ -17,7 +17,7 @@ export async function runEMB( // Skip if no embedding adapter is available if (!adapterContext.embeddingAdapter) { console.log( - `[${adapterName}] ⋯ ${testName}: Ignored (no embedding adapter)`, + `[${adapterName}] — ${testName}: Ignored (no embedding adapter)`, ) return { passed: true, ignored: true } } diff --git a/packages/typescript/smoke-tests/adapters/src/tests/img-image-generation.ts b/packages/typescript/smoke-tests/adapters/src/tests/img-image-generation.ts index 1d349edf..8d197b09 100644 --- a/packages/typescript/smoke-tests/adapters/src/tests/img-image-generation.ts +++ b/packages/typescript/smoke-tests/adapters/src/tests/img-image-generation.ts @@ -19,7 +19,7 @@ export async function runIMG( // Skip if no image adapter is available if (!adapterContext.imageAdapter) { - console.log(`[${adapterName}] ⋯ ${testName}: Ignored (no image adapter)`) + console.log(`[${adapterName}] — ${testName}: Ignored (no image adapter)`) return { passed: true, ignored: true } } diff --git a/packages/typescript/smoke-tests/adapters/src/tests/index.ts b/packages/typescript/smoke-tests/adapters/src/tests/index.ts index 7e93a780..7e9dbca6 100644 --- a/packages/typescript/smoke-tests/adapters/src/tests/index.ts +++ b/packages/typescript/smoke-tests/adapters/src/tests/index.ts @@ -10,11 +10,19 @@ import { runAGS } from './ags-agentic-structured' import { runSUM } from './sum-summarize' import { runEMB } from './emb-embedding' import { runIMG } from './img-image-generation' +import { runTTS } from './tts-text-to-speech' +import { runTRN } from './trn-transcription' /** * Adapter capability types */ -export type AdapterCapability = 'text' | 'summarize' | 'embedding' | 'image' +export type AdapterCapability = + | 'text' + | 'summarize' + | 'embedding' + | 'image' + | 'tts' + | 'transcription' /** * Definition for a test @@ -102,6 +110,22 @@ export const TESTS: TestDefinition[] = [ requires: ['image'], skipByDefault: true, // Skip unless explicitly requested }, + { + id: 'TTS', + name: 'Text-to-Speech', + description: 'Generate speech audio from text', + run: runTTS, + requires: ['tts'], + skipByDefault: true, // Skip unless explicitly requested + }, + { + id: 'TRN', + name: 'Transcription', + description: 'Transcribe audio to text', + run: runTRN, + requires: ['transcription'], + 
skipByDefault: true, // Skip unless explicitly requested + }, ] /** diff --git a/packages/typescript/smoke-tests/adapters/src/tests/sum-summarize.ts b/packages/typescript/smoke-tests/adapters/src/tests/sum-summarize.ts index 1604d236..8dc63df4 100644 --- a/packages/typescript/smoke-tests/adapters/src/tests/sum-summarize.ts +++ b/packages/typescript/smoke-tests/adapters/src/tests/sum-summarize.ts @@ -17,7 +17,7 @@ export async function runSUM( // Skip if no summarize adapter is available if (!adapterContext.summarizeAdapter) { console.log( - `[${adapterName}] ⋯ ${testName}: Ignored (no summarize adapter)`, + `[${adapterName}] — ${testName}: Ignored (no summarize adapter)`, ) return { passed: true, ignored: true } } diff --git a/packages/typescript/smoke-tests/adapters/src/tests/trn-transcription.ts b/packages/typescript/smoke-tests/adapters/src/tests/trn-transcription.ts new file mode 100644 index 00000000..ef15ded9 --- /dev/null +++ b/packages/typescript/smoke-tests/adapters/src/tests/trn-transcription.ts @@ -0,0 +1,115 @@ +import { readFile } from 'node:fs/promises' +import { join } from 'node:path' +import { ai } from '@tanstack/ai' +import { writeDebugFile } from '../harness' +import type { AdapterContext, TestOutcome } from '../harness' + +/** + * TRN: Audio Transcription Test + * + * Tests audio transcription by providing an audio file and + * verifying we get valid transcription text back. + * + * NOTE: This test is skipped by default to avoid transcription + * costs on every run. Use --tests trn to run explicitly. + * + * Requires a test audio file at: fixtures/test-audio.mp3 + */ +export async function runTRN( + adapterContext: AdapterContext, +): Promise { + const testName = 'trn-transcription' + const adapterName = adapterContext.adapterName + + // Skip if no transcription adapter is available + if (!adapterContext.transcriptionAdapter) { + console.log( + `[${adapterName}] — ${testName}: Ignored (no transcription adapter)`, + ) + return { passed: true, ignored: true } + } + + const model = adapterContext.transcriptionModel || 'whisper-1' + + const debugData: Record = { + adapter: adapterName, + test: testName, + model, + timestamp: new Date().toISOString(), + } + + try { + // Try to load test audio file + const testAudioPath = join(process.cwd(), 'fixtures', 'test-audio.mp3') + let audioData: string + + try { + const audioBuffer = await readFile(testAudioPath) + audioData = audioBuffer.toString('base64') + debugData.input = { + audioFile: testAudioPath, + audioSize: audioBuffer.length, + } + } catch (fileError) { + // No test audio file available - skip test + console.log( + `[${adapterName}] — ${testName}: Ignored (no test audio file at fixtures/test-audio.mp3)`, + ) + return { passed: true, ignored: true } + } + + const result = await ai({ + adapter: adapterContext.transcriptionAdapter, + model, + audio: audioData, + language: 'en', + }) + + // Check that we got valid transcription data + const hasText = + result.text && typeof result.text === 'string' && result.text.length > 0 + const hasId = result.id && typeof result.id === 'string' + const hasModel = result.model && typeof result.model === 'string' + + const passed = hasText && hasId && hasModel + + debugData.summary = { + hasText, + hasId, + hasModel, + textLength: result.text?.length || 0, + textPreview: result.text?.substring(0, 100) || '', + language: result.language, + duration: result.duration, + segmentCount: result.segments?.length || 0, + wordCount: result.words?.length || 0, + } + debugData.result = { + passed, + 
error: passed + ? undefined + : !hasText + ? 'Transcription text missing' + : !hasId + ? 'ID missing' + : 'Model missing', + } + + await writeDebugFile(adapterName, testName, debugData) + + console.log( + `[${adapterName}] ${passed ? '✅' : '❌'} ${testName}${ + passed ? '' : `: ${debugData.result.error}` + }`, + ) + + return { passed, error: debugData.result.error } + } catch (error: any) { + const message = error?.message || String(error) + debugData.summary = { error: message } + debugData.result = { passed: false, error: message } + await writeDebugFile(adapterName, testName, debugData) + console.log(`[${adapterName}] ❌ ${testName}: ${message}`) + return { passed: false, error: message } + } +} diff --git a/packages/typescript/smoke-tests/adapters/src/tests/tts-text-to-speech.ts b/packages/typescript/smoke-tests/adapters/src/tests/tts-text-to-speech.ts new file mode 100644 index 00000000..853e3e61 --- /dev/null +++ b/packages/typescript/smoke-tests/adapters/src/tests/tts-text-to-speech.ts @@ -0,0 +1,94 @@ +import { ai } from '@tanstack/ai' +import { writeDebugFile } from '../harness' +import type { AdapterContext, TestOutcome } from '../harness' + +/** + * TTS: Text-to-Speech Test + * + * Tests text-to-speech generation by providing text and + * verifying we get valid audio data back. + * + * NOTE: This test is skipped by default to avoid generating + * audio on every run. Use --tests tts to run explicitly. + */ +export async function runTTS( + adapterContext: AdapterContext, +): Promise { + const testName = 'tts-text-to-speech' + const adapterName = adapterContext.adapterName + + // Skip if no TTS adapter is available + if (!adapterContext.ttsAdapter) { + console.log(`[${adapterName}] — ${testName}: Ignored (no TTS adapter)`) + return { passed: true, ignored: true } + } + + const model = adapterContext.ttsModel || 'tts-1' + const text = 'Hello, this is a test of text to speech synthesis.' + + const debugData: Record = { + adapter: adapterName, + test: testName, + model, + timestamp: new Date().toISOString(), + input: { text }, + } + + try { + const result = await ai({ + adapter: adapterContext.ttsAdapter, + model, + text, + voice: 'alloy', + format: 'mp3', + }) + + // Check that we got valid audio data + const hasAudio = + result.audio && + typeof result.audio === 'string' && + result.audio.length > 0 + const hasFormat = result.format && typeof result.format === 'string' + const hasId = result.id && typeof result.id === 'string' + + const passed = hasAudio && hasFormat && hasId + + debugData.summary = { + hasAudio, + hasFormat, + hasId, + format: result.format, + audioLength: result.audio?.length || 0, + // Don't log the actual audio data, just metadata + contentType: result.contentType, + duration: result.duration, + } + debugData.result = { + passed, + error: passed + ? undefined + : !hasAudio + ? 'Audio data missing' + : !hasFormat + ? 'Format missing' + : 'ID missing', + } + + await writeDebugFile(adapterName, testName, debugData) + + console.log( + `[${adapterName}] ${passed ? '✅' : '❌'} ${testName}${ + passed ? 
'' : `: ${debugData.result.error}` + }`, + ) + + return { passed, error: debugData.result.error } + } catch (error: any) { + const message = error?.message || String(error) + debugData.summary = { error: message } + debugData.result = { passed: false, error: message } + await writeDebugFile(adapterName, testName, debugData) + console.log(`[${adapterName}] ❌ ${testName}: ${message}`) + return { passed: false, error: message } + } +} diff --git a/testing/panel/src/components/Header.tsx b/testing/panel/src/components/Header.tsx index 633fc151..c682ca70 100644 --- a/testing/panel/src/components/Header.tsx +++ b/testing/panel/src/components/Header.tsx @@ -2,13 +2,17 @@ import { Link } from '@tanstack/react-router' import { useState } from 'react' import { + ChefHat, + FileText, + FlaskConical, Home, + ImageIcon, Menu, + Mic, + Package, + Video, + Volume2, X, - FlaskConical, - FileText, - ImageIcon, - ChefHat, } from 'lucide-react' export default function Header() { @@ -78,6 +82,24 @@ export default function Header() { Stream Debugger + setIsOpen(false)} + className="flex items-center gap-3 p-3 rounded-lg hover:bg-gray-800 transition-colors mb-2" + activeProps={{ + className: + 'flex items-center gap-3 p-3 rounded-lg bg-green-600 hover:bg-green-700 transition-colors mb-2', + }} + > + +
+ Add-on Manager + + Multi-Tool + +
+ +

Activities @@ -109,6 +131,50 @@ export default function Header() { Image Generation + setIsOpen(false)} + className="flex items-center gap-3 p-3 rounded-lg hover:bg-gray-800 transition-colors mb-2" + activeProps={{ + className: + 'flex items-center gap-3 p-3 rounded-lg bg-purple-600 hover:bg-purple-700 transition-colors mb-2', + }} + > +

+ Video Generation + + Exp + +
+ + + setIsOpen(false)} + className="flex items-center gap-3 p-3 rounded-lg hover:bg-gray-800 transition-colors mb-2" + activeProps={{ + className: + 'flex items-center gap-3 p-3 rounded-lg bg-emerald-600 hover:bg-emerald-700 transition-colors mb-2', + }} + > + + Text-to-Speech + + + setIsOpen(false)} + className="flex items-center gap-3 p-3 rounded-lg hover:bg-gray-800 transition-colors mb-2" + activeProps={{ + className: + 'flex items-center gap-3 p-3 rounded-lg bg-amber-600 hover:bg-amber-700 transition-colors mb-2', + }} + > + + Transcription + + setIsOpen(false)} diff --git a/testing/panel/src/hooks/index.ts b/testing/panel/src/hooks/index.ts new file mode 100644 index 00000000..a00083b1 --- /dev/null +++ b/testing/panel/src/hooks/index.ts @@ -0,0 +1,2 @@ +export { useAudioRecorder } from './useAudioRecorder' +export { useTTS } from './useTTS' diff --git a/testing/panel/src/hooks/useAudioRecorder.ts b/testing/panel/src/hooks/useAudioRecorder.ts new file mode 100644 index 00000000..3a54bb9d --- /dev/null +++ b/testing/panel/src/hooks/useAudioRecorder.ts @@ -0,0 +1,85 @@ +import { useCallback, useRef, useState } from 'react' + +/** + * Hook for recording audio and transcribing it via the transcription API. + */ +export function useAudioRecorder() { + const [isRecording, setIsRecording] = useState(false) + const [isTranscribing, setIsTranscribing] = useState(false) + const mediaRecorderRef = useRef(null) + const chunksRef = useRef([]) + + const startRecording = useCallback(async () => { + try { + const stream = await navigator.mediaDevices.getUserMedia({ audio: true }) + const mediaRecorder = new MediaRecorder(stream, { + mimeType: 'audio/webm;codecs=opus', + }) + mediaRecorderRef.current = mediaRecorder + chunksRef.current = [] + + mediaRecorder.ondataavailable = (e) => { + if (e.data.size > 0) { + chunksRef.current.push(e.data) + } + } + + mediaRecorder.start() + setIsRecording(true) + } catch (error) { + console.error('Failed to start recording:', error) + alert('Could not access microphone. 
Please check permissions.') + } + }, []) + + const stopRecording = useCallback(async (): Promise => { + return new Promise((resolve) => { + const mediaRecorder = mediaRecorderRef.current + if (!mediaRecorder) { + resolve(null) + return + } + + mediaRecorder.onstop = async () => { + setIsRecording(false) + setIsTranscribing(true) + + const audioBlob = new Blob(chunksRef.current, { type: 'audio/webm' }) + + // Stop all tracks + mediaRecorder.stream.getTracks().forEach((track) => track.stop()) + + try { + const formData = new FormData() + formData.append( + 'audio', + new File([audioBlob], 'recording.webm', { type: 'audio/webm' }), + ) + formData.append('model', 'whisper-1') + + const response = await fetch('/api/transcription', { + method: 'POST', + body: formData, + }) + + if (!response.ok) { + const errorData = await response.json() + throw new Error(errorData.error || 'Transcription failed') + } + + const result = await response.json() + setIsTranscribing(false) + resolve(result.text || null) + } catch (error) { + console.error('Transcription error:', error) + setIsTranscribing(false) + resolve(null) + } + } + + mediaRecorder.stop() + }) + }, []) + + return { isRecording, isTranscribing, startRecording, stopRecording } +} diff --git a/testing/panel/src/hooks/useTTS.ts b/testing/panel/src/hooks/useTTS.ts new file mode 100644 index 00000000..c8712618 --- /dev/null +++ b/testing/panel/src/hooks/useTTS.ts @@ -0,0 +1,78 @@ +import { useCallback, useRef, useState } from 'react' + +/** + * Hook for text-to-speech playback via the TTS API. + */ +export function useTTS() { + const [playingId, setPlayingId] = useState(null) + const audioRef = useRef(null) + + const speak = useCallback(async (text: string, id: string) => { + // Stop any currently playing audio + if (audioRef.current) { + audioRef.current.pause() + audioRef.current = null + } + + setPlayingId(id) + + try { + const response = await fetch('/api/tts', { + method: 'POST', + headers: { 'Content-Type': 'application/json' }, + body: JSON.stringify({ + text, + voice: 'nova', + model: 'tts-1', + format: 'mp3', + }), + }) + + if (!response.ok) { + const errorData = await response.json() + throw new Error(errorData.error || 'TTS failed') + } + + const result = await response.json() + + // Convert base64 to audio and play + const audioData = atob(result.audio) + const bytes = new Uint8Array(audioData.length) + for (let i = 0; i < audioData.length; i++) { + bytes[i] = audioData.charCodeAt(i) + } + const blob = new Blob([bytes], { type: result.contentType }) + const url = URL.createObjectURL(blob) + + const audio = new Audio(url) + audioRef.current = audio + + audio.onended = () => { + URL.revokeObjectURL(url) + setPlayingId(null) + audioRef.current = null + } + + audio.onerror = () => { + URL.revokeObjectURL(url) + setPlayingId(null) + audioRef.current = null + } + + await audio.play() + } catch (error) { + console.error('TTS error:', error) + setPlayingId(null) + } + }, []) + + const stop = useCallback(() => { + if (audioRef.current) { + audioRef.current.pause() + audioRef.current = null + } + setPlayingId(null) + }, []) + + return { playingId, speak, stop } +} diff --git a/testing/panel/src/lib/addon-tools.ts b/testing/panel/src/lib/addon-tools.ts new file mode 100644 index 00000000..86d5303b --- /dev/null +++ b/testing/panel/src/lib/addon-tools.ts @@ -0,0 +1,114 @@ +import { toolDefinition } from '@tanstack/ai' +import { z } from 'zod' + +// Sample add-ons data +export const availableAddOns = [ + { + id: 'clerk', + name: 'Clerk 
Authentication', + description: + 'Complete user management and authentication solution with social logins, MFA, and session management.', + type: 'authentication', + }, + { + id: 'stripe', + name: 'Stripe Payments', + description: + 'Accept payments and manage subscriptions with the leading payment processing platform.', + type: 'payments', + }, + { + id: 'drizzle', + name: 'Drizzle ORM', + description: + 'TypeScript-first ORM with excellent DX, type safety, and SQL-like query builder.', + type: 'database', + }, + { + id: 'prisma', + name: 'Prisma ORM', + description: + 'Next-generation ORM with intuitive data modeling, automated migrations, and type safety.', + type: 'database', + }, + { + id: 'tailwind', + name: 'Tailwind CSS', + description: + 'Utility-first CSS framework for rapidly building custom user interfaces.', + type: 'styling', + }, + { + id: 'shadcn', + name: 'shadcn/ui', + description: + 'Beautifully designed components built with Radix UI and Tailwind CSS.', + type: 'ui-components', + }, + { + id: 'sentry', + name: 'Sentry', + description: + 'Error tracking and performance monitoring for your application.', + type: 'monitoring', + }, + { + id: 'posthog', + name: 'PostHog', + description: + 'Product analytics, session recording, and feature flags in one platform.', + type: 'analytics', + }, +] + +// Tool 1: Get available add-ons with their current selection state +export const getAvailableAddOnsToolDef = toolDefinition({ + name: 'getAvailableAddOns', + description: + 'Get all available add-ons that can be selected for the project. Returns the list of add-ons with their id, name, description, type, and current selection state.', + inputSchema: z.object({}), + outputSchema: z.array( + z.object({ + id: z.string(), + name: z.string(), + description: z.string(), + type: z.string(), + selected: z.boolean(), + enabled: z.boolean(), + }), + ), +}) + +// Tool 2: Select add-ons by ID +export const selectAddOnsToolDef = toolDefinition({ + name: 'selectAddOns', + description: + 'Select one or more add-ons by their IDs. This will enable the specified add-ons for the project.', + inputSchema: z.object({ + addOnIds: z + .array(z.string()) + .describe('Array of add-on IDs to select/enable'), + }), + outputSchema: z.object({ + success: z.boolean(), + selectedAddOns: z.array(z.string()), + message: z.string(), + }), +}) + +// Tool 3: Unselect add-ons by ID +export const unselectAddOnsToolDef = toolDefinition({ + name: 'unselectAddOns', + description: + 'Unselect one or more add-ons by their IDs. This will disable the specified add-ons for the project.', + inputSchema: z.object({ + addOnIds: z + .array(z.string()) + .describe('Array of add-on IDs to unselect/disable'), + }), + outputSchema: z.object({ + success: z.boolean(), + unselectedAddOns: z.array(z.string()), + message: z.string(), + }), +}) diff --git a/testing/panel/src/routeTree.gen.ts b/testing/panel/src/routeTree.gen.ts index 6459a6b9..9e1b8e08 100644 --- a/testing/panel/src/routeTree.gen.ts +++ b/testing/panel/src/routeTree.gen.ts @@ -9,18 +9,41 @@ // Additionally, you should also exclude this file from your linter and/or formatter to prevent it from being checked or modified. 
import { Route as rootRouteImport } from './routes/__root' +import { Route as VideoRouteImport } from './routes/video' +import { Route as TtsRouteImport } from './routes/tts' +import { Route as TranscriptionRouteImport } from './routes/transcription' import { Route as SummarizeRouteImport } from './routes/summarize' import { Route as StructuredRouteImport } from './routes/structured' import { Route as StreamDebuggerRouteImport } from './routes/stream-debugger' import { Route as ImageRouteImport } from './routes/image' +import { Route as AddonManagerRouteImport } from './routes/addon-manager' import { Route as IndexRouteImport } from './routes/index' +import { Route as ApiVideoRouteImport } from './routes/api.video' +import { Route as ApiTtsRouteImport } from './routes/api.tts' +import { Route as ApiTranscriptionRouteImport } from './routes/api.transcription' import { Route as ApiSummarizeRouteImport } from './routes/api.summarize' import { Route as ApiStructuredRouteImport } from './routes/api.structured' import { Route as ApiLoadTraceRouteImport } from './routes/api.load-trace' import { Route as ApiListTracesRouteImport } from './routes/api.list-traces' import { Route as ApiImageRouteImport } from './routes/api.image' import { Route as ApiChatRouteImport } from './routes/api.chat' +import { Route as ApiAddonChatRouteImport } from './routes/api.addon-chat' +const VideoRoute = VideoRouteImport.update({ + id: '/video', + path: '/video', + getParentRoute: () => rootRouteImport, +} as any) +const TtsRoute = TtsRouteImport.update({ + id: '/tts', + path: '/tts', + getParentRoute: () => rootRouteImport, +} as any) +const TranscriptionRoute = TranscriptionRouteImport.update({ + id: '/transcription', + path: '/transcription', + getParentRoute: () => rootRouteImport, +} as any) const SummarizeRoute = SummarizeRouteImport.update({ id: '/summarize', path: '/summarize', @@ -41,11 +64,31 @@ const ImageRoute = ImageRouteImport.update({ path: '/image', getParentRoute: () => rootRouteImport, } as any) +const AddonManagerRoute = AddonManagerRouteImport.update({ + id: '/addon-manager', + path: '/addon-manager', + getParentRoute: () => rootRouteImport, +} as any) const IndexRoute = IndexRouteImport.update({ id: '/', path: '/', getParentRoute: () => rootRouteImport, } as any) +const ApiVideoRoute = ApiVideoRouteImport.update({ + id: '/api/video', + path: '/api/video', + getParentRoute: () => rootRouteImport, +} as any) +const ApiTtsRoute = ApiTtsRouteImport.update({ + id: '/api/tts', + path: '/api/tts', + getParentRoute: () => rootRouteImport, +} as any) +const ApiTranscriptionRoute = ApiTranscriptionRouteImport.update({ + id: '/api/transcription', + path: '/api/transcription', + getParentRoute: () => rootRouteImport, +} as any) const ApiSummarizeRoute = ApiSummarizeRouteImport.update({ id: '/api/summarize', path: '/api/summarize', @@ -76,105 +119,187 @@ const ApiChatRoute = ApiChatRouteImport.update({ path: '/api/chat', getParentRoute: () => rootRouteImport, } as any) +const ApiAddonChatRoute = ApiAddonChatRouteImport.update({ + id: '/api/addon-chat', + path: '/api/addon-chat', + getParentRoute: () => rootRouteImport, +} as any) export interface FileRoutesByFullPath { '/': typeof IndexRoute + '/addon-manager': typeof AddonManagerRoute '/image': typeof ImageRoute '/stream-debugger': typeof StreamDebuggerRoute '/structured': typeof StructuredRoute '/summarize': typeof SummarizeRoute + '/transcription': typeof TranscriptionRoute + '/tts': typeof TtsRoute + '/video': typeof VideoRoute + '/api/addon-chat': typeof 
ApiAddonChatRoute '/api/chat': typeof ApiChatRoute '/api/image': typeof ApiImageRoute '/api/list-traces': typeof ApiListTracesRoute '/api/load-trace': typeof ApiLoadTraceRoute '/api/structured': typeof ApiStructuredRoute '/api/summarize': typeof ApiSummarizeRoute + '/api/transcription': typeof ApiTranscriptionRoute + '/api/tts': typeof ApiTtsRoute + '/api/video': typeof ApiVideoRoute } export interface FileRoutesByTo { '/': typeof IndexRoute + '/addon-manager': typeof AddonManagerRoute '/image': typeof ImageRoute '/stream-debugger': typeof StreamDebuggerRoute '/structured': typeof StructuredRoute '/summarize': typeof SummarizeRoute + '/transcription': typeof TranscriptionRoute + '/tts': typeof TtsRoute + '/video': typeof VideoRoute + '/api/addon-chat': typeof ApiAddonChatRoute '/api/chat': typeof ApiChatRoute '/api/image': typeof ApiImageRoute '/api/list-traces': typeof ApiListTracesRoute '/api/load-trace': typeof ApiLoadTraceRoute '/api/structured': typeof ApiStructuredRoute '/api/summarize': typeof ApiSummarizeRoute + '/api/transcription': typeof ApiTranscriptionRoute + '/api/tts': typeof ApiTtsRoute + '/api/video': typeof ApiVideoRoute } export interface FileRoutesById { __root__: typeof rootRouteImport '/': typeof IndexRoute + '/addon-manager': typeof AddonManagerRoute '/image': typeof ImageRoute '/stream-debugger': typeof StreamDebuggerRoute '/structured': typeof StructuredRoute '/summarize': typeof SummarizeRoute + '/transcription': typeof TranscriptionRoute + '/tts': typeof TtsRoute + '/video': typeof VideoRoute + '/api/addon-chat': typeof ApiAddonChatRoute '/api/chat': typeof ApiChatRoute '/api/image': typeof ApiImageRoute '/api/list-traces': typeof ApiListTracesRoute '/api/load-trace': typeof ApiLoadTraceRoute '/api/structured': typeof ApiStructuredRoute '/api/summarize': typeof ApiSummarizeRoute + '/api/transcription': typeof ApiTranscriptionRoute + '/api/tts': typeof ApiTtsRoute + '/api/video': typeof ApiVideoRoute } export interface FileRouteTypes { fileRoutesByFullPath: FileRoutesByFullPath fullPaths: | '/' + | '/addon-manager' | '/image' | '/stream-debugger' | '/structured' | '/summarize' + | '/transcription' + | '/tts' + | '/video' + | '/api/addon-chat' | '/api/chat' | '/api/image' | '/api/list-traces' | '/api/load-trace' | '/api/structured' | '/api/summarize' + | '/api/transcription' + | '/api/tts' + | '/api/video' fileRoutesByTo: FileRoutesByTo to: | '/' + | '/addon-manager' | '/image' | '/stream-debugger' | '/structured' | '/summarize' + | '/transcription' + | '/tts' + | '/video' + | '/api/addon-chat' | '/api/chat' | '/api/image' | '/api/list-traces' | '/api/load-trace' | '/api/structured' | '/api/summarize' + | '/api/transcription' + | '/api/tts' + | '/api/video' id: | '__root__' | '/' + | '/addon-manager' | '/image' | '/stream-debugger' | '/structured' | '/summarize' + | '/transcription' + | '/tts' + | '/video' + | '/api/addon-chat' | '/api/chat' | '/api/image' | '/api/list-traces' | '/api/load-trace' | '/api/structured' | '/api/summarize' + | '/api/transcription' + | '/api/tts' + | '/api/video' fileRoutesById: FileRoutesById } export interface RootRouteChildren { IndexRoute: typeof IndexRoute + AddonManagerRoute: typeof AddonManagerRoute ImageRoute: typeof ImageRoute StreamDebuggerRoute: typeof StreamDebuggerRoute StructuredRoute: typeof StructuredRoute SummarizeRoute: typeof SummarizeRoute + TranscriptionRoute: typeof TranscriptionRoute + TtsRoute: typeof TtsRoute + VideoRoute: typeof VideoRoute + ApiAddonChatRoute: typeof ApiAddonChatRoute ApiChatRoute: typeof 
ApiChatRoute ApiImageRoute: typeof ApiImageRoute ApiListTracesRoute: typeof ApiListTracesRoute ApiLoadTraceRoute: typeof ApiLoadTraceRoute ApiStructuredRoute: typeof ApiStructuredRoute ApiSummarizeRoute: typeof ApiSummarizeRoute + ApiTranscriptionRoute: typeof ApiTranscriptionRoute + ApiTtsRoute: typeof ApiTtsRoute + ApiVideoRoute: typeof ApiVideoRoute } declare module '@tanstack/react-router' { interface FileRoutesByPath { + '/video': { + id: '/video' + path: '/video' + fullPath: '/video' + preLoaderRoute: typeof VideoRouteImport + parentRoute: typeof rootRouteImport + } + '/tts': { + id: '/tts' + path: '/tts' + fullPath: '/tts' + preLoaderRoute: typeof TtsRouteImport + parentRoute: typeof rootRouteImport + } + '/transcription': { + id: '/transcription' + path: '/transcription' + fullPath: '/transcription' + preLoaderRoute: typeof TranscriptionRouteImport + parentRoute: typeof rootRouteImport + } '/summarize': { id: '/summarize' path: '/summarize' @@ -203,6 +328,13 @@ declare module '@tanstack/react-router' { preLoaderRoute: typeof ImageRouteImport parentRoute: typeof rootRouteImport } + '/addon-manager': { + id: '/addon-manager' + path: '/addon-manager' + fullPath: '/addon-manager' + preLoaderRoute: typeof AddonManagerRouteImport + parentRoute: typeof rootRouteImport + } '/': { id: '/' path: '/' @@ -210,6 +342,27 @@ declare module '@tanstack/react-router' { preLoaderRoute: typeof IndexRouteImport parentRoute: typeof rootRouteImport } + '/api/video': { + id: '/api/video' + path: '/api/video' + fullPath: '/api/video' + preLoaderRoute: typeof ApiVideoRouteImport + parentRoute: typeof rootRouteImport + } + '/api/tts': { + id: '/api/tts' + path: '/api/tts' + fullPath: '/api/tts' + preLoaderRoute: typeof ApiTtsRouteImport + parentRoute: typeof rootRouteImport + } + '/api/transcription': { + id: '/api/transcription' + path: '/api/transcription' + fullPath: '/api/transcription' + preLoaderRoute: typeof ApiTranscriptionRouteImport + parentRoute: typeof rootRouteImport + } '/api/summarize': { id: '/api/summarize' path: '/api/summarize' @@ -252,21 +405,36 @@ declare module '@tanstack/react-router' { preLoaderRoute: typeof ApiChatRouteImport parentRoute: typeof rootRouteImport } + '/api/addon-chat': { + id: '/api/addon-chat' + path: '/api/addon-chat' + fullPath: '/api/addon-chat' + preLoaderRoute: typeof ApiAddonChatRouteImport + parentRoute: typeof rootRouteImport + } } } const rootRouteChildren: RootRouteChildren = { IndexRoute: IndexRoute, + AddonManagerRoute: AddonManagerRoute, ImageRoute: ImageRoute, StreamDebuggerRoute: StreamDebuggerRoute, StructuredRoute: StructuredRoute, SummarizeRoute: SummarizeRoute, + TranscriptionRoute: TranscriptionRoute, + TtsRoute: TtsRoute, + VideoRoute: VideoRoute, + ApiAddonChatRoute: ApiAddonChatRoute, ApiChatRoute: ApiChatRoute, ApiImageRoute: ApiImageRoute, ApiListTracesRoute: ApiListTracesRoute, ApiLoadTraceRoute: ApiLoadTraceRoute, ApiStructuredRoute: ApiStructuredRoute, ApiSummarizeRoute: ApiSummarizeRoute, + ApiTranscriptionRoute: ApiTranscriptionRoute, + ApiTtsRoute: ApiTtsRoute, + ApiVideoRoute: ApiVideoRoute, } export const routeTree = rootRouteImport ._addFileChildren(rootRouteChildren) diff --git a/testing/panel/src/routes/addon-manager.tsx b/testing/panel/src/routes/addon-manager.tsx new file mode 100644 index 00000000..f4b01dfa --- /dev/null +++ b/testing/panel/src/routes/addon-manager.tsx @@ -0,0 +1,583 @@ +import { useCallback, useEffect, useMemo, useRef, useState } from 'react' +import { createFileRoute } from '@tanstack/react-router' +import { + 
Check, + Loader2, + Package, + Send, + Square, + X, + CreditCard, + Database, + Paintbrush, + LayoutGrid, + AlertTriangle, + BarChart3, + Shield, +} from 'lucide-react' +import ReactMarkdown from 'react-markdown' +import rehypeRaw from 'rehype-raw' +import rehypeSanitize from 'rehype-sanitize' +import { fetchServerSentEvents, useChat } from '@tanstack/ai-react' +import { clientTools } from '@tanstack/ai-client' +import type { UIMessage } from '@tanstack/ai-react' +import { + availableAddOns, + getAvailableAddOnsToolDef, + selectAddOnsToolDef, + unselectAddOnsToolDef, +} from '@/lib/addon-tools' + +// Type for add-on state +interface AddOnState { + selected: boolean + enabled: boolean +} + +// Icons for different add-on types +const typeIcons: Record = { + authentication: , + payments: , + database: , + styling: , + 'ui-components': , + monitoring: , + analytics: , +} + +function AddOnCard({ + addOn, + state, + onToggle, +}: { + addOn: (typeof availableAddOns)[0] + state: AddOnState + onToggle: () => void +}) { + const Icon = typeIcons[addOn.type] || + + return ( +
+
+
+ {Icon} +
+
+
+

{addOn.name}

+ + {addOn.type} + +
+

+ {addOn.description} +

+
+ +
+
+ ) +} + +function AddOnPanel({ + addOnState, + onToggle, +}: { + addOnState: Record + onToggle: (id: string) => void +}) { + const selectedCount = Object.values(addOnState).filter( + (s) => s.selected, + ).length + + return ( +
+
+

+ + Project Add-ons +

+

+ {selectedCount} of {availableAddOns.length} add-ons selected +

+
+
+ {availableAddOns.map((addOn) => ( + onToggle(addOn.id)} + /> + ))} +
+
+ ) +} + +function Messages({ messages }: { messages: Array }) { + const messagesContainerRef = useRef(null) + + useEffect(() => { + if (messagesContainerRef.current) { + messagesContainerRef.current.scrollTop = + messagesContainerRef.current.scrollHeight + } + }, [messages]) + + if (!messages.length) { + return ( +
+
+ +

Ask the AI to configure your add-ons

+

+ Try: "Add authentication and payments" or "Show me what's available" +

+
+
+ ) + } + + return ( +
+ {messages.map(({ id, role, parts }) => ( +
+
+ {role === 'assistant' ? ( +
+ AI +
+ ) : ( +
+ U +
+ )} +
+ {parts.map((part, index) => { + if (part.type === 'text' && part.content) { + return ( +
+ + {part.content} + +
+ ) + } + + // Show tool call status + if (part.type === 'tool-call') { + return ( +
+
+ Tool: + + {part.name} + + {part.state === 'input-streaming' && ( + + )} + {part.output !== undefined && ( + + )} +
+ {part.output !== undefined && ( +
+
+                            {JSON.stringify(part.output, null, 2).slice(0, 200)}
+                            {JSON.stringify(part.output).length > 200
+                              ? '...'
+                              : ''}
+                          
+
+ )} +
+ ) + } + + return null + })} +
+
+
+ ))} +
+ ) +} + +function DebugPanel({ + messages, + chunks, + onClearChunks, +}: { + messages: Array + chunks: Array + onClearChunks: () => void +}) { + const [activeTab, setActiveTab] = useState<'messages' | 'chunks'>('chunks') + + return ( +
+
+

Debug Panel

+

+ Monitor multi-tool execution +

+ +
+ + +
+
+ +
+ {activeTab === 'messages' && ( +
+            {JSON.stringify(messages, null, 2)}
+          
+ )} + + {activeTab === 'chunks' && ( +
+ + +
+ + + + + + + + + + {chunks.map((chunk, idx) => { + const toolName = + chunk.toolCall?.function?.name || chunk.toolName || '-' + + let detail = '-' + if (chunk.type === 'content' && chunk.content) { + detail = chunk.content + } else if ( + chunk.type === 'tool_call' && + chunk.toolCall?.function?.arguments + ) { + detail = chunk.toolCall.function.arguments + } else if (chunk.type === 'tool_result' && chunk.content) { + detail = chunk.content + } else if (chunk.type === 'tool-input-available') { + detail = JSON.stringify(chunk.input) + } else if (chunk.type === 'done') { + detail = `Finish: ${chunk.finishReason || 'unknown'}` + } + + if (detail.length > 100) { + detail = detail.substring(0, 100) + '...' + } + + return ( + + + + + + ) + })} + +
TypeTool NameDetail
{chunk.type}{toolName} + {detail} +
+
+
+ )} +
+
+ ) +} + +function AddonManagerPage() { + const [chunks, setChunks] = useState>([]) + const [input, setInput] = useState('') + + // Initialize add-on state + const [addOnState, setAddOnState] = useState>( + () => { + const initial: Record = {} + for (const addOn of availableAddOns) { + initial[addOn.id] = { selected: false, enabled: true } + } + return initial + }, + ) + + // Toggle add-on selection (for manual UI interaction) + const toggleAddOn = useCallback((id: string) => { + setAddOnState((prev) => ({ + ...prev, + [id]: { + ...prev[id], + selected: !prev[id]?.selected, + }, + })) + }, []) + + // Client tool 1: Returns current add-on state + const getAvailableAddOnsClient = useMemo( + () => + getAvailableAddOnsToolDef.client(() => { + console.log('[Client Tool] getAvailableAddOns called') + return availableAddOns.map((addOn) => ({ + id: addOn.id, + name: addOn.name, + description: addOn.description, + type: addOn.type, + selected: addOnState[addOn.id]?.selected ?? false, + enabled: addOnState[addOn.id]?.enabled ?? true, + })) + }), + [addOnState], + ) + + // Client tool 2: Selects add-ons + const selectAddOnsClient = useMemo( + () => + selectAddOnsToolDef.client((args) => { + console.log('[Client Tool] selectAddOns called with:', args) + + // Calculate what will be selected BEFORE calling setState + // (setState callback is async, so we can't read results from it) + const toSelect: string[] = [] + for (const addOnId of args.addOnIds) { + const state = addOnState[addOnId] + if (state && !state.selected && state.enabled) { + toSelect.push(addOnId) + } + } + + // Update state if there's anything to select + if (toSelect.length > 0) { + setAddOnState((prev) => { + const next = { ...prev } + for (const addOnId of toSelect) { + next[addOnId] = { ...next[addOnId], selected: true } + } + return next + }) + } + + return { + success: toSelect.length > 0, + selectedAddOns: toSelect, + message: + toSelect.length > 0 + ? `Successfully selected: ${toSelect.join(', ')}` + : 'No add-ons were selected (may already be selected or not found).', + } + }), + [addOnState], + ) + + // Client tool 3: Unselects add-ons + const unselectAddOnsClient = useMemo( + () => + unselectAddOnsToolDef.client((args) => { + console.log('[Client Tool] unselectAddOns called with:', args) + + // Calculate what will be unselected BEFORE calling setState + // (setState callback is async, so we can't read results from it) + const toUnselect: string[] = [] + for (const addOnId of args.addOnIds) { + const state = addOnState[addOnId] + if (state && state.selected && state.enabled) { + toUnselect.push(addOnId) + } + } + + // Update state if there's anything to unselect + if (toUnselect.length > 0) { + setAddOnState((prev) => { + const next = { ...prev } + for (const addOnId of toUnselect) { + next[addOnId] = { ...next[addOnId], selected: false } + } + return next + }) + } + + return { + success: toUnselect.length > 0, + unselectedAddOns: toUnselect, + message: + toUnselect.length > 0 + ? 
`Successfully unselected: ${toUnselect.join(', ')}` + : 'No add-ons were unselected (may not be selected or not found).', + } + }), + [addOnState], + ) + + // Combine client tools + const tools = useMemo( + () => + clientTools( + getAvailableAddOnsClient, + selectAddOnsClient, + unselectAddOnsClient, + ), + [getAvailableAddOnsClient, selectAddOnsClient, unselectAddOnsClient], + ) + + const { messages, sendMessage, isLoading, stop } = useChat({ + connection: fetchServerSentEvents('/api/addon-chat'), + tools, + onChunk: (chunk: any) => { + setChunks((prev) => [...prev, chunk]) + }, + }) + + const clearChunks = () => setChunks([]) + + return ( +
+ {/* Left side - Add-on Selection Panel (1/4 width) */} +
+ +
+ + {/* Middle - Chat (1/2 width) */} +
+
+

AI Add-on Assistant

+

+ Ask me to configure your project add-ons +

+
+ + + +
+ {isLoading && ( +
+ +
+ )} +
+