diff --git a/apps/docs/components/icons.tsx b/apps/docs/components/icons.tsx index 9562e82bcb..b4dd059b70 100644 --- a/apps/docs/components/icons.tsx +++ b/apps/docs/components/icons.tsx @@ -4085,7 +4085,29 @@ export function CalendlyIcon(props: SVGProps) { ) } -export function AudioWaveformIcon(props: SVGProps) { +export function STTIcon(props: SVGProps) { + return ( + + + + + + + ) +} + +export function TTSIcon(props: SVGProps) { return ( ) { ) } + +export function VideoIcon(props: SVGProps) { + return ( + + + + + ) +} diff --git a/apps/docs/components/ui/icon-mapping.ts b/apps/docs/components/ui/icon-mapping.ts index 4d57eae8a5..9bce0402fa 100644 --- a/apps/docs/components/ui/icon-mapping.ts +++ b/apps/docs/components/ui/icon-mapping.ts @@ -8,7 +8,6 @@ import { ApolloIcon, ArxivIcon, AsanaIcon, - AudioWaveformIcon, BrainIcon, BrowserUseIcon, CalendlyIcon, @@ -63,6 +62,7 @@ import { SalesforceIcon, SerperIcon, SlackIcon, + STTIcon, StagehandIcon, StripeIcon, SupabaseIcon, @@ -70,8 +70,10 @@ import { TelegramIcon, TranslateIcon, TrelloIcon, + TTSIcon, TwilioIcon, TypeformIcon, + VideoIcon, WealthboxIcon, WebflowIcon, WhatsAppIcon, @@ -92,16 +94,18 @@ export const blockTypeToIconMap: Record = { webflow: WebflowIcon, wealthbox: WealthboxIcon, vision: EyeIcon, + video_generator: VideoIcon, typeform: TypeformIcon, twilio_voice: TwilioIcon, twilio_sms: TwilioIcon, + tts: TTSIcon, trello: TrelloIcon, translate: TranslateIcon, thinking: BrainIcon, telegram: TelegramIcon, tavily: TavilyIcon, supabase: SupabaseIcon, - stt: AudioWaveformIcon, + stt: STTIcon, stripe: StripeIcon, stagehand_agent: StagehandIcon, stagehand: StagehandIcon, diff --git a/apps/docs/content/docs/en/tools/meta.json b/apps/docs/content/docs/en/tools/meta.json index 437ad185c9..51c52c3a71 100644 --- a/apps/docs/content/docs/en/tools/meta.json +++ b/apps/docs/content/docs/en/tools/meta.json @@ -68,9 +68,11 @@ "thinking", "translate", "trello", + "tts", "twilio_sms", "twilio_voice", "typeform", + "video_generator", "vision", "wealthbox", "webflow", diff --git a/apps/docs/content/docs/en/tools/stt.mdx b/apps/docs/content/docs/en/tools/stt.mdx index 2132b8c51b..d76afbbd31 100644 --- a/apps/docs/content/docs/en/tools/stt.mdx +++ b/apps/docs/content/docs/en/tools/stt.mdx @@ -11,15 +11,32 @@ import { BlockInfoCard } from "@/components/ui/block-info-card" /> {/* MANUAL-CONTENT-START:intro */} -Transcribe speech to text using state-of-the-art AI models from leading providers. The Sim Speech-to-Text (STT) tools allow you to convert audio and video files into accurate transcripts, supporting multiple languages, timestamps, and optional translation. +Transcribe speech to text using the latest AI models from world-class providers. Sim's Speech-to-Text (STT) tools empower you to turn audio and video into accurate, timestamped, and optionally translated transcripts—supporting a diversity of languages and enhanced with advanced features such as diarization and speaker identification. -Supported providers: +**Supported Providers & Models:** -- **[OpenAI Whisper](https://platform.openai.com/docs/guides/speech-to-text/overview)**: Advanced open-source STT model from OpenAI. Supports models such as `whisper-1` and handles a wide variety of languages and audio formats. -- **[Deepgram](https://deepgram.com/)**: Real-time and batch STT API with deep learning models like `nova-3`, `nova-2`, and `whisper-large`. Offers features like diarization, intent recognition, and industry-specific tuning. 
-- **[ElevenLabs](https://elevenlabs.io/)**: Known for high-quality speech AI, ElevenLabs provides STT models focused on accuracy and natural language understanding for numerous languages and dialects.
+- **[OpenAI Whisper](https://platform.openai.com/docs/guides/speech-to-text/overview)** (OpenAI):
+  Whisper is an open-source speech recognition model known for robustness across languages and recording conditions. The hosted `whisper-1` model handles transcription and optional translation to English, and accepts a prompt to steer style and proper nouns.
-Choose the provider and model best suited to your task—whether fast, production-grade transcription (Deepgram), highly accurate multi-language capability (Whisper), or advanced understanding and language coverage (ElevenLabs).
+
+- **[Deepgram](https://deepgram.com/)** (Deepgram Inc.):
+  Deepgram provides production-grade speech recognition APIs for real-time and batch transcription. Models include `nova-3`, `nova-2`, and `whisper-large`, with multi-language support, automatic punctuation, and speaker diarization.
+
+- **[ElevenLabs](https://elevenlabs.io/)** (ElevenLabs):
+  ElevenLabs' Scribe models deliver accurate transcription with word-level timestamps and speaker labels across many languages, dialects, and accents, making them a good fit for creative and accessibility workflows.
+
+- **[AssemblyAI](https://www.assemblyai.com/)** (AssemblyAI Inc.):
+  AssemblyAI pairs accurate transcription with optional speech-understanding features: speaker diarization, sentiment analysis, entity detection, PII redaction, and summarization. The `best` and `nano` model tiers trade accuracy against speed and cost.
+
+- **[Google Gemini](https://ai.google.dev/gemini-api/docs)** (Google):
+  Gemini's multimodal models (such as `gemini-2.5-flash`) accept audio directly and transcribe it from a prompt, with optional language hints and sentence-level timestamps. A minimal request sketch follows this list.
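The sketch below shows one way a script outside the canvas might call the STT proxy route that backs these tools (`POST /api/proxy/stt`). It is a minimal illustration under stated assumptions, not a documented client API: the `SIM_BASE_URL` and `SIM_SESSION_COOKIE` environment variables are placeholders, and authentication must be whatever your Sim deployment's hybrid auth accepts.

```ts
// Sketch: transcribe a remote audio file through Sim's STT proxy route.
// Assumes a running Sim deployment and credentials accepted by its hybrid auth;
// the header shown here is illustrative, not prescriptive.
interface SttProxyResult {
  transcript: string
  segments?: { text: string; start: number; end: number; speaker?: string }[]
  language?: string
  duration?: number
  confidence?: number
}

async function transcribe(audioUrl: string): Promise<SttProxyResult> {
  const baseUrl = process.env.SIM_BASE_URL ?? 'http://localhost:3000'

  const response = await fetch(`${baseUrl}/api/proxy/stt`, {
    method: 'POST',
    headers: {
      'Content-Type': 'application/json',
      // Illustrative only: supply whatever credentials your deployment expects.
      Cookie: process.env.SIM_SESSION_COOKIE ?? '',
    },
    body: JSON.stringify({
      provider: 'whisper', // or 'deepgram' | 'elevenlabs' | 'assemblyai' | 'gemini'
      apiKey: process.env.OPENAI_API_KEY,
      model: 'whisper-1',
      audioUrl,
      language: 'auto',
      timestamps: 'sentence',
      translateToEnglish: false,
    }),
  })

  if (!response.ok) {
    const { error } = await response.json().catch(() => ({ error: response.statusText }))
    throw new Error(`STT proxy error: ${error}`)
  }
  return (await response.json()) as SttProxyResult
}
```

Provider-specific fields (for example `prompt` and `temperature` for Whisper, or `sentiment` and `summarization` for AssemblyAI) go in the same request body.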
+
+**How to Choose:**
+Select the provider and model that fits your application: fast, production-grade transcription (Deepgram), broad multilingual accuracy with open-source roots (OpenAI Whisper), word-level timestamps and speaker labels (ElevenLabs), speech-understanding extras such as sentiment, entities, PII redaction, and summaries (AssemblyAI), or prompt-driven multimodal transcription (Google Gemini). Also weigh pricing, language coverage, accuracy, and turnaround time.
+
+For more details on capabilities, pricing, feature highlights, and fine-tuning options, refer to each provider's official documentation via the links above.

{/* MANUAL-CONTENT-END */}

@@ -48,6 +65,8 @@ Transcribe audio to text using OpenAI Whisper
| `language` | string | No | Language code \(e.g., "en", "es", "fr"\) or "auto" for auto-detection |
| `timestamps` | string | No | Timestamp granularity: none, sentence, or word |
| `translateToEnglish` | boolean | No | Translate audio to English |
+| `prompt` | string | No | Optional text to guide the model's style or continue a previous audio segment. Helps with proper nouns and context. |
+| `temperature` | number | No | Sampling temperature between 0 and 1. Higher values make output more random, lower values more focused and deterministic. |

#### Output

| Parameter | Type | Description |
| --------- | ---- | ----------- |
| `segments` | array | Timestamped segments |
| `language` | string | Detected or specified language |
| `duration` | number | Audio duration in seconds |
-| `confidence` | number | Overall confidence score |

### `stt_deepgram`

@@ -114,6 +132,68 @@ Transcribe audio to text using ElevenLabs
| `duration` | number | Audio duration in seconds |
| `confidence` | number | Overall confidence score |
+### `stt_assemblyai`
+
+Transcribe audio to text using AssemblyAI with advanced NLP features
+
+#### Input
+
+| Parameter | Type | Required | Description |
+| --------- | ---- | -------- | ----------- |
+| `provider` | string | Yes | STT provider \(assemblyai\) |
+| `apiKey` | string | Yes | AssemblyAI API key |
+| `model` | string | No | AssemblyAI model to use \(default: best\) |
+| `audioFile` | file | No | Audio or video file to transcribe |
+| `audioFileReference` | file | No | Reference to audio/video file from previous blocks |
+| `audioUrl` | string | No | URL to audio or video file |
+| `language` | string | No | Language code \(e.g., "en", "es", "fr"\) or "auto" for auto-detection |
+| `timestamps` | string | No | Timestamp granularity: none, sentence, or word |
+| `diarization` | boolean | No | Enable speaker diarization |
+| `sentiment` | boolean | No | Enable sentiment analysis |
+| `entityDetection` | boolean | No | Enable entity detection |
+| `piiRedaction` | boolean | No | Enable PII redaction |
+| `summarization` | boolean | No | Enable automatic summarization |
+
+#### Output
+
+| Parameter | Type | Description |
+| --------- | ---- | ----------- |
+| `transcript` | string | Full transcribed text |
+| `segments` | array | Timestamped segments with speaker labels |
+| `language` | string | Detected or specified language |
+| `duration` | number | Audio duration in seconds |
+| `confidence` | number | Overall confidence score |
+| `sentiment` | array | Sentiment analysis results |
+| `entities` | array | Detected entities |
+| `summary` | string | Auto-generated summary |
+
+### `stt_gemini`
+
+Transcribe audio to text using Google Gemini with multimodal capabilities
+
+#### Input
+
+| Parameter | Type | Required | Description |
+| --------- | ---- | -------- | ----------- 
| +| `provider` | string | Yes | STT provider \(gemini\) | +| `apiKey` | string | Yes | Google API key | +| `model` | string | No | Gemini model to use \(default: gemini-2.5-flash\) | +| `audioFile` | file | No | Audio or video file to transcribe | +| `audioFileReference` | file | No | Reference to audio/video file from previous blocks | +| `audioUrl` | string | No | URL to audio or video file | +| `language` | string | No | Language code \(e.g., "en", "es", "fr"\) or "auto" for auto-detection | +| `timestamps` | string | No | Timestamp granularity: none, sentence, or word | + +#### Output + +| Parameter | Type | Description | +| --------- | ---- | ----------- | +| `transcript` | string | Full transcribed text | +| `segments` | array | Timestamped segments | +| `language` | string | Detected or specified language | +| `duration` | number | Audio duration in seconds | +| `confidence` | number | Overall confidence score | + ## Notes diff --git a/apps/docs/content/docs/en/tools/tts.mdx b/apps/docs/content/docs/en/tools/tts.mdx new file mode 100644 index 0000000000..99b380939a --- /dev/null +++ b/apps/docs/content/docs/en/tools/tts.mdx @@ -0,0 +1,261 @@ +--- +title: Text-to-Speech +description: Convert text to speech using AI voices +--- + +import { BlockInfoCard } from "@/components/ui/block-info-card" + + + +{/* MANUAL-CONTENT-START:intro */} +Convert text to natural-sounding speech using the latest AI voices. Sim's Text-to-Speech (TTS) tools let you generate audio from written text in dozens of languages, with a choice of expressive voices, formats, and advanced controls like speed, style, emotion, and more. + +**Supported Providers & Models:** + +- **[OpenAI Text-to-Speech](https://platform.openai.com/docs/guides/text-to-speech/voice-options)** (OpenAI): + OpenAI's TTS API offers ultra-realistic voices using advanced AI models like `tts-1`, `tts-1-hd`, and `gpt-4o-mini-tts`. Voices include both male and female, with options such as alloy, echo, fable, onyx, nova, shimmer, ash, ballad, coral, sage, and verse. Supports multiple audio formats (mp3, opus, aac, flac, wav, pcm), adjustable speed and streaming synthesis. + +- **[Deepgram Aura](https://deepgram.com/products/text-to-speech)** (Deepgram Inc.): + Deepgram’s Aura provides expressive English and multilingual AI voices, optimized for conversational clarity, low latency, and customization. Models like `aura-asteria-en`, `aura-luna-en`, and others are available. Supports multiple encoding formats (linear16, mp3, opus, aac, flac) and fine tuning on speed, sample rate, and style. + +- **[ElevenLabs Text-to-Speech](https://elevenlabs.io/text-to-speech)** (ElevenLabs): + ElevenLabs leads in lifelike, emotionally rich TTS, offering dozens of voices in 29+ languages and the ability to clone custom voices. Models support voice design, speech synthesis, and direct API access, with advanced controls for style, emotion, stability, and similarity. Suitable for audiobooks, content creation, accessibility, and more. + +- **[Cartesia TTS](https://docs.cartesia.ai/)** (Cartesia): + Cartesia offers high-quality, fast, and secure text-to-speech with a focus on privacy and flexible deployment. It provides instant streaming, real-time synthesis, and supports multiple international voices and accents, accessible through a simple API. + +- **[Google Cloud Text-to-Speech](https://cloud.google.com/text-to-speech)** (Google Cloud): + Google uses DeepMind WaveNet and Neural2 models to power high-fidelity voices in 50+ languages and variants. 
Features include voice selection, pitch, speaking rate, volume control, SSML tags, and access to both standard and studio-grade premium voices. Widely used for accessibility, IVR, and media. + +- **[Microsoft Azure Speech](https://azure.microsoft.com/en-us/products/ai-services/text-to-speech)** (Microsoft Azure): + Azure provides over 400 neural voices across 140+ languages and locales, with unique voice customization, style, emotion, role, and real-time controls. Offers SSML support for pronunciation, intonation, and more. Ideal for global, enterprise, or creative TTS needs. + +- **[PlayHT](https://play.ht/)** (PlayHT): + PlayHT specializes in realistic voice synthesis, voice cloning, and instant streaming playback with 800+ voices in over 100 languages. Features include emotion, pitch and speed controls, multi-voice audio, and custom voice creation via the API or online studio. + +**How to Choose:** +Pick your provider and model by prioritizing languages, supported voice types, desired formats (mp3, wav, etc.), control granularity (speed, emotion, etc.), and specialized features (voice cloning, accent, streaming). For creative, accessibility, or developer use cases, ensure compatibility with your application's requirements and compare costs. + +Visit each provider’s official site for up-to-date capabilities, pricing, and documentation details! +{/* MANUAL-CONTENT-END */} + + +## Usage Instructions + +Generate natural-sounding speech from text using state-of-the-art AI voices from OpenAI, Deepgram, ElevenLabs, Cartesia, Google Cloud, Azure, and PlayHT. Supports multiple voices, languages, and audio formats. + + + +## Tools + +### `tts_openai` + +Convert text to speech using OpenAI TTS models + +#### Input + +| Parameter | Type | Required | Description | +| --------- | ---- | -------- | ----------- | +| `text` | string | Yes | The text to convert to speech | +| `apiKey` | string | Yes | OpenAI API key | +| `model` | string | No | TTS model to use \(tts-1, tts-1-hd, or gpt-4o-mini-tts\) | +| `voice` | string | No | Voice to use \(alloy, ash, ballad, cedar, coral, echo, marin, sage, shimmer, verse\) | +| `responseFormat` | string | No | Audio format \(mp3, opus, aac, flac, wav, pcm\) | +| `speed` | number | No | Speech speed \(0.25 to 4.0, default: 1.0\) | + +#### Output + +| Parameter | Type | Description | +| --------- | ---- | ----------- | +| `audioUrl` | string | URL to the generated audio file | +| `audioFile` | file | Generated audio file object | +| `duration` | number | Audio duration in seconds | +| `characterCount` | number | Number of characters processed | +| `format` | string | Audio format | +| `provider` | string | TTS provider used | + +### `tts_deepgram` + +Convert text to speech using Deepgram Aura + +#### Input + +| Parameter | Type | Required | Description | +| --------- | ---- | -------- | ----------- | +| `text` | string | Yes | The text to convert to speech | +| `apiKey` | string | Yes | Deepgram API key | +| `model` | string | No | Deepgram model/voice \(e.g., aura-asteria-en, aura-luna-en\) | +| `voice` | string | No | Voice identifier \(alternative to model param\) | +| `encoding` | string | No | Audio encoding \(linear16, mp3, opus, aac, flac\) | +| `sampleRate` | number | No | Sample rate \(8000, 16000, 24000, 48000\) | +| `bitRate` | number | No | Bit rate for compressed formats | +| `container` | string | No | Container format \(none, wav, ogg\) | + +#### Output + +| Parameter | Type | Description | +| --------- | ---- | ----------- | +| `audioUrl` | string | 
URL to the generated audio file | +| `audioFile` | file | Generated audio file object | +| `duration` | number | Audio duration in seconds | +| `characterCount` | number | Number of characters processed | +| `format` | string | Audio format | +| `provider` | string | TTS provider used | + +### `tts_elevenlabs` + +Convert text to speech using ElevenLabs voices + +#### Input + +| Parameter | Type | Required | Description | +| --------- | ---- | -------- | ----------- | +| `text` | string | Yes | The text to convert to speech | +| `voiceId` | string | Yes | The ID of the voice to use | +| `apiKey` | string | Yes | ElevenLabs API key | +| `modelId` | string | No | Model to use \(e.g., eleven_monolingual_v1, eleven_turbo_v2_5, eleven_flash_v2_5\) | +| `stability` | number | No | Voice stability \(0.0 to 1.0, default: 0.5\) | +| `similarityBoost` | number | No | Similarity boost \(0.0 to 1.0, default: 0.8\) | +| `style` | number | No | Style exaggeration \(0.0 to 1.0\) | +| `useSpeakerBoost` | boolean | No | Use speaker boost \(default: true\) | + +#### Output + +| Parameter | Type | Description | +| --------- | ---- | ----------- | +| `audioUrl` | string | URL to the generated audio file | +| `audioFile` | file | Generated audio file object | +| `duration` | number | Audio duration in seconds | +| `characterCount` | number | Number of characters processed | +| `format` | string | Audio format | +| `provider` | string | TTS provider used | + +### `tts_cartesia` + +Convert text to speech using Cartesia Sonic (ultra-low latency) + +#### Input + +| Parameter | Type | Required | Description | +| --------- | ---- | -------- | ----------- | +| `text` | string | Yes | The text to convert to speech | +| `apiKey` | string | Yes | Cartesia API key | +| `modelId` | string | No | Model ID \(sonic-english, sonic-multilingual\) | +| `voice` | string | No | Voice ID or embedding | +| `language` | string | No | Language code \(en, es, fr, de, it, pt, etc.\) | +| `outputFormat` | json | No | Output format configuration \(container, encoding, sampleRate\) | +| `speed` | number | No | Speed multiplier | +| `emotion` | array | No | Emotion tags for Sonic-3 \(e.g., \['positivity:high'\]\) | + +#### Output + +| Parameter | Type | Description | +| --------- | ---- | ----------- | +| `audioUrl` | string | URL to the generated audio file | +| `audioFile` | file | Generated audio file object | +| `duration` | number | Audio duration in seconds | +| `characterCount` | number | Number of characters processed | +| `format` | string | Audio format | +| `provider` | string | TTS provider used | + +### `tts_google` + +Convert text to speech using Google Cloud Text-to-Speech + +#### Input + +| Parameter | Type | Required | Description | +| --------- | ---- | -------- | ----------- | +| `text` | string | Yes | The text to convert to speech | +| `apiKey` | string | Yes | Google Cloud API key | +| `voiceId` | string | No | Voice ID \(e.g., en-US-Neural2-A, en-US-Wavenet-D\) | +| `languageCode` | string | Yes | Language code \(e.g., en-US, es-ES, fr-FR\) | +| `gender` | string | No | Voice gender \(MALE, FEMALE, NEUTRAL\) | +| `audioEncoding` | string | No | Audio encoding \(LINEAR16, MP3, OGG_OPUS, MULAW, ALAW\) | +| `speakingRate` | number | No | Speaking rate \(0.25 to 2.0, default: 1.0\) | +| `pitch` | number | No | Voice pitch \(-20.0 to 20.0, default: 0.0\) | +| `volumeGainDb` | number | No | Volume gain in dB \(-96.0 to 16.0\) | +| `sampleRateHertz` | number | No | Sample rate in Hz | +| `effectsProfileId` | array | No | 
Effects profile \(e.g., \['headphone-class-device'\]\) | + +#### Output + +| Parameter | Type | Description | +| --------- | ---- | ----------- | +| `audioUrl` | string | URL to the generated audio file | +| `audioFile` | file | Generated audio file object | +| `duration` | number | Audio duration in seconds | +| `characterCount` | number | Number of characters processed | +| `format` | string | Audio format | +| `provider` | string | TTS provider used | + +### `tts_azure` + +Convert text to speech using Azure Cognitive Services + +#### Input + +| Parameter | Type | Required | Description | +| --------- | ---- | -------- | ----------- | +| `text` | string | Yes | The text to convert to speech | +| `apiKey` | string | Yes | Azure Speech Services API key | +| `voiceId` | string | No | Voice ID \(e.g., en-US-JennyNeural, en-US-GuyNeural\) | +| `region` | string | No | Azure region \(e.g., eastus, westus, westeurope\) | +| `outputFormat` | string | No | Output audio format | +| `rate` | string | No | Speaking rate \(e.g., +10%, -20%, 1.5\) | +| `pitch` | string | No | Voice pitch \(e.g., +5Hz, -2st, low\) | +| `style` | string | No | Speaking style \(e.g., cheerful, sad, angry - neural voices only\) | +| `styleDegree` | number | No | Style intensity \(0.01 to 2.0\) | +| `role` | string | No | Role \(e.g., Girl, Boy, YoungAdultFemale\) | + +#### Output + +| Parameter | Type | Description | +| --------- | ---- | ----------- | +| `audioUrl` | string | URL to the generated audio file | +| `audioFile` | file | Generated audio file object | +| `duration` | number | Audio duration in seconds | +| `characterCount` | number | Number of characters processed | +| `format` | string | Audio format | +| `provider` | string | TTS provider used | + +### `tts_playht` + +Convert text to speech using PlayHT (voice cloning) + +#### Input + +| Parameter | Type | Required | Description | +| --------- | ---- | -------- | ----------- | +| `text` | string | Yes | The text to convert to speech | +| `apiKey` | string | Yes | PlayHT API key \(AUTHORIZATION header\) | +| `userId` | string | Yes | PlayHT user ID \(X-USER-ID header\) | +| `voice` | string | No | Voice ID or manifest URL | +| `quality` | string | No | Quality level \(draft, standard, premium\) | +| `outputFormat` | string | No | Output format \(mp3, wav, ogg, flac, mulaw\) | +| `speed` | number | No | Speed multiplier \(0.5 to 2.0\) | +| `temperature` | number | No | Creativity/randomness \(0.0 to 2.0\) | +| `voiceGuidance` | number | No | Voice stability \(1.0 to 6.0\) | +| `textGuidance` | number | No | Text adherence \(1.0 to 6.0\) | +| `sampleRate` | number | No | Sample rate \(8000, 16000, 22050, 24000, 44100, 48000\) | + +#### Output + +| Parameter | Type | Description | +| --------- | ---- | ----------- | +| `audioUrl` | string | URL to the generated audio file | +| `audioFile` | file | Generated audio file object | +| `duration` | number | Audio duration in seconds | +| `characterCount` | number | Number of characters processed | +| `format` | string | Audio format | +| `provider` | string | TTS provider used | + + + +## Notes + +- Category: `tools` +- Type: `tts` diff --git a/apps/docs/content/docs/en/tools/video_generator.mdx b/apps/docs/content/docs/en/tools/video_generator.mdx new file mode 100644 index 0000000000..a56b59108a --- /dev/null +++ b/apps/docs/content/docs/en/tools/video_generator.mdx @@ -0,0 +1,197 @@ +--- +title: Video Generator +description: Generate videos from text using AI +--- + +import { BlockInfoCard } from 
"@/components/ui/block-info-card" + + + +{/* MANUAL-CONTENT-START:intro */} +Create videos from text prompts using cutting-edge AI models from top providers. Sim's Video Generator brings powerful, creative video synthesis capabilities to your workflow—supporting diverse models, aspect ratios, resolutions, camera controls, native audio, and advanced style and consistency features. + +**Supported Providers & Models:** + +- **[Runway Gen-4](https://research.runwayml.com/gen2/)** (Runway ML): + Runway is a pioneer in text-to-video generation, known for powerful models like Gen-2, Gen-3, and Gen-4. The latest [Gen-4](https://research.runwayml.com/gen2/) model (and Gen-4 Turbo for faster results) supports more realistic motion, greater world consistency, and visual references for character, object, style, and location. Supports 16:9, 9:16, and 1:1 aspect ratios, 5–10 second durations, up to 4K resolution, style presets, and direct upload of reference images for consistent generations. Runway powers creative tools for filmmakers, studios, and content creators worldwide. + +- **[Google Veo](https://deepmind.google/technologies/veo/)** (Google DeepMind): + [Veo](https://deepmind.google/technologies/veo/) is Google’s next-generation video generation model, offering high-quality, native-audio videos up to 1080p and 16 seconds. Supports advanced motion, cinematic effects, and nuanced text understanding. Veo can generate videos with built-in sound—activating native audio as well as silent clips. Options include 16:9 aspect, variable duration, different models (veo-3, veo-3.1), and prompt-based controls. Ideal for storytelling, advertising, research, and ideation. + +- **[Luma Dream Machine](https://lumalabs.ai/dream-machine)** (Luma AI): + [Dream Machine](https://lumalabs.ai/dream-machine) delivers jaw-droppingly realistic and fluid video from text. It incorporates advanced camera control, cinematography prompts, and supports both ray-1 and ray-2 models. Dream Machine supports precise aspect ratios (16:9, 9:16, 1:1), variable durations, and the specification of camera paths for intricate visual direction. Luma is renowned for breakthrough visual fidelity and is backed by leading AI vision researchers. + +- **[MiniMax Hailuo-02](https://minimax.chat/)** (via [Fal.ai](https://fal.ai/)): + [MiniMax Hailuo-02](https://minimax.chat/) is a sophisticated Chinese generative video model, available globally through [Fal.ai](https://fal.ai/). Generate videos up to 16 seconds in landscape or portrait format, with options for prompt optimization to improve clarity and creativity. Pro and standard endpoints available, supporting high resolutions (up to 1920×1080). Well-suited for creative projects needing prompt translation and optimization, commercial storytelling, and rapid prototyping of visual ideas. + +**How to Choose:** +Pick your provider and model based on your needs for quality, speed, duration, audio, cost, and unique features. Runway and Veo offer world-leading realism and cinematic capabilities; Luma excels in fluid motion and camera control; MiniMax is ideal for Chinese-language prompts and offers fast, affordable access. Consider reference support, style presets, audio requirements, and pricing when selecting your tool. + +For more details on features, restrictions, pricing, and model advances, see each provider’s official documentation above. +{/* MANUAL-CONTENT-END */} + + +## Usage Instructions + +Generate high-quality videos from text prompts using leading AI providers. 
Supports multiple models, aspect ratios, resolutions, and provider-specific features like world consistency, camera controls, and audio generation. + + + +## Tools + +### `video_runway` + +Generate videos using Runway Gen-4 with world consistency and visual references + +#### Input + +| Parameter | Type | Required | Description | +| --------- | ---- | -------- | ----------- | +| `provider` | string | Yes | Video provider \(runway\) | +| `apiKey` | string | Yes | Runway API key | +| `model` | string | No | Runway model: gen-4 \(default, higher quality\) or gen-4-turbo \(faster\) | +| `prompt` | string | Yes | Text prompt describing the video to generate | +| `duration` | number | No | Video duration in seconds \(5 or 10, default: 5\) | +| `aspectRatio` | string | No | Aspect ratio: 16:9 \(landscape\), 9:16 \(portrait\), or 1:1 \(square\) | +| `resolution` | string | No | Video resolution \(720p output\). Note: Gen-4 Turbo outputs at 720p natively | +| `visualReference` | json | Yes | Reference image REQUIRED for Gen-4 \(UserFile object\). Gen-4 only supports image-to-video, not text-only generation | + +#### Output + +| Parameter | Type | Description | +| --------- | ---- | ----------- | +| `videoUrl` | string | Generated video URL | +| `videoFile` | json | Video file object with metadata | +| `duration` | number | Video duration in seconds | +| `width` | number | Video width in pixels | +| `height` | number | Video height in pixels | +| `provider` | string | Provider used \(runway\) | +| `model` | string | Model used | +| `jobId` | string | Runway job ID | + +### `video_veo` + +Generate videos using Google Veo 3/3.1 with native audio generation + +#### Input + +| Parameter | Type | Required | Description | +| --------- | ---- | -------- | ----------- | +| `provider` | string | Yes | Video provider \(veo\) | +| `apiKey` | string | Yes | Google Gemini API key | +| `model` | string | No | Veo model: veo-3 \(default, highest quality\), veo-3-fast \(faster\), or veo-3.1 \(latest\) | +| `prompt` | string | Yes | Text prompt describing the video to generate | +| `duration` | number | No | Video duration in seconds \(4, 6, or 8, default: 8\) | +| `aspectRatio` | string | No | Aspect ratio: 16:9 \(landscape\) or 9:16 \(portrait\) | +| `resolution` | string | No | Video resolution: 720p or 1080p \(default: 1080p\) | + +#### Output + +| Parameter | Type | Description | +| --------- | ---- | ----------- | +| `videoUrl` | string | Generated video URL | +| `videoFile` | json | Video file object with metadata | +| `duration` | number | Video duration in seconds | +| `width` | number | Video width in pixels | +| `height` | number | Video height in pixels | +| `provider` | string | Provider used \(veo\) | +| `model` | string | Model used | +| `jobId` | string | Veo job ID | + +### `video_luma` + +Generate videos using Luma Dream Machine with advanced camera controls + +#### Input + +| Parameter | Type | Required | Description | +| --------- | ---- | -------- | ----------- | +| `provider` | string | Yes | Video provider \(luma\) | +| `apiKey` | string | Yes | Luma AI API key | +| `model` | string | No | Luma model: ray-2 \(default\) | +| `prompt` | string | Yes | Text prompt describing the video to generate | +| `duration` | number | No | Video duration in seconds \(5 or 9, default: 5\) | +| `aspectRatio` | string | No | Aspect ratio: 16:9 \(landscape\), 9:16 \(portrait\), or 1:1 \(square\) | +| `resolution` | string | No | Video resolution: 540p, 720p, or 1080p \(default: 1080p\) | +| `cameraControl` | 
json | No | Camera controls as array of concept objects. Format: \[\{ "key": "concept_name" \}\]. Valid keys: truck_left, truck_right, pan_left, pan_right, tilt_up, tilt_down, zoom_in, zoom_out, push_in, pull_out, orbit_left, orbit_right, crane_up, crane_down, static, handheld, and 20+ more predefined options | + +#### Output + +| Parameter | Type | Description | +| --------- | ---- | ----------- | +| `videoUrl` | string | Generated video URL | +| `videoFile` | json | Video file object with metadata | +| `duration` | number | Video duration in seconds | +| `width` | number | Video width in pixels | +| `height` | number | Video height in pixels | +| `provider` | string | Provider used \(luma\) | +| `model` | string | Model used | +| `jobId` | string | Luma job ID | + +### `video_minimax` + +Generate videos using MiniMax Hailuo through MiniMax Platform API with advanced realism and prompt optimization + +#### Input + +| Parameter | Type | Required | Description | +| --------- | ---- | -------- | ----------- | +| `provider` | string | Yes | Video provider \(minimax\) | +| `apiKey` | string | Yes | MiniMax API key from platform.minimax.io | +| `model` | string | No | MiniMax model: hailuo-02 \(default\) | +| `prompt` | string | Yes | Text prompt describing the video to generate | +| `duration` | number | No | Video duration in seconds \(6 or 10, default: 6\) | +| `promptOptimizer` | boolean | No | Enable prompt optimization for better results \(default: true\) | + +#### Output + +| Parameter | Type | Description | +| --------- | ---- | ----------- | +| `videoUrl` | string | Generated video URL | +| `videoFile` | json | Video file object with metadata | +| `duration` | number | Video duration in seconds | +| `width` | number | Video width in pixels | +| `height` | number | Video height in pixels | +| `provider` | string | Provider used \(minimax\) | +| `model` | string | Model used | +| `jobId` | string | MiniMax job ID | + +### `video_falai` + +Generate videos using Fal.ai platform with access to multiple models including Veo 3.1, Sora 2, Kling 2.5, MiniMax Hailuo, and more + +#### Input + +| Parameter | Type | Required | Description | +| --------- | ---- | -------- | ----------- | +| `provider` | string | Yes | Video provider \(falai\) | +| `apiKey` | string | Yes | Fal.ai API key | +| `model` | string | Yes | Fal.ai model: veo-3.1 \(Google Veo 3.1\), sora-2 \(OpenAI Sora 2\), kling-2.5-turbo-pro \(Kling 2.5 Turbo Pro\), kling-2.1-pro \(Kling 2.1 Master\), minimax-hailuo-2.3-pro \(MiniMax Hailuo Pro\), minimax-hailuo-2.3-standard \(MiniMax Hailuo Standard\), wan-2.1 \(WAN T2V\), ltxv-0.9.8 \(LTXV 13B\) | +| `prompt` | string | Yes | Text prompt describing the video to generate | +| `duration` | number | No | Video duration in seconds \(varies by model\) | +| `aspectRatio` | string | No | Aspect ratio \(varies by model\): 16:9, 9:16, 1:1 | +| `resolution` | string | No | Video resolution \(varies by model\): 540p, 720p, 1080p | +| `promptOptimizer` | boolean | No | Enable prompt optimization for MiniMax models \(default: true\) | + +#### Output + +| Parameter | Type | Description | +| --------- | ---- | ----------- | +| `videoUrl` | string | Generated video URL | +| `videoFile` | json | Video file object with metadata | +| `duration` | number | Video duration in seconds | +| `width` | number | Video width in pixels | +| `height` | number | Video height in pixels | +| `provider` | string | Provider used \(falai\) | +| `model` | string | Model used | +| `jobId` | string | Job ID | + + + +## 
Notes + +- Category: `tools` +- Type: `video_generator` diff --git a/apps/sim/app/api/proxy/stt/route.ts b/apps/sim/app/api/proxy/stt/route.ts index 7e30e75647..3c3813516c 100644 --- a/apps/sim/app/api/proxy/stt/route.ts +++ b/apps/sim/app/api/proxy/stt/route.ts @@ -12,7 +12,7 @@ export const dynamic = 'force-dynamic' export const maxDuration = 300 // 5 minutes for large files interface SttRequestBody { - provider: 'whisper' | 'deepgram' | 'elevenlabs' + provider: 'whisper' | 'deepgram' | 'elevenlabs' | 'assemblyai' | 'gemini' apiKey: string model?: string audioFile?: UserFile | UserFile[] @@ -22,6 +22,14 @@ interface SttRequestBody { timestamps?: 'none' | 'sentence' | 'word' diarization?: boolean translateToEnglish?: boolean + // Whisper-specific options + prompt?: string + temperature?: number + // AssemblyAI-specific options + sentiment?: boolean + entityDetection?: boolean + piiRedaction?: boolean + summarization?: boolean workspaceId?: string workflowId?: string executionId?: string @@ -38,7 +46,19 @@ export async function POST(request: NextRequest) { } const body: SttRequestBody = await request.json() - const { provider, apiKey, model, language, timestamps, diarization, translateToEnglish } = body + const { + provider, + apiKey, + model, + language, + timestamps, + diarization, + translateToEnglish, + sentiment, + entityDetection, + piiRedaction, + summarization, + } = body if (!provider || !apiKey) { return NextResponse.json( @@ -115,6 +135,9 @@ export async function POST(request: NextRequest) { let detectedLanguage: string | undefined let duration: number | undefined let confidence: number | undefined + let sentimentResults: any[] | undefined + let entities: any[] | undefined + let summary: string | undefined try { if (provider === 'whisper') { @@ -124,7 +147,9 @@ export async function POST(request: NextRequest) { language, timestamps, translateToEnglish, - model + model, + body.prompt, + body.temperature ) transcript = result.transcript segments = result.segments @@ -156,6 +181,41 @@ export async function POST(request: NextRequest) { segments = result.segments detectedLanguage = result.language duration = result.duration + } else if (provider === 'assemblyai') { + const result = await transcribeWithAssemblyAI( + audioBuffer, + apiKey, + language, + timestamps, + diarization, + sentiment, + entityDetection, + piiRedaction, + summarization, + model + ) + transcript = result.transcript + segments = result.segments + detectedLanguage = result.language + duration = result.duration + confidence = result.confidence + sentimentResults = result.sentiment + entities = result.entities + summary = result.summary + } else if (provider === 'gemini') { + const result = await transcribeWithGemini( + audioBuffer, + apiKey, + audioMimeType, + language, + timestamps, + model + ) + transcript = result.transcript + segments = result.segments + detectedLanguage = result.language + duration = result.duration + confidence = result.confidence } else { return NextResponse.json({ error: `Unknown provider: ${provider}` }, { status: 400 }) } @@ -173,6 +233,9 @@ export async function POST(request: NextRequest) { language: detectedLanguage, duration, confidence, + sentiment: sentimentResults, + entities, + summary, }) } catch (error) { logger.error(`[${requestId}] STT proxy error:`, error) @@ -187,7 +250,9 @@ async function transcribeWithWhisper( language?: string, timestamps?: 'none' | 'sentence' | 'word', translate?: boolean, - model?: string + model?: string, + prompt?: string, + temperature?: number ): 
Promise<{ transcript: string segments?: TranscriptSegment[] @@ -204,12 +269,20 @@ async function transcribeWithWhisper( formData.append('language', language) } + if (prompt) { + formData.append('prompt', prompt) + } + + if (temperature !== undefined) { + formData.append('temperature', temperature.toString()) + } + if (timestamps === 'word') { formData.append('response_format', 'verbose_json') - formData.append('timestamp_granularities[]', 'word') + formData.append('timestamp_granularities', 'word') } else if (timestamps === 'sentence') { formData.append('response_format', 'verbose_json') - formData.append('timestamp_granularities[]', 'segment') + formData.append('timestamp_granularities', 'segment') } const endpoint = translate ? 'translations' : 'transcriptions' @@ -271,9 +344,11 @@ async function transcribeWithDeepgram( if (language && language !== 'auto') { params.append('language', language) + } else if (language === 'auto') { + params.append('detect_language', 'true') } - if (timestamps !== 'none') { + if (timestamps === 'sentence') { params.append('utterances', 'true') } @@ -308,7 +383,7 @@ async function transcribeWithDeepgram( const confidence = result.confidence let segments: TranscriptSegment[] | undefined - if (timestamps !== 'none' && result.words) { + if (result.words && timestamps === 'word') { segments = result.words.map((word: any) => ({ text: word.word, start: word.start, @@ -316,6 +391,14 @@ async function transcribeWithDeepgram( speaker: word.speaker !== undefined ? `Speaker ${word.speaker}` : undefined, confidence: word.confidence, })) + } else if (data.results?.utterances && timestamps === 'sentence') { + segments = data.results.utterances.map((utterance: any) => ({ + text: utterance.transcript, + start: utterance.start, + end: utterance.end, + speaker: utterance.speaker !== undefined ? `Speaker ${utterance.speaker}` : undefined, + confidence: utterance.confidence, + })) } return { @@ -345,7 +428,14 @@ async function transcribeWithElevenLabs( formData.append('model_id', model || 'scribe_v1') if (language && language !== 'auto') { - formData.append('language', language) + formData.append('language_code', language) + } + + if (timestamps && timestamps !== 'none') { + const granularity = timestamps === 'word' ? 'word' : 'word' + formData.append('timestamps_granularity', granularity) + } else { + formData.append('timestamps_granularity', 'word') } const response = await fetch('https://api.elevenlabs.io/v1/speech-to-text', { @@ -367,9 +457,269 @@ async function transcribeWithElevenLabs( const data = await response.json() + const words = data.words || [] + const segments: TranscriptSegment[] = words + .filter((w: any) => w.type === 'word') + .map((w: any) => ({ + text: w.text, + start: w.start, + end: w.end, + speaker: w.speaker_id, + })) + return { transcript: data.text || '', - language: data.language, - duration: data.duration, + segments: segments.length > 0 ? 
segments : undefined, + language: data.language_code, + duration: undefined, // ElevenLabs doesn't return duration in response + } +} + +async function transcribeWithAssemblyAI( + audioBuffer: Buffer, + apiKey: string, + language?: string, + timestamps?: 'none' | 'sentence' | 'word', + diarization?: boolean, + sentiment?: boolean, + entityDetection?: boolean, + piiRedaction?: boolean, + summarization?: boolean, + model?: string +): Promise<{ + transcript: string + segments?: TranscriptSegment[] + language?: string + duration?: number + confidence?: number + sentiment?: any[] + entities?: any[] + summary?: string +}> { + const uploadResponse = await fetch('https://api.assemblyai.com/v2/upload', { + method: 'POST', + headers: { + authorization: apiKey, + 'content-type': 'application/octet-stream', + }, + body: new Uint8Array(audioBuffer), + }) + + if (!uploadResponse.ok) { + const error = await uploadResponse.json() + throw new Error(`AssemblyAI upload error: ${error.error || JSON.stringify(error)}`) + } + + const { upload_url } = await uploadResponse.json() + + const transcriptRequest: any = { + audio_url: upload_url, + } + + if (model === 'best' || model === 'nano') { + transcriptRequest.speech_model = model + } + + if (language && language !== 'auto') { + transcriptRequest.language_code = language + } else if (language === 'auto') { + transcriptRequest.language_detection = true + } + + if (diarization) { + transcriptRequest.speaker_labels = true + } + + if (sentiment) { + transcriptRequest.sentiment_analysis = true + } + + if (entityDetection) { + transcriptRequest.entity_detection = true + } + + if (piiRedaction) { + transcriptRequest.redact_pii = true + transcriptRequest.redact_pii_policies = [ + 'us_social_security_number', + 'email_address', + 'phone_number', + ] + } + + if (summarization) { + transcriptRequest.summarization = true + transcriptRequest.summary_model = 'informative' + transcriptRequest.summary_type = 'bullets' + } + + const transcriptResponse = await fetch('https://api.assemblyai.com/v2/transcript', { + method: 'POST', + headers: { + authorization: apiKey, + 'content-type': 'application/json', + }, + body: JSON.stringify(transcriptRequest), + }) + + if (!transcriptResponse.ok) { + const error = await transcriptResponse.json() + throw new Error(`AssemblyAI transcript error: ${error.error || JSON.stringify(error)}`) + } + + const { id } = await transcriptResponse.json() + + let transcript: any + let attempts = 0 + const maxAttempts = 60 // 5 minutes with 5-second intervals + + while (attempts < maxAttempts) { + const statusResponse = await fetch(`https://api.assemblyai.com/v2/transcript/${id}`, { + headers: { + authorization: apiKey, + }, + }) + + if (!statusResponse.ok) { + const error = await statusResponse.json() + throw new Error(`AssemblyAI status error: ${error.error || JSON.stringify(error)}`) + } + + transcript = await statusResponse.json() + + if (transcript.status === 'completed') { + break + } + if (transcript.status === 'error') { + throw new Error(`AssemblyAI transcription failed: ${transcript.error}`) + } + + await new Promise((resolve) => setTimeout(resolve, 5000)) + attempts++ + } + + if (transcript.status !== 'completed') { + throw new Error('AssemblyAI transcription timed out') + } + + let segments: TranscriptSegment[] | undefined + if (timestamps !== 'none' && transcript.words) { + segments = transcript.words.map((word: any) => ({ + text: word.text, + start: word.start / 1000, + end: word.end / 1000, + speaker: word.speaker ? 
`Speaker ${word.speaker}` : undefined, + confidence: word.confidence, + })) + } + + const result: any = { + transcript: transcript.text, + segments, + language: transcript.language_code, + duration: transcript.audio_duration, + confidence: transcript.confidence, + } + + if (sentiment && transcript.sentiment_analysis_results) { + result.sentiment = transcript.sentiment_analysis_results + } + + if (entityDetection && transcript.entities) { + result.entities = transcript.entities + } + + if (summarization && transcript.summary) { + result.summary = transcript.summary + } + + return result +} + +async function transcribeWithGemini( + audioBuffer: Buffer, + apiKey: string, + mimeType: string, + language?: string, + timestamps?: 'none' | 'sentence' | 'word', + model?: string +): Promise<{ + transcript: string + segments?: TranscriptSegment[] + language?: string + duration?: number + confidence?: number +}> { + const modelName = model || 'gemini-2.5-flash' + + const estimatedSize = audioBuffer.length * 1.34 + if (estimatedSize > 20 * 1024 * 1024) { + throw new Error('Audio file exceeds 20MB limit for inline data') + } + + const base64Audio = audioBuffer.toString('base64') + + const languagePrompt = language && language !== 'auto' ? ` The audio is in ${language}.` : '' + + const timestampPrompt = + timestamps === 'sentence' || timestamps === 'word' + ? ' Include timestamps in MM:SS format for each sentence.' + : '' + + const requestBody = { + contents: [ + { + parts: [ + { + inline_data: { + mime_type: mimeType, + data: base64Audio, + }, + }, + { + text: `Please transcribe this audio file.${languagePrompt}${timestampPrompt} Provide the full transcript.`, + }, + ], + }, + ], + } + + const response = await fetch( + `https://generativelanguage.googleapis.com/v1beta/models/${modelName}:generateContent?key=${apiKey}`, + { + method: 'POST', + headers: { + 'Content-Type': 'application/json', + }, + body: JSON.stringify(requestBody), + } + ) + + if (!response.ok) { + const error = await response.json() + if (response.status === 404) { + throw new Error( + `Model not found: ${modelName}. Use gemini-3-pro-preview, gemini-2.5-pro, gemini-2.5-flash, gemini-2.5-flash-lite, or gemini-2.0-flash-exp` + ) + } + const errorMessage = error.error?.message || JSON.stringify(error) + throw new Error(`Gemini API error: ${errorMessage}`) + } + + const data = await response.json() + + if (!data.candidates?.[0]?.content?.parts?.[0]?.text) { + const candidate = data.candidates?.[0] + if (candidate?.finishReason === 'SAFETY') { + throw new Error('Content was blocked by safety filters') + } + throw new Error('Invalid response structure from Gemini API') + } + + const transcript = data.candidates[0].content.parts[0].text + + return { + transcript, + language: language !== 'auto' ? 
language : undefined, } } diff --git a/apps/sim/app/api/proxy/tts/unified/route.ts b/apps/sim/app/api/proxy/tts/unified/route.ts new file mode 100644 index 0000000000..dadfe50f50 --- /dev/null +++ b/apps/sim/app/api/proxy/tts/unified/route.ts @@ -0,0 +1,808 @@ +import type { NextRequest } from 'next/server' +import { NextResponse } from 'next/server' +import { checkHybridAuth } from '@/lib/auth/hybrid' +import { createLogger } from '@/lib/logs/console/logger' +import { StorageService } from '@/lib/uploads' +import { getBaseUrl } from '@/lib/urls/utils' +import type { + AzureTtsParams, + CartesiaTtsParams, + DeepgramTtsParams, + ElevenLabsTtsUnifiedParams, + GoogleTtsParams, + OpenAiTtsParams, + PlayHtTtsParams, + TtsProvider, + TtsResponse, +} from '@/tools/tts/types' +import { getFileExtension, getMimeType } from '@/tools/tts/types' + +const logger = createLogger('TtsUnifiedProxyAPI') + +export const dynamic = 'force-dynamic' +export const maxDuration = 60 // 1 minute + +interface TtsUnifiedRequestBody { + provider: TtsProvider + text: string + apiKey: string + + // OpenAI specific + model?: 'tts-1' | 'tts-1-hd' | 'gpt-4o-mini-tts' + voice?: string + responseFormat?: 'mp3' | 'opus' | 'aac' | 'flac' | 'wav' | 'pcm' + speed?: number + + // Deepgram specific + encoding?: 'linear16' | 'mp3' | 'opus' | 'aac' | 'flac' | 'mulaw' | 'alaw' + sampleRate?: number + bitRate?: number + container?: 'none' | 'wav' | 'ogg' + + // ElevenLabs specific + voiceId?: string + modelId?: string + stability?: number + similarityBoost?: number + style?: number | string + useSpeakerBoost?: boolean + + // Cartesia specific + language?: string + outputFormat?: object + emotion?: string[] + + // Google Cloud specific + languageCode?: string + gender?: 'MALE' | 'FEMALE' | 'NEUTRAL' + audioEncoding?: 'LINEAR16' | 'MP3' | 'OGG_OPUS' | 'MULAW' | 'ALAW' + speakingRate?: number + pitch?: number + volumeGainDb?: number + sampleRateHertz?: number + effectsProfileId?: string[] + + // Azure specific + region?: string + rate?: string + styleDegree?: number + role?: string + + // PlayHT specific + userId?: string + quality?: 'draft' | 'standard' | 'premium' + temperature?: number + voiceGuidance?: number + textGuidance?: number + + // Execution context + workspaceId?: string + workflowId?: string + executionId?: string +} + +export async function POST(request: NextRequest) { + const requestId = crypto.randomUUID() + logger.info(`[${requestId}] TTS unified request started`) + + try { + const authResult = await checkHybridAuth(request, { requireWorkflowId: false }) + if (!authResult.success) { + logger.error('Authentication failed for TTS unified proxy:', authResult.error) + return NextResponse.json({ error: 'Unauthorized' }, { status: 401 }) + } + + const body: TtsUnifiedRequestBody = await request.json() + const { provider, text, apiKey, workspaceId, workflowId, executionId } = body + + if (!provider || !text || !apiKey) { + return NextResponse.json( + { error: 'Missing required fields: provider, text, and apiKey' }, + { status: 400 } + ) + } + + const hasExecutionContext = workspaceId && workflowId && executionId + logger.info(`[${requestId}] Processing TTS with ${provider}`, { + hasExecutionContext, + textLength: text.length, + }) + + let audioBuffer: Buffer + let format: string + let mimeType: string + let duration: number | undefined + + try { + if (provider === 'openai') { + const result = await synthesizeWithOpenAi({ + text, + apiKey, + model: body.model, + voice: body.voice as OpenAiTtsParams['voice'], + responseFormat: 
body.responseFormat, + speed: body.speed, + }) + audioBuffer = result.audioBuffer + format = result.format + mimeType = result.mimeType + } else if (provider === 'deepgram') { + const result = await synthesizeWithDeepgram({ + text, + apiKey, + model: body.voice, + encoding: body.encoding, + sampleRate: body.sampleRate, + bitRate: body.bitRate, + container: body.container, + }) + audioBuffer = result.audioBuffer + format = result.format + mimeType = result.mimeType + duration = result.duration + } else if (provider === 'elevenlabs') { + if (!body.voiceId) { + return NextResponse.json( + { error: 'voiceId is required for ElevenLabs provider' }, + { status: 400 } + ) + } + const result = await synthesizeWithElevenLabs({ + text, + apiKey, + voiceId: body.voiceId, + modelId: body.modelId, + stability: body.stability, + similarityBoost: body.similarityBoost, + style: body.style as number | undefined, + useSpeakerBoost: body.useSpeakerBoost, + }) + audioBuffer = result.audioBuffer + format = result.format + mimeType = result.mimeType + } else if (provider === 'cartesia') { + const result = await synthesizeWithCartesia({ + text, + apiKey, + modelId: body.modelId, + voice: body.voice, + language: body.language, + outputFormat: body.outputFormat, + speed: body.speed, + emotion: body.emotion, + }) + audioBuffer = result.audioBuffer + format = result.format + mimeType = result.mimeType + } else if (provider === 'google') { + const result = await synthesizeWithGoogle({ + text, + apiKey, + voiceId: body.voiceId, + languageCode: body.languageCode, + gender: body.gender, + audioEncoding: body.audioEncoding, + speakingRate: body.speakingRate, + pitch: body.pitch, + volumeGainDb: body.volumeGainDb, + sampleRateHertz: body.sampleRateHertz, + effectsProfileId: body.effectsProfileId, + }) + audioBuffer = result.audioBuffer + format = result.format + mimeType = result.mimeType + } else if (provider === 'azure') { + const result = await synthesizeWithAzure({ + text, + apiKey, + voiceId: body.voiceId, + region: body.region, + outputFormat: body.outputFormat as AzureTtsParams['outputFormat'], + rate: body.rate, + pitch: body.pitch as string | undefined, + style: body.style as string | undefined, + styleDegree: body.styleDegree, + role: body.role, + }) + audioBuffer = result.audioBuffer + format = result.format + mimeType = result.mimeType + } else if (provider === 'playht') { + if (!body.userId) { + return NextResponse.json( + { error: 'userId is required for PlayHT provider' }, + { status: 400 } + ) + } + const result = await synthesizeWithPlayHT({ + text, + apiKey, + userId: body.userId, + voice: body.voice, + quality: body.quality, + outputFormat: typeof body.outputFormat === 'string' ? body.outputFormat : undefined, + speed: body.speed, + temperature: body.temperature, + voiceGuidance: body.voiceGuidance, + textGuidance: body.textGuidance, + sampleRate: body.sampleRate, + }) + audioBuffer = result.audioBuffer + format = result.format + mimeType = result.mimeType + } else { + return NextResponse.json({ error: `Unknown provider: ${provider}` }, { status: 400 }) + } + } catch (error) { + logger.error(`[${requestId}] TTS synthesis failed:`, error) + const errorMessage = error instanceof Error ? 
error.message : 'TTS synthesis failed' + return NextResponse.json({ error: errorMessage }, { status: 500 }) + } + + const timestamp = Date.now() + const fileExtension = getFileExtension(format) + const fileName = `tts-${provider}-${timestamp}.${fileExtension}` + + if (hasExecutionContext) { + const { uploadExecutionFile } = await import('@/lib/uploads/contexts/execution') + + const userFile = await uploadExecutionFile( + { workspaceId, workflowId, executionId }, + audioBuffer, + fileName, + mimeType, + authResult.userId + ) + + logger.info(`[${requestId}] TTS audio stored in execution context:`, { + executionId, + fileName, + size: userFile.size, + }) + + const response: TtsResponse = { + audioUrl: userFile.url, + audioFile: userFile, + characterCount: text.length, + format, + provider, + } + + if (duration) { + response.duration = duration + } + + return NextResponse.json(response) + } + + // Chat UI / copilot usage - no execution context + const fileInfo = await StorageService.uploadFile({ + file: audioBuffer, + fileName, + contentType: mimeType, + context: 'copilot', + }) + + const audioUrl = `${getBaseUrl()}${fileInfo.path}` + + logger.info(`[${requestId}] TTS audio stored in copilot context:`, { + fileName, + size: fileInfo.size, + }) + + const response: TtsResponse = { + audioUrl, + characterCount: text.length, + format, + provider, + } + + if (duration) { + response.duration = duration + } + + return NextResponse.json(response) + } catch (error) { + logger.error(`[${requestId}] TTS unified proxy error:`, error) + const errorMessage = error instanceof Error ? error.message : 'Unknown error' + return NextResponse.json({ error: errorMessage }, { status: 500 }) + } +} + +async function synthesizeWithOpenAi( + params: OpenAiTtsParams +): Promise<{ audioBuffer: Buffer; format: string; mimeType: string }> { + const { text, apiKey, model = 'tts-1', responseFormat = 'mp3', speed = 1.0 } = params + const voice = (params.voice || 'alloy') as OpenAiTtsParams['voice'] + + const response = await fetch('https://api.openai.com/v1/audio/speech', { + method: 'POST', + headers: { + Authorization: `Bearer ${apiKey}`, + 'Content-Type': 'application/json', + }, + body: JSON.stringify({ + model, + voice, + input: text, + response_format: responseFormat, + speed: Math.max(0.25, Math.min(4.0, speed)), + }), + }) + + if (!response.ok) { + const error = await response.json().catch(() => ({})) + const errorMessage = error.error?.message || error.message || response.statusText + throw new Error(`OpenAI TTS API error: ${errorMessage}`) + } + + const arrayBuffer = await response.arrayBuffer() + const audioBuffer = Buffer.from(arrayBuffer) + const mimeType = getMimeType(responseFormat) + + return { + audioBuffer, + format: responseFormat, + mimeType, + } +} + +async function synthesizeWithDeepgram( + params: DeepgramTtsParams +): Promise<{ audioBuffer: Buffer; format: string; mimeType: string; duration?: number }> { + const { + text, + apiKey, + model = 'aura-asteria-en', + encoding = 'mp3', + sampleRate, + bitRate, + container, + } = params + + const queryParams = new URLSearchParams({ + model: model, + encoding: encoding, + }) + + if (sampleRate && encoding === 'linear16') { + queryParams.append('sample_rate', sampleRate.toString()) + } + + if (bitRate) { + queryParams.append('bit_rate', bitRate.toString()) + } + + if (container && container !== 'none') { + queryParams.append('container', container) + } + + const response = await fetch(`https://api.deepgram.com/v1/speak?${queryParams.toString()}`, { + method: 
'POST', + headers: { + Authorization: `Token ${apiKey}`, + 'Content-Type': 'application/json', + }, + body: JSON.stringify({ text }), + }) + + if (!response.ok) { + const error = await response.json().catch(() => ({})) + const errorMessage = error.err_msg || error.message || response.statusText + throw new Error(`Deepgram TTS API error: ${errorMessage}`) + } + + const arrayBuffer = await response.arrayBuffer() + const audioBuffer = Buffer.from(arrayBuffer) + + let finalFormat: string = encoding + if (container === 'wav') { + finalFormat = 'wav' + } else if (container === 'ogg') { + finalFormat = 'ogg' + } + + const mimeType = getMimeType(finalFormat) + + return { + audioBuffer, + format: finalFormat, + mimeType, + } +} + +async function synthesizeWithElevenLabs( + params: ElevenLabsTtsUnifiedParams +): Promise<{ audioBuffer: Buffer; format: string; mimeType: string }> { + const { + text, + apiKey, + voiceId, + modelId = 'eleven_turbo_v2_5', + stability = 0.5, + similarityBoost = 0.8, + style, + useSpeakerBoost = true, + } = params + + const voiceSettings: any = { + stability: Math.max(0, Math.min(1, stability)), + similarity_boost: Math.max(0, Math.min(1, similarityBoost)), + use_speaker_boost: useSpeakerBoost, + } + + if (style !== undefined) { + voiceSettings.style = Math.max(0, Math.min(1, style)) + } + + const response = await fetch(`https://api.elevenlabs.io/v1/text-to-speech/${voiceId}`, { + method: 'POST', + headers: { + Accept: 'audio/mpeg', + 'Content-Type': 'application/json', + 'xi-api-key': apiKey, + }, + body: JSON.stringify({ + text, + model_id: modelId, + voice_settings: voiceSettings, + }), + }) + + if (!response.ok) { + const error = await response.json().catch(() => ({})) + const errorMessage = + typeof error.detail === 'string' + ? 
error.detail + : error.detail?.message || error.message || response.statusText + throw new Error(`ElevenLabs TTS API error: ${errorMessage}`) + } + + const arrayBuffer = await response.arrayBuffer() + const audioBuffer = Buffer.from(arrayBuffer) + + return { + audioBuffer, + format: 'mp3', + mimeType: 'audio/mpeg', + } +} + +async function synthesizeWithCartesia( + params: Partial +): Promise<{ audioBuffer: Buffer; format: string; mimeType: string }> { + const { + text, + apiKey, + modelId = 'sonic-3', + voice, + language = 'en', + outputFormat, + speed, + emotion, + } = params + + if (!text || !apiKey) { + throw new Error('text and apiKey are required for Cartesia') + } + + const requestBody: Record = { + model_id: modelId, + transcript: text, + language, + } + + if (voice) { + requestBody.voice = { + mode: 'id', + id: voice, + } + } + + const generationConfig: Record = {} + if (speed !== undefined) generationConfig.speed = speed + if (emotion !== undefined) generationConfig.emotion = emotion + if (Object.keys(generationConfig).length > 0) { + requestBody.generation_config = generationConfig + } + + if (outputFormat && typeof outputFormat === 'object') { + requestBody.output_format = outputFormat + } + + if (!requestBody.output_format) { + requestBody.output_format = { + container: 'wav', + encoding: 'pcm_s16le', + sample_rate: 24000, + } + } + + logger.info('Cartesia API request:', { + model_id: requestBody.model_id, + has_voice: !!requestBody.voice, + language: requestBody.language, + output_format: requestBody.output_format, + has_generation_config: !!requestBody.generation_config, + }) + + const response = await fetch('https://api.cartesia.ai/tts/bytes', { + method: 'POST', + headers: { + Authorization: `Bearer ${apiKey}`, + 'Content-Type': 'application/json', + 'Cartesia-Version': '2025-04-16', + }, + body: JSON.stringify(requestBody), + }) + + if (!response.ok) { + const error = await response.json().catch(() => ({})) + const errorMessage = error.error || error.message || response.statusText + const errorDetail = error.detail || '' + logger.error('Cartesia API error details:', { + status: response.status, + error: errorMessage, + detail: errorDetail, + requestBody: JSON.stringify(requestBody), + }) + throw new Error( + `Cartesia TTS API error: ${errorMessage}${errorDetail ? ` - ${errorDetail}` : ''}` + ) + } + + const arrayBuffer = await response.arrayBuffer() + const audioBuffer = Buffer.from(arrayBuffer) + + const format = + outputFormat && typeof outputFormat === 'object' && 'container' in outputFormat + ? 
(outputFormat.container as string) + : 'mp3' + const mimeType = getMimeType(format) + + return { + audioBuffer, + format, + mimeType, + } +} + +async function synthesizeWithGoogle( + params: Partial +): Promise<{ audioBuffer: Buffer; format: string; mimeType: string }> { + const { + text, + apiKey, + voiceId, + languageCode, + gender, + audioEncoding = 'MP3', + speakingRate = 1.0, + pitch = 0.0, + volumeGainDb, + sampleRateHertz, + effectsProfileId, + } = params + + if (!text || !apiKey || !languageCode) { + throw new Error('text, apiKey, and languageCode are required for Google Cloud TTS') + } + + const clampedSpeakingRate = Math.max(0.25, Math.min(2.0, speakingRate)) + + const audioConfig: Record = { + audioEncoding, + speakingRate: clampedSpeakingRate, + pitch, + } + + if (volumeGainDb !== undefined) { + audioConfig.volumeGainDb = volumeGainDb + } + if (sampleRateHertz) { + audioConfig.sampleRateHertz = sampleRateHertz + } + if (effectsProfileId && effectsProfileId.length > 0) { + audioConfig.effectsProfileId = effectsProfileId + } + + // Build voice config based on what's provided + const voice: Record = { + languageCode, + } + + // If voiceId is provided, use it (it takes precedence over gender) + if (voiceId) { + voice.name = voiceId + } + + // Only include gender if specified (don't default to NEUTRAL as it's not supported) + if (gender) { + voice.ssmlGender = gender + } + + // If neither voiceId nor gender is provided, default to a specific voice + if (!voiceId && !gender) { + voice.name = 'en-US-Neural2-C' + } + + const requestBody: Record = { + input: { text }, + voice, + audioConfig, + } + + const response = await fetch( + `https://texttospeech.googleapis.com/v1/text:synthesize?key=${apiKey}`, + { + method: 'POST', + headers: { + 'Content-Type': 'application/json', + }, + body: JSON.stringify(requestBody), + } + ) + + if (!response.ok) { + const error = await response.json().catch(() => ({})) + const errorMessage = error.error?.message || error.message || response.statusText + throw new Error(`Google Cloud TTS API error: ${errorMessage}`) + } + + const data = await response.json() + const audioContent = data.audioContent + + if (!audioContent) { + throw new Error('No audio content returned from Google Cloud TTS') + } + + const audioBuffer = Buffer.from(audioContent, 'base64') + + const format = audioEncoding.toLowerCase().replace('_', '') + const mimeType = getMimeType(format) + + return { + audioBuffer, + format, + mimeType, + } +} + +async function synthesizeWithAzure( + params: Partial +): Promise<{ audioBuffer: Buffer; format: string; mimeType: string }> { + const { + text, + apiKey, + voiceId = 'en-US-JennyNeural', + region = 'eastus', + outputFormat = 'audio-24khz-96kbitrate-mono-mp3', + rate, + pitch, + style, + styleDegree, + role, + } = params + + if (!text || !apiKey) { + throw new Error('text and apiKey are required for Azure TTS') + } + + let ssml = `` + + if (style) { + ssml += ` +): Promise<{ audioBuffer: Buffer; format: string; mimeType: string }> { + const { + text, + apiKey, + userId, + voice, + quality = 'standard', + outputFormat = 'mp3', + speed = 1.0, + temperature, + voiceGuidance, + textGuidance, + sampleRate, + } = params + + if (!text || !apiKey || !userId) { + throw new Error('text, apiKey, and userId are required for PlayHT') + } + + const requestBody: Record = { + text, + quality, + output_format: outputFormat, + speed, + } + + if (voice) requestBody.voice = voice + if (temperature !== undefined) requestBody.temperature = temperature + if 
(voiceGuidance !== undefined) requestBody.voice_guidance = voiceGuidance
+  if (textGuidance !== undefined) requestBody.text_guidance = textGuidance
+  if (sampleRate) requestBody.sample_rate = sampleRate
+
+  const response = await fetch('https://api.play.ht/api/v2/tts/stream', {
+    method: 'POST',
+    headers: {
+      AUTHORIZATION: apiKey,
+      'X-USER-ID': userId,
+      'Content-Type': 'application/json',
+    },
+    body: JSON.stringify(requestBody),
+  })
+
+  if (!response.ok) {
+    const error = await response.json().catch(() => ({}))
+    const errorMessage = error.error_message || error.message || response.statusText
+    throw new Error(`PlayHT TTS API error: ${errorMessage}`)
+  }
+
+  const arrayBuffer = await response.arrayBuffer()
+  const audioBuffer = Buffer.from(arrayBuffer)
+
+  const format = outputFormat || 'mp3'
+  const mimeType = getMimeType(format)
+
+  return {
+    audioBuffer,
+    format,
+    mimeType,
+  }
+}
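// Illustrative sketch, not part of the diff: a minimal example of how a caller might invoke the
// unified TTS proxy added above. The endpoint path ('/api/proxy/tts') and the exact
// request/response field names are inferred from the route code in this hunk, so treat them as
// assumptions rather than a documented contract.
type TtsProxyRequest = {
  provider: 'openai' | 'deepgram' | 'elevenlabs' | 'cartesia' | 'google' | 'azure' | 'playht'
  apiKey: string
  text: string
  voiceId?: string // required for ElevenLabs; also read by the Google and Azure branches
  userId?: string // required for PlayHT
  // When all three context fields are present, the route stores the audio with the workflow
  // execution; otherwise it falls back to the copilot storage context.
  workspaceId?: string
  workflowId?: string
  executionId?: string
}

async function synthesizeViaProxy(req: TtsProxyRequest) {
  const res = await fetch('/api/proxy/tts', {
    method: 'POST',
    headers: { 'Content-Type': 'application/json' },
    body: JSON.stringify(req),
  })
  if (!res.ok) {
    const { error } = await res.json().catch(() => ({ error: res.statusText }))
    throw new Error(`TTS proxy request failed: ${error}`)
  }
  // Mirrors the TtsResponse shape built by the route: audioUrl, characterCount, format,
  // provider, plus optional duration and audioFile.
  return (await res.json()) as { audioUrl: string; characterCount: number; format: string }
}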
diff --git a/apps/sim/app/api/proxy/video/route.ts b/apps/sim/app/api/proxy/video/route.ts
new file mode 100644
index 0000000000..fe3bf433f1
--- /dev/null
+++ b/apps/sim/app/api/proxy/video/route.ts
@@ -0,0 +1,950 @@
+import { type NextRequest, NextResponse } from 'next/server'
+import { checkHybridAuth } from '@/lib/auth/hybrid'
+import { createLogger } from '@/lib/logs/console/logger'
+import { downloadFileFromStorage } from '@/lib/uploads/utils/file-utils.server'
+import type { UserFile } from '@/executor/types'
+import type { VideoRequestBody } from '@/tools/video/types'
+
+const logger = createLogger('VideoProxyAPI')
+
+export const dynamic = 'force-dynamic'
+export const maxDuration = 600 // 10 minutes for video generation
+
+export async function POST(request: NextRequest) {
+  const requestId = crypto.randomUUID()
+  logger.info(`[${requestId}] Video generation request started`)
+
+  try {
+    const authResult = await checkHybridAuth(request, { requireWorkflowId: false })
+    if (!authResult.success) {
+      return NextResponse.json({ error: 'Unauthorized' }, { status: 401 })
+    }
+
+    const body: VideoRequestBody = await request.json()
+    const { provider, apiKey, model, prompt, duration, aspectRatio, resolution } = body
+
+    if (!provider || !apiKey || !prompt) {
+      return NextResponse.json(
+        { error: 'Missing required fields: provider, apiKey, and prompt' },
+        { status: 400 }
+      )
+    }
+
+    const validProviders = ['runway', 'veo', 'luma', 'minimax', 'falai']
+    if (!validProviders.includes(provider)) {
+      return NextResponse.json(
+        { error: `Invalid provider. Must be one of: ${validProviders.join(', ')}` },
+        { status: 400 }
+      )
+    }
+
+    if (prompt.length < 3 || prompt.length > 2000) {
+      return NextResponse.json(
+        { error: 'Prompt must be between 3 and 2000 characters' },
+        { status: 400 }
+      )
+    }
+
+    // Validate duration (provider-specific constraints)
+    if (provider === 'veo') {
+      if (duration !== undefined && ![4, 6, 8].includes(duration)) {
+        return NextResponse.json(
+          { error: 'Duration must be 4, 6, or 8 seconds for Veo' },
+          { status: 400 }
+        )
+      }
+    } else if (provider === 'minimax') {
+      if (duration !== undefined && ![6, 10].includes(duration)) {
+        return NextResponse.json(
+          { error: 'Duration must be 6 or 10 seconds for MiniMax' },
+          { status: 400 }
+        )
+      }
+    } else if (provider !== 'falai' && duration !== undefined && (duration < 5 || duration > 10)) {
+      // Fal.ai is excluded here because its duration constraints vary per model
+      return NextResponse.json(
+        { error: 'Duration must be between 5 and 10 seconds' },
+        { status: 400 }
+      )
+    }
+
+    // Validate aspect ratio (Veo only supports 16:9 and 9:16)
+    const validAspectRatios = provider === 'veo' ? ['16:9', '9:16'] : ['16:9', '9:16', '1:1']
+    if (aspectRatio && !validAspectRatios.includes(aspectRatio)) {
+      return NextResponse.json(
+        { error: `Aspect ratio must be ${validAspectRatios.join(', ')}` },
+        { status: 400 }
+      )
+    }
+
+    logger.info(`[${requestId}] Generating video with ${provider}, model: ${model || 'default'}`)
+
+    let videoUrl: string
+    let videoBuffer: Buffer
+    let width: number | undefined
+    let height: number | undefined
+    let jobId: string | undefined
+    let actualDuration: number | undefined
+
+    try {
+      if (provider === 'runway') {
+        const result = await generateWithRunway(
+          apiKey,
+          model || 'gen-4',
+          prompt,
+          duration || 5,
+          aspectRatio || '16:9',
+          resolution || '1080p',
+          body.visualReference,
+          requestId,
+          logger
+        )
+        videoBuffer = result.buffer
+        width = result.width
+        height = result.height
+        jobId = result.jobId
+        actualDuration = result.duration
+      } else if (provider === 'veo') {
+        const result = await generateWithVeo(
+          apiKey,
+          model || 'veo-3',
+          prompt,
+          duration || 8, // Default to 8 seconds (valid: 4, 6, or 8)
+          aspectRatio || '16:9',
+          resolution || '1080p',
+          requestId,
+          logger
+        )
+        videoBuffer = result.buffer
+        width = result.width
+        height = result.height
+        jobId = result.jobId
+        actualDuration = result.duration
+      } else if (provider === 'luma') {
+        const result = await generateWithLuma(
+          apiKey,
+          model || 'ray-2',
+          prompt,
+          duration || 5,
+          aspectRatio || '16:9',
+          resolution || '1080p',
+          body.cameraControl,
+          requestId,
+          logger
+        )
+        videoBuffer = result.buffer
+        width = result.width
+        height = result.height
+        jobId = result.jobId
+        actualDuration = result.duration
+      } else if (provider === 'minimax') {
+        const result = await generateWithMiniMax(
+          apiKey,
+          model || 'hailuo-02',
+          prompt,
+          duration || 6,
+          body.promptOptimizer !== false, // Default true
+          requestId,
+          logger
+        )
+        videoBuffer = result.buffer
+        width = result.width
+        height = result.height
+        jobId = result.jobId
+        actualDuration = result.duration
+      } else if (provider === 'falai') {
+        if (!model) {
+          return NextResponse.json(
+            { error: 'Model is required for Fal.ai provider' },
+            { status: 400 }
+          )
+        }
+        const result = await generateWithFalAI(
+          apiKey,
+          model,
+          prompt,
+          duration,
+          aspectRatio,
+          resolution,
+          body.promptOptimizer,
+          requestId,
+          logger
+        )
+        videoBuffer = result.buffer
+        width = result.width
+        height = result.height
+        jobId = result.jobId
+
actualDuration = result.duration + } else { + return NextResponse.json({ error: `Unknown provider: ${provider}` }, { status: 400 }) + } + } catch (error) { + logger.error(`[${requestId}] Video generation failed:`, error) + const errorMessage = error instanceof Error ? error.message : 'Video generation failed' + return NextResponse.json({ error: errorMessage }, { status: 500 }) + } + + const hasExecutionContext = body.workspaceId && body.workflowId && body.executionId + + logger.info(`[${requestId}] Storing video file, size: ${videoBuffer.length} bytes`) + + if (hasExecutionContext) { + const { uploadExecutionFile } = await import('@/lib/uploads/contexts/execution') + const timestamp = Date.now() + const fileName = `video-${provider}-${timestamp}.mp4` + + let videoFile + try { + videoFile = await uploadExecutionFile( + { + workspaceId: body.workspaceId!, + workflowId: body.workflowId!, + executionId: body.executionId!, + }, + videoBuffer, + fileName, + 'video/mp4', + authResult.userId + ) + + logger.info(`[${requestId}] Video stored successfully:`, { + fileName, + size: videoFile.size, + executionId: body.executionId, + }) + } catch (error) { + logger.error(`[${requestId}] Failed to upload video file:`, error) + throw new Error( + `Failed to store video: ${error instanceof Error ? error.message : 'Unknown error'}` + ) + } + + return NextResponse.json({ + videoUrl: videoFile.url, + videoFile, + duration: actualDuration || duration, + width, + height, + provider, + model: model || 'default', + jobId, + }) + } + + const { StorageService } = await import('@/lib/uploads') + const { getBaseUrl } = await import('@/lib/urls/utils') + const timestamp = Date.now() + const fileName = `video-${provider}-${timestamp}.mp4` + + try { + const fileInfo = await StorageService.uploadFile({ + file: videoBuffer, + fileName, + contentType: 'video/mp4', + context: 'copilot', + }) + + videoUrl = `${getBaseUrl()}${fileInfo.path}` + } catch (error) { + logger.error(`[${requestId}] Failed to upload video file (fallback):`, error) + throw new Error( + `Failed to store video: ${error instanceof Error ? error.message : 'Unknown error'}` + ) + } + + logger.info(`[${requestId}] Video generation completed successfully`) + + return NextResponse.json({ + videoUrl, + duration: actualDuration || duration, + width, + height, + provider, + model: model || 'default', + jobId, + }) + } catch (error) { + logger.error(`[${requestId}] Video proxy error:`, error) + const errorMessage = error instanceof Error ? 
error.message : 'Unknown error' + return NextResponse.json({ error: errorMessage }, { status: 500 }) + } +} + +async function generateWithRunway( + apiKey: string, + model: string, + prompt: string, + duration: number, + aspectRatio: string, + resolution: string, + visualReference: UserFile | undefined, + requestId: string, + logger: ReturnType +): Promise<{ buffer: Buffer; width: number; height: number; jobId: string; duration: number }> { + logger.info(`[${requestId}] Starting Runway Gen-4 generation`) + + const dimensions = getVideoDimensions(aspectRatio, resolution) + + // Convert aspect ratio to resolution format for 2024-11-06 API version + const ratioMap: { [key: string]: string } = { + '16:9': '1280:720', // Landscape (720p) + '9:16': '720:1280', // Portrait (720p) + '1:1': '960:960', // Square + } + const runwayRatio = ratioMap[aspectRatio] || '1280:720' + + const createPayload: any = { + promptText: prompt, + duration, + ratio: runwayRatio, // Use resolution-based ratio for 2024-11-06 API + model: 'gen4_turbo', // Only gen4_turbo supports image-to-video // Use underscore + } + + if (visualReference) { + const refBuffer = await downloadFileFromStorage(visualReference, requestId, logger) + const refBase64 = refBuffer.toString('base64') + createPayload.promptImage = `data:${visualReference.type};base64,${refBase64}` // Use promptImage + } + + const createResponse = await fetch('https://api.dev.runwayml.com/v1/image_to_video', { + method: 'POST', + headers: { + Authorization: `Bearer ${apiKey}`, + 'Content-Type': 'application/json', + 'X-Runway-Version': '2024-11-06', + }, + body: JSON.stringify(createPayload), + }) + + if (!createResponse.ok) { + const error = await createResponse.text() + throw new Error(`Runway API error: ${createResponse.status} - ${error}`) + } + + const createData = await createResponse.json() + const taskId = createData.id + + logger.info(`[${requestId}] Runway task created: ${taskId}`) + + const maxAttempts = 120 // 10 minutes with 5-second intervals + let attempts = 0 + + while (attempts < maxAttempts) { + await sleep(5000) // Poll every 5 seconds + + const statusResponse = await fetch(`https://api.dev.runwayml.com/v1/tasks/${taskId}`, { + headers: { + Authorization: `Bearer ${apiKey}`, + 'X-Runway-Version': '2024-11-06', + }, + }) + + if (!statusResponse.ok) { + throw new Error(`Runway status check failed: ${statusResponse.status}`) + } + + const statusData = await statusResponse.json() + + if (statusData.status === 'SUCCEEDED') { + logger.info(`[${requestId}] Runway generation completed after ${attempts * 5}s`) + + const videoResponse = await fetch(statusData.output[0]) + if (!videoResponse.ok) { + throw new Error(`Failed to download video: ${videoResponse.status}`) + } + + const arrayBuffer = await videoResponse.arrayBuffer() + return { + buffer: Buffer.from(arrayBuffer), + width: dimensions.width, + height: dimensions.height, + jobId: taskId, + duration, + } + } + + if (statusData.status === 'FAILED') { + throw new Error(`Runway generation failed: ${statusData.failure || 'Unknown error'}`) + } + + attempts++ + } + + throw new Error('Runway generation timed out after 10 minutes') +} + +async function generateWithVeo( + apiKey: string, + model: string, + prompt: string, + duration: number, + aspectRatio: string, + resolution: string, + requestId: string, + logger: ReturnType +): Promise<{ buffer: Buffer; width: number; height: number; jobId: string; duration: number }> { + logger.info(`[${requestId}] Starting Google Veo generation`) + + const dimensions = 
getVideoDimensions(aspectRatio, resolution) + + const modelNameMap: Record = { + 'veo-3': 'veo-3.0-generate-001', + 'veo-3-fast': 'veo-3.0-fast-generate-001', // Fixed: was incorrectly mapped to 3.1 + 'veo-3.1': 'veo-3.1-generate-preview', + } + const modelName = modelNameMap[model] || 'veo-3.1-generate-preview' + + const createPayload = { + instances: [ + { + prompt, + }, + ], + parameters: { + aspectRatio: aspectRatio, // Keep as "16:9", don't convert + resolution: resolution, + durationSeconds: duration, // Keep as number + }, + } + + const createResponse = await fetch( + `https://generativelanguage.googleapis.com/v1beta/models/${modelName}:predictLongRunning`, + { + method: 'POST', + headers: { + 'Content-Type': 'application/json', + 'x-goog-api-key': apiKey, + }, + body: JSON.stringify(createPayload), + } + ) + + if (!createResponse.ok) { + const error = await createResponse.text() + throw new Error(`Veo API error: ${createResponse.status} - ${error}`) + } + + const createData = await createResponse.json() + const operationName = createData.name + + logger.info(`[${requestId}] Veo operation created: ${operationName}`) + + const maxAttempts = 60 // 5 minutes with 5-second intervals + let attempts = 0 + + while (attempts < maxAttempts) { + await sleep(5000) + + const statusResponse = await fetch( + `https://generativelanguage.googleapis.com/v1beta/${operationName}`, + { + headers: { + 'x-goog-api-key': apiKey, + }, + } + ) + + if (!statusResponse.ok) { + throw new Error(`Veo status check failed: ${statusResponse.status}`) + } + + const statusData = await statusResponse.json() + + if (statusData.done) { + if (statusData.error) { + throw new Error(`Veo generation failed: ${statusData.error.message}`) + } + + logger.info(`[${requestId}] Veo generation completed after ${attempts * 5}s`) + + const videoUri = statusData.response?.generateVideoResponse?.generatedSamples?.[0]?.video?.uri + if (!videoUri) { + throw new Error('No video URI in response') + } + + const videoResponse = await fetch(videoUri, { + headers: { + 'x-goog-api-key': apiKey, + }, + }) + + if (!videoResponse.ok) { + throw new Error(`Failed to download video: ${videoResponse.status}`) + } + + const arrayBuffer = await videoResponse.arrayBuffer() + return { + buffer: Buffer.from(arrayBuffer), + width: dimensions.width, + height: dimensions.height, + jobId: operationName, + duration, + } + } + + attempts++ + } + + throw new Error('Veo generation timed out after 5 minutes') +} + +async function generateWithLuma( + apiKey: string, + model: string, + prompt: string, + duration: number, + aspectRatio: string, + resolution: string, + cameraControl: any | undefined, + requestId: string, + logger: ReturnType +): Promise<{ buffer: Buffer; width: number; height: number; jobId: string; duration: number }> { + logger.info(`[${requestId}] Starting Luma Dream Machine generation`) + + const dimensions = getVideoDimensions(aspectRatio, resolution) + + const createPayload: any = { + prompt, + model: model || 'ray-2', + aspect_ratio: aspectRatio, + loop: false, + } + + if (duration) { + createPayload.duration = `${duration}s` + } + + if (resolution) { + createPayload.resolution = resolution + } + + if (cameraControl) { + createPayload.concepts = Array.isArray(cameraControl) ? 
cameraControl : [{ key: cameraControl }] + } + + const createResponse = await fetch('https://api.lumalabs.ai/dream-machine/v1/generations', { + method: 'POST', + headers: { + Authorization: `Bearer ${apiKey}`, + 'Content-Type': 'application/json', + }, + body: JSON.stringify(createPayload), + }) + + if (!createResponse.ok) { + const error = await createResponse.text() + throw new Error(`Luma API error: ${createResponse.status} - ${error}`) + } + + const createData = await createResponse.json() + const generationId = createData.id + + logger.info(`[${requestId}] Luma generation created: ${generationId}`) + + const maxAttempts = 120 // 10 minutes + let attempts = 0 + + while (attempts < maxAttempts) { + await sleep(5000) + + const statusResponse = await fetch( + `https://api.lumalabs.ai/dream-machine/v1/generations/${generationId}`, + { + headers: { + Authorization: `Bearer ${apiKey}`, + }, + } + ) + + if (!statusResponse.ok) { + throw new Error(`Luma status check failed: ${statusResponse.status}`) + } + + const statusData = await statusResponse.json() + + if (statusData.state === 'completed') { + logger.info(`[${requestId}] Luma generation completed after ${attempts * 5}s`) + + const videoUrl = statusData.assets?.video + if (!videoUrl) { + throw new Error('No video URL in response') + } + + const videoResponse = await fetch(videoUrl) + if (!videoResponse.ok) { + throw new Error(`Failed to download video: ${videoResponse.status}`) + } + + const arrayBuffer = await videoResponse.arrayBuffer() + return { + buffer: Buffer.from(arrayBuffer), + width: dimensions.width, + height: dimensions.height, + jobId: generationId, + duration, + } + } + + if (statusData.state === 'failed') { + throw new Error(`Luma generation failed: ${statusData.failure_reason || 'Unknown error'}`) + } + + attempts++ + } + + throw new Error('Luma generation timed out after 10 minutes') +} + +async function generateWithMiniMax( + apiKey: string, + model: string, + prompt: string, + duration: number, + promptOptimizer: boolean, + requestId: string, + logger: ReturnType +): Promise<{ buffer: Buffer; width: number; height: number; jobId: string; duration: number }> { + logger.info(`[${requestId}] Starting MiniMax Hailuo generation via MiniMax Platform API`) + logger.info( + `[${requestId}] Request params - model: ${model}, duration: ${duration}, promptOptimizer: ${promptOptimizer}` + ) + + // Determine resolution and dimensions based on duration + // MiniMax-Hailuo-02 supports 768P (6s) or 1080P (10s) + const resolution = duration === 10 ? '1080P' : '768P' + const dimensions = duration === 10 ? { width: 1920, height: 1080 } : { width: 1360, height: 768 } + + logger.info( + `[${requestId}] Using resolution: ${resolution}, dimensions: ${dimensions.width}x${dimensions.height}` + ) + + // Map our model ID to MiniMax model name + const minimaxModel = model === 'hailuo-02' ? 
'MiniMax-Hailuo-02' : 'MiniMax-Hailuo-2.3' + + // Create video generation request via MiniMax Platform API + const createResponse = await fetch('https://api.minimax.io/v1/video_generation', { + method: 'POST', + headers: { + Authorization: `Bearer ${apiKey}`, + 'Content-Type': 'application/json', + }, + body: JSON.stringify({ + model: minimaxModel, + prompt: prompt, + duration: duration, + resolution: resolution, + prompt_optimizer: promptOptimizer, + }), + }) + + if (!createResponse.ok) { + const errorText = await createResponse.text() + if (createResponse.status === 401 || createResponse.status === 1004) { + throw new Error( + `MiniMax API authentication failed (${createResponse.status}). Please ensure you're using a valid MiniMax API key from platform.minimax.io. Error: ${errorText}` + ) + } + throw new Error(`MiniMax API error: ${createResponse.status} - ${errorText}`) + } + + const createData = await createResponse.json() + + // Check for error in response + if (createData.base_resp?.status_code !== 0) { + throw new Error(`MiniMax API error: ${createData.base_resp?.status_msg || 'Unknown error'}`) + } + + const taskId = createData.task_id + + logger.info(`[${requestId}] MiniMax task created: ${taskId}`) + + // Poll for completion (6-10 minutes typical) + const maxAttempts = 120 // 10 minutes with 5-second intervals + let attempts = 0 + + while (attempts < maxAttempts) { + await sleep(5000) + + // Query task status + const statusResponse = await fetch( + `https://api.minimax.io/v1/query/video_generation?task_id=${taskId}`, + { + headers: { + Authorization: `Bearer ${apiKey}`, + }, + } + ) + + if (!statusResponse.ok) { + throw new Error(`MiniMax status check failed: ${statusResponse.status}`) + } + + const statusData = await statusResponse.json() + + if ( + statusData.base_resp?.status_code !== 0 && + statusData.base_resp?.status_code !== undefined + ) { + throw new Error( + `MiniMax status query error: ${statusData.base_resp?.status_msg || 'Unknown error'}` + ) + } + + if (statusData.status === 'Success' || statusData.status === 'success') { + logger.info(`[${requestId}] MiniMax generation completed after ${attempts * 5}s`) + + const fileId = statusData.file_id + if (!fileId) { + throw new Error('No file_id in response') + } + + // Download the video using file_id + const fileResponse = await fetch( + `https://api.minimax.io/v1/files/retrieve?file_id=${fileId}`, + { + headers: { + Authorization: `Bearer ${apiKey}`, + }, + } + ) + + if (!fileResponse.ok) { + throw new Error(`Failed to download video: ${fileResponse.status}`) + } + + const fileData = await fileResponse.json() + const videoUrl = fileData.file?.download_url + + if (!videoUrl) { + throw new Error('No download URL in file response') + } + + // Download the actual video file + const videoResponse = await fetch(videoUrl) + if (!videoResponse.ok) { + throw new Error(`Failed to download video from URL: ${videoResponse.status}`) + } + + const arrayBuffer = await videoResponse.arrayBuffer() + return { + buffer: Buffer.from(arrayBuffer), + width: dimensions.width, + height: dimensions.height, + jobId: taskId, + duration, + } + } + + if (statusData.status === 'Failed' || statusData.status === 'failed') { + throw new Error(`MiniMax generation failed: ${statusData.error || 'Unknown error'}`) + } + + // Status is still "Processing" or "Queueing", continue polling + attempts++ + } + + throw new Error('MiniMax generation timed out after 10 minutes') +} + +// Helper function to strip subpaths from Fal.ai model IDs for status/result 
endpoints +function getBaseModelId(fullModelId: string): string { + const parts = fullModelId.split('/') + // Keep only the first two parts (e.g., "fal-ai/sora-2" from "fal-ai/sora-2/text-to-video") + if (parts.length > 2) { + return parts.slice(0, 2).join('/') + } + return fullModelId +} + +// Helper function to format duration based on model requirements +function formatDuration(model: string, duration: number | undefined): string | number | undefined { + if (duration === undefined) return undefined + + // Veo 3.1 requires duration with "s" suffix (e.g., "8s") + if (model === 'veo-3.1') { + return `${duration}s` + } + + // Sora 2 requires numeric duration + if (model === 'sora-2') { + return duration + } + + // Other models use string format + return String(duration) +} + +async function generateWithFalAI( + apiKey: string, + model: string, + prompt: string, + duration: number | undefined, + aspectRatio: string | undefined, + resolution: string | undefined, + promptOptimizer: boolean | undefined, + requestId: string, + logger: ReturnType +): Promise<{ buffer: Buffer; width: number; height: number; jobId: string; duration: number }> { + logger.info(`[${requestId}] Starting Fal.ai generation with model: ${model}`) + + // Map our model IDs to Fal.ai model paths + const modelMap: { [key: string]: string } = { + 'veo-3.1': 'fal-ai/veo3.1', + 'sora-2': 'fal-ai/sora-2/text-to-video', + 'kling-2.5-turbo-pro': 'fal-ai/kling-video/v2.5-turbo/pro/text-to-video', + 'kling-2.1-pro': 'fal-ai/kling-video/v2.1/master/text-to-video', + 'minimax-hailuo-2.3-pro': 'fal-ai/minimax/hailuo-02/pro/text-to-video', + 'minimax-hailuo-2.3-standard': 'fal-ai/minimax/hailuo-02/standard/text-to-video', + 'wan-2.1': 'fal-ai/wan-t2v', + 'ltxv-0.9.8': 'fal-ai/ltxv-13b-098-distilled', + } + + const falModelId = modelMap[model] + if (!falModelId) { + throw new Error(`Unknown Fal.ai model: ${model}`) + } + + // Build request body based on model requirements + const requestBody: any = { prompt } + + // Format duration based on model requirements + const formattedDuration = formatDuration(model, duration) + if (formattedDuration !== undefined) { + requestBody.duration = formattedDuration + } + + if (aspectRatio) { + requestBody.aspect_ratio = aspectRatio + } + + if (resolution) { + requestBody.resolution = resolution + } + + // MiniMax models support prompt optimizer + if (model.startsWith('minimax-hailuo') && promptOptimizer !== undefined) { + requestBody.prompt_optimizer = promptOptimizer + } + + const createResponse = await fetch(`https://queue.fal.run/${falModelId}`, { + method: 'POST', + headers: { + Authorization: `Key ${apiKey}`, + 'Content-Type': 'application/json', + }, + body: JSON.stringify(requestBody), + }) + + if (!createResponse.ok) { + const error = await createResponse.text() + throw new Error(`Fal.ai API error: ${createResponse.status} - ${error}`) + } + + const createData = await createResponse.json() + const requestIdFal = createData.request_id + + logger.info(`[${requestId}] Fal.ai request created: ${requestIdFal}`) + + // Get base model ID (without subpath) for status and result endpoints + const baseModelId = getBaseModelId(falModelId) + + const maxAttempts = 96 // 8 minutes with 5-second intervals + let attempts = 0 + + while (attempts < maxAttempts) { + await sleep(5000) + + const statusResponse = await fetch( + `https://queue.fal.run/${baseModelId}/requests/${requestIdFal}/status`, + { + headers: { + Authorization: `Key ${apiKey}`, + }, + } + ) + + if (!statusResponse.ok) { + throw new Error(`Fal.ai 
status check failed: ${statusResponse.status}`)
+    }
+
+    const statusData = await statusResponse.json()
+
+    if (statusData.status === 'COMPLETED') {
+      logger.info(`[${requestId}] Fal.ai generation completed after ${attempts * 5}s`)
+
+      const resultResponse = await fetch(
+        `https://queue.fal.run/${baseModelId}/requests/${requestIdFal}`,
+        {
+          headers: {
+            Authorization: `Key ${apiKey}`,
+          },
+        }
+      )
+
+      if (!resultResponse.ok) {
+        throw new Error(`Failed to fetch result: ${resultResponse.status}`)
+      }
+
+      const resultData = await resultResponse.json()
+
+      const videoUrl = resultData.video?.url || resultData.output?.url
+      if (!videoUrl) {
+        throw new Error('No video URL in response')
+      }
+
+      const videoResponse = await fetch(videoUrl)
+      if (!videoResponse.ok) {
+        throw new Error(`Failed to download video: ${videoResponse.status}`)
+      }
+
+      const arrayBuffer = await videoResponse.arrayBuffer()
+
+      // Try to get dimensions from response, or calculate from aspect ratio
+      let width = resultData.video?.width || 1920
+      let height = resultData.video?.height || 1080
+
+      if (!resultData.video?.width && aspectRatio) {
+        const dims = getVideoDimensions(aspectRatio, resolution || '1080p')
+        width = dims.width
+        height = dims.height
+      }
+
+      return {
+        buffer: Buffer.from(arrayBuffer),
+        width,
+        height,
+        jobId: requestIdFal,
+        duration: duration || 5,
+      }
+    }
+
+    if (statusData.status === 'FAILED') {
+      throw new Error(`Fal.ai generation failed: ${statusData.error || 'Unknown error'}`)
+    }
+
+    attempts++
+  }
+
+  throw new Error('Fal.ai generation timed out after 8 minutes')
+}
+
+function getVideoDimensions(
+  aspectRatio: string,
+  resolution: string
+): { width: number; height: number } {
+  let height: number
+  if (resolution === '4k') {
+    height = 2160
+  } else {
+    height = Number.parseInt(resolution.replace('p', ''))
+  }
+
+  const [ratioW, ratioH] = aspectRatio.split(':').map(Number)
+  const width = Math.round((height * ratioW) / ratioH)
+
+  return { width, height }
+}
+
+function sleep(ms: number): Promise<void> {
+  return new Promise((resolve) => setTimeout(resolve, ms))
+}
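// Illustrative sketch, not part of the diff above: every generateWith* helper in this route
// follows the same create-then-poll shape (submit a job, poll its status every 5 seconds up to
// a provider-specific attempt cap, then download the finished asset). If the duplication ever
// becomes a concern, a small generic helper along these lines could factor the loop out; the
// names below are hypothetical, not existing code.
async function pollUntilDone<T>(
  check: () => Promise<{ done: boolean; failed?: string; value?: T }>,
  { intervalMs = 5000, maxAttempts = 120 } = {}
): Promise<T> {
  for (let attempt = 0; attempt < maxAttempts; attempt++) {
    // Wait first, matching the helpers above, which sleep before every status check
    await new Promise((resolve) => setTimeout(resolve, intervalMs))
    const status = await check()
    if (status.failed) throw new Error(status.failed)
    if (status.done) return status.value as T
  }
  throw new Error(`Generation timed out after ${(maxAttempts * intervalMs) / 60000} minutes`)
}
// For reference, getVideoDimensions above maps ('16:9', '1080p') to 1920x1080 and
// ('9:16', '720p') to 405x720: the width is derived from the requested height and rounded.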
diff --git a/apps/sim/app/workspace/[workspaceId]/w/components/sidebar/components-new/search-modal/search-modal.tsx b/apps/sim/app/workspace/[workspaceId]/w/components/sidebar/components-new/search-modal/search-modal.tsx
index 161e90f9f1..6129da68fc 100644
--- a/apps/sim/app/workspace/[workspaceId]/w/components/sidebar/components-new/search-modal/search-modal.tsx
+++ b/apps/sim/app/workspace/[workspaceId]/w/components/sidebar/components-new/search-modal/search-modal.tsx
@@ -359,9 +359,42 @@ export function SearchModal({
       .map((result) => result.item)
   }, [allItems, searchQuery, sectionOrder])
 
+  const groupedItems = useMemo(() => {
+    const groups: Record = {
+      workspace: [],
+      workflow: [],
+      page: [],
+      trigger: [],
+      block: [],
+      tool: [],
+      doc: [],
+    }
+
+    filteredItems.forEach((item) => {
+      if (groups[item.type]) {
+        groups[item.type].push(item)
+      }
+    })
+
+    return groups
+  }, [filteredItems])
+
+  const displayedItemsInVisualOrder = useMemo(() => {
+    const visualOrder: SearchItem[] = []
+
+    sectionOrder.forEach((type) => {
+      const items = groupedItems[type] || []
+      items.forEach((item) => {
+        visualOrder.push(item)
+      })
+    })
+
+    return visualOrder
+  }, [groupedItems, sectionOrder])
+
   useEffect(() => {
     setSelectedIndex(0)
-  }, [filteredItems])
+  }, [displayedItemsInVisualOrder])
 
   useEffect(() => {
     if (!open) {
@@ -413,7 +446,7 @@ export function SearchModal({
       switch (e.key) {
         case 'ArrowDown':
           e.preventDefault()
-          setSelectedIndex((prev) => Math.min(prev + 1, filteredItems.length - 1))
+          setSelectedIndex((prev) => Math.min(prev + 1, displayedItemsInVisualOrder.length - 1))
           break
         case 'ArrowUp':
           e.preventDefault()
@@ -421,8 +454,8 @@ export function SearchModal({
           break
         case 'Enter':
           e.preventDefault()
-          if (filteredItems[selectedIndex]) {
-            handleItemClick(filteredItems[selectedIndex])
+          if (displayedItemsInVisualOrder[selectedIndex]) {
+            handleItemClick(displayedItemsInVisualOrder[selectedIndex])
           }
           break
         case 'Escape':
@@ -434,7 +467,7 @@ export function SearchModal({
 
     document.addEventListener('keydown', handleKeyDown)
     return () => document.removeEventListener('keydown', handleKeyDown)
-  }, [open, selectedIndex, filteredItems, handleItemClick, onOpenChange])
+  }, [open, selectedIndex, displayedItemsInVisualOrder, handleItemClick, onOpenChange])
 
   useEffect(() => {
     if (open && selectedIndex >= 0) {
@@ -448,26 +481,6 @@ export function SearchModal({
     }
   }, [selectedIndex, open])
 
-  const groupedItems = useMemo(() => {
-    const groups: Record = {
-      workspace: [],
-      workflow: [],
-      page: [],
-      trigger: [],
-      block: [],
-      tool: [],
-      doc: [],
-    }
-
-    filteredItems.forEach((item) => {
-      if (groups[item.type]) {
-        groups[item.type].push(item)
-      }
-    })
-
-    return groups
-  }, [filteredItems])
-
   const sectionTitles: Record = {
     workspace: 'Workspaces',
     workflow: 'Workflows',
@@ -501,7 +514,7 @@ export function SearchModal({
         {/* Floating results container */}
-          {filteredItems.length > 0 ? (
+          {displayedItemsInVisualOrder.length > 0 ? (
           {sectionOrder.map((type) => {
             const items = groupedItems[type] || []
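// Illustrative sketch, not part of the diff: why selection now indexes into
// displayedItemsInVisualOrder instead of filteredItems. Search results come back in match
// order, but the modal renders them grouped by section, so the two orders can disagree and
// ArrowDown used to make the highlight jump between sections rather than move down the visible
// list. A tiny hypothetical example of the divergence:
const filtered = [
  { id: 'deploy docs', type: 'doc' },
  { id: 'deploy workflow', type: 'workflow' },
]
const sectionOrderExample = ['workflow', 'doc'] // render order by section
const visual = sectionOrderExample.flatMap((t) => filtered.filter((item) => item.type === t))
// filtered.indexOf(filtered[0]) === 0, but visual.indexOf(filtered[0]) === 1: the top match is
// rendered second, so an index kept in filtered order highlights a row out of visual sequence.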
@@ -518,8 +531,8 @@ export function SearchModal({
               {items.map((item, itemIndex) => {
                 const Icon = item.icon
-                const globalIndex = filteredItems.indexOf(item)
-                const isSelected = globalIndex === selectedIndex
+                const visualIndex = displayedItemsInVisualOrder.indexOf(item)
+                const isSelected = visualIndex === selectedIndex
                 const showColoredIcon =
                   item.type === 'block' || item.type === 'trigger' || item.type === 'tool'
                 const isWorkflow = item.type === 'workflow'
@@ -528,7 +541,7 @@ export function SearchModal({
                 return (