diff --git a/apps/docs/components/icons.tsx b/apps/docs/components/icons.tsx index 9562e82bcb..b4dd059b70 100644 --- a/apps/docs/components/icons.tsx +++ b/apps/docs/components/icons.tsx @@ -4085,7 +4085,29 @@ export function CalendlyIcon(props: SVGProps) { ) } -export function AudioWaveformIcon(props: SVGProps) { +export function STTIcon(props: SVGProps) { + return ( + + + + + + + ) +} + +export function TTSIcon(props: SVGProps) { return ( ) { ) } + +export function VideoIcon(props: SVGProps) { + return ( + + + + + ) +} diff --git a/apps/docs/components/ui/icon-mapping.ts b/apps/docs/components/ui/icon-mapping.ts index 4d57eae8a5..9bce0402fa 100644 --- a/apps/docs/components/ui/icon-mapping.ts +++ b/apps/docs/components/ui/icon-mapping.ts @@ -8,7 +8,6 @@ import { ApolloIcon, ArxivIcon, AsanaIcon, - AudioWaveformIcon, BrainIcon, BrowserUseIcon, CalendlyIcon, @@ -63,6 +62,7 @@ import { SalesforceIcon, SerperIcon, SlackIcon, + STTIcon, StagehandIcon, StripeIcon, SupabaseIcon, @@ -70,8 +70,10 @@ import { TelegramIcon, TranslateIcon, TrelloIcon, + TTSIcon, TwilioIcon, TypeformIcon, + VideoIcon, WealthboxIcon, WebflowIcon, WhatsAppIcon, @@ -92,16 +94,18 @@ export const blockTypeToIconMap: Record = { webflow: WebflowIcon, wealthbox: WealthboxIcon, vision: EyeIcon, + video_generator: VideoIcon, typeform: TypeformIcon, twilio_voice: TwilioIcon, twilio_sms: TwilioIcon, + tts: TTSIcon, trello: TrelloIcon, translate: TranslateIcon, thinking: BrainIcon, telegram: TelegramIcon, tavily: TavilyIcon, supabase: SupabaseIcon, - stt: AudioWaveformIcon, + stt: STTIcon, stripe: StripeIcon, stagehand_agent: StagehandIcon, stagehand: StagehandIcon, diff --git a/apps/docs/content/docs/en/tools/meta.json b/apps/docs/content/docs/en/tools/meta.json index 437ad185c9..51c52c3a71 100644 --- a/apps/docs/content/docs/en/tools/meta.json +++ b/apps/docs/content/docs/en/tools/meta.json @@ -68,9 +68,11 @@ "thinking", "translate", "trello", + "tts", "twilio_sms", "twilio_voice", "typeform", + "video_generator", "vision", "wealthbox", "webflow", diff --git a/apps/docs/content/docs/en/tools/stt.mdx b/apps/docs/content/docs/en/tools/stt.mdx index 2132b8c51b..d76afbbd31 100644 --- a/apps/docs/content/docs/en/tools/stt.mdx +++ b/apps/docs/content/docs/en/tools/stt.mdx @@ -11,15 +11,32 @@ import { BlockInfoCard } from "@/components/ui/block-info-card" /> {/* MANUAL-CONTENT-START:intro */} -Transcribe speech to text using state-of-the-art AI models from leading providers. The Sim Speech-to-Text (STT) tools allow you to convert audio and video files into accurate transcripts, supporting multiple languages, timestamps, and optional translation. +Transcribe speech to text using the latest AI models from world-class providers. Sim's Speech-to-Text (STT) tools empower you to turn audio and video into accurate, timestamped, and optionally translated transcripts—supporting a diversity of languages and enhanced with advanced features such as diarization and speaker identification. -Supported providers: +**Supported Providers & Models:** -- **[OpenAI Whisper](https://platform.openai.com/docs/guides/speech-to-text/overview)**: Advanced open-source STT model from OpenAI. Supports models such as `whisper-1` and handles a wide variety of languages and audio formats. -- **[Deepgram](https://deepgram.com/)**: Real-time and batch STT API with deep learning models like `nova-3`, `nova-2`, and `whisper-large`. Offers features like diarization, intent recognition, and industry-specific tuning. 
-- **[ElevenLabs](https://elevenlabs.io/)**: Known for high-quality speech AI, ElevenLabs provides STT models focused on accuracy and natural language understanding for numerous languages and dialects.
+- **[OpenAI Whisper](https://platform.openai.com/docs/guides/speech-to-text/overview)** (OpenAI):
+  Whisper is an open-source speech recognition model known for robustness across languages and recording conditions. The hosted `whisper-1` model handles transcription and optional translation to English, and accepts a prompt to steer style and proper nouns.
-Choose the provider and model best suited to your task—whether fast, production-grade transcription (Deepgram), highly accurate multi-language capability (Whisper), or advanced understanding and language coverage (ElevenLabs).
+
+- **[Deepgram](https://deepgram.com/)** (Deepgram Inc.):
+  Deepgram provides production-grade speech recognition APIs for real-time and batch transcription. Models include `nova-3`, `nova-2`, and `whisper-large`, with multi-language support, automatic punctuation, and speaker diarization.
+
+- **[ElevenLabs](https://elevenlabs.io/)** (ElevenLabs):
+  ElevenLabs' Scribe models deliver accurate transcription with word-level timestamps and speaker labels across many languages, dialects, and accents, making them a good fit for creative and accessibility workflows.
+
+- **[AssemblyAI](https://www.assemblyai.com/)** (AssemblyAI Inc.):
+  AssemblyAI pairs accurate transcription with optional speech-understanding features: speaker diarization, sentiment analysis, entity detection, PII redaction, and summarization. The `best` and `nano` model tiers trade accuracy against speed and cost.
+
+- **[Google Gemini](https://ai.google.dev/gemini-api/docs)** (Google):
+  Gemini's multimodal models (such as `gemini-2.5-flash`) accept audio directly and transcribe it from a prompt, with optional language hints and sentence-level timestamps. A minimal request sketch follows this list.
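The sketch below shows one way a script outside the canvas might call the STT proxy route that backs these tools (`POST /api/proxy/stt`). It is a minimal illustration under stated assumptions, not a documented client API: the `SIM_BASE_URL` and `SIM_SESSION_COOKIE` environment variables are placeholders, and authentication must be whatever your Sim deployment's hybrid auth accepts.

```ts
// Sketch: transcribe a remote audio file through Sim's STT proxy route.
// Assumes a running Sim deployment and credentials accepted by its hybrid auth;
// the header shown here is illustrative, not prescriptive.
interface SttProxyResult {
  transcript: string
  segments?: { text: string; start: number; end: number; speaker?: string }[]
  language?: string
  duration?: number
  confidence?: number
}

async function transcribe(audioUrl: string): Promise<SttProxyResult> {
  const baseUrl = process.env.SIM_BASE_URL ?? 'http://localhost:3000'

  const response = await fetch(`${baseUrl}/api/proxy/stt`, {
    method: 'POST',
    headers: {
      'Content-Type': 'application/json',
      // Illustrative only: supply whatever credentials your deployment expects.
      Cookie: process.env.SIM_SESSION_COOKIE ?? '',
    },
    body: JSON.stringify({
      provider: 'whisper', // or 'deepgram' | 'elevenlabs' | 'assemblyai' | 'gemini'
      apiKey: process.env.OPENAI_API_KEY,
      model: 'whisper-1',
      audioUrl,
      language: 'auto',
      timestamps: 'sentence',
      translateToEnglish: false,
    }),
  })

  if (!response.ok) {
    const { error } = await response.json().catch(() => ({ error: response.statusText }))
    throw new Error(`STT proxy error: ${error}`)
  }
  return (await response.json()) as SttProxyResult
}
```

Provider-specific fields (for example `prompt` and `temperature` for Whisper, or `sentiment` and `summarization` for AssemblyAI) go in the same request body.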
+
+**How to Choose:**
+Select the provider and model that fits your application: fast, production-grade transcription (Deepgram), broad multilingual accuracy with open-source roots (OpenAI Whisper), word-level timestamps and speaker labels (ElevenLabs), speech-understanding extras such as sentiment, entities, PII redaction, and summaries (AssemblyAI), or prompt-driven multimodal transcription (Google Gemini). Also weigh pricing, language coverage, accuracy, and turnaround time.
+
+For more details on capabilities, pricing, feature highlights, and fine-tuning options, refer to each provider's official documentation via the links above.

{/* MANUAL-CONTENT-END */}

@@ -48,6 +65,8 @@ Transcribe audio to text using OpenAI Whisper
| `language` | string | No | Language code \(e.g., "en", "es", "fr"\) or "auto" for auto-detection |
| `timestamps` | string | No | Timestamp granularity: none, sentence, or word |
| `translateToEnglish` | boolean | No | Translate audio to English |
+| `prompt` | string | No | Optional text to guide the model's style or continue a previous audio segment. Helps with proper nouns and context. |
+| `temperature` | number | No | Sampling temperature between 0 and 1. Higher values make output more random, lower values more focused and deterministic. |

#### Output

| Parameter | Type | Description |
| --------- | ---- | ----------- |
| `segments` | array | Timestamped segments |
| `language` | string | Detected or specified language |
| `duration` | number | Audio duration in seconds |
-| `confidence` | number | Overall confidence score |

### `stt_deepgram`

@@ -114,6 +132,68 @@ Transcribe audio to text using ElevenLabs
| `duration` | number | Audio duration in seconds |
| `confidence` | number | Overall confidence score |
+### `stt_assemblyai`
+
+Transcribe audio to text using AssemblyAI with advanced NLP features
+
+#### Input
+
+| Parameter | Type | Required | Description |
+| --------- | ---- | -------- | ----------- |
+| `provider` | string | Yes | STT provider \(assemblyai\) |
+| `apiKey` | string | Yes | AssemblyAI API key |
+| `model` | string | No | AssemblyAI model to use \(default: best\) |
+| `audioFile` | file | No | Audio or video file to transcribe |
+| `audioFileReference` | file | No | Reference to audio/video file from previous blocks |
+| `audioUrl` | string | No | URL to audio or video file |
+| `language` | string | No | Language code \(e.g., "en", "es", "fr"\) or "auto" for auto-detection |
+| `timestamps` | string | No | Timestamp granularity: none, sentence, or word |
+| `diarization` | boolean | No | Enable speaker diarization |
+| `sentiment` | boolean | No | Enable sentiment analysis |
+| `entityDetection` | boolean | No | Enable entity detection |
+| `piiRedaction` | boolean | No | Enable PII redaction |
+| `summarization` | boolean | No | Enable automatic summarization |
+
+#### Output
+
+| Parameter | Type | Description |
+| --------- | ---- | ----------- |
+| `transcript` | string | Full transcribed text |
+| `segments` | array | Timestamped segments with speaker labels |
+| `language` | string | Detected or specified language |
+| `duration` | number | Audio duration in seconds |
+| `confidence` | number | Overall confidence score |
+| `sentiment` | array | Sentiment analysis results |
+| `entities` | array | Detected entities |
+| `summary` | string | Auto-generated summary |
+
+### `stt_gemini`
+
+Transcribe audio to text using Google Gemini with multimodal capabilities
+
+#### Input
+
+| Parameter | Type | Required | Description |
+| --------- | ---- | -------- | ----------- 
| +| `provider` | string | Yes | STT provider \(gemini\) | +| `apiKey` | string | Yes | Google API key | +| `model` | string | No | Gemini model to use \(default: gemini-2.5-flash\) | +| `audioFile` | file | No | Audio or video file to transcribe | +| `audioFileReference` | file | No | Reference to audio/video file from previous blocks | +| `audioUrl` | string | No | URL to audio or video file | +| `language` | string | No | Language code \(e.g., "en", "es", "fr"\) or "auto" for auto-detection | +| `timestamps` | string | No | Timestamp granularity: none, sentence, or word | + +#### Output + +| Parameter | Type | Description | +| --------- | ---- | ----------- | +| `transcript` | string | Full transcribed text | +| `segments` | array | Timestamped segments | +| `language` | string | Detected or specified language | +| `duration` | number | Audio duration in seconds | +| `confidence` | number | Overall confidence score | + ## Notes diff --git a/apps/docs/content/docs/en/tools/tts.mdx b/apps/docs/content/docs/en/tools/tts.mdx new file mode 100644 index 0000000000..99b380939a --- /dev/null +++ b/apps/docs/content/docs/en/tools/tts.mdx @@ -0,0 +1,261 @@ +--- +title: Text-to-Speech +description: Convert text to speech using AI voices +--- + +import { BlockInfoCard } from "@/components/ui/block-info-card" + + + +{/* MANUAL-CONTENT-START:intro */} +Convert text to natural-sounding speech using the latest AI voices. Sim's Text-to-Speech (TTS) tools let you generate audio from written text in dozens of languages, with a choice of expressive voices, formats, and advanced controls like speed, style, emotion, and more. + +**Supported Providers & Models:** + +- **[OpenAI Text-to-Speech](https://platform.openai.com/docs/guides/text-to-speech/voice-options)** (OpenAI): + OpenAI's TTS API offers ultra-realistic voices using advanced AI models like `tts-1`, `tts-1-hd`, and `gpt-4o-mini-tts`. Voices include both male and female, with options such as alloy, echo, fable, onyx, nova, shimmer, ash, ballad, coral, sage, and verse. Supports multiple audio formats (mp3, opus, aac, flac, wav, pcm), adjustable speed and streaming synthesis. + +- **[Deepgram Aura](https://deepgram.com/products/text-to-speech)** (Deepgram Inc.): + Deepgram’s Aura provides expressive English and multilingual AI voices, optimized for conversational clarity, low latency, and customization. Models like `aura-asteria-en`, `aura-luna-en`, and others are available. Supports multiple encoding formats (linear16, mp3, opus, aac, flac) and fine tuning on speed, sample rate, and style. + +- **[ElevenLabs Text-to-Speech](https://elevenlabs.io/text-to-speech)** (ElevenLabs): + ElevenLabs leads in lifelike, emotionally rich TTS, offering dozens of voices in 29+ languages and the ability to clone custom voices. Models support voice design, speech synthesis, and direct API access, with advanced controls for style, emotion, stability, and similarity. Suitable for audiobooks, content creation, accessibility, and more. + +- **[Cartesia TTS](https://docs.cartesia.ai/)** (Cartesia): + Cartesia offers high-quality, fast, and secure text-to-speech with a focus on privacy and flexible deployment. It provides instant streaming, real-time synthesis, and supports multiple international voices and accents, accessible through a simple API. + +- **[Google Cloud Text-to-Speech](https://cloud.google.com/text-to-speech)** (Google Cloud): + Google uses DeepMind WaveNet and Neural2 models to power high-fidelity voices in 50+ languages and variants. 
Features include voice selection, pitch, speaking rate, volume control, SSML tags, and access to both standard and studio-grade premium voices. Widely used for accessibility, IVR, and media. + +- **[Microsoft Azure Speech](https://azure.microsoft.com/en-us/products/ai-services/text-to-speech)** (Microsoft Azure): + Azure provides over 400 neural voices across 140+ languages and locales, with unique voice customization, style, emotion, role, and real-time controls. Offers SSML support for pronunciation, intonation, and more. Ideal for global, enterprise, or creative TTS needs. + +- **[PlayHT](https://play.ht/)** (PlayHT): + PlayHT specializes in realistic voice synthesis, voice cloning, and instant streaming playback with 800+ voices in over 100 languages. Features include emotion, pitch and speed controls, multi-voice audio, and custom voice creation via the API or online studio. + +**How to Choose:** +Pick your provider and model by prioritizing languages, supported voice types, desired formats (mp3, wav, etc.), control granularity (speed, emotion, etc.), and specialized features (voice cloning, accent, streaming). For creative, accessibility, or developer use cases, ensure compatibility with your application's requirements and compare costs. + +Visit each provider’s official site for up-to-date capabilities, pricing, and documentation details! +{/* MANUAL-CONTENT-END */} + + +## Usage Instructions + +Generate natural-sounding speech from text using state-of-the-art AI voices from OpenAI, Deepgram, ElevenLabs, Cartesia, Google Cloud, Azure, and PlayHT. Supports multiple voices, languages, and audio formats. + + + +## Tools + +### `tts_openai` + +Convert text to speech using OpenAI TTS models + +#### Input + +| Parameter | Type | Required | Description | +| --------- | ---- | -------- | ----------- | +| `text` | string | Yes | The text to convert to speech | +| `apiKey` | string | Yes | OpenAI API key | +| `model` | string | No | TTS model to use \(tts-1, tts-1-hd, or gpt-4o-mini-tts\) | +| `voice` | string | No | Voice to use \(alloy, ash, ballad, cedar, coral, echo, marin, sage, shimmer, verse\) | +| `responseFormat` | string | No | Audio format \(mp3, opus, aac, flac, wav, pcm\) | +| `speed` | number | No | Speech speed \(0.25 to 4.0, default: 1.0\) | + +#### Output + +| Parameter | Type | Description | +| --------- | ---- | ----------- | +| `audioUrl` | string | URL to the generated audio file | +| `audioFile` | file | Generated audio file object | +| `duration` | number | Audio duration in seconds | +| `characterCount` | number | Number of characters processed | +| `format` | string | Audio format | +| `provider` | string | TTS provider used | + +### `tts_deepgram` + +Convert text to speech using Deepgram Aura + +#### Input + +| Parameter | Type | Required | Description | +| --------- | ---- | -------- | ----------- | +| `text` | string | Yes | The text to convert to speech | +| `apiKey` | string | Yes | Deepgram API key | +| `model` | string | No | Deepgram model/voice \(e.g., aura-asteria-en, aura-luna-en\) | +| `voice` | string | No | Voice identifier \(alternative to model param\) | +| `encoding` | string | No | Audio encoding \(linear16, mp3, opus, aac, flac\) | +| `sampleRate` | number | No | Sample rate \(8000, 16000, 24000, 48000\) | +| `bitRate` | number | No | Bit rate for compressed formats | +| `container` | string | No | Container format \(none, wav, ogg\) | + +#### Output + +| Parameter | Type | Description | +| --------- | ---- | ----------- | +| `audioUrl` | string | 
URL to the generated audio file | +| `audioFile` | file | Generated audio file object | +| `duration` | number | Audio duration in seconds | +| `characterCount` | number | Number of characters processed | +| `format` | string | Audio format | +| `provider` | string | TTS provider used | + +### `tts_elevenlabs` + +Convert text to speech using ElevenLabs voices + +#### Input + +| Parameter | Type | Required | Description | +| --------- | ---- | -------- | ----------- | +| `text` | string | Yes | The text to convert to speech | +| `voiceId` | string | Yes | The ID of the voice to use | +| `apiKey` | string | Yes | ElevenLabs API key | +| `modelId` | string | No | Model to use \(e.g., eleven_monolingual_v1, eleven_turbo_v2_5, eleven_flash_v2_5\) | +| `stability` | number | No | Voice stability \(0.0 to 1.0, default: 0.5\) | +| `similarityBoost` | number | No | Similarity boost \(0.0 to 1.0, default: 0.8\) | +| `style` | number | No | Style exaggeration \(0.0 to 1.0\) | +| `useSpeakerBoost` | boolean | No | Use speaker boost \(default: true\) | + +#### Output + +| Parameter | Type | Description | +| --------- | ---- | ----------- | +| `audioUrl` | string | URL to the generated audio file | +| `audioFile` | file | Generated audio file object | +| `duration` | number | Audio duration in seconds | +| `characterCount` | number | Number of characters processed | +| `format` | string | Audio format | +| `provider` | string | TTS provider used | + +### `tts_cartesia` + +Convert text to speech using Cartesia Sonic (ultra-low latency) + +#### Input + +| Parameter | Type | Required | Description | +| --------- | ---- | -------- | ----------- | +| `text` | string | Yes | The text to convert to speech | +| `apiKey` | string | Yes | Cartesia API key | +| `modelId` | string | No | Model ID \(sonic-english, sonic-multilingual\) | +| `voice` | string | No | Voice ID or embedding | +| `language` | string | No | Language code \(en, es, fr, de, it, pt, etc.\) | +| `outputFormat` | json | No | Output format configuration \(container, encoding, sampleRate\) | +| `speed` | number | No | Speed multiplier | +| `emotion` | array | No | Emotion tags for Sonic-3 \(e.g., \['positivity:high'\]\) | + +#### Output + +| Parameter | Type | Description | +| --------- | ---- | ----------- | +| `audioUrl` | string | URL to the generated audio file | +| `audioFile` | file | Generated audio file object | +| `duration` | number | Audio duration in seconds | +| `characterCount` | number | Number of characters processed | +| `format` | string | Audio format | +| `provider` | string | TTS provider used | + +### `tts_google` + +Convert text to speech using Google Cloud Text-to-Speech + +#### Input + +| Parameter | Type | Required | Description | +| --------- | ---- | -------- | ----------- | +| `text` | string | Yes | The text to convert to speech | +| `apiKey` | string | Yes | Google Cloud API key | +| `voiceId` | string | No | Voice ID \(e.g., en-US-Neural2-A, en-US-Wavenet-D\) | +| `languageCode` | string | Yes | Language code \(e.g., en-US, es-ES, fr-FR\) | +| `gender` | string | No | Voice gender \(MALE, FEMALE, NEUTRAL\) | +| `audioEncoding` | string | No | Audio encoding \(LINEAR16, MP3, OGG_OPUS, MULAW, ALAW\) | +| `speakingRate` | number | No | Speaking rate \(0.25 to 2.0, default: 1.0\) | +| `pitch` | number | No | Voice pitch \(-20.0 to 20.0, default: 0.0\) | +| `volumeGainDb` | number | No | Volume gain in dB \(-96.0 to 16.0\) | +| `sampleRateHertz` | number | No | Sample rate in Hz | +| `effectsProfileId` | array | No | 
Effects profile \(e.g., \['headphone-class-device'\]\) | + +#### Output + +| Parameter | Type | Description | +| --------- | ---- | ----------- | +| `audioUrl` | string | URL to the generated audio file | +| `audioFile` | file | Generated audio file object | +| `duration` | number | Audio duration in seconds | +| `characterCount` | number | Number of characters processed | +| `format` | string | Audio format | +| `provider` | string | TTS provider used | + +### `tts_azure` + +Convert text to speech using Azure Cognitive Services + +#### Input + +| Parameter | Type | Required | Description | +| --------- | ---- | -------- | ----------- | +| `text` | string | Yes | The text to convert to speech | +| `apiKey` | string | Yes | Azure Speech Services API key | +| `voiceId` | string | No | Voice ID \(e.g., en-US-JennyNeural, en-US-GuyNeural\) | +| `region` | string | No | Azure region \(e.g., eastus, westus, westeurope\) | +| `outputFormat` | string | No | Output audio format | +| `rate` | string | No | Speaking rate \(e.g., +10%, -20%, 1.5\) | +| `pitch` | string | No | Voice pitch \(e.g., +5Hz, -2st, low\) | +| `style` | string | No | Speaking style \(e.g., cheerful, sad, angry - neural voices only\) | +| `styleDegree` | number | No | Style intensity \(0.01 to 2.0\) | +| `role` | string | No | Role \(e.g., Girl, Boy, YoungAdultFemale\) | + +#### Output + +| Parameter | Type | Description | +| --------- | ---- | ----------- | +| `audioUrl` | string | URL to the generated audio file | +| `audioFile` | file | Generated audio file object | +| `duration` | number | Audio duration in seconds | +| `characterCount` | number | Number of characters processed | +| `format` | string | Audio format | +| `provider` | string | TTS provider used | + +### `tts_playht` + +Convert text to speech using PlayHT (voice cloning) + +#### Input + +| Parameter | Type | Required | Description | +| --------- | ---- | -------- | ----------- | +| `text` | string | Yes | The text to convert to speech | +| `apiKey` | string | Yes | PlayHT API key \(AUTHORIZATION header\) | +| `userId` | string | Yes | PlayHT user ID \(X-USER-ID header\) | +| `voice` | string | No | Voice ID or manifest URL | +| `quality` | string | No | Quality level \(draft, standard, premium\) | +| `outputFormat` | string | No | Output format \(mp3, wav, ogg, flac, mulaw\) | +| `speed` | number | No | Speed multiplier \(0.5 to 2.0\) | +| `temperature` | number | No | Creativity/randomness \(0.0 to 2.0\) | +| `voiceGuidance` | number | No | Voice stability \(1.0 to 6.0\) | +| `textGuidance` | number | No | Text adherence \(1.0 to 6.0\) | +| `sampleRate` | number | No | Sample rate \(8000, 16000, 22050, 24000, 44100, 48000\) | + +#### Output + +| Parameter | Type | Description | +| --------- | ---- | ----------- | +| `audioUrl` | string | URL to the generated audio file | +| `audioFile` | file | Generated audio file object | +| `duration` | number | Audio duration in seconds | +| `characterCount` | number | Number of characters processed | +| `format` | string | Audio format | +| `provider` | string | TTS provider used | + + + +## Notes + +- Category: `tools` +- Type: `tts` diff --git a/apps/docs/content/docs/en/tools/video_generator.mdx b/apps/docs/content/docs/en/tools/video_generator.mdx new file mode 100644 index 0000000000..a56b59108a --- /dev/null +++ b/apps/docs/content/docs/en/tools/video_generator.mdx @@ -0,0 +1,197 @@ +--- +title: Video Generator +description: Generate videos from text using AI +--- + +import { BlockInfoCard } from 
"@/components/ui/block-info-card" + + + +{/* MANUAL-CONTENT-START:intro */} +Create videos from text prompts using cutting-edge AI models from top providers. Sim's Video Generator brings powerful, creative video synthesis capabilities to your workflow—supporting diverse models, aspect ratios, resolutions, camera controls, native audio, and advanced style and consistency features. + +**Supported Providers & Models:** + +- **[Runway Gen-4](https://research.runwayml.com/gen2/)** (Runway ML): + Runway is a pioneer in text-to-video generation, known for powerful models like Gen-2, Gen-3, and Gen-4. The latest [Gen-4](https://research.runwayml.com/gen2/) model (and Gen-4 Turbo for faster results) supports more realistic motion, greater world consistency, and visual references for character, object, style, and location. Supports 16:9, 9:16, and 1:1 aspect ratios, 5–10 second durations, up to 4K resolution, style presets, and direct upload of reference images for consistent generations. Runway powers creative tools for filmmakers, studios, and content creators worldwide. + +- **[Google Veo](https://deepmind.google/technologies/veo/)** (Google DeepMind): + [Veo](https://deepmind.google/technologies/veo/) is Google’s next-generation video generation model, offering high-quality, native-audio videos up to 1080p and 16 seconds. Supports advanced motion, cinematic effects, and nuanced text understanding. Veo can generate videos with built-in sound—activating native audio as well as silent clips. Options include 16:9 aspect, variable duration, different models (veo-3, veo-3.1), and prompt-based controls. Ideal for storytelling, advertising, research, and ideation. + +- **[Luma Dream Machine](https://lumalabs.ai/dream-machine)** (Luma AI): + [Dream Machine](https://lumalabs.ai/dream-machine) delivers jaw-droppingly realistic and fluid video from text. It incorporates advanced camera control, cinematography prompts, and supports both ray-1 and ray-2 models. Dream Machine supports precise aspect ratios (16:9, 9:16, 1:1), variable durations, and the specification of camera paths for intricate visual direction. Luma is renowned for breakthrough visual fidelity and is backed by leading AI vision researchers. + +- **[MiniMax Hailuo-02](https://minimax.chat/)** (via [Fal.ai](https://fal.ai/)): + [MiniMax Hailuo-02](https://minimax.chat/) is a sophisticated Chinese generative video model, available globally through [Fal.ai](https://fal.ai/). Generate videos up to 16 seconds in landscape or portrait format, with options for prompt optimization to improve clarity and creativity. Pro and standard endpoints available, supporting high resolutions (up to 1920×1080). Well-suited for creative projects needing prompt translation and optimization, commercial storytelling, and rapid prototyping of visual ideas. + +**How to Choose:** +Pick your provider and model based on your needs for quality, speed, duration, audio, cost, and unique features. Runway and Veo offer world-leading realism and cinematic capabilities; Luma excels in fluid motion and camera control; MiniMax is ideal for Chinese-language prompts and offers fast, affordable access. Consider reference support, style presets, audio requirements, and pricing when selecting your tool. + +For more details on features, restrictions, pricing, and model advances, see each provider’s official documentation above. +{/* MANUAL-CONTENT-END */} + + +## Usage Instructions + +Generate high-quality videos from text prompts using leading AI providers. 
Supports multiple models, aspect ratios, resolutions, and provider-specific features like world consistency, camera controls, and audio generation. + + + +## Tools + +### `video_runway` + +Generate videos using Runway Gen-4 with world consistency and visual references + +#### Input + +| Parameter | Type | Required | Description | +| --------- | ---- | -------- | ----------- | +| `provider` | string | Yes | Video provider \(runway\) | +| `apiKey` | string | Yes | Runway API key | +| `model` | string | No | Runway model: gen-4 \(default, higher quality\) or gen-4-turbo \(faster\) | +| `prompt` | string | Yes | Text prompt describing the video to generate | +| `duration` | number | No | Video duration in seconds \(5 or 10, default: 5\) | +| `aspectRatio` | string | No | Aspect ratio: 16:9 \(landscape\), 9:16 \(portrait\), or 1:1 \(square\) | +| `resolution` | string | No | Video resolution \(720p output\). Note: Gen-4 Turbo outputs at 720p natively | +| `visualReference` | json | Yes | Reference image REQUIRED for Gen-4 \(UserFile object\). Gen-4 only supports image-to-video, not text-only generation | + +#### Output + +| Parameter | Type | Description | +| --------- | ---- | ----------- | +| `videoUrl` | string | Generated video URL | +| `videoFile` | json | Video file object with metadata | +| `duration` | number | Video duration in seconds | +| `width` | number | Video width in pixels | +| `height` | number | Video height in pixels | +| `provider` | string | Provider used \(runway\) | +| `model` | string | Model used | +| `jobId` | string | Runway job ID | + +### `video_veo` + +Generate videos using Google Veo 3/3.1 with native audio generation + +#### Input + +| Parameter | Type | Required | Description | +| --------- | ---- | -------- | ----------- | +| `provider` | string | Yes | Video provider \(veo\) | +| `apiKey` | string | Yes | Google Gemini API key | +| `model` | string | No | Veo model: veo-3 \(default, highest quality\), veo-3-fast \(faster\), or veo-3.1 \(latest\) | +| `prompt` | string | Yes | Text prompt describing the video to generate | +| `duration` | number | No | Video duration in seconds \(4, 6, or 8, default: 8\) | +| `aspectRatio` | string | No | Aspect ratio: 16:9 \(landscape\) or 9:16 \(portrait\) | +| `resolution` | string | No | Video resolution: 720p or 1080p \(default: 1080p\) | + +#### Output + +| Parameter | Type | Description | +| --------- | ---- | ----------- | +| `videoUrl` | string | Generated video URL | +| `videoFile` | json | Video file object with metadata | +| `duration` | number | Video duration in seconds | +| `width` | number | Video width in pixels | +| `height` | number | Video height in pixels | +| `provider` | string | Provider used \(veo\) | +| `model` | string | Model used | +| `jobId` | string | Veo job ID | + +### `video_luma` + +Generate videos using Luma Dream Machine with advanced camera controls + +#### Input + +| Parameter | Type | Required | Description | +| --------- | ---- | -------- | ----------- | +| `provider` | string | Yes | Video provider \(luma\) | +| `apiKey` | string | Yes | Luma AI API key | +| `model` | string | No | Luma model: ray-2 \(default\) | +| `prompt` | string | Yes | Text prompt describing the video to generate | +| `duration` | number | No | Video duration in seconds \(5 or 9, default: 5\) | +| `aspectRatio` | string | No | Aspect ratio: 16:9 \(landscape\), 9:16 \(portrait\), or 1:1 \(square\) | +| `resolution` | string | No | Video resolution: 540p, 720p, or 1080p \(default: 1080p\) | +| `cameraControl` | 
json | No | Camera controls as array of concept objects. Format: \[\{ "key": "concept_name" \}\]. Valid keys: truck_left, truck_right, pan_left, pan_right, tilt_up, tilt_down, zoom_in, zoom_out, push_in, pull_out, orbit_left, orbit_right, crane_up, crane_down, static, handheld, and 20+ more predefined options | + +#### Output + +| Parameter | Type | Description | +| --------- | ---- | ----------- | +| `videoUrl` | string | Generated video URL | +| `videoFile` | json | Video file object with metadata | +| `duration` | number | Video duration in seconds | +| `width` | number | Video width in pixels | +| `height` | number | Video height in pixels | +| `provider` | string | Provider used \(luma\) | +| `model` | string | Model used | +| `jobId` | string | Luma job ID | + +### `video_minimax` + +Generate videos using MiniMax Hailuo through MiniMax Platform API with advanced realism and prompt optimization + +#### Input + +| Parameter | Type | Required | Description | +| --------- | ---- | -------- | ----------- | +| `provider` | string | Yes | Video provider \(minimax\) | +| `apiKey` | string | Yes | MiniMax API key from platform.minimax.io | +| `model` | string | No | MiniMax model: hailuo-02 \(default\) | +| `prompt` | string | Yes | Text prompt describing the video to generate | +| `duration` | number | No | Video duration in seconds \(6 or 10, default: 6\) | +| `promptOptimizer` | boolean | No | Enable prompt optimization for better results \(default: true\) | + +#### Output + +| Parameter | Type | Description | +| --------- | ---- | ----------- | +| `videoUrl` | string | Generated video URL | +| `videoFile` | json | Video file object with metadata | +| `duration` | number | Video duration in seconds | +| `width` | number | Video width in pixels | +| `height` | number | Video height in pixels | +| `provider` | string | Provider used \(minimax\) | +| `model` | string | Model used | +| `jobId` | string | MiniMax job ID | + +### `video_falai` + +Generate videos using Fal.ai platform with access to multiple models including Veo 3.1, Sora 2, Kling 2.5, MiniMax Hailuo, and more + +#### Input + +| Parameter | Type | Required | Description | +| --------- | ---- | -------- | ----------- | +| `provider` | string | Yes | Video provider \(falai\) | +| `apiKey` | string | Yes | Fal.ai API key | +| `model` | string | Yes | Fal.ai model: veo-3.1 \(Google Veo 3.1\), sora-2 \(OpenAI Sora 2\), kling-2.5-turbo-pro \(Kling 2.5 Turbo Pro\), kling-2.1-pro \(Kling 2.1 Master\), minimax-hailuo-2.3-pro \(MiniMax Hailuo Pro\), minimax-hailuo-2.3-standard \(MiniMax Hailuo Standard\), wan-2.1 \(WAN T2V\), ltxv-0.9.8 \(LTXV 13B\) | +| `prompt` | string | Yes | Text prompt describing the video to generate | +| `duration` | number | No | Video duration in seconds \(varies by model\) | +| `aspectRatio` | string | No | Aspect ratio \(varies by model\): 16:9, 9:16, 1:1 | +| `resolution` | string | No | Video resolution \(varies by model\): 540p, 720p, 1080p | +| `promptOptimizer` | boolean | No | Enable prompt optimization for MiniMax models \(default: true\) | + +#### Output + +| Parameter | Type | Description | +| --------- | ---- | ----------- | +| `videoUrl` | string | Generated video URL | +| `videoFile` | json | Video file object with metadata | +| `duration` | number | Video duration in seconds | +| `width` | number | Video width in pixels | +| `height` | number | Video height in pixels | +| `provider` | string | Provider used \(falai\) | +| `model` | string | Model used | +| `jobId` | string | Job ID | + + + +## 
Notes + +- Category: `tools` +- Type: `video_generator` diff --git a/apps/sim/app/api/proxy/stt/route.ts b/apps/sim/app/api/proxy/stt/route.ts index 7e30e75647..3c3813516c 100644 --- a/apps/sim/app/api/proxy/stt/route.ts +++ b/apps/sim/app/api/proxy/stt/route.ts @@ -12,7 +12,7 @@ export const dynamic = 'force-dynamic' export const maxDuration = 300 // 5 minutes for large files interface SttRequestBody { - provider: 'whisper' | 'deepgram' | 'elevenlabs' + provider: 'whisper' | 'deepgram' | 'elevenlabs' | 'assemblyai' | 'gemini' apiKey: string model?: string audioFile?: UserFile | UserFile[] @@ -22,6 +22,14 @@ interface SttRequestBody { timestamps?: 'none' | 'sentence' | 'word' diarization?: boolean translateToEnglish?: boolean + // Whisper-specific options + prompt?: string + temperature?: number + // AssemblyAI-specific options + sentiment?: boolean + entityDetection?: boolean + piiRedaction?: boolean + summarization?: boolean workspaceId?: string workflowId?: string executionId?: string @@ -38,7 +46,19 @@ export async function POST(request: NextRequest) { } const body: SttRequestBody = await request.json() - const { provider, apiKey, model, language, timestamps, diarization, translateToEnglish } = body + const { + provider, + apiKey, + model, + language, + timestamps, + diarization, + translateToEnglish, + sentiment, + entityDetection, + piiRedaction, + summarization, + } = body if (!provider || !apiKey) { return NextResponse.json( @@ -115,6 +135,9 @@ export async function POST(request: NextRequest) { let detectedLanguage: string | undefined let duration: number | undefined let confidence: number | undefined + let sentimentResults: any[] | undefined + let entities: any[] | undefined + let summary: string | undefined try { if (provider === 'whisper') { @@ -124,7 +147,9 @@ export async function POST(request: NextRequest) { language, timestamps, translateToEnglish, - model + model, + body.prompt, + body.temperature ) transcript = result.transcript segments = result.segments @@ -156,6 +181,41 @@ export async function POST(request: NextRequest) { segments = result.segments detectedLanguage = result.language duration = result.duration + } else if (provider === 'assemblyai') { + const result = await transcribeWithAssemblyAI( + audioBuffer, + apiKey, + language, + timestamps, + diarization, + sentiment, + entityDetection, + piiRedaction, + summarization, + model + ) + transcript = result.transcript + segments = result.segments + detectedLanguage = result.language + duration = result.duration + confidence = result.confidence + sentimentResults = result.sentiment + entities = result.entities + summary = result.summary + } else if (provider === 'gemini') { + const result = await transcribeWithGemini( + audioBuffer, + apiKey, + audioMimeType, + language, + timestamps, + model + ) + transcript = result.transcript + segments = result.segments + detectedLanguage = result.language + duration = result.duration + confidence = result.confidence } else { return NextResponse.json({ error: `Unknown provider: ${provider}` }, { status: 400 }) } @@ -173,6 +233,9 @@ export async function POST(request: NextRequest) { language: detectedLanguage, duration, confidence, + sentiment: sentimentResults, + entities, + summary, }) } catch (error) { logger.error(`[${requestId}] STT proxy error:`, error) @@ -187,7 +250,9 @@ async function transcribeWithWhisper( language?: string, timestamps?: 'none' | 'sentence' | 'word', translate?: boolean, - model?: string + model?: string, + prompt?: string, + temperature?: number ): 
Promise<{ transcript: string segments?: TranscriptSegment[] @@ -204,12 +269,20 @@ async function transcribeWithWhisper( formData.append('language', language) } + if (prompt) { + formData.append('prompt', prompt) + } + + if (temperature !== undefined) { + formData.append('temperature', temperature.toString()) + } + if (timestamps === 'word') { formData.append('response_format', 'verbose_json') - formData.append('timestamp_granularities[]', 'word') + formData.append('timestamp_granularities', 'word') } else if (timestamps === 'sentence') { formData.append('response_format', 'verbose_json') - formData.append('timestamp_granularities[]', 'segment') + formData.append('timestamp_granularities', 'segment') } const endpoint = translate ? 'translations' : 'transcriptions' @@ -271,9 +344,11 @@ async function transcribeWithDeepgram( if (language && language !== 'auto') { params.append('language', language) + } else if (language === 'auto') { + params.append('detect_language', 'true') } - if (timestamps !== 'none') { + if (timestamps === 'sentence') { params.append('utterances', 'true') } @@ -308,7 +383,7 @@ async function transcribeWithDeepgram( const confidence = result.confidence let segments: TranscriptSegment[] | undefined - if (timestamps !== 'none' && result.words) { + if (result.words && timestamps === 'word') { segments = result.words.map((word: any) => ({ text: word.word, start: word.start, @@ -316,6 +391,14 @@ async function transcribeWithDeepgram( speaker: word.speaker !== undefined ? `Speaker ${word.speaker}` : undefined, confidence: word.confidence, })) + } else if (data.results?.utterances && timestamps === 'sentence') { + segments = data.results.utterances.map((utterance: any) => ({ + text: utterance.transcript, + start: utterance.start, + end: utterance.end, + speaker: utterance.speaker !== undefined ? `Speaker ${utterance.speaker}` : undefined, + confidence: utterance.confidence, + })) } return { @@ -345,7 +428,14 @@ async function transcribeWithElevenLabs( formData.append('model_id', model || 'scribe_v1') if (language && language !== 'auto') { - formData.append('language', language) + formData.append('language_code', language) + } + + if (timestamps && timestamps !== 'none') { + const granularity = timestamps === 'word' ? 'word' : 'word' + formData.append('timestamps_granularity', granularity) + } else { + formData.append('timestamps_granularity', 'word') } const response = await fetch('https://api.elevenlabs.io/v1/speech-to-text', { @@ -367,9 +457,269 @@ async function transcribeWithElevenLabs( const data = await response.json() + const words = data.words || [] + const segments: TranscriptSegment[] = words + .filter((w: any) => w.type === 'word') + .map((w: any) => ({ + text: w.text, + start: w.start, + end: w.end, + speaker: w.speaker_id, + })) + return { transcript: data.text || '', - language: data.language, - duration: data.duration, + segments: segments.length > 0 ? 
segments : undefined, + language: data.language_code, + duration: undefined, // ElevenLabs doesn't return duration in response + } +} + +async function transcribeWithAssemblyAI( + audioBuffer: Buffer, + apiKey: string, + language?: string, + timestamps?: 'none' | 'sentence' | 'word', + diarization?: boolean, + sentiment?: boolean, + entityDetection?: boolean, + piiRedaction?: boolean, + summarization?: boolean, + model?: string +): Promise<{ + transcript: string + segments?: TranscriptSegment[] + language?: string + duration?: number + confidence?: number + sentiment?: any[] + entities?: any[] + summary?: string +}> { + const uploadResponse = await fetch('https://api.assemblyai.com/v2/upload', { + method: 'POST', + headers: { + authorization: apiKey, + 'content-type': 'application/octet-stream', + }, + body: new Uint8Array(audioBuffer), + }) + + if (!uploadResponse.ok) { + const error = await uploadResponse.json() + throw new Error(`AssemblyAI upload error: ${error.error || JSON.stringify(error)}`) + } + + const { upload_url } = await uploadResponse.json() + + const transcriptRequest: any = { + audio_url: upload_url, + } + + if (model === 'best' || model === 'nano') { + transcriptRequest.speech_model = model + } + + if (language && language !== 'auto') { + transcriptRequest.language_code = language + } else if (language === 'auto') { + transcriptRequest.language_detection = true + } + + if (diarization) { + transcriptRequest.speaker_labels = true + } + + if (sentiment) { + transcriptRequest.sentiment_analysis = true + } + + if (entityDetection) { + transcriptRequest.entity_detection = true + } + + if (piiRedaction) { + transcriptRequest.redact_pii = true + transcriptRequest.redact_pii_policies = [ + 'us_social_security_number', + 'email_address', + 'phone_number', + ] + } + + if (summarization) { + transcriptRequest.summarization = true + transcriptRequest.summary_model = 'informative' + transcriptRequest.summary_type = 'bullets' + } + + const transcriptResponse = await fetch('https://api.assemblyai.com/v2/transcript', { + method: 'POST', + headers: { + authorization: apiKey, + 'content-type': 'application/json', + }, + body: JSON.stringify(transcriptRequest), + }) + + if (!transcriptResponse.ok) { + const error = await transcriptResponse.json() + throw new Error(`AssemblyAI transcript error: ${error.error || JSON.stringify(error)}`) + } + + const { id } = await transcriptResponse.json() + + let transcript: any + let attempts = 0 + const maxAttempts = 60 // 5 minutes with 5-second intervals + + while (attempts < maxAttempts) { + const statusResponse = await fetch(`https://api.assemblyai.com/v2/transcript/${id}`, { + headers: { + authorization: apiKey, + }, + }) + + if (!statusResponse.ok) { + const error = await statusResponse.json() + throw new Error(`AssemblyAI status error: ${error.error || JSON.stringify(error)}`) + } + + transcript = await statusResponse.json() + + if (transcript.status === 'completed') { + break + } + if (transcript.status === 'error') { + throw new Error(`AssemblyAI transcription failed: ${transcript.error}`) + } + + await new Promise((resolve) => setTimeout(resolve, 5000)) + attempts++ + } + + if (transcript.status !== 'completed') { + throw new Error('AssemblyAI transcription timed out') + } + + let segments: TranscriptSegment[] | undefined + if (timestamps !== 'none' && transcript.words) { + segments = transcript.words.map((word: any) => ({ + text: word.text, + start: word.start / 1000, + end: word.end / 1000, + speaker: word.speaker ? 
`Speaker ${word.speaker}` : undefined, + confidence: word.confidence, + })) + } + + const result: any = { + transcript: transcript.text, + segments, + language: transcript.language_code, + duration: transcript.audio_duration, + confidence: transcript.confidence, + } + + if (sentiment && transcript.sentiment_analysis_results) { + result.sentiment = transcript.sentiment_analysis_results + } + + if (entityDetection && transcript.entities) { + result.entities = transcript.entities + } + + if (summarization && transcript.summary) { + result.summary = transcript.summary + } + + return result +} + +async function transcribeWithGemini( + audioBuffer: Buffer, + apiKey: string, + mimeType: string, + language?: string, + timestamps?: 'none' | 'sentence' | 'word', + model?: string +): Promise<{ + transcript: string + segments?: TranscriptSegment[] + language?: string + duration?: number + confidence?: number +}> { + const modelName = model || 'gemini-2.5-flash' + + const estimatedSize = audioBuffer.length * 1.34 + if (estimatedSize > 20 * 1024 * 1024) { + throw new Error('Audio file exceeds 20MB limit for inline data') + } + + const base64Audio = audioBuffer.toString('base64') + + const languagePrompt = language && language !== 'auto' ? ` The audio is in ${language}.` : '' + + const timestampPrompt = + timestamps === 'sentence' || timestamps === 'word' + ? ' Include timestamps in MM:SS format for each sentence.' + : '' + + const requestBody = { + contents: [ + { + parts: [ + { + inline_data: { + mime_type: mimeType, + data: base64Audio, + }, + }, + { + text: `Please transcribe this audio file.${languagePrompt}${timestampPrompt} Provide the full transcript.`, + }, + ], + }, + ], + } + + const response = await fetch( + `https://generativelanguage.googleapis.com/v1beta/models/${modelName}:generateContent?key=${apiKey}`, + { + method: 'POST', + headers: { + 'Content-Type': 'application/json', + }, + body: JSON.stringify(requestBody), + } + ) + + if (!response.ok) { + const error = await response.json() + if (response.status === 404) { + throw new Error( + `Model not found: ${modelName}. Use gemini-3-pro-preview, gemini-2.5-pro, gemini-2.5-flash, gemini-2.5-flash-lite, or gemini-2.0-flash-exp` + ) + } + const errorMessage = error.error?.message || JSON.stringify(error) + throw new Error(`Gemini API error: ${errorMessage}`) + } + + const data = await response.json() + + if (!data.candidates?.[0]?.content?.parts?.[0]?.text) { + const candidate = data.candidates?.[0] + if (candidate?.finishReason === 'SAFETY') { + throw new Error('Content was blocked by safety filters') + } + throw new Error('Invalid response structure from Gemini API') + } + + const transcript = data.candidates[0].content.parts[0].text + + return { + transcript, + language: language !== 'auto' ? 
language : undefined, } } diff --git a/apps/sim/app/api/proxy/tts/unified/route.ts b/apps/sim/app/api/proxy/tts/unified/route.ts new file mode 100644 index 0000000000..dadfe50f50 --- /dev/null +++ b/apps/sim/app/api/proxy/tts/unified/route.ts @@ -0,0 +1,808 @@ +import type { NextRequest } from 'next/server' +import { NextResponse } from 'next/server' +import { checkHybridAuth } from '@/lib/auth/hybrid' +import { createLogger } from '@/lib/logs/console/logger' +import { StorageService } from '@/lib/uploads' +import { getBaseUrl } from '@/lib/urls/utils' +import type { + AzureTtsParams, + CartesiaTtsParams, + DeepgramTtsParams, + ElevenLabsTtsUnifiedParams, + GoogleTtsParams, + OpenAiTtsParams, + PlayHtTtsParams, + TtsProvider, + TtsResponse, +} from '@/tools/tts/types' +import { getFileExtension, getMimeType } from '@/tools/tts/types' + +const logger = createLogger('TtsUnifiedProxyAPI') + +export const dynamic = 'force-dynamic' +export const maxDuration = 60 // 1 minute + +interface TtsUnifiedRequestBody { + provider: TtsProvider + text: string + apiKey: string + + // OpenAI specific + model?: 'tts-1' | 'tts-1-hd' | 'gpt-4o-mini-tts' + voice?: string + responseFormat?: 'mp3' | 'opus' | 'aac' | 'flac' | 'wav' | 'pcm' + speed?: number + + // Deepgram specific + encoding?: 'linear16' | 'mp3' | 'opus' | 'aac' | 'flac' | 'mulaw' | 'alaw' + sampleRate?: number + bitRate?: number + container?: 'none' | 'wav' | 'ogg' + + // ElevenLabs specific + voiceId?: string + modelId?: string + stability?: number + similarityBoost?: number + style?: number | string + useSpeakerBoost?: boolean + + // Cartesia specific + language?: string + outputFormat?: object + emotion?: string[] + + // Google Cloud specific + languageCode?: string + gender?: 'MALE' | 'FEMALE' | 'NEUTRAL' + audioEncoding?: 'LINEAR16' | 'MP3' | 'OGG_OPUS' | 'MULAW' | 'ALAW' + speakingRate?: number + pitch?: number + volumeGainDb?: number + sampleRateHertz?: number + effectsProfileId?: string[] + + // Azure specific + region?: string + rate?: string + styleDegree?: number + role?: string + + // PlayHT specific + userId?: string + quality?: 'draft' | 'standard' | 'premium' + temperature?: number + voiceGuidance?: number + textGuidance?: number + + // Execution context + workspaceId?: string + workflowId?: string + executionId?: string +} + +export async function POST(request: NextRequest) { + const requestId = crypto.randomUUID() + logger.info(`[${requestId}] TTS unified request started`) + + try { + const authResult = await checkHybridAuth(request, { requireWorkflowId: false }) + if (!authResult.success) { + logger.error('Authentication failed for TTS unified proxy:', authResult.error) + return NextResponse.json({ error: 'Unauthorized' }, { status: 401 }) + } + + const body: TtsUnifiedRequestBody = await request.json() + const { provider, text, apiKey, workspaceId, workflowId, executionId } = body + + if (!provider || !text || !apiKey) { + return NextResponse.json( + { error: 'Missing required fields: provider, text, and apiKey' }, + { status: 400 } + ) + } + + const hasExecutionContext = workspaceId && workflowId && executionId + logger.info(`[${requestId}] Processing TTS with ${provider}`, { + hasExecutionContext, + textLength: text.length, + }) + + let audioBuffer: Buffer + let format: string + let mimeType: string + let duration: number | undefined + + try { + if (provider === 'openai') { + const result = await synthesizeWithOpenAi({ + text, + apiKey, + model: body.model, + voice: body.voice as OpenAiTtsParams['voice'], + responseFormat: 
body.responseFormat, + speed: body.speed, + }) + audioBuffer = result.audioBuffer + format = result.format + mimeType = result.mimeType + } else if (provider === 'deepgram') { + const result = await synthesizeWithDeepgram({ + text, + apiKey, + model: body.voice, + encoding: body.encoding, + sampleRate: body.sampleRate, + bitRate: body.bitRate, + container: body.container, + }) + audioBuffer = result.audioBuffer + format = result.format + mimeType = result.mimeType + duration = result.duration + } else if (provider === 'elevenlabs') { + if (!body.voiceId) { + return NextResponse.json( + { error: 'voiceId is required for ElevenLabs provider' }, + { status: 400 } + ) + } + const result = await synthesizeWithElevenLabs({ + text, + apiKey, + voiceId: body.voiceId, + modelId: body.modelId, + stability: body.stability, + similarityBoost: body.similarityBoost, + style: body.style as number | undefined, + useSpeakerBoost: body.useSpeakerBoost, + }) + audioBuffer = result.audioBuffer + format = result.format + mimeType = result.mimeType + } else if (provider === 'cartesia') { + const result = await synthesizeWithCartesia({ + text, + apiKey, + modelId: body.modelId, + voice: body.voice, + language: body.language, + outputFormat: body.outputFormat, + speed: body.speed, + emotion: body.emotion, + }) + audioBuffer = result.audioBuffer + format = result.format + mimeType = result.mimeType + } else if (provider === 'google') { + const result = await synthesizeWithGoogle({ + text, + apiKey, + voiceId: body.voiceId, + languageCode: body.languageCode, + gender: body.gender, + audioEncoding: body.audioEncoding, + speakingRate: body.speakingRate, + pitch: body.pitch, + volumeGainDb: body.volumeGainDb, + sampleRateHertz: body.sampleRateHertz, + effectsProfileId: body.effectsProfileId, + }) + audioBuffer = result.audioBuffer + format = result.format + mimeType = result.mimeType + } else if (provider === 'azure') { + const result = await synthesizeWithAzure({ + text, + apiKey, + voiceId: body.voiceId, + region: body.region, + outputFormat: body.outputFormat as AzureTtsParams['outputFormat'], + rate: body.rate, + pitch: body.pitch as string | undefined, + style: body.style as string | undefined, + styleDegree: body.styleDegree, + role: body.role, + }) + audioBuffer = result.audioBuffer + format = result.format + mimeType = result.mimeType + } else if (provider === 'playht') { + if (!body.userId) { + return NextResponse.json( + { error: 'userId is required for PlayHT provider' }, + { status: 400 } + ) + } + const result = await synthesizeWithPlayHT({ + text, + apiKey, + userId: body.userId, + voice: body.voice, + quality: body.quality, + outputFormat: typeof body.outputFormat === 'string' ? body.outputFormat : undefined, + speed: body.speed, + temperature: body.temperature, + voiceGuidance: body.voiceGuidance, + textGuidance: body.textGuidance, + sampleRate: body.sampleRate, + }) + audioBuffer = result.audioBuffer + format = result.format + mimeType = result.mimeType + } else { + return NextResponse.json({ error: `Unknown provider: ${provider}` }, { status: 400 }) + } + } catch (error) { + logger.error(`[${requestId}] TTS synthesis failed:`, error) + const errorMessage = error instanceof Error ? 
error.message : 'TTS synthesis failed' + return NextResponse.json({ error: errorMessage }, { status: 500 }) + } + + const timestamp = Date.now() + const fileExtension = getFileExtension(format) + const fileName = `tts-${provider}-${timestamp}.${fileExtension}` + + if (hasExecutionContext) { + const { uploadExecutionFile } = await import('@/lib/uploads/contexts/execution') + + const userFile = await uploadExecutionFile( + { workspaceId, workflowId, executionId }, + audioBuffer, + fileName, + mimeType, + authResult.userId + ) + + logger.info(`[${requestId}] TTS audio stored in execution context:`, { + executionId, + fileName, + size: userFile.size, + }) + + const response: TtsResponse = { + audioUrl: userFile.url, + audioFile: userFile, + characterCount: text.length, + format, + provider, + } + + if (duration) { + response.duration = duration + } + + return NextResponse.json(response) + } + + // Chat UI / copilot usage - no execution context + const fileInfo = await StorageService.uploadFile({ + file: audioBuffer, + fileName, + contentType: mimeType, + context: 'copilot', + }) + + const audioUrl = `${getBaseUrl()}${fileInfo.path}` + + logger.info(`[${requestId}] TTS audio stored in copilot context:`, { + fileName, + size: fileInfo.size, + }) + + const response: TtsResponse = { + audioUrl, + characterCount: text.length, + format, + provider, + } + + if (duration) { + response.duration = duration + } + + return NextResponse.json(response) + } catch (error) { + logger.error(`[${requestId}] TTS unified proxy error:`, error) + const errorMessage = error instanceof Error ? error.message : 'Unknown error' + return NextResponse.json({ error: errorMessage }, { status: 500 }) + } +} + +async function synthesizeWithOpenAi( + params: OpenAiTtsParams +): Promise<{ audioBuffer: Buffer; format: string; mimeType: string }> { + const { text, apiKey, model = 'tts-1', responseFormat = 'mp3', speed = 1.0 } = params + const voice = (params.voice || 'alloy') as OpenAiTtsParams['voice'] + + const response = await fetch('https://api.openai.com/v1/audio/speech', { + method: 'POST', + headers: { + Authorization: `Bearer ${apiKey}`, + 'Content-Type': 'application/json', + }, + body: JSON.stringify({ + model, + voice, + input: text, + response_format: responseFormat, + speed: Math.max(0.25, Math.min(4.0, speed)), + }), + }) + + if (!response.ok) { + const error = await response.json().catch(() => ({})) + const errorMessage = error.error?.message || error.message || response.statusText + throw new Error(`OpenAI TTS API error: ${errorMessage}`) + } + + const arrayBuffer = await response.arrayBuffer() + const audioBuffer = Buffer.from(arrayBuffer) + const mimeType = getMimeType(responseFormat) + + return { + audioBuffer, + format: responseFormat, + mimeType, + } +} + +async function synthesizeWithDeepgram( + params: DeepgramTtsParams +): Promise<{ audioBuffer: Buffer; format: string; mimeType: string; duration?: number }> { + const { + text, + apiKey, + model = 'aura-asteria-en', + encoding = 'mp3', + sampleRate, + bitRate, + container, + } = params + + const queryParams = new URLSearchParams({ + model: model, + encoding: encoding, + }) + + if (sampleRate && encoding === 'linear16') { + queryParams.append('sample_rate', sampleRate.toString()) + } + + if (bitRate) { + queryParams.append('bit_rate', bitRate.toString()) + } + + if (container && container !== 'none') { + queryParams.append('container', container) + } + + const response = await fetch(`https://api.deepgram.com/v1/speak?${queryParams.toString()}`, { + method: 
'POST', + headers: { + Authorization: `Token ${apiKey}`, + 'Content-Type': 'application/json', + }, + body: JSON.stringify({ text }), + }) + + if (!response.ok) { + const error = await response.json().catch(() => ({})) + const errorMessage = error.err_msg || error.message || response.statusText + throw new Error(`Deepgram TTS API error: ${errorMessage}`) + } + + const arrayBuffer = await response.arrayBuffer() + const audioBuffer = Buffer.from(arrayBuffer) + + let finalFormat: string = encoding + if (container === 'wav') { + finalFormat = 'wav' + } else if (container === 'ogg') { + finalFormat = 'ogg' + } + + const mimeType = getMimeType(finalFormat) + + return { + audioBuffer, + format: finalFormat, + mimeType, + } +} + +async function synthesizeWithElevenLabs( + params: ElevenLabsTtsUnifiedParams +): Promise<{ audioBuffer: Buffer; format: string; mimeType: string }> { + const { + text, + apiKey, + voiceId, + modelId = 'eleven_turbo_v2_5', + stability = 0.5, + similarityBoost = 0.8, + style, + useSpeakerBoost = true, + } = params + + const voiceSettings: any = { + stability: Math.max(0, Math.min(1, stability)), + similarity_boost: Math.max(0, Math.min(1, similarityBoost)), + use_speaker_boost: useSpeakerBoost, + } + + if (style !== undefined) { + voiceSettings.style = Math.max(0, Math.min(1, style)) + } + + const response = await fetch(`https://api.elevenlabs.io/v1/text-to-speech/${voiceId}`, { + method: 'POST', + headers: { + Accept: 'audio/mpeg', + 'Content-Type': 'application/json', + 'xi-api-key': apiKey, + }, + body: JSON.stringify({ + text, + model_id: modelId, + voice_settings: voiceSettings, + }), + }) + + if (!response.ok) { + const error = await response.json().catch(() => ({})) + const errorMessage = + typeof error.detail === 'string' + ? 
error.detail + : error.detail?.message || error.message || response.statusText + throw new Error(`ElevenLabs TTS API error: ${errorMessage}`) + } + + const arrayBuffer = await response.arrayBuffer() + const audioBuffer = Buffer.from(arrayBuffer) + + return { + audioBuffer, + format: 'mp3', + mimeType: 'audio/mpeg', + } +} + +async function synthesizeWithCartesia( + params: Partial +): Promise<{ audioBuffer: Buffer; format: string; mimeType: string }> { + const { + text, + apiKey, + modelId = 'sonic-3', + voice, + language = 'en', + outputFormat, + speed, + emotion, + } = params + + if (!text || !apiKey) { + throw new Error('text and apiKey are required for Cartesia') + } + + const requestBody: Record = { + model_id: modelId, + transcript: text, + language, + } + + if (voice) { + requestBody.voice = { + mode: 'id', + id: voice, + } + } + + const generationConfig: Record = {} + if (speed !== undefined) generationConfig.speed = speed + if (emotion !== undefined) generationConfig.emotion = emotion + if (Object.keys(generationConfig).length > 0) { + requestBody.generation_config = generationConfig + } + + if (outputFormat && typeof outputFormat === 'object') { + requestBody.output_format = outputFormat + } + + if (!requestBody.output_format) { + requestBody.output_format = { + container: 'wav', + encoding: 'pcm_s16le', + sample_rate: 24000, + } + } + + logger.info('Cartesia API request:', { + model_id: requestBody.model_id, + has_voice: !!requestBody.voice, + language: requestBody.language, + output_format: requestBody.output_format, + has_generation_config: !!requestBody.generation_config, + }) + + const response = await fetch('https://api.cartesia.ai/tts/bytes', { + method: 'POST', + headers: { + Authorization: `Bearer ${apiKey}`, + 'Content-Type': 'application/json', + 'Cartesia-Version': '2025-04-16', + }, + body: JSON.stringify(requestBody), + }) + + if (!response.ok) { + const error = await response.json().catch(() => ({})) + const errorMessage = error.error || error.message || response.statusText + const errorDetail = error.detail || '' + logger.error('Cartesia API error details:', { + status: response.status, + error: errorMessage, + detail: errorDetail, + requestBody: JSON.stringify(requestBody), + }) + throw new Error( + `Cartesia TTS API error: ${errorMessage}${errorDetail ? ` - ${errorDetail}` : ''}` + ) + } + + const arrayBuffer = await response.arrayBuffer() + const audioBuffer = Buffer.from(arrayBuffer) + + const format = + outputFormat && typeof outputFormat === 'object' && 'container' in outputFormat + ? 
(outputFormat.container as string) + : 'mp3' + const mimeType = getMimeType(format) + + return { + audioBuffer, + format, + mimeType, + } +} + +async function synthesizeWithGoogle( + params: Partial +): Promise<{ audioBuffer: Buffer; format: string; mimeType: string }> { + const { + text, + apiKey, + voiceId, + languageCode, + gender, + audioEncoding = 'MP3', + speakingRate = 1.0, + pitch = 0.0, + volumeGainDb, + sampleRateHertz, + effectsProfileId, + } = params + + if (!text || !apiKey || !languageCode) { + throw new Error('text, apiKey, and languageCode are required for Google Cloud TTS') + } + + const clampedSpeakingRate = Math.max(0.25, Math.min(2.0, speakingRate)) + + const audioConfig: Record = { + audioEncoding, + speakingRate: clampedSpeakingRate, + pitch, + } + + if (volumeGainDb !== undefined) { + audioConfig.volumeGainDb = volumeGainDb + } + if (sampleRateHertz) { + audioConfig.sampleRateHertz = sampleRateHertz + } + if (effectsProfileId && effectsProfileId.length > 0) { + audioConfig.effectsProfileId = effectsProfileId + } + + // Build voice config based on what's provided + const voice: Record = { + languageCode, + } + + // If voiceId is provided, use it (it takes precedence over gender) + if (voiceId) { + voice.name = voiceId + } + + // Only include gender if specified (don't default to NEUTRAL as it's not supported) + if (gender) { + voice.ssmlGender = gender + } + + // If neither voiceId nor gender is provided, default to a specific voice + if (!voiceId && !gender) { + voice.name = 'en-US-Neural2-C' + } + + const requestBody: Record = { + input: { text }, + voice, + audioConfig, + } + + const response = await fetch( + `https://texttospeech.googleapis.com/v1/text:synthesize?key=${apiKey}`, + { + method: 'POST', + headers: { + 'Content-Type': 'application/json', + }, + body: JSON.stringify(requestBody), + } + ) + + if (!response.ok) { + const error = await response.json().catch(() => ({})) + const errorMessage = error.error?.message || error.message || response.statusText + throw new Error(`Google Cloud TTS API error: ${errorMessage}`) + } + + const data = await response.json() + const audioContent = data.audioContent + + if (!audioContent) { + throw new Error('No audio content returned from Google Cloud TTS') + } + + const audioBuffer = Buffer.from(audioContent, 'base64') + + const format = audioEncoding.toLowerCase().replace('_', '') + const mimeType = getMimeType(format) + + return { + audioBuffer, + format, + mimeType, + } +} + +async function synthesizeWithAzure( + params: Partial +): Promise<{ audioBuffer: Buffer; format: string; mimeType: string }> { + const { + text, + apiKey, + voiceId = 'en-US-JennyNeural', + region = 'eastus', + outputFormat = 'audio-24khz-96kbitrate-mono-mp3', + rate, + pitch, + style, + styleDegree, + role, + } = params + + if (!text || !apiKey) { + throw new Error('text and apiKey are required for Azure TTS') + } + + let ssml = `` + + if (style) { + ssml += ` +): Promise<{ audioBuffer: Buffer; format: string; mimeType: string }> { + const { + text, + apiKey, + userId, + voice, + quality = 'standard', + outputFormat = 'mp3', + speed = 1.0, + temperature, + voiceGuidance, + textGuidance, + sampleRate, + } = params + + if (!text || !apiKey || !userId) { + throw new Error('text, apiKey, and userId are required for PlayHT') + } + + const requestBody: Record = { + text, + quality, + output_format: outputFormat, + speed, + } + + if (voice) requestBody.voice = voice + if (temperature !== undefined) requestBody.temperature = temperature + if 
(voiceGuidance !== undefined) requestBody.voice_guidance = voiceGuidance
+  if (textGuidance !== undefined) requestBody.text_guidance = textGuidance
+  if (sampleRate) requestBody.sample_rate = sampleRate
+
+  const response = await fetch('https://api.play.ht/api/v2/tts/stream', {
+    method: 'POST',
+    headers: {
+      AUTHORIZATION: apiKey,
+      'X-USER-ID': userId,
+      'Content-Type': 'application/json',
+    },
+    body: JSON.stringify(requestBody),
+  })
+
+  if (!response.ok) {
+    const error = await response.json().catch(() => ({}))
+    const errorMessage = error.error_message || error.message || response.statusText
+    throw new Error(`PlayHT TTS API error: ${errorMessage}`)
+  }
+
+  const arrayBuffer = await response.arrayBuffer()
+  const audioBuffer = Buffer.from(arrayBuffer)
+
+  const format = outputFormat || 'mp3'
+  const mimeType = getMimeType(format)
+
+  return {
+    audioBuffer,
+    format,
+    mimeType,
+  }
+}
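// Illustrative sketch, not part of the diff: a minimal example of how a caller might invoke the
// unified TTS proxy added above. The endpoint path ('/api/proxy/tts') and the exact
// request/response field names are inferred from the route code in this hunk, so treat them as
// assumptions rather than a documented contract.
type TtsProxyRequest = {
  provider: 'openai' | 'deepgram' | 'elevenlabs' | 'cartesia' | 'google' | 'azure' | 'playht'
  apiKey: string
  text: string
  voiceId?: string // required for ElevenLabs; also read by the Google and Azure branches
  userId?: string // required for PlayHT
  // When all three context fields are present, the route stores the audio with the workflow
  // execution; otherwise it falls back to the copilot storage context.
  workspaceId?: string
  workflowId?: string
  executionId?: string
}

async function synthesizeViaProxy(req: TtsProxyRequest) {
  const res = await fetch('/api/proxy/tts', {
    method: 'POST',
    headers: { 'Content-Type': 'application/json' },
    body: JSON.stringify(req),
  })
  if (!res.ok) {
    const { error } = await res.json().catch(() => ({ error: res.statusText }))
    throw new Error(`TTS proxy request failed: ${error}`)
  }
  // Mirrors the TtsResponse shape built by the route: audioUrl, characterCount, format,
  // provider, plus optional duration and audioFile.
  return (await res.json()) as { audioUrl: string; characterCount: number; format: string }
}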
diff --git a/apps/sim/app/api/proxy/video/route.ts b/apps/sim/app/api/proxy/video/route.ts
new file mode 100644
index 0000000000..fe3bf433f1
--- /dev/null
+++ b/apps/sim/app/api/proxy/video/route.ts
@@ -0,0 +1,950 @@
+import { type NextRequest, NextResponse } from 'next/server'
+import { checkHybridAuth } from '@/lib/auth/hybrid'
+import { createLogger } from '@/lib/logs/console/logger'
+import { downloadFileFromStorage } from '@/lib/uploads/utils/file-utils.server'
+import type { UserFile } from '@/executor/types'
+import type { VideoRequestBody } from '@/tools/video/types'
+
+const logger = createLogger('VideoProxyAPI')
+
+export const dynamic = 'force-dynamic'
+export const maxDuration = 600 // 10 minutes for video generation
+
+export async function POST(request: NextRequest) {
+  const requestId = crypto.randomUUID()
+  logger.info(`[${requestId}] Video generation request started`)
+
+  try {
+    const authResult = await checkHybridAuth(request, { requireWorkflowId: false })
+    if (!authResult.success) {
+      return NextResponse.json({ error: 'Unauthorized' }, { status: 401 })
+    }
+
+    const body: VideoRequestBody = await request.json()
+    const { provider, apiKey, model, prompt, duration, aspectRatio, resolution } = body
+
+    if (!provider || !apiKey || !prompt) {
+      return NextResponse.json(
+        { error: 'Missing required fields: provider, apiKey, and prompt' },
+        { status: 400 }
+      )
+    }
+
+    const validProviders = ['runway', 'veo', 'luma', 'minimax', 'falai']
+    if (!validProviders.includes(provider)) {
+      return NextResponse.json(
+        { error: `Invalid provider. Must be one of: ${validProviders.join(', ')}` },
+        { status: 400 }
+      )
+    }
+
+    if (prompt.length < 3 || prompt.length > 2000) {
+      return NextResponse.json(
+        { error: 'Prompt must be between 3 and 2000 characters' },
+        { status: 400 }
+      )
+    }
+
+    // Validate duration (provider-specific constraints)
+    if (provider === 'veo') {
+      if (duration !== undefined && ![4, 6, 8].includes(duration)) {
+        return NextResponse.json(
+          { error: 'Duration must be 4, 6, or 8 seconds for Veo' },
+          { status: 400 }
+        )
+      }
+    } else if (provider === 'minimax') {
+      if (duration !== undefined && ![6, 10].includes(duration)) {
+        return NextResponse.json(
+          { error: 'Duration must be 6 or 10 seconds for MiniMax' },
+          { status: 400 }
+        )
+      }
+    } else if (provider !== 'falai' && duration !== undefined && (duration < 5 || duration > 10)) {
+      // Fal.ai is excluded here because its duration constraints vary per model
+      return NextResponse.json(
+        { error: 'Duration must be between 5 and 10 seconds' },
+        { status: 400 }
+      )
+    }
+
+    // Validate aspect ratio (Veo only supports 16:9 and 9:16)
+    const validAspectRatios = provider === 'veo' ? ['16:9', '9:16'] : ['16:9', '9:16', '1:1']
+    if (aspectRatio && !validAspectRatios.includes(aspectRatio)) {
+      return NextResponse.json(
+        { error: `Aspect ratio must be ${validAspectRatios.join(', ')}` },
+        { status: 400 }
+      )
+    }
+
+    logger.info(`[${requestId}] Generating video with ${provider}, model: ${model || 'default'}`)
+
+    let videoUrl: string
+    let videoBuffer: Buffer
+    let width: number | undefined
+    let height: number | undefined
+    let jobId: string | undefined
+    let actualDuration: number | undefined
+
+    try {
+      if (provider === 'runway') {
+        const result = await generateWithRunway(
+          apiKey,
+          model || 'gen-4',
+          prompt,
+          duration || 5,
+          aspectRatio || '16:9',
+          resolution || '1080p',
+          body.visualReference,
+          requestId,
+          logger
+        )
+        videoBuffer = result.buffer
+        width = result.width
+        height = result.height
+        jobId = result.jobId
+        actualDuration = result.duration
+      } else if (provider === 'veo') {
+        const result = await generateWithVeo(
+          apiKey,
+          model || 'veo-3',
+          prompt,
+          duration || 8, // Default to 8 seconds (valid: 4, 6, or 8)
+          aspectRatio || '16:9',
+          resolution || '1080p',
+          requestId,
+          logger
+        )
+        videoBuffer = result.buffer
+        width = result.width
+        height = result.height
+        jobId = result.jobId
+        actualDuration = result.duration
+      } else if (provider === 'luma') {
+        const result = await generateWithLuma(
+          apiKey,
+          model || 'ray-2',
+          prompt,
+          duration || 5,
+          aspectRatio || '16:9',
+          resolution || '1080p',
+          body.cameraControl,
+          requestId,
+          logger
+        )
+        videoBuffer = result.buffer
+        width = result.width
+        height = result.height
+        jobId = result.jobId
+        actualDuration = result.duration
+      } else if (provider === 'minimax') {
+        const result = await generateWithMiniMax(
+          apiKey,
+          model || 'hailuo-02',
+          prompt,
+          duration || 6,
+          body.promptOptimizer !== false, // Default true
+          requestId,
+          logger
+        )
+        videoBuffer = result.buffer
+        width = result.width
+        height = result.height
+        jobId = result.jobId
+        actualDuration = result.duration
+      } else if (provider === 'falai') {
+        if (!model) {
+          return NextResponse.json(
+            { error: 'Model is required for Fal.ai provider' },
+            { status: 400 }
+          )
+        }
+        const result = await generateWithFalAI(
+          apiKey,
+          model,
+          prompt,
+          duration,
+          aspectRatio,
+          resolution,
+          body.promptOptimizer,
+          requestId,
+          logger
+        )
+        videoBuffer = result.buffer
+        width = result.width
+        height = result.height
+        jobId = result.jobId
+
actualDuration = result.duration + } else { + return NextResponse.json({ error: `Unknown provider: ${provider}` }, { status: 400 }) + } + } catch (error) { + logger.error(`[${requestId}] Video generation failed:`, error) + const errorMessage = error instanceof Error ? error.message : 'Video generation failed' + return NextResponse.json({ error: errorMessage }, { status: 500 }) + } + + const hasExecutionContext = body.workspaceId && body.workflowId && body.executionId + + logger.info(`[${requestId}] Storing video file, size: ${videoBuffer.length} bytes`) + + if (hasExecutionContext) { + const { uploadExecutionFile } = await import('@/lib/uploads/contexts/execution') + const timestamp = Date.now() + const fileName = `video-${provider}-${timestamp}.mp4` + + let videoFile + try { + videoFile = await uploadExecutionFile( + { + workspaceId: body.workspaceId!, + workflowId: body.workflowId!, + executionId: body.executionId!, + }, + videoBuffer, + fileName, + 'video/mp4', + authResult.userId + ) + + logger.info(`[${requestId}] Video stored successfully:`, { + fileName, + size: videoFile.size, + executionId: body.executionId, + }) + } catch (error) { + logger.error(`[${requestId}] Failed to upload video file:`, error) + throw new Error( + `Failed to store video: ${error instanceof Error ? error.message : 'Unknown error'}` + ) + } + + return NextResponse.json({ + videoUrl: videoFile.url, + videoFile, + duration: actualDuration || duration, + width, + height, + provider, + model: model || 'default', + jobId, + }) + } + + const { StorageService } = await import('@/lib/uploads') + const { getBaseUrl } = await import('@/lib/urls/utils') + const timestamp = Date.now() + const fileName = `video-${provider}-${timestamp}.mp4` + + try { + const fileInfo = await StorageService.uploadFile({ + file: videoBuffer, + fileName, + contentType: 'video/mp4', + context: 'copilot', + }) + + videoUrl = `${getBaseUrl()}${fileInfo.path}` + } catch (error) { + logger.error(`[${requestId}] Failed to upload video file (fallback):`, error) + throw new Error( + `Failed to store video: ${error instanceof Error ? error.message : 'Unknown error'}` + ) + } + + logger.info(`[${requestId}] Video generation completed successfully`) + + return NextResponse.json({ + videoUrl, + duration: actualDuration || duration, + width, + height, + provider, + model: model || 'default', + jobId, + }) + } catch (error) { + logger.error(`[${requestId}] Video proxy error:`, error) + const errorMessage = error instanceof Error ? 
error.message : 'Unknown error' + return NextResponse.json({ error: errorMessage }, { status: 500 }) + } +} + +async function generateWithRunway( + apiKey: string, + model: string, + prompt: string, + duration: number, + aspectRatio: string, + resolution: string, + visualReference: UserFile | undefined, + requestId: string, + logger: ReturnType +): Promise<{ buffer: Buffer; width: number; height: number; jobId: string; duration: number }> { + logger.info(`[${requestId}] Starting Runway Gen-4 generation`) + + const dimensions = getVideoDimensions(aspectRatio, resolution) + + // Convert aspect ratio to resolution format for 2024-11-06 API version + const ratioMap: { [key: string]: string } = { + '16:9': '1280:720', // Landscape (720p) + '9:16': '720:1280', // Portrait (720p) + '1:1': '960:960', // Square + } + const runwayRatio = ratioMap[aspectRatio] || '1280:720' + + const createPayload: any = { + promptText: prompt, + duration, + ratio: runwayRatio, // Use resolution-based ratio for 2024-11-06 API + model: 'gen4_turbo', // Only gen4_turbo supports image-to-video // Use underscore + } + + if (visualReference) { + const refBuffer = await downloadFileFromStorage(visualReference, requestId, logger) + const refBase64 = refBuffer.toString('base64') + createPayload.promptImage = `data:${visualReference.type};base64,${refBase64}` // Use promptImage + } + + const createResponse = await fetch('https://api.dev.runwayml.com/v1/image_to_video', { + method: 'POST', + headers: { + Authorization: `Bearer ${apiKey}`, + 'Content-Type': 'application/json', + 'X-Runway-Version': '2024-11-06', + }, + body: JSON.stringify(createPayload), + }) + + if (!createResponse.ok) { + const error = await createResponse.text() + throw new Error(`Runway API error: ${createResponse.status} - ${error}`) + } + + const createData = await createResponse.json() + const taskId = createData.id + + logger.info(`[${requestId}] Runway task created: ${taskId}`) + + const maxAttempts = 120 // 10 minutes with 5-second intervals + let attempts = 0 + + while (attempts < maxAttempts) { + await sleep(5000) // Poll every 5 seconds + + const statusResponse = await fetch(`https://api.dev.runwayml.com/v1/tasks/${taskId}`, { + headers: { + Authorization: `Bearer ${apiKey}`, + 'X-Runway-Version': '2024-11-06', + }, + }) + + if (!statusResponse.ok) { + throw new Error(`Runway status check failed: ${statusResponse.status}`) + } + + const statusData = await statusResponse.json() + + if (statusData.status === 'SUCCEEDED') { + logger.info(`[${requestId}] Runway generation completed after ${attempts * 5}s`) + + const videoResponse = await fetch(statusData.output[0]) + if (!videoResponse.ok) { + throw new Error(`Failed to download video: ${videoResponse.status}`) + } + + const arrayBuffer = await videoResponse.arrayBuffer() + return { + buffer: Buffer.from(arrayBuffer), + width: dimensions.width, + height: dimensions.height, + jobId: taskId, + duration, + } + } + + if (statusData.status === 'FAILED') { + throw new Error(`Runway generation failed: ${statusData.failure || 'Unknown error'}`) + } + + attempts++ + } + + throw new Error('Runway generation timed out after 10 minutes') +} + +async function generateWithVeo( + apiKey: string, + model: string, + prompt: string, + duration: number, + aspectRatio: string, + resolution: string, + requestId: string, + logger: ReturnType +): Promise<{ buffer: Buffer; width: number; height: number; jobId: string; duration: number }> { + logger.info(`[${requestId}] Starting Google Veo generation`) + + const dimensions = 
getVideoDimensions(aspectRatio, resolution) + + const modelNameMap: Record = { + 'veo-3': 'veo-3.0-generate-001', + 'veo-3-fast': 'veo-3.0-fast-generate-001', // Fixed: was incorrectly mapped to 3.1 + 'veo-3.1': 'veo-3.1-generate-preview', + } + const modelName = modelNameMap[model] || 'veo-3.1-generate-preview' + + const createPayload = { + instances: [ + { + prompt, + }, + ], + parameters: { + aspectRatio: aspectRatio, // Keep as "16:9", don't convert + resolution: resolution, + durationSeconds: duration, // Keep as number + }, + } + + const createResponse = await fetch( + `https://generativelanguage.googleapis.com/v1beta/models/${modelName}:predictLongRunning`, + { + method: 'POST', + headers: { + 'Content-Type': 'application/json', + 'x-goog-api-key': apiKey, + }, + body: JSON.stringify(createPayload), + } + ) + + if (!createResponse.ok) { + const error = await createResponse.text() + throw new Error(`Veo API error: ${createResponse.status} - ${error}`) + } + + const createData = await createResponse.json() + const operationName = createData.name + + logger.info(`[${requestId}] Veo operation created: ${operationName}`) + + const maxAttempts = 60 // 5 minutes with 5-second intervals + let attempts = 0 + + while (attempts < maxAttempts) { + await sleep(5000) + + const statusResponse = await fetch( + `https://generativelanguage.googleapis.com/v1beta/${operationName}`, + { + headers: { + 'x-goog-api-key': apiKey, + }, + } + ) + + if (!statusResponse.ok) { + throw new Error(`Veo status check failed: ${statusResponse.status}`) + } + + const statusData = await statusResponse.json() + + if (statusData.done) { + if (statusData.error) { + throw new Error(`Veo generation failed: ${statusData.error.message}`) + } + + logger.info(`[${requestId}] Veo generation completed after ${attempts * 5}s`) + + const videoUri = statusData.response?.generateVideoResponse?.generatedSamples?.[0]?.video?.uri + if (!videoUri) { + throw new Error('No video URI in response') + } + + const videoResponse = await fetch(videoUri, { + headers: { + 'x-goog-api-key': apiKey, + }, + }) + + if (!videoResponse.ok) { + throw new Error(`Failed to download video: ${videoResponse.status}`) + } + + const arrayBuffer = await videoResponse.arrayBuffer() + return { + buffer: Buffer.from(arrayBuffer), + width: dimensions.width, + height: dimensions.height, + jobId: operationName, + duration, + } + } + + attempts++ + } + + throw new Error('Veo generation timed out after 5 minutes') +} + +async function generateWithLuma( + apiKey: string, + model: string, + prompt: string, + duration: number, + aspectRatio: string, + resolution: string, + cameraControl: any | undefined, + requestId: string, + logger: ReturnType +): Promise<{ buffer: Buffer; width: number; height: number; jobId: string; duration: number }> { + logger.info(`[${requestId}] Starting Luma Dream Machine generation`) + + const dimensions = getVideoDimensions(aspectRatio, resolution) + + const createPayload: any = { + prompt, + model: model || 'ray-2', + aspect_ratio: aspectRatio, + loop: false, + } + + if (duration) { + createPayload.duration = `${duration}s` + } + + if (resolution) { + createPayload.resolution = resolution + } + + if (cameraControl) { + createPayload.concepts = Array.isArray(cameraControl) ? 
cameraControl : [{ key: cameraControl }] + } + + const createResponse = await fetch('https://api.lumalabs.ai/dream-machine/v1/generations', { + method: 'POST', + headers: { + Authorization: `Bearer ${apiKey}`, + 'Content-Type': 'application/json', + }, + body: JSON.stringify(createPayload), + }) + + if (!createResponse.ok) { + const error = await createResponse.text() + throw new Error(`Luma API error: ${createResponse.status} - ${error}`) + } + + const createData = await createResponse.json() + const generationId = createData.id + + logger.info(`[${requestId}] Luma generation created: ${generationId}`) + + const maxAttempts = 120 // 10 minutes + let attempts = 0 + + while (attempts < maxAttempts) { + await sleep(5000) + + const statusResponse = await fetch( + `https://api.lumalabs.ai/dream-machine/v1/generations/${generationId}`, + { + headers: { + Authorization: `Bearer ${apiKey}`, + }, + } + ) + + if (!statusResponse.ok) { + throw new Error(`Luma status check failed: ${statusResponse.status}`) + } + + const statusData = await statusResponse.json() + + if (statusData.state === 'completed') { + logger.info(`[${requestId}] Luma generation completed after ${attempts * 5}s`) + + const videoUrl = statusData.assets?.video + if (!videoUrl) { + throw new Error('No video URL in response') + } + + const videoResponse = await fetch(videoUrl) + if (!videoResponse.ok) { + throw new Error(`Failed to download video: ${videoResponse.status}`) + } + + const arrayBuffer = await videoResponse.arrayBuffer() + return { + buffer: Buffer.from(arrayBuffer), + width: dimensions.width, + height: dimensions.height, + jobId: generationId, + duration, + } + } + + if (statusData.state === 'failed') { + throw new Error(`Luma generation failed: ${statusData.failure_reason || 'Unknown error'}`) + } + + attempts++ + } + + throw new Error('Luma generation timed out after 10 minutes') +} + +async function generateWithMiniMax( + apiKey: string, + model: string, + prompt: string, + duration: number, + promptOptimizer: boolean, + requestId: string, + logger: ReturnType +): Promise<{ buffer: Buffer; width: number; height: number; jobId: string; duration: number }> { + logger.info(`[${requestId}] Starting MiniMax Hailuo generation via MiniMax Platform API`) + logger.info( + `[${requestId}] Request params - model: ${model}, duration: ${duration}, promptOptimizer: ${promptOptimizer}` + ) + + // Determine resolution and dimensions based on duration + // MiniMax-Hailuo-02 supports 768P (6s) or 1080P (10s) + const resolution = duration === 10 ? '1080P' : '768P' + const dimensions = duration === 10 ? { width: 1920, height: 1080 } : { width: 1360, height: 768 } + + logger.info( + `[${requestId}] Using resolution: ${resolution}, dimensions: ${dimensions.width}x${dimensions.height}` + ) + + // Map our model ID to MiniMax model name + const minimaxModel = model === 'hailuo-02' ? 
'MiniMax-Hailuo-02' : 'MiniMax-Hailuo-2.3' + + // Create video generation request via MiniMax Platform API + const createResponse = await fetch('https://api.minimax.io/v1/video_generation', { + method: 'POST', + headers: { + Authorization: `Bearer ${apiKey}`, + 'Content-Type': 'application/json', + }, + body: JSON.stringify({ + model: minimaxModel, + prompt: prompt, + duration: duration, + resolution: resolution, + prompt_optimizer: promptOptimizer, + }), + }) + + if (!createResponse.ok) { + const errorText = await createResponse.text() + if (createResponse.status === 401 || createResponse.status === 1004) { + throw new Error( + `MiniMax API authentication failed (${createResponse.status}). Please ensure you're using a valid MiniMax API key from platform.minimax.io. Error: ${errorText}` + ) + } + throw new Error(`MiniMax API error: ${createResponse.status} - ${errorText}`) + } + + const createData = await createResponse.json() + + // Check for error in response + if (createData.base_resp?.status_code !== 0) { + throw new Error(`MiniMax API error: ${createData.base_resp?.status_msg || 'Unknown error'}`) + } + + const taskId = createData.task_id + + logger.info(`[${requestId}] MiniMax task created: ${taskId}`) + + // Poll for completion (6-10 minutes typical) + const maxAttempts = 120 // 10 minutes with 5-second intervals + let attempts = 0 + + while (attempts < maxAttempts) { + await sleep(5000) + + // Query task status + const statusResponse = await fetch( + `https://api.minimax.io/v1/query/video_generation?task_id=${taskId}`, + { + headers: { + Authorization: `Bearer ${apiKey}`, + }, + } + ) + + if (!statusResponse.ok) { + throw new Error(`MiniMax status check failed: ${statusResponse.status}`) + } + + const statusData = await statusResponse.json() + + if ( + statusData.base_resp?.status_code !== 0 && + statusData.base_resp?.status_code !== undefined + ) { + throw new Error( + `MiniMax status query error: ${statusData.base_resp?.status_msg || 'Unknown error'}` + ) + } + + if (statusData.status === 'Success' || statusData.status === 'success') { + logger.info(`[${requestId}] MiniMax generation completed after ${attempts * 5}s`) + + const fileId = statusData.file_id + if (!fileId) { + throw new Error('No file_id in response') + } + + // Download the video using file_id + const fileResponse = await fetch( + `https://api.minimax.io/v1/files/retrieve?file_id=${fileId}`, + { + headers: { + Authorization: `Bearer ${apiKey}`, + }, + } + ) + + if (!fileResponse.ok) { + throw new Error(`Failed to download video: ${fileResponse.status}`) + } + + const fileData = await fileResponse.json() + const videoUrl = fileData.file?.download_url + + if (!videoUrl) { + throw new Error('No download URL in file response') + } + + // Download the actual video file + const videoResponse = await fetch(videoUrl) + if (!videoResponse.ok) { + throw new Error(`Failed to download video from URL: ${videoResponse.status}`) + } + + const arrayBuffer = await videoResponse.arrayBuffer() + return { + buffer: Buffer.from(arrayBuffer), + width: dimensions.width, + height: dimensions.height, + jobId: taskId, + duration, + } + } + + if (statusData.status === 'Failed' || statusData.status === 'failed') { + throw new Error(`MiniMax generation failed: ${statusData.error || 'Unknown error'}`) + } + + // Status is still "Processing" or "Queueing", continue polling + attempts++ + } + + throw new Error('MiniMax generation timed out after 10 minutes') +} + +// Helper function to strip subpaths from Fal.ai model IDs for status/result 
endpoints +function getBaseModelId(fullModelId: string): string { + const parts = fullModelId.split('/') + // Keep only the first two parts (e.g., "fal-ai/sora-2" from "fal-ai/sora-2/text-to-video") + if (parts.length > 2) { + return parts.slice(0, 2).join('/') + } + return fullModelId +} + +// Helper function to format duration based on model requirements +function formatDuration(model: string, duration: number | undefined): string | number | undefined { + if (duration === undefined) return undefined + + // Veo 3.1 requires duration with "s" suffix (e.g., "8s") + if (model === 'veo-3.1') { + return `${duration}s` + } + + // Sora 2 requires numeric duration + if (model === 'sora-2') { + return duration + } + + // Other models use string format + return String(duration) +} + +async function generateWithFalAI( + apiKey: string, + model: string, + prompt: string, + duration: number | undefined, + aspectRatio: string | undefined, + resolution: string | undefined, + promptOptimizer: boolean | undefined, + requestId: string, + logger: ReturnType +): Promise<{ buffer: Buffer; width: number; height: number; jobId: string; duration: number }> { + logger.info(`[${requestId}] Starting Fal.ai generation with model: ${model}`) + + // Map our model IDs to Fal.ai model paths + const modelMap: { [key: string]: string } = { + 'veo-3.1': 'fal-ai/veo3.1', + 'sora-2': 'fal-ai/sora-2/text-to-video', + 'kling-2.5-turbo-pro': 'fal-ai/kling-video/v2.5-turbo/pro/text-to-video', + 'kling-2.1-pro': 'fal-ai/kling-video/v2.1/master/text-to-video', + 'minimax-hailuo-2.3-pro': 'fal-ai/minimax/hailuo-02/pro/text-to-video', + 'minimax-hailuo-2.3-standard': 'fal-ai/minimax/hailuo-02/standard/text-to-video', + 'wan-2.1': 'fal-ai/wan-t2v', + 'ltxv-0.9.8': 'fal-ai/ltxv-13b-098-distilled', + } + + const falModelId = modelMap[model] + if (!falModelId) { + throw new Error(`Unknown Fal.ai model: ${model}`) + } + + // Build request body based on model requirements + const requestBody: any = { prompt } + + // Format duration based on model requirements + const formattedDuration = formatDuration(model, duration) + if (formattedDuration !== undefined) { + requestBody.duration = formattedDuration + } + + if (aspectRatio) { + requestBody.aspect_ratio = aspectRatio + } + + if (resolution) { + requestBody.resolution = resolution + } + + // MiniMax models support prompt optimizer + if (model.startsWith('minimax-hailuo') && promptOptimizer !== undefined) { + requestBody.prompt_optimizer = promptOptimizer + } + + const createResponse = await fetch(`https://queue.fal.run/${falModelId}`, { + method: 'POST', + headers: { + Authorization: `Key ${apiKey}`, + 'Content-Type': 'application/json', + }, + body: JSON.stringify(requestBody), + }) + + if (!createResponse.ok) { + const error = await createResponse.text() + throw new Error(`Fal.ai API error: ${createResponse.status} - ${error}`) + } + + const createData = await createResponse.json() + const requestIdFal = createData.request_id + + logger.info(`[${requestId}] Fal.ai request created: ${requestIdFal}`) + + // Get base model ID (without subpath) for status and result endpoints + const baseModelId = getBaseModelId(falModelId) + + const maxAttempts = 96 // 8 minutes with 5-second intervals + let attempts = 0 + + while (attempts < maxAttempts) { + await sleep(5000) + + const statusResponse = await fetch( + `https://queue.fal.run/${baseModelId}/requests/${requestIdFal}/status`, + { + headers: { + Authorization: `Key ${apiKey}`, + }, + } + ) + + if (!statusResponse.ok) { + throw new Error(`Fal.ai 
status check failed: ${statusResponse.status}`)
+    }
+
+    const statusData = await statusResponse.json()
+
+    if (statusData.status === 'COMPLETED') {
+      logger.info(`[${requestId}] Fal.ai generation completed after ${attempts * 5}s`)
+
+      const resultResponse = await fetch(
+        `https://queue.fal.run/${baseModelId}/requests/${requestIdFal}`,
+        {
+          headers: {
+            Authorization: `Key ${apiKey}`,
+          },
+        }
+      )
+
+      if (!resultResponse.ok) {
+        throw new Error(`Failed to fetch result: ${resultResponse.status}`)
+      }
+
+      const resultData = await resultResponse.json()
+
+      const videoUrl = resultData.video?.url || resultData.output?.url
+      if (!videoUrl) {
+        throw new Error('No video URL in response')
+      }
+
+      const videoResponse = await fetch(videoUrl)
+      if (!videoResponse.ok) {
+        throw new Error(`Failed to download video: ${videoResponse.status}`)
+      }
+
+      const arrayBuffer = await videoResponse.arrayBuffer()
+
+      // Try to get dimensions from response, or calculate from aspect ratio
+      let width = resultData.video?.width || 1920
+      let height = resultData.video?.height || 1080
+
+      if (!resultData.video?.width && aspectRatio) {
+        const dims = getVideoDimensions(aspectRatio, resolution || '1080p')
+        width = dims.width
+        height = dims.height
+      }
+
+      return {
+        buffer: Buffer.from(arrayBuffer),
+        width,
+        height,
+        jobId: requestIdFal,
+        duration: duration || 5,
+      }
+    }
+
+    if (statusData.status === 'FAILED') {
+      throw new Error(`Fal.ai generation failed: ${statusData.error || 'Unknown error'}`)
+    }
+
+    attempts++
+  }
+
+  throw new Error('Fal.ai generation timed out after 8 minutes')
+}
+
+function getVideoDimensions(
+  aspectRatio: string,
+  resolution: string
+): { width: number; height: number } {
+  let height: number
+  if (resolution === '4k') {
+    height = 2160
+  } else {
+    height = Number.parseInt(resolution.replace('p', ''))
+  }
+
+  const [ratioW, ratioH] = aspectRatio.split(':').map(Number)
+  const width = Math.round((height * ratioW) / ratioH)
+
+  return { width, height }
+}
+
+function sleep(ms: number): Promise<void> {
+  return new Promise((resolve) => setTimeout(resolve, ms))
+}
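// Illustrative sketch, not part of the diff above: every generateWith* helper in this route
// follows the same create-then-poll shape (submit a job, poll its status every 5 seconds up to
// a provider-specific attempt cap, then download the finished asset). If the duplication ever
// becomes a concern, a small generic helper along these lines could factor the loop out; the
// names below are hypothetical, not existing code.
async function pollUntilDone<T>(
  check: () => Promise<{ done: boolean; failed?: string; value?: T }>,
  { intervalMs = 5000, maxAttempts = 120 } = {}
): Promise<T> {
  for (let attempt = 0; attempt < maxAttempts; attempt++) {
    // Wait first, matching the helpers above, which sleep before every status check
    await new Promise((resolve) => setTimeout(resolve, intervalMs))
    const status = await check()
    if (status.failed) throw new Error(status.failed)
    if (status.done) return status.value as T
  }
  throw new Error(`Generation timed out after ${(maxAttempts * intervalMs) / 60000} minutes`)
}
// For reference, getVideoDimensions above maps ('16:9', '1080p') to 1920x1080 and
// ('9:16', '720p') to 405x720: the width is derived from the requested height and rounded.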
diff --git a/apps/sim/app/workspace/[workspaceId]/w/components/sidebar/components-new/search-modal/search-modal.tsx b/apps/sim/app/workspace/[workspaceId]/w/components/sidebar/components-new/search-modal/search-modal.tsx
index 161e90f9f1..6129da68fc 100644
--- a/apps/sim/app/workspace/[workspaceId]/w/components/sidebar/components-new/search-modal/search-modal.tsx
+++ b/apps/sim/app/workspace/[workspaceId]/w/components/sidebar/components-new/search-modal/search-modal.tsx
@@ -359,9 +359,42 @@ export function SearchModal({
       .map((result) => result.item)
   }, [allItems, searchQuery, sectionOrder])
 
+  const groupedItems = useMemo(() => {
+    const groups: Record = {
+      workspace: [],
+      workflow: [],
+      page: [],
+      trigger: [],
+      block: [],
+      tool: [],
+      doc: [],
+    }
+
+    filteredItems.forEach((item) => {
+      if (groups[item.type]) {
+        groups[item.type].push(item)
+      }
+    })
+
+    return groups
+  }, [filteredItems])
+
+  const displayedItemsInVisualOrder = useMemo(() => {
+    const visualOrder: SearchItem[] = []
+
+    sectionOrder.forEach((type) => {
+      const items = groupedItems[type] || []
+      items.forEach((item) => {
+        visualOrder.push(item)
+      })
+    })
+
+    return visualOrder
+  }, [groupedItems, sectionOrder])
+
   useEffect(() => {
     setSelectedIndex(0)
-  }, [filteredItems])
+  }, [displayedItemsInVisualOrder])
 
   useEffect(() => {
     if (!open) {
@@ -413,7 +446,7 @@ export function SearchModal({
       switch (e.key) {
         case 'ArrowDown':
           e.preventDefault()
-          setSelectedIndex((prev) => Math.min(prev + 1, filteredItems.length - 1))
+          setSelectedIndex((prev) => Math.min(prev + 1, displayedItemsInVisualOrder.length - 1))
           break
         case 'ArrowUp':
           e.preventDefault()
@@ -421,8 +454,8 @@ export function SearchModal({
           break
         case 'Enter':
           e.preventDefault()
-          if (filteredItems[selectedIndex]) {
-            handleItemClick(filteredItems[selectedIndex])
+          if (displayedItemsInVisualOrder[selectedIndex]) {
+            handleItemClick(displayedItemsInVisualOrder[selectedIndex])
           }
           break
         case 'Escape':
@@ -434,7 +467,7 @@ export function SearchModal({
 
     document.addEventListener('keydown', handleKeyDown)
     return () => document.removeEventListener('keydown', handleKeyDown)
-  }, [open, selectedIndex, filteredItems, handleItemClick, onOpenChange])
+  }, [open, selectedIndex, displayedItemsInVisualOrder, handleItemClick, onOpenChange])
 
   useEffect(() => {
     if (open && selectedIndex >= 0) {
@@ -448,26 +481,6 @@ export function SearchModal({
     }
   }, [selectedIndex, open])
 
-  const groupedItems = useMemo(() => {
-    const groups: Record = {
-      workspace: [],
-      workflow: [],
-      page: [],
-      trigger: [],
-      block: [],
-      tool: [],
-      doc: [],
-    }
-
-    filteredItems.forEach((item) => {
-      if (groups[item.type]) {
-        groups[item.type].push(item)
-      }
-    })
-
-    return groups
-  }, [filteredItems])
-
   const sectionTitles: Record = {
     workspace: 'Workspaces',
     workflow: 'Workflows',
@@ -501,7 +514,7 @@ export function SearchModal({
         {/* Floating results container */}
-          {filteredItems.length > 0 ? (
+          {displayedItemsInVisualOrder.length > 0 ? (
           {sectionOrder.map((type) => {
             const items = groupedItems[type] || []
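// Illustrative sketch, not part of the diff: why selection now indexes into
// displayedItemsInVisualOrder instead of filteredItems. Search results come back in match
// order, but the modal renders them grouped by section, so the two orders can disagree and
// ArrowDown used to make the highlight jump between sections rather than move down the visible
// list. A tiny hypothetical example of the divergence:
const filtered = [
  { id: 'deploy docs', type: 'doc' },
  { id: 'deploy workflow', type: 'workflow' },
]
const sectionOrderExample = ['workflow', 'doc'] // render order by section
const visual = sectionOrderExample.flatMap((t) => filtered.filter((item) => item.type === t))
// filtered.indexOf(filtered[0]) === 0, but visual.indexOf(filtered[0]) === 1: the top match is
// rendered second, so an index kept in filtered order highlights a row out of visual sequence.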
@@ -518,8 +531,8 @@ export function SearchModal({
               {items.map((item, itemIndex) => {
                 const Icon = item.icon
-                const globalIndex = filteredItems.indexOf(item)
-                const isSelected = globalIndex === selectedIndex
+                const visualIndex = displayedItemsInVisualOrder.indexOf(item)
+                const isSelected = visualIndex === selectedIndex
                 const showColoredIcon =
                   item.type === 'block' || item.type === 'trigger' || item.type === 'tool'
                 const isWorkflow = item.type === 'workflow'
@@ -528,7 +541,7 @@ export function SearchModal({
                 return (