diff --git a/apps/desktop/src/components/settings/ai/stt/shared.tsx b/apps/desktop/src/components/settings/ai/stt/shared.tsx
index b7d2cd13dc..dbf27f794a 100644
--- a/apps/desktop/src/components/settings/ai/stt/shared.tsx
+++ b/apps/desktop/src/components/settings/ai/stt/shared.tsx
@@ -44,6 +44,14 @@ export const displayModelId = (model: string) => {
     return "Whisper 1";
   }
 
+  if (model === "gpt-4o-transcribe") {
+    return "GPT-4o Transcribe";
+  }
+
+  if (model === "gpt-4o-mini-transcribe") {
+    return "GPT-4o mini Transcribe";
+  }
+
   if (model.startsWith("am-")) {
     const am = model as AmModel;
     if (am == "am-parakeet-v2") {
@@ -153,7 +161,7 @@ export const PROVIDERS = [
     badge: "Beta",
     icon: ,
     baseUrl: "https://api.openai.com/v1",
-    models: ["whisper-1"],
+    models: ["gpt-4o-transcribe", "gpt-4o-mini-transcribe", "whisper-1"],
     requiresPro: false,
   },
   {
diff --git a/owhisper/owhisper-client/src/adapter/openai/batch.rs b/owhisper/owhisper-client/src/adapter/openai/batch.rs
index 984191927a..cc8e93e854 100644
--- a/owhisper/owhisper-client/src/adapter/openai/batch.rs
+++ b/owhisper/owhisper-client/src/adapter/openai/batch.rs
@@ -11,6 +11,14 @@ use super::OpenAIAdapter;
 
 const DEFAULT_API_BASE: &str = "https://api.openai.com/v1";
 const DEFAULT_MODEL: &str = "whisper-1";
+const RESPONSE_FORMAT_VERBOSE: &str = "verbose_json";
+const RESPONSE_FORMAT_JSON: &str = "json";
+const TIMESTAMP_GRANULARITY: &str = "word";
+
+// Models that support verbose_json with word-level timestamps
+fn supports_word_timestamps(model: &str) -> bool {
+    model == "whisper-1"
+}
 
 impl BatchSttAdapter for OpenAIAdapter {
     fn transcribe_file<'a, P: AsRef<Path> + Send + 'a>(
@@ -33,17 +41,6 @@ struct OpenAIWord {
     end: f64,
 }
 
-#[derive(Debug, serde::Deserialize)]
-struct OpenAISegment {
-    #[allow(dead_code)]
-    id: i32,
-    #[allow(dead_code)]
-    seek: i32,
-    start: f64,
-    end: f64,
-    text: String,
-}
-
 #[derive(Debug, serde::Deserialize)]
 struct OpenAIVerboseResponse {
     #[allow(dead_code)]
@@ -54,8 +51,6 @@ struct OpenAIVerboseResponse {
     text: String,
     #[serde(default)]
     words: Vec<OpenAIWord>,
-    #[serde(default)]
-    segments: Vec<OpenAISegment>,
 }
 
 async fn do_transcribe_file(
@@ -91,9 +86,17 @@ async fn do_transcribe_file(
 
     let mut form = Form::new()
         .part("file", file_part)
-        .text("model", model.to_string())
-        .text("response_format", "verbose_json")
-        .text("timestamp_granularities[]", "word");
+        .text("model", model.to_string());
+
+    // whisper-1 supports verbose_json with word-level timestamps
+    // gpt-4o-transcribe and gpt-4o-mini-transcribe only support json/text
+    if supports_word_timestamps(model) {
+        form = form
+            .text("response_format", RESPONSE_FORMAT_VERBOSE)
+            .text("timestamp_granularities[]", TIMESTAMP_GRANULARITY);
+    } else {
+        form = form.text("response_format", RESPONSE_FORMAT_JSON);
+    }
 
     if let Some(lang) = params.languages.first() {
         form = form.text("language", lang.iso639().code().to_string());
diff --git a/owhisper/owhisper-client/src/adapter/openai/live.rs b/owhisper/owhisper-client/src/adapter/openai/live.rs
index 91dae2a75b..4d1c2b003e 100644
--- a/owhisper/owhisper-client/src/adapter/openai/live.rs
+++ b/owhisper/owhisper-client/src/adapter/openai/live.rs
@@ -7,6 +7,12 @@ use super::OpenAIAdapter;
 use crate::adapter::parsing::{calculate_time_span, WordBuilder};
 use crate::adapter::RealtimeSttAdapter;
 
+// Voice Activity Detection (VAD) configuration defaults
+const VAD_DETECTION_TYPE: &str = "server_vad";
+const VAD_THRESHOLD: f32 = 0.5;
+const VAD_PREFIX_PADDING_MS: u32 = 300;
+const VAD_SILENCE_DURATION_MS: u32 = 500;
+
 impl RealtimeSttAdapter for OpenAIAdapter {
     fn provider_name(&self) -> &'static str {
         "openai"
@@ -78,10 +84,10 @@ impl RealtimeSttAdapter for OpenAIAdapter {
                     language,
                 }),
                 turn_detection: Some(TurnDetection {
-                    detection_type: "server_vad".to_string(),
-                    threshold: Some(0.5),
-                    prefix_padding_ms: Some(300),
-                    silence_duration_ms: Some(500),
+                    detection_type: VAD_DETECTION_TYPE.to_string(),
+                    threshold: Some(VAD_THRESHOLD),
+                    prefix_padding_ms: Some(VAD_PREFIX_PADDING_MS),
+                    silence_duration_ms: Some(VAD_SILENCE_DURATION_MS),
                 }),
             }),
         }),
diff --git a/owhisper/owhisper-client/src/adapter/openai/mod.rs b/owhisper/owhisper-client/src/adapter/openai/mod.rs
index 089748387f..c6002b2baf 100644
--- a/owhisper/owhisper-client/src/adapter/openai/mod.rs
+++ b/owhisper/owhisper-client/src/adapter/openai/mod.rs
@@ -3,6 +3,12 @@ mod live;
 
 pub(crate) const DEFAULT_WS_HOST: &str = "api.openai.com";
 pub(crate) const WS_PATH: &str = "/v1/realtime";
+
+// OpenAI STT Models:
+// - whisper-1: Legacy model, supports verbose_json with word timestamps (batch only)
+// - gpt-4o-transcribe: High quality, supports both batch (json only) and realtime
+// - gpt-4o-mini-transcribe: Cost-efficient, supports both batch (json only) and realtime
+// - gpt-4o-transcribe-diarize: Speaker diarization (batch only, not yet supported here)
pub(crate) const DEFAULT_TRANSCRIPTION_MODEL: &str = "gpt-4o-transcribe";
 
 #[derive(Clone, Default)]
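
For context, here is a minimal standalone sketch (not code from this PR) of the response-format branching that `batch.rs` now performs against OpenAI's `/v1/audio/transcriptions` endpoint. It assumes `reqwest` with the `multipart` feature and a tokio runtime; `transcribe`, `audio`, and `api_key` are illustrative names, not identifiers from the diff:

```rust
use reqwest::multipart::{Form, Part};

// Mirrors the helper added in batch.rs: of the models wired up here,
// only whisper-1 accepts verbose_json plus timestamp_granularities[]=word.
fn supports_word_timestamps(model: &str) -> bool {
    model == "whisper-1"
}

async fn transcribe(model: &str, audio: Vec<u8>, api_key: &str) -> reqwest::Result<String> {
    let file_part = Part::bytes(audio)
        .file_name("audio.wav")
        .mime_str("audio/wav")
        .expect("valid mime type");

    let mut form = Form::new()
        .part("file", file_part)
        .text("model", model.to_string());

    if supports_word_timestamps(model) {
        // whisper-1: request word-level timestamps.
        form = form
            .text("response_format", "verbose_json")
            .text("timestamp_granularities[]", "word");
    } else {
        // gpt-4o-transcribe / gpt-4o-mini-transcribe reject verbose_json;
        // fall back to plain json (text only, no word timings).
        form = form.text("response_format", "json");
    }

    reqwest::Client::new()
        .post("https://api.openai.com/v1/audio/transcriptions")
        .bearer_auth(api_key)
        .multipart(form)
        .send()
        .await?
        .text()
        .await
}
```

Note that the VAD constants extracted in `live.rs` (`server_vad`, threshold 0.5, 300 ms prefix padding, 500 ms silence) carry the same values that were previously inlined, so the realtime turn-detection behavior is unchanged; the refactor only names them.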