diff --git a/apps/desktop/src/components/settings/ai/stt/shared.tsx b/apps/desktop/src/components/settings/ai/stt/shared.tsx
index b7d2cd13dc..dbf27f794a 100644
--- a/apps/desktop/src/components/settings/ai/stt/shared.tsx
+++ b/apps/desktop/src/components/settings/ai/stt/shared.tsx
@@ -44,6 +44,14 @@ export const displayModelId = (model: string) => {
return "Whisper 1";
}
+ if (model === "gpt-4o-transcribe") {
+ return "GPT-4o Transcribe";
+ }
+
+ if (model === "gpt-4o-mini-transcribe") {
+ return "GPT-4o mini Transcribe";
+ }
+
if (model.startsWith("am-")) {
const am = model as AmModel;
if (am == "am-parakeet-v2") {
@@ -153,7 +161,7 @@ export const PROVIDERS = [
badge: "Beta",
icon: ,
baseUrl: "https://api.openai.com/v1",
- models: ["whisper-1"],
+ models: ["gpt-4o-transcribe", "gpt-4o-mini-transcribe", "whisper-1"],
requiresPro: false,
},
{
diff --git a/owhisper/owhisper-client/src/adapter/openai/batch.rs b/owhisper/owhisper-client/src/adapter/openai/batch.rs
index 984191927a..cc8e93e854 100644
--- a/owhisper/owhisper-client/src/adapter/openai/batch.rs
+++ b/owhisper/owhisper-client/src/adapter/openai/batch.rs
@@ -11,6 +11,14 @@ use super::OpenAIAdapter;
const DEFAULT_API_BASE: &str = "https://api.openai.com/v1";
const DEFAULT_MODEL: &str = "whisper-1";
+const RESPONSE_FORMAT_VERBOSE: &str = "verbose_json";
+const RESPONSE_FORMAT_JSON: &str = "json";
+const TIMESTAMP_GRANULARITY: &str = "word";
+
+/// Returns true for models that support verbose_json with word-level timestamps.
+fn supports_word_timestamps(model: &str) -> bool {
+ model == "whisper-1"
+}
impl BatchSttAdapter for OpenAIAdapter {
fn transcribe_file<'a, P: AsRef<Path> + Send + 'a>(
@@ -33,17 +41,6 @@ struct OpenAIWord {
end: f64,
}
-#[derive(Debug, serde::Deserialize)]
-struct OpenAISegment {
- #[allow(dead_code)]
- id: i32,
- #[allow(dead_code)]
- seek: i32,
- start: f64,
- end: f64,
- text: String,
-}
-
#[derive(Debug, serde::Deserialize)]
struct OpenAIVerboseResponse {
#[allow(dead_code)]
@@ -54,8 +51,6 @@ struct OpenAIVerboseResponse {
text: String,
#[serde(default)]
words: Vec<OpenAIWord>,
- #[serde(default)]
- segments: Vec<OpenAISegment>,
}
async fn do_transcribe_file(
@@ -91,9 +86,17 @@ async fn do_transcribe_file(
let mut form = Form::new()
.part("file", file_part)
- .text("model", model.to_string())
- .text("response_format", "verbose_json")
- .text("timestamp_granularities[]", "word");
+ .text("model", model.to_string());
+
+ // whisper-1 supports verbose_json with word-level timestamps
+ // gpt-4o-transcribe and gpt-4o-mini-transcribe only support json/text
+ if supports_word_timestamps(model) {
+ form = form
+ .text("response_format", RESPONSE_FORMAT_VERBOSE)
+ .text("timestamp_granularities[]", TIMESTAMP_GRANULARITY);
+ } else {
+ form = form.text("response_format", RESPONSE_FORMAT_JSON);
+ }
if let Some(lang) = params.languages.first() {
form = form.text("language", lang.iso639().code().to_string());
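
The format gate above is easy to lock down with a regression test. A minimal sketch, assuming it lives in the same module as supports_word_timestamps (the test name is illustrative):

    #[cfg(test)]
    mod tests {
        use super::*;

        #[test]
        fn only_whisper_gets_word_timestamps() {
            // whisper-1 is the only model allowed to request verbose_json
            // with timestamp_granularities[] = "word".
            assert!(supports_word_timestamps("whisper-1"));
            assert!(!supports_word_timestamps("gpt-4o-transcribe"));
            assert!(!supports_word_timestamps("gpt-4o-mini-transcribe"));
        }
    }
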
diff --git a/owhisper/owhisper-client/src/adapter/openai/live.rs b/owhisper/owhisper-client/src/adapter/openai/live.rs
index 91dae2a75b..4d1c2b003e 100644
--- a/owhisper/owhisper-client/src/adapter/openai/live.rs
+++ b/owhisper/owhisper-client/src/adapter/openai/live.rs
@@ -7,6 +7,15 @@ use super::OpenAIAdapter;
use crate::adapter::parsing::{calculate_time_span, WordBuilder};
use crate::adapter::RealtimeSttAdapter;
+// Voice Activity Detection (VAD) configuration defaults
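+// threshold is a speech-probability cutoff in [0.0, 1.0]; the durations are
+// milliseconds of audio kept before detected speech (prefix padding) and of
+// trailing silence required before the server closes a turn.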
+const VAD_DETECTION_TYPE: &str = "server_vad";
+const VAD_THRESHOLD: f32 = 0.5;
+const VAD_PREFIX_PADDING_MS: u32 = 300;
+const VAD_SILENCE_DURATION_MS: u32 = 500;
+
impl RealtimeSttAdapter for OpenAIAdapter {
fn provider_name(&self) -> &'static str {
"openai"
@@ -78,10 +84,10 @@ impl RealtimeSttAdapter for OpenAIAdapter {
language,
}),
turn_detection: Some(TurnDetection {
- detection_type: "server_vad".to_string(),
- threshold: Some(0.5),
- prefix_padding_ms: Some(300),
- silence_duration_ms: Some(500),
+ detection_type: VAD_DETECTION_TYPE.to_string(),
+ threshold: Some(VAD_THRESHOLD),
+ prefix_padding_ms: Some(VAD_PREFIX_PADDING_MS),
+ silence_duration_ms: Some(VAD_SILENCE_DURATION_MS),
}),
}),
}),
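
Assuming TurnDetection serializes detection_type as OpenAI's "type" field, these defaults reproduce the server_vad turn-detection payload documented for the realtime API: {"type": "server_vad", "threshold": 0.5, "prefix_padding_ms": 300, "silence_duration_ms": 500}.
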
diff --git a/owhisper/owhisper-client/src/adapter/openai/mod.rs b/owhisper/owhisper-client/src/adapter/openai/mod.rs
index 089748387f..c6002b2baf 100644
--- a/owhisper/owhisper-client/src/adapter/openai/mod.rs
+++ b/owhisper/owhisper-client/src/adapter/openai/mod.rs
@@ -3,6 +3,12 @@ mod live;
pub(crate) const DEFAULT_WS_HOST: &str = "api.openai.com";
pub(crate) const WS_PATH: &str = "/v1/realtime";
+
+// OpenAI STT Models:
+// - whisper-1: Legacy model, supports verbose_json with word timestamps (batch only)
+// - gpt-4o-transcribe: High quality, supports both batch (json only) and realtime
+// - gpt-4o-mini-transcribe: Cost-efficient, supports both batch (json only) and realtime
+// - gpt-4o-transcribe-diarize: Speaker diarization (batch only, not yet supported here)
pub(crate) const DEFAULT_TRANSCRIPTION_MODEL: &str = "gpt-4o-transcribe";
#[derive(Clone, Default)]