diff --git a/apps/desktop/src/components/settings/ai/stt/shared.tsx b/apps/desktop/src/components/settings/ai/stt/shared.tsx index e7875af92e..912ab8e3ff 100644 --- a/apps/desktop/src/components/settings/ai/stt/shared.tsx +++ b/apps/desktop/src/components/settings/ai/stt/shared.tsx @@ -23,18 +23,6 @@ type Provider = { export type ProviderId = (typeof PROVIDERS)[number]["id"]; -type ProviderModels = { - hyprnote: (typeof PROVIDERS)[0]["models"]; - deepgram: (typeof PROVIDERS)[1]["models"]; - custom: (typeof PROVIDERS)[2]["models"]; - groq: (typeof PROVIDERS)[3]["models"]; - fireworks: (typeof PROVIDERS)[4]["models"]; -}; - -type LanguageSupportMap = { - [K in ProviderId]: Partial>; -}; - export const displayModelId = (model: string) => { if (model === "cloud") { return "Cloud"; @@ -142,247 +130,6 @@ export const sttProviderRequiresPro = (providerId: ProviderId) => PROVIDERS.find((provider) => provider.id === providerId)?.requiresPro ?? false; -export const LANGUAGE_SUPPORT: LanguageSupportMap = { - hyprnote: { - QuantizedTinyEn: ["en"], - QuantizedSmallEn: ["en"], - "am-parakeet-v2": ["en"], - "am-parakeet-v3": [ - "en", - "de", - "es", - "fr", - "nl", - "it", - "da", - "et", - "fi", - "el", - "hu", - "lv", - "lt", - "mt", - "pl", - "pt", - "ro", - "sk", - "sl", - "sv", - "ru", - "uk", - "bg", - "hr", - "cs", - ], - "am-whisper-large-v3": [ - "af", - "am", - "ar", - "as", - "az", - "ba", - "be", - "bg", - "bn", - "bo", - "br", - "bs", - "ca", - "cs", - "cy", - "da", - "de", - "el", - "en", - "es", - "et", - "eu", - "fa", - "fi", - "fo", - "fr", - "gl", - "gu", - "ha", - "he", - "hi", - "hr", - "ht", - "hu", - "hy", - "id", - "is", - "it", - "ja", - "jv", - "ka", - "kk", - "km", - "kn", - "ko", - "la", - "lb", - "lo", - "lt", - "lv", - "mg", - "mi", - "mk", - "ml", - "mn", - "mr", - "ms", - "mt", - "my", - "ne", - "nl", - "nn", - "no", - "oc", - "pa", - "pl", - "ps", - "pt", - "ro", - "ru", - "sa", - "sd", - "si", - "sk", - "sl", - "sn", - "so", - "sq", - "sr", - 
"su", - "sv", - "sw", - "ta", - "te", - "tg", - "th", - "tk", - "tl", - "tr", - "tt", - "uk", - "ur", - "uz", - "vi", - "yi", - "yo", - "zh", - ], - }, - deepgram: { - // https://developers.deepgram.com/docs/models-languages-overview#nova-3 - "nova-3-general": [ - "en", - "en-US", - "en-AU", - "en-GB", - "en-IN", - "en-NZ", - "de", - "nl", - "sv", - "sv-SE", - "da", - "da-DK", - "es", - "es-419", - "fr", - "fr-CA", - "pt", - "pt-BR", - "pt-PT", - "it", - "tr", - "no", - "id", - ], - "nova-3-medical": [ - "en", - "en-US", - "en-AU", - "en-CA", - "en-GB", - "en-IE", - "en-IN", - "en-NZ", - ], - // https://developers.deepgram.com/docs/models-languages-overview#nova-2 - "nova-2-general": [ - "bg", - "ca", - "zh", - "zh-CN", - "zh-Hans", - "zh-TW", - "zh-Hant", - "zh-HK", - "cs", - "da", - "da-DK", - "nl", - "en", - "en-US", - "en-AU", - "en-GB", - "en-NZ", - "en-IN", - "et", - "fi", - "nl-BE", - "fr", - "fr-CA", - "de", - "de-CH", - "el", - "hi", - "hu", - "id", - "it", - "ja", - "ko", - "ko-KR", - "lv", - "lt", - "ms", - "no", - "pl", - "pt", - "pt-BR", - "pt-PT", - "ro", - "ru", - "sk", - "es", - "es-419", - "sv", - "sv-SE", - "th", - "th-TH", - "tr", - "uk", - "vi", - ], - "nova-2-meeting": ["en", "en-US"], - "nova-2-phonecall": ["en", "en-US"], - "nova-2-finance": ["en", "en-US"], - "nova-2-conversationalai": ["en", "en-US"], - "nova-2-voicemail": ["en", "en-US"], - "nova-2-video": ["en", "en-US"], - "nova-2-medical": ["en", "en-US"], - "nova-2-drivethru": ["en", "en-US"], - "nova-2-automotive": ["en", "en-US"], - "nova-2-atc": ["en", "en-US"], - }, - custom: {}, - groq: {}, - fireworks: {}, -}; - export const sttModelQueries = { isDownloaded: (model: SupportedSttModel) => queryOptions({ diff --git a/apps/desktop/src/components/settings/general/index.tsx b/apps/desktop/src/components/settings/general/index.tsx index a1e8192c3f..74a10effed 100644 --- a/apps/desktop/src/components/settings/general/index.tsx +++ b/apps/desktop/src/components/settings/general/index.tsx 
@@ -9,6 +9,7 @@ import * as main from "../../../store/tinybase/main"; import { AppSettingsView } from "./app-settings"; import { MainLanguageView } from "./main-language"; import { Permissions } from "./permissions"; +import { SpokenLanguagesView } from "./spoken-languages"; export function SettingsGeneral() { const value = useConfigValues([ @@ -134,6 +135,15 @@ export function SettingsGeneral() { /> )} + + {(field) => ( + field.handleChange(val)} + supportedLanguages={SUPPORTED_LANGUAGES} + /> + )} + diff --git a/apps/desktop/src/components/settings/general/spoken-languages.tsx b/apps/desktop/src/components/settings/general/spoken-languages.tsx index 50612038d8..b34f12a409 100644 --- a/apps/desktop/src/components/settings/general/spoken-languages.tsx +++ b/apps/desktop/src/components/settings/general/spoken-languages.tsx @@ -14,6 +14,11 @@ interface SpokenLanguagesViewProps { supportedLanguages: ISO_639_1_CODE[]; } +function getLanguageName(code: string): string { + const lang = LANGUAGES_ISO_639_1[code as ISO_639_1_CODE]; + return lang?.name ?? 
code; +} + export function SpokenLanguagesView({ value, onChange, @@ -28,14 +33,12 @@ export function SpokenLanguagesView({ return []; } const query = languageSearchQuery.toLowerCase(); - return supportedLanguages - .filter((langCode) => { - const langName = LANGUAGES_ISO_639_1[langCode].name; - return ( - !value.includes(langName) && langName.toLowerCase().includes(query) - ); - }) - .map((langCode) => LANGUAGES_ISO_639_1[langCode].name); + return supportedLanguages.filter((langCode) => { + const langName = LANGUAGES_ISO_639_1[langCode].name; + return ( + !value.includes(langCode) && langName.toLowerCase().includes(query) + ); + }); }, [languageSearchQuery, value, supportedLanguages]); const handleLanguageKeyDown = (e: React.KeyboardEvent) => { @@ -63,7 +66,8 @@ export function SpokenLanguagesView({ languageSelectedIndex >= 0 && languageSelectedIndex < filteredLanguages.length ) { - onChange([...value, filteredLanguages[languageSelectedIndex]]); + const selectedCode = filteredLanguages[languageSelectedIndex]; + onChange([...value, selectedCode]); setLanguageSearchQuery(""); setLanguageSelectedIndex(-1); } @@ -90,13 +94,13 @@ export function SpokenLanguagesView({ document.getElementById("language-search-input")?.focus() } > - {value.map((lang) => ( + {value.map((code) => ( - {lang} + {getLanguageName(code)} )) ) : ( diff --git a/owhisper/owhisper-client/src/adapter/argmax.rs b/owhisper/owhisper-client/src/adapter/argmax.rs index c43ec2079d..23ab712f4e 100644 --- a/owhisper/owhisper-client/src/adapter/argmax.rs +++ b/owhisper/owhisper-client/src/adapter/argmax.rs @@ -6,18 +6,51 @@ use owhisper_interface::ListenParams; use super::{BatchFuture, DeepgramAdapter, SttAdapter}; +const PARAKEET_V3_LANGS: &[&str] = &[ + "bg", "cs", "da", "de", "el", "en", "es", "et", "fi", "fr", "hr", "hu", "it", "lt", "lv", "mt", + "nl", "pl", "pt", "ro", "ru", "sk", "sl", "sv", "uk", +]; + #[derive(Clone, Default)] pub struct ArgmaxAdapter { inner: DeepgramAdapter, } +impl ArgmaxAdapter { 
+ fn adapt_params(params: &ListenParams) -> ListenParams { + let mut adapted = params.clone(); + let model = params.model.as_deref().unwrap_or(""); + + let lang = if model.contains("parakeet") && model.contains("v2") { + hypr_language::ISO639::En.into() + } else if model.contains("parakeet") && model.contains("v3") { + params + .languages + .iter() + .find(|lang| PARAKEET_V3_LANGS.contains(&lang.iso639().code())) + .cloned() + .unwrap_or_else(|| hypr_language::ISO639::En.into()) + } else { + params + .languages + .first() + .cloned() + .unwrap_or_else(|| hypr_language::ISO639::En.into()) + }; + + adapted.languages = vec![lang]; + adapted + } +} + impl SttAdapter for ArgmaxAdapter { fn supports_native_multichannel(&self) -> bool { false } fn build_ws_url(&self, api_base: &str, params: &ListenParams, channels: u8) -> url::Url { - self.inner.build_ws_url(api_base, params, channels) + let adapted = Self::adapt_params(params); + self.inner.build_ws_url(api_base, &adapted, channels) } fn build_auth_header(&self, api_key: Option<&str>) -> Option<(&'static str, String)> { diff --git a/owhisper/owhisper-client/src/adapter/deepgram.rs b/owhisper/owhisper-client/src/adapter/deepgram.rs index 30cfee2dc7..1cb4c7138b 100644 --- a/owhisper/owhisper-client/src/adapter/deepgram.rs +++ b/owhisper/owhisper-client/src/adapter/deepgram.rs @@ -5,10 +5,14 @@ use hypr_ws::client::Message; use owhisper_interface::batch::Response as BatchResponse; use owhisper_interface::stream::StreamResponse; use owhisper_interface::ListenParams; +use url::form_urlencoded::Serializer; +use url::UrlQuery; use super::{BatchFuture, SttAdapter}; use crate::error::Error; -use crate::{append_keyword_query, append_language_query}; + +const NOVA2_MULTI_LANGS: &[&str] = &["en", "es"]; +const NOVA3_MULTI_LANGS: &[&str] = &["en", "es", "fr", "de", "hi", "ru", "pt", "ja", "it", "nl"]; #[derive(Clone, Default)] pub struct DeepgramAdapter; @@ -193,6 +197,76 @@ impl SttAdapter for DeepgramAdapter { } } +fn 
can_use_multi(model: &str, languages: &[hypr_language::Language]) -> bool { + if languages.len() < 2 { + return false; + } + + let multi_langs: &[&str] = if model.contains("nova-3") { + NOVA3_MULTI_LANGS + } else if model.contains("nova-2") { + NOVA2_MULTI_LANGS + } else { + return false; + }; + + languages + .iter() + .all(|lang| multi_langs.contains(&lang.iso639().code())) +} + +fn append_keyword_query<'a>(query_pairs: &mut Serializer<'a, UrlQuery>, params: &ListenParams) { + if params.keywords.is_empty() { + return; + } + + let use_keyterms = params + .model + .as_ref() + .map(|model| model.contains("nova-3") || model.contains("parakeet")) + .unwrap_or(false); + + let param_name = if use_keyterms { "keyterm" } else { "keywords" }; + + for keyword in &params.keywords { + query_pairs.append_pair(param_name, keyword); + } +} + +pub(crate) fn append_language_query<'a>( + query_pairs: &mut Serializer<'a, UrlQuery>, + params: &ListenParams, +) { + let model = params.model.as_deref().unwrap_or(""); + + match params.languages.len() { + 0 => { + query_pairs.append_pair("detect_language", "true"); + } + 1 => { + if let Some(language) = params.languages.first() { + let code = language.iso639().code(); + query_pairs.append_pair("language", code); + } + } + _ => { + if can_use_multi(model, &params.languages) { + query_pairs.append_pair("language", "multi"); + for language in &params.languages { + let code = language.iso639().code(); + query_pairs.append_pair("languages", code); + } + } else { + query_pairs.append_pair("detect_language", "true"); + for language in &params.languages { + let code = language.iso639().code(); + query_pairs.append_pair("languages", code); + } + } + } + } +} + async fn decode_audio_to_linear16(path: PathBuf) -> Result<(bytes::Bytes, u32), Error> { tokio::task::spawn_blocking(move || -> Result<(bytes::Bytes, u32), Error> { let decoder = diff --git a/owhisper/owhisper-client/src/adapter/soniox.rs b/owhisper/owhisper-client/src/adapter/soniox.rs index
f90a6aaad4..a64b9ff082 100644 --- a/owhisper/owhisper-client/src/adapter/soniox.rs +++ b/owhisper/owhisper-client/src/adapter/soniox.rs @@ -90,6 +90,12 @@ impl SonioxAdapter { params: &ListenParams, file_id: &str, ) -> Result { + #[derive(Serialize)] + struct Context { + #[serde(skip_serializing_if = "Vec::is_empty")] + terms: Vec, + } + #[derive(Serialize)] struct CreateTranscriptionRequest<'a> { model: &'a str, @@ -98,16 +104,27 @@ impl SonioxAdapter { language_hints: Vec, enable_speaker_diarization: bool, enable_language_identification: bool, + #[serde(skip_serializing_if = "Option::is_none")] + context: Option<Context>, } let model = params.model.as_deref().unwrap_or("stt-async-preview"); + let context = if params.keywords.is_empty() { + None + } else { + Some(Context { + terms: params.keywords.clone(), + }) + }; + let request = CreateTranscriptionRequest { model, file_id, language_hints: Self::language_hints(params), enable_speaker_diarization: true, enable_language_identification: true, + context, }; let url = format!("{}/v1/transcriptions", Self::api_base_url(api_base)); @@ -369,6 +386,13 @@ impl SttAdapter for SonioxAdapter { } }; + #[derive(Serialize)] + struct Context { + #[serde(skip_serializing_if = "Vec::is_empty")] + terms: Vec, + } + + // https://soniox.com/docs/stt/api-reference/websocket-api#configuration #[derive(Serialize)] struct SonioxConfig<'a> { api_key: &'a str, @@ -381,10 +405,20 @@ impl SttAdapter for SonioxAdapter { include_nonfinal: bool, enable_endpoint_detection: bool, enable_speaker_diarization: bool, + #[serde(skip_serializing_if = "Option::is_none")] + context: Option<Context>, } let model = params.model.as_deref().unwrap_or("stt-rt-preview"); + let context = if params.keywords.is_empty() { + None + } else { + Some(Context { + terms: params.keywords.clone(), + }) + }; + let cfg = SonioxConfig { api_key, model, @@ -395,6 +429,7 @@ impl SttAdapter for SonioxAdapter { include_nonfinal: true, enable_endpoint_detection: true, enable_speaker_diarization:
true, + context, }; let json = serde_json::to_string(&cfg).unwrap(); diff --git a/owhisper/owhisper-client/src/lib.rs b/owhisper/owhisper-client/src/lib.rs index 5b6012730c..719337facd 100644 --- a/owhisper/owhisper-client/src/lib.rs +++ b/owhisper/owhisper-client/src/lib.rs @@ -5,9 +5,6 @@ mod live; use std::marker::PhantomData; -use url::form_urlencoded::Serializer; -use url::UrlQuery; - pub use adapter::{ArgmaxAdapter, DeepgramAdapter, SonioxAdapter, SttAdapter}; pub use batch::BatchClient; pub use error::Error; @@ -122,49 +119,3 @@ impl ListenClientBuilder { } } } - -pub(crate) fn append_language_query<'a>( - query_pairs: &mut Serializer<'a, UrlQuery>, - params: &owhisper_interface::ListenParams, -) { - match params.languages.len() { - 0 => { - query_pairs.append_pair("detect_language", "true"); - } - 1 => { - if let Some(language) = params.languages.first() { - let code = language.iso639().code(); - query_pairs.append_pair("language", code); - query_pairs.append_pair("languages", code); - } - } - _ => { - query_pairs.append_pair("language", "multi"); - for language in &params.languages { - let code = language.iso639().code(); - query_pairs.append_pair("languages", code); - } - } - } -} - -pub(crate) fn append_keyword_query<'a>( - query_pairs: &mut Serializer<'a, UrlQuery>, - params: &owhisper_interface::ListenParams, -) { - if params.keywords.is_empty() { - return; - } - - let use_keyterms = params - .model - .as_ref() - .map(|model| model.contains("nova-3")) - .unwrap_or(false); - - let param_name = if use_keyterms { "keyterm" } else { "keywords" }; - - for keyword in &params.keywords { - query_pairs.append_pair(param_name, keyword); - } -}