diff --git a/async-openai/src/audio.rs b/async-openai/src/audio.rs index 08ea7ac0..0c3a8add 100644 --- a/async-openai/src/audio.rs +++ b/async-openai/src/audio.rs @@ -1,9 +1,11 @@ +use bytes::Bytes; + use crate::{ config::Config, error::OpenAIError, types::{ CreateSpeechRequest, CreateSpeechResponse, CreateTranscriptionRequest, - CreateTranscriptionResponse, CreateTranslationRequest, CreateTranslationResponse, + CreateTranscriptionResponseJson, CreateTranscriptionResponseVerboseJson, CreateTranslationRequest, CreateTranslationResponse, }, Client, }; @@ -23,12 +25,32 @@ impl<'c, C: Config> Audio<'c, C> { pub async fn transcribe( &self, request: CreateTranscriptionRequest, - ) -> Result { + ) -> Result { + self.client + .post_form("/audio/transcriptions", request) + .await + } + + /// Transcribes audio into the input language, returning the verbose JSON response (use with `AudioResponseFormat::VerboseJson`). + pub async fn transcribe_verbose_json( + &self, + request: CreateTranscriptionRequest, + ) -> Result { self.client .post_form("/audio/transcriptions", request) .await } + /// Transcribes audio into the input language, returning the raw response body (e.g. for `AudioResponseFormat::Srt`). + pub async fn transcribe_raw( + &self, + request: CreateTranscriptionRequest, + ) -> Result { + self.client + .post_form_raw("/audio/transcriptions", request) + .await + } + /// Translates audio into into English. pub async fn translate( &self, diff --git a/async-openai/src/client.rs b/async-openai/src/client.rs index a4480daa..e4b567eb 100644 --- a/async-openai/src/client.rs +++ b/async-openai/src/client.rs @@ -222,6 +222,25 @@ impl Client { self.execute(request_maker).await } + /// POST a form at {path} and return the response body + pub(crate) async fn post_form_raw(&self, path: &str, form: F) -> Result + where + reqwest::multipart::Form: async_convert::TryFrom, + F: Clone, + { + let request_maker = || async { + Ok(self + .http_client + .post(self.config.url(path)) + .query(&self.config.query()) + .headers(self.config.headers()) + .multipart(async_convert::TryFrom::try_from(form.clone()).await?) + .build()?) 
+ }; + + self.execute_raw(request_maker).await + } + /// POST a form at {path} and deserialize the response body pub(crate) async fn post_form(&self, path: &str, form: F) -> Result where diff --git a/async-openai/src/types/audio.rs b/async-openai/src/types/audio.rs index ca655c27..15519ace 100644 --- a/async-openai/src/types/audio.rs +++ b/async-openai/src/types/audio.rs @@ -96,11 +96,84 @@ pub struct CreateTranscriptionRequest { pub timestamp_granularities: Option>, } +/// Represents a transcription response returned by the model, based on the provided +/// input. #[derive(Debug, Deserialize, Clone, Serialize)] -pub struct CreateTranscriptionResponse { +pub struct CreateTranscriptionResponseJson { + /// The transcribed text. pub text: String, } +/// Represents a verbose JSON transcription response returned by the model, based on +/// the provided input. +#[derive(Debug, Deserialize, Clone, Serialize)] +pub struct CreateTranscriptionResponseVerboseJson { + /// The language of the input audio. + pub language: String, + + /// The duration of the input audio. + pub duration: f32, + + /// The transcribed text. + pub text: String, + + /// Extracted words and their corresponding timestamps. + #[serde(skip_serializing_if = "Option::is_none")] + pub words: Option>, + + /// Segments of the transcribed text and their corresponding details. + #[serde(skip_serializing_if = "Option::is_none")] + pub segments: Option>, +} + +#[derive(Debug, Deserialize, Clone, Serialize)] +pub struct TranscriptionWord { + /// The text content of the word. + pub word: String, + + /// Start time of the word in seconds. + pub start: f32, + + /// End time of the word in seconds. + pub end: f32, +} + +#[derive(Debug, Deserialize, Clone, Serialize)] +pub struct TranscriptionSegment { + /// Unique identifier of the segment. + pub id: i32, + + /// Seek offset of the segment. + pub seek: i32, + + /// Start time of the segment in seconds. + pub start: f32, + + /// End time of the segment in seconds. 
+ pub end: f32, + + /// Text content of the segment. + pub text: String, + + /// Array of token IDs for the text content. + pub tokens: Vec, + + /// Temperature parameter used for generating the segment. + pub temperature: f32, + + /// Average logprob of the segment. If the value is lower than -1, consider + /// the logprobs failed. + pub avg_logprob: f32, + + /// Compression ratio of the segment. If the value is greater than 2.4, + /// consider the compression failed. + pub compression_ratio: f32, + + /// Probability of no speech in the segment. If the value is higher than 1.0 + /// and the `avg_logprob` is below -1, consider this segment silent. + pub no_speech_prob: f32, +} + #[derive(Clone, Default, Debug, Builder, PartialEq, Serialize)] #[builder(name = "CreateSpeechRequestArgs")] #[builder(pattern = "mutable")] diff --git a/async-openai/src/types/impls.rs b/async-openai/src/types/impls.rs index a77c1cba..991dfbc8 100644 --- a/async-openai/src/types/impls.rs +++ b/async-openai/src/types/impls.rs @@ -23,7 +23,7 @@ use super::{ CreateImageEditRequest, CreateImageVariationRequest, CreateSpeechResponse, CreateTranscriptionRequest, CreateTranslationRequest, DallE2ImageSize, EmbeddingInput, FileInput, FunctionName, Image, ImageInput, ImageModel, ImageSize, ImageUrl, ImagesResponse, - ModerationInput, Prompt, ResponseFormat, Role, Stop, + ModerationInput, Prompt, ResponseFormat, Role, Stop, TimestampGranularity, }; /// for `impl_from!(T, Enum)`, implements @@ -228,6 +228,19 @@ impl Display for AudioResponseFormat { } } +impl Display for TimestampGranularity { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + write!( + f, + "{}", + match self { + TimestampGranularity::Word => "word", + TimestampGranularity::Segment => "segment", + } + ) + } +} + impl Display for Role { fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { write!( @@ -642,6 +655,12 @@ impl async_convert::TryFrom for reqwest::multipart:: form = form.text("language", 
language); } + if let Some(timestamp_granularities) = request.timestamp_granularities { + for tg in timestamp_granularities { + form = form.text("timestamp_granularities[]", tg.to_string()); + } + } + Ok(form) } } diff --git a/examples/audio-transcribe/src/main.rs b/examples/audio-transcribe/src/main.rs index 851ae2eb..de414b69 100644 --- a/examples/audio-transcribe/src/main.rs +++ b/examples/audio-transcribe/src/main.rs @@ -1,8 +1,18 @@ -use async_openai::{types::CreateTranscriptionRequestArgs, Client}; +use async_openai::{ + types::{AudioResponseFormat, CreateTranscriptionRequestArgs, TimestampGranularity}, + Client +}; use std::error::Error; #[tokio::main] async fn main() -> Result<(), Box> { + transcribe_json().await?; + transcribe_verbose_json().await?; + transcribe_srt().await?; + Ok(()) +} + +async fn transcribe_json() -> Result<(), Box> { let client = Client::new(); // Credits and Source for audio: https://www.youtube.com/watch?v=oQnDVqGIv4s let request = CreateTranscriptionRequestArgs::default() @@ -10,11 +20,49 @@ async fn main() -> Result<(), Box> { "./audio/A Message From Sir David Attenborough A Perfect Planet BBC Earth_320kbps.mp3", ) .model("whisper-1") + .response_format(AudioResponseFormat::Json) .build()?; let response = client.audio().transcribe(request).await?; + println!("{}", response.text); + Ok(()) +} + +async fn transcribe_verbose_json() -> Result<(), Box> { + let client = Client::new(); + let request = CreateTranscriptionRequestArgs::default() + .file( + "./audio/A Message From Sir David Attenborough A Perfect Planet BBC Earth_320kbps.mp3", + ) + .model("whisper-1") + .response_format(AudioResponseFormat::VerboseJson) + .timestamp_granularities(vec![TimestampGranularity::Word, TimestampGranularity::Segment]) + .build()?; + + let response = client.audio().transcribe_verbose_json(request).await?; println!("{}", response.text); + if let Some(words) = &response.words { + println!("- {} words", words.len()); + } + if let Some(segments) = 
&response.segments { + println!("- {} segments", segments.len()); + } + + Ok(()) +} + +async fn transcribe_srt() -> Result<(), Box> { + let client = Client::new(); + let request = CreateTranscriptionRequestArgs::default() + .file( + "./audio/A Message From Sir David Attenborough A Perfect Planet BBC Earth_320kbps.mp3", + ) + .model("whisper-1") + .response_format(AudioResponseFormat::Srt) + .build()?; + let response = client.audio().transcribe_raw(request).await?; + println!("{}", String::from_utf8_lossy(response.as_ref())); Ok(()) }