From 16e094b5e50e4b61e5d1f6519282a12672fc71a3 Mon Sep 17 00:00:00 2001 From: Stainless Bot Date: Thu, 17 Oct 2024 16:52:53 +0000 Subject: [PATCH] feat(api): add gpt-4o-audio-preview model for chat completions (#1135) This enables audio inputs and outputs. https://platform.openai.com/docs/guides/audio --- .stats.yml | 2 +- api.md | 4 + src/index.ts | 4 + src/lib/AbstractChatCompletionRunner.ts | 4 +- src/resources/beta/assistants.ts | 10 ++ src/resources/chat/chat.ts | 7 + src/resources/chat/completions.ts | 153 ++++++++++++++++++- src/resources/chat/index.ts | 4 + tests/api-resources/chat/completions.test.ts | 2 + 9 files changed, 183 insertions(+), 7 deletions(-) diff --git a/.stats.yml b/.stats.yml index 68789976b..984e8a8d5 100644 --- a/.stats.yml +++ b/.stats.yml @@ -1,2 +1,2 @@ configured_endpoints: 68 -openapi_spec_url: https://storage.googleapis.com/stainless-sdk-openapi-specs/openai-71e58a77027c67e003fdd1b1ac8ac11557d8bfabc7666d1a827c6b1ca8ab98b5.yml +openapi_spec_url: https://storage.googleapis.com/stainless-sdk-openapi-specs/openai-8729aaa35436531ab453224af10e67f89677db8f350f0346bb3537489edea649.yml diff --git a/api.md b/api.md index 71027acfd..da60f65bd 100644 --- a/api.md +++ b/api.md @@ -33,9 +33,12 @@ Types: - ChatCompletion - ChatCompletionAssistantMessageParam +- ChatCompletionAudio +- ChatCompletionAudioParam - ChatCompletionChunk - ChatCompletionContentPart - ChatCompletionContentPartImage +- ChatCompletionContentPartInputAudio - ChatCompletionContentPartRefusal - ChatCompletionContentPartText - ChatCompletionFunctionCallOption @@ -43,6 +46,7 @@ Types: - ChatCompletionMessage - ChatCompletionMessageParam - ChatCompletionMessageToolCall +- ChatCompletionModality - ChatCompletionNamedToolChoice - ChatCompletionRole - ChatCompletionStreamOptions diff --git a/src/index.ts b/src/index.ts index d3e1d2a78..56108223a 100644 --- a/src/index.ts +++ b/src/index.ts @@ -250,9 +250,12 @@ export namespace OpenAI { export import ChatModel = API.ChatModel; export import ChatCompletion = API.ChatCompletion; export import ChatCompletionAssistantMessageParam = API.ChatCompletionAssistantMessageParam; + export import ChatCompletionAudio = API.ChatCompletionAudio; + export import ChatCompletionAudioParam = API.ChatCompletionAudioParam; export import ChatCompletionChunk = API.ChatCompletionChunk; export import ChatCompletionContentPart = API.ChatCompletionContentPart; export import ChatCompletionContentPartImage = API.ChatCompletionContentPartImage; + export import ChatCompletionContentPartInputAudio = API.ChatCompletionContentPartInputAudio; export import ChatCompletionContentPartRefusal = API.ChatCompletionContentPartRefusal; export import ChatCompletionContentPartText = API.ChatCompletionContentPartText; export import ChatCompletionFunctionCallOption = API.ChatCompletionFunctionCallOption; @@ -260,6 +263,7 @@ export namespace OpenAI { export import ChatCompletionMessage = API.ChatCompletionMessage; export import ChatCompletionMessageParam = API.ChatCompletionMessageParam; export import ChatCompletionMessageToolCall = API.ChatCompletionMessageToolCall; + export import ChatCompletionModality = API.ChatCompletionModality; export import ChatCompletionNamedToolChoice = API.ChatCompletionNamedToolChoice; export import ChatCompletionRole = API.ChatCompletionRole; export import ChatCompletionStreamOptions = API.ChatCompletionStreamOptions; diff --git a/src/lib/AbstractChatCompletionRunner.ts b/src/lib/AbstractChatCompletionRunner.ts index 39ee4e993..e943a4e4f 100644 --- 
a/src/lib/AbstractChatCompletionRunner.ts +++ b/src/lib/AbstractChatCompletionRunner.ts @@ -105,7 +105,9 @@ export class AbstractChatCompletionRunner< const message = this.messages[i]; if (isAssistantMessage(message)) { const { function_call, ...rest } = message; - const ret: ChatCompletionMessage = { + + // TODO: support audio here + const ret: Omit<ChatCompletionMessage, 'audio'> = { ...rest, content: (message as ChatCompletionMessage).content ?? null, refusal: (message as ChatCompletionMessage).refusal ?? null, diff --git a/src/resources/beta/assistants.ts b/src/resources/beta/assistants.ts index 410d520b0..aa7362297 100644 --- a/src/resources/beta/assistants.ts +++ b/src/resources/beta/assistants.ts @@ -298,6 +298,11 @@ export namespace AssistantStreamEvent { data: ThreadsAPI.Thread; event: 'thread.created'; + + /** + * Whether to enable input audio transcription. + */ + enabled?: boolean; } /** @@ -1084,6 +1089,11 @@ export interface ThreadStreamEvent { data: ThreadsAPI.Thread; event: 'thread.created'; + + /** + * Whether to enable input audio transcription. + */ + enabled?: boolean; } export interface AssistantCreateParams { diff --git a/src/resources/chat/chat.ts b/src/resources/chat/chat.ts index 5bc7de955..43ef5662c 100644 --- a/src/resources/chat/chat.ts +++ b/src/resources/chat/chat.ts @@ -16,7 +16,10 @@ export type ChatModel = | 'gpt-4o' | 'gpt-4o-2024-08-06' | 'gpt-4o-2024-05-13' + | 'gpt-4o-realtime-preview' | 'gpt-4o-realtime-preview-2024-10-01' + | 'gpt-4o-audio-preview' + | 'gpt-4o-audio-preview-2024-10-01' | 'chatgpt-4o-latest' | 'gpt-4o-mini' | 'gpt-4o-mini-2024-07-18' @@ -45,9 +48,12 @@ export namespace Chat { export import Completions = CompletionsAPI.Completions; export import ChatCompletion = CompletionsAPI.ChatCompletion; export import ChatCompletionAssistantMessageParam = CompletionsAPI.ChatCompletionAssistantMessageParam; + export import ChatCompletionAudio = CompletionsAPI.ChatCompletionAudio; + export import ChatCompletionAudioParam = CompletionsAPI.ChatCompletionAudioParam; export import ChatCompletionChunk = CompletionsAPI.ChatCompletionChunk; export import ChatCompletionContentPart = CompletionsAPI.ChatCompletionContentPart; export import ChatCompletionContentPartImage = CompletionsAPI.ChatCompletionContentPartImage; + export import ChatCompletionContentPartInputAudio = CompletionsAPI.ChatCompletionContentPartInputAudio; export import ChatCompletionContentPartRefusal = CompletionsAPI.ChatCompletionContentPartRefusal; export import ChatCompletionContentPartText = CompletionsAPI.ChatCompletionContentPartText; export import ChatCompletionFunctionCallOption = CompletionsAPI.ChatCompletionFunctionCallOption; @@ -55,6 +61,7 @@ export namespace Chat { export import ChatCompletionMessage = CompletionsAPI.ChatCompletionMessage; export import ChatCompletionMessageParam = CompletionsAPI.ChatCompletionMessageParam; export import ChatCompletionMessageToolCall = CompletionsAPI.ChatCompletionMessageToolCall; + export import ChatCompletionModality = CompletionsAPI.ChatCompletionModality; export import ChatCompletionNamedToolChoice = CompletionsAPI.ChatCompletionNamedToolChoice; export import ChatCompletionRole = CompletionsAPI.ChatCompletionRole; export import ChatCompletionStreamOptions = CompletionsAPI.ChatCompletionStreamOptions; diff --git a/src/resources/chat/completions.ts b/src/resources/chat/completions.ts index 27aebdc4c..97174ec1b 100644 --- a/src/resources/chat/completions.ts +++ b/src/resources/chat/completions.ts @@ -11,7 +11,10 @@ import { Stream } from '../../streaming'; export class
Completions extends APIResource { /** - * Creates a model response for the given chat conversation. + * Creates a model response for the given chat conversation. Learn more in the + * [text generation](https://platform.openai.com/docs/guides/text-generation), + * [vision](https://platform.openai.com/docs/guides/vision), and + * [audio](https://platform.openai.com/docs/guides/audio) guides. */ create( body: ChatCompletionCreateParamsNonStreaming, @@ -138,6 +141,12 @@ export interface ChatCompletionAssistantMessageParam { */ role: 'assistant'; + /** + * Data about a previous audio response from the model. + * [Learn more](https://platform.openai.com/docs/guides/audio). + */ + audio?: ChatCompletionAssistantMessageParam.Audio | null; + /** * The contents of the assistant message. Required unless `tool_calls` or * `function_call` is specified. @@ -168,6 +177,17 @@ export interface ChatCompletionAssistantMessageParam { } export namespace ChatCompletionAssistantMessageParam { + /** + * Data about a previous audio response from the model. + * [Learn more](https://platform.openai.com/docs/guides/audio). + */ + export interface Audio { + /** + * Unique identifier for a previous audio response from the model. + */ + id: string; + } + /** * @deprecated: Deprecated and replaced by `tool_calls`. The name and arguments of * a function that should be called, as generated by the model. @@ -188,6 +208,54 @@ export namespace ChatCompletionAssistantMessageParam { } } +/** + * If the audio output modality is requested, this object contains data about the + * audio response from the model. + * [Learn more](https://platform.openai.com/docs/guides/audio). + */ +export interface ChatCompletionAudio { + /** + * Unique identifier for this audio response. + */ + id: string; + + /** + * Base64 encoded audio bytes generated by the model, in the format specified in + * the request. + */ + data: string; + + /** + * The Unix timestamp (in seconds) for when this audio response will no longer be + * accessible on the server for use in multi-turn conversations. + */ + expires_at: number; + + /** + * Transcript of the audio generated by the model. + */ + transcript: string; +} + +/** + * Parameters for audio output. Required when audio output is requested with + * `modalities: ["audio"]`. + * [Learn more](https://platform.openai.com/docs/guides/audio). + */ +export interface ChatCompletionAudioParam { + /** + * Specifies the output audio format. Must be one of `wav`, `mp3`, `flac`, `opus`, + * or `pcm16`. + */ + format: 'wav' | 'mp3' | 'flac' | 'opus' | 'pcm16'; + + /** + * Specifies the voice type. Supported voices are `alloy`, `echo`, `fable`, `onyx`, + * `nova`, and `shimmer`. + */ + voice: 'alloy' | 'echo' | 'fable' | 'onyx' | 'nova' | 'shimmer'; +} + /** * Represents a streamed chunk of a chat completion response returned by model, * based on the provided input. @@ -371,8 +439,18 @@ export namespace ChatCompletionChunk { } } -export type ChatCompletionContentPart = ChatCompletionContentPartText | ChatCompletionContentPartImage; +/** + * Learn about + * [text inputs](https://platform.openai.com/docs/guides/text-generation). + */ +export type ChatCompletionContentPart = + | ChatCompletionContentPartText + | ChatCompletionContentPartImage + | ChatCompletionContentPartInputAudio; +/** + * Learn about [image inputs](https://platform.openai.com/docs/guides/vision). 
+ */ export interface ChatCompletionContentPartImage { image_url: ChatCompletionContentPartImage.ImageURL; @@ -397,6 +475,32 @@ export namespace ChatCompletionContentPartImage { } } +/** + * Learn about [audio inputs](https://platform.openai.com/docs/guides/audio). + */ +export interface ChatCompletionContentPartInputAudio { + input_audio: ChatCompletionContentPartInputAudio.InputAudio; + + /** + * The type of the content part. Always `input_audio`. + */ + type: 'input_audio'; +} + +export namespace ChatCompletionContentPartInputAudio { + export interface InputAudio { + /** + * Base64 encoded audio data. + */ + data: string; + + /** + * The format of the encoded audio data. Currently supports "wav" and "mp3". + */ + format: 'wav' | 'mp3'; + } +} + export interface ChatCompletionContentPartRefusal { /** * The refusal message generated by the model. @@ -409,6 +513,10 @@ export interface ChatCompletionContentPartRefusal { type: 'refusal'; } +/** + * Learn about + * [text inputs](https://platform.openai.com/docs/guides/text-generation). + */ export interface ChatCompletionContentPartText { /** * The text content. @@ -471,6 +579,13 @@ export interface ChatCompletionMessage { */ role: 'assistant'; + /** + * If the audio output modality is requested, this object contains data about the + * audio response from the model. + * [Learn more](https://platform.openai.com/docs/guides/audio). + */ + audio?: ChatCompletionAudio | null; + /** * @deprecated: Deprecated and replaced by `tool_calls`. The name and arguments of * a function that should be called, as generated by the model. @@ -548,6 +663,8 @@ export namespace ChatCompletionMessageToolCall { } } +export type ChatCompletionModality = 'text' | 'audio'; + /** * Specifies a tool the model should use. Use to force the model to call a specific * function. @@ -743,6 +860,13 @@ export interface ChatCompletionCreateParamsBase { */ model: (string & {}) | ChatAPI.ChatModel; + /** + * Parameters for audio output. Required when audio output is requested with + * `modalities: ["audio"]`. + * [Learn more](https://platform.openai.com/docs/guides/audio). + */ + audio?: ChatCompletionAudioParam | null; + /** * Number between -2.0 and 2.0. Positive values penalize new tokens based on their * existing frequency in the text so far, decreasing the model's likelihood to @@ -812,10 +936,24 @@ export interface ChatCompletionCreateParamsBase { /** * Developer-defined tags and values used for filtering completions in the - * [dashboard](https://platform.openai.com/completions). + * [dashboard](https://platform.openai.com/chat-completions). */ metadata?: Record | null; + /** + * Output types that you would like the model to generate for this request. Most + * models are capable of generating text, which is the default: + * + * `["text"]` + * + * The `gpt-4o-audio-preview` model can also be used to + * [generate audio](https://platform.openai.com/docs/guides/audio). To request that + * this model generate both text and audio responses, you can use: + * + * `["text", "audio"]` + */ + modalities?: Array | null; + /** * How many chat completion choices to generate for each input message. Note that * you will be charged based on the number of generated tokens across all of the @@ -900,8 +1038,9 @@ export interface ChatCompletionCreateParamsBase { stop?: string | null | Array; /** - * Whether or not to store the output of this completion request for traffic - * logging in the [dashboard](https://platform.openai.com/completions). 
+ * Whether or not to store the output of this chat completion request for use in + * our [model distillation](https://platform.openai.com/docs/guides/distillation) + * or [evals](https://platform.openai.com/docs/guides/evals) products. */ store?: boolean | null; @@ -1049,9 +1188,12 @@ export type CompletionCreateParamsStreaming = ChatCompletionCreateParamsStreamin export namespace Completions { export import ChatCompletion = ChatCompletionsAPI.ChatCompletion; export import ChatCompletionAssistantMessageParam = ChatCompletionsAPI.ChatCompletionAssistantMessageParam; + export import ChatCompletionAudio = ChatCompletionsAPI.ChatCompletionAudio; + export import ChatCompletionAudioParam = ChatCompletionsAPI.ChatCompletionAudioParam; export import ChatCompletionChunk = ChatCompletionsAPI.ChatCompletionChunk; export import ChatCompletionContentPart = ChatCompletionsAPI.ChatCompletionContentPart; export import ChatCompletionContentPartImage = ChatCompletionsAPI.ChatCompletionContentPartImage; + export import ChatCompletionContentPartInputAudio = ChatCompletionsAPI.ChatCompletionContentPartInputAudio; export import ChatCompletionContentPartRefusal = ChatCompletionsAPI.ChatCompletionContentPartRefusal; export import ChatCompletionContentPartText = ChatCompletionsAPI.ChatCompletionContentPartText; export import ChatCompletionFunctionCallOption = ChatCompletionsAPI.ChatCompletionFunctionCallOption; @@ -1059,6 +1201,7 @@ export namespace Completions { export import ChatCompletionMessage = ChatCompletionsAPI.ChatCompletionMessage; export import ChatCompletionMessageParam = ChatCompletionsAPI.ChatCompletionMessageParam; export import ChatCompletionMessageToolCall = ChatCompletionsAPI.ChatCompletionMessageToolCall; + export import ChatCompletionModality = ChatCompletionsAPI.ChatCompletionModality; export import ChatCompletionNamedToolChoice = ChatCompletionsAPI.ChatCompletionNamedToolChoice; export import ChatCompletionRole = ChatCompletionsAPI.ChatCompletionRole; export import ChatCompletionStreamOptions = ChatCompletionsAPI.ChatCompletionStreamOptions; diff --git a/src/resources/chat/index.ts b/src/resources/chat/index.ts index 748770948..22803e819 100644 --- a/src/resources/chat/index.ts +++ b/src/resources/chat/index.ts @@ -3,9 +3,12 @@ export { ChatCompletion, ChatCompletionAssistantMessageParam, + ChatCompletionAudio, + ChatCompletionAudioParam, ChatCompletionChunk, ChatCompletionContentPart, ChatCompletionContentPartImage, + ChatCompletionContentPartInputAudio, ChatCompletionContentPartRefusal, ChatCompletionContentPartText, ChatCompletionFunctionCallOption, @@ -13,6 +16,7 @@ export { ChatCompletionMessage, ChatCompletionMessageParam, ChatCompletionMessageToolCall, + ChatCompletionModality, ChatCompletionNamedToolChoice, ChatCompletionRole, ChatCompletionStreamOptions, diff --git a/tests/api-resources/chat/completions.test.ts b/tests/api-resources/chat/completions.test.ts index 4f015b47e..77d4a251c 100644 --- a/tests/api-resources/chat/completions.test.ts +++ b/tests/api-resources/chat/completions.test.ts @@ -27,6 +27,7 @@ describe('resource completions', () => { const response = await client.chat.completions.create({ messages: [{ content: 'string', role: 'system', name: 'name' }], model: 'gpt-4o', + audio: { format: 'wav', voice: 'alloy' }, frequency_penalty: -2, function_call: 'none', functions: [{ name: 'name', description: 'description', parameters: { foo: 'bar' } }], @@ -35,6 +36,7 @@ describe('resource completions', () => { max_completion_tokens: 0, max_tokens: 0, metadata: { foo: 'string' 
}, + modalities: ['text', 'audio'], n: 1, parallel_tool_calls: true, presence_penalty: -2,
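
Usage sketch (illustrative, not generated from the OpenAPI spec): the snippet below exercises the surface added in this patch — `modalities`, the `audio` request parameter, the `input_audio` content part, and the `ChatCompletionAudio` object returned on `message.audio`. It assumes Node 18+, an `OPENAI_API_KEY` in the environment, and hypothetical local files `question.wav` / `answer.wav`; the follow-up turn that references a prior response via `audio: { id }` follows the linked audio guide.

```ts
import { readFile, writeFile } from 'node:fs/promises';
import OpenAI from 'openai';

const client = new OpenAI(); // reads OPENAI_API_KEY from the environment

async function main() {
  // Audio in: send a recording as a base64 `input_audio` content part.
  const recording = await readFile('question.wav'); // illustrative path
  const userTurn: OpenAI.ChatCompletionUserMessageParam = {
    role: 'user',
    content: [
      { type: 'text', text: 'Answer the question in this recording.' },
      { type: 'input_audio', input_audio: { data: recording.toString('base64'), format: 'wav' } },
    ],
  };

  // Audio out: request both modalities; `audio` selects the voice and output format.
  const completion = await client.chat.completions.create({
    model: 'gpt-4o-audio-preview',
    modalities: ['text', 'audio'],
    audio: { voice: 'alloy', format: 'wav' },
    messages: [userTurn],
  });

  const audio = completion.choices[0]?.message.audio; // ChatCompletionAudio | null | undefined
  if (!audio) return;
  console.log(audio.transcript);
  await writeFile('answer.wav', Buffer.from(audio.data, 'base64'));

  // Multi-turn: reference the previous audio response by `id` (usable until `expires_at`)
  // instead of resending the generated audio bytes.
  const followUp = await client.chat.completions.create({
    model: 'gpt-4o-audio-preview',
    modalities: ['text', 'audio'],
    audio: { voice: 'alloy', format: 'wav' },
    messages: [
      userTurn,
      { role: 'assistant', audio: { id: audio.id } },
      { role: 'user', content: 'Now give a one-sentence summary.' },
    ],
  });
  console.log(followUp.choices[0]?.message.audio?.transcript);
}

main().catch(console.error);
```

Referencing the earlier response by `id` avoids re-uploading base64 audio on every turn, which is why `ChatCompletionAssistantMessageParam` gains an `Audio` object and `ChatCompletionAudio` carries an `expires_at` timestamp.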