Skip to content

Commit 25cbb74

Browse files
feat(api): Add support for gpt-4o-transcribe-diarize on audio/transcriptions endpoint
1 parent 8cdfd06 commit 25cbb74

21 files changed: +475 additions, -81 deletions

.stats.yml

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,4 @@
11
configured_endpoints: 136
2-
openapi_spec_url: https://storage.googleapis.com/stainless-sdk-openapi-specs/openai%2Fopenai-11d308a9ef78ad01aa11c880a084a3982276800d7994db3f454aa515474977d7.yml
3-
openapi_spec_hash: 0a4bbb5aa0ae532a072bd6b3854e70b1
4-
config_hash: f0940d0906846178759ef7128e4cb98e
2+
openapi_spec_url: https://storage.googleapis.com/stainless-sdk-openapi-specs/openai%2Fopenai-104cced8f4c7436a76eea02e26307828166405ccfb296faffb008b72772c11a7.yml
3+
openapi_spec_hash: fdc03ed84a65a31b80da909255e53924
4+
config_hash: 03b48e9b8c7231a902403210dbd7dfa0

api.md

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -171,11 +171,14 @@ Types:
171171
```python
172172
from openai.types.audio import (
173173
Transcription,
174+
TranscriptionDiarized,
175+
TranscriptionDiarizedSegment,
174176
TranscriptionInclude,
175177
TranscriptionSegment,
176178
TranscriptionStreamEvent,
177179
TranscriptionTextDeltaEvent,
178180
TranscriptionTextDoneEvent,
181+
TranscriptionTextSegmentEvent,
179182
TranscriptionVerbose,
180183
TranscriptionWord,
181184
TranscriptionCreateResponse,

src/openai/resources/audio/transcriptions.py

Lines changed: 237 additions & 41 deletions
Large diffs are not rendered by default.

src/openai/resources/audio/translations.py

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -349,7 +349,7 @@ def __init__(self, translations: AsyncTranslations) -> None:
349349

350350

351351
def _get_response_format_type(
352-
response_format: Literal["json", "text", "srt", "verbose_json", "vtt"] | Omit,
352+
response_format: AudioResponseFormat | Omit,
353353
) -> type[Translation | TranslationVerbose | str]:
354354
if isinstance(response_format, Omit) or response_format is None: # pyright: ignore[reportUnnecessaryComparison]
355355
return Translation
@@ -360,8 +360,8 @@ def _get_response_format_type(
360360
return TranslationVerbose
361361
elif response_format == "srt" or response_format == "text" or response_format == "vtt":
362362
return str
363-
elif TYPE_CHECKING: # type: ignore[unreachable]
363+
elif TYPE_CHECKING and response_format != "diarized_json": # type: ignore[unreachable]
364364
assert_never(response_format)
365365
else:
366-
log.warn("Unexpected audio response format: %s", response_format)
367-
return Transcription
366+
log.warning("Unexpected audio response format: %s", response_format)
367+
return Translation

src/openai/resources/vector_stores/vector_stores.py

Lines changed: 10 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -79,6 +79,7 @@ def create(
7979
self,
8080
*,
8181
chunking_strategy: FileChunkingStrategyParam | Omit = omit,
82+
description: str | Omit = omit,
8283
expires_after: vector_store_create_params.ExpiresAfter | Omit = omit,
8384
file_ids: SequenceNotStr[str] | Omit = omit,
8485
metadata: Optional[Metadata] | Omit = omit,
@@ -97,6 +98,9 @@ def create(
9798
chunking_strategy: The chunking strategy used to chunk the file(s). If not set, will use the `auto`
9899
strategy. Only applicable if `file_ids` is non-empty.
99100
101+
description: A description for the vector store. Can be used to describe the vector store's
102+
purpose.
103+
100104
expires_after: The expiration policy for a vector store.
101105
102106
file_ids: A list of [File](https://platform.openai.com/docs/api-reference/files) IDs that
@@ -126,6 +130,7 @@ def create(
126130
body=maybe_transform(
127131
{
128132
"chunking_strategy": chunking_strategy,
133+
"description": description,
129134
"expires_after": expires_after,
130135
"file_ids": file_ids,
131136
"metadata": metadata,
@@ -424,6 +429,7 @@ async def create(
424429
self,
425430
*,
426431
chunking_strategy: FileChunkingStrategyParam | Omit = omit,
432+
description: str | Omit = omit,
427433
expires_after: vector_store_create_params.ExpiresAfter | Omit = omit,
428434
file_ids: SequenceNotStr[str] | Omit = omit,
429435
metadata: Optional[Metadata] | Omit = omit,
@@ -442,6 +448,9 @@ async def create(
442448
chunking_strategy: The chunking strategy used to chunk the file(s). If not set, will use the `auto`
443449
strategy. Only applicable if `file_ids` is non-empty.
444450
451+
description: A description for the vector store. Can be used to describe the vector store's
452+
purpose.
453+
445454
expires_after: The expiration policy for a vector store.
446455
447456
file_ids: A list of [File](https://platform.openai.com/docs/api-reference/files) IDs that
@@ -471,6 +480,7 @@ async def create(
471480
body=await async_maybe_transform(
472481
{
473482
"chunking_strategy": chunking_strategy,
483+
"description": description,
474484
"expires_after": expires_after,
475485
"file_ids": file_ids,
476486
"metadata": metadata,

src/openai/types/audio/__init__.py

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -11,10 +11,13 @@
1111
from .transcription_include import TranscriptionInclude as TranscriptionInclude
1212
from .transcription_segment import TranscriptionSegment as TranscriptionSegment
1313
from .transcription_verbose import TranscriptionVerbose as TranscriptionVerbose
14+
from .transcription_diarized import TranscriptionDiarized as TranscriptionDiarized
1415
from .translation_create_params import TranslationCreateParams as TranslationCreateParams
1516
from .transcription_stream_event import TranscriptionStreamEvent as TranscriptionStreamEvent
1617
from .transcription_create_params import TranscriptionCreateParams as TranscriptionCreateParams
1718
from .translation_create_response import TranslationCreateResponse as TranslationCreateResponse
1819
from .transcription_create_response import TranscriptionCreateResponse as TranscriptionCreateResponse
1920
from .transcription_text_done_event import TranscriptionTextDoneEvent as TranscriptionTextDoneEvent
21+
from .transcription_diarized_segment import TranscriptionDiarizedSegment as TranscriptionDiarizedSegment
2022
from .transcription_text_delta_event import TranscriptionTextDeltaEvent as TranscriptionTextDeltaEvent
23+
from .transcription_text_segment_event import TranscriptionTextSegmentEvent as TranscriptionTextSegmentEvent

src/openai/types/audio/transcription_create_params.py

Lines changed: 32 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -5,7 +5,7 @@
55
from typing import List, Union, Optional
66
from typing_extensions import Literal, Required, TypeAlias, TypedDict
77

8-
from ..._types import FileTypes
8+
from ..._types import FileTypes, SequenceNotStr
99
from ..audio_model import AudioModel
1010
from .transcription_include import TranscriptionInclude
1111
from ..audio_response_format import AudioResponseFormat
@@ -29,8 +29,9 @@ class TranscriptionCreateParamsBase(TypedDict, total=False):
2929
model: Required[Union[str, AudioModel]]
3030
"""ID of the model to use.
3131
32-
The options are `gpt-4o-transcribe`, `gpt-4o-mini-transcribe`, and `whisper-1`
33-
(which is powered by our open source Whisper V2 model).
32+
The options are `gpt-4o-transcribe`, `gpt-4o-mini-transcribe`, `whisper-1`
33+
(which is powered by our open source Whisper V2 model), and
34+
`gpt-4o-transcribe-diarize`.
3435
"""
3536

3637
chunking_strategy: Optional[ChunkingStrategy]
@@ -39,7 +40,8 @@ class TranscriptionCreateParamsBase(TypedDict, total=False):
3940
When set to `"auto"`, the server first normalizes loudness and then uses voice
4041
activity detection (VAD) to choose boundaries. `server_vad` object can be
4142
provided to tweak VAD detection parameters manually. If unset, the audio is
42-
transcribed as a single block.
43+
transcribed as a single block. Required when using `gpt-4o-transcribe-diarize`
44+
for inputs longer than 30 seconds.
4345
"""
4446

4547
include: List[TranscriptionInclude]
@@ -48,7 +50,24 @@ class TranscriptionCreateParamsBase(TypedDict, total=False):
4850
return the log probabilities of the tokens in the response to understand the
4951
model's confidence in the transcription. `logprobs` only works with
5052
response_format set to `json` and only with the models `gpt-4o-transcribe` and
51-
`gpt-4o-mini-transcribe`.
53+
`gpt-4o-mini-transcribe`. This field is not supported when using
54+
`gpt-4o-transcribe-diarize`.
55+
"""
56+
57+
known_speaker_names: SequenceNotStr[str]
58+
"""
59+
Optional list of speaker names that correspond to the audio samples provided in
60+
`known_speaker_references[]`. Each entry should be a short identifier (for
61+
example `customer` or `agent`). Up to 4 speakers are supported.
62+
"""
63+
64+
known_speaker_references: SequenceNotStr[str]
65+
"""
66+
Optional list of audio samples (as
67+
[data URLs](https://developer.mozilla.org/en-US/docs/Web/HTTP/Basics_of_HTTP/Data_URLs))
68+
that contain known speaker references matching `known_speaker_names[]`. Each
69+
sample must be between 2 and 10 seconds, and can use any of the same input audio
70+
formats supported by `file`.
5271
"""
5372

5473
language: str
@@ -64,14 +83,17 @@ class TranscriptionCreateParamsBase(TypedDict, total=False):
6483
segment.
6584
6685
The [prompt](https://platform.openai.com/docs/guides/speech-to-text#prompting)
67-
should match the audio language.
86+
should match the audio language. This field is not supported when using
87+
`gpt-4o-transcribe-diarize`.
6888
"""
6989

7090
response_format: AudioResponseFormat
7191
"""
7292
The format of the output, in one of these options: `json`, `text`, `srt`,
73-
`verbose_json`, or `vtt`. For `gpt-4o-transcribe` and `gpt-4o-mini-transcribe`,
74-
the only supported format is `json`.
93+
`verbose_json`, `vtt`, or `diarized_json`. For `gpt-4o-transcribe` and
94+
`gpt-4o-mini-transcribe`, the only supported format is `json`. For
95+
`gpt-4o-transcribe-diarize`, the supported formats are `json`, `text`, and
96+
`diarized_json`, with `diarized_json` required to receive speaker annotations.
7597
"""
7698

7799
temperature: float
@@ -89,7 +111,8 @@ class TranscriptionCreateParamsBase(TypedDict, total=False):
89111
`response_format` must be set `verbose_json` to use timestamp granularities.
90112
Either or both of these options are supported: `word`, or `segment`. Note: There
91113
is no additional latency for segment timestamps, but generating word timestamps
92-
incurs additional latency.
114+
incurs additional latency. This option is not available for
115+
`gpt-4o-transcribe-diarize`.
93116
"""
94117

95118

src/openai/types/audio/transcription_create_response.py

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -5,7 +5,8 @@
55

66
from .transcription import Transcription
77
from .transcription_verbose import TranscriptionVerbose
8+
from .transcription_diarized import TranscriptionDiarized
89

910
__all__ = ["TranscriptionCreateResponse"]
1011

11-
TranscriptionCreateResponse: TypeAlias = Union[Transcription, TranscriptionVerbose]
12+
TranscriptionCreateResponse: TypeAlias = Union[Transcription, TranscriptionDiarized, TranscriptionVerbose]
Lines changed: 63 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,63 @@
1+
# File generated from our OpenAPI spec by Stainless. See CONTRIBUTING.md for details.
2+
3+
from typing import List, Union, Optional
4+
from typing_extensions import Literal, Annotated, TypeAlias
5+
6+
from ..._utils import PropertyInfo
7+
from ..._models import BaseModel
8+
from .transcription_diarized_segment import TranscriptionDiarizedSegment
9+
10+
__all__ = ["TranscriptionDiarized", "Usage", "UsageTokens", "UsageTokensInputTokenDetails", "UsageDuration"]
11+
12+
13+
class UsageTokensInputTokenDetails(BaseModel):
14+
audio_tokens: Optional[int] = None
15+
"""Number of audio tokens billed for this request."""
16+
17+
text_tokens: Optional[int] = None
18+
"""Number of text tokens billed for this request."""
19+
20+
21+
class UsageTokens(BaseModel):
22+
input_tokens: int
23+
"""Number of input tokens billed for this request."""
24+
25+
output_tokens: int
26+
"""Number of output tokens generated."""
27+
28+
total_tokens: int
29+
"""Total number of tokens used (input + output)."""
30+
31+
type: Literal["tokens"]
32+
"""The type of the usage object. Always `tokens` for this variant."""
33+
34+
input_token_details: Optional[UsageTokensInputTokenDetails] = None
35+
"""Details about the input tokens billed for this request."""
36+
37+
38+
class UsageDuration(BaseModel):
39+
seconds: float
40+
"""Duration of the input audio in seconds."""
41+
42+
type: Literal["duration"]
43+
"""The type of the usage object. Always `duration` for this variant."""
44+
45+
46+
Usage: TypeAlias = Annotated[Union[UsageTokens, UsageDuration], PropertyInfo(discriminator="type")]
47+
48+
49+
class TranscriptionDiarized(BaseModel):
50+
duration: float
51+
"""Duration of the input audio in seconds."""
52+
53+
segments: List[TranscriptionDiarizedSegment]
54+
"""Segments of the transcript annotated with timestamps and speaker labels."""
55+
56+
task: Literal["transcribe"]
57+
"""The type of task that was run. Always `transcribe`."""
58+
59+
text: str
60+
"""The concatenated transcript text for the entire audio input."""
61+
62+
usage: Optional[Usage] = None
63+
"""Token or duration usage statistics for the request."""
Lines changed: 32 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,32 @@
1+
# File generated from our OpenAPI spec by Stainless. See CONTRIBUTING.md for details.
2+
3+
from typing_extensions import Literal
4+
5+
from ..._models import BaseModel
6+
7+
__all__ = ["TranscriptionDiarizedSegment"]
8+
9+
10+
class TranscriptionDiarizedSegment(BaseModel):
11+
id: str
12+
"""Unique identifier for the segment."""
13+
14+
end: float
15+
"""End timestamp of the segment in seconds."""
16+
17+
speaker: str
18+
"""Speaker label for this segment.
19+
20+
When known speakers are provided, the label matches `known_speaker_names[]`.
21+
Otherwise speakers are labeled sequentially using capital letters (`A`, `B`,
22+
...).
23+
"""
24+
25+
start: float
26+
"""Start timestamp of the segment in seconds."""
27+
28+
text: str
29+
"""Transcript text for this segment."""
30+
31+
type: Literal["transcript.text.segment"]
32+
"""The type of the segment. Always `transcript.text.segment`."""

0 commit comments

Comments (0)