Skip to content

Commit 25cbb74

Browse files
feat(api): Add support for gpt-4o-transcribe-diarize on audio/transcriptions endpoint
1 parent 8cdfd06 commit 25cbb74

21 files changed: +475 additions, -81 deletions

.stats.yml

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,4 @@
11
configured_endpoints: 136
2-
openapi_spec_url: https://storage.googleapis.com/stainless-sdk-openapi-specs/openai%2Fopenai-11d308a9ef78ad01aa11c880a084a3982276800d7994db3f454aa515474977d7.yml
3-
openapi_spec_hash: 0a4bbb5aa0ae532a072bd6b3854e70b1
4-
config_hash: f0940d0906846178759ef7128e4cb98e
2+
openapi_spec_url: https://storage.googleapis.com/stainless-sdk-openapi-specs/openai%2Fopenai-104cced8f4c7436a76eea02e26307828166405ccfb296faffb008b72772c11a7.yml
3+
openapi_spec_hash: fdc03ed84a65a31b80da909255e53924
4+
config_hash: 03b48e9b8c7231a902403210dbd7dfa0

api.md

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -171,11 +171,14 @@ Types:
171171
```python
172172
from openai.types.audio import (
173173
Transcription,
174+
TranscriptionDiarized,
175+
TranscriptionDiarizedSegment,
174176
TranscriptionInclude,
175177
TranscriptionSegment,
176178
TranscriptionStreamEvent,
177179
TranscriptionTextDeltaEvent,
178180
TranscriptionTextDoneEvent,
181+
TranscriptionTextSegmentEvent,
179182
TranscriptionVerbose,
180183
TranscriptionWord,
181184
TranscriptionCreateResponse,

src/openai/resources/audio/transcriptions.py

Lines changed: 237 additions & 41 deletions
Large diffs are not rendered by default.

src/openai/resources/audio/translations.py

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -349,7 +349,7 @@ def __init__(self, translations: AsyncTranslations) -> None:
349349

350350

351351
def _get_response_format_type(
352-
response_format: Literal["json", "text", "srt", "verbose_json", "vtt"] | Omit,
352+
response_format: AudioResponseFormat | Omit,
353353
) -> type[Translation | TranslationVerbose | str]:
354354
if isinstance(response_format, Omit) or response_format is None: # pyright: ignore[reportUnnecessaryComparison]
355355
return Translation
@@ -360,8 +360,8 @@ def _get_response_format_type(
360360
return TranslationVerbose
361361
elif response_format == "srt" or response_format == "text" or response_format == "vtt":
362362
return str
363-
elif TYPE_CHECKING: # type: ignore[unreachable]
363+
elif TYPE_CHECKING and response_format != "diarized_json": # type: ignore[unreachable]
364364
assert_never(response_format)
365365
else:
366-
log.warn("Unexpected audio response format: %s", response_format)
367-
return Transcription
366+
log.warning("Unexpected audio response format: %s", response_format)
367+
return Translation

src/openai/resources/vector_stores/vector_stores.py

Lines changed: 10 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -79,6 +79,7 @@ def create(
7979
self,
8080
*,
8181
chunking_strategy: FileChunkingStrategyParam | Omit = omit,
82+
description: str | Omit = omit,
8283
expires_after: vector_store_create_params.ExpiresAfter | Omit = omit,
8384
file_ids: SequenceNotStr[str] | Omit = omit,
8485
metadata: Optional[Metadata] | Omit = omit,
@@ -97,6 +98,9 @@ def create(
9798
chunking_strategy: The chunking strategy used to chunk the file(s). If not set, will use the `auto`
9899
strategy. Only applicable if `file_ids` is non-empty.
99100
101+
description: A description for the vector store. Can be used to describe the vector store's
102+
purpose.
103+
100104
expires_after: The expiration policy for a vector store.
101105
102106
file_ids: A list of [File](https://platform.openai.com/docs/api-reference/files) IDs that
@@ -126,6 +130,7 @@ def create(
126130
body=maybe_transform(
127131
{
128132
"chunking_strategy": chunking_strategy,
133+
"description": description,
129134
"expires_after": expires_after,
130135
"file_ids": file_ids,
131136
"metadata": metadata,
@@ -424,6 +429,7 @@ async def create(
424429
self,
425430
*,
426431
chunking_strategy: FileChunkingStrategyParam | Omit = omit,
432+
description: str | Omit = omit,
427433
expires_after: vector_store_create_params.ExpiresAfter | Omit = omit,
428434
file_ids: SequenceNotStr[str] | Omit = omit,
429435
metadata: Optional[Metadata] | Omit = omit,
@@ -442,6 +448,9 @@ async def create(
442448
chunking_strategy: The chunking strategy used to chunk the file(s). If not set, will use the `auto`
443449
strategy. Only applicable if `file_ids` is non-empty.
444450
451+
description: A description for the vector store. Can be used to describe the vector store's
452+
purpose.
453+
445454
expires_after: The expiration policy for a vector store.
446455
447456
file_ids: A list of [File](https://platform.openai.com/docs/api-reference/files) IDs that
@@ -471,6 +480,7 @@ async def create(
471480
body=await async_maybe_transform(
472481
{
473482
"chunking_strategy": chunking_strategy,
483+
"description": description,
474484
"expires_after": expires_after,
475485
"file_ids": file_ids,
476486
"metadata": metadata,

src/openai/types/audio/__init__.py

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -11,10 +11,13 @@
1111
from .transcription_include import TranscriptionInclude as TranscriptionInclude
1212
from .transcription_segment import TranscriptionSegment as TranscriptionSegment
1313
from .transcription_verbose import TranscriptionVerbose as TranscriptionVerbose
14+
from .transcription_diarized import TranscriptionDiarized as TranscriptionDiarized
1415
from .translation_create_params import TranslationCreateParams as TranslationCreateParams
1516
from .transcription_stream_event import TranscriptionStreamEvent as TranscriptionStreamEvent
1617
from .transcription_create_params import TranscriptionCreateParams as TranscriptionCreateParams
1718
from .translation_create_response import TranslationCreateResponse as TranslationCreateResponse
1819
from .transcription_create_response import TranscriptionCreateResponse as TranscriptionCreateResponse
1920
from .transcription_text_done_event import TranscriptionTextDoneEvent as TranscriptionTextDoneEvent
21+
from .transcription_diarized_segment import TranscriptionDiarizedSegment as TranscriptionDiarizedSegment
2022
from .transcription_text_delta_event import TranscriptionTextDeltaEvent as TranscriptionTextDeltaEvent
23+
from .transcription_text_segment_event import TranscriptionTextSegmentEvent as TranscriptionTextSegmentEvent

src/openai/types/audio/transcription_create_params.py

Lines changed: 32 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -5,7 +5,7 @@
55
from typing import List, Union, Optional
66
from typing_extensions import Literal, Required, TypeAlias, TypedDict
77

8-
from ..._types import FileTypes
8+
from ..._types import FileTypes, SequenceNotStr
99
from ..audio_model import AudioModel
1010
from .transcription_include import TranscriptionInclude
1111
from ..audio_response_format import AudioResponseFormat
@@ -29,8 +29,9 @@ class TranscriptionCreateParamsBase(TypedDict, total=False):
2929
model: Required[Union[str, AudioModel]]
3030
"""ID of the model to use.
3131
32-
The options are `gpt-4o-transcribe`, `gpt-4o-mini-transcribe`, and `whisper-1`
33-
(which is powered by our open source Whisper V2 model).
32+
The options are `gpt-4o-transcribe`, `gpt-4o-mini-transcribe`, `whisper-1`
33+
(which is powered by our open source Whisper V2 model), and
34+
`gpt-4o-transcribe-diarize`.
3435
"""
3536

3637
chunking_strategy: Optional[ChunkingStrategy]
@@ -39,7 +40,8 @@ class TranscriptionCreateParamsBase(TypedDict, total=False):
3940
When set to `"auto"`, the server first normalizes loudness and then uses voice
4041
activity detection (VAD) to choose boundaries. `server_vad` object can be
4142
provided to tweak VAD detection parameters manually. If unset, the audio is
42-
transcribed as a single block.
43+
transcribed as a single block. Required when using `gpt-4o-transcribe-diarize`
44+
for inputs longer than 30 seconds.
4345
"""
4446

4547
include: List[TranscriptionInclude]
@@ -48,7 +50,24 @@ class TranscriptionCreateParamsBase(TypedDict, total=False):
4850
return the log probabilities of the tokens in the response to understand the
4951
model's confidence in the transcription. `logprobs` only works with
5052
response_format set to `json` and only with the models `gpt-4o-transcribe` and
51-
`gpt-4o-mini-transcribe`.
53+
`gpt-4o-mini-transcribe`. This field is not supported when using
54+
`gpt-4o-transcribe-diarize`.
55+
"""
56+
57+
known_speaker_names: SequenceNotStr[str]
58+
"""
59+
Optional list of speaker names that correspond to the audio samples provided in
60+
`known_speaker_references[]`. Each entry should be a short identifier (for
61+
example `customer` or `agent`). Up to 4 speakers are supported.
62+
"""
63+
64+
known_speaker_references: SequenceNotStr[str]
65+
"""
66+
Optional list of audio samples (as
67+
[data URLs](https://developer.mozilla.org/en-US/docs/Web/HTTP/Basics_of_HTTP/Data_URLs))
68+
that contain known speaker references matching `known_speaker_names[]`. Each
69+
sample must be between 2 and 10 seconds, and can use any of the same input audio
70+
formats supported by `file`.
5271
"""
5372

5473
language: str
@@ -64,14 +83,17 @@ class TranscriptionCreateParamsBase(TypedDict, total=False):
6483
segment.
6584
6685
The [prompt](https://platform.openai.com/docs/guides/speech-to-text#prompting)
67-
should match the audio language.
86+
should match the audio language. This field is not supported when using
87+
`gpt-4o-transcribe-diarize`.
6888
"""
6989

7090
response_format: AudioResponseFormat
7191
"""
7292
The format of the output, in one of these options: `json`, `text`, `srt`,
73-
`verbose_json`, or `vtt`. For `gpt-4o-transcribe` and `gpt-4o-mini-transcribe`,
74-
the only supported format is `json`.
93+
`verbose_json`, `vtt`, or `diarized_json`. For `gpt-4o-transcribe` and
94+
`gpt-4o-mini-transcribe`, the only supported format is `json`. For
95+
`gpt-4o-transcribe-diarize`, the supported formats are `json`, `text`, and
96+
`diarized_json`, with `diarized_json` required to receive speaker annotations.
7597
"""
7698

7799
temperature: float
@@ -89,7 +111,8 @@ class TranscriptionCreateParamsBase(TypedDict, total=False):
89111
`response_format` must be set `verbose_json` to use timestamp granularities.
90112
Either or both of these options are supported: `word`, or `segment`. Note: There
91113
is no additional latency for segment timestamps, but generating word timestamps
92-
incurs additional latency.
114+
incurs additional latency. This option is not available for
115+
`gpt-4o-transcribe-diarize`.
93116
"""
94117

95118

src/openai/types/audio/transcription_create_response.py

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -5,7 +5,8 @@
55

66
from .transcription import Transcription
77
from .transcription_verbose import TranscriptionVerbose
8+
from .transcription_diarized import TranscriptionDiarized
89

910
__all__ = ["TranscriptionCreateResponse"]
1011

11-
TranscriptionCreateResponse: TypeAlias = Union[Transcription, TranscriptionVerbose]
12+
TranscriptionCreateResponse: TypeAlias = Union[Transcription, TranscriptionDiarized, TranscriptionVerbose]
Lines changed: 63 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,63 @@
1+
# File generated from our OpenAPI spec by Stainless. See CONTRIBUTING.md for details.
2+
3+
from typing import List, Union, Optional
4+
from typing_extensions import Literal, Annotated, TypeAlias
5+
6+
from ..._utils import PropertyInfo
7+
from ..._models import BaseModel
8+
from .transcription_diarized_segment import TranscriptionDiarizedSegment
9+
10+
__all__ = ["TranscriptionDiarized", "Usage", "UsageTokens", "UsageTokensInputTokenDetails", "UsageDuration"]
11+
12+
13+
class UsageTokensInputTokenDetails(BaseModel):
14+
audio_tokens: Optional[int] = None
15+
"""Number of audio tokens billed for this request."""
16+
17+
text_tokens: Optional[int] = None
18+
"""Number of text tokens billed for this request."""
19+
20+
21+
class UsageTokens(BaseModel):
22+
input_tokens: int
23+
"""Number of input tokens billed for this request."""
24+
25+
output_tokens: int
26+
"""Number of output tokens generated."""
27+
28+
total_tokens: int
29+
"""Total number of tokens used (input + output)."""
30+
31+
type: Literal["tokens"]
32+
"""The type of the usage object. Always `tokens` for this variant."""
33+
34+
input_token_details: Optional[UsageTokensInputTokenDetails] = None
35+
"""Details about the input tokens billed for this request."""
36+
37+
38+
class UsageDuration(BaseModel):
39+
seconds: float
40+
"""Duration of the input audio in seconds."""
41+
42+
type: Literal["duration"]
43+
"""The type of the usage object. Always `duration` for this variant."""
44+
45+
46+
Usage: TypeAlias = Annotated[Union[UsageTokens, UsageDuration], PropertyInfo(discriminator="type")]
47+
48+
49+
class TranscriptionDiarized(BaseModel):
50+
duration: float
51+
"""Duration of the input audio in seconds."""
52+
53+
segments: List[TranscriptionDiarizedSegment]
54+
"""Segments of the transcript annotated with timestamps and speaker labels."""
55+
56+
task: Literal["transcribe"]
57+
"""The type of task that was run. Always `transcribe`."""
58+
59+
text: str
60+
"""The concatenated transcript text for the entire audio input."""
61+
62+
usage: Optional[Usage] = None
63+
"""Token or duration usage statistics for the request."""
Lines changed: 32 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,32 @@
1+
# File generated from our OpenAPI spec by Stainless. See CONTRIBUTING.md for details.
2+
3+
from typing_extensions import Literal
4+
5+
from ..._models import BaseModel
6+
7+
__all__ = ["TranscriptionDiarizedSegment"]
8+
9+
10+
class TranscriptionDiarizedSegment(BaseModel):
11+
id: str
12+
"""Unique identifier for the segment."""
13+
14+
end: float
15+
"""End timestamp of the segment in seconds."""
16+
17+
speaker: str
18+
"""Speaker label for this segment.
19+
20+
When known speakers are provided, the label matches `known_speaker_names[]`.
21+
Otherwise speakers are labeled sequentially using capital letters (`A`, `B`,
22+
...).
23+
"""
24+
25+
start: float
26+
"""Start timestamp of the segment in seconds."""
27+
28+
text: str
29+
"""Transcript text for this segment."""
30+
31+
type: Literal["transcript.text.segment"]
32+
"""The type of the segment. Always `transcript.text.segment`."""

0 commit comments

Comments (0)