55from typing import List , Union , Optional
66from typing_extensions import Literal , Required , TypeAlias , TypedDict
77
8- from ..._types import FileTypes
8+ from ..._types import FileTypes , SequenceNotStr
99from ..audio_model import AudioModel
1010from .transcription_include import TranscriptionInclude
1111from ..audio_response_format import AudioResponseFormat
@@ -29,8 +29,9 @@ class TranscriptionCreateParamsBase(TypedDict, total=False):
2929 model : Required [Union [str , AudioModel ]]
3030 """ID of the model to use.
3131
32- The options are `gpt-4o-transcribe`, `gpt-4o-mini-transcribe`, and `whisper-1`
33- (which is powered by our open source Whisper V2 model).
32+ The options are `gpt-4o-transcribe`, `gpt-4o-mini-transcribe`, `whisper-1`
33+ (which is powered by our open source Whisper V2 model), and
34+ `gpt-4o-transcribe-diarize`.
3435 """
3536
3637 chunking_strategy : Optional [ChunkingStrategy ]
@@ -39,7 +40,8 @@ class TranscriptionCreateParamsBase(TypedDict, total=False):
3940 When set to `"auto"`, the server first normalizes loudness and then uses voice
4041 activity detection (VAD) to choose boundaries. `server_vad` object can be
4142 provided to tweak VAD detection parameters manually. If unset, the audio is
42- transcribed as a single block.
43+ transcribed as a single block. Required when using `gpt-4o-transcribe-diarize`
44+ for inputs longer than 30 seconds.
4345 """
4446
4547 include : List [TranscriptionInclude ]
@@ -48,7 +50,24 @@ class TranscriptionCreateParamsBase(TypedDict, total=False):
4850 return the log probabilities of the tokens in the response to understand the
4951 model's confidence in the transcription. `logprobs` only works with
5052 response_format set to `json` and only with the models `gpt-4o-transcribe` and
51- `gpt-4o-mini-transcribe`.
53+ `gpt-4o-mini-transcribe`. This field is not supported when using
54+ `gpt-4o-transcribe-diarize`.
55+ """
56+
57+ known_speaker_names : SequenceNotStr [str ]
58+ """
59+ Optional list of speaker names that correspond to the audio samples provided in
60+ `known_speaker_references[]`. Each entry should be a short identifier (for
61+ example `customer` or `agent`). Up to 4 speakers are supported.
62+ """
63+
64+ known_speaker_references : SequenceNotStr [str ]
65+ """
66+ Optional list of audio samples (as
67+ [data URLs](https://developer.mozilla.org/en-US/docs/Web/HTTP/Basics_of_HTTP/Data_URLs))
68+ that contain known speaker references matching `known_speaker_names[]`. Each
69+ sample must be between 2 and 10 seconds, and can use any of the same input audio
70+ formats supported by `file`.
5271 """
5372
5473 language : str
@@ -64,14 +83,17 @@ class TranscriptionCreateParamsBase(TypedDict, total=False):
6483 segment.
6584
6685 The [prompt](https://platform.openai.com/docs/guides/speech-to-text#prompting)
67- should match the audio language.
86+ should match the audio language. This field is not supported when using
87+ `gpt-4o-transcribe-diarize`.
6888 """
6989
7090 response_format : AudioResponseFormat
7191 """
7292 The format of the output, in one of these options: `json`, `text`, `srt`,
73- `verbose_json`, or `vtt`. For `gpt-4o-transcribe` and `gpt-4o-mini-transcribe`,
74- the only supported format is `json`.
93+ `verbose_json`, `vtt`, or `diarized_json`. For `gpt-4o-transcribe` and
94+ `gpt-4o-mini-transcribe`, the only supported format is `json`. For
95+ `gpt-4o-transcribe-diarize`, the supported formats are `json`, `text`, and
96+ `diarized_json`, with `diarized_json` required to receive speaker annotations.
7597 """
7698
7799 temperature : float
@@ -89,7 +111,8 @@ class TranscriptionCreateParamsBase(TypedDict, total=False):
89111 `response_format` must be set to `verbose_json` to use timestamp granularities.
90112 Either or both of these options are supported: `word` or `segment`. Note: There
91113 is no additional latency for segment timestamps, but generating word timestamps
92- incurs additional latency.
114+ incurs additional latency. This option is not available for
115+ `gpt-4o-transcribe-diarize`.
93116 """
94117
95118
0 commit comments