Skip to content

Commit ab6a10d

Browse files
chore(api): Minor docs and type updates for realtime
1 parent 847ff0b commit ab6a10d

15 files changed

+325
-135
lines changed

.stats.yml

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,4 @@
11
configured_endpoints: 118
2-
openapi_spec_url: https://storage.googleapis.com/stainless-sdk-openapi-specs/openai%2Fopenai-16cb18bed32bae8c5840fb39a1bf664026cc40463ad0c487dcb0df1bd3d72db0.yml
3-
openapi_spec_hash: 4cb51b22f98dee1a90bc7add82d1d132
2+
openapi_spec_url: https://storage.googleapis.com/stainless-sdk-openapi-specs/openai%2Fopenai-c829f9e7f51d4946dae7b02eb37eb857b538a464cf54c7ced5eff1b1c93e07db.yml
3+
openapi_spec_hash: 1b2eaba46b264bcec8831bc496543649
44
config_hash: 930dac3aa861344867e4ac84f037b5df

src/openai/resources/responses/responses.py

Lines changed: 24 additions & 24 deletions
Original file line numberDiff line numberDiff line change
@@ -288,10 +288,10 @@ def create(
288288
289289
truncation: The truncation strategy to use for the model response.
290290
291-
- `auto`: If the context of this response and previous ones exceeds the model's
292-
context window size, the model will truncate the response to fit the context
293-
window by dropping input items in the middle of the conversation.
294-
- `disabled` (default): If a model response will exceed the context window size
291+
- `auto`: If the input to this Response exceeds the model's context window size,
292+
the model will truncate the response to fit the context window by dropping
293+
items from the beginning of the conversation.
294+
- `disabled` (default): If the input size will exceed the context window size
295295
for a model, the request will fail with a 400 error.
296296
297297
user: This field is being replaced by `safety_identifier` and `prompt_cache_key`. Use
@@ -527,10 +527,10 @@ def create(
527527
528528
truncation: The truncation strategy to use for the model response.
529529
530-
- `auto`: If the context of this response and previous ones exceeds the model's
531-
context window size, the model will truncate the response to fit the context
532-
window by dropping input items in the middle of the conversation.
533-
- `disabled` (default): If a model response will exceed the context window size
530+
- `auto`: If the input to this Response exceeds the model's context window size,
531+
the model will truncate the response to fit the context window by dropping
532+
items from the beginning of the conversation.
533+
- `disabled` (default): If the input size will exceed the context window size
534534
for a model, the request will fail with a 400 error.
535535
536536
user: This field is being replaced by `safety_identifier` and `prompt_cache_key`. Use
@@ -766,10 +766,10 @@ def create(
766766
767767
truncation: The truncation strategy to use for the model response.
768768
769-
- `auto`: If the context of this response and previous ones exceeds the model's
770-
context window size, the model will truncate the response to fit the context
771-
window by dropping input items in the middle of the conversation.
772-
- `disabled` (default): If a model response will exceed the context window size
769+
- `auto`: If the input to this Response exceeds the model's context window size,
770+
the model will truncate the response to fit the context window by dropping
771+
items from the beginning of the conversation.
772+
- `disabled` (default): If the input size will exceed the context window size
773773
for a model, the request will fail with a 400 error.
774774
775775
user: This field is being replaced by `safety_identifier` and `prompt_cache_key`. Use
@@ -1719,10 +1719,10 @@ async def create(
17191719
17201720
truncation: The truncation strategy to use for the model response.
17211721
1722-
- `auto`: If the context of this response and previous ones exceeds the model's
1723-
context window size, the model will truncate the response to fit the context
1724-
window by dropping input items in the middle of the conversation.
1725-
- `disabled` (default): If a model response will exceed the context window size
1722+
- `auto`: If the input to this Response exceeds the model's context window size,
1723+
the model will truncate the response to fit the context window by dropping
1724+
items from the beginning of the conversation.
1725+
- `disabled` (default): If the input size will exceed the context window size
17261726
for a model, the request will fail with a 400 error.
17271727
17281728
user: This field is being replaced by `safety_identifier` and `prompt_cache_key`. Use
@@ -1958,10 +1958,10 @@ async def create(
19581958
19591959
truncation: The truncation strategy to use for the model response.
19601960
1961-
- `auto`: If the context of this response and previous ones exceeds the model's
1962-
context window size, the model will truncate the response to fit the context
1963-
window by dropping input items in the middle of the conversation.
1964-
- `disabled` (default): If a model response will exceed the context window size
1961+
- `auto`: If the input to this Response exceeds the model's context window size,
1962+
the model will truncate the response to fit the context window by dropping
1963+
items from the beginning of the conversation.
1964+
- `disabled` (default): If the input size will exceed the context window size
19651965
for a model, the request will fail with a 400 error.
19661966
19671967
user: This field is being replaced by `safety_identifier` and `prompt_cache_key`. Use
@@ -2197,10 +2197,10 @@ async def create(
21972197
21982198
truncation: The truncation strategy to use for the model response.
21992199
2200-
- `auto`: If the context of this response and previous ones exceeds the model's
2201-
context window size, the model will truncate the response to fit the context
2202-
window by dropping input items in the middle of the conversation.
2203-
- `disabled` (default): If a model response will exceed the context window size
2200+
- `auto`: If the input to this Response exceeds the model's context window size,
2201+
the model will truncate the response to fit the context window by dropping
2202+
items from the beginning of the conversation.
2203+
- `disabled` (default): If the input size will exceed the context window size
22042204
for a model, the request will fail with a 400 error.
22052205
22062206
user: This field is being replaced by `safety_identifier` and `prompt_cache_key`. Use

src/openai/types/realtime/input_audio_buffer_timeout_triggered.py

Lines changed: 8 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -9,10 +9,16 @@
99

1010
class InputAudioBufferTimeoutTriggered(BaseModel):
1111
audio_end_ms: int
12-
"""Millisecond offset where speech ended within the buffered audio."""
12+
"""
13+
Millisecond offset of audio written to the input audio buffer at the time the
14+
timeout was triggered.
15+
"""
1316

1417
audio_start_ms: int
15-
"""Millisecond offset where speech started within the buffered audio."""
18+
"""
19+
Millisecond offset of audio written to the input audio buffer that was after the
20+
playback time of the last model response.
21+
"""
1622

1723
event_id: str
1824
"""The unique ID of the server event."""

src/openai/types/realtime/realtime_audio_config_input.py

Lines changed: 5 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -49,8 +49,11 @@ class RealtimeAudioConfigInput(BaseModel):
4949
"""Configuration for turn detection, either Server VAD or Semantic VAD.
5050
5151
This can be set to `null` to turn off, in which case the client must manually
52-
trigger model response. Server VAD means that the model will detect the start
53-
and end of speech based on audio volume and respond at the end of user speech.
52+
trigger model response.
53+
54+
Server VAD means that the model will detect the start and end of speech based on
55+
audio volume and respond at the end of user speech.
56+
5457
Semantic VAD is more advanced and uses a turn detection model (in conjunction
5558
with VAD) to semantically estimate whether the user has finished speaking, then
5659
dynamically sets a timeout based on this probability. For example, if user audio

src/openai/types/realtime/realtime_audio_config_input_param.py

Lines changed: 7 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -2,6 +2,7 @@
22

33
from __future__ import annotations
44

5+
from typing import Optional
56
from typing_extensions import TypedDict
67

78
from .noise_reduction_type import NoiseReductionType
@@ -46,12 +47,15 @@ class RealtimeAudioConfigInputParam(TypedDict, total=False):
4647
transcription, these offer additional guidance to the transcription service.
4748
"""
4849

49-
turn_detection: RealtimeAudioInputTurnDetectionParam
50+
turn_detection: Optional[RealtimeAudioInputTurnDetectionParam]
5051
"""Configuration for turn detection, either Server VAD or Semantic VAD.
5152
5253
This can be set to `null` to turn off, in which case the client must manually
53-
trigger model response. Server VAD means that the model will detect the start
54-
and end of speech based on audio volume and respond at the end of user speech.
54+
trigger model response.
55+
56+
Server VAD means that the model will detect the start and end of speech based on
57+
audio volume and respond at the end of user speech.
58+
5559
Semantic VAD is more advanced and uses a turn detection model (in conjunction
5660
with VAD) to semantically estimate whether the user has finished speaking, then
5761
dynamically sets a timeout based on this probability. For example, if user audio

src/openai/types/realtime/realtime_audio_input_turn_detection.py

Lines changed: 51 additions & 17 deletions
Original file line numberDiff line numberDiff line change
@@ -1,33 +1,38 @@
11
# File generated from our OpenAPI spec by Stainless. See CONTRIBUTING.md for details.
22

3-
from typing import Optional
4-
from typing_extensions import Literal
3+
from typing import Union, Optional
4+
from typing_extensions import Literal, Annotated, TypeAlias
55

6+
from ..._utils import PropertyInfo
67
from ..._models import BaseModel
78

8-
__all__ = ["RealtimeAudioInputTurnDetection"]
9+
__all__ = ["RealtimeAudioInputTurnDetection", "ServerVad", "SemanticVad"]
910

1011

11-
class RealtimeAudioInputTurnDetection(BaseModel):
12+
class ServerVad(BaseModel):
13+
type: Literal["server_vad"]
14+
"""Type of turn detection, `server_vad` to turn on simple Server VAD."""
15+
1216
create_response: Optional[bool] = None
1317
"""
1418
Whether or not to automatically generate a response when a VAD stop event
1519
occurs.
1620
"""
1721

18-
eagerness: Optional[Literal["low", "medium", "high", "auto"]] = None
19-
"""Used only for `semantic_vad` mode.
22+
idle_timeout_ms: Optional[int] = None
23+
"""Optional timeout after which a model response will be triggered automatically.
2024
21-
The eagerness of the model to respond. `low` will wait longer for the user to
22-
continue speaking, `high` will respond more quickly. `auto` is the default and
23-
is equivalent to `medium`. `low`, `medium`, and `high` have max timeouts of 8s,
24-
4s, and 2s respectively.
25-
"""
25+
This is useful for situations in which a long pause from the user is unexpected,
26+
such as a phone call. The model will effectively prompt the user to continue the
27+
conversation based on the current context.
2628
27-
idle_timeout_ms: Optional[int] = None
28-
"""
29-
Optional idle timeout after which turn detection will auto-timeout when no
30-
additional audio is received and emits a `timeout_triggered` event.
29+
The timeout value will be applied after the last model response's audio has
30+
finished playing, i.e. it's set to the `response.done` time plus audio playback
31+
duration.
32+
33+
An `input_audio_buffer.timeout_triggered` event (plus events associated with the
34+
Response) will be emitted when the timeout is reached. Idle timeout is currently
35+
only supported for `server_vad` mode.
3136
"""
3237

3338
interrupt_response: Optional[bool] = None
@@ -60,5 +65,34 @@ class RealtimeAudioInputTurnDetection(BaseModel):
6065
perform better in noisy environments.
6166
"""
6267

63-
type: Optional[Literal["server_vad", "semantic_vad"]] = None
64-
"""Type of turn detection."""
68+
69+
class SemanticVad(BaseModel):
70+
type: Literal["semantic_vad"]
71+
"""Type of turn detection, `semantic_vad` to turn on Semantic VAD."""
72+
73+
create_response: Optional[bool] = None
74+
"""
75+
Whether or not to automatically generate a response when a VAD stop event
76+
occurs.
77+
"""
78+
79+
eagerness: Optional[Literal["low", "medium", "high", "auto"]] = None
80+
"""Used only for `semantic_vad` mode.
81+
82+
The eagerness of the model to respond. `low` will wait longer for the user to
83+
continue speaking, `high` will respond more quickly. `auto` is the default and
84+
is equivalent to `medium`. `low`, `medium`, and `high` have max timeouts of 8s,
85+
4s, and 2s respectively.
86+
"""
87+
88+
interrupt_response: Optional[bool] = None
89+
"""
90+
Whether or not to automatically interrupt any ongoing response with output to
91+
the default conversation (i.e. `conversation` of `auto`) when a VAD start event
92+
occurs.
93+
"""
94+
95+
96+
RealtimeAudioInputTurnDetection: TypeAlias = Annotated[
97+
Union[ServerVad, SemanticVad, None], PropertyInfo(discriminator="type")
98+
]

src/openai/types/realtime/realtime_audio_input_turn_detection_param.py

Lines changed: 48 additions & 17 deletions
Original file line numberDiff line numberDiff line change
@@ -2,32 +2,36 @@
22

33
from __future__ import annotations
44

5-
from typing import Optional
6-
from typing_extensions import Literal, TypedDict
5+
from typing import Union, Optional
6+
from typing_extensions import Literal, Required, TypeAlias, TypedDict
77

8-
__all__ = ["RealtimeAudioInputTurnDetectionParam"]
8+
__all__ = ["RealtimeAudioInputTurnDetectionParam", "ServerVad", "SemanticVad"]
99

1010

11-
class RealtimeAudioInputTurnDetectionParam(TypedDict, total=False):
11+
class ServerVad(TypedDict, total=False):
12+
type: Required[Literal["server_vad"]]
13+
"""Type of turn detection, `server_vad` to turn on simple Server VAD."""
14+
1215
create_response: bool
1316
"""
1417
Whether or not to automatically generate a response when a VAD stop event
1518
occurs.
1619
"""
1720

18-
eagerness: Literal["low", "medium", "high", "auto"]
19-
"""Used only for `semantic_vad` mode.
21+
idle_timeout_ms: Optional[int]
22+
"""Optional timeout after which a model response will be triggered automatically.
2023
21-
The eagerness of the model to respond. `low` will wait longer for the user to
22-
continue speaking, `high` will respond more quickly. `auto` is the default and
23-
is equivalent to `medium`. `low`, `medium`, and `high` have max timeouts of 8s,
24-
4s, and 2s respectively.
25-
"""
24+
This is useful for situations in which a long pause from the user is unexpected,
25+
such as a phone call. The model will effectively prompt the user to continue the
26+
conversation based on the current context.
2627
27-
idle_timeout_ms: Optional[int]
28-
"""
29-
Optional idle timeout after which turn detection will auto-timeout when no
30-
additional audio is received and emits a `timeout_triggered` event.
28+
The timeout value will be applied after the last model response's audio has
29+
finished playing, i.e. it's set to the `response.done` time plus audio playback
30+
duration.
31+
32+
An `input_audio_buffer.timeout_triggered` event (plus events associated with the
33+
Response) will be emitted when the timeout is reached. Idle timeout is currently
34+
only supported for `server_vad` mode.
3135
"""
3236

3337
interrupt_response: bool
@@ -60,5 +64,32 @@ class RealtimeAudioInputTurnDetectionParam(TypedDict, total=False):
6064
perform better in noisy environments.
6165
"""
6266

63-
type: Literal["server_vad", "semantic_vad"]
64-
"""Type of turn detection."""
67+
68+
class SemanticVad(TypedDict, total=False):
69+
type: Required[Literal["semantic_vad"]]
70+
"""Type of turn detection, `semantic_vad` to turn on Semantic VAD."""
71+
72+
create_response: bool
73+
"""
74+
Whether or not to automatically generate a response when a VAD stop event
75+
occurs.
76+
"""
77+
78+
eagerness: Literal["low", "medium", "high", "auto"]
79+
"""Used only for `semantic_vad` mode.
80+
81+
The eagerness of the model to respond. `low` will wait longer for the user to
82+
continue speaking, `high` will respond more quickly. `auto` is the default and
83+
is equivalent to `medium`. `low`, `medium`, and `high` have max timeouts of 8s,
84+
4s, and 2s respectively.
85+
"""
86+
87+
interrupt_response: bool
88+
"""
89+
Whether or not to automatically interrupt any ongoing response with output to
90+
the default conversation (i.e. `conversation` of `auto`) when a VAD start event
91+
occurs.
92+
"""
93+
94+
95+
RealtimeAudioInputTurnDetectionParam: TypeAlias = Union[ServerVad, SemanticVad]

0 commit comments

Comments
 (0)