openai
diff --git a/.stats.yml b/.stats.yml
Lines changed: 2 additions & 2 deletions
diff --git a/src/openai/resources/responses/responses.py b/src/openai/resources/responses/responses.py
Lines changed: 24 additions & 24 deletions
diff --git a/src/openai/types/realtime/input_audio_buffer_timeout_triggered.py b/src/openai/types/realtime/input_audio_buffer_timeout_triggered.py
Lines changed: 8 additions & 2 deletions
diff --git a/src/openai/types/realtime/realtime_audio_config_input.py b/src/openai/types/realtime/realtime_audio_config_input.py
Lines changed: 5 additions & 2 deletions
diff --git a/src/openai/types/realtime/realtime_audio_config_input_param.py b/src/openai/types/realtime/realtime_audio_config_input_param.py
Lines changed: 7 additions & 3 deletions
diff --git a/src/openai/types/realtime/realtime_audio_input_turn_detection.py b/src/openai/types/realtime/realtime_audio_input_turn_detection.py
Lines changed: 51 additions & 17 deletions
diff --git a/src/openai/types/realtime/realtime_audio_input_turn_detection_param.py b/src/openai/types/realtime/realtime_audio_input_turn_detection_param.py
Lines changed: 48 additions & 17 deletions
@@ -1,4 +1,4 @@
 configured_endpoints: 118
-openapi_spec_url: https://storage.googleapis.com/stainless-sdk-openapi-specs/openai%2Fopenai-16cb18bed32bae8c5840fb39a1bf664026cc40463ad0c487dcb0df1bd3d72db0.yml
-openapi_spec_hash: 4cb51b22f98dee1a90bc7add82d1d132
+openapi_spec_url: https://storage.googleapis.com/stainless-sdk-openapi-specs/openai%2Fopenai-c829f9e7f51d4946dae7b02eb37eb857b538a464cf54c7ced5eff1b1c93e07db.yml
+openapi_spec_hash: 1b2eaba46b264bcec8831bc496543649
 config_hash: 930dac3aa861344867e4ac84f037b5df
@@ -288,10 +288,10 @@ def create(
 
           truncation: The truncation strategy to use for the model response.
 
-              - `auto`: If the context of this response and previous ones exceeds the model's
-                context window size, the model will truncate the response to fit the context
-                window by dropping input items in the middle of the conversation.
-              - `disabled` (default): If a model response will exceed the context window size
+              - `auto`: If the input to this Response exceeds the model's context window size,
+                the model will truncate the response to fit the context window by dropping
+                items from the beginning of the conversation.
+              - `disabled` (default): If the input size will exceed the context window size
                 for a model, the request will fail with a 400 error.
 
           user: This field is being replaced by `safety_identifier` and `prompt_cache_key`. Use
@@ -527,10 +527,10 @@ def create(
 
           truncation: The truncation strategy to use for the model response.
 
-              - `auto`: If the context of this response and previous ones exceeds the model's
-                context window size, the model will truncate the response to fit the context
-                window by dropping input items in the middle of the conversation.
-              - `disabled` (default): If a model response will exceed the context window size
+              - `auto`: If the input to this Response exceeds the model's context window size,
+                the model will truncate the response to fit the context window by dropping
+                items from the beginning of the conversation.
+              - `disabled` (default): If the input size will exceed the context window size
                 for a model, the request will fail with a 400 error.
 
           user: This field is being replaced by `safety_identifier` and `prompt_cache_key`. Use
@@ -766,10 +766,10 @@ def create(
 
           truncation: The truncation strategy to use for the model response.
 
-              - `auto`: If the context of this response and previous ones exceeds the model's
-                context window size, the model will truncate the response to fit the context
-                window by dropping input items in the middle of the conversation.
-              - `disabled` (default): If a model response will exceed the context window size
+              - `auto`: If the input to this Response exceeds the model's context window size,
+                the model will truncate the response to fit the context window by dropping
+                items from the beginning of the conversation.
+              - `disabled` (default): If the input size will exceed the context window size
                 for a model, the request will fail with a 400 error.
 
           user: This field is being replaced by `safety_identifier` and `prompt_cache_key`. Use
@@ -1719,10 +1719,10 @@ async def create(
 
           truncation: The truncation strategy to use for the model response.
 
-              - `auto`: If the context of this response and previous ones exceeds the model's
-                context window size, the model will truncate the response to fit the context
-                window by dropping input items in the middle of the conversation.
-              - `disabled` (default): If a model response will exceed the context window size
+              - `auto`: If the input to this Response exceeds the model's context window size,
+                the model will truncate the response to fit the context window by dropping
+                items from the beginning of the conversation.
+              - `disabled` (default): If the input size will exceed the context window size
                 for a model, the request will fail with a 400 error.
 
           user: This field is being replaced by `safety_identifier` and `prompt_cache_key`. Use
@@ -1958,10 +1958,10 @@ async def create(
 
           truncation: The truncation strategy to use for the model response.
 
-              - `auto`: If the context of this response and previous ones exceeds the model's
-                context window size, the model will truncate the response to fit the context
-                window by dropping input items in the middle of the conversation.
-              - `disabled` (default): If a model response will exceed the context window size
+              - `auto`: If the input to this Response exceeds the model's context window size,
+                the model will truncate the response to fit the context window by dropping
+                items from the beginning of the conversation.
+              - `disabled` (default): If the input size will exceed the context window size
                 for a model, the request will fail with a 400 error.
 
           user: This field is being replaced by `safety_identifier` and `prompt_cache_key`. Use
@@ -2197,10 +2197,10 @@ async def create(
 
           truncation: The truncation strategy to use for the model response.
 
-              - `auto`: If the context of this response and previous ones exceeds the model's
-                context window size, the model will truncate the response to fit the context
-                window by dropping input items in the middle of the conversation.
-              - `disabled` (default): If a model response will exceed the context window size
+              - `auto`: If the input to this Response exceeds the model's context window size,
+                the model will truncate the response to fit the context window by dropping
+                items from the beginning of the conversation.
+              - `disabled` (default): If the input size will exceed the context window size
                 for a model, the request will fail with a 400 error.
 
           user: This field is being replaced by `safety_identifier` and `prompt_cache_key`. Use
 
@@ -9,10 +9,16 @@
 
 class InputAudioBufferTimeoutTriggered(BaseModel):
     audio_end_ms: int
-    """Millisecond offset where speech ended within the buffered audio."""
+    """
+    Millisecond offset of audio written to the input audio buffer at the time the
+    timeout was triggered.
+    """
 
     audio_start_ms: int
-    """Millisecond offset where speech started within the buffered audio."""
+    """
+    Millisecond offset of audio written to the input audio buffer that was after the
+    playback time of the last model response.
+    """
 
     event_id: str
     """The unique ID of the server event."""
 
@@ -49,8 +49,11 @@ class RealtimeAudioConfigInput(BaseModel):
     """Configuration for turn detection, either Server VAD or Semantic VAD.
 
     This can be set to `null` to turn off, in which case the client must manually
-    trigger model response. Server VAD means that the model will detect the start
-    and end of speech based on audio volume and respond at the end of user speech.
+    trigger model response.
+
+    Server VAD means that the model will detect the start and end of speech based on
+    audio volume and respond at the end of user speech.
+
     Semantic VAD is more advanced and uses a turn detection model (in conjunction
     with VAD) to semantically estimate whether the user has finished speaking, then
     dynamically sets a timeout based on this probability. For example, if user audio
 
@@ -2,6 +2,7 @@
 
 from __future__ import annotations
 
+from typing import Optional
 from typing_extensions import TypedDict
 
 from .noise_reduction_type import NoiseReductionType
@@ -46,12 +47,15 @@ class RealtimeAudioConfigInputParam(TypedDict, total=False):
     transcription, these offer additional guidance to the transcription service.
     """
 
-    turn_detection: RealtimeAudioInputTurnDetectionParam
+    turn_detection: Optional[RealtimeAudioInputTurnDetectionParam]
     """Configuration for turn detection, either Server VAD or Semantic VAD.
 
     This can be set to `null` to turn off, in which case the client must manually
-    trigger model response. Server VAD means that the model will detect the start
-    and end of speech based on audio volume and respond at the end of user speech.
+    trigger model response.
+
+    Server VAD means that the model will detect the start and end of speech based on
+    audio volume and respond at the end of user speech.
+
     Semantic VAD is more advanced and uses a turn detection model (in conjunction
     with VAD) to semantically estimate whether the user has finished speaking, then
     dynamically sets a timeout based on this probability. For example, if user audio
 
@@ -1,33 +1,38 @@
 # File generated from our OpenAPI spec by Stainless. See CONTRIBUTING.md for details.
 
-from typing import Optional
-from typing_extensions import Literal
+from typing import Union, Optional
+from typing_extensions import Literal, Annotated, TypeAlias
 
+from ..._utils import PropertyInfo
 from ..._models import BaseModel
 
-__all__ = ["RealtimeAudioInputTurnDetection"]
+__all__ = ["RealtimeAudioInputTurnDetection", "ServerVad", "SemanticVad"]
 
 
-class RealtimeAudioInputTurnDetection(BaseModel):
+class ServerVad(BaseModel):
+    type: Literal["server_vad"]
+    """Type of turn detection, `server_vad` to turn on simple Server VAD."""
+
     create_response: Optional[bool] = None
     """
     Whether or not to automatically generate a response when a VAD stop event
     occurs.
     """
 
-    eagerness: Optional[Literal["low", "medium", "high", "auto"]] = None
-    """Used only for `semantic_vad` mode.
+    idle_timeout_ms: Optional[int] = None
+    """Optional timeout after which a model response will be triggered automatically.
 
-    The eagerness of the model to respond. `low` will wait longer for the user to
-    continue speaking, `high` will respond more quickly. `auto` is the default and
-    is equivalent to `medium`. `low`, `medium`, and `high` have max timeouts of 8s,
-    4s, and 2s respectively.
-    """
+    This is useful for situations in which a long pause from the user is unexpected,
+    such as a phone call. The model will effectively prompt the user to continue the
+    conversation based on the current context.
 
-    idle_timeout_ms: Optional[int] = None
-    """
-    Optional idle timeout after which turn detection will auto-timeout when no
-    additional audio is received and emits a `timeout_triggered` event.
+    The timeout value will be applied after the last model response's audio has
+    finished playing, i.e. it's set to the `response.done` time plus audio playback
+    duration.
+
+    An `input_audio_buffer.timeout_triggered` event (plus events associated with the
+    Response) will be emitted when the timeout is reached. Idle timeout is currently
+    only supported for `server_vad` mode.
     """
 
     interrupt_response: Optional[bool] = None
@@ -60,5 +65,34 @@ class RealtimeAudioInputTurnDetection(BaseModel):
     perform better in noisy environments.
     """
 
-    type: Optional[Literal["server_vad", "semantic_vad"]] = None
-    """Type of turn detection."""
+
+class SemanticVad(BaseModel):
+    type: Literal["semantic_vad"]
+    """Type of turn detection, `semantic_vad` to turn on Semantic VAD."""
+
+    create_response: Optional[bool] = None
+    """
+    Whether or not to automatically generate a response when a VAD stop event
+    occurs.
+    """
+
+    eagerness: Optional[Literal["low", "medium", "high", "auto"]] = None
+    """Used only for `semantic_vad` mode.
+
+    The eagerness of the model to respond. `low` will wait longer for the user to
+    continue speaking, `high` will respond more quickly. `auto` is the default and
+    is equivalent to `medium`. `low`, `medium`, and `high` have max timeouts of 8s,
+    4s, and 2s respectively.
+    """
+
+    interrupt_response: Optional[bool] = None
+    """
+    Whether or not to automatically interrupt any ongoing response with output to
+    the default conversation (i.e. `conversation` of `auto`) when a VAD start event
+    occurs.
+    """
+
+
+RealtimeAudioInputTurnDetection: TypeAlias = Annotated[
+    Union[ServerVad, SemanticVad, None], PropertyInfo(discriminator="type")
+]
@@ -2,32 +2,36 @@
 
 from __future__ import annotations
 
-from typing import Optional
-from typing_extensions import Literal, TypedDict
+from typing import Union, Optional
+from typing_extensions import Literal, Required, TypeAlias, TypedDict
 
-__all__ = ["RealtimeAudioInputTurnDetectionParam"]
+__all__ = ["RealtimeAudioInputTurnDetectionParam", "ServerVad", "SemanticVad"]
 
 
-class RealtimeAudioInputTurnDetectionParam(TypedDict, total=False):
+class ServerVad(TypedDict, total=False):
+    type: Required[Literal["server_vad"]]
+    """Type of turn detection, `server_vad` to turn on simple Server VAD."""
+
     create_response: bool
     """
     Whether or not to automatically generate a response when a VAD stop event
     occurs.
     """
 
-    eagerness: Literal["low", "medium", "high", "auto"]
-    """Used only for `semantic_vad` mode.
+    idle_timeout_ms: Optional[int]
+    """Optional timeout after which a model response will be triggered automatically.
 
-    The eagerness of the model to respond. `low` will wait longer for the user to
-    continue speaking, `high` will respond more quickly. `auto` is the default and
-    is equivalent to `medium`. `low`, `medium`, and `high` have max timeouts of 8s,
-    4s, and 2s respectively.
-    """
+    This is useful for situations in which a long pause from the user is unexpected,
+    such as a phone call. The model will effectively prompt the user to continue the
+    conversation based on the current context.
 
-    idle_timeout_ms: Optional[int]
-    """
-    Optional idle timeout after which turn detection will auto-timeout when no
-    additional audio is received and emits a `timeout_triggered` event.
+    The timeout value will be applied after the last model response's audio has
+    finished playing, i.e. it's set to the `response.done` time plus audio playback
+    duration.
+
+    An `input_audio_buffer.timeout_triggered` event (plus events associated with the
+    Response) will be emitted when the timeout is reached. Idle timeout is currently
+    only supported for `server_vad` mode.
     """
 
     interrupt_response: bool
@@ -60,5 +64,32 @@ class RealtimeAudioInputTurnDetectionParam(TypedDict, total=False):
     perform better in noisy environments.
     """
 
-    type: Literal["server_vad", "semantic_vad"]
-    """Type of turn detection."""
+
+class SemanticVad(TypedDict, total=False):
+    type: Required[Literal["semantic_vad"]]
+    """Type of turn detection, `semantic_vad` to turn on Semantic VAD."""
+
+    create_response: bool
+    """
+    Whether or not to automatically generate a response when a VAD stop event
+    occurs.
+    """
+
+    eagerness: Literal["low", "medium", "high", "auto"]
+    """Used only for `semantic_vad` mode.
+
+    The eagerness of the model to respond. `low` will wait longer for the user to
+    continue speaking, `high` will respond more quickly. `auto` is the default and
+    is equivalent to `medium`. `low`, `medium`, and `high` have max timeouts of 8s,
+    4s, and 2s respectively.
+    """
+
+    interrupt_response: bool
+    """
+    Whether or not to automatically interrupt any ongoing response with output to
+    the default conversation (i.e. `conversation` of `auto`) when a VAD start event
+    occurs.
+    """
+
+
+RealtimeAudioInputTurnDetectionParam: TypeAlias = Union[ServerVad, SemanticVad]