Commit 6a60d70

feat(api): Realtime API token_limits, Hybrid searching ranking options
1 parent 379e97c commit 6a60d70

File tree

10 files changed: +173 -34 lines changed

.stats.yml

Lines changed: 3 additions & 3 deletions
@@ -1,4 +1,4 @@
 configured_endpoints: 135
-openapi_spec_url: https://storage.googleapis.com/stainless-sdk-openapi-specs/openai%2Fopenai-f68f718cd45ac3f9336603601bccc38a718af44d0b26601031de3d0a71b7ce2f.yml
-openapi_spec_hash: 1560717860bba4105936647dde8f618d
-config_hash: 50ee3382a63c021a9f821a935950e926
+openapi_spec_url: https://storage.googleapis.com/stainless-sdk-openapi-specs/openai%2Fopenai-3c5d1593d7c6f2b38a7d78d7906041465ee9d6e9022f0651e1da194654488108.yml
+openapi_spec_hash: 0a4d8ad2469823ce24a3fd94f23f1c2b
+config_hash: 032995825500a503a76da119f5354905

src/resources/images.ts

Lines changed: 4 additions & 1 deletion
@@ -545,7 +545,10 @@ export interface ImageEditParamsBase {
   background?: 'transparent' | 'opaque' | 'auto' | null;
 
   /**
-   * Control how much effort the model will exert to match the style and features, especially facial features, of input images. This parameter is only supported for `gpt-image-1`. Unsupported for `gpt-image-1-mini`. Supports `high` and `low`. Defaults to `low`.
+   * Control how much effort the model will exert to match the style and features,
+   * especially facial features, of input images. This parameter is only supported
+   * for `gpt-image-1`. Unsupported for `gpt-image-1-mini`. Supports `high` and
+   * `low`. Defaults to `low`.
    */
   input_fidelity?: 'high' | 'low' | null;

src/resources/realtime/calls.ts

Lines changed: 13 additions & 2 deletions
@@ -177,8 +177,19 @@ export interface CallAcceptParams {
   tracing?: RealtimeAPI.RealtimeTracingConfig | null;
 
   /**
-   * Controls how the realtime conversation is truncated prior to model inference.
-   * The default is `auto`.
+   * When the number of tokens in a conversation exceeds the model's input token
+   * limit, the conversation will be truncated, meaning messages (starting from the
+   * oldest) will not be included in the model's context. A 32k context model with
+   * 4,096 max output tokens can only include 28,224 tokens in the context before
+   * truncation occurs. Clients can configure truncation behavior to truncate with a
+   * lower max token limit, which is an effective way to control token usage and
+   * cost. Truncation will reduce the number of cached tokens on the next turn
+   * (busting the cache), since messages are dropped from the beginning of the
+   * context. However, clients can also configure truncation to retain messages up
+   * to a fraction of the maximum context size, which will reduce the need for
+   * future truncations and thus improve the cache rate. Truncation can be disabled
+   * entirely, which means the server will never truncate but would instead return
+   * an error if the conversation exceeds the model's input token limit.
    */
   truncation?: RealtimeAPI.RealtimeTruncation;
 }
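
For illustration (not part of this diff), a hedged sketch of accepting an incoming call with a retention-ratio truncation strategy; the call ID is invented and the exact set of required `CallAcceptParams` fields is assumed:

import OpenAI from 'openai';

const client = new OpenAI();

// Retain roughly 80% of the post-instruction context whenever truncation
// fires, instead of the default `auto` behavior.
await client.realtime.calls.accept('rtc_abc123', {
  type: 'realtime',
  model: 'gpt-realtime',
  truncation: { type: 'retention_ratio', retention_ratio: 0.8 },
});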

src/resources/realtime/client-secrets.ts

Lines changed: 13 additions & 2 deletions
@@ -144,8 +144,19 @@ export interface RealtimeSessionCreateResponse {
   tracing?: 'auto' | RealtimeSessionCreateResponse.TracingConfiguration | null;
 
   /**
-   * Controls how the realtime conversation is truncated prior to model inference.
-   * The default is `auto`.
+   * When the number of tokens in a conversation exceeds the model's input token
+   * limit, the conversation will be truncated, meaning messages (starting from the
+   * oldest) will not be included in the model's context. A 32k context model with
+   * 4,096 max output tokens can only include 28,224 tokens in the context before
+   * truncation occurs. Clients can configure truncation behavior to truncate with a
+   * lower max token limit, which is an effective way to control token usage and
+   * cost. Truncation will reduce the number of cached tokens on the next turn
+   * (busting the cache), since messages are dropped from the beginning of the
+   * context. However, clients can also configure truncation to retain messages up
+   * to a fraction of the maximum context size, which will reduce the need for
+   * future truncations and thus improve the cache rate. Truncation can be disabled
+   * entirely, which means the server will never truncate but would instead return
+   * an error if the conversation exceeds the model's input token limit.
    */
   truncation?: RealtimeAPI.RealtimeTruncation;
 }

src/resources/realtime/realtime.ts

Lines changed: 53 additions & 6 deletions
@@ -3050,8 +3050,19 @@ export interface RealtimeSessionCreateRequest {
   tracing?: RealtimeTracingConfig | null;
 
   /**
-   * Controls how the realtime conversation is truncated prior to model inference.
-   * The default is `auto`.
+   * When the number of tokens in a conversation exceeds the model's input token
+   * limit, the conversation will be truncated, meaning messages (starting from the
+   * oldest) will not be included in the model's context. A 32k context model with
+   * 4,096 max output tokens can only include 28,224 tokens in the context before
+   * truncation occurs. Clients can configure truncation behavior to truncate with a
+   * lower max token limit, which is an effective way to control token usage and
+   * cost. Truncation will reduce the number of cached tokens on the next turn
+   * (busting the cache), since messages are dropped from the beginning of the
+   * context. However, clients can also configure truncation to retain messages up
+   * to a fraction of the maximum context size, which will reduce the need for
+   * future truncations and thus improve the cache rate. Truncation can be disabled
+   * entirely, which means the server will never truncate but would instead return
+   * an error if the conversation exceeds the model's input token limit.
    */
   truncation?: RealtimeTruncation;
 }
@@ -3474,8 +3485,19 @@ export interface RealtimeTranscriptionSessionCreateRequest {
 }
 
 /**
- * Controls how the realtime conversation is truncated prior to model inference.
- * The default is `auto`.
+ * When the number of tokens in a conversation exceeds the model's input token
+ * limit, the conversation will be truncated, meaning messages (starting from the
+ * oldest) will not be included in the model's context. A 32k context model with
+ * 4,096 max output tokens can only include 28,224 tokens in the context before
+ * truncation occurs. Clients can configure truncation behavior to truncate with a
+ * lower max token limit, which is an effective way to control token usage and
+ * cost. Truncation will reduce the number of cached tokens on the next turn
+ * (busting the cache), since messages are dropped from the beginning of the
+ * context. However, clients can also configure truncation to retain messages up
+ * to a fraction of the maximum context size, which will reduce the need for
+ * future truncations and thus improve the cache rate. Truncation can be disabled
+ * entirely, which means the server will never truncate but would instead return
+ * an error if the conversation exceeds the model's input token limit.
  */
 export type RealtimeTruncation = 'auto' | 'disabled' | RealtimeTruncationRetentionRatio;
 
@@ -3486,15 +3508,40 @@ export type RealtimeTruncation = 'auto' | 'disabled' | RealtimeTruncationRetenti
  */
 export interface RealtimeTruncationRetentionRatio {
   /**
-   * Fraction of post-instruction conversation tokens to retain (0.0 - 1.0) when the
-   * conversation exceeds the input token limit.
+   * Fraction of post-instruction conversation tokens to retain (`0.0` - `1.0`) when
+   * the conversation exceeds the input token limit. Setting this to `0.8` means that
+   * messages will be dropped until 80% of the maximum allowed tokens are used. This
+   * helps reduce the frequency of truncations and improve cache rates.
    */
   retention_ratio: number;
 
   /**
    * Use retention ratio truncation.
    */
   type: 'retention_ratio';
+
+  /**
+   * Optional custom token limits for this truncation strategy. If not provided, the
+   * model's default token limits will be used.
+   */
+  token_limits?: RealtimeTruncationRetentionRatio.TokenLimits;
+}
+
+export namespace RealtimeTruncationRetentionRatio {
+  /**
+   * Optional custom token limits for this truncation strategy. If not provided, the
+   * model's default token limits will be used.
+   */
+  export interface TokenLimits {
+    /**
+     * Maximum tokens allowed in the conversation after instructions (which include
+     * tool definitions). For example, setting this to 5,000 would mean that truncation
+     * would occur when the conversation exceeds 5,000 tokens after instructions. This
+     * cannot be higher than the model's context window size minus the maximum output
+     * tokens.
+     */
+    post_instructions?: number;
+  }
 }
 
 /**
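
To make the retention-ratio arithmetic concrete, a sketch (not part of this diff) of a truncation config with invented numbers. With `post_instructions: 5_000` and `retention_ratio: 0.8`, truncation fires once the post-instruction conversation exceeds 5,000 tokens and drops the oldest messages until roughly 4,000 tokens (0.8 × 5,000) remain; the import path is assumed from this file's location:

import type { RealtimeTruncationRetentionRatio } from 'openai/resources/realtime/realtime';

// Cap the post-instruction conversation at 5,000 tokens and keep ~4,000 of
// them after each truncation, so the next few turns fit without re-truncating
// (and without busting the prompt cache on every turn).
const truncation: RealtimeTruncationRetentionRatio = {
  type: 'retention_ratio',
  retention_ratio: 0.8,
  token_limits: { post_instructions: 5_000 },
};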

src/resources/responses/responses.ts

Lines changed: 32 additions & 3 deletions
@@ -322,6 +322,12 @@ export namespace FileSearchTool {
    * Ranking options for search.
    */
   export interface RankingOptions {
+    /**
+     * Weights that control how reciprocal rank fusion balances semantic embedding
+     * matches versus sparse keyword matches when hybrid search is enabled.
+     */
+    hybrid_search?: RankingOptions.HybridSearch;
+
     /**
      * The ranker to use for the file search.
      */
@@ -334,6 +340,24 @@ export namespace FileSearchTool {
      */
     score_threshold?: number;
   }
+
+  export namespace RankingOptions {
+    /**
+     * Weights that control how reciprocal rank fusion balances semantic embedding
+     * matches versus sparse keyword matches when hybrid search is enabled.
+     */
+    export interface HybridSearch {
+      /**
+       * The weight of the embedding in the reciprocal rank fusion.
+       */
+      embedding_weight: number;
+
+      /**
+       * The weight of the text in the reciprocal rank fusion.
+       */
+      text_weight: number;
+    }
+  }
 }
 
 /**
@@ -3846,6 +3870,8 @@ export interface ResponseOutputText {
     | ResponseOutputText.FilePath
   >;
 
+  logprobs: Array<ResponseOutputText.Logprob>;
+
   /**
    * The text output from the model.
    */
@@ -3855,8 +3881,6 @@
    * The type of the output text. Always `output_text`.
    */
   type: 'output_text';
-
-  logprobs?: Array<ResponseOutputText.Logprob>;
 }
 
 export namespace ResponseOutputText {
@@ -5047,6 +5071,8 @@ export namespace Tool {
   /**
    * An optional list of uploaded files to make available to your code.
    */
   file_ids?: Array<string>;
+
+  memory_limit?: '1g' | '4g' | '16g' | '64g' | null;
 }
 
@@ -5066,7 +5092,10 @@ export namespace Tool {
   background?: 'transparent' | 'opaque' | 'auto';
 
   /**
-   * Control how much effort the model will exert to match the style and features, especially facial features, of input images. This parameter is only supported for `gpt-image-1`. Unsupported for `gpt-image-1-mini`. Supports `high` and `low`. Defaults to `low`.
+   * Control how much effort the model will exert to match the style and features,
+   * especially facial features, of input images. This parameter is only supported
+   * for `gpt-image-1`. Unsupported for `gpt-image-1-mini`. Supports `high` and
+   * `low`. Defaults to `low`.
    */
   input_fidelity?: 'high' | 'low' | null;
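
For illustration (not part of this diff), a sketch of supplying the new hybrid-search weights on a `file_search` tool; the model, query, vector store ID, and weights are invented. Weighting the embedding side 0.7 versus 0.3 biases the reciprocal-rank-fusion blend toward semantic matches over sparse keyword matches:

import OpenAI from 'openai';

const client = new OpenAI();

const response = await client.responses.create({
  model: 'gpt-4.1',
  input: 'What does the onboarding doc say about SSO?',
  tools: [
    {
      type: 'file_search',
      vector_store_ids: ['vs_abc123'],
      ranking_options: {
        // Favor semantic embedding matches over keyword matches in the fused ranking.
        hybrid_search: { embedding_weight: 0.7, text_weight: 0.3 },
      },
    },
  ],
});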

src/resources/shared.ts

Lines changed: 6 additions & 0 deletions
@@ -135,13 +135,19 @@ export interface CompoundFilter {
 export type CustomToolInputFormat = CustomToolInputFormat.Text | CustomToolInputFormat.Grammar;
 
 export namespace CustomToolInputFormat {
+  /**
+   * Unconstrained free-form text.
+   */
   export interface Text {
     /**
      * Unconstrained text format. Always `text`.
      */
     type: 'text';
   }
 
+  /**
+   * A grammar defined by the user.
+   */
   export interface Grammar {
     /**
      * The grammar definition.
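
For illustration (not part of this diff), a sketch of the two input formats; the `syntax` and `definition` fields on `Grammar` are assumed from the rest of this file, and the grammar itself is invented:

import type { CustomToolInputFormat } from 'openai/resources/shared';

// Free-form text input for a custom tool.
const freeform: CustomToolInputFormat = { type: 'text' };

// Constrain the tool's input to a tiny Lark grammar.
const constrained: CustomToolInputFormat = {
  type: 'grammar',
  syntax: 'lark',
  definition: 'start: "yes" | "no"',
};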

src/resources/vector-stores/file-batches.ts

Lines changed: 42 additions & 7 deletions
@@ -255,13 +255,6 @@ export namespace VectorStoreFileBatch {
 }
 
 export interface FileBatchCreateParams {
-  /**
-   * A list of [File](https://platform.openai.com/docs/api-reference/files) IDs that
-   * the vector store should use. Useful for tools like `file_search` that can access
-   * files.
-   */
-  file_ids: Array<string>;
-
   /**
    * Set of 16 key-value pairs that can be attached to an object. This can be useful
    * for storing additional information about the object in a structured format, and
@@ -276,6 +269,48 @@ export interface FileBatchCreateParams {
    * strategy. Only applicable if `file_ids` is non-empty.
    */
   chunking_strategy?: VectorStoresAPI.FileChunkingStrategyParam;
+
+  /**
+   * A list of [File](https://platform.openai.com/docs/api-reference/files) IDs that
+   * the vector store should use. Useful for tools like `file_search` that can access
+   * files. If `attributes` or `chunking_strategy` are provided, they will be applied
+   * to all files in the batch. Mutually exclusive with `files`.
+   */
+  file_ids?: Array<string>;
+
+  /**
+   * A list of objects that each include a `file_id` plus optional `attributes` or
+   * `chunking_strategy`. Use this when you need to override metadata for specific
+   * files. The global `attributes` or `chunking_strategy` will be ignored and must
+   * be specified for each file. Mutually exclusive with `file_ids`.
+   */
+  files?: Array<FileBatchCreateParams.File>;
+}
+
+export namespace FileBatchCreateParams {
+  export interface File {
+    /**
+     * A [File](https://platform.openai.com/docs/api-reference/files) ID that the
+     * vector store should use. Useful for tools like `file_search` that can access
+     * files.
+     */
+    file_id: string;
+
+    /**
+     * Set of 16 key-value pairs that can be attached to an object. This can be useful
+     * for storing additional information about the object in a structured format, and
+     * querying for objects via API or the dashboard. Keys are strings with a maximum
+     * length of 64 characters. Values are strings with a maximum length of 512
+     * characters, booleans, or numbers.
+     */
+    attributes?: { [key: string]: string | number | boolean } | null;
+
+    /**
+     * The chunking strategy used to chunk the file(s). If not set, will use the `auto`
+     * strategy. Only applicable if `file_ids` is non-empty.
+     */
+    chunking_strategy?: VectorStoresAPI.FileChunkingStrategyParam;
+  }
 }
 
 export interface FileBatchRetrieveParams {
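
For illustration (not part of this diff), a sketch of the new per-file form of the create params; the vector store ID, file IDs, attributes, and chunk sizes are invented:

import OpenAI from 'openai';

const client = new OpenAI();

// Each entry carries its own attributes or chunking strategy; this form is
// mutually exclusive with the top-level `file_ids` array.
await client.vectorStores.fileBatches.create('vs_abc123', {
  files: [
    { file_id: 'file-1', attributes: { team: 'docs' } },
    {
      file_id: 'file-2',
      chunking_strategy: {
        type: 'static',
        static: { max_chunk_size_tokens: 800, chunk_overlap_tokens: 400 },
      },
    },
  ],
});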

src/resources/videos.ts

Lines changed: 5 additions & 0 deletions
@@ -114,6 +114,11 @@ export interface Video {
    */
   progress: number;
 
+  /**
+   * The prompt that was used to generate the video.
+   */
+  prompt: string | null;
+
   /**
    * Identifier of the source video if this video is a remix.
    */
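
For illustration (not part of this diff), reading the new field from a retrieved video; the video ID is invented and the `retrieve` call is assumed from this resource's standard methods:

import OpenAI from 'openai';

const client = new OpenAI();

const video = await client.videos.retrieve('video_abc123');
console.log(video.prompt ?? '(no prompt recorded)'); // `prompt` may be null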

tests/api-resources/vector-stores/file-batches.test.ts

Lines changed: 2 additions & 10 deletions
@@ -8,8 +8,8 @@ const client = new OpenAI({
 });
 
 describe('resource fileBatches', () => {
-  test('create: only required params', async () => {
-    const responsePromise = client.vectorStores.fileBatches.create('vs_abc123', { file_ids: ['string'] });
+  test('create', async () => {
+    const responsePromise = client.vectorStores.fileBatches.create('vs_abc123', {});
     const rawResponse = await responsePromise.asResponse();
     expect(rawResponse).toBeInstanceOf(Response);
     const response = await responsePromise;
@@ -19,14 +19,6 @@ describe('resource fileBatches', () => {
     expect(dataAndResponse.response).toBe(rawResponse);
   });
 
-  test('create: required and optional params', async () => {
-    const response = await client.vectorStores.fileBatches.create('vs_abc123', {
-      file_ids: ['string'],
-      attributes: { foo: 'string' },
-      chunking_strategy: { type: 'auto' },
-    });
-  });
-
   test('retrieve: only required params', async () => {
     const responsePromise = client.vectorStores.fileBatches.retrieve('vsfb_abc123', {
       vector_store_id: 'vs_abc123',
