
Commit a136ac2

chore(openai-native): generalize Responses API options and extended prompt caching
1 parent ac5f3eb commit a136ac2

4 files changed (+29, −26 lines)


packages/types/src/model.ts

Lines changed: 4 additions & 0 deletions
```diff
@@ -74,6 +74,10 @@ export const modelInfoSchema = z.object({
 	contextWindow: z.number(),
 	supportsImages: z.boolean().optional(),
 	supportsPromptCache: z.boolean(),
+	// Optional default prompt cache retention policy for providers that support it.
+	// When set to "24h", extended prompt caching will be requested; when omitted
+	// or set to "in_memory", the default in‑memory cache is used.
+	promptCacheRetention: z.enum(["in_memory", "24h"]).optional(),
 	// Capability flag to indicate whether the model supports an output verbosity parameter
 	supportsVerbosity: z.boolean().optional(),
 	supportsReasoningBudget: z.boolean().optional(),
```
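For reference, a minimal sketch of how the extended schema behaves at parse time. The trimmed-down schema below repeats only the fields this commit touches; the sample model objects are hypothetical.

```ts
import { z } from "zod"

// Trimmed-down sketch of the fields this commit touches in modelInfoSchema.
const modelInfoSchema = z.object({
	contextWindow: z.number(),
	supportsPromptCache: z.boolean(),
	promptCacheRetention: z.enum(["in_memory", "24h"]).optional(),
})

// Opts into extended (24h) prompt cache retention.
modelInfoSchema.parse({
	contextWindow: 400000,
	supportsPromptCache: true,
	promptCacheRetention: "24h",
})

// Omitting the field is valid and leaves the default in-memory cache in effect.
modelInfoSchema.parse({ contextWindow: 128000, supportsPromptCache: true })

// Any other retention value fails validation with a ZodError:
// modelInfoSchema.parse({ contextWindow: 128000, supportsPromptCache: true, promptCacheRetention: "7d" })
```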

packages/types/src/providers/openai.ts

Lines changed: 3 additions & 0 deletions
```diff
@@ -11,6 +11,7 @@ export const openAiNativeModels = {
 	contextWindow: 400000,
 	supportsImages: true,
 	supportsPromptCache: true,
+	promptCacheRetention: "24h",
 	supportsReasoningEffort: ["none", "low", "medium", "high"],
 	reasoningEffort: "medium",
 	inputPrice: 1.25,
@@ -29,6 +30,7 @@
 	contextWindow: 400000,
 	supportsImages: true,
 	supportsPromptCache: true,
+	promptCacheRetention: "24h",
 	supportsReasoningEffort: ["low", "medium", "high"],
 	reasoningEffort: "medium",
 	inputPrice: 1.25,
@@ -43,6 +45,7 @@
 	contextWindow: 400000,
 	supportsImages: true,
 	supportsPromptCache: true,
+	promptCacheRetention: "24h",
 	supportsReasoningEffort: ["low", "medium", "high"],
 	reasoningEffort: "medium",
 	inputPrice: 0.25,
```
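Since the retention policy now lives on the model metadata, downstream code can discover opted-in models generically rather than matching model IDs. A hypothetical sketch (the import path is an assumption, not shown in this commit):

```ts
import { openAiNativeModels } from "@roo-code/types" // import path is an assumption

// List the model IDs that declare extended (24h) prompt cache retention.
const extendedRetentionModelIds = Object.entries(openAiNativeModels)
	.filter(([, info]) => info.supportsPromptCache && info.promptCacheRetention === "24h")
	.map(([id]) => id)

console.log(extendedRetentionModelIds)
```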

src/api/providers/openai-native.ts

Lines changed: 18 additions & 22 deletions
```diff
@@ -52,9 +52,10 @@ export class OpenAiNativeHandler extends BaseProvider implements SingleCompletio
 	constructor(options: ApiHandlerOptions) {
 		super()
 		this.options = options
-		// Default to including reasoning.summary: "auto" for GPT‑5 unless explicitly disabled
-		if (this.options.enableGpt5ReasoningSummary === undefined) {
-			this.options.enableGpt5ReasoningSummary = true
+		// Default to including reasoning.summary: "auto" for models that support Responses API
+		// reasoning summaries unless explicitly disabled.
+		if (this.options.enableResponsesReasoningSummary === undefined) {
+			this.options.enableResponsesReasoningSummary = true
 		}
 		const apiKey = this.options.openAiNativeApiKey ?? "not-provided"
 		this.client = new OpenAI({ baseURL: this.options.openAiNativeBaseUrl, apiKey })
@@ -176,10 +177,10 @@ export class OpenAiNativeHandler extends BaseProvider implements SingleCompletio
 		reasoningEffort: ReasoningEffortExtended | undefined,
 		metadata?: ApiHandlerCreateMessageMetadata,
 	): any {
-		// Build a request body
-		// Ensure we explicitly pass max_output_tokens for GPT‑5 based on Roo's reserved model response calculation
+		// Build a request body for the OpenAI Responses API.
+		// Ensure we explicitly pass max_output_tokens based on Roo's reserved model response calculation
 		// so requests do not default to very large limits (e.g., 120k).
-		interface Gpt5RequestBody {
+		interface ResponsesRequestBody {
 			model: string
 			input: Array<{ role: "user" | "assistant"; content: any[] } | { type: string; content: string }>
 			stream: boolean
@@ -202,7 +203,7 @@ export class OpenAiNativeHandler extends BaseProvider implements SingleCompletio
 		// Decide whether to enable extended prompt cache retention for this request
 		const promptCacheRetention = this.getPromptCacheRetention(model)

-		const body: Gpt5RequestBody = {
+		const body: ResponsesRequestBody = {
 			model: model.id,
 			input: formattedInput,
 			stream: true,
@@ -218,7 +219,7 @@ export class OpenAiNativeHandler extends BaseProvider implements SingleCompletio
 			? {
 					reasoning: {
 						...(reasoningEffort ? { effort: reasoningEffort } : {}),
-						...(this.options.enableGpt5ReasoningSummary ? { summary: "auto" as const } : {}),
+						...(this.options.enableResponsesReasoningSummary ? { summary: "auto" as const } : {}),
 					},
 				}
 			: {}),
@@ -271,7 +272,7 @@ export class OpenAiNativeHandler extends BaseProvider implements SingleCompletio
 			}
 		} catch (sdkErr: any) {
 			// For errors, fallback to manual SSE via fetch
-			yield* this.makeGpt5ResponsesAPIRequest(requestBody, model, metadata, systemPrompt, messages)
+			yield* this.makeResponsesApiRequest(requestBody, model, metadata, systemPrompt, messages)
 		}
 	}

@@ -330,7 +331,7 @@ export class OpenAiNativeHandler extends BaseProvider implements SingleCompletio
 		return formattedMessages
 	}

-	private async *makeGpt5ResponsesAPIRequest(
+	private async *makeResponsesApiRequest(
 		requestBody: any,
 		model: OpenAiNativeModel,
 		metadata?: ApiHandlerCreateMessageMetadata,
@@ -355,7 +356,7 @@ export class OpenAiNativeHandler extends BaseProvider implements SingleCompletio
 		if (!response.ok) {
 			const errorText = await response.text()

-			let errorMessage = `GPT-5 API request failed (${response.status})`
+			let errorMessage = `OpenAI Responses API request failed (${response.status})`
 			let errorDetails = ""

 			// Try to parse error as JSON for better error messages
@@ -811,7 +812,7 @@ export class OpenAiNativeHandler extends BaseProvider implements SingleCompletio
 			}
 		}

-		// Usage for done/completed is already handled by processGpt5Event in SDK path.
+		// Usage for done/completed is already handled by processEvent in the SDK path.
 		// For SSE path, usage often arrives separately; avoid double-emitting here.
 	}
 	// These are structural or status events, we can just log them at a lower level or ignore.
@@ -988,19 +989,14 @@ export class OpenAiNativeHandler extends BaseProvider implements SingleCompletio
 	/**
 	 * Returns the appropriate prompt cache retention policy for the given model, if any.
 	 *
-	 * Extended prompt cache retention ("24h") is only available for GPT‑5.1 family models that
-	 * support prompt caching. For other models we omit the parameter so the default in‑memory
-	 * policy is used.
+	 * The policy is driven by ModelInfo.promptCacheRetention so that model-specific details
+	 * live in the shared types layer rather than this provider. When set to "24h" and the
+	 * model supports prompt caching, extended prompt cache retention is requested.
	 */
 	private getPromptCacheRetention(model: OpenAiNativeModel): "24h" | undefined {
 		if (!model.info.supportsPromptCache) return undefined

-		// Extended prompt cache retention is only supported for GPT‑5.1 models:
-		// - gpt-5.1
-		// - gpt-5.1-codex
-		// - gpt-5.1-codex-mini
-		// - gpt-5.1-chat-latest (future compatibility)
-		if (model.id.startsWith("gpt-5.1")) {
+		if (model.info.promptCacheRetention === "24h") {
 			return "24h"
 		}

@@ -1113,7 +1109,7 @@ export class OpenAiNativeHandler extends BaseProvider implements SingleCompletio
 		if (reasoningEffort) {
 			requestBody.reasoning = {
 				effort: reasoningEffort,
-				...(this.options.enableGpt5ReasoningSummary ? { summary: "auto" as const } : {}),
+				...(this.options.enableResponsesReasoningSummary ? { summary: "auto" as const } : {}),
 			}
 		}
```
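The commit computes `promptCacheRetention` just before building the request body, but the hunks above do not show where the value is attached to the outgoing request. Below is a hypothetical sketch of that wiring, assuming the Responses API accepts a top-level `prompt_cache_retention` field; the field name is an assumption, not confirmed by this diff.

```ts
// Sketch only: loosely mirrors the ResponsesRequestBody shape from the diff.
interface ResponsesBodySketch {
	model: string
	input: unknown[]
	stream: boolean
	prompt_cache_retention?: "24h" // assumed field name
}

function buildBodySketch(model: string, input: unknown[], retention: "24h" | undefined): ResponsesBodySketch {
	return {
		model,
		input,
		stream: true,
		// Omitting the field keeps the provider's default in-memory cache policy.
		...(retention ? { prompt_cache_retention: retention } : {}),
	}
}

console.log(buildBodySketch("gpt-5.1", [], "24h"))
// => { model: "gpt-5.1", input: [], stream: true, prompt_cache_retention: "24h" }
```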

src/shared/api.ts

Lines changed: 4 additions & 4 deletions
```diff
@@ -13,11 +13,11 @@ import {
 // Extend ProviderSettings (minus apiProvider) with handler-specific toggles.
 export type ApiHandlerOptions = Omit<ProviderSettings, "apiProvider"> & {
 	/**
-	 * When true and using GPT‑5 Responses API, include reasoning.summary: "auto"
-	 * so the API returns reasoning summaries (we already parse and surface them).
-	 * Defaults to true; set to false to disable summaries.
+	 * When true and using OpenAI Responses API models that support reasoning summaries,
+	 * include reasoning.summary: "auto" so the API returns summaries (we already parse
+	 * and surface them). Defaults to true; set to false to disable summaries.
 	 */
-	enableGpt5ReasoningSummary?: boolean
+	enableResponsesReasoningSummary?: boolean
 	/**
 	 * Optional override for Ollama's num_ctx parameter.
 	 * When set, this value will be used in Ollama chat requests.
```
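A usage sketch for the renamed option; the handler and option names come from this commit, while the surrounding setup is illustrative only.

```ts
import { OpenAiNativeHandler } from "./openai-native" // import path assumed

// Leaving enableResponsesReasoningSummary undefined lets the constructor default
// it to true; set it to false to suppress reasoning summaries explicitly.
const handler = new OpenAiNativeHandler({
	openAiNativeApiKey: process.env.OPENAI_API_KEY ?? "not-provided",
	enableResponsesReasoningSummary: false,
} as any) // sketch only: real code would satisfy the full ApiHandlerOptions type
```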
