4 changes: 4 additions & 0 deletions packages/types/src/model.ts
@@ -74,6 +74,10 @@ export const modelInfoSchema = z.object({
contextWindow: z.number(),
supportsImages: z.boolean().optional(),
supportsPromptCache: z.boolean(),
// Optional default prompt cache retention policy for providers that support it.
// When set to "24h", extended prompt caching will be requested; when omitted
// or set to "in_memory", the default in‑memory cache is used.
promptCacheRetention: z.enum(["in_memory", "24h"]).optional(),
// Capability flag to indicate whether the model supports an output verbosity parameter
supportsVerbosity: z.boolean().optional(),
supportsReasoningBudget: z.boolean().optional(),
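For context, a minimal sketch of how the new field behaves under the schema change above. The standalone sub-schema here is an assumption for illustration only, not code from this PR:

import { z } from "zod"

// Mirrors only the new field from modelInfoSchema above; the rest of the schema is omitted.
const promptCacheRetentionSchema = z.enum(["in_memory", "24h"]).optional()

promptCacheRetentionSchema.parse("24h") // ok: extended prompt caching will be requested
promptCacheRetentionSchema.parse(undefined) // ok: falls back to the default in-memory cache
promptCacheRetentionSchema.parse("7d") // throws ZodError: unsupported retention value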
3 changes: 3 additions & 0 deletions packages/types/src/providers/openai.ts
@@ -11,6 +11,7 @@ export const openAiNativeModels = {
contextWindow: 400000,
supportsImages: true,
supportsPromptCache: true,
promptCacheRetention: "24h",
supportsReasoningEffort: ["none", "low", "medium", "high"],
reasoningEffort: "medium",
inputPrice: 1.25,
@@ -29,6 +30,7 @@ export const openAiNativeModels = {
contextWindow: 400000,
supportsImages: true,
supportsPromptCache: true,
promptCacheRetention: "24h",
supportsReasoningEffort: ["low", "medium", "high"],
reasoningEffort: "medium",
inputPrice: 1.25,
@@ -43,6 +45,7 @@ export const openAiNativeModels = {
contextWindow: 400000,
supportsImages: true,
supportsPromptCache: true,
promptCacheRetention: "24h",
supportsReasoningEffort: ["low", "medium", "high"],
reasoningEffort: "medium",
inputPrice: 0.25,
45 changes: 45 additions & 0 deletions src/api/providers/__tests__/openai-native-usage.spec.ts
@@ -344,6 +344,51 @@ describe("OpenAiNativeHandler - normalizeUsage", () => {
})
})

describe("OpenAiNativeHandler - prompt cache retention", () => {
let handler: OpenAiNativeHandler

beforeEach(() => {
handler = new OpenAiNativeHandler({
openAiNativeApiKey: "test-key",
})
})

const buildRequestBodyForModel = (modelId: string) => {
// Force the handler to use the requested model ID
;(handler as any).options.apiModelId = modelId
const model = handler.getModel()
// Minimal formatted input/systemPrompt/verbosity/metadata for building the body
return (handler as any).buildRequestBody(model, [], "", model.verbosity, undefined, undefined)
}

it("should set prompt_cache_retention=24h for gpt-5.1 models that support prompt caching", () => {
const body = buildRequestBodyForModel("gpt-5.1")
expect(body.prompt_cache_retention).toBe("24h")

const codexBody = buildRequestBodyForModel("gpt-5.1-codex")
expect(codexBody.prompt_cache_retention).toBe("24h")

const codexMiniBody = buildRequestBodyForModel("gpt-5.1-codex-mini")
expect(codexMiniBody.prompt_cache_retention).toBe("24h")
})

it("should not set prompt_cache_retention for non-gpt-5.1 models even if they support prompt caching", () => {
const body = buildRequestBodyForModel("gpt-5")
expect(body.prompt_cache_retention).toBeUndefined()

const fourOBody = buildRequestBodyForModel("gpt-4o")
expect(fourOBody.prompt_cache_retention).toBeUndefined()
})

it("should not set prompt_cache_retention when the model does not support prompt caching", () => {
const modelId = "codex-mini-latest"
expect(openAiNativeModels[modelId as keyof typeof openAiNativeModels].supportsPromptCache).toBe(false)

const body = buildRequestBodyForModel(modelId)
expect(body.prompt_cache_retention).toBeUndefined()
})
})

describe("cost calculation", () => {
it("should pass total input tokens to calculateApiCostOpenAI", () => {
const usage = {
58 changes: 45 additions & 13 deletions src/api/providers/openai-native.ts
@@ -52,9 +52,10 @@ export class OpenAiNativeHandler extends BaseProvider implements SingleCompletio
constructor(options: ApiHandlerOptions) {
super()
this.options = options
// Default to including reasoning.summary: "auto" for GPT‑5 unless explicitly disabled
if (this.options.enableGpt5ReasoningSummary === undefined) {
this.options.enableGpt5ReasoningSummary = true
// Default to including reasoning.summary: "auto" for models that support Responses API
// reasoning summaries unless explicitly disabled.
if (this.options.enableResponsesReasoningSummary === undefined) {
this.options.enableResponsesReasoningSummary = true
}
const apiKey = this.options.openAiNativeApiKey ?? "not-provided"
this.client = new OpenAI({ baseURL: this.options.openAiNativeBaseUrl, apiKey })
@@ -176,10 +177,10 @@ export class OpenAiNativeHandler extends BaseProvider implements SingleCompletio
reasoningEffort: ReasoningEffortExtended | undefined,
metadata?: ApiHandlerCreateMessageMetadata,
): any {
// Build a request body
// Ensure we explicitly pass max_output_tokens for GPT‑5 based on Roo's reserved model response calculation
// Build a request body for the OpenAI Responses API.
// Ensure we explicitly pass max_output_tokens based on Roo's reserved model response calculation
// so requests do not default to very large limits (e.g., 120k).
interface Gpt5RequestBody {
interface ResponsesRequestBody {
model: string
input: Array<{ role: "user" | "assistant"; content: any[] } | { type: string; content: string }>
stream: boolean
@@ -191,13 +192,18 @@ export class OpenAiNativeHandler extends BaseProvider implements SingleCompletio
instructions?: string
service_tier?: ServiceTier
include?: string[]
/** Prompt cache retention policy: "in_memory" (default) or "24h" for extended caching */
prompt_cache_retention?: "in_memory" | "24h"
}

// Validate requested tier against model support; if not supported, omit.
const requestedTier = (this.options.openAiNativeServiceTier as ServiceTier | undefined) || undefined
const allowedTierNames = new Set(model.info.tiers?.map((t) => t.name).filter(Boolean) || [])

const body: Gpt5RequestBody = {
// Decide whether to enable extended prompt cache retention for this request
const promptCacheRetention = this.getPromptCacheRetention(model)

const body: ResponsesRequestBody = {
model: model.id,
input: formattedInput,
stream: true,
@@ -213,7 +219,7 @@ export class OpenAiNativeHandler extends BaseProvider implements SingleCompletio
? {
reasoning: {
...(reasoningEffort ? { effort: reasoningEffort } : {}),
...(this.options.enableGpt5ReasoningSummary ? { summary: "auto" as const } : {}),
...(this.options.enableResponsesReasoningSummary ? { summary: "auto" as const } : {}),
},
}
: {}),
@@ -229,6 +235,9 @@ export class OpenAiNativeHandler extends BaseProvider implements SingleCompletio
(requestedTier === "default" || allowedTierNames.has(requestedTier)) && {
service_tier: requestedTier,
}),
// Enable extended prompt cache retention for models that support it.
// This uses the OpenAI Responses API `prompt_cache_retention` parameter.
...(promptCacheRetention ? { prompt_cache_retention: promptCacheRetention } : {}),
}

// Include text.verbosity only when the model explicitly supports it
@@ -263,7 +272,7 @@ export class OpenAiNativeHandler extends BaseProvider implements SingleCompletio
}
} catch (sdkErr: any) {
// For errors, fallback to manual SSE via fetch
yield* this.makeGpt5ResponsesAPIRequest(requestBody, model, metadata, systemPrompt, messages)
yield* this.makeResponsesApiRequest(requestBody, model, metadata, systemPrompt, messages)
}
}

@@ -322,7 +331,7 @@ export class OpenAiNativeHandler extends BaseProvider implements SingleCompletio
return formattedMessages
}

private async *makeGpt5ResponsesAPIRequest(
private async *makeResponsesApiRequest(
requestBody: any,
model: OpenAiNativeModel,
metadata?: ApiHandlerCreateMessageMetadata,
@@ -347,7 +356,7 @@ export class OpenAiNativeHandler extends BaseProvider implements SingleCompletio
if (!response.ok) {
const errorText = await response.text()

let errorMessage = `GPT-5 API request failed (${response.status})`
let errorMessage = `OpenAI Responses API request failed (${response.status})`
let errorDetails = ""

// Try to parse error as JSON for better error messages
@@ -803,7 +812,7 @@ export class OpenAiNativeHandler extends BaseProvider implements SingleCompletio
}
}

// Usage for done/completed is already handled by processGpt5Event in SDK path.
// Usage for done/completed is already handled by processEvent in the SDK path.
// For SSE path, usage often arrives separately; avoid double-emitting here.
}
// These are structural or status events, we can just log them at a lower level or ignore.
@@ -977,6 +986,23 @@ export class OpenAiNativeHandler extends BaseProvider implements SingleCompletio
return selected && selected !== "disable" ? (selected as any) : undefined
}

/**
* Returns the appropriate prompt cache retention policy for the given model, if any.
*
* The policy is driven by ModelInfo.promptCacheRetention so that model-specific details
* live in the shared types layer rather than this provider. When set to "24h" and the
* model supports prompt caching, extended prompt cache retention is requested.
*/
private getPromptCacheRetention(model: OpenAiNativeModel): "24h" | undefined {
if (!model.info.supportsPromptCache) return undefined

if (model.info.promptCacheRetention === "24h") {
return "24h"
}

return undefined
}

/**
* Returns a shallow-cloned ModelInfo with pricing overridden for the given tier, if available.
* If no tier or no overrides exist, the original ModelInfo is returned.
@@ -1083,7 +1109,7 @@ export class OpenAiNativeHandler extends BaseProvider implements SingleCompletio
if (reasoningEffort) {
requestBody.reasoning = {
effort: reasoningEffort,
...(this.options.enableGpt5ReasoningSummary ? { summary: "auto" as const } : {}),
...(this.options.enableResponsesReasoningSummary ? { summary: "auto" as const } : {}),
}
}

@@ -1102,6 +1128,12 @@ export class OpenAiNativeHandler extends BaseProvider implements SingleCompletio
requestBody.text = { verbosity: (verbosity || "medium") as VerbosityLevel }
}

// Enable extended prompt cache retention for eligible models
const promptCacheRetention = this.getPromptCacheRetention(model)
if (promptCacheRetention) {
requestBody.prompt_cache_retention = promptCacheRetention
}

// Make the non-streaming request
const response = await (this.client as any).responses.create(requestBody)

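For a rough sense of the net effect, this is a hedged sketch of the payload buildRequestBody() would now produce for a model whose ModelInfo sets promptCacheRetention: "24h". Only prompt_cache_retention reflects the new behavior; the other values are assumed placeholders, not taken from this diff:

// Hypothetical Responses API payload for a 24h-retention model; values other than
// prompt_cache_retention are made up for illustration.
const exampleBody = {
	model: "gpt-5.1",
	input: [{ role: "user" as const, content: [{ type: "input_text", text: "Hello" }] }],
	stream: true,
	max_output_tokens: 16384,
	reasoning: { effort: "medium", summary: "auto" as const },
	// New in this PR: request extended (24h) prompt cache retention instead of the in-memory default.
	prompt_cache_retention: "24h" as const,
}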
8 changes: 4 additions & 4 deletions src/shared/api.ts
@@ -13,11 +13,11 @@ import {
// Extend ProviderSettings (minus apiProvider) with handler-specific toggles.
export type ApiHandlerOptions = Omit<ProviderSettings, "apiProvider"> & {
/**
* When true and using GPT‑5 Responses API, include reasoning.summary: "auto"
* so the API returns reasoning summaries (we already parse and surface them).
* Defaults to true; set to false to disable summaries.
* When true and using OpenAI Responses API models that support reasoning summaries,
* include reasoning.summary: "auto" so the API returns summaries (we already parse
* and surface them). Defaults to true; set to false to disable summaries.
*/
enableGpt5ReasoningSummary?: boolean
enableResponsesReasoningSummary?: boolean
/**
* Optional override for Ollama's num_ctx parameter.
* When set, this value will be used in Ollama chat requests.
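A hedged usage sketch of the renamed option; the import path and any construction details beyond the option names shown in this diff are assumptions:

// Assumed import path, for illustration only.
import { OpenAiNativeHandler } from "./api/providers/openai-native"

// Reasoning summaries default to on; pass false to omit reasoning.summary from Responses API requests.
const handler = new OpenAiNativeHandler({
	openAiNativeApiKey: process.env.OPENAI_API_KEY ?? "not-provided",
	apiModelId: "gpt-5.1", // a model whose ModelInfo sets promptCacheRetention: "24h"
	enableResponsesReasoningSummary: false,
})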