diff --git a/src/api/providers/__tests__/openrouter.spec.ts b/src/api/providers/__tests__/openrouter.spec.ts
index ea850c47be9..ae36fc1399d 100644
--- a/src/api/providers/__tests__/openrouter.spec.ts
+++ b/src/api/providers/__tests__/openrouter.spec.ts
@@ -98,9 +98,11 @@ describe("OpenRouterHandler", () => {
 		})
 
 		const result = await handler.fetchModel()
-		expect(result.maxTokens).toBe(128000) // Use actual implementation value
-		expect(result.reasoningBudget).toBeUndefined() // Use actual implementation value
-		expect(result.temperature).toBe(0) // Use actual implementation value
+		// With the new clamping logic, 128000 tokens (64% of 200000 context window)
+		// gets clamped to 20% of context window: 200000 * 0.2 = 40000
+		expect(result.maxTokens).toBe(40000)
+		expect(result.reasoningBudget).toBeUndefined()
+		expect(result.temperature).toBe(0)
 	})
 
 	it("does not honor custom maxTokens for non-thinking models", async () => {
diff --git a/src/api/transform/__tests__/model-params.spec.ts b/src/api/transform/__tests__/model-params.spec.ts
index 44930aa5b8b..b5a02f534ec 100644
--- a/src/api/transform/__tests__/model-params.spec.ts
+++ b/src/api/transform/__tests__/model-params.spec.ts
@@ -293,12 +293,12 @@ describe("getModelParams", () => {
 	it("should not honor customMaxThinkingTokens for non-reasoning budget models", () => {
 		const model: ModelInfo = {
 			...baseModel,
-			maxTokens: 4000,
+			maxTokens: 3000, // 3000 is 18.75% of 16000 context window, within 20% threshold
 		}
 
 		expect(getModelParams({ ...anthropicParams, settings: { modelMaxThinkingTokens: 1500 }, model })).toEqual({
 			format: anthropicParams.format,
-			maxTokens: 4000,
+			maxTokens: 3000, // Uses model.maxTokens since it's within 20% threshold
 			temperature: 0, // Using default temperature.
 			reasoningEffort: undefined,
 			reasoningBudget: undefined, // Should remain undefined despite customMaxThinkingTokens being set.
@@ -565,7 +565,7 @@ describe("getModelParams", () => {
 		it("should use reasoningEffort if supportsReasoningEffort is false but reasoningEffort is set", () => {
 			const model: ModelInfo = {
 				...baseModel,
-				maxTokens: 8000,
+				maxTokens: 3000, // Changed to 3000 (18.75% of 16000), which is within 20% threshold
 				supportsReasoningEffort: false,
 				reasoningEffort: "medium",
 			}
@@ -576,7 +576,7 @@ describe("getModelParams", () => {
 				model,
 			})
 
-			expect(result.maxTokens).toBe(8000)
+			expect(result.maxTokens).toBe(3000) // Now uses model.maxTokens since it's within 20% threshold
 			expect(result.reasoningEffort).toBe("medium")
 		})
 	})
@@ -595,7 +595,8 @@ describe("getModelParams", () => {
 			model,
 		})
 
-		// Should discard model's maxTokens and use default
+		// For hybrid models (supportsReasoningBudget) in Anthropic contexts,
+		// should discard model's maxTokens and use ANTHROPIC_DEFAULT_MAX_TOKENS
 		expect(result.maxTokens).toBe(ANTHROPIC_DEFAULT_MAX_TOKENS)
 		expect(result.reasoningBudget).toBeUndefined()
 	})
diff --git a/src/shared/__tests__/api.spec.ts b/src/shared/__tests__/api.spec.ts
index 08d4bdf3bbc..80c3db1b7d7 100644
--- a/src/shared/__tests__/api.spec.ts
+++ b/src/shared/__tests__/api.spec.ts
@@ -25,11 +25,13 @@ describe("getModelMaxOutputTokens", () => {
 		expect(result).toBe(16384)
 	})
 
-	test("should return model maxTokens when not using claude-code provider", () => {
+	test("should return model maxTokens when not using claude-code provider and maxTokens is within 20% of context window", () => {
 		const settings: ProviderSettings = {
 			apiProvider: "anthropic",
 		}
 
+		// mockModel has maxTokens: 8192 and contextWindow: 200000
+		// 8192 is 4.096% of 200000, which is <= 20%, so it should use model.maxTokens
 		const result = getModelMaxOutputTokens({
 			modelId: "claude-3-5-sonnet-20241022",
 			model: mockModel,
@@ -115,7 +117,7 @@ describe("getModelMaxOutputTokens", () => {
 			contextWindow: 1_048_576,
 			supportsPromptCache: false,
 			supportsReasoningBudget: true,
-			maxTokens: 65_535,
+			maxTokens: 65_535, // 65_535 is ~6.25% of 1_048_576, which is <= 20%
 		}
 
 		const settings: ProviderSettings = {
@@ -124,7 +126,68 @@ describe("getModelMaxOutputTokens", () => {
 		}
 
 		const result = getModelMaxOutputTokens({ modelId: geminiModelId, model, settings })
-		expect(result).toBe(65_535) // Should use model.maxTokens, not ANTHROPIC_DEFAULT_MAX_TOKENS
+		expect(result).toBe(65_535) // Should use model.maxTokens since it's within 20% threshold
+	})
+
+	test("should clamp maxTokens to 20% of context window when maxTokens exceeds threshold", () => {
+		const model: ModelInfo = {
+			contextWindow: 100_000,
+			supportsPromptCache: false,
+			maxTokens: 50_000, // 50% of context window, exceeds 20% threshold
+		}
+
+		const settings: ProviderSettings = {
+			apiProvider: "openai",
+		}
+
+		const result = getModelMaxOutputTokens({
+			modelId: "gpt-4",
+			model,
+			settings,
+			format: "openai",
+		})
+		// Should clamp to 20% of context window: 100_000 * 0.2 = 20_000
+		expect(result).toBe(20_000)
+	})
+
+	test("should clamp maxTokens to 20% of context window for Anthropic models when maxTokens exceeds threshold", () => {
+		const model: ModelInfo = {
+			contextWindow: 100_000,
+			supportsPromptCache: true,
+			maxTokens: 50_000, // 50% of context window, exceeds 20% threshold
+		}
+
+		const settings: ProviderSettings = {
+			apiProvider: "anthropic",
+		}
+
+		const result = getModelMaxOutputTokens({
+			modelId: "claude-3-5-sonnet-20241022",
+			model,
+			settings,
+		})
+		// Should clamp to 20% of context window: 100_000 * 0.2 = 20_000
+		expect(result).toBe(20_000)
+	})
+
+	test("should use model.maxTokens when exactly at 20% threshold", () => {
+		const model: ModelInfo = {
+			contextWindow: 100_000,
+			supportsPromptCache: false,
+			maxTokens: 20_000, // Exactly 20% of context window
+		}
+
+		const settings: ProviderSettings = {
+			apiProvider: "openai",
+		}
+
+		const result = getModelMaxOutputTokens({
+			modelId: "gpt-4",
+			model,
+			settings,
+			format: "openai",
+		})
+		expect(result).toBe(20_000) // Should use model.maxTokens since it's exactly at 20%
 	})
 
 	test("should return modelMaxTokens from settings when reasoning budget is required", () => {
diff --git a/src/shared/api.ts b/src/shared/api.ts
index 44227ad7e4e..4cd2459f70f 100644
--- a/src/shared/api.ts
+++ b/src/shared/api.ts
@@ -90,9 +90,9 @@ export const getModelMaxOutputTokens = ({
 		return ANTHROPIC_DEFAULT_MAX_TOKENS
 	}
 
-	// If model has explicit maxTokens and it's not the full context window, use it
-	if (model.maxTokens && model.maxTokens !== model.contextWindow) {
-		return model.maxTokens
+	// If model has explicit maxTokens, clamp it to 20% of the context window
+	if (model.maxTokens) {
+		return Math.min(model.maxTokens, model.contextWindow * 0.2)
 	}
 
 	// For non-Anthropic formats without explicit maxTokens, return undefined
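For reference, here is a minimal standalone sketch of the clamping rule introduced in the src/shared/api.ts hunk above: an explicit maxTokens is capped at 20% of the model's context window. The clampMaxOutputTokens helper name and the trimmed-down model shape are illustrative only; in the actual codebase this logic lives inside getModelMaxOutputTokens.

// Minimal sketch (illustrative names, not part of the repo): cap an explicit
// maxTokens at 20% of the context window, otherwise leave it undefined.
interface ModelLimits {
	contextWindow: number
	maxTokens?: number
}

const clampMaxOutputTokens = ({ contextWindow, maxTokens }: ModelLimits): number | undefined =>
	maxTokens ? Math.min(maxTokens, contextWindow * 0.2) : undefined

// Mirrors the expectations in the tests above:
clampMaxOutputTokens({ contextWindow: 200_000, maxTokens: 128_000 }) // 40000 (64% of window, clamped)
clampMaxOutputTokens({ contextWindow: 100_000, maxTokens: 20_000 }) // 20000 (exactly at the 20% threshold)
clampMaxOutputTokens({ contextWindow: 200_000, maxTokens: 8_192 }) // 8192 (within the threshold, used as-is)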