From b90cdcd7d4d6180f2313a22c233c09adc527152c Mon Sep 17 00:00:00 2001 From: Seb Duerr Date: Tue, 2 Dec 2025 16:00:39 -0800 Subject: [PATCH 1/4] fix(cerebras): use conservative max_tokens and add integration header --- .changeset/cerebras-conservative-max-tokens.md | 13 +++++++++++++ src/api/providers/cerebras.ts | 16 +++++++++++++++- 2 files changed, 28 insertions(+), 1 deletion(-) create mode 100644 .changeset/cerebras-conservative-max-tokens.md diff --git a/.changeset/cerebras-conservative-max-tokens.md b/.changeset/cerebras-conservative-max-tokens.md new file mode 100644 index 00000000000..1447e7c62cd --- /dev/null +++ b/.changeset/cerebras-conservative-max-tokens.md @@ -0,0 +1,13 @@ +--- +"roo-cline": patch +--- + +fix(cerebras): use conservative max_tokens and add integration header + +**Conservative max_tokens:** +Cerebras rate limiter estimates token consumption using max_completion_tokens upfront rather than actual usage. When agentic tools automatically set this to the model maximum (e.g., 64K), users exhaust their quota prematurely and get rate-limited despite minimal actual token consumption. + +This fix uses a conservative default of 8K tokens instead of the model maximum. This is sufficient for most agentic tool use while preserving rate limit headroom. + +**Integration header:** +Added `X-Cerebras-3rd-Party-Integration: roocode` header to all Cerebras API requests for tracking and analytics. diff --git a/src/api/providers/cerebras.ts b/src/api/providers/cerebras.ts index 398e32f4901..624402e7daa 100644 --- a/src/api/providers/cerebras.ts +++ b/src/api/providers/cerebras.ts @@ -16,6 +16,16 @@ import { t } from "../../i18n" const CEREBRAS_BASE_URL = "https://api.cerebras.ai/v1" const CEREBRAS_DEFAULT_TEMPERATURE = 0 +/** + * Conservative max_tokens for Cerebras to avoid premature rate limiting. 
+ * Cerebras rate limiter estimates token consumption using max_completion_tokens upfront, + * so requesting the model maximum (e.g., 64K) reserves that quota even if actual usage is low. + * 8K is sufficient for most agentic tool use while preserving rate limit headroom. + */ +const CEREBRAS_DEFAULT_MAX_TOKENS = 8_192 +const CEREBRAS_INTEGRATION_HEADER = "X-Cerebras-3rd-Party-Integration" +const CEREBRAS_INTEGRATION_NAME = "roocode" + export class CerebrasHandler extends BaseProvider implements SingleCompletionHandler { private apiKey: string private providerModels: typeof cerebrasModels @@ -105,12 +115,14 @@ export class CerebrasHandler extends BaseProvider implements SingleCompletionHan const openaiMessages = convertToOpenAiMessages(messages) // Prepare request body following Cerebras API specification exactly + // Use conservative default to avoid premature rate limiting (Cerebras reserves quota upfront) + const effectiveMaxTokens = Math.min(max_tokens || CEREBRAS_DEFAULT_MAX_TOKENS, CEREBRAS_DEFAULT_MAX_TOKENS) const requestBody: Record = { model, messages: [{ role: "system", content: systemPrompt }, ...openaiMessages], stream: true, // Use max_completion_tokens (Cerebras-specific parameter) - ...(max_tokens && max_tokens > 0 && max_tokens <= 32768 ? { max_completion_tokens: max_tokens } : {}), + ...(effectiveMaxTokens > 0 ? { max_completion_tokens: effectiveMaxTokens } : {}), // Clamp temperature to Cerebras range (0 to 1.5) ...(temperature !== undefined && temperature !== CEREBRAS_DEFAULT_TEMPERATURE ? 
{ @@ -130,6 +142,7 @@ export class CerebrasHandler extends BaseProvider implements SingleCompletionHan ...DEFAULT_HEADERS, "Content-Type": "application/json", Authorization: `Bearer ${this.apiKey}`, + [CEREBRAS_INTEGRATION_HEADER]: CEREBRAS_INTEGRATION_NAME, }, body: JSON.stringify(requestBody), }) @@ -291,6 +304,7 @@ export class CerebrasHandler extends BaseProvider implements SingleCompletionHan ...DEFAULT_HEADERS, "Content-Type": "application/json", Authorization: `Bearer ${this.apiKey}`, + [CEREBRAS_INTEGRATION_HEADER]: CEREBRAS_INTEGRATION_NAME, }, body: JSON.stringify(requestBody), }) From 8858b0a68c46911ab6b19bb9631ef2f9dc43f54a Mon Sep 17 00:00:00 2001 From: Seb Duerr Date: Wed, 3 Dec 2025 18:27:19 -0800 Subject: [PATCH 2/4] fix: update cerebras provider implementation to match model changes --- src/api/providers/cerebras.ts | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/src/api/providers/cerebras.ts b/src/api/providers/cerebras.ts index 624402e7daa..905a8681b57 100644 --- a/src/api/providers/cerebras.ts +++ b/src/api/providers/cerebras.ts @@ -46,11 +46,12 @@ export class CerebrasHandler extends BaseProvider implements SingleCompletionHan } getModel(): { id: CerebrasModelId; info: (typeof cerebrasModels)[CerebrasModelId] } { - const modelId = (this.options.apiModelId as CerebrasModelId) || this.defaultProviderModelId + const modelId = this.options.apiModelId as CerebrasModelId + const validModelId = modelId && this.providerModels[modelId] ? modelId : this.defaultProviderModelId return { - id: modelId, - info: this.providerModels[modelId], + id: validModelId, + info: this.providerModels[validModelId], } } From c5dc8bfa9738905bd81ed56614093041138c8be9 Mon Sep 17 00:00:00 2001 From: Seb Duerr Date: Wed, 3 Dec 2025 21:12:13 -0800 Subject: [PATCH 3/4] refactor: move conservative maxTokens to types file per review feedback Instead of clamping max_tokens in the provider, set maxTokens to 8192 directly in the types file (packages/types/src/providers/cerebras.ts). 
This is cleaner and keeps the rate-limiting logic in one place. --- packages/types/src/providers/cerebras.ts | 10 +++++----- src/api/providers/cerebras.ts | 11 +---------- 2 files changed, 6 insertions(+), 15 deletions(-) diff --git a/packages/types/src/providers/cerebras.ts b/packages/types/src/providers/cerebras.ts index be705744111..1ac8f637040 100644 --- a/packages/types/src/providers/cerebras.ts +++ b/packages/types/src/providers/cerebras.ts @@ -7,7 +7,7 @@ export const cerebrasDefaultModelId: CerebrasModelId = "gpt-oss-120b" export const cerebrasModels = { "zai-glm-4.6": { - maxTokens: 16384, // consistent with their other models + maxTokens: 8192, // Conservative default to avoid premature rate limiting (Cerebras reserves quota upfront) contextWindow: 131072, supportsImages: false, supportsPromptCache: false, @@ -17,7 +17,7 @@ export const cerebrasModels = { description: "Highly intelligent general purpose model with up to 1,000 tokens/s", }, "qwen-3-235b-a22b-instruct-2507": { - maxTokens: 64000, + maxTokens: 8192, // Conservative default to avoid premature rate limiting contextWindow: 64000, supportsImages: false, supportsPromptCache: false, @@ -27,7 +27,7 @@ export const cerebrasModels = { description: "Intelligent model with ~1400 tokens/s", }, "llama-3.3-70b": { - maxTokens: 64000, + maxTokens: 8192, // Conservative default to avoid premature rate limiting contextWindow: 64000, supportsImages: false, supportsPromptCache: false, @@ -37,7 +37,7 @@ export const cerebrasModels = { description: "Powerful model with ~2600 tokens/s", }, "qwen-3-32b": { - maxTokens: 64000, + maxTokens: 8192, // Conservative default to avoid premature rate limiting contextWindow: 64000, supportsImages: false, supportsPromptCache: false, @@ -47,7 +47,7 @@ export const cerebrasModels = { description: "SOTA coding performance with ~2500 tokens/s", }, "gpt-oss-120b": { - maxTokens: 8000, + maxTokens: 8192, // Conservative default to avoid premature rate limiting contextWindow: 
64000, supportsImages: false, supportsPromptCache: false, diff --git a/src/api/providers/cerebras.ts b/src/api/providers/cerebras.ts index 905a8681b57..99e7c4cc3d4 100644 --- a/src/api/providers/cerebras.ts +++ b/src/api/providers/cerebras.ts @@ -16,13 +16,6 @@ import { t } from "../../i18n" const CEREBRAS_BASE_URL = "https://api.cerebras.ai/v1" const CEREBRAS_DEFAULT_TEMPERATURE = 0 -/** - * Conservative max_tokens for Cerebras to avoid premature rate limiting. - * Cerebras rate limiter estimates token consumption using max_completion_tokens upfront, - * so requesting the model maximum (e.g., 64K) reserves that quota even if actual usage is low. - * 8K is sufficient for most agentic tool use while preserving rate limit headroom. - */ -const CEREBRAS_DEFAULT_MAX_TOKENS = 8_192 const CEREBRAS_INTEGRATION_HEADER = "X-Cerebras-3rd-Party-Integration" const CEREBRAS_INTEGRATION_NAME = "roocode" @@ -116,14 +109,12 @@ export class CerebrasHandler extends BaseProvider implements SingleCompletionHan const openaiMessages = convertToOpenAiMessages(messages) // Prepare request body following Cerebras API specification exactly - // Use conservative default to avoid premature rate limiting (Cerebras reserves quota upfront) - const effectiveMaxTokens = Math.min(max_tokens || CEREBRAS_DEFAULT_MAX_TOKENS, CEREBRAS_DEFAULT_MAX_TOKENS) const requestBody: Record = { model, messages: [{ role: "system", content: systemPrompt }, ...openaiMessages], stream: true, // Use max_completion_tokens (Cerebras-specific parameter) - ...(effectiveMaxTokens > 0 ? { max_completion_tokens: effectiveMaxTokens } : {}), + ...(max_tokens && max_tokens > 0 && max_tokens <= 32768 ? { max_completion_tokens: max_tokens } : {}), // Clamp temperature to Cerebras range (0 to 1.5) ...(temperature !== undefined && temperature !== CEREBRAS_DEFAULT_TEMPERATURE ? 
{ From 680a84f8b8f11f2bfefad80e1689a2844d249d73 Mon Sep 17 00:00:00 2001 From: Matt Rubens Date: Thu, 4 Dec 2025 00:42:54 -0500 Subject: [PATCH 4/4] Delete .changeset/cerebras-conservative-max-tokens.md --- .changeset/cerebras-conservative-max-tokens.md | 13 ------------- 1 file changed, 13 deletions(-) delete mode 100644 .changeset/cerebras-conservative-max-tokens.md diff --git a/.changeset/cerebras-conservative-max-tokens.md b/.changeset/cerebras-conservative-max-tokens.md deleted file mode 100644 index 1447e7c62cd..00000000000 --- a/.changeset/cerebras-conservative-max-tokens.md +++ /dev/null @@ -1,13 +0,0 @@ ---- -"roo-cline": patch ---- - -fix(cerebras): use conservative max_tokens and add integration header - -**Conservative max_tokens:** -Cerebras rate limiter estimates token consumption using max_completion_tokens upfront rather than actual usage. When agentic tools automatically set this to the model maximum (e.g., 64K), users exhaust their quota prematurely and get rate-limited despite minimal actual token consumption. - -This fix uses a conservative default of 8K tokens instead of the model maximum. This is sufficient for most agentic tool use while preserving rate limit headroom. - -**Integration header:** -Added `X-Cerebras-3rd-Party-Integration: roocode` header to all Cerebras API requests for tracking and analytics.