
Commit e6c7e55

sjmonson and tlrmchlsmth committed
Configurable max_tokens/max_completion_tokens key (#399)
Makes the `max_tokens` request key configurable through an environment variable per endpoint type. Defaults to `max_tokens` for legacy `completions` and `max_completion_tokens` for `chat/completions`.

Changes:

- Add the `GUIDELLM__OPENAI__MAX_OUTPUT_KEY` config option, a dict mapping from route name to output tokens key. The default is `{"text_completions": "max_tokens", "chat_completions": "max_completion_tokens"}`.

Related issues:

- Closes #395
- Closes #269
- Related to #210

---

- [x] "I certify that all code in this PR is my own, except as noted below."
- [ ] Includes AI-assisted code completion
- [ ] Includes code generated by an AI application
- [ ] Includes AI-generated tests (NOTE: AI-written tests should have a docstring that includes `## WRITTEN BY AI ##`)

Signed-off-by: Tyler Michael Smith <tyler@neuralmagic.com>
Signed-off-by: Samuel Monson <smonson@redhat.com>
Co-authored-by: Tyler Michael Smith <tyler@neuralmagic.com>
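
For context, a minimal usage sketch of the new option; the JSON string form is an assumption based on how pydantic-settings parses dict-valued fields from environment variables, not something stated in this PR:

```python
# Hypothetical override: use "max_tokens" for both endpoint types.
# Assumes dict-valued settings are parsed from a JSON string, as
# pydantic-settings does for complex field types.
import os

os.environ["GUIDELLM__OPENAI__MAX_OUTPUT_KEY"] = (
    '{"text_completions": "max_tokens", "chat_completions": "max_tokens"}'
)
# Set this before guidellm.settings is imported so the override takes effect.
```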
1 parent: 23f8186 · commit: e6c7e55

File tree

2 files changed: +8 −5 lines changed


src/guidellm/backends/openai.py

Lines changed: 4 additions & 5 deletions

@@ -30,6 +30,7 @@
     GenerationResponse,
 )
 from guidellm.scheduler import HistoryT, ScheduledRequestInfo
+from guidellm.settings import settings
 
 __all__ = ["OpenAIHTTPBackend", "UsageStats"]
 
@@ -628,12 +629,10 @@ def _get_body(
         # Handle token limits
         max_tokens = max_output_tokens or self.max_output_tokens
         if max_tokens is not None:
-            body.update(
-                {
-                    "max_tokens": max_tokens,
-                    "max_completion_tokens": max_tokens,
-                }
+            max_output_key = settings.openai.max_output_key.get(
+                endpoint_type, "max_tokens"
             )
+            body[max_output_key] = max_output_tokens
         # Set stop conditions only for request-level limits
         if max_output_tokens:
             body.update({"stop": None, "ignore_eos": True})

src/guidellm/settings.py

Lines changed: 4 additions & 0 deletions

@@ -89,6 +89,10 @@ class OpenAISettings(BaseModel):
     base_url: str = "http://localhost:8000"
     max_output_tokens: int = 16384
     verify: bool = True
+    max_output_key: dict[str, str] = {
+        "text_completions": "max_tokens",
+        "chat_completions": "max_completion_tokens",
+    }
 
 
 class ReportGenerationSettings(BaseModel):
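
As a standalone illustration of how a dict-valued setting like this behaves, the sketch below uses pydantic-settings directly; the model and prefix names are invented for the demo and are not guidellm's actual wiring:

```python
# Demo model (illustrative, not guidellm's settings): pydantic-settings
# parses dict-valued fields from JSON strings in environment variables.
import os

from pydantic_settings import BaseSettings, SettingsConfigDict


class DemoOpenAISettings(BaseSettings):
    model_config = SettingsConfigDict(env_prefix="DEMO_OPENAI_")

    max_output_key: dict[str, str] = {
        "text_completions": "max_tokens",
        "chat_completions": "max_completion_tokens",
    }


os.environ["DEMO_OPENAI_MAX_OUTPUT_KEY"] = '{"chat_completions": "max_tokens"}'
# The env value replaces the whole dict rather than merging into the default.
print(DemoOpenAISettings().max_output_key)  # {'chat_completions': 'max_tokens'}
```

Note that an override replaces the entire mapping, so any default entries still needed must be restated in the new value.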
