From 2a793039c0a590627c71561e6e66f10b7410b382 Mon Sep 17 00:00:00 2001
From: openhands
Date: Mon, 15 Sep 2025 14:46:56 +0000
Subject: [PATCH 1/2] Add test for GPT-5-mini temperature issue with
 litellm_proxy

This test reproduces issue #265 where GPT-5-mini requires temperature=1.0
but the LLM class defaults to temperature=0.0, causing an error.

The test includes three scenarios:
1. Failing case: GPT-5-mini with default temperature=0.0 (reproduces the issue)
2. Working case: GPT-5-mini with explicit temperature=1.0
3. Working case: GPT-5-mini with temperature override in completion call

Co-authored-by: openhands
---
 .../llm/test_gpt5_mini_temperature_issue.py   | 226 ++++++++++++++++++
 1 file changed, 226 insertions(+)
 create mode 100644 tests/sdk/llm/test_gpt5_mini_temperature_issue.py

diff --git a/tests/sdk/llm/test_gpt5_mini_temperature_issue.py b/tests/sdk/llm/test_gpt5_mini_temperature_issue.py
new file mode 100644
index 0000000000..cb4f86d4fa
--- /dev/null
+++ b/tests/sdk/llm/test_gpt5_mini_temperature_issue.py
@@ -0,0 +1,226 @@
+"""Test for GPT-5-mini temperature issue with litellm_proxy.
+
+This test reproduces the issue where GPT-5-mini requires temperature to be set to 1,
+but the LLM class defaults to temperature=0.0, causing an error.
+"""
+
+from unittest.mock import patch
+
+import pytest
+from pydantic import SecretStr
+
+from openhands.sdk.llm import LLM
+
+
+@patch("openhands.sdk.llm.llm.litellm_completion")
+def test_gpt5_mini_temperature_issue_without_temperature(mock_completion):
+    """Test that GPT-5-mini fails when temperature is not explicitly set to 1.
+
+    This test reproduces the issue where GPT-5-mini requires temperature=1
+    but the LLM class defaults to temperature=0.0, causing an error.
+    """
+
+    # Mock the litellm completion to raise an error when temperature != 1
+    # This simulates the actual behavior of GPT-5-mini
+    def mock_completion_side_effect(*args, **kwargs):
+        temperature = kwargs.get("temperature", 0.0)
+        if temperature != 1.0:
+            raise ValueError(
+                "GPT-5-mini requires temperature to be set to 1.0, "
+                f"but got temperature={temperature}"
+            )
+        # If temperature is 1.0, return a mock response
+        from litellm.types.utils import Choices, Message, ModelResponse, Usage
+
+        return ModelResponse(
+            id="test-response",
+            choices=[
+                Choices(
+                    finish_reason="stop",
+                    index=0,
+                    message=Message(
+                        content="Test response from GPT-5-mini",
+                        role="assistant",
+                    ),
+                )
+            ],
+            created=1234567890,
+            model="gpt-5-mini",
+            object="chat.completion",
+            system_fingerprint="test",
+            usage=Usage(
+                prompt_tokens=10,
+                completion_tokens=5,
+                total_tokens=15,
+            ),
+        )
+
+    mock_completion.side_effect = mock_completion_side_effect
+
+    # Create LLM with litellm_proxy/openai/gpt-5-mini without specifying temperature
+    # This should use the default temperature=0.0
+    llm = LLM(
+        model="litellm_proxy/openai/gpt-5-mini",
+        api_key=SecretStr("test_key"),
+        num_retries=1,  # Reduce retries for faster test
+        retry_min_wait=1,
+        retry_max_wait=2,
+    )
+
+    # Verify that the LLM has the default temperature of 0.0
+    assert llm.temperature == 0.0
+
+    # Try to use the completion API - this should fail
+    messages = [{"role": "user", "content": "Hello, GPT-5-mini!"}]
+
+    with pytest.raises(
+        ValueError, match="GPT-5-mini requires temperature to be set to 1.0"
+    ):
+        llm.completion(messages=messages)
+
+    # Verify that the mock was called with temperature=0.0
+    mock_completion.assert_called_once()
+    call_kwargs = mock_completion.call_args[1]
+    assert call_kwargs.get("temperature") == 0.0
+
+
+@patch("openhands.sdk.llm.llm.litellm_completion")
+def test_gpt5_mini_works_with_temperature_1(mock_completion):
+    """Test that GPT-5-mini works when temperature is explicitly set to 1.
+
+    This test shows that the issue can be resolved by explicitly setting temperature=1.
+    """
+
+    # Mock the litellm completion to work when temperature == 1
+    def mock_completion_side_effect(*args, **kwargs):
+        temperature = kwargs.get("temperature", 0.0)
+        if temperature != 1.0:
+            raise ValueError(
+                "GPT-5-mini requires temperature to be set to 1.0, "
+                f"but got temperature={temperature}"
+            )
+        # If temperature is 1.0, return a mock response
+        from litellm.types.utils import Choices, Message, ModelResponse, Usage
+
+        return ModelResponse(
+            id="test-response",
+            choices=[
+                Choices(
+                    finish_reason="stop",
+                    index=0,
+                    message=Message(
+                        content="Test response from GPT-5-mini",
+                        role="assistant",
+                    ),
+                )
+            ],
+            created=1234567890,
+            model="gpt-5-mini",
+            object="chat.completion",
+            system_fingerprint="test",
+            usage=Usage(
+                prompt_tokens=10,
+                completion_tokens=5,
+                total_tokens=15,
+            ),
+        )
+
+    mock_completion.side_effect = mock_completion_side_effect
+
+    # Create LLM with litellm_proxy/openai/gpt-5-mini with temperature=1.0
+    llm = LLM(
+        model="litellm_proxy/openai/gpt-5-mini",
+        api_key=SecretStr("test_key"),
+        temperature=1.0,  # Explicitly set temperature to 1.0
+        num_retries=1,
+        retry_min_wait=1,
+        retry_max_wait=2,
+    )
+
+    # Verify that the LLM has temperature set to 1.0
+    assert llm.temperature == 1.0
+
+    # Try to use the completion API - this should work
+    messages = [{"role": "user", "content": "Hello, GPT-5-mini!"}]
+
+    response = llm.completion(messages=messages)
+
+    # Verify the response
+    assert response is not None
+    assert response.choices[0].message.content == "Test response from GPT-5-mini"  # type: ignore
+
+    # Verify that the mock was called with temperature=1.0
+    mock_completion.assert_called_once()
+    call_kwargs = mock_completion.call_args[1]
+    assert call_kwargs.get("temperature") == 1.0
+
+
+@patch("openhands.sdk.llm.llm.litellm_completion")
+def test_gpt5_mini_temperature_override_in_completion_call(mock_completion):
+    """Test that temperature can be overridden in the completion call.
+
+    This test shows that even if the LLM has a default temperature,
+    it can be overridden in the completion call.
+    """
+
+    # Mock the litellm completion to work when temperature == 1
+    def mock_completion_side_effect(*args, **kwargs):
+        temperature = kwargs.get("temperature", 0.0)
+        if temperature != 1.0:
+            raise ValueError(
+                "GPT-5-mini requires temperature to be set to 1.0, "
+                f"but got temperature={temperature}"
+            )
+        # If temperature is 1.0, return a mock response
+        from litellm.types.utils import Choices, Message, ModelResponse, Usage
+
+        return ModelResponse(
+            id="test-response",
+            choices=[
+                Choices(
+                    finish_reason="stop",
+                    index=0,
+                    message=Message(
+                        content="Test response from GPT-5-mini",
+                        role="assistant",
+                    ),
+                )
+            ],
+            created=1234567890,
+            model="gpt-5-mini",
+            object="chat.completion",
+            system_fingerprint="test",
+            usage=Usage(
+                prompt_tokens=10,
+                completion_tokens=5,
+                total_tokens=15,
+            ),
+        )
+
+    mock_completion.side_effect = mock_completion_side_effect
+
+    # Create LLM with litellm_proxy/openai/gpt-5-mini with default temperature=0.0
+    llm = LLM(
+        model="litellm_proxy/openai/gpt-5-mini",
+        api_key=SecretStr("test_key"),
+        num_retries=1,
+        retry_min_wait=1,
+        retry_max_wait=2,
+    )
+
+    # Verify that the LLM has the default temperature of 0.0
+    assert llm.temperature == 0.0
+
+    # Try to use the completion API with temperature=1.0 override - this should work
+    messages = [{"role": "user", "content": "Hello, GPT-5-mini!"}]
+
+    response = llm.completion(messages=messages, temperature=1.0)
+
+    # Verify the response
+    assert response is not None
+    assert response.choices[0].message.content == "Test response from GPT-5-mini"  # type: ignore
+
+    # Verify that the mock was called with temperature=1.0 (overridden)
+    mock_completion.assert_called_once()
+    call_kwargs = mock_completion.call_args[1]
+    assert call_kwargs.get("temperature") == 1.0

From 99908920997c83c85c2eabbd1c811d0e28236f11 Mon Sep 17 00:00:00 2001
From: openhands
Date: Mon, 15 Sep 2025 15:34:22 +0000
Subject: [PATCH 2/2] Replace mocked tests with real LLM API calls using
 LLM_API_KEY and LLM_BASE_URL

- Remove all mocking code from GPT-5-mini temperature tests
- Use environment variables LLM_API_KEY and LLM_BASE_URL for configuration
- Tests now skip gracefully when environment variables are not set
- Maintain the same test scenarios: default temperature failure, explicit
  temperature=1.0 success, and temperature override
- Add proper type ignores for response attribute access

Co-authored-by: openhands
---
 .../llm/test_gpt5_mini_temperature_issue.py   | 182 +++++------------
 1 file changed, 43 insertions(+), 139 deletions(-)

diff --git a/tests/sdk/llm/test_gpt5_mini_temperature_issue.py b/tests/sdk/llm/test_gpt5_mini_temperature_issue.py
index cb4f86d4fa..d601fcfa97 100644
--- a/tests/sdk/llm/test_gpt5_mini_temperature_issue.py
+++ b/tests/sdk/llm/test_gpt5_mini_temperature_issue.py
@@ -2,9 +2,12 @@
 
 This test reproduces the issue where GPT-5-mini requires temperature to be set to 1,
 but the LLM class defaults to temperature=0.0, causing an error.
+
+These tests use real LLM API calls and require LLM_API_KEY and LLM_BASE_URL
+environment variables to be set.
 """
 
-from unittest.mock import patch
+import os
 
 import pytest
 from pydantic import SecretStr
@@ -12,56 +15,34 @@
 from openhands.sdk.llm import LLM
 
 
-@patch("openhands.sdk.llm.llm.litellm_completion")
-def test_gpt5_mini_temperature_issue_without_temperature(mock_completion):
+def get_llm_config():
+    """Get LLM configuration from environment variables."""
+    api_key = os.getenv("LLM_API_KEY")
+    base_url = os.getenv("LLM_BASE_URL")
+
+    if not api_key or not base_url:
+        pytest.skip(
+            "LLM_API_KEY and LLM_BASE_URL environment variables must be set "
+            "to run real LLM tests"
+        )
+
+    return api_key, base_url
+
+
+def test_gpt5_mini_temperature_issue_without_temperature():
     """Test that GPT-5-mini fails when temperature is not explicitly set to 1.
 
     This test reproduces the issue where GPT-5-mini requires temperature=1
     but the LLM class defaults to temperature=0.0, causing an error.
     """
-
-    # Mock the litellm completion to raise an error when temperature != 1
-    # This simulates the actual behavior of GPT-5-mini
-    def mock_completion_side_effect(*args, **kwargs):
-        temperature = kwargs.get("temperature", 0.0)
-        if temperature != 1.0:
-            raise ValueError(
-                "GPT-5-mini requires temperature to be set to 1.0, "
-                f"but got temperature={temperature}"
-            )
-        # If temperature is 1.0, return a mock response
-        from litellm.types.utils import Choices, Message, ModelResponse, Usage
-
-        return ModelResponse(
-            id="test-response",
-            choices=[
-                Choices(
-                    finish_reason="stop",
-                    index=0,
-                    message=Message(
-                        content="Test response from GPT-5-mini",
-                        role="assistant",
-                    ),
-                )
-            ],
-            created=1234567890,
-            model="gpt-5-mini",
-            object="chat.completion",
-            system_fingerprint="test",
-            usage=Usage(
-                prompt_tokens=10,
-                completion_tokens=5,
-                total_tokens=15,
-            ),
-        )
-
-    mock_completion.side_effect = mock_completion_side_effect
+    api_key, base_url = get_llm_config()
 
     # Create LLM with litellm_proxy/openai/gpt-5-mini without specifying temperature
     # This should use the default temperature=0.0
     llm = LLM(
         model="litellm_proxy/openai/gpt-5-mini",
-        api_key=SecretStr("test_key"),
+        api_key=SecretStr(api_key),
+        base_url=base_url,
         num_retries=1,  # Reduce retries for faster test
         retry_min_wait=1,
         retry_max_wait=2,
@@ -70,67 +51,31 @@ def mock_completion_side_effect(*args, **kwargs):
     # Verify that the LLM has the default temperature of 0.0
     assert llm.temperature == 0.0
 
-    # Try to use the completion API - this should fail
+    # Try to use the completion API - this should fail with GPT-5-mini
     messages = [{"role": "user", "content": "Hello, GPT-5-mini!"}]
 
-    with pytest.raises(
-        ValueError, match="GPT-5-mini requires temperature to be set to 1.0"
-    ):
+    # The exact error message may vary depending on the LLM proxy implementation
+    # but it should fail when temperature is not 1.0 for GPT-5-mini
+    with pytest.raises(Exception) as exc_info:
         llm.completion(messages=messages)
 
-    # Verify that the mock was called with temperature=0.0
-    mock_completion.assert_called_once()
-    call_kwargs = mock_completion.call_args[1]
-    assert call_kwargs.get("temperature") == 0.0
+    # Check that the error is related to temperature requirements
+    error_message = str(exc_info.value).lower()
+    assert "temperature" in error_message or "parameter" in error_message
 
 
-@patch("openhands.sdk.llm.llm.litellm_completion")
-def test_gpt5_mini_works_with_temperature_1(mock_completion):
+def test_gpt5_mini_works_with_temperature_1():
     """Test that GPT-5-mini works when temperature is explicitly set to 1.
 
     This test shows that the issue can be resolved by explicitly setting temperature=1.
     """
-
-    # Mock the litellm completion to work when temperature == 1
-    def mock_completion_side_effect(*args, **kwargs):
-        temperature = kwargs.get("temperature", 0.0)
-        if temperature != 1.0:
-            raise ValueError(
-                "GPT-5-mini requires temperature to be set to 1.0, "
-                f"but got temperature={temperature}"
-            )
-        # If temperature is 1.0, return a mock response
-        from litellm.types.utils import Choices, Message, ModelResponse, Usage
-
-        return ModelResponse(
-            id="test-response",
-            choices=[
-                Choices(
-                    finish_reason="stop",
-                    index=0,
-                    message=Message(
-                        content="Test response from GPT-5-mini",
-                        role="assistant",
-                    ),
-                )
-            ],
-            created=1234567890,
-            model="gpt-5-mini",
-            object="chat.completion",
-            system_fingerprint="test",
-            usage=Usage(
-                prompt_tokens=10,
-                completion_tokens=5,
-                total_tokens=15,
-            ),
-        )
-
-    mock_completion.side_effect = mock_completion_side_effect
+    api_key, base_url = get_llm_config()
 
     # Create LLM with litellm_proxy/openai/gpt-5-mini with temperature=1.0
     llm = LLM(
         model="litellm_proxy/openai/gpt-5-mini",
-        api_key=SecretStr("test_key"),
+        api_key=SecretStr(api_key),
+        base_url=base_url,
         temperature=1.0,  # Explicitly set temperature to 1.0
         num_retries=1,
         retry_min_wait=1,
@@ -147,62 +92,24 @@ def mock_completion_side_effect(*args, **kwargs):
 
     # Verify the response
     assert response is not None
-    assert response.choices[0].message.content == "Test response from GPT-5-mini"  # type: ignore
-
-    # Verify that the mock was called with temperature=1.0
-    mock_completion.assert_called_once()
-    call_kwargs = mock_completion.call_args[1]
-    assert call_kwargs.get("temperature") == 1.0
+    assert len(response.choices) > 0
+    assert response.choices[0].message.content is not None  # type: ignore
+    assert len(response.choices[0].message.content.strip()) > 0  # type: ignore
 
 
-@patch("openhands.sdk.llm.llm.litellm_completion")
-def test_gpt5_mini_temperature_override_in_completion_call(mock_completion):
+def test_gpt5_mini_temperature_override_in_completion_call():
     """Test that temperature can be overridden in the completion call.
 
     This test shows that even if the LLM has a default temperature,
     it can be overridden in the completion call.
     """
-
-    # Mock the litellm completion to work when temperature == 1
-    def mock_completion_side_effect(*args, **kwargs):
-        temperature = kwargs.get("temperature", 0.0)
-        if temperature != 1.0:
-            raise ValueError(
-                "GPT-5-mini requires temperature to be set to 1.0, "
-                f"but got temperature={temperature}"
-            )
-        # If temperature is 1.0, return a mock response
-        from litellm.types.utils import Choices, Message, ModelResponse, Usage
-
-        return ModelResponse(
-            id="test-response",
-            choices=[
-                Choices(
-                    finish_reason="stop",
-                    index=0,
-                    message=Message(
-                        content="Test response from GPT-5-mini",
-                        role="assistant",
-                    ),
-                )
-            ],
-            created=1234567890,
-            model="gpt-5-mini",
-            object="chat.completion",
-            system_fingerprint="test",
-            usage=Usage(
-                prompt_tokens=10,
-                completion_tokens=5,
-                total_tokens=15,
-            ),
-        )
-
-    mock_completion.side_effect = mock_completion_side_effect
+    api_key, base_url = get_llm_config()
 
     # Create LLM with litellm_proxy/openai/gpt-5-mini with default temperature=0.0
     llm = LLM(
         model="litellm_proxy/openai/gpt-5-mini",
-        api_key=SecretStr("test_key"),
+        api_key=SecretStr(api_key),
+        base_url=base_url,
         num_retries=1,
         retry_min_wait=1,
         retry_max_wait=2,
@@ -218,9 +125,6 @@ def mock_completion_side_effect(*args, **kwargs):
 
     # Verify the response
     assert response is not None
-    assert response.choices[0].message.content == "Test response from GPT-5-mini"  # type: ignore
-
-    # Verify that the mock was called with temperature=1.0 (overridden)
-    mock_completion.assert_called_once()
-    call_kwargs = mock_completion.call_args[1]
-    assert call_kwargs.get("temperature") == 1.0
+    assert len(response.choices) > 0
+    assert response.choices[0].message.content is not None  # type: ignore
+    assert len(response.choices[0].message.content.strip()) > 0  # type: ignore