From 2a793039c0a590627c71561e6e66f10b7410b382 Mon Sep 17 00:00:00 2001
From: openhands
Date: Mon, 15 Sep 2025 14:46:56 +0000
Subject: [PATCH 1/2] Add test for GPT-5-mini temperature issue with
 litellm_proxy

This test reproduces issue #265 where GPT-5-mini requires temperature=1.0
but the LLM class defaults to temperature=0.0, causing an error.

The test includes three scenarios:
1. Failing case: GPT-5-mini with default temperature=0.0 (reproduces the issue)
2. Working case: GPT-5-mini with explicit temperature=1.0
3. Working case: GPT-5-mini with temperature override in completion call

Co-authored-by: openhands
---
 .../llm/test_gpt5_mini_temperature_issue.py   | 226 ++++++++++++++++++
 1 file changed, 226 insertions(+)
 create mode 100644 tests/sdk/llm/test_gpt5_mini_temperature_issue.py

diff --git a/tests/sdk/llm/test_gpt5_mini_temperature_issue.py b/tests/sdk/llm/test_gpt5_mini_temperature_issue.py
new file mode 100644
index 0000000000..cb4f86d4fa
--- /dev/null
+++ b/tests/sdk/llm/test_gpt5_mini_temperature_issue.py
@@ -0,0 +1,226 @@
+"""Test for GPT-5-mini temperature issue with litellm_proxy.
+
+This test reproduces the issue where GPT-5-mini requires temperature to be set to 1,
+but the LLM class defaults to temperature=0.0, causing an error.
+"""
+
+from unittest.mock import patch
+
+import pytest
+from pydantic import SecretStr
+
+from openhands.sdk.llm import LLM
+
+
+@patch("openhands.sdk.llm.llm.litellm_completion")
+def test_gpt5_mini_temperature_issue_without_temperature(mock_completion):
+    """Test that GPT-5-mini fails when temperature is not explicitly set to 1.
+
+    This test reproduces the issue where GPT-5-mini requires temperature=1
+    but the LLM class defaults to temperature=0.0, causing an error.
+    """
+
+    # Mock the litellm completion to raise an error when temperature != 1
+    # This simulates the actual behavior of GPT-5-mini
+    def mock_completion_side_effect(*args, **kwargs):
+        temperature = kwargs.get("temperature", 0.0)
+        if temperature != 1.0:
+            raise ValueError(
+                "GPT-5-mini requires temperature to be set to 1.0, "
+                f"but got temperature={temperature}"
+            )
+        # If temperature is 1.0, return a mock response
+        from litellm.types.utils import Choices, Message, ModelResponse, Usage
+
+        return ModelResponse(
+            id="test-response",
+            choices=[
+                Choices(
+                    finish_reason="stop",
+                    index=0,
+                    message=Message(
+                        content="Test response from GPT-5-mini",
+                        role="assistant",
+                    ),
+                )
+            ],
+            created=1234567890,
+            model="gpt-5-mini",
+            object="chat.completion",
+            system_fingerprint="test",
+            usage=Usage(
+                prompt_tokens=10,
+                completion_tokens=5,
+                total_tokens=15,
+            ),
+        )
+
+    mock_completion.side_effect = mock_completion_side_effect
+
+    # Create LLM with litellm_proxy/openai/gpt-5-mini without specifying temperature
+    # This should use the default temperature=0.0
+    llm = LLM(
+        model="litellm_proxy/openai/gpt-5-mini",
+        api_key=SecretStr("test_key"),
+        num_retries=1,  # Reduce retries for faster test
+        retry_min_wait=1,
+        retry_max_wait=2,
+    )
+
+    # Verify that the LLM has the default temperature of 0.0
+    assert llm.temperature == 0.0
+
+    # Try to use the completion API - this should fail
+    messages = [{"role": "user", "content": "Hello, GPT-5-mini!"}]
+
+    with pytest.raises(
+        ValueError, match="GPT-5-mini requires temperature to be set to 1.0"
+    ):
+        llm.completion(messages=messages)
+
+    # Verify that the mock was called with temperature=0.0
+    mock_completion.assert_called_once()
+    call_kwargs = mock_completion.call_args[1]
+    assert call_kwargs.get("temperature") == 0.0
+
+
+@patch("openhands.sdk.llm.llm.litellm_completion")
+def test_gpt5_mini_works_with_temperature_1(mock_completion):
+    """Test that GPT-5-mini works when temperature is explicitly set to 1.
+
+    This test shows that the issue can be resolved by explicitly setting temperature=1.
+    """
+
+    # Mock the litellm completion to work when temperature == 1
+    def mock_completion_side_effect(*args, **kwargs):
+        temperature = kwargs.get("temperature", 0.0)
+        if temperature != 1.0:
+            raise ValueError(
+                "GPT-5-mini requires temperature to be set to 1.0, "
+                f"but got temperature={temperature}"
+            )
+        # If temperature is 1.0, return a mock response
+        from litellm.types.utils import Choices, Message, ModelResponse, Usage
+
+        return ModelResponse(
+            id="test-response",
+            choices=[
+                Choices(
+                    finish_reason="stop",
+                    index=0,
+                    message=Message(
+                        content="Test response from GPT-5-mini",
+                        role="assistant",
+                    ),
+                )
+            ],
+            created=1234567890,
+            model="gpt-5-mini",
+            object="chat.completion",
+            system_fingerprint="test",
+            usage=Usage(
+                prompt_tokens=10,
+                completion_tokens=5,
+                total_tokens=15,
+            ),
+        )
+
+    mock_completion.side_effect = mock_completion_side_effect
+
+    # Create LLM with litellm_proxy/openai/gpt-5-mini with temperature=1.0
+    llm = LLM(
+        model="litellm_proxy/openai/gpt-5-mini",
+        api_key=SecretStr("test_key"),
+        temperature=1.0,  # Explicitly set temperature to 1.0
+        num_retries=1,
+        retry_min_wait=1,
+        retry_max_wait=2,
+    )
+
+    # Verify that the LLM has temperature set to 1.0
+    assert llm.temperature == 1.0
+
+    # Try to use the completion API - this should work
+    messages = [{"role": "user", "content": "Hello, GPT-5-mini!"}]
+
+    response = llm.completion(messages=messages)
+
+    # Verify the response
+    assert response is not None
+    assert response.choices[0].message.content == "Test response from GPT-5-mini"  # type: ignore
+
+    # Verify that the mock was called with temperature=1.0
+    mock_completion.assert_called_once()
+    call_kwargs = mock_completion.call_args[1]
+    assert call_kwargs.get("temperature") == 1.0
+
+
+@patch("openhands.sdk.llm.llm.litellm_completion")
+def test_gpt5_mini_temperature_override_in_completion_call(mock_completion):
+    """Test that temperature can be overridden in the completion call.
+
+    This test shows that even if the LLM has a default temperature,
+    it can be overridden in the completion call.
+    """
+
+    # Mock the litellm completion to work when temperature == 1
+    def mock_completion_side_effect(*args, **kwargs):
+        temperature = kwargs.get("temperature", 0.0)
+        if temperature != 1.0:
+            raise ValueError(
+                "GPT-5-mini requires temperature to be set to 1.0, "
+                f"but got temperature={temperature}"
+            )
+        # If temperature is 1.0, return a mock response
+        from litellm.types.utils import Choices, Message, ModelResponse, Usage
+
+        return ModelResponse(
+            id="test-response",
+            choices=[
+                Choices(
+                    finish_reason="stop",
+                    index=0,
+                    message=Message(
+                        content="Test response from GPT-5-mini",
+                        role="assistant",
+                    ),
+                )
+            ],
+            created=1234567890,
+            model="gpt-5-mini",
+            object="chat.completion",
+            system_fingerprint="test",
+            usage=Usage(
+                prompt_tokens=10,
+                completion_tokens=5,
+                total_tokens=15,
+            ),
+        )
+
+    mock_completion.side_effect = mock_completion_side_effect
+
+    # Create LLM with litellm_proxy/openai/gpt-5-mini with default temperature=0.0
+    llm = LLM(
+        model="litellm_proxy/openai/gpt-5-mini",
+        api_key=SecretStr("test_key"),
+        num_retries=1,
+        retry_min_wait=1,
+        retry_max_wait=2,
+    )
+
+    # Verify that the LLM has the default temperature of 0.0
+    assert llm.temperature == 0.0
+
+    # Try to use the completion API with temperature=1.0 override - this should work
+    messages = [{"role": "user", "content": "Hello, GPT-5-mini!"}]
+
+    response = llm.completion(messages=messages, temperature=1.0)
+
+    # Verify the response
+    assert response is not None
+    assert response.choices[0].message.content == "Test response from GPT-5-mini"  # type: ignore
+
+    # Verify that the mock was called with temperature=1.0 (overridden)
+    mock_completion.assert_called_once()
+    call_kwargs = mock_completion.call_args[1]
+    assert call_kwargs.get("temperature") == 1.0

From 99908920997c83c85c2eabbd1c811d0e28236f11 Mon Sep 17 00:00:00 2001
From: openhands
Date: Mon, 15 Sep 2025 15:34:22 +0000
Subject: [PATCH 2/2] Replace mocked tests with real LLM API calls using
 LLM_API_KEY and LLM_BASE_URL

- Remove all mocking code from GPT-5-mini temperature tests
- Use environment variables LLM_API_KEY and LLM_BASE_URL for configuration
- Tests now skip gracefully when environment variables are not set
- Maintain the same test scenarios: default temperature failure, explicit
  temperature=1.0 success, and temperature override
- Add proper type ignores for response attribute access

Co-authored-by: openhands
---
 .../llm/test_gpt5_mini_temperature_issue.py   | 182 +++++------------
 1 file changed, 43 insertions(+), 139 deletions(-)

diff --git a/tests/sdk/llm/test_gpt5_mini_temperature_issue.py b/tests/sdk/llm/test_gpt5_mini_temperature_issue.py
index cb4f86d4fa..d601fcfa97 100644
--- a/tests/sdk/llm/test_gpt5_mini_temperature_issue.py
+++ b/tests/sdk/llm/test_gpt5_mini_temperature_issue.py
@@ -2,9 +2,12 @@
 
 This test reproduces the issue where GPT-5-mini requires temperature to be set to 1,
 but the LLM class defaults to temperature=0.0, causing an error.
+
+These tests use real LLM API calls and require LLM_API_KEY and LLM_BASE_URL
+environment variables to be set.
 """
 
-from unittest.mock import patch
+import os
 
 import pytest
 from pydantic import SecretStr
@@ -12,56 +15,34 @@
 from openhands.sdk.llm import LLM
 
 
-@patch("openhands.sdk.llm.llm.litellm_completion")
-def test_gpt5_mini_temperature_issue_without_temperature(mock_completion):
+def get_llm_config():
+    """Get LLM configuration from environment variables."""
+    api_key = os.getenv("LLM_API_KEY")
+    base_url = os.getenv("LLM_BASE_URL")
+
+    if not api_key or not base_url:
+        pytest.skip(
+            "LLM_API_KEY and LLM_BASE_URL environment variables must be set "
+            "to run real LLM tests"
+        )
+
+    return api_key, base_url
+
+
+def test_gpt5_mini_temperature_issue_without_temperature():
     """Test that GPT-5-mini fails when temperature is not explicitly set to 1.
 
     This test reproduces the issue where GPT-5-mini requires temperature=1
     but the LLM class defaults to temperature=0.0, causing an error.
     """
-
-    # Mock the litellm completion to raise an error when temperature != 1
-    # This simulates the actual behavior of GPT-5-mini
-    def mock_completion_side_effect(*args, **kwargs):
-        temperature = kwargs.get("temperature", 0.0)
-        if temperature != 1.0:
-            raise ValueError(
-                "GPT-5-mini requires temperature to be set to 1.0, "
-                f"but got temperature={temperature}"
-            )
-        # If temperature is 1.0, return a mock response
-        from litellm.types.utils import Choices, Message, ModelResponse, Usage
-
-        return ModelResponse(
-            id="test-response",
-            choices=[
-                Choices(
-                    finish_reason="stop",
-                    index=0,
-                    message=Message(
-                        content="Test response from GPT-5-mini",
-                        role="assistant",
-                    ),
-                )
-            ],
-            created=1234567890,
-            model="gpt-5-mini",
-            object="chat.completion",
-            system_fingerprint="test",
-            usage=Usage(
-                prompt_tokens=10,
-                completion_tokens=5,
-                total_tokens=15,
-            ),
-        )
-
-    mock_completion.side_effect = mock_completion_side_effect
+    api_key, base_url = get_llm_config()
 
     # Create LLM with litellm_proxy/openai/gpt-5-mini without specifying temperature
     # This should use the default temperature=0.0
     llm = LLM(
         model="litellm_proxy/openai/gpt-5-mini",
-        api_key=SecretStr("test_key"),
+        api_key=SecretStr(api_key),
+        base_url=base_url,
         num_retries=1,  # Reduce retries for faster test
         retry_min_wait=1,
         retry_max_wait=2,
@@ -70,67 +51,31 @@ def mock_completion_side_effect(*args, **kwargs):
     # Verify that the LLM has the default temperature of 0.0
     assert llm.temperature == 0.0
 
-    # Try to use the completion API - this should fail
+    # Try to use the completion API - this should fail with GPT-5-mini
     messages = [{"role": "user", "content": "Hello, GPT-5-mini!"}]
 
-    with pytest.raises(
-        ValueError, match="GPT-5-mini requires temperature to be set to 1.0"
-    ):
+    # The exact error message may vary depending on the LLM proxy implementation
+    # but it should fail when temperature is not 1.0 for GPT-5-mini
+    with pytest.raises(Exception) as exc_info:
         llm.completion(messages=messages)
 
-    # Verify that the mock was called with temperature=0.0
-    mock_completion.assert_called_once()
-    call_kwargs = mock_completion.call_args[1]
-    assert call_kwargs.get("temperature") == 0.0
+    # Check that the error is related to temperature requirements
+    error_message = str(exc_info.value).lower()
+    assert "temperature" in error_message or "parameter" in error_message
 
 
-@patch("openhands.sdk.llm.llm.litellm_completion")
-def test_gpt5_mini_works_with_temperature_1(mock_completion):
+def test_gpt5_mini_works_with_temperature_1():
     """Test that GPT-5-mini works when temperature is explicitly set to 1.
 
     This test shows that the issue can be resolved by explicitly setting temperature=1.
     """
-
-    # Mock the litellm completion to work when temperature == 1
-    def mock_completion_side_effect(*args, **kwargs):
-        temperature = kwargs.get("temperature", 0.0)
-        if temperature != 1.0:
-            raise ValueError(
-                "GPT-5-mini requires temperature to be set to 1.0, "
-                f"but got temperature={temperature}"
-            )
-        # If temperature is 1.0, return a mock response
-        from litellm.types.utils import Choices, Message, ModelResponse, Usage
-
-        return ModelResponse(
-            id="test-response",
-            choices=[
-                Choices(
-                    finish_reason="stop",
-                    index=0,
-                    message=Message(
-                        content="Test response from GPT-5-mini",
-                        role="assistant",
-                    ),
-                )
-            ],
-            created=1234567890,
-            model="gpt-5-mini",
-            object="chat.completion",
-            system_fingerprint="test",
-            usage=Usage(
-                prompt_tokens=10,
-                completion_tokens=5,
-                total_tokens=15,
-            ),
-        )
-
-    mock_completion.side_effect = mock_completion_side_effect
+    api_key, base_url = get_llm_config()
 
     # Create LLM with litellm_proxy/openai/gpt-5-mini with temperature=1.0
     llm = LLM(
         model="litellm_proxy/openai/gpt-5-mini",
-        api_key=SecretStr("test_key"),
+        api_key=SecretStr(api_key),
+        base_url=base_url,
         temperature=1.0,  # Explicitly set temperature to 1.0
         num_retries=1,
         retry_min_wait=1,
@@ -147,62 +92,24 @@ def mock_completion_side_effect(*args, **kwargs):
 
     # Verify the response
     assert response is not None
-    assert response.choices[0].message.content == "Test response from GPT-5-mini"  # type: ignore
-
-    # Verify that the mock was called with temperature=1.0
-    mock_completion.assert_called_once()
-    call_kwargs = mock_completion.call_args[1]
-    assert call_kwargs.get("temperature") == 1.0
+    assert len(response.choices) > 0
+    assert response.choices[0].message.content is not None  # type: ignore
+    assert len(response.choices[0].message.content.strip()) > 0  # type: ignore
 
 
-@patch("openhands.sdk.llm.llm.litellm_completion")
-def test_gpt5_mini_temperature_override_in_completion_call(mock_completion):
+def test_gpt5_mini_temperature_override_in_completion_call():
     """Test that temperature can be overridden in the completion call.
 
     This test shows that even if the LLM has a default temperature,
     it can be overridden in the completion call.
     """
-
-    # Mock the litellm completion to work when temperature == 1
-    def mock_completion_side_effect(*args, **kwargs):
-        temperature = kwargs.get("temperature", 0.0)
-        if temperature != 1.0:
-            raise ValueError(
-                "GPT-5-mini requires temperature to be set to 1.0, "
-                f"but got temperature={temperature}"
-            )
-        # If temperature is 1.0, return a mock response
-        from litellm.types.utils import Choices, Message, ModelResponse, Usage
-
-        return ModelResponse(
-            id="test-response",
-            choices=[
-                Choices(
-                    finish_reason="stop",
-                    index=0,
-                    message=Message(
-                        content="Test response from GPT-5-mini",
-                        role="assistant",
-                    ),
-                )
-            ],
-            created=1234567890,
-            model="gpt-5-mini",
-            object="chat.completion",
-            system_fingerprint="test",
-            usage=Usage(
-                prompt_tokens=10,
-                completion_tokens=5,
-                total_tokens=15,
-            ),
-        )
-
-    mock_completion.side_effect = mock_completion_side_effect
+    api_key, base_url = get_llm_config()
 
     # Create LLM with litellm_proxy/openai/gpt-5-mini with default temperature=0.0
     llm = LLM(
         model="litellm_proxy/openai/gpt-5-mini",
-        api_key=SecretStr("test_key"),
+        api_key=SecretStr(api_key),
+        base_url=base_url,
         num_retries=1,
         retry_min_wait=1,
         retry_max_wait=2,
@@ -218,9 +125,6 @@ def mock_completion_side_effect(*args, **kwargs):
 
     # Verify the response
     assert response is not None
-    assert response.choices[0].message.content == "Test response from GPT-5-mini"  # type: ignore
-
-    # Verify that the mock was called with temperature=1.0 (overridden)
-    mock_completion.assert_called_once()
-    call_kwargs = mock_completion.call_args[1]
-    assert call_kwargs.get("temperature") == 1.0
+    assert len(response.choices) > 0
+    assert response.choices[0].message.content is not None  # type: ignore
+    assert len(response.choices[0].message.content.strip()) > 0  # type: ignore