diff --git a/altk/pre_tool/core/config.py b/altk/pre_tool/core/config.py index 07a2a3c..03e458f 100644 --- a/altk/pre_tool/core/config.py +++ b/altk/pre_tool/core/config.py @@ -3,12 +3,11 @@ from enum import Enum from altk.core.toolkit import ComponentConfig from altk.pre_tool.core.consts import ( + METRIC_GENERAL_CONVERSATION_GROUNDED_CORRECTNESS, METRIC_GENERAL_HALLUCINATION_CHECK, METRIC_GENERAL_VALUE_FORMAT_ALIGNMENT, METRIC_FUNCTION_SELECTION_APPROPRIATENESS, METRIC_AGENTIC_CONSTRAINTS_SATISFACTION, - METRIC_PARAMETER_VALUE_FORMAT_ALIGNMENT, - METRIC_PARAMETER_HALLUCINATION_CHECK, ) @@ -23,6 +22,7 @@ class Track(str, Enum): """Predefined configuration tracks for the reflection pipeline.""" SYNTAX = "syntax" + SPEC_FREE = "spec_free" FAST_TRACK = "fast_track" SLOW_TRACK = "slow_track" TRANSFORMATIONS_ONLY = "transformations_only" @@ -70,22 +70,17 @@ class SPARCReflectionConfig(BaseModel): general_metrics: Optional[List[str]] = Field( default=[ METRIC_GENERAL_HALLUCINATION_CHECK, - METRIC_GENERAL_VALUE_FORMAT_ALIGNMENT, ], description="List of general metrics to evaluate", ) function_metrics: Optional[List[str]] = Field( default=[ METRIC_FUNCTION_SELECTION_APPROPRIATENESS, - METRIC_AGENTIC_CONSTRAINTS_SATISFACTION, ], description="List of function-specific metrics to evaluate", ) parameter_metrics: Optional[List[str]] = Field( - default=[ - METRIC_PARAMETER_HALLUCINATION_CHECK, - METRIC_PARAMETER_VALUE_FORMAT_ALIGNMENT, - ], + default=[], description="List of parameter-specific metrics to evaluate", ) @@ -113,6 +108,11 @@ class Config: function_metrics=None, parameter_metrics=None, ), + "spec_free": SPARCReflectionConfig( + general_metrics=[METRIC_GENERAL_CONVERSATION_GROUNDED_CORRECTNESS], + function_metrics=None, + parameter_metrics=None, + ), "fast_track": SPARCReflectionConfig( general_metrics=[METRIC_GENERAL_HALLUCINATION_CHECK], function_metrics=[METRIC_FUNCTION_SELECTION_APPROPRIATENESS], diff --git a/altk/pre_tool/core/consts.py b/altk/pre_tool/core/consts.py index d255797..f2f8366 100644 --- a/altk/pre_tool/core/consts.py +++ b/altk/pre_tool/core/consts.py @@ -2,6 +2,9 @@ ## General metrics METRIC_GENERAL_HALLUCINATION_CHECK = "general_hallucination_check" METRIC_GENERAL_VALUE_FORMAT_ALIGNMENT = "general_value_format_alignment" +METRIC_GENERAL_CONVERSATION_GROUNDED_CORRECTNESS = ( + "general_conversation_grounded_correctness" +) ## Function selection metrics METRIC_FUNCTION_SELECTION_APPROPRIATENESS = "function_selection_appropriateness" @@ -17,6 +20,11 @@ METRIC_GENERAL_VALUE_FORMAT_ALIGNMENT, ] +## Tool-spec-free metrics (can run without tool specifications) +TOOL_SPEC_FREE_METRICS = [ + METRIC_GENERAL_CONVERSATION_GROUNDED_CORRECTNESS, +] + FUNCTION_SELECTION_METRICS = [ METRIC_FUNCTION_SELECTION_APPROPRIATENESS, METRIC_AGENTIC_CONSTRAINTS_SATISFACTION, diff --git a/altk/pre_tool/core/types.py b/altk/pre_tool/core/types.py index 63a9e85..55f404b 100644 --- a/altk/pre_tool/core/types.py +++ b/altk/pre_tool/core/types.py @@ -4,15 +4,6 @@ from altk.core.llm import LLMClient from altk.core.toolkit import ComponentInput, ComponentOutput -from altk.pre_tool.refraction.src.schemas.results import ( - DebuggingResult, -) -from altk.pre_tool.refraction.src.schemas.mappings import Mapping - -try: - from nestful.schemas.api import Catalog -except ImportError: - Catalog = None class SPARCReflectionDecision(str, Enum): @@ -102,23 +93,6 @@ class SPARCReflectionRunOutput(PreToolReflectionRunOutput): ) -class RefractionRunInput(PreToolReflectionRunInput): - mappings: Optional[list[Mapping]] 
= None - memory_objects: Optional[dict[str, Any]] = None - use_given_operators_only: bool = False - - -class RefractionBuildInput(PreToolReflectionBuildInput): - tool_specs: list[dict[str, Any]] | Catalog - top_k: int = 5 - threshold: float = 0.8 - compute_maps: bool = True - - -class RefractionRunOutput(PreToolReflectionRunOutput): - result: Optional[DebuggingResult] = None - - class ToolGuardBuildInputMetaData(BaseModel): model_config = ConfigDict(arbitrary_types_allowed=True) policy_text: str = Field(description="Text of the policy document file") diff --git a/altk/pre_tool/refraction/README.md b/altk/pre_tool/refraction/README.md index f02661d..ae3206a 100644 --- a/altk/pre_tool/refraction/README.md +++ b/altk/pre_tool/refraction/README.md @@ -2,10 +2,12 @@ Refraction is a low-cost (no LLMs!), low-latency, domain-agnostic, data-agnostic, model-agnostic approach towards validation and repair for a sequence of tool calls, based on classical AI planning techniques. We have seen as much as 48% error correction in certain scenarios. ## Table of Contents -- [When it is recommended to use this component](#when-it-is-recommended-to-use-this-component) -- [Quick Start](#quick-start) -- [License](#license) -- [Under the Hood](#under-the-hood) +- [Refraction - Syntactic Validation of Tool Calls](#refraction---syntactic-validation-of-tool-calls) + - [Table of Contents](#table-of-contents) + - [When it is recommended to use this component](#when-it-is-recommended-to-use-this-component) + - [Quick Start](#quick-start) + - [License](#license) + - [Under the Hood](#under-the-hood) ## When it is recommended to use this component @@ -22,7 +24,7 @@ Make sure the dependencies for Refraction are included by running `pip install " ```python import os from altk.pre_tool.refraction.refraction import RefractionComponent -from altk.pre_tool.core.types import RefractionBuildInput, RefractionRunInput +from altk.pre_tool.refraction.types import RefractionBuildInput, RefractionRunInput from altk.pre_tool.core.config import RefractionConfig, RefractionMode from altk.core.toolkit import AgentPhase diff --git a/altk/pre_tool/refraction/refraction.py b/altk/pre_tool/refraction/refraction.py index ab6f25d..f192f7e 100644 --- a/altk/pre_tool/refraction/refraction.py +++ b/altk/pre_tool/refraction/refraction.py @@ -15,7 +15,7 @@ RefractionConfig, RefractionMode, ) -from altk.pre_tool.core.types import ( +from altk.pre_tool.refraction.types import ( RefractionBuildInput, RefractionRunInput, RefractionRunOutput, diff --git a/altk/pre_tool/refraction/types.py b/altk/pre_tool/refraction/types.py new file mode 100644 index 0000000..bca1485 --- /dev/null +++ b/altk/pre_tool/refraction/types.py @@ -0,0 +1,46 @@ +from typing import Optional, Any + +from altk.pre_tool.core.types import ( + PreToolReflectionRunInput, + PreToolReflectionRunOutput, + PreToolReflectionBuildInput, +) +from altk.pre_tool.refraction.src.schemas.results import ( + DebuggingResult, +) +from altk.pre_tool.refraction.src.schemas.mappings import Mapping + +try: + from nestful.schemas.api import Catalog +except ImportError: + Catalog = None + + +class RefractionRunInput(PreToolReflectionRunInput): + """Input for running Refraction reflection.""" + + mappings: Optional[list[Mapping]] = None + memory_objects: Optional[dict[str, Any]] = None + use_given_operators_only: bool = False + + +class RefractionBuildInput(PreToolReflectionBuildInput): + """Input for building Refraction component.""" + + tool_specs: list[dict[str, Any]] | Catalog + top_k: int = 5 + 
threshold: float = 0.8 + compute_maps: bool = True + + +class RefractionRunOutput(PreToolReflectionRunOutput): + """Output from running Refraction reflection.""" + + result: Optional[DebuggingResult] = None + + +__all__ = [ + "RefractionRunInput", + "RefractionRunOutput", + "RefractionBuildInput", +] diff --git a/altk/pre_tool/sparc/README.md b/altk/pre_tool/sparc/README.md index 84edd25..0904bdb 100644 --- a/altk/pre_tool/sparc/README.md +++ b/altk/pre_tool/sparc/README.md @@ -65,7 +65,6 @@ from altk.pre_tool.core import ( ) from altk.pre_tool.sparc.sparc import SPARCReflectionComponent from altk.core.toolkit import AgentPhase, ComponentConfig -from langchain_core.messages import HumanMessage, AIMessage from altk.core.llm import get_llm @@ -115,8 +114,14 @@ tool_specs = [{ # Prepare conversation context messages = [ - HumanMessage(content="Send an email to team@company.com about the meeting"), - AIMessage(content="I'll send that email for you.") + { + "role": "user", + "content": "Send an email to team@company.com about the meeting" + }, + { + "role": "assistant", + "content": "I'll send that email for you." + } ] # Tool call to validate (OpenAI format) @@ -155,13 +160,15 @@ The component expects three main inputs in OpenAI-compatible formats: List of messages representing the conversation context: ```python -from langchain_core.messages import HumanMessage, AIMessage - messages = [ - HumanMessage(content="What's the weather in New York?"), - AIMessage(content="I'll check the weather for you."), - HumanMessage(content="Make sure to use Fahrenheit please"), - AIMessage(content="I'll get the weather in New York using Fahrenheit.") + { + "role": "user", + "content": "What's the weather in New York?" + }, + { + "role": "assistant", + "content": "I'll check the weather for you." 
+ }, ] ``` @@ -314,6 +321,22 @@ sparc = SPARCReflectionComponent( ) ``` +#### `Track.SPEC_FREE` - Semantic Validation without Tool Specifications +- **LLM Calls**: 1 +- **Validates**: Overall tool-call correctness grounded in the conversation context (spec-free) +- **Use Case**: Single-turn or multi-turn conversations, performance-sensitive applications +- **Performance**: Very fast +- **Model Required**: Yes + +```python +config = build_config() # ValidatingLLMClient required +sparc = SPARCReflectionComponent( + config=config, + track=Track.SPEC_FREE, + execution_mode=SPARCExecutionMode.ASYNC, +) +``` + #### `Track.TRANSFORMATIONS_ONLY` - Unit/Format Conversion Focus - **LLM Calls**: 1 + N (where N = parameters needing transformation, executed in parallel) - **Validates**: Units conversion, format transformations @@ -352,6 +375,9 @@ Each track includes specific validation metrics optimized for different use case - **METRIC_AGENTIC_CONSTRAINTS_SATISFACTION**: Validates adherence to agentic conversation constraints and context - **Transform enabled**: Unit/format conversions when needed +#### `Track.SPEC_FREE` +- **METRIC_GENERAL_CONVERSATION_GROUNDED_CORRECTNESS**: Validates tool-call correctness using only the conversation context, without requiring tool specifications + #### `Track.TRANSFORMATIONS_ONLY` - **Transform enabled**: Focus on unit/format conversions - **METRIC_TRANSFORMATION_DETECTION**: Identifies parameters needing transformation @@ -396,6 +422,7 @@ For advanced users who need specific combinations of validation metrics, you can from llmevalkit.function_calling.consts import ( METRIC_GENERAL_HALLUCINATION_CHECK, # Detects hallucinated parameter values METRIC_GENERAL_VALUE_FORMAT_ALIGNMENT, # Validates parameter format requirements + METRIC_GENERAL_CONVERSATION_GROUNDED_CORRECTNESS, # Validates tool call correctness (spec-free) METRIC_FUNCTION_SELECTION_APPROPRIATENESS, # Validates function choice matches intent METRIC_AGENTIC_CONSTRAINTS_SATISFACTION, # Validates agentic conversation constraints METRIC_PARAMETER_VALUE_FORMAT_ALIGNMENT, # Validates parameter format requirements @@ -491,6 +518,7 @@ sparc = SPARCReflectionComponent( - **General Metrics**: Applied to the overall tool call context - `METRIC_GENERAL_HALLUCINATION_CHECK`: Detects fabricated or hallucinated information - `METRIC_GENERAL_VALUE_FORMAT_ALIGNMENT`: Validates parameter format requirements + - `METRIC_GENERAL_CONVERSATION_GROUNDED_CORRECTNESS`: Validates tool call correctness (spec-free) - **Function Metrics**: Applied to function selection and appropriateness - `METRIC_FUNCTION_SELECTION_APPROPRIATENESS`: Validates function choice matches user intent @@ -787,8 +815,6 @@ uv run pytest tests/pre_tool/sparc/units_conversion_test.py - **Semantic Validation Tests**: Intent alignment, parameter grounding, hallucination detection - **Units Conversion Tests**: Temperature, distance, and format transformation validation - - ## License Apache 2.0 - see LICENSE file for details.
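As a quick orientation to the new track documented above, here is a minimal sketch (assuming the `SPARCReflectionConfig` constructor and the metric constant introduced in `altk/pre_tool/core/config.py` and `altk/pre_tool/core/consts.py` in this diff) of a custom configuration equivalent to the `spec_free` track: only the conversation-grounded correctness metric runs, and no function- or parameter-level metrics are evaluated.

```python
# Minimal sketch: a custom config equivalent to the new "spec_free" track.
# Assumes the import paths and constructor fields shown elsewhere in this diff.
from altk.pre_tool.core.config import SPARCReflectionConfig
from altk.pre_tool.core.consts import (
    METRIC_GENERAL_CONVERSATION_GROUNDED_CORRECTNESS,
)

spec_free_config = SPARCReflectionConfig(
    # Only the spec-free, conversation-grounded correctness check runs.
    general_metrics=[METRIC_GENERAL_CONVERSATION_GROUNDED_CORRECTNESS],
    # No tool specifications are required, so function- and parameter-level
    # metrics are skipped entirely.
    function_metrics=None,
    parameter_metrics=None,
)
```

The predefined `Track.SPEC_FREE` entry added to `Config` in this diff wires up the same configuration.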
diff --git a/altk/pre_tool/examples/custom_config_example.py b/altk/pre_tool/sparc/examples/custom_config_example.py similarity index 97% rename from altk/pre_tool/examples/custom_config_example.py rename to altk/pre_tool/sparc/examples/custom_config_example.py index 55533fd..cfa41bc 100644 --- a/altk/pre_tool/examples/custom_config_example.py +++ b/altk/pre_tool/sparc/examples/custom_config_example.py @@ -2,7 +2,6 @@ import json from dotenv import load_dotenv from typing import cast -from langchain_core.messages import HumanMessage, AIMessage from altk.pre_tool.core.types import SPARCReflectionRunOutput # Import middleware components @@ -69,8 +68,14 @@ def run_custom_config_examples(): # Test with function selection misalignment conversation_context = [ - HumanMessage(content="What's the weather like in New York today?"), - AIMessage(content="I'll check the weather for you."), + { + "role": "user", + "content": "What's the weather like in New York today?", + }, + { + "role": "assistant", + "content": "I'll check the weather for you.", + }, ] tool_specs = [ diff --git a/altk/pre_tool/examples/semantic_issues_example.py b/altk/pre_tool/sparc/examples/semantic_issues_example.py similarity index 93% rename from altk/pre_tool/examples/semantic_issues_example.py rename to altk/pre_tool/sparc/examples/semantic_issues_example.py index 60e2ac7..b1d0891 100644 --- a/altk/pre_tool/examples/semantic_issues_example.py +++ b/altk/pre_tool/sparc/examples/semantic_issues_example.py @@ -1,6 +1,5 @@ import json import os -from langchain_core.messages import HumanMessage, AIMessage import asyncio from altk.pre_tool.core import ( @@ -238,8 +237,14 @@ async def run_function_misalignment_example(middleware: SPARCReflectionComponent print("User asks about weather, but tool call tries to book a flight\n") conversation_context = [ - HumanMessage(content="What's the weather like in New York today?"), - AIMessage(content="I'll check the weather for you in New York."), + { + "role": "user", + "content": "What's the weather like in New York today?", + }, + { + "role": "assistant", + "content": "I'll check the weather for you in New York.", + }, ] # Wrong function selected - booking flight instead of getting weather @@ -278,9 +283,10 @@ async def run_value_grounding_example(middleware: SPARCReflectionComponent): ) conversation_context = [ - HumanMessage( - content="Send an SMS to my mom at +1234567890 saying 'Happy Birthday'" - ), + { + "role": "user", + "content": "Send an SMS to my mom at +1234567890 saying 'Happy Birthday'", + } ] # Parameter values don't match what user said @@ -315,8 +321,14 @@ async def run_parameter_hallucination_example(middleware: SPARCReflectionCompone print("User asks simple weather question, but tool call adds invented details\n") conversation_context = [ - HumanMessage(content="Is it sunny in Miami?"), - AIMessage(content="I'll check the current weather in Miami for you."), + { + "role": "user", + "content": "Is it sunny in Miami?", + }, + { + "role": "assistant", + "content": "I'll check the current weather in Miami for you.", + }, ] # Tool call adds details not mentioned by user (hallucinated parameters) @@ -353,12 +365,14 @@ async def run_valid_semantic_example(middleware: SPARCReflectionComponent): ) conversation_context = [ - HumanMessage( - content="What's the weather like in Boston today? I prefer Fahrenheit." - ), - AIMessage( - content="I'll check the current weather in Boston with Fahrenheit temperature." 
- ), + { + "role": "user", + "content": "What's the weather like in Boston today? I prefer Fahrenheit.", + }, + { + "role": "assistant", + "content": "I'll check the current weather in Boston with Fahrenheit temperature.", + }, ] # Well-aligned function call that matches user intent and preferences diff --git a/altk/pre_tool/examples/static_issues_example.py b/altk/pre_tool/sparc/examples/static_issues_example.py similarity index 92% rename from altk/pre_tool/examples/static_issues_example.py rename to altk/pre_tool/sparc/examples/static_issues_example.py index f6065ef..929ef80 100644 --- a/altk/pre_tool/examples/static_issues_example.py +++ b/altk/pre_tool/sparc/examples/static_issues_example.py @@ -1,6 +1,5 @@ import json import os -from langchain_core.messages import HumanMessage, AIMessage from altk.pre_tool.core import ( SPARCReflectionRunInput, @@ -196,8 +195,14 @@ def run_missing_parameters_example(middleware: SPARCReflectionComponent): print("Tool call missing required 'subject' and 'body' parameters\n") conversation_context = [ - HumanMessage(content="Send an email to john@example.com"), - AIMessage(content="I'll send an email to john@example.com"), + { + "role": "user", + "content": "Send an email to john@example.com", + }, + { + "role": "assistant", + "content": "I'll send an email to john@example.com", + }, ] # Missing required parameters: subject and body @@ -232,12 +237,15 @@ def run_invalid_types_example(middleware: SPARCReflectionComponent): print( "Tool call with incorrect parameter types (string instead of array, number instead of string)\n" ) - conversation_context = [ - HumanMessage( - content="Schedule a 2-hour meeting with the team tomorrow at 2 PM" - ), - AIMessage(content="I'll schedule a 2-hour meeting with the team"), + { + "role": "user", + "content": "Schedule a 2-hour meeting with the team tomorrow at 2 PM", + }, + { + "role": "assistant", + "content": "I'll schedule a 2-hour meeting with the team", + }, ] # Invalid types: participants should be array, duration_minutes should be integer @@ -277,8 +285,14 @@ def run_schema_violations_example(middleware: SPARCReflectionComponent): ) conversation_context = [ - HumanMessage(content="Send a high priority email to invalid-email"), - AIMessage(content="I'll send a high priority email"), + { + "role": "user", + "content": "Send a high priority email to invalid-email", + }, + { + "role": "assistant", + "content": "I'll send a high priority email", + }, ] # Schema violations: invalid email format, invalid priority enum, empty subject @@ -315,10 +329,14 @@ def run_valid_static_example(middleware: SPARCReflectionComponent): print("Tool call with all required parameters and correct types\n") conversation_context = [ - HumanMessage( - content="Send an email to team@example.com with subject 'Weekly Update'" - ), - AIMessage(content="I'll send the weekly update email to the team"), + { + "role": "user", + "content": "Send an email to team@example.com with subject 'Weekly Update'", + }, + { + "role": "assistant", + "content": "I'll send the weekly update email to the team", + }, ] # Valid tool call diff --git a/altk/pre_tool/examples/units_conversion_error_example.py b/altk/pre_tool/sparc/examples/units_conversion_error_example.py similarity index 91% rename from altk/pre_tool/examples/units_conversion_error_example.py rename to altk/pre_tool/sparc/examples/units_conversion_error_example.py index 4560290..3b974fb 100644 --- a/altk/pre_tool/examples/units_conversion_error_example.py +++ 
b/altk/pre_tool/sparc/examples/units_conversion_error_example.py @@ -1,7 +1,6 @@ import json import os import asyncio -from langchain_core.messages import HumanMessage, AIMessage from altk.pre_tool.core import ( SPARCReflectionRunInput, @@ -134,10 +133,14 @@ async def run_temperature_conversion_example(middleware: SPARCReflectionComponen print("Wrong conversion: Using 75 directly instead of converting 75°F to ~24°C\n") conversation_context = [ - HumanMessage(content="Set the thermostat to 75 Fahrenheit in the living room"), - AIMessage( - content="I'll set the thermostat to 75 Fahrenheit in the living room." - ), + { + "role": "user", + "content": "Set the thermostat to 75 Fahrenheit in the living room", + }, + { + "role": "assistant", + "content": "I'll set the thermostat to 75 Fahrenheit in the living room.", + }, ] # Incorrect tool call - using 75 directly instead of converting F to C @@ -174,8 +177,14 @@ async def run_distance_conversion_example(middleware: SPARCReflectionComponent): print("Wrong conversion: Using miles/mph values directly\n") conversation_context = [ - HumanMessage(content="Calculate travel time for 50 miles at 60 mph"), - AIMessage(content="I'll calculate the travel time for 50 miles at 60 mph."), + { + "role": "user", + "content": "Calculate travel time for 50 miles at 60 mph", + }, + { + "role": "assistant", + "content": "I'll calculate the travel time for 50 miles at 60 mph.", + }, ] # Incorrect tool call - using miles/mph instead of km/kmh @@ -211,10 +220,14 @@ async def run_correct_conversion_example(middleware: SPARCReflectionComponent): print("Correct conversion: 75°F converted to 24°C\n") conversation_context = [ - HumanMessage(content="Set the thermostat to 75 Fahrenheit in the bedroom"), - AIMessage( - content="I'll set the thermostat to 24 degrees Celsius (75°F) in the bedroom." 
- ), + { + "role": "user", + "content": "Set the thermostat to 75 Fahrenheit in the bedroom", + }, + { + "role": "assistant", + "content": "I'll set the thermostat to 24 degrees Celsius (75°F) in the bedroom.", + }, ] # Correct tool call - properly converted F to C diff --git a/altk/pre_tool/sparc/function_calling/__init__.py b/altk/pre_tool/sparc/function_calling/__init__.py index d71c67c..73c503c 100644 --- a/altk/pre_tool/sparc/function_calling/__init__.py +++ b/altk/pre_tool/sparc/function_calling/__init__.py @@ -1,5 +1,6 @@ from altk.pre_tool.sparc.function_calling.metrics import ( GeneralMetricsPrompt, + GeneralMetricsPromptNoSpec, FunctionSelectionPrompt, ParameterMetricsPrompt, get_general_metrics_prompt, @@ -12,6 +13,7 @@ __all__ = [ "GeneralMetricsPrompt", + "GeneralMetricsPromptNoSpec", "FunctionSelectionPrompt", "ParameterMetricsPrompt", "get_general_metrics_prompt", diff --git a/altk/pre_tool/sparc/function_calling/metrics/__init__.py b/altk/pre_tool/sparc/function_calling/metrics/__init__.py index 1c821b4..7f3da8c 100644 --- a/altk/pre_tool/sparc/function_calling/metrics/__init__.py +++ b/altk/pre_tool/sparc/function_calling/metrics/__init__.py @@ -2,6 +2,7 @@ from altk.pre_tool.sparc.function_calling.metrics.function_call import ( GeneralMetricsPrompt, + GeneralMetricsPromptNoSpec, get_general_metrics_prompt, ) from altk.pre_tool.sparc.function_calling.metrics.function_selection import ( @@ -22,6 +23,7 @@ __all__ = [ "get_general_metrics_prompt", "GeneralMetricsPrompt", + "GeneralMetricsPromptNoSpec", "FunctionSelectionPrompt", "get_parameter_metrics_prompt", "ParameterMetricsPrompt", diff --git a/altk/pre_tool/sparc/function_calling/metrics/function_call/__init__.py b/altk/pre_tool/sparc/function_calling/metrics/function_call/__init__.py index 0c6a8de..8316145 100644 --- a/altk/pre_tool/sparc/function_calling/metrics/function_call/__init__.py +++ b/altk/pre_tool/sparc/function_calling/metrics/function_call/__init__.py @@ -1,8 +1,13 @@ """General function-call metrics.""" -from altk.pre_tool.sparc.function_calling.metrics.function_call.general import ( +from .general import ( GeneralMetricsPrompt, + GeneralMetricsPromptNoSpec, get_general_metrics_prompt, ) -__all__ = ["GeneralMetricsPrompt", "get_general_metrics_prompt"] +__all__ = [ + "GeneralMetricsPrompt", + "GeneralMetricsPromptNoSpec", + "get_general_metrics_prompt", +] diff --git a/altk/pre_tool/sparc/function_calling/metrics/function_call/general.py b/altk/pre_tool/sparc/function_calling/metrics/function_call/general.py index f9125ae..fa3dd3b 100644 --- a/altk/pre_tool/sparc/function_calling/metrics/function_call/general.py +++ b/altk/pre_tool/sparc/function_calling/metrics/function_call/general.py @@ -1,4 +1,4 @@ -from typing import Any, Dict, List, Union +from typing import Any, Dict, List, Optional, Union from altk.pre_tool.sparc.function_calling.metrics.base import ( FunctionMetricsPrompt, ) @@ -20,6 +20,14 @@ "Return a JSON object as specified in the system prompt. You MUST keep the same order of fields in the JSON object as provided in the JSON schema and examples." ) +_general_user_no_spec: str = ( + "Conversation context:\n" + "{{ conversation_context }}\n\n" + "Proposed tool call:\n" + "{{ tool_call }}\n\n" + "Return a JSON object as specified in the system prompt. You MUST keep the same order of fields in the JSON object as provided in the JSON schema and examples." 
+) + class GeneralMetricsPrompt(FunctionMetricsPrompt): """Prompt builder for general tool-call semantic metrics.""" @@ -28,21 +36,37 @@ class GeneralMetricsPrompt(FunctionMetricsPrompt): user_template = _general_user +class GeneralMetricsPromptNoSpec(FunctionMetricsPrompt): + """Prompt builder for tool-spec-free general metrics.""" + + system_template = _general_system + user_template = _general_user_no_spec + + def get_general_metrics_prompt( - prompt: GeneralMetricsPrompt, + prompt: Union[GeneralMetricsPrompt, GeneralMetricsPromptNoSpec], conversation_context: Union[str, List[Dict[str, str]]], - tool_inventory: List[Dict[str, Any]], tool_call: Dict[str, Any], + tool_inventory: Optional[List[Dict[str, Any]]] = None, ) -> List[Dict[str, str]]: """ Build the messages for a general semantic evaluation. + Args: + prompt: Prompt instance (with or without tool spec support) + conversation_context: Conversation history + tool_call: The tool call to evaluate + tool_inventory: Optional tool specifications (not needed for tool-spec-free metrics) + Returns the list of chat messages (system -> [few-shot] -> user). """ - return prompt.build_messages( - user_kwargs={ - "conversation_context": conversation_context, - "tool_inventory": tool_inventory, - "tool_call": tool_call, - } - ) + user_kwargs = { + "conversation_context": conversation_context, + "tool_call": tool_call, + } + + # Only include tool_inventory if provided and prompt expects it + if tool_inventory is not None and isinstance(prompt, GeneralMetricsPrompt): + user_kwargs["tool_inventory"] = tool_inventory + + return prompt.build_messages(user_kwargs=user_kwargs) diff --git a/altk/pre_tool/sparc/function_calling/metrics/function_call/general_metrics.json b/altk/pre_tool/sparc/function_calling/metrics/function_call/general_metrics.json index 7ef928d..d4ff2e1 100644 --- a/altk/pre_tool/sparc/function_calling/metrics/function_call/general_metrics.json +++ b/altk/pre_tool/sparc/function_calling/metrics/function_call/general_metrics.json @@ -94,36 +94,34 @@ }, "actionable_recommendations": { "type": "array", - "description": "Provide actionable recommendations to help the agent developer prevent parameter hallucinations and similar mistakes in future tool calls. Include multiple detailed suggestions only when there is a mistake and the recommendation can genuinely help. Recommend only items that are important and can significantly improve the agent, such as:\n\n1. PARAMETER_DOCUMENTATION: Update the API parameter specification to clarify parameter requirements, formats, units, edge cases, and other relevant information. Suggest the exact fix—addition, deletion, or adjustment.\n2. PARAMETER_EXAMPLES: Add examples to the parameter documentation that will help prevent similar mistakes in the future.\n3. INSTRUCTIONS_ADDITIONS: Add general instructions that can help the agent handle such mistakes - DO NOT include the specific parameter name, but give a general instruction for this specific mistake. Only include instructions that are essential for the agent and do not repeat existing ones. If an existing instruction (usually in the system prompt) can be improved, recommend updating it. Use this recommendation only if a clear, general instruction will significantly improve the agent in this kind of specific mistake.\n4. OTHER: Any other relevant recommendations that can help improve the agent's performance in future tool calls.", + "description": "Provide clear, actionable recommendations that can be automatically applied to prevent similar issues. 
Only include recommendations when there are actual issues (output <= 3). Focus on:\n\n1. PARAMETER_DOCUMENTATION: Add or update parameter documentation to clarify requirements, formats, or defaults.\n2. PARAMETER_EXAMPLES: Add concrete examples showing correct parameter usage.\n3. SYSTEM_PROMPT_INSTRUCTION: Add a general instruction to the system prompt (do not mention specific parameter names).", "items": { "type": "object", "properties": { "recommendation": { "type": "string", - "description": "A specific, actionable recommendation to reduce parameter hallucinations.", "enum": [ "PARAMETER_DOCUMENTATION", "PARAMETER_EXAMPLES", - "INSTRUCTIONS_ADDITIONS", - "OTHER" - ] + "SYSTEM_PROMPT_INSTRUCTION" + ], + "description": "The type of recommendation being made." }, "parameter_name": { "type": "string", - "description": "The name of the parameter to which the recommendation applies." + "description": "The name of the parameter to which the recommendation applies. Omit for SYSTEM_PROMPT_INSTRUCTION." }, "details": { "type": "string", - "description": "An explanation of the recommendation, including what specific changes should be made, how they will prevent parameter hallucinations, and any relevant examples or best practices." + "description": "Brief explanation of what should be changed and why." }, "quote": { "type": "string", - "description": "The specific quote of the additions made to the parameter documentation (description in the tool specification), examples (that will be added to the tool parameter documentation), or instructions that will be added to the system prompt of the agent." + "description": "Exact text to add to documentation or system prompt. Must be directly usable without modification." } }, "required": [ "recommendation", - "parameter_name", "details", "quote" ] @@ -260,14 +258,13 @@ { "recommendation": "PARAMETER_DOCUMENTATION", "parameter_name": "user_id", - "details": "Add to the documentation of the user_id parameter to clarify its required nature.", - "quote": "user_id must be explicitly provided by the user or derived from authentication tool response. No default should ever be assumed." + "details": "Clarify that user_id must come from conversation or authentication, never assumed.", + "quote": "Required. Must be explicitly provided by the user or obtained from a prior authentication tool call. Never assume or fabricate this value." }, { - "recommendation": "INSTRUCTIONS_ADDITIONS", - "parameter_name": "user_id", - "details": "Add instructions to never invent parameter values, and if needed, ask the user for clarification.", - "quote": "Never invent parameter values. If missing, explicitly ask the user to provide it." + "recommendation": "SYSTEM_PROMPT_INSTRUCTION", + "details": "Add instruction to never fabricate parameter values.", + "quote": "Never invent or fabricate parameter values. If required information is missing from the conversation, ask the user to provide it before making tool calls." } ] } @@ -342,14 +339,13 @@ { "recommendation": "PARAMETER_DOCUMENTATION", "parameter_name": "seat_class", - "details": "Add to the documentation of seat_class to clarify that it is a required parameter.", - "quote": "Required. Must be explicitly provided by user. No default assumed." + "details": "Clarify that seat_class is required and has no default value.", + "quote": "Required. Must be explicitly provided by the user. Valid values: economy, business, first. No default value." 
}, { - "recommendation": "INSTRUCTIONS_ADDITIONS", - "parameter_name": "seat_class", - "details": "Add instructions to always confirm required parameters with the user if not provided.", - "quote": "Always confirm all required details with the user before calling tools." + "recommendation": "SYSTEM_PROMPT_INSTRUCTION", + "details": "Add instruction to confirm required parameters before tool calls.", + "quote": "Before making tool calls, ensure all required parameters have been provided by the user. If any required information is missing, ask the user for it." } ] } @@ -415,20 +411,19 @@ { "recommendation": "PARAMETER_DOCUMENTATION", "parameter_name": "departure_date", - "details": "Clarify the required date format in the parameter documentation.", - "quote": "YYYY-MM-DD only. No other formats accepted. If a different format is given in the conversation, reformat it to the required format before calling the tool." + "details": "Specify exact date format requirement.", + "quote": "Required format: YYYY-MM-DD (e.g., 2025-09-15). Always convert user-provided dates to this format before making the tool call." }, { "recommendation": "PARAMETER_EXAMPLES", "parameter_name": "departure_date", - "details": "Add an example of a departure date that was reformatted correctly from 'DD/MM/YYYY' to 'YYYY-MM-DD'.", - "quote": "Example: given a user input with a departure date of '15/09/2025', it should be reformatted to '2025-09-15' before calling the tool." + "details": "Show example of date format conversion.", + "quote": "Example: User says '15/09/2025' → convert to '2025-09-15' before calling the tool." }, { - "recommendation": "INSTRUCTIONS_ADDITIONS", - "parameter_name": "departure_date", - "details": "Add instructions to always reformat dates to the required format before calling the tool.", - "quote": "Make sure you adhere to the required formats and units as specified in the API documentation, and if a different format is given in the conversation, reformat it to the required format before calling the tool." + "recommendation": "SYSTEM_PROMPT_INSTRUCTION", + "details": "Add instruction to convert dates to required formats.", + "quote": "Always convert dates and times to the exact format specified in the API documentation before making tool calls, regardless of how the user provides them." } ] } @@ -779,5 +774,315 @@ } } ] + }, + { + "name": "general_conversation_grounded_correctness", + "task_description": "Evaluate the overall correctness and appropriateness of the tool call based SOLELY on the conversation context, WITHOUT requiring explicit tool specifications. This metric assesses whether the tool call is a logical, non-redundant action that is fully grounded in the conversation history.\n\n**IMPORTANT**: If tool specifications appear in the conversation (e.g., in system prompts or assistant messages), you MAY use them to enhance your assessment. However, your primary focus should be on conversation grounding.\n\n---\n\n### 1. Tool Call Appropriateness\n\nAssess whether the tool call makes sense given the conversation flow:\n\n- **Intent Alignment**: Does the tool call directly address the user's request or a logical next step?\n- **Non-Redundancy**: Is this tool call necessary, or has equivalent information already been obtained?\n- **Timing**: Are there prerequisite actions that should have been completed first?\n- **Relevance**: Is the chosen function appropriate for the current conversation context?\n\n---\n\n### 2. 
Conversation Grounding Hierarchy\n\nParameter values should be grounded in the following priority order:\n\n1. **Tool Outputs** (highest priority): Values from previous tool call results\n2. **Explicit User Statements**: Direct mentions in user messages\n3. **Confirmed Assistant Statements**: Information the assistant provided that the user acknowledged\n4. **Logical Inference**: Values that can be reasonably inferred from context (use cautiously)\n\n**Key Principle**: When there's a conflict between sources, tool outputs take precedence over user/assistant statements, as they represent ground truth from the system.\n\n---\n\n### 3. Parameter Value Assessment\n\nFor each parameter in the tool call:\n\n- **Explicit Grounding**: Is the value directly stated in the conversation?\n- **Implicit Grounding**: Can the value be reasonably inferred from context?\n- **Assumption Detection**: Is the value assumed without any conversational basis?\n- **Conflict Resolution**: If multiple sources provide different values, is the most authoritative source used?\n\n---\n\n### 4. Format and Consistency (Conversation-Based)\n\nEven without formal specifications:\n\n- **Internal Consistency**: Do parameter values make logical sense together (e.g., end_date after start_date)?\n- **Observable Patterns**: If tool specifications are mentioned in the conversation, do values conform to described patterns?\n- **Common Sense Validation**: Do values follow basic logical constraints (e.g., no negative quantities for counts)?\n\n---\n\n### 5. Prerequisite Satisfaction\n\nCheck if the tool call has all necessary preconditions:\n\n- **Dependent Information**: Is all required information available from the conversation or prior tool calls?\n- **State Requirements**: Based on conversation context, is the system in the right state for this call?\n- **Missing Steps**: Are there obvious gaps in the conversation that should be addressed first?\n\n---\n\n### Correctness Rating Scale\n\n- **5 - Fully Correct**: Tool call is perfectly appropriate, all parameters are explicitly or clearly inferably grounded in conversation, no redundancy, all prerequisites satisfied\n- **4 - Mostly Correct**: Tool call is appropriate and well-grounded, with only minor implicit inferences that are very reasonable\n- **3 - Partially Correct**: Tool call is generally appropriate but has some parameters with weak grounding or minor timing/relevance issues\n- **2 - Problematic**: Tool call has significant issues—multiple ungrounded parameters, questionable relevance, or missing prerequisites\n- **1 - Incorrect**: Tool call is inappropriate, redundant, or has parameters that are clearly fabricated or contradict conversation context\n\n---\n\n### Evaluation Guidelines\n\n- **Conservative Assessment**: When grounding is unclear, prefer lower scores\n- **Prioritize Tool Outputs**: Always favor values from tool results over human statements\n- **Consider Conversation Flow**: Assess whether the tool call makes sense in the sequence of events\n- **Watch for Hallucinations**: Flag any parameters that appear without conversational basis\n- **Use Available Specs**: If tool specifications appear in the conversation, incorporate them into your assessment\n\nYour evaluation should reflect a holistic view of whether a human reviewing the conversation would consider this tool call to be correct, appropriate, and fully justified by the available information.", + "jsonschema": { + "title": "general_conversation_grounded_correctness", + "description": "Holistic 
assessment of tool call correctness based on conversation grounding, appropriateness, and logical consistency without requiring explicit tool specifications.", + "type": "object", + "additionalProperties": false, + "properties": { + "evidence": { + "type": "string", + "description": "Provide EXACT quotes from the conversation that support or contradict the tool call. Include: (1) relevant user messages, (2) assistant messages, (3) prior tool outputs, and (4) any tool specifications mentioned in the conversation. Organize evidence by parameter and by aspect (appropriateness, grounding, prerequisites)." + }, + "explanation": { + "type": "string", + "description": "Provide an analysis covering: (1) Tool Call Appropriateness - why this tool call makes sense (or doesn't) in the conversation flow, (2) Parameter Grounding - for EACH parameter, explain its grounding source and quality, noting conflicts between sources and resolution, (3) Prerequisites - whether all necessary preconditions are satisfied, (4) Format and Consistency - whether parameters make logical sense together, (5) Overall Correctness - summary judgment on whether the tool call is fully justified by the conversation." + }, + "output": { + "type": "integer", + "minimum": 1, + "maximum": 5, + "threshold_low": 4, + "threshold_high": 5, + "description": "Overall Correctness Score (1-5):\n\n5 - Fully Correct: Tool call is the right action at the right time. All parameters are explicitly grounded in conversation (user messages, confirmed assistant statements, or tool outputs). No redundancy. All prerequisites satisfied. If tool specs are in conversation, the call aligns with them.\nExample: User asks \"What's the weather in Boston?\", assistant calls get_weather(location=\"Boston\") with no prior weather call for Boston.\n\n4 - Mostly Correct: Tool call is appropriate and timely. Most parameters are explicitly grounded, a few may be reasonably inferred from clear context. No significant issues. Prerequisites satisfied.\nExample: User says \"Book a table for dinner tonight\", assistant calls book_restaurant(date=\"2025-12-17\", time=\"19:00\", party_size=2) where date is inferred from 'tonight' and party_size from typical default.\n\n3 - Partially Correct: Tool call is generally appropriate but has noticeable issues—some parameters are weakly grounded, minor relevance concerns, or unclear if prerequisites are met. The call might work but has questionable elements.\nExample: User says \"I need a hotel\", assistant calls book_hotel(location=\"New York\", check_in=\"2025-12-20\") where location and dates aren't mentioned in conversation.\n\n2 - Problematic: Tool call has significant correctness issues—multiple parameters lack grounding, the timing is questionable, important prerequisites are missing, or the call seems partially redundant. Likely to fail or produce incorrect results.\nExample: User asks \"What flights are available?\", but assistant calls book_flight(...) with specific dates and destinations not discussed, skipping the search step.\n\n1 - Incorrect: Tool call is inappropriate, clearly redundant, or fundamentally misaligned with conversation. Parameters are fabricated or contradict known information. Missing critical prerequisites. The call should not be made.\nExample: User asks about restaurant recommendations, assistant calls cancel_reservation(reservation_id=\"12345\") with no prior discussion of cancellations or that ID." 
+ }, + "confidence": { + "type": "number", + "minimum": 0, + "maximum": 1, + "threshold_low": 0, + "threshold_high": 1, + "description": "Confidence in the accuracy of this assessment (range: 0.0-1.0). Higher confidence when conversation context is clear and complete. Lower confidence when context is ambiguous, conversation is incomplete, or tool specifications are not available." + }, + "correction": { + "type": "object", + "description": "Corrections for the tool call. For correct calls (output >= 4), return {}. For problematic calls (output <= 3), provide either need_more_information OR corrected_tool_call.\n\nCRITICAL FORMAT REQUIREMENT: When providing corrected_tool_call, the 'arguments' field MUST be a JSON object/dict (e.g., {\"key\": \"value\"}), NOT a JSON string (e.g., NOT \"{\\\"key\\\": \\\"value\\\"}\").", + "properties": { + "need_more_information": { + "type": "array", + "description": "List of questions to ask the user if information is missing. Each question should be specific and actionable.", + "items": { + "type": "string" + } + }, + "corrected_tool_call": { + "type": "object", + "description": "Complete corrected tool call with all issues addressed. Only provide if correction is possible without additional user input. IMPORTANT: The 'arguments' field must be a JSON object (dict), not a JSON string.", + "properties": { + "name": { + "type": "string", + "description": "Name of the function to call." + }, + "arguments": { + "type": "object", + "description": "Corrected arguments for the function call as a JSON object (not a string). Example: {\"param1\": \"value1\", \"param2\": 123}", + "additionalProperties": true + } + }, + "required": ["name", "arguments"] + } + }, + "required": [] + }, + "actionable_recommendations": { + "type": "array", + "description": "Provide clear, actionable recommendations that can be automatically applied to improve conversation grounding. Only include recommendations when there are actual issues (output <= 3). Focus on:\n\n1. SYSTEM_PROMPT_INSTRUCTION: Add a general instruction to the system prompt about grounding, prerequisites, or clarification (do not mention specific parameter names).", + "items": { + "type": "object", + "properties": { + "recommendation": { + "type": "string", + "enum": [ + "SYSTEM_PROMPT_INSTRUCTION" + ], + "description": "The type of recommendation being made." + }, + "details": { + "type": "string", + "description": "Brief explanation of what should be changed and why." + }, + "quote": { + "type": "string", + "description": "Exact text to add to documentation or system prompt. Must be directly usable without modification." + } + }, + "required": [ + "recommendation", + "details", + "quote" + ] + } + } + }, + "required": [ + "evidence", + "explanation", + "output", + "confidence", + "correction", + "actionable_recommendations" + ] + }, + "examples": [ + { + "user_kwargs": { + "conversation_context": [ + { + "role": "user", + "content": "What's the weather like in Boston?" + } + ], + "tool_call": { + "id": "call_001", + "type": "function", + "function": { + "name": "get_weather", + "arguments": "{\"location\": \"Boston\"}" + } + } + }, + "output": { + "evidence": "User message: \"What's the weather like in Boston?\"\nTool call parameter: location=\"Boston\"\nNo prior tool calls in conversation.", + "explanation": "Tool Call Appropriateness: The tool call directly addresses the user's explicit request for weather information. 
It's timely, relevant, and non-redundant.\n\nParameter Grounding: The 'location' parameter value \"Boston\" is explicitly mentioned in the user's question, providing perfect grounding.\n\nPrerequisites: No prerequisites are needed for a weather query beyond having the location, which is provided.\n\nFormat and Consistency: The location parameter is a simple string matching the user's input, internally consistent.\n\nOverall Correctness: This tool call is perfectly appropriate and fully grounded in the conversation.", + "output": 5, + "confidence": 0.98, + "correction": {}, + "actionable_recommendations": [] + } + }, + { + "user_kwargs": { + "conversation_context": [ + { + "role": "user", + "content": "I want to book a flight." + }, + { + "role": "assistant", + "content": "I'd be happy to help you book a flight. Could you please provide the departure city, destination, and travel dates?" + } + ], + "tool_call": { + "id": "call_002", + "type": "function", + "function": { + "name": "book_flight", + "arguments": "{\"departure\": \"New York\", \"destination\": \"Los Angeles\", \"date\": \"2025-12-25\"}" + } + } + }, + "output": { + "evidence": "User message: \"I want to book a flight.\"\nAssistant message: \"Could you please provide the departure city, destination, and travel dates?\"\nTool call parameters: departure=\"New York\", destination=\"Los Angeles\", date=\"2025-12-25\"\nNo user response providing the requested information.", + "explanation": "Tool Call Appropriateness: The tool call is premature and inappropriate. The assistant explicitly asked the user for departure city, destination, and dates, but the user has not yet provided this information. Making the booking call now is incorrect.\n\nParameter Grounding: All three parameters (departure, destination, date) are completely ungrounded. They do not appear anywhere in the conversation and represent pure hallucination. The user only expressed a general intent to book a flight.\n\nPrerequisites: The critical prerequisite—obtaining the necessary booking information from the user—has not been satisfied. The assistant asked for this information but didn't wait for the response.\n\nFormat and Consistency: While the parameters themselves are internally consistent (valid city names and date format), they have no basis in reality for this conversation.\n\nOverall Correctness: This tool call is fundamentally incorrect and should not be made. The system should wait for user input before proceeding.", + "output": 1, + "confidence": 0.95, + "correction": { + "need_more_information": [ + "What city will you be departing from?", + "What is your destination?", + "What date would you like to travel?" + ] + }, + "actionable_recommendations": [ + { + "recommendation": "SYSTEM_PROMPT_INSTRUCTION", + "details": "Add instruction to wait for user responses before making tool calls.", + "quote": "If you ask the user for information, you must wait for their response before making any tool calls that require that information. Never fabricate or assume parameter values." + }, + { + "recommendation": "SYSTEM_PROMPT_INSTRUCTION", + "details": "Add instruction about prerequisite checking.", + "quote": "Before making tool calls, verify that all required information is available in the conversation. If any required parameter is missing, ask the user to provide it first." + } + ] + } + }, + { + "user_kwargs": { + "conversation_context": [ + { + "role": "user", + "content": "Get me the user profile for the logged-in user." 
+ }, + { + "role": "assistant", + "content": null, + "tool_calls": [ + { + "id": "call_001", + "type": "function", + "function": { + "name": "get_current_user_id", + "arguments": "{}" + } + } + ] + }, + { + "role": "tool", + "tool_call_id": "call_001", + "content": "{\"user_id\": 42, \"username\": \"john_doe\"}" + }, + { + "role": "assistant", + "content": "I found that you're logged in as john_doe (user ID: 42). Let me get your full profile now." + } + ], + "tool_call": { + "id": "call_002", + "type": "function", + "function": { + "name": "get_user_profile", + "arguments": "{\"user_id\": 42}" + } + } + }, + "output": { + "evidence": "User message: \"Get me the user profile for the logged-in user.\"\nPrior tool call: get_current_user_id()\nTool output: {\"user_id\": 42, \"username\": \"john_doe\"}\nAssistant confirmation: \"you're logged in as john_doe (user ID: 42)\"\nCurrent tool call: get_user_profile(user_id=42)", + "explanation": "Tool Call Appropriateness: The tool call is perfectly appropriate and represents the logical next step. The user requested the profile for the logged-in user. The assistant first identified the user (ID: 42) via a tool call, confirmed this with the user, and is now proceeding to fetch the full profile.\n\nParameter Grounding: The 'user_id' parameter value of 42 is grounded in the tool output from get_current_user_id. This is the highest-quality grounding source (tool output), taking precedence over any human statements.\n\nPrerequisites: All prerequisites are satisfied—the user ID was obtained through a prior tool call, which is the proper way to get this information before fetching the profile.\n\nFormat and Consistency: The user_id is an integer (42) which is appropriate for an ID field. It's consistent with the value returned from the authentication check.\n\nOverall Correctness: This tool call demonstrates excellent conversation flow—proper sequencing, authoritative grounding from tool output, and clear alignment with user intent.", + "output": 5, + "confidence": 0.97, + "correction": {}, + "actionable_recommendations": [] + } + }, + { + "user_kwargs": { + "conversation_context": [ + { + "role": "user", + "content": "Book a table for dinner tonight at 7 PM." + } + ], + "tool_call": { + "id": "call_003", + "type": "function", + "function": { + "name": "book_restaurant", + "arguments": "{\"date\": \"2025-12-17\", \"time\": \"19:00\", \"party_size\": 2}" + } + } + }, + "output": { + "evidence": "User message: \"Book a table for dinner tonight at 7 PM.\"\nTool call parameters: date=\"2025-12-17\", time=\"19:00\", party_size=2\nCurrent date context: December 17, 2025\nNo mention of restaurant name or party size in conversation.", + "explanation": "Tool Call Appropriateness: The tool call is generally appropriate as it responds to the user's booking request. However, it's missing a critical piece of information—which restaurant to book.\n\nParameter Grounding:\n- 'date': The value \"2025-12-17\" is a reasonable inference from \"tonight\" given the current date context. This is a clear implicit inference (score: 4 level).\n- 'time': The value \"19:00\" (7 PM in 24-hour format) is explicitly grounded in the user's statement \"7 PM\". This is well-grounded.\n- 'party_size': The value 2 is an assumption with no basis in the conversation. The user didn't mention how many people. This is a hallucination.\n\nPrerequisites: The restaurant name/location is missing, which is typically a required prerequisite for booking. 
The user should specify which restaurant.\n\nFormat and Consistency: The parameters are internally consistent (date is today, time is evening, party size is positive), but the party_size lacks grounding.\n\nOverall Correctness: The tool call has mixed quality—some parameters are well-grounded while others are assumed. The missing restaurant information is also problematic.", + "output": 3, + "confidence": 0.85, + "correction": { + "need_more_information": [ + "Which restaurant would you like to book?", + "How many people will be dining?" + ] + }, + "actionable_recommendations": [ + { + "recommendation": "SYSTEM_PROMPT_INSTRUCTION", + "details": "Add instruction about when inference is acceptable vs. when to ask.", + "quote": "Temporal references like 'tonight' or 'tomorrow' can be inferred from the current date. However, user-specific values like party size, number of travelers, or personal preferences must never be assumed—always ask the user if not explicitly provided." + } + ] + } + }, + { + "user_kwargs": { + "conversation_context": [ + { + "role": "user", + "content": "I need a reservation at The French Bistro for 2 people tomorrow at 7 PM." + }, + { + "role": "assistant", + "content": "I'll book a table for you at The French Bistro." + } + ], + "tool_call": { + "id": "call_004", + "type": "function", + "function": { + "name": "book_restaurant", + "arguments": "{\"restaurant_name\": \"The French Bistro\", \"date\": \"2025-12-18\", \"time\": \"19:00\", \"party_size\": 4, \"dietary_restrictions\": [\"vegetarian\"]}" + } + } + }, + "output": { + "evidence": "User message: \"I need a reservation at The French Bistro for 2 people tomorrow at 7 PM.\"\nTool call parameters: restaurant_name=\"The French Bistro\", date=\"2025-12-18\", time=\"19:00\", party_size=4, dietary_restrictions=[\"vegetarian\"]\nCurrent date context: December 17, 2025", + "explanation": "Tool Call Appropriateness: The tool call is appropriate in intent but has parameter errors.\n\nParameter Grounding:\n- 'restaurant_name': \"The French Bistro\" is explicitly grounded.\n- 'date': \"2025-12-18\" is correctly inferred from \"tomorrow\".\n- 'time': \"19:00\" is explicitly grounded in \"7 PM\".\n- 'party_size': Value is 4, but user explicitly said \"2 people\" - this is a contradiction.\n- 'dietary_restrictions': [\"vegetarian\"] is completely ungrounded - user never mentioned dietary restrictions.\n\nOverall Correctness: Significant issues with party_size contradiction and hallucinated dietary_restrictions.", + "output": 2, + "confidence": 0.9, + "correction": { + "corrected_tool_call": { + "name": "book_restaurant", + "arguments": { + "restaurant_name": "The French Bistro", + "date": "2025-12-18", + "time": "19:00", + "party_size": 2 + } + } + }, + "actionable_recommendations": [ + { + "recommendation": "SYSTEM_PROMPT_INSTRUCTION", + "details": "Add instruction to verify numbers carefully.", + "quote": "When users provide specific quantities (party size, number of guests, passenger count, etc.), use the exact number stated. Never modify or assume different quantities unless explicitly confirmed by the user." 
+ } + ] + } + } + ] } -] +] \ No newline at end of file diff --git a/altk/pre_tool/sparc/function_calling/metrics/function_call/general_metrics_runtime.json b/altk/pre_tool/sparc/function_calling/metrics/function_call/general_metrics_runtime.json index a07109a..cd64548 100644 --- a/altk/pre_tool/sparc/function_calling/metrics/function_call/general_metrics_runtime.json +++ b/altk/pre_tool/sparc/function_calling/metrics/function_call/general_metrics_runtime.json @@ -576,5 +576,257 @@ } } ] + }, + { + "name": "general_conversation_grounded_correctness", + "task_description": "Evaluate the overall correctness and appropriateness of the tool call based SOLELY on the conversation context, WITHOUT requiring explicit tool specifications. This metric assesses whether the tool call is a logical, non-redundant action that is fully grounded in the conversation history.\n\n**IMPORTANT**: If tool specifications appear in the conversation (e.g., in system prompts or assistant messages), you MAY use them to enhance your assessment. However, your primary focus should be on conversation grounding.\n\n---\n\n### 1. Tool Call Appropriateness\n\nAssess whether the tool call makes sense given the conversation flow:\n\n- **Intent Alignment**: Does the tool call directly address the user's request or a logical next step?\n- **Non-Redundancy**: Is this tool call necessary, or has equivalent information already been obtained?\n- **Timing**: Are there prerequisite actions that should have been completed first?\n- **Relevance**: Is the chosen function appropriate for the current conversation context?\n\n---\n\n### 2. Conversation Grounding Hierarchy\n\nParameter values should be grounded in the following priority order:\n\n1. **Tool Outputs** (highest priority): Values from previous tool call results\n2. **Explicit User Statements**: Direct mentions in user messages\n3. **Confirmed Assistant Statements**: Information the assistant provided that the user acknowledged\n4. **Logical Inference**: Values that can be reasonably inferred from context (use cautiously)\n\n**Key Principle**: When there's a conflict between sources, tool outputs take precedence over user/assistant statements, as they represent ground truth from the system.\n\n---\n\n### 3. Parameter Value Assessment\n\nFor each parameter in the tool call:\n\n- **Explicit Grounding**: Is the value directly stated in the conversation?\n- **Implicit Grounding**: Can the value be reasonably inferred from context?\n- **Assumption Detection**: Is the value assumed without any conversational basis?\n- **Conflict Resolution**: If multiple sources provide different values, is the most authoritative source used?\n\n---\n\n### 4. Format and Consistency (Conversation-Based)\n\nEven without formal specifications:\n\n- **Internal Consistency**: Do parameter values make logical sense together (e.g., end_date after start_date)?\n- **Observable Patterns**: If tool specifications are mentioned in the conversation, do values conform to described patterns?\n- **Common Sense Validation**: Do values follow basic logical constraints (e.g., no negative quantities for counts)?\n\n---\n\n### 5. 
Prerequisite Satisfaction\n\nCheck if the tool call has all necessary preconditions:\n\n- **Dependent Information**: Is all required information available from the conversation or prior tool calls?\n- **State Requirements**: Based on conversation context, is the system in the right state for this call?\n- **Missing Steps**: Are there obvious gaps in the conversation that should be addressed first?\n\n---\n\n### Correctness Rating Scale\n\n- **5 - Fully Correct**: Tool call is perfectly appropriate, all parameters are explicitly or clearly inferably grounded in conversation, no redundancy, all prerequisites satisfied\n- **4 - Mostly Correct**: Tool call is appropriate and well-grounded, with only minor implicit inferences that are very reasonable\n- **3 - Partially Correct**: Tool call is generally appropriate but has some parameters with weak grounding or minor timing/relevance issues\n- **2 - Problematic**: Tool call has significant issues—multiple ungrounded parameters, questionable relevance, or missing prerequisites\n- **1 - Incorrect**: Tool call is inappropriate, redundant, or has parameters that are clearly fabricated or contradict conversation context\n\n---\n\n### Evaluation Guidelines\n\n- **Conservative Assessment**: When grounding is unclear, prefer lower scores\n- **Prioritize Tool Outputs**: Always favor values from tool results over human statements\n- **Consider Conversation Flow**: Assess whether the tool call makes sense in the sequence of events\n- **Watch for Hallucinations**: Flag any parameters that appear without conversational basis\n- **Use Available Specs**: If tool specifications appear in the conversation, incorporate them into your assessment\n\nYour evaluation should reflect a holistic view of whether a human reviewing the conversation would consider this tool call to be correct, appropriate, and fully justified by the available information.", + "jsonschema": { + "title": "general_conversation_grounded_correctness", + "description": "Holistic assessment of tool call correctness based on conversation grounding, appropriateness, and logical consistency without requiring explicit tool specifications.", + "type": "object", + "additionalProperties": false, + "properties": { + "evidence": { + "type": "string", + "description": "Provide EXACT quotes from the conversation that support or contradict the tool call. Include: (1) relevant user messages, (2) assistant messages, (3) prior tool outputs, and (4) any tool specifications mentioned in the conversation. Organize evidence by parameter and by aspect (appropriateness, grounding, prerequisites)." + }, + "explanation": { + "type": "string", + "description": "Provide an analysis covering: (1) Tool Call Appropriateness - why this tool call makes sense (or doesn't) in the conversation flow, (2) Parameter Grounding - for EACH parameter, explain its grounding source and quality, noting conflicts between sources and resolution, (3) Prerequisites - whether all necessary preconditions are satisfied, (4) Format and Consistency - whether parameters make logical sense together, (5) Overall Correctness - summary judgment on whether the tool call is fully justified by the conversation." + }, + "output": { + "type": "integer", + "minimum": 1, + "maximum": 5, + "threshold_low": 4, + "threshold_high": 5, + "description": "Overall Correctness Score (1-5):\n\n5 - Fully Correct: Tool call is the right action at the right time. 
All parameters are explicitly grounded in conversation (user messages, confirmed assistant statements, or tool outputs). No redundancy. All prerequisites satisfied. If tool specs are in conversation, the call aligns with them.\nExample: User asks \"What's the weather in Boston?\", assistant calls get_weather(location=\"Boston\") with no prior weather call for Boston.\n\n4 - Mostly Correct: Tool call is appropriate and timely. Most parameters are explicitly grounded, a few may be reasonably inferred from clear context. No significant issues. Prerequisites satisfied.\nExample: User says \"Book a table for two at The French Bistro tonight at 7 PM\", assistant calls book_restaurant(restaurant_name=\"The French Bistro\", date=\"2025-12-17\", time=\"19:00\", party_size=2) where the date is inferred from 'tonight' and the remaining values are stated explicitly.\n\n3 - Partially Correct: Tool call is generally appropriate but has noticeable issues—some parameters are weakly grounded, minor relevance concerns, or unclear if prerequisites are met. The call might work but has questionable elements.\nExample: User says \"I need a hotel\", assistant calls book_hotel(location=\"New York\", check_in=\"2025-12-20\") where location and dates aren't mentioned in conversation.\n\n2 - Problematic: Tool call has significant correctness issues—multiple parameters lack grounding, the timing is questionable, important prerequisites are missing, or the call seems partially redundant. Likely to fail or produce incorrect results.\nExample: User asks \"What flights are available?\", but assistant calls book_flight(...) with specific dates and destinations not discussed, skipping the search step.\n\n1 - Incorrect: Tool call is inappropriate, clearly redundant, or fundamentally misaligned with conversation. Parameters are fabricated or contradict known information. Missing critical prerequisites. The call should not be made.\nExample: User asks about restaurant recommendations, assistant calls cancel_reservation(reservation_id=\"12345\") with no prior discussion of cancellations or that ID." + }, + "confidence": { + "type": "number", + "minimum": 0, + "maximum": 1, + "threshold_low": 0, + "threshold_high": 1, + "description": "Confidence in the accuracy of this assessment (range: 0.0-1.0). Higher confidence when conversation context is clear and complete. Lower confidence when context is ambiguous, conversation is incomplete, or tool specifications are not available." + }, + "correction": { + "type": "object", + "description": "Corrections for the tool call. For correct calls (output >= 4), return {}. For problematic calls (output <= 3), provide either need_more_information OR corrected_tool_call.\n\nCRITICAL FORMAT REQUIREMENT: When providing corrected_tool_call, the 'arguments' field MUST be a JSON object/dict (e.g., {\"key\": \"value\"}), NOT a JSON string (e.g., NOT \"{\\\"key\\\": \\\"value\\\"}\").", + "properties": { + "need_more_information": { + "type": "array", + "description": "List of questions to ask the user if information is missing. Each question should be specific and actionable.", + "items": { + "type": "string" + } + }, + "corrected_tool_call": { + "type": "object", + "description": "Complete corrected tool call with all issues addressed. Only provide if correction is possible without additional user input. IMPORTANT: The 'arguments' field must be a JSON object (dict), not a JSON string.", + "properties": { + "name": { + "type": "string", + "description": "Name of the function to call." 
+ }, + "arguments": { + "type": "object", + "description": "Corrected arguments for the function call as a JSON object (not a string). Example: {\"param1\": \"value1\", \"param2\": 123}", + "additionalProperties": true + } + }, + "required": ["name", "arguments"] + } + }, + "required": [] + } + }, + "required": [ + "evidence", + "explanation", + "output", + "confidence", + "correction" + ] + }, + "examples": [ + { + "user_kwargs": { + "conversation_context": [ + { + "role": "user", + "content": "What's the weather like in Boston?" + } + ], + "tool_call": { + "id": "call_001", + "type": "function", + "function": { + "name": "get_weather", + "arguments": "{\"location\": \"Boston\"}" + } + } + }, + "output": { + "evidence": "User message: \"What's the weather like in Boston?\"\nTool call parameter: location=\"Boston\"\nNo prior tool calls in conversation.", + "explanation": "Tool Call Appropriateness: The tool call directly addresses the user's explicit request for weather information. It's timely, relevant, and non-redundant.\n\nParameter Grounding: The 'location' parameter value \"Boston\" is explicitly mentioned in the user's question, providing perfect grounding.\n\nPrerequisites: No prerequisites are needed for a weather query beyond having the location, which is provided.\n\nFormat and Consistency: The location parameter is a simple string matching the user's input, internally consistent.\n\nOverall Correctness: This tool call is perfectly appropriate and fully grounded in the conversation.", + "output": 5, + "confidence": 0.98, + "correction": {} + } + }, + { + "user_kwargs": { + "conversation_context": [ + { + "role": "user", + "content": "I want to book a flight." + }, + { + "role": "assistant", + "content": "I'd be happy to help you book a flight. Could you please provide the departure city, destination, and travel dates?" + } + ], + "tool_call": { + "id": "call_002", + "type": "function", + "function": { + "name": "book_flight", + "arguments": "{\"departure\": \"New York\", \"destination\": \"Los Angeles\", \"date\": \"2025-12-25\"}" + } + } + }, + "output": { + "evidence": "User message: \"I want to book a flight.\"\nAssistant message: \"Could you please provide the departure city, destination, and travel dates?\"\nTool call parameters: departure=\"New York\", destination=\"Los Angeles\", date=\"2025-12-25\"\nNo user response providing the requested information.", + "explanation": "Tool Call Appropriateness: The tool call is premature and inappropriate. The assistant explicitly asked the user for departure city, destination, and dates, but the user has not yet provided this information. Making the booking call now is incorrect.\n\nParameter Grounding: All three parameters (departure, destination, date) are completely ungrounded. They do not appear anywhere in the conversation and represent pure hallucination. The user only expressed a general intent to book a flight.\n\nPrerequisites: The critical prerequisite—obtaining the necessary booking information from the user—has not been satisfied. The assistant asked for this information but didn't wait for the response.\n\nFormat and Consistency: While the parameters themselves are internally consistent (valid city names and date format), they have no basis in reality for this conversation.\n\nOverall Correctness: This tool call is fundamentally incorrect and should not be made. 
The system should wait for user input before proceeding.", + "output": 1, + "confidence": 0.95, + "correction": { + "need_more_information": [ + "What city will you be departing from?", + "What is your destination?", + "What date would you like to travel?" + ] + } + } + }, + { + "user_kwargs": { + "conversation_context": [ + { + "role": "user", + "content": "Get me the user profile for the logged-in user." + }, + { + "role": "assistant", + "content": null, + "tool_calls": [ + { + "id": "call_001", + "type": "function", + "function": { + "name": "get_current_user_id", + "arguments": "{}" + } + } + ] + }, + { + "role": "tool", + "tool_call_id": "call_001", + "content": "{\"user_id\": 42, \"username\": \"john_doe\"}" + }, + { + "role": "assistant", + "content": "I found that you're logged in as john_doe (user ID: 42). Let me get your full profile now." + } + ], + "tool_call": { + "id": "call_002", + "type": "function", + "function": { + "name": "get_user_profile", + "arguments": "{\"user_id\": 42}" + } + } + }, + "output": { + "evidence": "User message: \"Get me the user profile for the logged-in user.\"\nPrior tool call: get_current_user_id()\nTool output: {\"user_id\": 42, \"username\": \"john_doe\"}\nAssistant confirmation: \"you're logged in as john_doe (user ID: 42)\"\nCurrent tool call: get_user_profile(user_id=42)", + "explanation": "Tool Call Appropriateness: The tool call is perfectly appropriate and represents the logical next step. The user requested the profile for the logged-in user. The assistant first identified the user (ID: 42) via a tool call, confirmed this with the user, and is now proceeding to fetch the full profile.\n\nParameter Grounding: The 'user_id' parameter value of 42 is grounded in the tool output from get_current_user_id. This is the highest-quality grounding source (tool output), taking precedence over any human statements.\n\nPrerequisites: All prerequisites are satisfied—the user ID was obtained through a prior tool call, which is the proper way to get this information before fetching the profile.\n\nFormat and Consistency: The user_id is an integer (42) which is appropriate for an ID field. It's consistent with the value returned from the authentication check.\n\nOverall Correctness: This tool call demonstrates excellent conversation flow—proper sequencing, authoritative grounding from tool output, and clear alignment with user intent.", + "output": 5, + "confidence": 0.97, + "correction": {} + } + }, + { + "user_kwargs": { + "conversation_context": [ + { + "role": "user", + "content": "Book a table for dinner tonight at 7 PM." + } + ], + "tool_call": { + "id": "call_003", + "type": "function", + "function": { + "name": "book_restaurant", + "arguments": "{\"date\": \"2025-12-17\", \"time\": \"19:00\", \"party_size\": 2}" + } + } + }, + "output": { + "evidence": "User message: \"Book a table for dinner tonight at 7 PM.\"\nTool call parameters: date=\"2025-12-17\", time=\"19:00\", party_size=2\nCurrent date context: December 17, 2025\nNo mention of restaurant name or party size in conversation.", + "explanation": "Tool Call Appropriateness: The tool call is generally appropriate as it responds to the user's booking request. However, it's missing a critical piece of information—which restaurant to book.\n\nParameter Grounding:\n- 'date': The value \"2025-12-17\" is a reasonable inference from \"tonight\" given the current date context. 
This is a clear implicit inference (score: 4 level).\n- 'time': The value \"19:00\" (7 PM in 24-hour format) is explicitly grounded in the user's statement \"7 PM\". This is well-grounded.\n- 'party_size': The value 2 is an assumption with no basis in the conversation. The user didn't mention how many people. This is a hallucination.\n\nPrerequisites: The restaurant name/location is missing, which is typically a required prerequisite for booking. The user should specify which restaurant.\n\nFormat and Consistency: The parameters are internally consistent (date is today, time is evening, party size is positive), but the party_size lacks grounding.\n\nOverall Correctness: The tool call has mixed quality—some parameters are well-grounded while others are assumed. The missing restaurant information is also problematic.", + "output": 3, + "confidence": 0.85, + "correction": { + "need_more_information": [ + "Which restaurant would you like to book?", + "How many people will be dining?" + ] + } + } + }, + { + "user_kwargs": { + "conversation_context": [ + { + "role": "user", + "content": "I need a reservation at The French Bistro for 2 people tomorrow at 7 PM." + }, + { + "role": "assistant", + "content": "I'll book a table for you at The French Bistro." + } + ], + "tool_call": { + "id": "call_004", + "type": "function", + "function": { + "name": "book_restaurant", + "arguments": "{\"restaurant_name\": \"The French Bistro\", \"date\": \"2025-12-18\", \"time\": \"19:00\", \"party_size\": 4, \"dietary_restrictions\": [\"vegetarian\"]}" + } + } + }, + "output": { + "evidence": "User message: \"I need a reservation at The French Bistro for 2 people tomorrow at 7 PM.\"\nTool call parameters: restaurant_name=\"The French Bistro\", date=\"2025-12-18\", time=\"19:00\", party_size=4, dietary_restrictions=[\"vegetarian\"]\nCurrent date context: December 17, 2025", + "explanation": "Tool Call Appropriateness: The tool call is appropriate in intent but has parameter errors.\n\nParameter Grounding:\n- 'restaurant_name': \"The French Bistro\" is explicitly grounded.\n- 'date': \"2025-12-18\" is correctly inferred from \"tomorrow\".\n- 'time': \"19:00\" is explicitly grounded in \"7 PM\".\n- 'party_size': Value is 4, but user explicitly said \"2 people\" - this is a contradiction.\n- 'dietary_restrictions': [\"vegetarian\"] is completely ungrounded - user never mentioned dietary restrictions.\n\nOverall Correctness: Significant issues with party_size contradiction and hallucinated dietary_restrictions.", + "output": 2, + "confidence": 0.9, + "correction": { + "corrected_tool_call": { + "name": "book_restaurant", + "arguments": { + "restaurant_name": "The French Bistro", + "date": "2025-12-18", + "time": "19:00", + "party_size": 2 + } + } + } + } + } + ] } -] +] \ No newline at end of file diff --git a/altk/pre_tool/sparc/function_calling/metrics/loader.py b/altk/pre_tool/sparc/function_calling/metrics/loader.py index 0b338e4..2ffb4b7 100644 --- a/altk/pre_tool/sparc/function_calling/metrics/loader.py +++ b/altk/pre_tool/sparc/function_calling/metrics/loader.py @@ -15,6 +15,7 @@ from altk.pre_tool.sparc.metrics import Metric, MetricPrompt from altk.pre_tool.sparc.function_calling.metrics.function_call.general import ( GeneralMetricsPrompt, + GeneralMetricsPromptNoSpec, ) from altk.pre_tool.sparc.function_calling.metrics.function_selection.function_selection import ( FunctionSelectionPrompt, @@ -162,10 +163,20 @@ def load_prompts_from_list( except Exception as e: raise LoaderError(f"Record {idx} invalid schema: 
{e}") from e + # Detect tool-spec-free metrics and use appropriate prompt class + metric_name = rec.get("name", "") + from altk.pre_tool.core.consts import TOOL_SPEC_FREE_METRICS + + actual_prompt_cls = PromptCls + if kind == PromptKind.GENERAL and metric_name in TOOL_SPEC_FREE_METRICS: + actual_prompt_cls = GeneralMetricsPromptNoSpec + try: - prompt = PromptCls(metric=metric, task_description=rec["task_description"]) + prompt = actual_prompt_cls( + metric=metric, task_description=rec["task_description"] + ) except TypeError: - prompt = PromptCls(metric=metric) + prompt = actual_prompt_cls(metric=metric) for ex_idx, ex in enumerate(examples, start=1): try: diff --git a/altk/pre_tool/sparc/function_calling/pipeline/pipeline.py b/altk/pre_tool/sparc/function_calling/pipeline/pipeline.py index 188d6d5..8ab6d8f 100644 --- a/altk/pre_tool/sparc/function_calling/pipeline/pipeline.py +++ b/altk/pre_tool/sparc/function_calling/pipeline/pipeline.py @@ -65,6 +65,7 @@ def __init__( transform_enabled: Optional[bool] = False, runtime_pipeline: Optional[bool] = True, use_examples: Optional[bool] = True, + skip_static_checks: Optional[bool] = False, ): self.metrics_client = metrics_client if codegen_client is None: @@ -75,6 +76,7 @@ def __init__( self.general_metrics = general_metrics self.function_metrics = function_metrics self.parameter_metrics = parameter_metrics + self.skip_static_checks = skip_static_checks metrics_definitions = [] @@ -159,6 +161,29 @@ def __init__( transform_enabled=transform_enabled, ) + # 4) Auto-detect if we should skip static checks + # Skip if: only tool-spec-free metrics are used and no function/parameter metrics + if not self.skip_static_checks: + from altk.pre_tool.core.consts import TOOL_SPEC_FREE_METRICS + + # Check if only general metrics with tool-spec-free metrics are active + only_general = ( + gen_defs is not None and fun_defs is None and par_defs is None + ) + if only_general and gen_defs: + # Check if all general metrics are tool-spec-free + all_tool_spec_free = all( + metric.get("name") in TOOL_SPEC_FREE_METRICS for metric in gen_defs + ) + if all_tool_spec_free: + self._auto_skip_static = True + else: + self._auto_skip_static = False + else: + self._auto_skip_static = False + else: + self._auto_skip_static = False + @staticmethod def static_only( inventory: List[ToolSpec], @@ -237,7 +262,16 @@ def run_sync( """ Full sync pipeline: static -> semantic -> assemble PipelineResult. """ - static_res = self.static_only(inventory, call) + # Auto-skip static checks if only tool-spec-free metrics and empty inventory + should_skip_static = ( + self.skip_static_checks or self._auto_skip_static + ) and not inventory + + if should_skip_static: + # Create empty static result when skipping + static_res = StaticResult(metrics={}, final_decision=True) + else: + static_res = self.static_only(inventory, call) if not static_res.final_decision and not continue_on_static: inputs = FunctionCallInput( @@ -284,7 +318,16 @@ async def run_async( """ Full async pipeline: static -> semantic -> assemble PipelineResult. 
""" - static_res = self.static_only(inventory, call) + # Auto-skip static checks if only tool-spec-free metrics and empty inventory + should_skip_static = ( + self.skip_static_checks or self._auto_skip_static + ) and not inventory + + if should_skip_static: + # Create empty static result when skipping + static_res = StaticResult(metrics={}, final_decision=True) + else: + static_res = self.static_only(inventory, call) if not static_res.final_decision and not continue_on_static: inputs = FunctionCallInput( diff --git a/altk/pre_tool/sparc/function_calling/pipeline/semantic_checker.py b/altk/pre_tool/sparc/function_calling/pipeline/semantic_checker.py index 039015e..4906f27 100644 --- a/altk/pre_tool/sparc/function_calling/pipeline/semantic_checker.py +++ b/altk/pre_tool/sparc/function_calling/pipeline/semantic_checker.py @@ -41,6 +41,7 @@ GeneralMetricsPrompt, FunctionSelectionPrompt, ParameterMetricsPrompt, + GeneralMetricsPromptNoSpec, ) @@ -97,6 +98,9 @@ def __init__( ) def _make_adapter(self, apis_specs, tool_call): + # Handle empty specs for tool-spec-free metrics + if not apis_specs: + return OpenAIAdapter([], tool_call) first = apis_specs[0] if isinstance(first, ToolSpec): return OpenAIAdapter(apis_specs, tool_call) @@ -204,8 +208,10 @@ def run_sync( tools_inventory_summary = adapter.get_tools_inventory_summary() call_dict = adapter.get_call_dict() fn_name = adapter.get_function_name() - cur_tool_spec = adapter.get_tool_spec(fn_name) - params = self._collect_params(adapter) + + # Handle empty inventory for tool-spec-free metrics + cur_tool_spec = adapter.get_tool_spec(fn_name) if apis_specs else {} + params = self._collect_params(adapter) if apis_specs else {} if transform_enabled is not None: old_transform_enabled = self.transform_enabled @@ -213,18 +219,22 @@ def run_sync( # 2) GENERAL METRICS general_results: Optional[SemanticCategoryResult] - entries: List[Tuple[GeneralMetricsPrompt, Dict[str, Any]]] = [] + entries: List[ + Tuple[ + Union[GeneralMetricsPrompt, GeneralMetricsPromptNoSpec], Dict[str, Any] + ] + ] = [] for prompt in self.general_prompts: - entries.append( - ( - prompt, - { - "conversation_context": context, - "tool_inventory": cur_tool_spec, - "tool_call": call_dict, - }, - ) - ) + # Build kwargs based on whether prompt needs tool spec + kwargs = { + "conversation_context": context, + "tool_call": call_dict, + } + # Only include tool_inventory if prompt expects it (not tool-spec-free) + if isinstance(prompt, GeneralMetricsPrompt): + kwargs["tool_inventory"] = cur_tool_spec + + entries.append((prompt, kwargs)) if entries: try: runner = MetricRunner(entries) @@ -454,8 +464,10 @@ async def run_async( tools_inventory_summary = adapter.get_tools_inventory_summary() call_dict = adapter.get_call_dict() fn_name = adapter.get_function_name() - cur_tool_spec = adapter.get_tool_spec(fn_name) - params = self._collect_params(adapter) + + # Handle empty inventory for tool-spec-free metrics + cur_tool_spec = adapter.get_tool_spec(fn_name) if apis_specs else {} + params = self._collect_params(adapter) if apis_specs else {} # Handle optional override of transform_enabled if transform_enabled is not None: @@ -464,20 +476,24 @@ async def run_async( # 2) GENERAL METRICS general_results: SemanticCategoryResult = {} - general_entries: List[Tuple[GeneralMetricsPrompt, Dict[str, Any]]] = [] + general_entries: List[ + Tuple[ + Union[GeneralMetricsPrompt, GeneralMetricsPromptNoSpec], Dict[str, Any] + ] + ] = [] general_async_results: List[MetricRunResult] = [] for prompt in 
self.general_prompts: - general_entries.append( - ( - prompt, - { - "conversation_context": context, - "tool_inventory": cur_tool_spec, - "tool_call": call_dict, - }, - ) - ) + # Build kwargs based on whether prompt needs tool spec + kwargs = { + "conversation_context": context, + "tool_call": call_dict, + } + # Only include tool_inventory if prompt expects it (not tool-spec-free) + if isinstance(prompt, GeneralMetricsPrompt): + kwargs["tool_inventory"] = cur_tool_spec + + general_entries.append((prompt, kwargs)) # 3) FUNCTION-SELECTION METRICS function_results: SemanticCategoryResult = {} @@ -532,9 +548,9 @@ async def run_async( # Split the results back into categories for entry, result in zip(all_entries, async_results): prompt_obj, ctx_dict = entry - if isinstance(prompt_obj, GeneralMetricsPrompt) and isinstance( - result, MetricRunResult - ): + if isinstance( + prompt_obj, (GeneralMetricsPrompt, GeneralMetricsPromptNoSpec) + ) and isinstance(result, MetricRunResult): general_async_results.append(result) elif isinstance(prompt_obj, FunctionSelectionPrompt) and isinstance( result, MetricRunResult diff --git a/altk/pre_tool/sparc/sparc.py b/altk/pre_tool/sparc/sparc.py index b283af9..403c1bf 100644 --- a/altk/pre_tool/sparc/sparc.py +++ b/altk/pre_tool/sparc/sparc.py @@ -370,6 +370,7 @@ def _process_pipeline_result( ) -> SPARCReflectionResult: """Process the pipeline result into a structured reflection result.""" issues = [] + has_errors = False decision = SPARCReflectionDecision.APPROVE # Check static issues @@ -386,16 +387,30 @@ def _process_pipeline_result( ) ) - # Check semantic issues + # Check semantic issues and errors if pipeline_result.semantic: - # Function selection metrics + # Function selection metrics - check for errors and issues function_selection_issues = False if pipeline_result.semantic.function_selection: for ( metric_name, metric_result, ) in pipeline_result.semantic.function_selection.metrics.items(): - if hasattr(metric_result, "is_issue") and metric_result.is_issue: + # Check for errors first + if hasattr(metric_result, "error") and metric_result.error: + has_errors = True + logger.error( + f"Error in function selection metric '{metric_name}': {metric_result.error}" + ) + issues.append( + SPARCReflectionIssue( + issue_type=SPARCReflectionIssueType.ERROR, + metric_name=f"function_selection.{metric_name}", + explanation=f"LLM execution error in function selection metric: {metric_result.error}", + correction=None, + ) + ) + elif hasattr(metric_result, "is_issue") and metric_result.is_issue: function_selection_issues = True issues.append( SPARCReflectionIssue( @@ -407,14 +422,29 @@ def _process_pipeline_result( correction=metric_result.raw_response.get("correction"), ) ) + if not function_selection_issues: - # General metrics + # General metrics - check for errors and issues if pipeline_result.semantic.general: for ( metric_name, metric_result, ) in pipeline_result.semantic.general.metrics.items(): - if ( + # Check for errors first + if hasattr(metric_result, "error") and metric_result.error: + has_errors = True + logger.error( + f"Error in general metric '{metric_name}': {metric_result.error}" + ) + issues.append( + SPARCReflectionIssue( + issue_type=SPARCReflectionIssueType.ERROR, + metric_name=f"general.{metric_name}", + explanation=f"LLM execution error in general metric: {metric_result.error}", + correction=None, + ) + ) + elif ( hasattr(metric_result, "is_issue") and metric_result.is_issue ): @@ -431,14 +461,28 @@ def _process_pipeline_result( ) ) - # Parameter 
metrics + # Parameter metrics - check for errors and issues if pipeline_result.semantic.parameter: for ( param_name, param_metrics, ) in pipeline_result.semantic.parameter.items(): for metric_name, metric_result in param_metrics.metrics.items(): - if ( + # Check for errors first + if hasattr(metric_result, "error") and metric_result.error: + has_errors = True + logger.error( + f"Error in parameter metric '{param_name}.{metric_name}': {metric_result.error}" + ) + issues.append( + SPARCReflectionIssue( + issue_type=SPARCReflectionIssueType.ERROR, + metric_name=f"parameter.{param_name}.{metric_name}", + explanation=f"LLM execution error in parameter metric: {metric_result.error}", + correction=None, + ) + ) + elif ( hasattr(metric_result, "is_issue") and metric_result.is_issue ): @@ -455,13 +499,27 @@ def _process_pipeline_result( ) ) - # Transform results + # Transform results - check for errors and corrections if pipeline_result.semantic.transform: for ( param_name, param_info, ) in pipeline_result.semantic.transform.items(): - if param_info.correction: + # Check for transformation errors + if hasattr(param_info, "error") and param_info.error: + has_errors = True + logger.error( + f"Error in transformation for parameter '{param_name}': {param_info.error}" + ) + issues.append( + SPARCReflectionIssue( + issue_type=SPARCReflectionIssueType.ERROR, + metric_name=f"transform.{param_name}", + explanation=f"Transformation error: {param_info.error}", + correction=None, + ) + ) + elif param_info.correction: issues.append( SPARCReflectionIssue( issue_type=SPARCReflectionIssueType.TRANSFORM, @@ -475,8 +533,13 @@ def _process_pipeline_result( ) ) - # Determine final decision - if issues: + # Determine final decision based on errors and issues + if has_errors: + decision = SPARCReflectionDecision.ERROR + logger.warning( + f"Pipeline execution resulted in ERROR decision due to {sum(1 for i in issues if i.issue_type == SPARCReflectionIssueType.ERROR)} error(s)" + ) + elif issues: decision = SPARCReflectionDecision.REJECT else: decision = SPARCReflectionDecision.APPROVE diff --git a/examples/langgraph_agent_sparc_example.py b/examples/langgraph_agent_sparc_example.py index 0125a3f..7e3675d 100644 --- a/examples/langgraph_agent_sparc_example.py +++ b/examples/langgraph_agent_sparc_example.py @@ -6,6 +6,7 @@ Note that this example will require installing langgraph and langchain-openai """ +import json import re from langchain_openai import ChatOpenAI from langgraph.graph import StateGraph, START, END @@ -13,6 +14,7 @@ from typing_extensions import Annotated from langgraph.prebuilt import ToolNode from langchain_core.messages import BaseMessage, HumanMessage +from langchain_core.messages.base import messages_to_dict import operator from typing import TypedDict, List @@ -106,10 +108,23 @@ class AgentState(TypedDict): def tool_pre_hook(state): # Creates a pre-tool node that runs the reflector, blocks and explains if the input is faulty + tool_calls = state["messages"][-1].tool_calls + formatted_tool_calls = [] + for call in tool_calls: + formatted_tool_calls.append( + { + "id": call["id"], + "type": "function", + "function": { + "name": call["name"], + "arguments": json.dumps(call["args"]), + }, + } + ) reflect_input = SPARCReflectionRunInput( - messages=state["messages"], + messages=messages_to_dict(state["messages"]), tool_specs=tool_specs, - tool_calls=state["messages"][-1].additional_kwargs["tool_calls"], + tool_calls=formatted_tool_calls, ) reflect_result = reflector.process(reflect_input, 
AgentPhase.RUNTIME) if reflect_result.output.reflection_result.decision == "approve": diff --git a/examples/langgraph_agent_sparc_example_streamlit.py b/examples/langgraph_agent_sparc_example_streamlit.py index 56ce875..2018179 100644 --- a/examples/langgraph_agent_sparc_example_streamlit.py +++ b/examples/langgraph_agent_sparc_example_streamlit.py @@ -7,6 +7,7 @@ Execute this demo with `streamlit run langgraph_agent_sparc_example_streamlit.py` """ +import json import re from langchain_openai import ChatOpenAI from langgraph.graph import StateGraph, START, END @@ -14,6 +15,7 @@ from typing_extensions import Annotated from langgraph.prebuilt import ToolNode from langchain_core.messages import BaseMessage, HumanMessage +from langchain_core.messages.base import messages_to_dict import operator from typing import TypedDict, List import streamlit as st @@ -108,16 +110,44 @@ class AgentState(TypedDict): def tool_pre_hook(state): if use_sparc: # Creates a pre-tool node that runs the reflector, blocks and explains if the input is faulty + tool_calls = state["messages"][-1].tool_calls + formatted_tool_calls = [] + for call in tool_calls: + formatted_tool_calls.append( + { + "id": call["id"], + "type": "function", + "function": { + "name": call["name"], + "arguments": json.dumps(call["args"]), + }, + } + ) reflect_input = SPARCReflectionRunInput( - messages=state["messages"], + messages=messages_to_dict(state["messages"]), tool_specs=tool_specs, - tool_calls=state["messages"][-1].additional_kwargs["tool_calls"], + tool_calls=formatted_tool_calls, ) reflect_result = reflector.process(reflect_input, AgentPhase.RUNTIME) if reflect_result.output.reflection_result.decision == "approve": print("✅ Tool call approved") return {"next": "call_tool"} - else: + elif reflect_result.output.reflection_result.decision == "error": + print("⚠️ ERROR: Validation encountered errors") + error_msg = "Validation failed due to system errors:\n" + error_issues = [ + i + for i in reflect_result.output.reflection_result.issues + if i.issue_type == "error" + ] + for issue in error_issues: + error_msg += f"\n - {issue.metric_name}: {issue.explanation}" + print(error_msg) + return { + "next": "final_message", + "messages": [HumanMessage(content=error_msg)], + } + else: # reject print("❌ Tool call rejected") issues = "Tool call rejected for the following reasons:" for issue in reflect_result.output.reflection_result.issues: diff --git a/tests/pre_tool/refraction/test_refraction.py b/tests/pre_tool/refraction/test_refraction.py index 9b67342..9ac92e7 100644 --- a/tests/pre_tool/refraction/test_refraction.py +++ b/tests/pre_tool/refraction/test_refraction.py @@ -7,7 +7,7 @@ ) from altk.pre_tool.refraction.refraction import RefractionComponent from altk.pre_tool.refraction.refraction import Refractor -from altk.pre_tool.core.types import ( +from altk.pre_tool.refraction.types import ( RefractionBuildInput, RefractionRunInput, RefractionRunOutput, diff --git a/tests/pre_tool/sparc/error_handling_test.py b/tests/pre_tool/sparc/error_handling_test.py new file mode 100644 index 0000000..39aeba2 --- /dev/null +++ b/tests/pre_tool/sparc/error_handling_test.py @@ -0,0 +1,557 @@ +import json +import os +import pytest +from unittest.mock import patch + +from altk.pre_tool.core import ( + SPARCReflectionRunInput, + SPARCReflectionDecision, + SPARCExecutionMode, + Track, +) +from altk.pre_tool.sparc import ( + SPARCReflectionComponent, +) +from altk.core.toolkit import AgentPhase, ComponentConfig +from altk.core.llm import get_llm +from 
altk.pre_tool.sparc.function_calling.pipeline.types import ( + PipelineResult, + FunctionCallInput, + SemanticResult, + SemanticCategoryResult, + SemanticMetricResult, + TransformResult, +) +from dotenv import load_dotenv + +load_dotenv() + + +class TestErrorHandling: + """Test suite for error handling functionality in SPARC.""" + + @pytest.fixture + def middleware(self): + """Create middleware instance for testing.""" + # Build ComponentConfig with WatsonX ValidatingLLMClient + WATSONX_CLIENT = get_llm("watsonx.output_val") + config = ComponentConfig( + llm_client=WATSONX_CLIENT( + model_id="meta-llama/llama-3-3-70b-instruct", + api_key=os.getenv("WX_API_KEY"), + project_id=os.getenv("WX_PROJECT_ID"), + url=os.getenv("WX_URL", "https://us-south.ml.cloud.ibm.com"), + ) + ) + return SPARCReflectionComponent( + config=config, + track=Track.FAST_TRACK, + execution_mode=SPARCExecutionMode.SYNC, + ) + + @pytest.fixture + def tool_specs(self): + """Basic tool specifications for testing.""" + return [ + { + "type": "function", + "function": { + "name": "get_weather", + "description": "Get current weather information", + "parameters": { + "type": "object", + "properties": { + "location": { + "type": "string", + "description": "City name", + }, + "units": { + "type": "string", + "enum": ["celsius", "fahrenheit"], + "description": "Temperature units", + }, + }, + "required": ["location"], + }, + }, + } + ] + + @pytest.fixture + def basic_messages(self): + """Basic conversation messages for testing.""" + return [ + {"role": "user", "content": "What's the weather in New York?"}, + {"role": "assistant", "content": "I'll check the weather for you."}, + ] + + @pytest.fixture + def basic_tool_call(self): + """Basic tool call for testing.""" + return { + "id": "1", + "type": "function", + "function": { + "name": "get_weather", + "arguments": json.dumps({"location": "New York"}), + }, + } + + def test_function_selection_metric_error( + self, middleware, tool_specs, basic_messages, basic_tool_call + ): + """Test error in function selection metric.""" + # Create a mock pipeline result with error in function selection metric + mock_metric = SemanticMetricResult( + metric_name="function_selection_appropriateness", + jsonschema={}, + prompt="test prompt", + raw_response={}, + numeric_thresholds_checks={}, + is_important=True, + importance_reason=None, + error="LLM timeout: Request timed out after 30 seconds", + is_correct=False, + is_issue=False, + ) + + mock_pipeline_result = PipelineResult( + inputs=FunctionCallInput( + conversation_context=basic_messages, + tools_inventory=tool_specs, + tool_call=basic_tool_call, + ), + static=None, + semantic=SemanticResult( + function_selection=SemanticCategoryResult( + metrics={"function_selection_appropriateness": mock_metric}, + final_decision=False, + ), + general=None, + parameter=None, + transform=None, + ), + overall_valid=False, + ) + + # Build input + run_input = SPARCReflectionRunInput( + messages=basic_messages, + tool_specs=tool_specs, + tool_calls=[basic_tool_call], + ) + + # Patch the pipeline to return our mock result + with patch.object( + middleware._pipeline, "run_sync", return_value=mock_pipeline_result + ): + result = middleware.process(run_input, phase=AgentPhase.RUNTIME) + + # Assert that decision is ERROR + assert result.output.reflection_result.decision == SPARCReflectionDecision.ERROR + + # Assert that error information is captured in issues + assert len(result.output.reflection_result.issues) > 0 + error_issues = [ + issue + for issue in 
result.output.reflection_result.issues + if "LLM timeout" in issue.explanation + or "error" in issue.explanation.lower() + ] + assert len(error_issues) > 0 + + def test_general_metric_error( + self, middleware, tool_specs, basic_messages, basic_tool_call + ): + """Test error in general semantic metric.""" + mock_metric = SemanticMetricResult( + metric_name="general_hallucination_check", + jsonschema={}, + prompt="test prompt", + raw_response={}, + numeric_thresholds_checks={}, + is_important=True, + importance_reason=None, + error="JSON parsing failed: Invalid response format", + is_correct=False, + is_issue=False, + ) + + mock_pipeline_result = PipelineResult( + inputs=FunctionCallInput( + conversation_context=basic_messages, + tools_inventory=tool_specs, + tool_call=basic_tool_call, + ), + static=None, + semantic=SemanticResult( + function_selection=None, + general=SemanticCategoryResult( + metrics={"general_hallucination_check": mock_metric}, + final_decision=False, + ), + parameter=None, + transform=None, + ), + overall_valid=False, + ) + + run_input = SPARCReflectionRunInput( + messages=basic_messages, + tool_specs=tool_specs, + tool_calls=[basic_tool_call], + ) + + with patch.object( + middleware._pipeline, "run_sync", return_value=mock_pipeline_result + ): + result = middleware.process(run_input, phase=AgentPhase.RUNTIME) + + assert result.output.reflection_result.decision == SPARCReflectionDecision.ERROR + assert len(result.output.reflection_result.issues) > 0 + + def test_parameter_metric_error( + self, middleware, tool_specs, basic_messages, basic_tool_call + ): + """Test error in parameter-level metric.""" + mock_metric = SemanticMetricResult( + metric_name="parameter_hallucination_check", + jsonschema={}, + prompt="test prompt", + raw_response={}, + numeric_thresholds_checks={}, + is_important=True, + importance_reason=None, + error="Model unavailable: Service temporarily unavailable", + is_correct=False, + is_issue=False, + ) + + mock_param_metrics = SemanticCategoryResult( + metrics={"parameter_hallucination_check": mock_metric}, + final_decision=False, + ) + + mock_pipeline_result = PipelineResult( + inputs=FunctionCallInput( + conversation_context=basic_messages, + tools_inventory=tool_specs, + tool_call=basic_tool_call, + ), + static=None, + semantic=SemanticResult( + function_selection=None, + general=None, + parameter={"location": mock_param_metrics}, + transform=None, + ), + overall_valid=False, + ) + + run_input = SPARCReflectionRunInput( + messages=basic_messages, + tool_specs=tool_specs, + tool_calls=[basic_tool_call], + ) + + with patch.object( + middleware._pipeline, "run_sync", return_value=mock_pipeline_result + ): + result = middleware.process(run_input, phase=AgentPhase.RUNTIME) + + assert result.output.reflection_result.decision == SPARCReflectionDecision.ERROR + assert len(result.output.reflection_result.issues) > 0 + + def test_transformation_error( + self, middleware, tool_specs, basic_messages, basic_tool_call + ): + """Test error in transformation execution.""" + mock_transform_info = TransformResult( + units={"user_units": "km", "spec_units": "m", "user_value": 5}, + generated_code="result = value * 1000", + execution_success=False, + correct=False, + execution_output=None, + correction=None, + error="Code execution failed: Division by zero in transformation", + ) + + mock_pipeline_result = PipelineResult( + inputs=FunctionCallInput( + conversation_context=basic_messages, + tools_inventory=tool_specs, + tool_call=basic_tool_call, + ), + static=None, + 
semantic=SemanticResult( + function_selection=None, + general=None, + parameter=None, + transform={"location": mock_transform_info}, + ), + overall_valid=False, + ) + + run_input = SPARCReflectionRunInput( + messages=basic_messages, + tool_specs=tool_specs, + tool_calls=[basic_tool_call], + ) + + with patch.object( + middleware._pipeline, "run_sync", return_value=mock_pipeline_result + ): + result = middleware.process(run_input, phase=AgentPhase.RUNTIME) + + assert result.output.reflection_result.decision == SPARCReflectionDecision.ERROR + assert len(result.output.reflection_result.issues) > 0 + + def test_mixed_errors_and_issues( + self, middleware, tool_specs, basic_messages, basic_tool_call + ): + """Test scenario with both errors and validation issues.""" + # Create metric with error + error_metric = SemanticMetricResult( + metric_name="general_hallucination_check", + jsonschema={}, + prompt="test prompt", + raw_response={}, + numeric_thresholds_checks={}, + is_important=True, + importance_reason=None, + error="LLM timeout", + is_correct=False, + is_issue=False, + ) + + # Create metric with error (not a validation issue) + issue_metric = SemanticMetricResult( + metric_name="function_selection_appropriateness", + jsonschema={}, + prompt="test prompt", + raw_response={}, + numeric_thresholds_checks={}, + is_important=True, + importance_reason=None, + error="Function selection error: Invalid function chosen", + is_correct=False, + is_issue=False, + ) + + mock_pipeline_result = PipelineResult( + inputs=FunctionCallInput( + conversation_context=basic_messages, + tools_inventory=tool_specs, + tool_call=basic_tool_call, + ), + static=None, + semantic=SemanticResult( + function_selection=SemanticCategoryResult( + metrics={"function_selection_appropriateness": issue_metric}, + final_decision=False, + ), + general=SemanticCategoryResult( + metrics={"general_hallucination_check": error_metric}, + final_decision=False, + ), + parameter=None, + transform=None, + ), + overall_valid=False, + ) + + run_input = SPARCReflectionRunInput( + messages=basic_messages, + tool_specs=tool_specs, + tool_calls=[basic_tool_call], + ) + + with patch.object( + middleware._pipeline, "run_sync", return_value=mock_pipeline_result + ): + result = middleware.process(run_input, phase=AgentPhase.RUNTIME) + + # ERROR should take precedence over REJECT when both errors and issues exist + assert result.output.reflection_result.decision == SPARCReflectionDecision.ERROR + assert len(result.output.reflection_result.issues) >= 2 + + def test_multiple_errors_in_different_stages( + self, middleware, tool_specs, basic_messages, basic_tool_call + ): + """Test multiple errors across different validation stages.""" + error_metric_1 = SemanticMetricResult( + metric_name="general_hallucination_check", + jsonschema={}, + prompt="test prompt", + raw_response={}, + numeric_thresholds_checks={}, + is_important=True, + importance_reason=None, + error="Error 1: LLM timeout", + is_correct=False, + is_issue=False, + ) + + error_metric_2 = SemanticMetricResult( + metric_name="parameter_hallucination_check", + jsonschema={}, + prompt="test prompt", + raw_response={}, + numeric_thresholds_checks={}, + is_important=True, + importance_reason=None, + error="Error 2: Invalid response format", + is_correct=False, + is_issue=False, + ) + + mock_param_metrics = SemanticCategoryResult( + metrics={"parameter_hallucination_check": error_metric_2}, + final_decision=False, + ) + + mock_pipeline_result = PipelineResult( + inputs=FunctionCallInput( + 
conversation_context=basic_messages, + tools_inventory=tool_specs, + tool_call=basic_tool_call, + ), + static=None, + semantic=SemanticResult( + function_selection=None, + general=SemanticCategoryResult( + metrics={"general_hallucination_check": error_metric_1}, + final_decision=False, + ), + parameter={"location": mock_param_metrics}, + transform=None, + ), + overall_valid=False, + ) + + run_input = SPARCReflectionRunInput( + messages=basic_messages, + tool_specs=tool_specs, + tool_calls=[basic_tool_call], + ) + + with patch.object( + middleware._pipeline, "run_sync", return_value=mock_pipeline_result + ): + result = middleware.process(run_input, phase=AgentPhase.RUNTIME) + + assert result.output.reflection_result.decision == SPARCReflectionDecision.ERROR + # Should have issues for both errors + assert len(result.output.reflection_result.issues) >= 2 + + def test_no_exception_on_error( + self, middleware, tool_specs, basic_messages, basic_tool_call + ): + """Test that errors are handled gracefully without raising exceptions.""" + mock_metric = SemanticMetricResult( + metric_name="general_hallucination_check", + jsonschema={}, + prompt="test prompt", + raw_response={}, + numeric_thresholds_checks={}, + is_important=True, + importance_reason=None, + error="Critical LLM failure", + is_correct=False, + is_issue=False, + ) + + mock_pipeline_result = PipelineResult( + inputs=FunctionCallInput( + conversation_context=basic_messages, + tools_inventory=tool_specs, + tool_call=basic_tool_call, + ), + static=None, + semantic=SemanticResult( + function_selection=None, + general=SemanticCategoryResult( + metrics={"general_hallucination_check": mock_metric}, + final_decision=False, + ), + parameter=None, + transform=None, + ), + overall_valid=False, + ) + + run_input = SPARCReflectionRunInput( + messages=basic_messages, + tool_specs=tool_specs, + tool_calls=[basic_tool_call], + ) + + # Should not raise any exception + with patch.object( + middleware._pipeline, "run_sync", return_value=mock_pipeline_result + ): + result = middleware.process(run_input, phase=AgentPhase.RUNTIME) + + assert result.output.reflection_result.decision == SPARCReflectionDecision.ERROR + + def test_error_messages_are_informative( + self, middleware, tool_specs, basic_messages, basic_tool_call + ): + """Test that error messages contain useful debugging information.""" + error_message = ( + "LLM API Error: Rate limit exceeded - Please try again in 60 seconds" + ) + + mock_metric = SemanticMetricResult( + metric_name="function_selection_appropriateness", + jsonschema={}, + prompt="test prompt", + raw_response={}, + numeric_thresholds_checks={}, + is_important=True, + importance_reason=None, + error=error_message, + is_correct=False, + is_issue=False, + ) + + mock_pipeline_result = PipelineResult( + inputs=FunctionCallInput( + conversation_context=basic_messages, + tools_inventory=tool_specs, + tool_call=basic_tool_call, + ), + static=None, + semantic=SemanticResult( + function_selection=SemanticCategoryResult( + metrics={"function_selection_appropriateness": mock_metric}, + final_decision=False, + ), + general=None, + parameter=None, + transform=None, + ), + overall_valid=False, + ) + + run_input = SPARCReflectionRunInput( + messages=basic_messages, + tool_specs=tool_specs, + tool_calls=[basic_tool_call], + ) + + with patch.object( + middleware._pipeline, "run_sync", return_value=mock_pipeline_result + ): + result = middleware.process(run_input, phase=AgentPhase.RUNTIME) + + assert result.output.reflection_result.decision == 
SPARCReflectionDecision.ERROR + + # Check that error message is included in issues + error_found = False + for issue in result.output.reflection_result.issues: + if "Rate limit exceeded" in issue.explanation: + error_found = True + break + + assert error_found, "Error message not found in issues"
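
Usage sketch (not part of the patch): a minimal end-to-end example of how the additions above might be exercised. It reuses the WatsonX client setup from the tests, and it assumes that `Track.SPEC_FREE` is the track value registered for the new spec-free configuration and that `tool_specs` may be passed as an empty list so the pipeline's auto-skip of static checks applies; any name or parameter not shown elsewhere in this patch is illustrative only, not a confirmed API.

import json
import os

from altk.core.llm import get_llm
from altk.core.toolkit import AgentPhase, ComponentConfig
from altk.pre_tool.core import SPARCExecutionMode, SPARCReflectionRunInput, Track
from altk.pre_tool.sparc import SPARCReflectionComponent

# Configure the reflector on the spec-free track: only the
# general_conversation_grounded_correctness metric runs, so no tool
# specifications are needed and static checks are skipped automatically
# when the inventory is empty.
WATSONX_CLIENT = get_llm("watsonx.output_val")
config = ComponentConfig(
    llm_client=WATSONX_CLIENT(
        model_id="meta-llama/llama-3-3-70b-instruct",
        api_key=os.getenv("WX_API_KEY"),
        project_id=os.getenv("WX_PROJECT_ID"),
        url=os.getenv("WX_URL", "https://us-south.ml.cloud.ibm.com"),
    )
)
reflector = SPARCReflectionComponent(
    config=config,
    track=Track.SPEC_FREE,  # assumption: spec-free track enum value
    execution_mode=SPARCExecutionMode.SYNC,
)

run_input = SPARCReflectionRunInput(
    messages=[{"role": "user", "content": "What's the weather like in Boston?"}],
    tool_specs=[],  # spec-free: no tool inventory available
    tool_calls=[
        {
            "id": "call_001",
            "type": "function",
            "function": {
                "name": "get_weather",
                "arguments": json.dumps({"location": "Boston"}),
            },
        }
    ],
)

result = reflector.process(run_input, AgentPhase.RUNTIME)
decision = result.output.reflection_result.decision
if decision == "approve":
    pass  # proceed with the tool call
elif decision == "error":
    # Execution errors occurred; issues with issue_type == "error" carry details.
    for issue in result.output.reflection_result.issues:
        print(issue.metric_name, issue.explanation)
else:
    # Rejected on grounding/correctness grounds; review the reported issues.
    for issue in result.output.reflection_result.issues:
        print(issue.metric_name, issue.explanation)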