diff --git a/src/uipath/_cli/_evals/_models/_evaluation_set.py b/src/uipath/_cli/_evals/_models/_evaluation_set.py
index 61390beba..40dbccbda 100644
--- a/src/uipath/_cli/_evals/_models/_evaluation_set.py
+++ b/src/uipath/_cli/_evals/_models/_evaluation_set.py
@@ -44,6 +44,15 @@ class LLMMockingStrategy(BaseMockingStrategy):
     )
 
 
+class InputMockingStrategy(BaseModel):
+    prompt: str = Field(..., alias="prompt")
+    model: Optional[ModelSettings] = Field(None, alias="model")
+
+    model_config = ConfigDict(
+        validate_by_name=True, validate_by_alias=True, extra="allow"
+    )
+
+
 class MockingArgument(BaseModel):
     args: List[Any] = Field(default_factory=lambda: [], alias="args")
     kwargs: Dict[str, Any] = Field(default_factory=lambda: {}, alias="kwargs")
@@ -110,6 +119,10 @@ class EvaluationItem(BaseModel):
         default=None,
         alias="mockingStrategy",
     )
+    input_mocking_strategy: Optional[InputMockingStrategy] = Field(
+        default=None,
+        alias="inputMockingStrategy",
+    )
 
 
 class EvaluationSet(BaseModel):
diff --git a/src/uipath/_cli/_evals/_runtime.py b/src/uipath/_cli/_evals/_runtime.py
index 43d8d66a8..d063beaa0 100644
--- a/src/uipath/_cli/_evals/_runtime.py
+++ b/src/uipath/_cli/_evals/_runtime.py
@@ -11,6 +11,10 @@
 from opentelemetry.sdk.trace import ReadableSpan, Span
 from opentelemetry.sdk.trace.export import SpanExporter, SpanExportResult
 
+from uipath._cli._evals.mocks.input_mocker import (
+    generate_llm_input,
+)
+
 from ..._events._event_bus import EventBus
 from ..._events._events import (
     EvalItemExceptionDetails,
@@ -318,6 +322,10 @@ async def _execute_eval(
         evaluators: List[BaseEvaluator[Any]],
         event_bus: EventBus,
     ) -> EvaluationRunResult:
+        # Generate LLM-based input if input_mocking_strategy is defined
+        if eval_item.input_mocking_strategy:
+            eval_item = await self._generate_input_for_eval(eval_item)
+
         set_execution_context(eval_item, self.span_collector)
 
         await event_bus.publish(
@@ -417,6 +425,16 @@ async def _execute_eval(
 
         return evaluation_run_results
 
+    async def _generate_input_for_eval(
+        self, eval_item: EvaluationItem
+    ) -> EvaluationItem:
+        """Use an LLM to generate a mock input for an evaluation item."""
+        # TODO(bai): get the input schema from the agent definition, once it is available there.
+        input_schema: dict[str, Any] = {}
+        generated_input = await generate_llm_input(eval_item, input_schema)
+        updated_eval_item = eval_item.model_copy(update={"inputs": generated_input})
+        return updated_eval_item
+
     def _get_and_clear_execution_data(
         self, execution_id: str
     ) -> tuple[List[ReadableSpan], list[logging.LogRecord]]:
diff --git a/src/uipath/_cli/_evals/mocks/input_mocker.py b/src/uipath/_cli/_evals/mocks/input_mocker.py
new file mode 100644
index 000000000..a7830e824
--- /dev/null
+++ b/src/uipath/_cli/_evals/mocks/input_mocker.py
@@ -0,0 +1,111 @@
+"""LLM Input Mocker implementation."""
+
+import json
+from datetime import datetime, timezone
+from typing import Any, Dict
+
+from uipath import UiPath
+from uipath._cli._evals._models._evaluation_set import EvaluationItem
+from uipath.tracing._traced import traced
+
+from .mocker import UiPathInputMockingError
+
+
+def get_input_mocking_prompt(
+    input_schema: str,
+    input_generation_instructions: str,
+    expected_behavior: str,
+    expected_output: str,
+) -> str:
+    """Generate the LLM input mocking prompt."""
+    current_datetime = datetime.now(timezone.utc).strftime("%Y-%m-%d %H:%M:%S")
+
+    return f"""You are simulating input for automated testing of an Agent as part of a simulation run.
+You will need to generate realistic input to an LLM agent which will call various tools to achieve a goal. This must be in the exact format of the INPUT_SCHEMA.
+You may need to follow specific INPUT_GENERATION_INSTRUCTIONS. If no relevant instructions are provided pertaining to input generation, use the other provided information and your own judgement to generate input.
+If the INPUT_GENERATION_INSTRUCTIONS are provided, you MUST follow them exactly. For example, if the instructions say to generate a value for a field to be before a certain calendar date, you must generate a value that is before that date.
+
+The current date and time is: {current_datetime}
+
+#INPUT_SCHEMA: You MUST OUTPUT THIS EXACT JSON SCHEMA
+{input_schema}
+#END_INPUT_SCHEMA
+
+#INPUT_GENERATION_INSTRUCTIONS
+{input_generation_instructions}
+#END_INPUT_GENERATION_INSTRUCTIONS
+
+#EXPECTED_BEHAVIOR
+{expected_behavior}
+#END_EXPECTED_BEHAVIOR
+
+#EXPECTED_OUTPUT
+{expected_output}
+#END_EXPECTED_OUTPUT
+
+Based on the above information, provide a realistic input to the LLM agent. Your response should:
+1. Match the expected input format according to the INPUT_SCHEMA exactly
+2. Be consistent with the style and level of detail in the example inputs
+3. Consider the context of the agent being tested
+4. Be realistic and representative of what a real user might say or ask
+
+OUTPUT: ONLY the simulated agent input in the exact format of the INPUT_SCHEMA in valid JSON. Do not include any explanations, quotation marks, or markdown."""
+
+
+@traced(name="__mocker__")
+async def generate_llm_input(
+    evaluation_item: EvaluationItem,
+    input_schema: Dict[str, Any],
+) -> Dict[str, Any]:
+    """Generate synthetic input using an LLM based on the evaluation context."""
+    try:
+        llm = UiPath().llm
+
+        prompt = get_input_mocking_prompt(
+            input_schema=json.dumps(input_schema, indent=2),
+            input_generation_instructions=evaluation_item.input_mocking_strategy.prompt
+            if evaluation_item.input_mocking_strategy
+            else "",
+            expected_behavior=evaluation_item.expected_agent_behavior or "",
+            expected_output=json.dumps(evaluation_item.expected_output, indent=2)
+            if evaluation_item.expected_output
+            else "",
+        )
+
+        response_format = {
+            "type": "json_schema",
+            "json_schema": {
+                "name": "agent_input",
+                "strict": True,
+                "schema": input_schema,
+            },
+        }
+
+        model_parameters = (
+            evaluation_item.input_mocking_strategy.model
+            if evaluation_item.input_mocking_strategy
+            else None
+        )
+        completion_kwargs = (
+            model_parameters.model_dump(by_alias=False, exclude_none=True)
+            if model_parameters
+            else {}
+        )
+
+        response = await llm.chat_completions(
+            [{"role": "user", "content": prompt}],
+            response_format=response_format,
+            **completion_kwargs,
+        )
+
+        generated_input_str = response.choices[0].message.content
+
+        return json.loads(generated_input_str)
+    except json.JSONDecodeError as e:
+        raise UiPathInputMockingError(
+            f"Failed to parse LLM response as JSON: {str(e)}"
+        ) from e
+    except UiPathInputMockingError:
+        raise
+    except Exception as e:
+        raise UiPathInputMockingError(f"Failed to generate input: {str(e)}") from e
diff --git a/src/uipath/_cli/_evals/mocks/mocker.py b/src/uipath/_cli/_evals/mocks/mocker.py
index aed040b3e..57cb8bcc3 100644
--- a/src/uipath/_cli/_evals/mocks/mocker.py
+++ b/src/uipath/_cli/_evals/mocks/mocker.py
@@ -33,3 +33,9 @@ class UiPathMockResponseGenerationError(Exception):
     """Exception when a mocker is configured unable to generate a response."""
 
     pass
+
+
+class UiPathInputMockingError(Exception):
"""Exception when input mocking fails.""" + + pass diff --git a/src/uipath/agent/_utils.py b/src/uipath/agent/_utils.py index 1ce644f88..7e04a62eb 100644 --- a/src/uipath/agent/_utils.py +++ b/src/uipath/agent/_utils.py @@ -4,7 +4,10 @@ from httpx import Response from pydantic import TypeAdapter -from uipath._cli._evals._models._evaluation_set import LLMMockingStrategy +from uipath._cli._evals._models._evaluation_set import ( + InputMockingStrategy, + LLMMockingStrategy, +) from uipath._cli._push.sw_file_handler import SwFileHandler from uipath._cli._utils._studio_project import ( ProjectFile, @@ -137,4 +140,14 @@ async def load_agent_definition(project_id: str) -> AgentDefinition: evaluation.mocking_strategy = LLMMockingStrategy( prompt=prompt, tools_to_simulate=tools_to_simulate ) + + if not evaluation.input_mocking_strategy: + # Migrate lowCode input mocking fields + if evaluation.model_extra.get("simulateInput", False): + prompt = evaluation.model_extra.get( + "inputGenerationInstructions", + ) + evaluation.input_mocking_strategy = InputMockingStrategy( + prompt=prompt + ) return agent_definition diff --git a/tests/cli/eval/mocks/test_input_mocker.py b/tests/cli/eval/mocks/test_input_mocker.py new file mode 100644 index 000000000..8d14361b7 --- /dev/null +++ b/tests/cli/eval/mocks/test_input_mocker.py @@ -0,0 +1,106 @@ +from typing import Any + +import pytest +from _pytest.monkeypatch import MonkeyPatch +from pytest_httpx import HTTPXMock + +from uipath._cli._evals._models._evaluation_set import ( + EvaluationItem, + InputMockingStrategy, + ModelSettings, +) +from uipath._cli._evals.mocks.input_mocker import generate_llm_input + + +@pytest.mark.asyncio +@pytest.mark.httpx_mock(assert_all_responses_were_requested=False) +async def test_generate_llm_input_with_model_settings( + httpx_mock: HTTPXMock, monkeypatch: MonkeyPatch +): + monkeypatch.setenv("UIPATH_URL", "https://example.com") + monkeypatch.setenv("UIPATH_ACCESS_TOKEN", "test-token") + + evaluation_item: dict[str, Any] = { + "id": "test-eval-id", + "name": "Test Input Generation", + "inputs": {}, + "expectedOutput": {"result": 35}, + "expectedAgentBehavior": "Agent should multiply the numbers", + "inputMockingStrategy": { + "prompt": "Generate a multiplication query with 5 and 7", + "model": { + "model": "gpt-4o-mini-2024-07-18", + "temperature": 0.5, + "maxTokens": 150, + }, + }, + "evalSetId": "test-eval-set-id", + "createdAt": "2025-09-04T18:54:58.378Z", + "updatedAt": "2025-09-04T18:55:55.416Z", + } + eval_item = EvaluationItem(**evaluation_item) + + assert isinstance(eval_item.input_mocking_strategy, InputMockingStrategy) + assert isinstance(eval_item.input_mocking_strategy.model, ModelSettings) + assert eval_item.input_mocking_strategy.model.model == "gpt-4o-mini-2024-07-18" + assert eval_item.input_mocking_strategy.model.temperature == 0.5 + assert eval_item.input_mocking_strategy.model.max_tokens == 150 + + input_schema = { + "type": "object", + "properties": { + "query": {"type": "string"}, + }, + "required": ["query"], + "additionalProperties": False, + } + + httpx_mock.add_response( + url="https://example.com/agenthub_/llm/api/capabilities", + status_code=200, + json={}, + ) + httpx_mock.add_response( + url="https://example.com/orchestrator_/llm/api/capabilities", + status_code=200, + json={}, + ) + + httpx_mock.add_response( + url="https://example.com/api/chat/completions?api-version=2024-08-01-preview", + status_code=200, + json={ + "role": "assistant", + "id": "response-id", + "object": "chat.completion", + 
"created": 0, + "model": "gpt-4o-mini-2024-07-18", + "choices": [ + { + "index": 0, + "message": { + "role": "assistant", + "content": '{"query": "Calculate 5 times 7"}', + "tool_calls": None, + }, + "finish_reason": "stop", + } + ], + "usage": { + "prompt_tokens": 100, + "completion_tokens": 20, + "total_tokens": 120, + }, + }, + ) + + result = await generate_llm_input(eval_item, input_schema) + + # Verify the mocked input is correct + assert result == {"query": "Calculate 5 times 7"} + + requests = httpx_mock.get_requests() + chat_completion_requests = [r for r in requests if "chat/completions" in str(r.url)] + assert len(chat_completion_requests) == 1, ( + "Expected exactly one chat completion request" + ) diff --git a/tests/cli/eval/mocks/test_mocks.py b/tests/cli/eval/mocks/test_mocks.py index 78b64b0f4..47fb92c67 100644 --- a/tests/cli/eval/mocks/test_mocks.py +++ b/tests/cli/eval/mocks/test_mocks.py @@ -136,6 +136,7 @@ async def foofoo(*args, **kwargs): assert await foo(x=2) == "bar1" +@pytest.mark.httpx_mock(assert_all_responses_were_requested=False) def test_llm_mockable_sync(httpx_mock: HTTPXMock, monkeypatch: MonkeyPatch): monkeypatch.setenv("UIPATH_URL", "https://example.com") monkeypatch.setenv("UIPATH_ACCESS_TOKEN", "1234567890")