13 changes: 13 additions & 0 deletions src/uipath/_cli/_evals/_models/_evaluation_set.py
@@ -44,6 +44,15 @@ class LLMMockingStrategy(BaseMockingStrategy):
)


class InputMockingStrategy(BaseModel):
prompt: str = Field(..., alias="prompt")
model: Optional[ModelSettings] = Field(None, alias="model")

model_config = ConfigDict(
validate_by_name=True, validate_by_alias=True, extra="allow"
)


class MockingArgument(BaseModel):
args: List[Any] = Field(default_factory=lambda: [], alias="args")
kwargs: Dict[str, Any] = Field(default_factory=lambda: {}, alias="kwargs")
@@ -110,6 +119,10 @@ class EvaluationItem(BaseModel):
default=None,
alias="mockingStrategy",
)
input_mocking_strategy: Optional[InputMockingStrategy] = Field(
default=None,
alias="inputMockingStrategy",
)


class EvaluationSet(BaseModel):
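For orientation, here is a minimal sketch (not part of this diff) of how the new strategy model might round-trip a payload; the prompt text is an invented placeholder:

from uipath._cli._evals._models._evaluation_set import InputMockingStrategy

# Hypothetical payload; "model" is optional and stays None when omitted.
strategy = InputMockingStrategy.model_validate(
    {"prompt": "Generate an invoice query for an amount under 500 EUR"}
)
assert strategy.model is None
# Serialize back using field aliases, dropping unset optional fields.
print(strategy.model_dump(by_alias=True, exclude_none=True))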
18 changes: 18 additions & 0 deletions src/uipath/_cli/_evals/_runtime.py
@@ -11,6 +11,10 @@
from opentelemetry.sdk.trace import ReadableSpan, Span
from opentelemetry.sdk.trace.export import SpanExporter, SpanExportResult

from uipath._cli._evals.mocks.input_mocker import (
generate_llm_input,
)

from ..._events._event_bus import EventBus
from ..._events._events import (
EvalItemExceptionDetails,
@@ -318,6 +322,10 @@ async def _execute_eval(
evaluators: List[BaseEvaluator[Any]],
event_bus: EventBus,
) -> EvaluationRunResult:
# Generate LLM-based input if input_mocking_strategy is defined
if eval_item.input_mocking_strategy:
eval_item = await self._generate_input_for_eval(eval_item)

set_execution_context(eval_item, self.span_collector)

await event_bus.publish(
@@ -417,6 +425,16 @@ async def _execute_eval(

return evaluation_run_results

async def _generate_input_for_eval(
self, eval_item: EvaluationItem
) -> EvaluationItem:
"""Use LLM to generate a mock input for an evaluation item."""
# TODO(bai): get the input schema from agent definition, once it is available there.
input_schema: dict[str, Any] = {}
generated_input = await generate_llm_input(eval_item, input_schema)
updated_eval_item = eval_item.model_copy(update={"inputs": generated_input})
return updated_eval_item

def _get_and_clear_execution_data(
self, execution_id: str
) -> tuple[List[ReadableSpan], list[logging.LogRecord]]:
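As an aside, _generate_input_for_eval relies on Pydantic's model_copy(update=...) to swap in the generated inputs without mutating the original item. A small self-contained sketch of that pattern (an illustrative model, not the real EvaluationItem):

from typing import Any, Dict

from pydantic import BaseModel


class Item(BaseModel):
    inputs: Dict[str, Any] = {}
    name: str = "demo"


original = Item()
updated = original.model_copy(update={"inputs": {"query": "Calculate 5 times 7"}})
assert original.inputs == {}  # the original item is untouched
assert updated.inputs["query"] == "Calculate 5 times 7"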
111 changes: 111 additions & 0 deletions src/uipath/_cli/_evals/mocks/input_mocker.py
@@ -0,0 +1,111 @@
"""LLM Input Mocker implementation."""

import json
from datetime import datetime
from typing import Any, Dict

from uipath import UiPath
from uipath._cli._evals._models._evaluation_set import EvaluationItem
from uipath.tracing._traced import traced

from .mocker import UiPathInputMockingError


def get_input_mocking_prompt(
input_schema: str,
input_generation_instructions: str,
expected_behavior: str,
expected_output: str,
) -> str:
"""Generate the LLM input mocking prompt."""
current_datetime = datetime.utcnow().strftime("%Y-%m-%d %H:%M:%S")

return f"""You are simulating input for automated testing purposes of an Agent as part of a simulation run.
You will need to generate realistic input to a LLM agent which will call various tools to achieve a goal. This must be in the exact format of the INPUT_SCHEMA.
You may need to follow specific INPUT_GENERATION_INSTRUCTIONS. If no relevant instructions are provided pertaining to input generation, use the other provided information and your own judgement to generate input.
If the INPUT_GENERATION_INSTRUCTIONS are provided, you MUST follow them exactly. For example if the instructions say to generate a value for a field to be before a certain calendar date, you must generate a value that is before that date.

The current date and time is: {current_datetime}

#INPUT_SCHEMA: Your output MUST conform EXACTLY to this JSON schema
{input_schema}
#END_INPUT_SCHEMA

#INPUT_GENERATION_INSTRUCTIONS
{input_generation_instructions}
#END_INPUT_GENERATION_INSTRUCTIONS

#EXPECTED_BEHAVIOR
{expected_behavior}
#END_EXPECTED_BEHAVIOR

#EXPECTED_OUTPUT
{expected_output}
#END_EXPECTED_OUTPUT

Based on the above information, provide a realistic input to the LLM agent. Your response should:
1. Match the expected input format according to the INPUT_SCHEMA exactly
2. Be consistent with the style and level of detail implied by the EXPECTED_BEHAVIOR and EXPECTED_OUTPUT
3. Consider the context of the agent being tested
4. Be realistic and representative of what a real user might say or ask

OUTPUT: ONLY the simulated agent input in the exact format of the INPUT_SCHEMA in valid JSON. Do not include any explanations, quotation marks, or markdown."""


@traced(name="__mocker__")
async def generate_llm_input(
evaluation_item: EvaluationItem,
input_schema: Dict[str, Any],
) -> Dict[str, Any]:
"""Generate synthetic input using an LLM based on the evaluation context."""
try:
llm = UiPath().llm

prompt = get_input_mocking_prompt(
input_schema=json.dumps(input_schema, indent=2),
input_generation_instructions=evaluation_item.input_mocking_strategy.prompt
if evaluation_item.input_mocking_strategy
else "",
expected_behavior=evaluation_item.expected_agent_behavior or "",
expected_output=json.dumps(evaluation_item.expected_output, indent=2)
if evaluation_item.expected_output
else "",
)

response_format = {
"type": "json_schema",
"json_schema": {
"name": "agent_input",
"strict": True,
"schema": input_schema,
},
}

model_parameters = (
evaluation_item.input_mocking_strategy.model
if evaluation_item.input_mocking_strategy
else None
)
completion_kwargs = (
model_parameters.model_dump(by_alias=False, exclude_none=True)
if model_parameters
else {}
)

response = await llm.chat_completions(
[{"role": "user", "content": prompt}],
response_format=response_format,
**completion_kwargs,
)

generated_input_str = response.choices[0].message.content

return json.loads(generated_input_str)
except json.JSONDecodeError as e:
raise UiPathInputMockingError(
f"Failed to parse LLM response as JSON: {str(e)}"
) from e
except UiPathInputMockingError:
raise
except Exception as e:
raise UiPathInputMockingError(f"Failed to generate input: {str(e)}") from e
6 changes: 6 additions & 0 deletions src/uipath/_cli/_evals/mocks/mocker.py
@@ -33,3 +33,9 @@ class UiPathMockResponseGenerationError(Exception):
"""Exception when a mocker is configured unable to generate a response."""

pass


class UiPathInputMockingError(Exception):
"""Exception when input mocking fails."""

pass
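A hedged sketch of how a caller might handle the new exception; the fallback to the item's existing inputs is an assumption for illustration, not behavior added by this PR:

from typing import Any, Dict

from uipath._cli._evals._models._evaluation_set import EvaluationItem
from uipath._cli._evals.mocks.input_mocker import generate_llm_input
from uipath._cli._evals.mocks.mocker import UiPathInputMockingError


async def generate_or_fallback(
    eval_item: EvaluationItem, input_schema: Dict[str, Any]
) -> Dict[str, Any]:
    try:
        return await generate_llm_input(eval_item, input_schema)
    except UiPathInputMockingError as exc:
        # Keep whatever inputs were already authored on the evaluation item.
        print(f"Input mocking failed, using original inputs: {exc}")
        return eval_item.inputs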
15 changes: 14 additions & 1 deletion src/uipath/agent/_utils.py
@@ -4,7 +4,10 @@
from httpx import Response
from pydantic import TypeAdapter

from uipath._cli._evals._models._evaluation_set import LLMMockingStrategy
from uipath._cli._evals._models._evaluation_set import (
InputMockingStrategy,
LLMMockingStrategy,
)
from uipath._cli._push.sw_file_handler import SwFileHandler
from uipath._cli._utils._studio_project import (
ProjectFile,
@@ -137,4 +140,14 @@ async def load_agent_definition(project_id: str) -> AgentDefinition:
evaluation.mocking_strategy = LLMMockingStrategy(
prompt=prompt, tools_to_simulate=tools_to_simulate
)

if not evaluation.input_mocking_strategy:
# Migrate lowCode input mocking fields
if evaluation.model_extra.get("simulateInput", False):
prompt = evaluation.model_extra.get(
"inputGenerationInstructions",
)
evaluation.input_mocking_strategy = InputMockingStrategy(
prompt=prompt
)
return agent_definition
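To make the lowCode migration above concrete, a small sketch with assumed legacy extra fields (the key names match what the code reads; the values are invented):

from uipath._cli._evals._models._evaluation_set import InputMockingStrategy

# Extra fields as they might appear on a legacy lowCode evaluation item.
legacy_extra = {
    "simulateInput": True,
    "inputGenerationInstructions": "Generate a refund request for a recent order",
}

if legacy_extra.get("simulateInput", False):
    migrated = InputMockingStrategy(
        prompt=legacy_extra.get("inputGenerationInstructions", "")
    )
    assert migrated.prompt.startswith("Generate a refund")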
106 changes: 106 additions & 0 deletions tests/cli/eval/mocks/test_input_mocker.py
@@ -0,0 +1,106 @@
from typing import Any

import pytest
from _pytest.monkeypatch import MonkeyPatch
from pytest_httpx import HTTPXMock

from uipath._cli._evals._models._evaluation_set import (
EvaluationItem,
InputMockingStrategy,
ModelSettings,
)
from uipath._cli._evals.mocks.input_mocker import generate_llm_input


@pytest.mark.asyncio
@pytest.mark.httpx_mock(assert_all_responses_were_requested=False)
async def test_generate_llm_input_with_model_settings(
httpx_mock: HTTPXMock, monkeypatch: MonkeyPatch
):
monkeypatch.setenv("UIPATH_URL", "https://example.com")
monkeypatch.setenv("UIPATH_ACCESS_TOKEN", "test-token")

evaluation_item: dict[str, Any] = {
"id": "test-eval-id",
"name": "Test Input Generation",
"inputs": {},
"expectedOutput": {"result": 35},
"expectedAgentBehavior": "Agent should multiply the numbers",
"inputMockingStrategy": {
"prompt": "Generate a multiplication query with 5 and 7",
"model": {
"model": "gpt-4o-mini-2024-07-18",
"temperature": 0.5,
"maxTokens": 150,
},
},
"evalSetId": "test-eval-set-id",
"createdAt": "2025-09-04T18:54:58.378Z",
"updatedAt": "2025-09-04T18:55:55.416Z",
}
eval_item = EvaluationItem(**evaluation_item)

assert isinstance(eval_item.input_mocking_strategy, InputMockingStrategy)
assert isinstance(eval_item.input_mocking_strategy.model, ModelSettings)
assert eval_item.input_mocking_strategy.model.model == "gpt-4o-mini-2024-07-18"
assert eval_item.input_mocking_strategy.model.temperature == 0.5
assert eval_item.input_mocking_strategy.model.max_tokens == 150

input_schema = {
"type": "object",
"properties": {
"query": {"type": "string"},
},
"required": ["query"],
"additionalProperties": False,
}

httpx_mock.add_response(
url="https://example.com/agenthub_/llm/api/capabilities",
status_code=200,
json={},
)
httpx_mock.add_response(
url="https://example.com/orchestrator_/llm/api/capabilities",
status_code=200,
json={},
)

httpx_mock.add_response(
url="https://example.com/api/chat/completions?api-version=2024-08-01-preview",
status_code=200,
json={
"role": "assistant",
"id": "response-id",
"object": "chat.completion",
"created": 0,
"model": "gpt-4o-mini-2024-07-18",
"choices": [
{
"index": 0,
"message": {
"role": "assistant",
"content": '{"query": "Calculate 5 times 7"}',
"tool_calls": None,
},
"finish_reason": "stop",
}
],
"usage": {
"prompt_tokens": 100,
"completion_tokens": 20,
"total_tokens": 120,
},
},
)

result = await generate_llm_input(eval_item, input_schema)

# Verify the mocked input is correct
assert result == {"query": "Calculate 5 times 7"}

requests = httpx_mock.get_requests()
chat_completion_requests = [r for r in requests if "chat/completions" in str(r.url)]
assert len(chat_completion_requests) == 1, (
"Expected exactly one chat completion request"
)
1 change: 1 addition & 0 deletions tests/cli/eval/mocks/test_mocks.py
@@ -136,6 +136,7 @@ async def foofoo(*args, **kwargs):
assert await foo(x=2) == "bar1"


@pytest.mark.httpx_mock(assert_all_responses_were_requested=False)
def test_llm_mockable_sync(httpx_mock: HTTPXMock, monkeypatch: MonkeyPatch):
monkeypatch.setenv("UIPATH_URL", "https://example.com")
monkeypatch.setenv("UIPATH_ACCESS_TOKEN", "1234567890")