
Commit 4948dfc

feat(InputMocking): generate mock input using LLM given instructions

1 parent a8fe199

File tree

  samples/calculator/evals/eval-sets/default.json
  src/uipath/_cli/_evals/_models/_evaluation_set.py
  src/uipath/_cli/_evals/_runtime.py
  src/uipath/_cli/_evals/mocks/input_mocker.py
  src/uipath/_cli/_evals/mocks/mocker.py
  src/uipath/_services/llm_gateway_service.py
  tests/sdk/services/test_llm_schema_cleanup.py

7 files changed: +236 −7 lines changed


samples/calculator/evals/eval-sets/default.json
Lines changed: 15 additions & 0 deletions

@@ -64,6 +64,21 @@
       "evalSetId": "default-eval-set-id",
       "createdAt": "2025-09-04T18:54:58.378Z",
       "updatedAt": "2025-09-04T18:55:55.416Z"
+    },
+    {
+      "id": "test-with-llm-input-mocking",
+      "name": "Test with LLM input mocking",
+      "inputs": {},
+      "expectedOutput": {
+        "result": 35
+      },
+      "expectedAgentBehavior": "",
+      "simulateInput": true,
+      "inputGenerationInstructions": "Generate a multiplication calculation where the first number is 5 and the second number is 7",
+      "simulationInstructions": "",
+      "evalSetId": "default-eval-set-id",
+      "createdAt": "2025-09-04T18:54:58.378Z",
+      "updatedAt": "2025-09-04T18:55:55.416Z"
     }
   ],
   "modelSettings": [],

src/uipath/_cli/_evals/_models/_evaluation_set.py
Lines changed: 5 additions & 0 deletions

@@ -103,6 +103,11 @@ class EvaluationItem(BaseModel):
     inputs: Dict[str, Any]
     expected_output: Dict[str, Any]
     expected_agent_behavior: str = Field(default="", alias="expectedAgentBehavior")
+    simulate_input: bool = Field(default=False, alias="simulateInput")
+    input_generation_instructions: str = Field(
+        default="", alias="inputGenerationInstructions"
+    )
+    simulation_instructions: str = Field(default="", alias="simulationInstructions")
     eval_set_id: str = Field(alias="evalSetId")
     created_at: str = Field(alias="createdAt")
     updated_at: str = Field(alias="updatedAt")
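As an illustration of the alias wiring (a minimal sketch, not the full EvaluationItem model), the camelCase keys stored in the eval-set JSON bind straight onto these snake_case fields:

    from pydantic import BaseModel, Field

    class SimulationFields(BaseModel):
        simulate_input: bool = Field(default=False, alias="simulateInput")
        input_generation_instructions: str = Field(default="", alias="inputGenerationInstructions")
        simulation_instructions: str = Field(default="", alias="simulationInstructions")

    # Keys arrive in camelCase, exactly as stored in default.json
    item = SimulationFields.model_validate({
        "simulateInput": True,
        "inputGenerationInstructions": "Generate a multiplication calculation ...",
    })
    assert item.simulate_input and item.simulation_instructions == ""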

src/uipath/_cli/_evals/_runtime.py
Lines changed: 22 additions & 0 deletions

@@ -11,6 +11,12 @@
 from opentelemetry.sdk.trace import ReadableSpan, Span
 from opentelemetry.sdk.trace.export import SpanExporter, SpanExportResult

+from uipath._cli._evals.mocks.input_mocker import (
+    extract_input_schema_from_entrypoint,
+    generate_llm_input,
+)
+from uipath._cli._evals.mocks.mocker import UiPathInputMockingError
+
 from ..._events._event_bus import EventBus
 from ..._events._events import (
     EvalItemExceptionDetails,

@@ -318,6 +324,10 @@ async def _execute_eval(
         evaluators: List[BaseEvaluator[Any]],
         event_bus: EventBus,
     ) -> EvaluationRunResult:
+        # Generate LLM-based input if simulate_input is enabled
+        if eval_item.simulate_input:
+            eval_item = await self._generate_input_for_eval(eval_item)
+
         set_execution_context(eval_item, self.span_collector)

         await event_bus.publish(

@@ -417,6 +427,18 @@ async def _execute_eval(

         return evaluation_run_results

+    async def _generate_input_for_eval(
+        self, eval_item: EvaluationItem
+    ) -> EvaluationItem:
+        """Use LLM to generate a mock input for an evaluation item."""
+        if not self.context.entrypoint:
+            raise UiPathInputMockingError("entrypoint must be provided for eval runs")
+
+        input_schema = extract_input_schema_from_entrypoint(self.context.entrypoint)
+        generated_input = await generate_llm_input(eval_item, input_schema)
+        updated_eval_item = eval_item.model_copy(update={"inputs": generated_input})
+        return updated_eval_item
+
     def _get_and_clear_execution_data(
         self, execution_id: str
     ) -> tuple[List[ReadableSpan], list[logging.LogRecord]]:
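The model_copy(update=...) call above returns a fresh EvaluationItem rather than mutating the one shared with the rest of the run. A self-contained sketch of that Pydantic behavior (field names are illustrative):

    from typing import Any, Dict
    from pydantic import BaseModel

    class Item(BaseModel):
        inputs: Dict[str, Any] = {}

    original = Item()
    updated = original.model_copy(update={"inputs": {"firstNumber": 5, "secondNumber": 7}})
    assert original.inputs == {}  # the original is left untouched
    assert updated.inputs["firstNumber"] == 5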
src/uipath/_cli/_evals/mocks/input_mocker.py
Lines changed: 149 additions & 0 deletions

@@ -0,0 +1,149 @@
+"""LLM Input Mocker implementation."""
+
+import importlib.util
+import inspect
+import json
+from datetime import datetime
+from typing import Any, Dict
+
+from pydantic import TypeAdapter
+
+from uipath import UiPath
+from uipath._cli._evals._models._evaluation_set import EvaluationItem
+from uipath._services.llm_gateway_service import _cleanup_schema
+from uipath.tracing._traced import traced
+
+from .mocker import UiPathInputMockingError
+
+
+def get_input_mocking_prompt(
+    input_schema: str,
+    test_run_proctor_instructions: str,
+    input_generation_instructions: str,
+    expected_behavior: str,
+    expected_output: str,
+) -> str:
+    """Generate the LLM input mocking prompt."""
+    current_datetime = datetime.utcnow().strftime("%Y-%m-%d %H:%M:%S")
+
+    return f"""You are simulating input for automated testing purposes of an Agent as part of a simulation run.
+You will need to generate realistic input to an LLM agent which will call various tools to achieve a goal. This must be in the exact format of the INPUT_SCHEMA.
+You may need to follow specific INPUT_GENERATION_INSTRUCTIONS. If no relevant instructions are provided pertaining to input generation, use the other provided information and your own judgement to generate input.
+If the INPUT_GENERATION_INSTRUCTIONS are provided, you MUST follow them exactly. For example, if the instructions say to generate a value for a field to be before a certain calendar date, you must generate a value that is before that date.
+The SIMULATION_INSTRUCTIONS will provide context around how the tools are being simulated.
+
+The current date and time is: {current_datetime}
+
+#INPUT_SCHEMA: You MUST OUTPUT THIS EXACT JSON SCHEMA
+{input_schema}
+#END_INPUT_SCHEMA
+
+#INPUT_GENERATION_INSTRUCTIONS
+{input_generation_instructions}
+#END_INPUT_GENERATION_INSTRUCTIONS
+
+#SIMULATION_INSTRUCTIONS
+{test_run_proctor_instructions}
+#END_SIMULATION_INSTRUCTIONS
+
+#EXPECTED_BEHAVIOR
+{expected_behavior}
+#END_EXPECTED_BEHAVIOR
+
+#EXPECTED_OUTPUT
+{expected_output}
+#END_EXPECTED_OUTPUT
+
+Based on the above information, provide a realistic input to the LLM agent. Your response should:
+1. Match the expected input format according to the INPUT_SCHEMA exactly
+2. Be consistent with the style and level of detail in the example inputs
+3. Consider the context of the agent being tested
+4. Be realistic and representative of what a real user might say or ask
+
+OUTPUT: ONLY the simulated agent input in the exact format of the INPUT_SCHEMA in valid JSON. Do not include any explanations, quotation marks, or markdown."""
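A hypothetical call for the calculator eval item defined in default.json above (the schema string here is abbreviated and illustrative):

    prompt = get_input_mocking_prompt(
        input_schema='{"type": "object", "properties": {"firstNumber": {"type": "number"}}}',
        test_run_proctor_instructions="",
        input_generation_instructions="Generate a multiplication calculation where the first number is 5 and the second number is 7",
        expected_behavior="",
        expected_output='{"result": 35}',
    )
    # The returned string embeds each argument between its #SECTION / #END_SECTION markers.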
+def extract_input_schema_from_entrypoint(entrypoint_path: str) -> Dict[str, Any]:
+    """Extract JSON schema from the entrypoint file's main function."""
+    spec = importlib.util.spec_from_file_location("entrypoint_module", entrypoint_path)
+    if not spec or not spec.loader:
+        raise UiPathInputMockingError(
+            f"Failed to load module spec from entrypoint: {entrypoint_path}"
+        )
+
+    module = importlib.util.module_from_spec(spec)
+    spec.loader.exec_module(module)
+
+    # Check for main, run, or execute (same as ScriptExecutor)
+    for func_name in ["main", "run", "execute"]:
+        func = getattr(module, func_name, None)
+        if func is not None:
+            sig = inspect.signature(func)
+            params = list(sig.parameters.values())
+
+            if not params:
+                continue
+
+            first_param = params[0]
+            if first_param.annotation == inspect.Parameter.empty:
+                continue
+
+            adapter = TypeAdapter(first_param.annotation)
+            return adapter.json_schema()
+
+    raise UiPathInputMockingError(
+        f"No suitable entrypoint (main, run, execute) with typed parameters found in {entrypoint_path}"
+    )
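For reference, a hypothetical entrypoint file this loader would accept: main's first parameter is annotated, so TypeAdapter can derive its JSON schema, while an untyped or zero-argument main/run/execute would be skipped:

    from pydantic import BaseModel, TypeAdapter

    class CalculatorInput(BaseModel):
        first_number: float
        second_number: float
        operation: str

    def main(payload: CalculatorInput) -> dict:
        ...

    # Equivalent to what extract_input_schema_from_entrypoint returns for this file:
    schema = TypeAdapter(CalculatorInput).json_schema()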
+@traced(name="__mocker__")
+async def generate_llm_input(
+    evaluation_item: EvaluationItem,
+    input_schema: Dict[str, Any],
+) -> Dict[str, Any]:
+    """Generate synthetic input using an LLM based on the evaluation context."""
+    try:
+        llm = UiPath().llm
+
+        prompt = get_input_mocking_prompt(
+            input_schema=json.dumps(input_schema, indent=2),
+            test_run_proctor_instructions=evaluation_item.simulation_instructions or "",
+            input_generation_instructions=evaluation_item.input_generation_instructions
+            or "",
+            expected_behavior=evaluation_item.expected_agent_behavior or "",
+            expected_output=json.dumps(evaluation_item.expected_output, indent=2)
+            if evaluation_item.expected_output
+            else "",
+        )
+
+        cleaned_schema = _cleanup_schema(input_schema)
+
+        response_format = {
+            "type": "json_schema",
+            "json_schema": {
+                "name": "agent_input",
+                "strict": True,
+                "schema": cleaned_schema,
+            },
+        }
+
+        response = await llm.chat_completions(
+            [{"role": "user", "content": prompt}],
+            temperature=0.3,
+            response_format=response_format,
+        )
+
+        generated_input_str = response.choices[0].message.content
+        if not generated_input_str:
+            raise UiPathInputMockingError("LLM returned empty response")
+
+        return json.loads(generated_input_str)
+    except json.JSONDecodeError as e:
+        raise UiPathInputMockingError(
+            f"Failed to parse LLM response as JSON: {str(e)}"
+        ) from e
+    except UiPathInputMockingError:
+        raise
+    except Exception as e:
+        raise UiPathInputMockingError(f"Failed to generate input: {str(e)}") from e

src/uipath/_cli/_evals/mocks/mocker.py
Lines changed: 6 additions & 0 deletions

@@ -33,3 +33,9 @@ class UiPathMockResponseGenerationError(Exception):
     """Exception raised when a configured mocker is unable to generate a response."""

     pass
+
+
+class UiPathInputMockingError(Exception):
+    """Exception raised when input mocking fails."""
+
+    pass

src/uipath/_services/llm_gateway_service.py
Lines changed: 17 additions & 7 deletions

@@ -77,15 +77,18 @@ class EmbeddingModels(object):
     text_embedding_ada_002 = "text-embedding-ada-002"


-def _cleanup_schema(model_class: type[BaseModel]) -> Dict[str, Any]:
-    """Clean up a Pydantic model schema for use with LLM Gateway.
+def _cleanup_schema(
+    model_or_schema: Union[type[BaseModel], Dict[str, Any]],
+) -> Dict[str, Any]:
+    """Clean up a Pydantic model schema or raw JSON schema for use with LLM Gateway.

-    This function converts a Pydantic model's JSON schema to a format that's
-    compatible with the LLM Gateway's JSON schema requirements by removing
-    titles and other metadata that might cause validation issues.
+    This function converts a Pydantic model's JSON schema or a raw JSON schema dict
+    to a format that's compatible with the LLM Gateway's JSON schema requirements by
+    removing titles and other metadata that might cause validation issues.

     Args:
-        model_class (type[BaseModel]): A Pydantic BaseModel class to convert to schema.
+        model_or_schema (Union[type[BaseModel], Dict[str, Any]]): Either a Pydantic
+            BaseModel class or a JSON schema dictionary to clean.

     Returns:
         dict: A cleaned JSON schema dictionary suitable for LLM Gateway response_format.

@@ -102,9 +105,16 @@ class Country(BaseModel):
         schema = _cleanup_schema(Country)
         # Returns a clean schema without titles and unnecessary metadata
+
+        # Or pass a schema dict directly
+        schema_dict = {"type": "object", "properties": {...}}
+        schema = _cleanup_schema(schema_dict)
         ```
     """
-    schema = model_class.model_json_schema()
+    if isinstance(model_or_schema, dict):
+        schema = model_or_schema
+    else:
+        schema = model_or_schema.model_json_schema()

     def clean_type(type_def):
         """Clean property definitions by removing titles and cleaning nested items. Additionally, `additionalProperties` is ensured on all objects."""

tests/sdk/services/test_llm_schema_cleanup.py
Lines changed: 22 additions & 0 deletions

@@ -230,3 +230,25 @@ class ExtendedEntity(BaseEntity):
         assert "name" in required_fields
         # description is optional, so not in required
         assert "description" not in required_fields
+
+    def test_cleanup_with_schema_dict(self):
+        """Test that _cleanup_schema can accept a schema dict directly."""
+        schema_dict = {
+            "type": "object",
+            "title": "TestSchema",
+            "properties": {
+                "name": {"type": "string", "title": "Name"},
+                "age": {"type": "integer", "title": "Age"},
+            },
+            "required": ["name", "age"],
+        }
+
+        schema = _cleanup_schema(schema_dict)
+
+        assert schema["type"] == "object"
+        assert schema["additionalProperties"] is False
+        assert "title" not in schema
+        assert "name" in schema["properties"]
+        assert "age" in schema["properties"]
+        assert "title" not in schema["properties"]["name"]
+        assert "title" not in schema["properties"]["age"]
