Skip to content

Commit cd8fe88

Browse files
authored
Merge pull request #666 from UiPath/bai/llm-mocker-parity
feat(Mocking): pass execution trace history into LLM mocker
2 parents 584b0eb + 1d31e7c commit cd8fe88

File tree

6 files changed

+167
-23
lines changed

6 files changed

+167
-23
lines changed

src/uipath/_cli/_evals/_runtime.py

Lines changed: 38 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -7,7 +7,8 @@
77
from time import time
88
from typing import Any, Dict, Generic, List, Optional, Sequence, TypeVar
99

10-
from opentelemetry.sdk.trace import ReadableSpan
10+
from opentelemetry import context as context_api
11+
from opentelemetry.sdk.trace import ReadableSpan, Span
1112
from opentelemetry.sdk.trace.export import SpanExporter, SpanExportResult
1213

1314
from ..._events._event_bus import EventBus
@@ -24,6 +25,7 @@
2425
from ...eval.models.models import AgentExecution, EvalItemResult
2526
from .._runtime._contracts import (
2627
UiPathBaseRuntime,
28+
UiPathExecutionBatchTraceProcessor,
2729
UiPathRuntimeContext,
2830
UiPathRuntimeFactory,
2931
UiPathRuntimeResult,
@@ -41,7 +43,11 @@
4143
UiPathEvalOutput,
4244
UiPathEvalRunExecutionOutput,
4345
)
44-
from .mocks.mocks import set_evaluation_item
46+
from ._span_collection import ExecutionSpanCollector
47+
from .mocks.mocks import (
48+
clear_execution_context,
49+
set_execution_context,
50+
)
4551

4652
T = TypeVar("T", bound=UiPathBaseRuntime)
4753
C = TypeVar("C", bound=UiPathRuntimeContext)
@@ -78,6 +84,24 @@ def shutdown(self) -> None:
7884
self.clear()
7985

8086

87+
class ExecutionSpanProcessor(UiPathExecutionBatchTraceProcessor):
    """Batch trace processor that also feeds an ExecutionSpanCollector.

    On span start, after the normal batch-processor bookkeeping, any span
    tagged with a string ``execution.id`` attribute is registered with the
    collector under that execution id.
    """

    def __init__(self, span_exporter: SpanExporter, collector: ExecutionSpanCollector):
        super().__init__(span_exporter)
        self.collector = collector

    def on_start(
        self, span: Span, parent_context: Optional[context_api.Context] = None
    ) -> None:
        super().on_start(span, parent_context)

        attrs = span.attributes
        if not attrs:
            return
        exec_id = attrs.get("execution.id")
        # Only string ids are meaningful keys for the collector.
        if isinstance(exec_id, str):
            self.collector.add_span(span, exec_id)
103+
104+
81105
class ExecutionLogsExporter:
82106
"""Custom exporter that stores multiple execution log handlers."""
83107

@@ -127,8 +151,15 @@ def __init__(
127151
self.context: UiPathEvalContext = context
128152
self.factory: UiPathRuntimeFactory[T, C] = factory
129153
self.event_bus: EventBus = event_bus
154+
130155
self.span_exporter: ExecutionSpanExporter = ExecutionSpanExporter()
131-
self.factory.add_span_exporter(self.span_exporter)
156+
self.span_collector: ExecutionSpanCollector = ExecutionSpanCollector()
157+
158+
# Span processor feeds both exporter and collector
159+
span_processor = ExecutionSpanProcessor(self.span_exporter, self.span_collector)
160+
self.factory.tracer_span_processors.append(span_processor)
161+
self.factory.tracer_provider.add_span_processor(span_processor)
162+
132163
self.logs_exporter: ExecutionLogsExporter = ExecutionLogsExporter()
133164
self.execution_id = str(uuid.uuid4())
134165

@@ -180,7 +211,6 @@ async def execute(self) -> Optional[UiPathRuntimeResult]:
180211
evaluation_set_name=evaluation_set.name,
181212
evaluation_set_results=eval_run_result_list,
182213
)
183-
184214
# Computing evaluator averages
185215
evaluator_averages: Dict[str, float] = defaultdict(float)
186216
evaluator_count: Dict[str, int] = defaultdict(int)
@@ -194,7 +224,6 @@ async def execute(self) -> Optional[UiPathRuntimeResult]:
194224
evaluator_averages[eval_id] = (
195225
evaluator_averages[eval_id] / evaluator_count[eval_id]
196226
)
197-
198227
await event_bus.publish(
199228
EvaluationEvents.UPDATE_EVAL_SET_RUN,
200229
EvalSetRunUpdatedEvent(
@@ -289,7 +318,7 @@ async def _execute_eval(
289318
evaluators: List[BaseEvaluator[Any]],
290319
event_bus: EventBus,
291320
) -> EvaluationRunResult:
292-
set_evaluation_item(eval_item)
321+
set_execution_context(eval_item, self.span_collector)
293322

294323
await event_bus.publish(
295324
EvaluationEvents.CREATE_EVAL_RUN,
@@ -383,6 +412,8 @@ async def _execute_eval(
383412
eval_run_updated_event,
384413
wait_for_completion=False,
385414
)
415+
finally:
416+
clear_execution_context()
386417

387418
return evaluation_run_results
388419

@@ -391,6 +422,7 @@ def _get_and_clear_execution_data(
391422
) -> tuple[List[ReadableSpan], list[logging.LogRecord]]:
392423
spans = self.span_exporter.get_spans(execution_id)
393424
self.span_exporter.clear(execution_id)
425+
self.span_collector.clear(execution_id)
394426

395427
logs = self.logs_exporter.get_logs(execution_id)
396428
self.logs_exporter.clear(execution_id)
Lines changed: 24 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,24 @@
1+
from collections import defaultdict
2+
from typing import Dict, List, Optional
3+
4+
from opentelemetry.sdk.trace import ReadableSpan, Span
5+
6+
7+
class ExecutionSpanCollector:
    """Collects spans per execution id as they are created during execution.

    Spans are registered at start time, so the stored Span objects may still
    be mutating while their execution is in flight.
    """

    def __init__(self):
        # { execution_id -> spans in start order }
        self._spans: Dict[str, List[ReadableSpan]] = defaultdict(list)

    def add_span(self, span: Span, execution_id: str) -> None:
        """Record *span* under *execution_id* (insertion order preserved)."""
        self._spans[execution_id].append(span)

    def get_spans(self, execution_id: str) -> List[ReadableSpan]:
        """Return a snapshot of the spans recorded for *execution_id*.

        A copy is returned so callers cannot mutate the internal list;
        unknown ids yield an empty list.
        """
        return list(self._spans.get(execution_id, ()))

    def clear(self, execution_id: Optional[str] = None) -> None:
        """Drop spans for *execution_id*, or everything when omitted.

        Uses an explicit ``is None`` check so a falsy-but-present id (e.g.
        the empty string) clears only its own entry, not the whole store.
        """
        if execution_id is None:
            self._spans.clear()
        else:
            self._spans.pop(execution_id, None)

src/uipath/_cli/_evals/mocks/llm_mocker.py

Lines changed: 16 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -6,6 +6,9 @@
66

77
from pydantic import BaseModel
88

9+
from uipath.tracing._traced import traced
10+
from uipath.tracing._utils import _SpanUtils
11+
912
from .._models._evaluation_set import (
1013
EvaluationItem,
1114
LLMMockingStrategy,
@@ -51,7 +54,7 @@
5154
3. Always include the entire output regardless of token length.
5255
3. Consider the context of the current test run and the agent being tested. If the agent is acting on a property, make sure the output includes that property.
5356
54-
Respond ONLY with valid JSON that would be a realistic and completetool response. Do not include any explanations or markdown.
57+
Respond ONLY with valid JSON that would be a realistic and complete tool response. Do not include any explanations or markdown.
5558
"""
5659

5760
logger = logging.getLogger(__name__)
@@ -79,6 +82,7 @@ def __init__(self, evaluation_item: EvaluationItem):
7982
self.evaluation_item = evaluation_item
8083
assert isinstance(self.evaluation_item.mocking_strategy, LLMMockingStrategy)
8184

85+
@traced(name="__mocker__")
8286
async def response(
8387
self, func: Callable[[T], R], params: dict[str, Any], *args: T, **kwargs
8488
) -> R:
@@ -92,6 +96,8 @@ async def response(
9296
from uipath import UiPath
9397
from uipath._services.llm_gateway_service import _cleanup_schema
9498

99+
from .mocks import evaluation_context, span_collector_context
100+
95101
llm = UiPath().llm
96102
return_type: Any = func.__annotations__.get("return", None)
97103
if return_type is None:
@@ -116,9 +122,17 @@ class OutputSchema(BaseModel):
116122
example_calls = [
117123
call for call in example_calls if isinstance(call, ExampleCall)
118124
]
125+
126+
test_run_history = "(empty)"
127+
eval_item = evaluation_context.get()
128+
span_collector = span_collector_context.get()
129+
if eval_item and span_collector:
130+
spans = span_collector.get_spans(eval_item.id)
131+
test_run_history = _SpanUtils.spans_to_llm_context(spans)
132+
119133
prompt_input: dict[str, Any] = {
120134
"toolRunExamples": example_calls,
121-
"testRunHistory": [], # This should contain ordered spans.
135+
"testRunHistory": test_run_history,
122136
"toolInfo": {
123137
"name": function_name,
124138
"description": params.get("description"),

src/uipath/_cli/_evals/mocks/mocks.py

Lines changed: 25 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -5,30 +5,49 @@
55
from typing import Any, Callable, Optional
66

77
from uipath._cli._evals._models._evaluation_set import EvaluationItem
8+
from uipath._cli._evals._span_collection import ExecutionSpanCollector
89
from uipath._cli._evals.mocks.mocker import Mocker, UiPathNoMockFoundError
910
from uipath._cli._evals.mocks.mocker_factory import MockerFactory
1011

12+
# Context variables for evaluation items and mockers
1113
evaluation_context: ContextVar[Optional[EvaluationItem]] = ContextVar(
1214
"evaluation", default=None
1315
)
1416

1517
mocker_context: ContextVar[Optional[Mocker]] = ContextVar("mocker", default=None)
1618

19+
# Span collector for trace access during mocking
20+
span_collector_context: ContextVar[Optional[ExecutionSpanCollector]] = ContextVar(
21+
"span_collector", default=None
22+
)
23+
1724
logger = logging.getLogger(__name__)
1825

1926

20-
def set_execution_context(
    eval_item: EvaluationItem, span_collector: ExecutionSpanCollector
) -> None:
    """Set the execution context for an evaluation run for mocking and trace access.

    Stores the evaluation item and span collector in context variables and,
    when the item declares a mocking strategy, builds the matching mocker.
    Mocker creation is best-effort: on failure the problem is logged and
    mocking is disabled for this run rather than aborting the evaluation.
    """
    evaluation_context.set(eval_item)

    try:
        if eval_item.mocking_strategy:
            mocker_context.set(MockerFactory.create(eval_item))
        else:
            mocker_context.set(None)
    except Exception:
        # exc_info=True preserves the traceback so a broken mocking strategy
        # is diagnosable; lazy %s args avoid formatting when warnings are off.
        logger.warning(
            "Failed to create mocker for evaluation %s",
            eval_item.name,
            exc_info=True,
        )
        mocker_context.set(None)

    span_collector_context.set(span_collector)
45+
def clear_execution_context() -> None:
    """Clear the execution context after evaluation completes."""
    # Reset every evaluation-scoped context variable back to its empty state.
    for ctx_var in (evaluation_context, mocker_context, span_collector_context):
        ctx_var.set(None)
50+
3251

3352
async def get_mocked_response(
3453
func: Callable[[Any], Any], params: dict[str, Any], *args, **kwargs

src/uipath/tracing/_utils.py

Lines changed: 52 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -319,3 +319,55 @@ def format_args_for_trace(
319319
f"Error formatting arguments for trace: {e}. Using args and kwargs directly."
320320
)
321321
return {"args": args, "kwargs": kwargs}
322+
323+
@staticmethod
def _has_ancestor_with_name(
    span: ReadableSpan, ancestor_name: str, span_map: Dict[int, ReadableSpan]
) -> bool:
    """Return True if *span* itself, or any ancestor reachable via *span_map*,
    is named *ancestor_name*.

    The walk stops (returning False) when a parent is not present in
    *span_map*, so only locally-collected ancestors are considered.
    """
    node: Optional[ReadableSpan] = span
    while node is not None:
        if node.name == ancestor_name:
            return True
        if node.parent is None:
            return False
        # Follow the parent link; a missing entry ends the traversal.
        node = span_map.get(node.parent.span_id)
    return False
341+
342+
@staticmethod
def spans_to_llm_context(spans: list[ReadableSpan]) -> str:
    """Convert spans to a formatted conversation history string suitable for LLM context.

    Emits one "Function / Input / Output" record per span that carries both
    ``input.value`` and ``output.value`` attributes, skipping spans that
    descend from an internal ``__mocker__`` span (LLM calls made to mock
    tools during evals). Returns "(empty)" when nothing qualifies.
    """
    # span_id -> span index used to walk parent chains.
    by_id = {s.get_span_context().span_id: s for s in spans}

    lines: list = []
    for s in spans:
        attrs = dict(s.attributes or {})
        inp = attrs.get("input.value")
        out = attrs.get("output.value")

        # Only spans with both an input and an output are useful history.
        if not inp or not out:
            continue

        # Skip spans that are internal LLM calls (eg. for tool mocking in evals)
        if _SpanUtils._has_ancestor_with_name(s, "__mocker__", by_id):
            continue

        lines.extend((f"Function: {s.name}", f"Input: {inp}", f"Output: {out}", ""))

    return "\n".join(lines) if lines else "(empty)"

tests/cli/eval/mocks/test_mocks.py

Lines changed: 12 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,5 @@
11
from typing import Any
2+
from unittest.mock import MagicMock
23

34
import pytest
45
from _pytest.monkeypatch import MonkeyPatch
@@ -10,9 +11,11 @@
1011
MockitoMockingStrategy,
1112
)
1213
from uipath._cli._evals.mocks.mocker import UiPathMockResponseGenerationError
13-
from uipath._cli._evals.mocks.mocks import set_evaluation_item
14+
from uipath._cli._evals.mocks.mocks import set_execution_context
1415
from uipath.eval.mocks import mockable
1516

17+
_mock_span_collector = MagicMock()
18+
1619

1720
def test_mockito_mockable_sync():
1821
# Arrange
@@ -51,7 +54,7 @@ def foofoo(*args, **kwargs):
5154
assert isinstance(evaluation.mocking_strategy, MockitoMockingStrategy)
5255

5356
# Act & Assert
54-
set_evaluation_item(evaluation)
57+
set_execution_context(evaluation, _mock_span_collector)
5558
assert foo() == "bar1"
5659
assert foo() == "bar2"
5760
assert foo() == "bar2"
@@ -63,13 +66,13 @@ def foofoo(*args, **kwargs):
6366
assert foofoo()
6467

6568
evaluation.mocking_strategy.behaviors[0].arguments.kwargs = {"x": 1}
66-
set_evaluation_item(evaluation)
69+
set_execution_context(evaluation, _mock_span_collector)
6770
assert foo(x=1) == "bar1"
6871

6972
evaluation.mocking_strategy.behaviors[0].arguments.kwargs = {
7073
"x": {"_target_": "mockito.any"}
7174
}
72-
set_evaluation_item(evaluation)
75+
set_execution_context(evaluation, _mock_span_collector)
7376
assert foo(x=2) == "bar1"
7477

7578

@@ -111,7 +114,7 @@ async def foofoo(*args, **kwargs):
111114
assert isinstance(evaluation.mocking_strategy, MockitoMockingStrategy)
112115

113116
# Act & Assert
114-
set_evaluation_item(evaluation)
117+
set_execution_context(evaluation, _mock_span_collector)
115118
assert await foo() == "bar1"
116119
assert await foo() == "bar2"
117120
assert await foo() == "bar2"
@@ -123,13 +126,13 @@ async def foofoo(*args, **kwargs):
123126
assert await foofoo()
124127

125128
evaluation.mocking_strategy.behaviors[0].arguments.kwargs = {"x": 1}
126-
set_evaluation_item(evaluation)
129+
set_execution_context(evaluation, _mock_span_collector)
127130
assert await foo(x=1) == "bar1"
128131

129132
evaluation.mocking_strategy.behaviors[0].arguments.kwargs = {
130133
"x": {"_target_": "mockito.any"}
131134
}
132-
set_evaluation_item(evaluation)
135+
set_execution_context(evaluation, _mock_span_collector)
133136
assert await foo(x=2) == "bar1"
134137

135138

@@ -201,7 +204,7 @@ def foofoo(*args, **kwargs):
201204
},
202205
)
203206
# Act & Assert
204-
set_evaluation_item(evaluation)
207+
set_execution_context(evaluation, _mock_span_collector)
205208

206209
assert foo() == "bar1"
207210
with pytest.raises(NotImplementedError):
@@ -274,7 +277,7 @@ async def foofoo(*args, **kwargs):
274277
},
275278
)
276279
# Act & Assert
277-
set_evaluation_item(evaluation)
280+
set_execution_context(evaluation, _mock_span_collector)
278281

279282
assert await foo() == "bar1"
280283
with pytest.raises(NotImplementedError):

0 commit comments

Comments
 (0)