Skip to content

Commit cd8fe88

Browse files
authored
Merge pull request #666 from UiPath/bai/llm-mocker-parity
feat(Mocking): pass execution trace history into LLM mocker
2 parents 584b0eb + 1d31e7c commit cd8fe88

File tree

6 files changed

+167
-23
lines changed

6 files changed

+167
-23
lines changed

src/uipath/_cli/_evals/_runtime.py

Lines changed: 38 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -7,7 +7,8 @@
77
from time import time
88
from typing import Any, Dict, Generic, List, Optional, Sequence, TypeVar
99

10-
from opentelemetry.sdk.trace import ReadableSpan
10+
from opentelemetry import context as context_api
11+
from opentelemetry.sdk.trace import ReadableSpan, Span
1112
from opentelemetry.sdk.trace.export import SpanExporter, SpanExportResult
1213

1314
from ..._events._event_bus import EventBus
@@ -24,6 +25,7 @@
2425
from ...eval.models.models import AgentExecution, EvalItemResult
2526
from .._runtime._contracts import (
2627
UiPathBaseRuntime,
28+
UiPathExecutionBatchTraceProcessor,
2729
UiPathRuntimeContext,
2830
UiPathRuntimeFactory,
2931
UiPathRuntimeResult,
@@ -41,7 +43,11 @@
4143
UiPathEvalOutput,
4244
UiPathEvalRunExecutionOutput,
4345
)
44-
from .mocks.mocks import set_evaluation_item
46+
from ._span_collection import ExecutionSpanCollector
47+
from .mocks.mocks import (
48+
clear_execution_context,
49+
set_execution_context,
50+
)
4551

4652
T = TypeVar("T", bound=UiPathBaseRuntime)
4753
C = TypeVar("C", bound=UiPathRuntimeContext)
@@ -78,6 +84,24 @@ def shutdown(self) -> None:
7884
self.clear()
7985

8086

87+
class ExecutionSpanProcessor(UiPathExecutionBatchTraceProcessor):
    """Batch trace processor that also feeds an ExecutionSpanCollector.

    On span start, after the normal batch-processor bookkeeping, any span
    tagged with a string ``execution.id`` attribute is registered with the
    collector under that execution id.
    """

    def __init__(self, span_exporter: SpanExporter, collector: ExecutionSpanCollector):
        super().__init__(span_exporter)
        self.collector = collector

    def on_start(
        self, span: Span, parent_context: Optional[context_api.Context] = None
    ) -> None:
        super().on_start(span, parent_context)

        attrs = span.attributes
        if not attrs:
            return
        exec_id = attrs.get("execution.id")
        # Only string ids are meaningful keys for the collector.
        if isinstance(exec_id, str):
            self.collector.add_span(span, exec_id)
103+
104+
81105
class ExecutionLogsExporter:
82106
"""Custom exporter that stores multiple execution log handlers."""
83107

@@ -127,8 +151,15 @@ def __init__(
127151
self.context: UiPathEvalContext = context
128152
self.factory: UiPathRuntimeFactory[T, C] = factory
129153
self.event_bus: EventBus = event_bus
154+
130155
self.span_exporter: ExecutionSpanExporter = ExecutionSpanExporter()
131-
self.factory.add_span_exporter(self.span_exporter)
156+
self.span_collector: ExecutionSpanCollector = ExecutionSpanCollector()
157+
158+
# Span processor feeds both exporter and collector
159+
span_processor = ExecutionSpanProcessor(self.span_exporter, self.span_collector)
160+
self.factory.tracer_span_processors.append(span_processor)
161+
self.factory.tracer_provider.add_span_processor(span_processor)
162+
132163
self.logs_exporter: ExecutionLogsExporter = ExecutionLogsExporter()
133164
self.execution_id = str(uuid.uuid4())
134165

@@ -180,7 +211,6 @@ async def execute(self) -> Optional[UiPathRuntimeResult]:
180211
evaluation_set_name=evaluation_set.name,
181212
evaluation_set_results=eval_run_result_list,
182213
)
183-
184214
# Computing evaluator averages
185215
evaluator_averages: Dict[str, float] = defaultdict(float)
186216
evaluator_count: Dict[str, int] = defaultdict(int)
@@ -194,7 +224,6 @@ async def execute(self) -> Optional[UiPathRuntimeResult]:
194224
evaluator_averages[eval_id] = (
195225
evaluator_averages[eval_id] / evaluator_count[eval_id]
196226
)
197-
198227
await event_bus.publish(
199228
EvaluationEvents.UPDATE_EVAL_SET_RUN,
200229
EvalSetRunUpdatedEvent(
@@ -289,7 +318,7 @@ async def _execute_eval(
289318
evaluators: List[BaseEvaluator[Any]],
290319
event_bus: EventBus,
291320
) -> EvaluationRunResult:
292-
set_evaluation_item(eval_item)
321+
set_execution_context(eval_item, self.span_collector)
293322

294323
await event_bus.publish(
295324
EvaluationEvents.CREATE_EVAL_RUN,
@@ -383,6 +412,8 @@ async def _execute_eval(
383412
eval_run_updated_event,
384413
wait_for_completion=False,
385414
)
415+
finally:
416+
clear_execution_context()
386417

387418
return evaluation_run_results
388419

@@ -391,6 +422,7 @@ def _get_and_clear_execution_data(
391422
) -> tuple[List[ReadableSpan], list[logging.LogRecord]]:
392423
spans = self.span_exporter.get_spans(execution_id)
393424
self.span_exporter.clear(execution_id)
425+
self.span_collector.clear(execution_id)
394426

395427
logs = self.logs_exporter.get_logs(execution_id)
396428
self.logs_exporter.clear(execution_id)
Lines changed: 24 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,24 @@
1+
from collections import defaultdict
2+
from typing import Dict, List, Optional
3+
4+
from opentelemetry.sdk.trace import ReadableSpan, Span
5+
6+
7+
class ExecutionSpanCollector:
    """Collects spans per execution id as they are created during execution.

    Spans are registered at start time, so the stored Span objects may still
    be mutating while their execution is in flight.
    """

    def __init__(self):
        # { execution_id -> spans in start order }
        self._spans: Dict[str, List[ReadableSpan]] = defaultdict(list)

    def add_span(self, span: Span, execution_id: str) -> None:
        """Record *span* under *execution_id* (insertion order preserved)."""
        self._spans[execution_id].append(span)

    def get_spans(self, execution_id: str) -> List[ReadableSpan]:
        """Return a snapshot of the spans recorded for *execution_id*.

        A copy is returned so callers cannot mutate the internal list;
        unknown ids yield an empty list.
        """
        return list(self._spans.get(execution_id, ()))

    def clear(self, execution_id: Optional[str] = None) -> None:
        """Drop spans for *execution_id*, or everything when omitted.

        Uses an explicit ``is None`` check so a falsy-but-present id (e.g.
        the empty string) clears only its own entry, not the whole store.
        """
        if execution_id is None:
            self._spans.clear()
        else:
            self._spans.pop(execution_id, None)

src/uipath/_cli/_evals/mocks/llm_mocker.py

Lines changed: 16 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -6,6 +6,9 @@
66

77
from pydantic import BaseModel
88

9+
from uipath.tracing._traced import traced
10+
from uipath.tracing._utils import _SpanUtils
11+
912
from .._models._evaluation_set import (
1013
EvaluationItem,
1114
LLMMockingStrategy,
@@ -51,7 +54,7 @@
5154
3. Always include the entire output regardless of token length.
5255
3. Consider the context of the current test run and the agent being tested. If the agent is acting on a property, make sure the output includes that property.
5356
54-
Respond ONLY with valid JSON that would be a realistic and completetool response. Do not include any explanations or markdown.
57+
Respond ONLY with valid JSON that would be a realistic and complete tool response. Do not include any explanations or markdown.
5558
"""
5659

5760
logger = logging.getLogger(__name__)
@@ -79,6 +82,7 @@ def __init__(self, evaluation_item: EvaluationItem):
7982
self.evaluation_item = evaluation_item
8083
assert isinstance(self.evaluation_item.mocking_strategy, LLMMockingStrategy)
8184

85+
@traced(name="__mocker__")
8286
async def response(
8387
self, func: Callable[[T], R], params: dict[str, Any], *args: T, **kwargs
8488
) -> R:
@@ -92,6 +96,8 @@ async def response(
9296
from uipath import UiPath
9397
from uipath._services.llm_gateway_service import _cleanup_schema
9498

99+
from .mocks import evaluation_context, span_collector_context
100+
95101
llm = UiPath().llm
96102
return_type: Any = func.__annotations__.get("return", None)
97103
if return_type is None:
@@ -116,9 +122,17 @@ class OutputSchema(BaseModel):
116122
example_calls = [
117123
call for call in example_calls if isinstance(call, ExampleCall)
118124
]
125+
126+
test_run_history = "(empty)"
127+
eval_item = evaluation_context.get()
128+
span_collector = span_collector_context.get()
129+
if eval_item and span_collector:
130+
spans = span_collector.get_spans(eval_item.id)
131+
test_run_history = _SpanUtils.spans_to_llm_context(spans)
132+
119133
prompt_input: dict[str, Any] = {
120134
"toolRunExamples": example_calls,
121-
"testRunHistory": [], # This should contain ordered spans.
135+
"testRunHistory": test_run_history,
122136
"toolInfo": {
123137
"name": function_name,
124138
"description": params.get("description"),

src/uipath/_cli/_evals/mocks/mocks.py

Lines changed: 25 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -5,30 +5,49 @@
55
from typing import Any, Callable, Optional
66

77
from uipath._cli._evals._models._evaluation_set import EvaluationItem
8+
from uipath._cli._evals._span_collection import ExecutionSpanCollector
89
from uipath._cli._evals.mocks.mocker import Mocker, UiPathNoMockFoundError
910
from uipath._cli._evals.mocks.mocker_factory import MockerFactory
1011

12+
# Context variables for evaluation items and mockers
1113
evaluation_context: ContextVar[Optional[EvaluationItem]] = ContextVar(
1214
"evaluation", default=None
1315
)
1416

1517
mocker_context: ContextVar[Optional[Mocker]] = ContextVar("mocker", default=None)
1618

19+
# Span collector for trace access during mocking
20+
span_collector_context: ContextVar[Optional[ExecutionSpanCollector]] = ContextVar(
21+
"span_collector", default=None
22+
)
23+
1724
logger = logging.getLogger(__name__)
1825

1926

20-
def set_execution_context(
    eval_item: EvaluationItem, span_collector: ExecutionSpanCollector
) -> None:
    """Set the execution context for an evaluation run for mocking and trace access.

    Stores the evaluation item and span collector in context variables and,
    when the item declares a mocking strategy, builds the matching mocker.
    Mocker creation is best-effort: on failure the problem is logged and
    mocking is disabled for this run rather than aborting the evaluation.
    """
    evaluation_context.set(eval_item)

    try:
        if eval_item.mocking_strategy:
            mocker_context.set(MockerFactory.create(eval_item))
        else:
            mocker_context.set(None)
    except Exception:
        # exc_info=True preserves the traceback so a broken mocking strategy
        # is diagnosable; lazy %s args avoid formatting when warnings are off.
        logger.warning(
            "Failed to create mocker for evaluation %s",
            eval_item.name,
            exc_info=True,
        )
        mocker_context.set(None)

    span_collector_context.set(span_collector)
45+
def clear_execution_context() -> None:
    """Clear the execution context after evaluation completes."""
    # Reset every evaluation-scoped context variable back to its empty state.
    for ctx_var in (evaluation_context, mocker_context, span_collector_context):
        ctx_var.set(None)
50+
3251

3352
async def get_mocked_response(
3453
func: Callable[[Any], Any], params: dict[str, Any], *args, **kwargs

src/uipath/tracing/_utils.py

Lines changed: 52 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -319,3 +319,55 @@ def format_args_for_trace(
319319
f"Error formatting arguments for trace: {e}. Using args and kwargs directly."
320320
)
321321
return {"args": args, "kwargs": kwargs}
322+
323+
@staticmethod
def _has_ancestor_with_name(
    span: ReadableSpan, ancestor_name: str, span_map: Dict[int, ReadableSpan]
) -> bool:
    """Return True if *span* itself, or any ancestor reachable via *span_map*,
    is named *ancestor_name*.

    The walk stops (returning False) when a parent is not present in
    *span_map*, so only locally-collected ancestors are considered.
    """
    node: Optional[ReadableSpan] = span
    while node is not None:
        if node.name == ancestor_name:
            return True
        if node.parent is None:
            return False
        # Follow the parent link; a missing entry ends the traversal.
        node = span_map.get(node.parent.span_id)
    return False
341+
342+
@staticmethod
def spans_to_llm_context(spans: list[ReadableSpan]) -> str:
    """Convert spans to a formatted conversation history string suitable for LLM context.

    Emits one "Function / Input / Output" record per span that carries both
    ``input.value`` and ``output.value`` attributes, skipping spans that
    descend from an internal ``__mocker__`` span (LLM calls made to mock
    tools during evals). Returns "(empty)" when nothing qualifies.
    """
    # span_id -> span index used to walk parent chains.
    by_id = {s.get_span_context().span_id: s for s in spans}

    lines: list = []
    for s in spans:
        attrs = dict(s.attributes or {})
        inp = attrs.get("input.value")
        out = attrs.get("output.value")

        # Only spans with both an input and an output are useful history.
        if not inp or not out:
            continue

        # Skip spans that are internal LLM calls (eg. for tool mocking in evals)
        if _SpanUtils._has_ancestor_with_name(s, "__mocker__", by_id):
            continue

        lines.extend((f"Function: {s.name}", f"Input: {inp}", f"Output: {out}", ""))

    return "\n".join(lines) if lines else "(empty)"

tests/cli/eval/mocks/test_mocks.py

Lines changed: 12 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,5 @@
11
from typing import Any
2+
from unittest.mock import MagicMock
23

34
import pytest
45
from _pytest.monkeypatch import MonkeyPatch
@@ -10,9 +11,11 @@
1011
MockitoMockingStrategy,
1112
)
1213
from uipath._cli._evals.mocks.mocker import UiPathMockResponseGenerationError
13-
from uipath._cli._evals.mocks.mocks import set_evaluation_item
14+
from uipath._cli._evals.mocks.mocks import set_execution_context
1415
from uipath.eval.mocks import mockable
1516

17+
_mock_span_collector = MagicMock()
18+
1619

1720
def test_mockito_mockable_sync():
1821
# Arrange
@@ -51,7 +54,7 @@ def foofoo(*args, **kwargs):
5154
assert isinstance(evaluation.mocking_strategy, MockitoMockingStrategy)
5255

5356
# Act & Assert
54-
set_evaluation_item(evaluation)
57+
set_execution_context(evaluation, _mock_span_collector)
5558
assert foo() == "bar1"
5659
assert foo() == "bar2"
5760
assert foo() == "bar2"
@@ -63,13 +66,13 @@ def foofoo(*args, **kwargs):
6366
assert foofoo()
6467

6568
evaluation.mocking_strategy.behaviors[0].arguments.kwargs = {"x": 1}
66-
set_evaluation_item(evaluation)
69+
set_execution_context(evaluation, _mock_span_collector)
6770
assert foo(x=1) == "bar1"
6871

6972
evaluation.mocking_strategy.behaviors[0].arguments.kwargs = {
7073
"x": {"_target_": "mockito.any"}
7174
}
72-
set_evaluation_item(evaluation)
75+
set_execution_context(evaluation, _mock_span_collector)
7376
assert foo(x=2) == "bar1"
7477

7578

@@ -111,7 +114,7 @@ async def foofoo(*args, **kwargs):
111114
assert isinstance(evaluation.mocking_strategy, MockitoMockingStrategy)
112115

113116
# Act & Assert
114-
set_evaluation_item(evaluation)
117+
set_execution_context(evaluation, _mock_span_collector)
115118
assert await foo() == "bar1"
116119
assert await foo() == "bar2"
117120
assert await foo() == "bar2"
@@ -123,13 +126,13 @@ async def foofoo(*args, **kwargs):
123126
assert await foofoo()
124127

125128
evaluation.mocking_strategy.behaviors[0].arguments.kwargs = {"x": 1}
126-
set_evaluation_item(evaluation)
129+
set_execution_context(evaluation, _mock_span_collector)
127130
assert await foo(x=1) == "bar1"
128131

129132
evaluation.mocking_strategy.behaviors[0].arguments.kwargs = {
130133
"x": {"_target_": "mockito.any"}
131134
}
132-
set_evaluation_item(evaluation)
135+
set_execution_context(evaluation, _mock_span_collector)
133136
assert await foo(x=2) == "bar1"
134137

135138

@@ -201,7 +204,7 @@ def foofoo(*args, **kwargs):
201204
},
202205
)
203206
# Act & Assert
204-
set_evaluation_item(evaluation)
207+
set_execution_context(evaluation, _mock_span_collector)
205208

206209
assert foo() == "bar1"
207210
with pytest.raises(NotImplementedError):
@@ -274,7 +277,7 @@ async def foofoo(*args, **kwargs):
274277
},
275278
)
276279
# Act & Assert
277-
set_evaluation_item(evaluation)
280+
set_execution_context(evaluation, _mock_span_collector)
278281

279282
assert await foo() == "bar1"
280283
with pytest.raises(NotImplementedError):

0 commit comments

Comments
 (0)