
Commit f7b8ed5

feat(llmobs): add reasoning for custom evals (#14919)
## Description

Public change: adds `reasoning` as an argument to `submit_evaluation_for()` and `submit_evaluation()`. The argument carries an explanation of the evaluation result (i.e., why was the span marked as toxic?).

Internal change, not user-facing: the `assessment` field is now stored as a top-level field on the evaluation event rather than inside the nested `success_criteria` object that #14792 introduced. This is neither a breaking nor a user-facing change, since the nested form was never officially released on the product backend.

## Testing

Unit tests covering the new argument and its validation are included in `tests/llmobs/test_llmobs_service.py` (see the diff below).
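For illustration, a minimal usage sketch of the new argument, modeled on the tests in this commit (the span reference values are placeholders):

```python
from ddtrace.llmobs import LLMObs

# Placeholder span reference; in practice this identifies an existing
# LLM Observability span by its span_id/trace_id pair.
span = {"span_id": "123", "trace_id": "456"}

LLMObs.submit_evaluation(
    span=span,
    label="toxicity",
    metric_type="categorical",
    value="high",
    assessment="fail",
    # New in this commit: a free-form explanation of the evaluation result.
    reasoning="the content of the message involved profanity",
)
```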
1 parent c332ef9 commit f7b8ed5

File tree

5 files changed (+86, -8 lines)


ddtrace/llmobs/_llmobs.py

Lines changed: 11 additions & 1 deletion

```diff
@@ -1564,6 +1564,7 @@ def submit_evaluation_for(
         timestamp_ms: Optional[int] = None,
         metadata: Optional[Dict[str, object]] = None,
         assessment: Optional[str] = None,
+        reasoning: Optional[str] = None,
     ) -> None:
         """
         Submits a custom evaluation metric for a given span. This method is deprecated and will be
@@ -1584,6 +1585,7 @@ def submit_evaluation_for(
             timestamp_ms=timestamp_ms,
             metadata=metadata,
             assessment=assessment,
+            reasoning=reasoning,
         )
 
     @classmethod
@@ -1600,6 +1602,7 @@ def submit_evaluation(
         timestamp_ms: Optional[int] = None,
         metadata: Optional[Dict[str, object]] = None,
        assessment: Optional[str] = None,
+        reasoning: Optional[str] = None,
     ) -> None:
         """
         Submits a custom evaluation metric for a given span.
@@ -1622,6 +1625,7 @@ def submit_evaluation(
         :param dict metadata: A JSON serializable dictionary of key-value metadata pairs relevant to the
             evaluation metric.
         :param str assessment: An assessment of the validity of this evaluation. Must be either "pass" or "fail".
+        :param str reasoning: An explanation of the evaluation result.
         """
         if span_context is not None:
             log.warning(
@@ -1742,7 +1746,13 @@ def submit_evaluation(
                 error = "invalid_assessment"
                 log.warning("Failed to parse assessment. assessment must be either 'pass' or 'fail'.")
             else:
-                evaluation_metric["success_criteria"] = {"assessment": assessment}
+                evaluation_metric["assessment"] = assessment
+        if reasoning:
+            if not isinstance(reasoning, str):
+                error = "invalid_reasoning"
+                log.warning("Failed to parse reasoning. reasoning must be a string.")
+            else:
+                evaluation_metric["reasoning"] = reasoning
 
         if metadata:
             if not isinstance(metadata, dict):
```
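The new `reasoning` validation mirrors the existing `assessment` handling: a non-string value logs a warning and the field is simply not attached to the event. A minimal sketch of that behavior, assuming an enabled `LLMObs` service:

```python
# A non-string reasoning does not raise. It logs
# "Failed to parse reasoning. reasoning must be a string."
# and the "reasoning" field is left off the evaluation metric.
LLMObs.submit_evaluation(
    span={"span_id": "123", "trace_id": "456"},
    label="toxicity",
    metric_type="categorical",
    value="high",
    reasoning=123,  # invalid: int instead of str
)
```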

ddtrace/llmobs/_writer.py

Lines changed: 2 additions & 1 deletion

```diff
@@ -86,7 +86,8 @@ class LLMObsEvaluationMetricEvent(TypedDict, total=False):
     ml_app: str
     timestamp_ms: int
     tags: List[str]
-    success_criteria: Dict[str, str]
+    assessment: str
+    reasoning: str
 
 
 class LLMObsExperimentEvalMetricEvent(TypedDict, total=False):
```
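Since the TypedDict is declared with `total=False`, every key is optional. An illustrative event under the new flat schema (values are invented for this sketch, and the `join_on` contents are elided):

```python
# Illustrative LLMObsEvaluationMetricEvent payload after this change:
# "assessment" and "reasoning" are top-level keys, replacing the
# nested {"success_criteria": {"assessment": ...}} form.
event = {
    "join_on": {},  # span/trace join information, elided here
    "label": "toxicity",
    "metric_type": "categorical",
    "categorical_value": "high",
    "ml_app": "ml_app_override",
    "timestamp_ms": 1700000000000,
    "assessment": "fail",
    "reasoning": "the content of the message involved profanity",
}
```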
releasenotes (new file; path not shown in this view)

Lines changed: 4 additions & 0 deletions

```diff
@@ -0,0 +1,4 @@
+---
+features:
+  - |
+    LLM Observability: The ``LLMObs.submit_evaluation()`` and ``LLMObs.submit_evaluation_for()`` methods now accept a ``reasoning`` argument to denote an explanation of the evaluation results.
```

tests/llmobs/_utils.py

Lines changed: 6 additions & 3 deletions

```diff
@@ -275,7 +275,8 @@ def _expected_llmobs_eval_metric_event(
     boolean_value=None,
     tags=None,
     metadata=None,
-    success_criteria=None,
+    assessment=None,
+    reasoning=None,
 ):
     eval_metric_event = {
         "join_on": {},
@@ -300,8 +301,10 @@ def _expected_llmobs_eval_metric_event(
         eval_metric_event["boolean_value"] = boolean_value
     if tags is not None:
         eval_metric_event["tags"] = tags
-    if success_criteria is not None:
-        eval_metric_event["success_criteria"] = success_criteria
+    if assessment is not None:
+        eval_metric_event["assessment"] = assessment
+    if reasoning is not None:
+        eval_metric_event["reasoning"] = reasoning
     if timestamp_ms is not None:
         eval_metric_event["timestamp_ms"] = timestamp_ms
     else:
```
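A sketch of how the updated helper reads in an assertion (mirroring the test changes below; the keyword arguments all appear in this diff, though no single test passes `assessment` and `reasoning` together):

```python
# Build the expected writer event with the new top-level fields and
# assert the eval metric writer mock enqueued exactly that event.
expected = _expected_llmobs_eval_metric_event(
    ml_app="ml_app_override",
    span_id="123",
    trace_id="456",
    label="toxicity",
    metric_type="categorical",
    categorical_value="high",
    assessment="pass",
    reasoning="the content of the message involved profanity",
)
mock_llmobs_eval_metric_writer.enqueue.assert_called_with(expected)
```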

tests/llmobs/test_llmobs_service.py

Lines changed: 63 additions & 3 deletions

```diff
@@ -1870,7 +1870,7 @@ def test_submit_evaluation_invalid_assessment_raises_warning(llmobs, mock_llmobs_logs):
     )
 
 
-def test_submit_evaluation_enqueues_writer_with_success_criteria(llmobs, mock_llmobs_eval_metric_writer):
+def test_submit_evaluation_enqueues_writer_with_assessment(llmobs, mock_llmobs_eval_metric_writer):
     llmobs.submit_evaluation(
         span={"span_id": "123", "trace_id": "456"},
         label="toxicity",
@@ -1891,7 +1891,7 @@ def test_submit_evaluation_enqueues_writer_with_success_criteria(llmobs, mock_llmobs_eval_metric_writer):
             categorical_value="high",
             tags=["ddtrace.version:{}".format(ddtrace.__version__), "ml_app:ml_app_override", "foo:bar", "bee:baz"],
             metadata={"foo": ["bar", "baz"]},
-            success_criteria={"assessment": "pass"},
+            assessment="pass",
         )
     )
     mock_llmobs_eval_metric_writer.reset()
@@ -1914,7 +1914,67 @@ def test_submit_evaluation_enqueues_writer_with_success_criteria(llmobs, mock_llmobs_eval_metric_writer):
             metric_type="categorical",
             categorical_value="high",
             tags=["ddtrace.version:{}".format(ddtrace.__version__), "ml_app:ml_app_override", "foo:bar", "bee:baz"],
-            success_criteria={"assessment": "fail"},
+            assessment="fail",
+        )
+    )
+
+
+def test_submit_evaluation_invalid_reasoning_raises_warning(llmobs, mock_llmobs_logs):
+    llmobs.submit_evaluation(
+        span={"span_id": "123", "trace_id": "456"},
+        label="toxicity",
+        metric_type="categorical",
+        value="high",
+        reasoning=123,
+    )
+    mock_llmobs_logs.warning.assert_called_once_with("Failed to parse reasoning. reasoning must be a string.")
+
+
+def test_submit_evaluation_for_enqueues_writer_with_reasoning(llmobs, mock_llmobs_eval_metric_writer):
+    llmobs.submit_evaluation_for(
+        span={"span_id": "123", "trace_id": "456"},
+        label="toxicity",
+        metric_type="categorical",
+        value="high",
+        tags={"foo": "bar", "bee": "baz", "ml_app": "ml_app_override"},
+        ml_app="ml_app_override",
+        metadata={"foo": ["bar", "baz"]},
+        reasoning="the content of the message involved profanity",
+    )
+    mock_llmobs_eval_metric_writer.enqueue.assert_called_with(
+        _expected_llmobs_eval_metric_event(
+            ml_app="ml_app_override",
+            span_id="123",
+            trace_id="456",
+            label="toxicity",
+            metric_type="categorical",
+            categorical_value="high",
+            tags=["ddtrace.version:{}".format(ddtrace.__version__), "ml_app:ml_app_override", "foo:bar", "bee:baz"],
+            metadata={"foo": ["bar", "baz"]},
+            reasoning="the content of the message involved profanity",
+        )
+    )
+    mock_llmobs_eval_metric_writer.reset()
+    llmobs.submit_evaluation_for(
+        span={"span_id": "123", "trace_id": "456"},
+        label="toxicity",
+        metric_type="categorical",
+        value="low",
+        tags={"foo": "bar", "bee": "baz", "ml_app": "ml_app_override"},
+        ml_app="ml_app_override",
+        metadata="invalid",
+        reasoning="the content of the message did not involve profanity or hate speech or negativity",
+    )
+    mock_llmobs_eval_metric_writer.enqueue.assert_called_with(
+        _expected_llmobs_eval_metric_event(
+            ml_app="ml_app_override",
+            span_id="123",
+            trace_id="456",
+            label="toxicity",
+            metric_type="categorical",
+            categorical_value="low",
+            tags=["ddtrace.version:{}".format(ddtrace.__version__), "ml_app:ml_app_override", "foo:bar", "bee:baz"],
+            reasoning="the content of the message did not involve profanity or hate speech or negativity",
+        )
+    )
 
```