Commit 28fecf2

chore(llmobs): [MLOB-4041] deprecate submit_evaluation_for (#14772)
## Description

This PR renames the `LLMObs.submit_evaluation_for` method to `LLMObs.submit_evaluation` and makes the original `LLMObs.submit_evaluation_for` method a wrapper around `LLMObs.submit_evaluation`. The `span_context` argument on the original `LLMObs.submit_evaluation` method has also been kept, but a warning is logged if it is used.

Note that this may be confusing to customers, since our docs currently contain a section saying that `LLMObs.submit_evaluation` is deprecated; we should update these public-facing docs before these changes are released:

<img width="1426" height="346" alt="image" src="https://github.com/user-attachments/assets/80730c76-1252-4dc0-8ae2-5d8b3cdde9f6" />

## New behavior

Calling `LLMObs.submit_evaluation_for` results in a warning log:

```
2025-10-15 12:03:00,798 WARNING [ddtrace.llmobs._llmobs] [_llmobs.py:1555] [dd.service=nicole-test dd.env=nicole-test dd.version= dd.trace_id=68ef7154000000004b3847c0ff710294 dd.span_id=11326788719315560003] - LLMObs.submit_evaluation_for() is deprecated and will be removed in a future version. Please use LLMObs.submit_evaluation() instead.
```

Passing the `span_context` argument to `LLMObs.submit_evaluation` also results in a warning log:

```
2025-10-15 12:01:48,326 WARNING [ddtrace.llmobs._llmobs] [_llmobs.py:1607] [dd.service=nicole-test dd.env=nicole-test dd.version= dd.trace_id=68ef710c00000000421a9c204de31806 dd.span_id=17013630125099479547] - The `span_context` parameter is deprecated and will be removed in a future version. Please use `span` or `span_with_tag_value` instead.
```

## Testing

<!-- Describe your testing strategy or note what tests are included -->

## Risks

<!-- Note any risks associated with this change, or "None" if no risks -->

## Additional Notes

<!-- Any other information that would be helpful for reviewers -->
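For reference, a minimal before/after sketch of the rename (the `label`, `metric_type`, and `value` arguments are illustrative placeholders, and LLMObs is assumed to be enabled):

```python
from ddtrace.llmobs import LLMObs

# Placeholder span identifiers; in practice this dict can come from LLMObs.export_span().
ctx = {"span_id": "<span_id>", "trace_id": "<trace_id>"}

# Before: deprecated name, now logs the warning shown above and delegates.
LLMObs.submit_evaluation_for(span=ctx, label="accuracy", metric_type="score", value=0.9)

# After: same arguments, new name.
LLMObs.submit_evaluation(span=ctx, label="accuracy", metric_type="score", value=0.9)
```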
1 parent 54a3a0e commit 28fecf2

File tree

6 files changed: +126 −645 lines changed

ddtrace/llmobs/_evaluators/ragas/base.py

Lines changed: 1 addition & 1 deletion
```diff
@@ -170,7 +170,7 @@ def run_and_submit_evaluation(self, span_event: dict):
             )
         if isinstance(score_result_or_failure, float):
             self.llmobs_service.submit_evaluation(
-                span_context={"trace_id": span_event.get("trace_id"), "span_id": span_event.get("span_id")},
+                span={"trace_id": span_event.get("trace_id"), "span_id": span_event.get("span_id")},
                 label=self.LABEL,
                 metric_type=self.METRIC_TYPE,
                 value=score_result_or_failure,
```

ddtrace/llmobs/_llmobs.py

Lines changed: 56 additions & 148 deletions
```diff
@@ -1564,6 +1564,42 @@ def submit_evaluation_for(
         timestamp_ms: Optional[int] = None,
         metadata: Optional[Dict[str, object]] = None,
         assessment: Optional[str] = None,
+    ) -> None:
+        """
+        Submits a custom evaluation metric for a given span. This method is deprecated and will be
+        removed in the next major version of ddtrace (4.0). Please use `LLMObs.submit_evaluation()` instead.
+        """
+        log.warning(
+            "LLMObs.submit_evaluation_for() is deprecated and will be removed in the next major "
+            "version of ddtrace (4.0). Please use LLMObs.submit_evaluation() instead."
+        )
+        return cls.submit_evaluation(
+            label=label,
+            metric_type=metric_type,
+            value=value,
+            span=span,
+            span_with_tag_value=span_with_tag_value,
+            tags=tags,
+            ml_app=ml_app,
+            timestamp_ms=timestamp_ms,
+            metadata=metadata,
+            assessment=assessment,
+        )
+
+    @classmethod
+    def submit_evaluation(
+        cls,
+        label: str,
+        metric_type: str,
+        value: Union[str, int, float, bool],
+        span_context: Optional[Dict[str, str]] = None,
+        span: Optional[dict] = None,
+        span_with_tag_value: Optional[Dict[str, str]] = None,
+        tags: Optional[Dict[str, str]] = None,
+        ml_app: Optional[str] = None,
+        timestamp_ms: Optional[int] = None,
+        metadata: Optional[Dict[str, object]] = None,
+        assessment: Optional[str] = None,
     ) -> None:
         """
         Submits a custom evaluation metric for a given span.
@@ -1572,6 +1608,9 @@ def submit_evaluation_for(
         :param str metric_type: The type of the evaluation metric. One of "categorical", "score", "boolean".
         :param value: The value of the evaluation metric.
                       Must be a string (categorical), integer (score), float (score), or boolean (boolean).
+        :param dict span_context: A dictionary containing the span_id and trace_id of interest. This is a
+                                  deprecated parameter and will be removed in the next major version of
+                                  ddtrace (4.0). Please use `span` or `span_with_tag_value` instead.
         :param dict span: A dictionary of shape {'span_id': str, 'trace_id': str} uniquely identifying
                           the span associated with this evaluation.
         :param dict span_with_tag_value: A dictionary with the format {'tag_key': str, 'tag_value': str}
@@ -1584,9 +1623,16 @@ def submit_evaluation_for(
                               evaluation metric.
         :param str assessment: An assessment of the validity of this evaluation. Must be either "pass" or "fail".
         """
+        if span_context is not None:
+            log.warning(
+                "The `span_context` parameter is deprecated and will be removed in the next major version of "
+                "ddtrace (4.0). Please use `span` or `span_with_tag_value` instead."
+            )
+            span = span or span_context
+
         if cls.enabled is False:
             log.debug(
-                "LLMObs.submit_evaluation_for() called when LLMObs is not enabled. ",
+                "LLMObs.submit_evaluation() called when LLMObs is not enabled. ",
                 "Evaluation metric data will not be sent.",
             )
             return
@@ -1659,6 +1705,15 @@ def submit_evaluation_for(
             log.warning("tags must be a dictionary of string key-value pairs.")
             tags = {}
 
+        ml_app = ml_app if ml_app else config._llmobs_ml_app
+        if not ml_app:
+            error = "missing_ml_app"
+            log.warning(
+                "ML App name is required for sending evaluation metrics. Evaluation metric data will not be sent. "
+                "Ensure this configuration is set before running your application."
+            )
+            return
+
         evaluation_tags = {
             "ddtrace.version": ddtrace.__version__,
             "ml_app": ml_app,
@@ -1672,15 +1727,6 @@ def submit_evaluation_for(
             error = "invalid_tags"
             log.warning("Failed to parse tags. Tags for evaluation metrics must be strings.")
 
-        ml_app = ml_app if ml_app else config._llmobs_ml_app
-        if not ml_app:
-            error = "missing_ml_app"
-            log.warning(
-                "ML App name is required for sending evaluation metrics. Evaluation metric data will not be sent. "
-                "Ensure this configuration is set before running your application."
-            )
-            return
-
         evaluation_metric: LLMObsEvaluationMetricEvent = {
             "join_on": join_on,
             "label": str(label),
@@ -1711,144 +1757,6 @@ def submit_evaluation_for(
         finally:
             telemetry.record_llmobs_submit_evaluation(join_on, metric_type, error)
 
-    @classmethod
-    def submit_evaluation(
-        cls,
-        span_context: Dict[str, str],
-        label: str,
-        metric_type: str,
-        value: Union[str, int, float, bool],
-        tags: Optional[Dict[str, str]] = None,
-        ml_app: Optional[str] = None,
-        timestamp_ms: Optional[int] = None,
-        metadata: Optional[Dict[str, object]] = None,
-    ) -> None:
-        """
-        Submits a custom evaluation metric for a given span ID and trace ID.
-
-        :param span_context: A dictionary containing the span_id and trace_id of interest.
-        :param str label: The name of the evaluation metric.
-        :param str metric_type: The type of the evaluation metric. One of "categorical", "score", "boolean".
-        :param value: The value of the evaluation metric.
-                      Must be a string (categorical), integer (score), float (score), or boolean (boolean).
-        :param tags: A dictionary of string key-value pairs to tag the evaluation metric with.
-        :param str ml_app: The name of the ML application
-        :param int timestamp_ms: The timestamp in milliseconds when the evaluation metric result was generated.
-        :param dict metadata: A JSON serializable dictionary of key-value metadata pairs relevant to the
-                              evaluation metric.
-        """
-        if cls.enabled is False:
-            log.debug(
-                "LLMObs.submit_evaluation() called when LLMObs is not enabled. Evaluation metric data will not be sent."
-            )
-            return
-        error = None
-        try:
-            if not isinstance(span_context, dict):
-                error = "invalid_span"
-                log.warning(
-                    "span_context must be a dictionary containing both span_id and trace_id keys. "
-                    "LLMObs.export_span() can be used to generate this dictionary from a given span."
-                )
-                return
-
-            ml_app = ml_app if ml_app else config._llmobs_ml_app
-            if not ml_app:
-                error = "missing_ml_app"
-                log.warning(
-                    "ML App name is required for sending evaluation metrics. Evaluation metric data will not be sent. "
-                    "Ensure this configuration is set before running your application."
-                )
-                return
-
-            timestamp_ms = timestamp_ms if timestamp_ms else int(time.time() * 1000)
-
-            if not isinstance(timestamp_ms, int) or timestamp_ms < 0:
-                error = "invalid_timestamp"
-                log.warning("timestamp_ms must be a non-negative integer. Evaluation metric data will not be sent")
-                return
-
-            span_id = span_context.get("span_id")
-            trace_id = span_context.get("trace_id")
-            if not (span_id and trace_id):
-                error = "invalid_span"
-                log.warning(
-                    "span_id and trace_id must both be specified for the given evaluation metric to be submitted."
-                )
-                return
-            if not label:
-                error = "invalid_metric_label"
-                log.warning("label must be the specified name of the evaluation metric.")
-                return
-
-            if not metric_type or metric_type.lower() not in ("categorical", "numerical", "score", "boolean"):
-                error = "invalid_metric_type"
-                log.warning("metric_type must be one of 'categorical', 'score', or 'boolean'.")
-                return
-
-            metric_type = metric_type.lower()
-            if metric_type == "numerical":
-                error = "invalid_metric_type"
-                log.warning(
-                    "The evaluation metric type 'numerical' is unsupported. Use 'score' instead. "
-                    "Converting `numerical` metric to `score` type."
-                )
-                metric_type = "score"
-
-            if metric_type == "categorical" and not isinstance(value, str):
-                error = "invalid_metric_value"
-                log.warning("value must be a string for a categorical metric.")
-                return
-            if metric_type == "score" and not isinstance(value, (int, float)):
-                error = "invalid_metric_value"
-                log.warning("value must be an integer or float for a score metric.")
-                return
-            if metric_type == "boolean" and not isinstance(value, bool):
-                error = "invalid_metric_value"
-                log.warning("value must be a boolean for a boolean metric.")
-                return
-            if tags is not None and not isinstance(tags, dict):
-                error = "invalid_tags"
-                log.warning("tags must be a dictionary of string key-value pairs.")
-                return
-
-            # initialize tags with default values that will be overridden by user-provided tags
-            evaluation_tags = {
-                "ddtrace.version": ddtrace.__version__,
-                "ml_app": ml_app,
-            }
-
-            if tags:
-                for k, v in tags.items():
-                    try:
-                        evaluation_tags[ensure_text(k)] = ensure_text(v)
-                    except TypeError:
-                        error = "invalid_tags"
-                        log.warning("Failed to parse tags. Tags for evaluation metrics must be strings.")
-
-            evaluation_metric: LLMObsEvaluationMetricEvent = {
-                "join_on": {"span": {"span_id": span_id, "trace_id": trace_id}},
-                "label": str(label),
-                "metric_type": metric_type.lower(),
-                "timestamp_ms": timestamp_ms,
-                "{}_value".format(metric_type): value,  # type: ignore
-                "ml_app": ml_app,
-                "tags": ["{}:{}".format(k, v) for k, v in evaluation_tags.items()],
-            }
-
-            if metadata:
-                if not isinstance(metadata, dict):
-                    error = "invalid_metadata"
-                    log.warning("metadata must be json serializable dictionary.")
-                else:
-                    metadata = safe_json(metadata)
-                    if metadata and isinstance(metadata, str):
-                        evaluation_metric["metadata"] = json.loads(metadata)
-
-            cls._instance._llmobs_eval_metric_writer.enqueue(evaluation_metric)
-        finally:
-            telemetry.record_llmobs_submit_evaluation({"span": span_context}, metric_type, error)
-
     @classmethod
     def _inject_llmobs_context(cls, span_context: Context, request_headers: Dict[str, str]) -> None:
         if cls.enabled is False:
```
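Given the `span = span or span_context` fallback above, both spellings reach the same join logic; a hedged sketch of the caller-visible difference (placeholder IDs, LLMObs assumed enabled):

```python
from ddtrace.llmobs import LLMObs

ctx = {"span_id": "<span_id>", "trace_id": "<trace_id>"}

# Deprecated keyword: still accepted, but emits the `span_context` deprecation warning.
LLMObs.submit_evaluation(span_context=ctx, label="faithfulness", metric_type="score", value=1.0)

# Preferred keyword: identical behavior, no warning.
LLMObs.submit_evaluation(span=ctx, label="faithfulness", metric_type="score", value=1.0)
```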
Lines changed: 9 additions & 0 deletions
```diff
@@ -0,0 +1,9 @@
+deprecations:
+  - |
+    LLM Observability: ``LLMObs.submit_evaluation_for()`` has been deprecated and will be removed in a future version.
+    It will be replaced with ``LLMObs.submit_evaluation()``, which will take the signature of the original ``LLMObs.submit_evaluation_for()``
+    method in ddtrace version 4.0. Please use ``LLMObs.submit_evaluation()`` for submitting evaluations moving forward.
+    To migrate:
+    - ``LLMObs.submit_evaluation_for(...)`` users: rename to ``LLMObs.submit_evaluation(...)``
+    - ``LLMObs.submit_evaluation(...)`` users: rename the ``span_context`` argument to ``span``, i.e. change
+      ``LLMObs.submit_evaluation(span_context={"span_id": ..., "trace_id": ...}, ...)`` to ``LLMObs.submit_evaluation(span={"span_id": ..., "trace_id": ...}, ...)``
```
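Besides `span`, the retained signature also accepts `span_with_tag_value` for identifying the span by a tag rather than by explicit IDs; a sketch using hypothetical tag values:

```python
from ddtrace.llmobs import LLMObs

# "session_id" / "abc-123" are hypothetical examples of a tag key and value;
# per the docstring, the dict has the shape {'tag_key': str, 'tag_value': str}.
LLMObs.submit_evaluation(
    span_with_tag_value={"tag_key": "session_id", "tag_value": "abc-123"},
    label="helpfulness",
    metric_type="categorical",
    value="good",
)
```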

tests/llmobs/_utils.py

Lines changed: 1 addition & 1 deletion
```diff
@@ -608,7 +608,7 @@ def __init__(self, llmobs_service, label="dummy"):
 
     def run_and_submit_evaluation(self, span):
         self.llmobs_service.submit_evaluation(
-            span_context=span,
+            span=span,
             label=self.LABEL,
             value=1.0,
             metric_type="score",
```

tests/llmobs/test_llmobs_ragas_evaluators.py

Lines changed: 9 additions & 9 deletions
```diff
@@ -109,7 +109,7 @@ def test_ragas_faithfulness_submits_evaluation(ragas, llmobs, mock_llmobs_submit
     rf_evaluator.llmobs_service.submit_evaluation.assert_has_calls(
         [
             mock.call(
-                span_context={
+                span={
                     "span_id": llm_span.get("span_id"),
                     "trace_id": llm_span.get("trace_id"),
                 },
@@ -137,7 +137,7 @@ def test_ragas_faithfulness_submits_evaluation_on_span_with_question_in_messages
     rf_evaluator.llmobs_service.submit_evaluation.assert_has_calls(
         [
             mock.call(
-                span_context={
+                span={
                     "span_id": llm_span.get("span_id"),
                     "trace_id": llm_span.get("trace_id"),
                 },
@@ -176,7 +176,7 @@ def test_ragas_faithfulness_submits_evaluation_on_span_with_custom_keys(ragas, l
     rf_evaluator.llmobs_service.submit_evaluation.assert_has_calls(
         [
             mock.call(
-                span_context={
+                span={
                     "span_id": llm_span.get("span_id"),
                     "trace_id": llm_span.get("trace_id"),
                 },
@@ -293,7 +293,7 @@ def test_ragas_context_precision_submits_evaluation(ragas, llmobs, mock_llmobs_s
     rcp_evaluator.llmobs_service.submit_evaluation.assert_has_calls(
         [
             mock.call(
-                span_context={
+                span={
                     "span_id": llm_span.get("span_id"),
                     "trace_id": llm_span.get("trace_id"),
                 },
@@ -320,7 +320,7 @@ def test_ragas_context_precision_submits_evaluation_on_span_with_question_in_mes
     rcp_evaluator.llmobs_service.submit_evaluation.assert_has_calls(
         [
             mock.call(
-                span_context={
+                span={
                     "span_id": llm_span.get("span_id"),
                     "trace_id": llm_span.get("trace_id"),
                 },
@@ -359,7 +359,7 @@ def test_ragas_context_precision_submits_evaluation_on_span_with_custom_keys(
     rcp_evaluator.llmobs_service.submit_evaluation.assert_has_calls(
         [
             mock.call(
-                span_context={
+                span={
                     "span_id": llm_span.get("span_id"),
                     "trace_id": llm_span.get("trace_id"),
                 },
@@ -474,7 +474,7 @@ def test_ragas_answer_relevancy_submits_evaluation(
     rar_evaluator.llmobs_service.submit_evaluation.assert_has_calls(
         [
             mock.call(
-                span_context={
+                span={
                     "span_id": llm_span.get("span_id"),
                     "trace_id": llm_span.get("trace_id"),
                 },
@@ -500,7 +500,7 @@ def test_ragas_answer_relevancy_submits_evaluation_on_span_with_question_in_mess
     rar_evaluator.llmobs_service.submit_evaluation.assert_has_calls(
         [
             mock.call(
-                span_context={
+                span={
                     "span_id": llm_span.get("span_id"),
                     "trace_id": llm_span.get("trace_id"),
                 },
@@ -538,7 +538,7 @@ def test_ragas_answer_relevancy_submits_evaluation_on_span_with_custom_keys(
     rar_evaluator.llmobs_service.submit_evaluation.assert_has_calls(
        [
             mock.call(
-                span_context={
+                span={
                     "span_id": llm_span.get("span_id"),
                     "trace_id": llm_span.get("trace_id"),
                 },
```
