Commit 28fecf2

chore(llmobs): [MLOB-4041] deprecate submit_evaluation_for (#14772)
## Description

This PR renames the `LLMObs.submit_evaluation_for` method to `LLMObs.submit_evaluation` and makes the original `LLMObs.submit_evaluation_for` method a wrapper around `LLMObs.submit_evaluation`. The `span_context` argument on the original `LLMObs.submit_evaluation` method has also been kept, but a warning is logged if it is used.

Note that this may be confusing to customers, since our docs currently contain a section saying that `LLMObs.submit_evaluation` is deprecated; we should update these public-facing docs before these changes are released:

<img width="1426" height="346" alt="image" src="https://github.com/user-attachments/assets/80730c76-1252-4dc0-8ae2-5d8b3cdde9f6" />

## New behavior

Calling `LLMObs.submit_evaluation_for` results in a warning log:

```
2025-10-15 12:03:00,798 WARNING [ddtrace.llmobs._llmobs] [_llmobs.py:1555] [dd.service=nicole-test dd.env=nicole-test dd.version= dd.trace_id=68ef7154000000004b3847c0ff710294 dd.span_id=11326788719315560003] - LLMObs.submit_evaluation_for() is deprecated and will be removed in a future version. Please use LLMObs.submit_evaluation() instead.
```

Passing the `span_context` argument to `LLMObs.submit_evaluation` also results in a warning log:

```
2025-10-15 12:01:48,326 WARNING [ddtrace.llmobs._llmobs] [_llmobs.py:1607] [dd.service=nicole-test dd.env=nicole-test dd.version= dd.trace_id=68ef710c00000000421a9c204de31806 dd.span_id=17013630125099479547] - The `span_context` parameter is deprecated and will be removed in a future version. Please use `span` or `span_with_tag_value` instead.
```

## Testing

<!-- Describe your testing strategy or note what tests are included -->

## Risks

<!-- Note any risks associated with this change, or "None" if no risks -->

## Additional Notes

<!-- Any other information that would be helpful for reviewers -->
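For reference, a minimal before/after sketch of the rename (the `label`, `metric_type`, and `value` arguments are illustrative placeholders, and LLMObs is assumed to be enabled):

```python
from ddtrace.llmobs import LLMObs

# Placeholder span identifiers; in practice this dict can come from LLMObs.export_span().
ctx = {"span_id": "<span_id>", "trace_id": "<trace_id>"}

# Before: deprecated name, now logs the warning shown above and delegates.
LLMObs.submit_evaluation_for(span=ctx, label="accuracy", metric_type="score", value=0.9)

# After: same arguments, new name.
LLMObs.submit_evaluation(span=ctx, label="accuracy", metric_type="score", value=0.9)
```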
1 parent 54a3a0e commit 28fecf2

File tree

6 files changed: +126 −645 lines changed

ddtrace/llmobs/_evaluators/ragas/base.py

Lines changed: 1 addition & 1 deletion
```diff
@@ -170,7 +170,7 @@ def run_and_submit_evaluation(self, span_event: dict):
             )
         if isinstance(score_result_or_failure, float):
             self.llmobs_service.submit_evaluation(
-                span_context={"trace_id": span_event.get("trace_id"), "span_id": span_event.get("span_id")},
+                span={"trace_id": span_event.get("trace_id"), "span_id": span_event.get("span_id")},
                 label=self.LABEL,
                 metric_type=self.METRIC_TYPE,
                 value=score_result_or_failure,
```

ddtrace/llmobs/_llmobs.py

Lines changed: 56 additions & 148 deletions
```diff
@@ -1564,6 +1564,42 @@ def submit_evaluation_for(
         timestamp_ms: Optional[int] = None,
         metadata: Optional[Dict[str, object]] = None,
         assessment: Optional[str] = None,
+    ) -> None:
+        """
+        Submits a custom evaluation metric for a given span. This method is deprecated and will be
+        removed in the next major version of ddtrace (4.0). Please use `LLMObs.submit_evaluation()` instead.
+        """
+        log.warning(
+            "LLMObs.submit_evaluation_for() is deprecated and will be removed in the next major "
+            "version of ddtrace (4.0). Please use LLMObs.submit_evaluation() instead."
+        )
+        return cls.submit_evaluation(
+            label=label,
+            metric_type=metric_type,
+            value=value,
+            span=span,
+            span_with_tag_value=span_with_tag_value,
+            tags=tags,
+            ml_app=ml_app,
+            timestamp_ms=timestamp_ms,
+            metadata=metadata,
+            assessment=assessment,
+        )
+
+    @classmethod
+    def submit_evaluation(
+        cls,
+        label: str,
+        metric_type: str,
+        value: Union[str, int, float, bool],
+        span_context: Optional[Dict[str, str]] = None,
+        span: Optional[dict] = None,
+        span_with_tag_value: Optional[Dict[str, str]] = None,
+        tags: Optional[Dict[str, str]] = None,
+        ml_app: Optional[str] = None,
+        timestamp_ms: Optional[int] = None,
+        metadata: Optional[Dict[str, object]] = None,
+        assessment: Optional[str] = None,
     ) -> None:
         """
         Submits a custom evaluation metric for a given span.
@@ -1572,6 +1608,9 @@ def submit_evaluation_for(
         :param str metric_type: The type of the evaluation metric. One of "categorical", "score", "boolean".
         :param value: The value of the evaluation metric.
                       Must be a string (categorical), integer (score), float (score), or boolean (boolean).
+        :param dict span_context: A dictionary containing the span_id and trace_id of interest. This is a
+                                  deprecated parameter and will be removed in the next major version of
+                                  ddtrace (4.0). Please use `span` or `span_with_tag_value` instead.
         :param dict span: A dictionary of shape {'span_id': str, 'trace_id': str} uniquely identifying
                           the span associated with this evaluation.
         :param dict span_with_tag_value: A dictionary with the format {'tag_key': str, 'tag_value': str}
@@ -1584,9 +1623,16 @@ def submit_evaluation_for(
                               evaluation metric.
         :param str assessment: An assessment of the validity of this evaluation. Must be either "pass" or "fail".
         """
+        if span_context is not None:
+            log.warning(
+                "The `span_context` parameter is deprecated and will be removed in the next major version of "
+                "ddtrace (4.0). Please use `span` or `span_with_tag_value` instead."
+            )
+            span = span or span_context
+
         if cls.enabled is False:
             log.debug(
-                "LLMObs.submit_evaluation_for() called when LLMObs is not enabled. ",
+                "LLMObs.submit_evaluation() called when LLMObs is not enabled. ",
                 "Evaluation metric data will not be sent.",
             )
             return
@@ -1659,6 +1705,15 @@ def submit_evaluation_for(
             log.warning("tags must be a dictionary of string key-value pairs.")
             tags = {}
 
+        ml_app = ml_app if ml_app else config._llmobs_ml_app
+        if not ml_app:
+            error = "missing_ml_app"
+            log.warning(
+                "ML App name is required for sending evaluation metrics. Evaluation metric data will not be sent. "
+                "Ensure this configuration is set before running your application."
+            )
+            return
+
         evaluation_tags = {
             "ddtrace.version": ddtrace.__version__,
             "ml_app": ml_app,
@@ -1672,15 +1727,6 @@ def submit_evaluation_for(
             error = "invalid_tags"
             log.warning("Failed to parse tags. Tags for evaluation metrics must be strings.")
 
-        ml_app = ml_app if ml_app else config._llmobs_ml_app
-        if not ml_app:
-            error = "missing_ml_app"
-            log.warning(
-                "ML App name is required for sending evaluation metrics. Evaluation metric data will not be sent. "
-                "Ensure this configuration is set before running your application."
-            )
-            return
-
         evaluation_metric: LLMObsEvaluationMetricEvent = {
             "join_on": join_on,
             "label": str(label),
@@ -1711,144 +1757,6 @@ def submit_evaluation_for(
         finally:
             telemetry.record_llmobs_submit_evaluation(join_on, metric_type, error)
 
-    @classmethod
-    def submit_evaluation(
-        cls,
-        span_context: Dict[str, str],
-        label: str,
-        metric_type: str,
-        value: Union[str, int, float, bool],
-        tags: Optional[Dict[str, str]] = None,
-        ml_app: Optional[str] = None,
-        timestamp_ms: Optional[int] = None,
-        metadata: Optional[Dict[str, object]] = None,
-    ) -> None:
-        """
-        Submits a custom evaluation metric for a given span ID and trace ID.
-
-        :param span_context: A dictionary containing the span_id and trace_id of interest.
-        :param str label: The name of the evaluation metric.
-        :param str metric_type: The type of the evaluation metric. One of "categorical", "score", "boolean".
-        :param value: The value of the evaluation metric.
-                      Must be a string (categorical), integer (score), float (score), or boolean (boolean).
-        :param tags: A dictionary of string key-value pairs to tag the evaluation metric with.
-        :param str ml_app: The name of the ML application
-        :param int timestamp_ms: The timestamp in milliseconds when the evaluation metric result was generated.
-        :param dict metadata: A JSON serializable dictionary of key-value metadata pairs relevant to the
-                              evaluation metric.
-        """
-        if cls.enabled is False:
-            log.debug(
-                "LLMObs.submit_evaluation() called when LLMObs is not enabled. Evaluation metric data will not be sent."
-            )
-            return
-        error = None
-        try:
-            if not isinstance(span_context, dict):
-                error = "invalid_span"
-                log.warning(
-                    "span_context must be a dictionary containing both span_id and trace_id keys. "
-                    "LLMObs.export_span() can be used to generate this dictionary from a given span."
-                )
-                return
-
-            ml_app = ml_app if ml_app else config._llmobs_ml_app
-            if not ml_app:
-                error = "missing_ml_app"
-                log.warning(
-                    "ML App name is required for sending evaluation metrics. Evaluation metric data will not be sent. "
-                    "Ensure this configuration is set before running your application."
-                )
-                return
-
-            timestamp_ms = timestamp_ms if timestamp_ms else int(time.time() * 1000)
-
-            if not isinstance(timestamp_ms, int) or timestamp_ms < 0:
-                error = "invalid_timestamp"
-                log.warning("timestamp_ms must be a non-negative integer. Evaluation metric data will not be sent")
-                return
-
-            span_id = span_context.get("span_id")
-            trace_id = span_context.get("trace_id")
-            if not (span_id and trace_id):
-                error = "invalid_span"
-                log.warning(
-                    "span_id and trace_id must both be specified for the given evaluation metric to be submitted."
-                )
-                return
-            if not label:
-                error = "invalid_metric_label"
-                log.warning("label must be the specified name of the evaluation metric.")
-                return
-
-            if not metric_type or metric_type.lower() not in ("categorical", "numerical", "score", "boolean"):
-                error = "invalid_metric_type"
-                log.warning("metric_type must be one of 'categorical', 'score', or 'boolean'.")
-                return
-
-            metric_type = metric_type.lower()
-            if metric_type == "numerical":
-                error = "invalid_metric_type"
-                log.warning(
-                    "The evaluation metric type 'numerical' is unsupported. Use 'score' instead. "
-                    "Converting `numerical` metric to `score` type."
-                )
-                metric_type = "score"
-
-            if metric_type == "categorical" and not isinstance(value, str):
-                error = "invalid_metric_value"
-                log.warning("value must be a string for a categorical metric.")
-                return
-            if metric_type == "score" and not isinstance(value, (int, float)):
-                error = "invalid_metric_value"
-                log.warning("value must be an integer or float for a score metric.")
-                return
-            if metric_type == "boolean" and not isinstance(value, bool):
-                error = "invalid_metric_value"
-                log.warning("value must be a boolean for a boolean metric.")
-                return
-            if tags is not None and not isinstance(tags, dict):
-                error = "invalid_tags"
-                log.warning("tags must be a dictionary of string key-value pairs.")
-                return
-
-            # initialize tags with default values that will be overridden by user-provided tags
-            evaluation_tags = {
-                "ddtrace.version": ddtrace.__version__,
-                "ml_app": ml_app,
-            }
-
-            if tags:
-                for k, v in tags.items():
-                    try:
-                        evaluation_tags[ensure_text(k)] = ensure_text(v)
-                    except TypeError:
-                        error = "invalid_tags"
-                        log.warning("Failed to parse tags. Tags for evaluation metrics must be strings.")
-
-            evaluation_metric: LLMObsEvaluationMetricEvent = {
-                "join_on": {"span": {"span_id": span_id, "trace_id": trace_id}},
-                "label": str(label),
-                "metric_type": metric_type.lower(),
-                "timestamp_ms": timestamp_ms,
-                "{}_value".format(metric_type): value,  # type: ignore
-                "ml_app": ml_app,
-                "tags": ["{}:{}".format(k, v) for k, v in evaluation_tags.items()],
-            }
-
-            if metadata:
-                if not isinstance(metadata, dict):
-                    error = "invalid_metadata"
-                    log.warning("metadata must be json serializable dictionary.")
-                else:
-                    metadata = safe_json(metadata)
-                    if metadata and isinstance(metadata, str):
-                        evaluation_metric["metadata"] = json.loads(metadata)
-
-            cls._instance._llmobs_eval_metric_writer.enqueue(evaluation_metric)
-        finally:
-            telemetry.record_llmobs_submit_evaluation({"span": span_context}, metric_type, error)
-
     @classmethod
     def _inject_llmobs_context(cls, span_context: Context, request_headers: Dict[str, str]) -> None:
         if cls.enabled is False:
```
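Given the `span = span or span_context` fallback above, both spellings reach the same join logic; a hedged sketch of the caller-visible difference (placeholder IDs, LLMObs assumed enabled):

```python
from ddtrace.llmobs import LLMObs

ctx = {"span_id": "<span_id>", "trace_id": "<trace_id>"}

# Deprecated keyword: still accepted, but emits the `span_context` deprecation warning.
LLMObs.submit_evaluation(span_context=ctx, label="faithfulness", metric_type="score", value=1.0)

# Preferred keyword: identical behavior, no warning.
LLMObs.submit_evaluation(span=ctx, label="faithfulness", metric_type="score", value=1.0)
```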
Lines changed: 9 additions & 0 deletions
```diff
@@ -0,0 +1,9 @@
+deprecations:
+  - |
+    LLM Observability: ``LLMObs.submit_evaluation_for()`` has been deprecated and will be removed in a future version.
+    It will be replaced with ``LLMObs.submit_evaluation()``, which will take the signature of the original ``LLMObs.submit_evaluation_for()``
+    method in ddtrace version 4.0. Please use ``LLMObs.submit_evaluation()`` for submitting evaluations moving forward.
+    To migrate:
+    - ``LLMObs.submit_evaluation_for(...)`` users: rename to ``LLMObs.submit_evaluation(...)``
+    - ``LLMObs.submit_evaluation(...)`` users: rename the ``span_context`` argument to ``span``, i.e. change
+      ``LLMObs.submit_evaluation(span_context={"span_id": ..., "trace_id": ...}, ...)`` to ``LLMObs.submit_evaluation(span={"span_id": ..., "trace_id": ...}, ...)``
```
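Besides `span`, the retained signature also accepts `span_with_tag_value` for identifying the span by a tag rather than by explicit IDs; a sketch using hypothetical tag values:

```python
from ddtrace.llmobs import LLMObs

# "session_id" / "abc-123" are hypothetical examples of a tag key and value;
# per the docstring, the dict has the shape {'tag_key': str, 'tag_value': str}.
LLMObs.submit_evaluation(
    span_with_tag_value={"tag_key": "session_id", "tag_value": "abc-123"},
    label="helpfulness",
    metric_type="categorical",
    value="good",
)
```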

tests/llmobs/_utils.py

Lines changed: 1 addition & 1 deletion
```diff
@@ -608,7 +608,7 @@ def __init__(self, llmobs_service, label="dummy"):
 
     def run_and_submit_evaluation(self, span):
         self.llmobs_service.submit_evaluation(
-            span_context=span,
+            span=span,
             label=self.LABEL,
             value=1.0,
             metric_type="score",
```

tests/llmobs/test_llmobs_ragas_evaluators.py

Lines changed: 9 additions & 9 deletions
```diff
@@ -109,7 +109,7 @@ def test_ragas_faithfulness_submits_evaluation(ragas, llmobs, mock_llmobs_submit
     rf_evaluator.llmobs_service.submit_evaluation.assert_has_calls(
         [
             mock.call(
-                span_context={
+                span={
                     "span_id": llm_span.get("span_id"),
                     "trace_id": llm_span.get("trace_id"),
                 },
@@ -137,7 +137,7 @@ def test_ragas_faithfulness_submits_evaluation_on_span_with_question_in_messages
     rf_evaluator.llmobs_service.submit_evaluation.assert_has_calls(
         [
             mock.call(
-                span_context={
+                span={
                     "span_id": llm_span.get("span_id"),
                     "trace_id": llm_span.get("trace_id"),
                 },
@@ -176,7 +176,7 @@ def test_ragas_faithfulness_submits_evaluation_on_span_with_custom_keys(ragas, l
     rf_evaluator.llmobs_service.submit_evaluation.assert_has_calls(
         [
             mock.call(
-                span_context={
+                span={
                     "span_id": llm_span.get("span_id"),
                     "trace_id": llm_span.get("trace_id"),
                 },
@@ -293,7 +293,7 @@ def test_ragas_context_precision_submits_evaluation(ragas, llmobs, mock_llmobs_s
     rcp_evaluator.llmobs_service.submit_evaluation.assert_has_calls(
         [
             mock.call(
-                span_context={
+                span={
                     "span_id": llm_span.get("span_id"),
                     "trace_id": llm_span.get("trace_id"),
                 },
@@ -320,7 +320,7 @@ def test_ragas_context_precision_submits_evaluation_on_span_with_question_in_mes
     rcp_evaluator.llmobs_service.submit_evaluation.assert_has_calls(
         [
             mock.call(
-                span_context={
+                span={
                     "span_id": llm_span.get("span_id"),
                     "trace_id": llm_span.get("trace_id"),
                 },
@@ -359,7 +359,7 @@ def test_ragas_context_precision_submits_evaluation_on_span_with_custom_keys(
     rcp_evaluator.llmobs_service.submit_evaluation.assert_has_calls(
         [
             mock.call(
-                span_context={
+                span={
                     "span_id": llm_span.get("span_id"),
                     "trace_id": llm_span.get("trace_id"),
                 },
@@ -474,7 +474,7 @@ def test_ragas_answer_relevancy_submits_evaluation(
     rar_evaluator.llmobs_service.submit_evaluation.assert_has_calls(
         [
             mock.call(
-                span_context={
+                span={
                     "span_id": llm_span.get("span_id"),
                     "trace_id": llm_span.get("trace_id"),
                 },
@@ -500,7 +500,7 @@ def test_ragas_answer_relevancy_submits_evaluation_on_span_with_question_in_mess
     rar_evaluator.llmobs_service.submit_evaluation.assert_has_calls(
         [
             mock.call(
-                span_context={
+                span={
                     "span_id": llm_span.get("span_id"),
                     "trace_id": llm_span.get("trace_id"),
                 },
@@ -538,7 +538,7 @@ def test_ragas_answer_relevancy_submits_evaluation_on_span_with_custom_keys(
     rar_evaluator.llmobs_service.submit_evaluation.assert_has_calls(
        [
             mock.call(
-                span_context={
+                span={
                     "span_id": llm_span.get("span_id"),
                     "trace_id": llm_span.get("trace_id"),
                 },
```
