From 8ed6ab3602c780d294f1910831e25e7341babcb9 Mon Sep 17 00:00:00 2001 From: gary-huang Date: Fri, 8 Aug 2025 17:28:01 -0400 Subject: [PATCH] loosen restrictions on experiment spans IO --- ddtrace/llmobs/_constants.py | 2 ++ ddtrace/llmobs/_experiment.py | 3 ++- ddtrace/llmobs/_llmobs.py | 31 ++++++++++++++++++++++++++++--- tests/llmobs/test_experiments.py | 6 +++--- 4 files changed, 35 insertions(+), 7 deletions(-) diff --git a/ddtrace/llmobs/_constants.py b/ddtrace/llmobs/_constants.py index b3ce7b7eb67..73ddddef8be 100644 --- a/ddtrace/llmobs/_constants.py +++ b/ddtrace/llmobs/_constants.py @@ -103,4 +103,6 @@ EXPERIMENT_ID_KEY = "_ml_obs.experiment_id" EXPERIMENT_EXPECTED_OUTPUT = "_ml_obs.meta.input.expected_output" +EXPERIMENTS_INPUT = "_ml_obs.meta.input" +EXPERIMENTS_OUTPUT = "_ml_obs.meta.output" DEFAULT_PROJECT_NAME = "default-project" diff --git a/ddtrace/llmobs/_experiment.py b/ddtrace/llmobs/_experiment.py index 9de96a98769..7f15a733c4e 100644 --- a/ddtrace/llmobs/_experiment.py +++ b/ddtrace/llmobs/_experiment.py @@ -26,6 +26,7 @@ from ddtrace.llmobs._constants import DD_SITES_NEEDING_APP_SUBDOMAIN from ddtrace.llmobs._constants import EXPERIMENT_EXPECTED_OUTPUT from ddtrace.llmobs._utils import convert_tags_dict_to_list +from ddtrace.llmobs._utils import safe_json if TYPE_CHECKING: @@ -349,7 +350,7 @@ def _process_record(self, idx_record: Tuple[int, DatasetRecord]) -> Optional[Tas except Exception: span.set_exc_info(*sys.exc_info()) self._llmobs_instance.annotate(span, input_data=input_data, output_data=output_data, tags=tags) - span._set_ctx_item(EXPERIMENT_EXPECTED_OUTPUT, record["expected_output"]) + span._set_ctx_item(EXPERIMENT_EXPECTED_OUTPUT, safe_json(record["expected_output"])) return { "idx": idx, "span_id": span_id, diff --git a/ddtrace/llmobs/_llmobs.py b/ddtrace/llmobs/_llmobs.py index debca91e184..e65a846e821 100644 --- a/ddtrace/llmobs/_llmobs.py +++ b/ddtrace/llmobs/_llmobs.py @@ -54,6 +54,8 @@ from ddtrace.llmobs._constants import 
EXPERIMENT_CSV_FIELD_MAX_SIZE from ddtrace.llmobs._constants import EXPERIMENT_EXPECTED_OUTPUT from ddtrace.llmobs._constants import EXPERIMENT_ID_KEY +from ddtrace.llmobs._constants import EXPERIMENTS_INPUT +from ddtrace.llmobs._constants import EXPERIMENTS_OUTPUT from ddtrace.llmobs._constants import INPUT_DOCUMENTS from ddtrace.llmobs._constants import INPUT_MESSAGES from ddtrace.llmobs._constants import INPUT_PROMPT @@ -278,9 +280,18 @@ def _llmobs_span_event(self, span: Span) -> Optional[LLMObsSpanEvent]: if span.context.get_baggage_item(EXPERIMENT_ID_KEY): _dd_attrs["scope"] = "experiments" - expected_output = span._get_ctx_item(EXPERIMENT_EXPECTED_OUTPUT) - if span_kind == "experiment" and expected_output: - meta["expected_output"] = expected_output + if span_kind == "experiment": + expected_output = span._get_ctx_item(EXPERIMENT_EXPECTED_OUTPUT) + if expected_output: + meta["expected_output"] = expected_output + + input_data = span._get_ctx_item(EXPERIMENTS_INPUT) + if input_data: + meta["input"] = input_data + + output_data = span._get_ctx_item(EXPERIMENTS_OUTPUT) + if output_data: + meta["output"] = output_data input_messages = span._get_ctx_item(INPUT_MESSAGES) if span_kind == "llm" and input_messages is not None: @@ -1366,6 +1377,8 @@ def annotate( error = cls._tag_embedding_io(span, input_documents=input_data, output_text=output_data) elif span_kind == "retrieval": error = cls._tag_retrieval_io(span, input_text=input_data, output_documents=output_data) + elif span_kind == "experiment": + cls._tag_freeform_io(span, input_value=input_data, output_value=output_data) else: cls._tag_text_io(span, input_value=input_data, output_value=output_data) finally: @@ -1447,6 +1460,18 @@ def _tag_text_io(cls, span, input_value=None, output_value=None): if output_value is not None: span._set_ctx_item(OUTPUT_VALUE, safe_json(output_value)) + @classmethod + def _tag_freeform_io(cls, span, input_value=None, output_value=None): + """Tags input/output values for experiment
spans. + Will be mapped to the span's `meta.{input,output}` fields. + This is meant to be non-restrictive on the user's data: experiments allow + arbitrary structured or unstructured IO values in their spans. + """ + if input_value is not None: + span._set_ctx_item(EXPERIMENTS_INPUT, safe_json(input_value)) + if output_value is not None: + span._set_ctx_item(EXPERIMENTS_OUTPUT, safe_json(output_value)) + @staticmethod def _set_dict_attribute(span: Span, key, value: Dict[str, Any]) -> None: """Sets a given LLM Obs span attribute with a dictionary key/values. diff --git a/tests/llmobs/test_experiments.py b/tests/llmobs/test_experiments.py index 7aac195fbe3..3bda9bc3406 100644 --- a/tests/llmobs/test_experiments.py +++ b/tests/llmobs/test_experiments.py @@ -1078,9 +1078,9 @@ def test_experiment_span_written_to_experiment_scope(llmobs, llmobs_events, test for key in ("span_id", "trace_id", "parent_id", "start_ns", "duration", "metrics"): assert event[key] == mock.ANY assert event["status"] == "ok" - assert event["meta"]["input"] == {"value": '{"prompt": "What is the capital of France?"}'} - assert event["meta"]["output"] == {"value": '{"prompt": "What is the capital of France?"}'} - assert event["meta"]["expected_output"] == {"answer": "Paris"} + assert event["meta"]["input"] == '{"prompt": "What is the capital of France?"}' + assert event["meta"]["output"] == '{"prompt": "What is the capital of France?"}' + assert event["meta"]["expected_output"] == '{"answer": "Paris"}' assert "dataset_id:{}".format(test_dataset_one_record._id) in event["tags"] assert "dataset_record_id:{}".format(test_dataset_one_record._records[0]["record_id"]) in event["tags"] assert "experiment_id:1234567890" in event["tags"]