Skip to content

Commit d36d913

Browse files
feat(llmobs): allow project override when creating experiment (#14923)
## Description this allows an experiment to be saved to a different project from the one defined in `LLMObs.enable` ## Testing with the following script that overrides the project name in `LLMObs.experiment`: ``` import os import math from dotenv import load_dotenv # Load environment variables from the .env file. load_dotenv(override=True) from typing import Dict, Any from ddtrace.llmobs import LLMObs from openai import OpenAI LLMObs.enable(api_key=os.getenv("DD_API_KEY"), app_key=os.getenv("DD_APPLICATION_KEY"), project_name="Onboarding", ml_app="Onboarding-ML-App") import ddtrace print(ddtrace.get_version()) oai_client = OpenAI(api_key=os.getenv("OPENAI_API_KEY")) dataset = LLMObs.pull_dataset("capitals-of-the-world-315rc1") print(dataset.as_dataframe()) print(dataset.url) # the task function will accept a row of input and will manipulate against it using the config provided def generate_capital(input_data: Dict[str, Any], config: Dict[str, Any]) -> str: output = oai_client.chat.completions.create( model=config["model"], messages=[{"role": "user", "content": input_data["question"]}], temperature=config["temperature"] ) return output.choices[0].message.content # Evaluators receive `input_data`, `output_data` (the output to test against), and `expected_output` (ground truth). All of them come automatically from the dataset and the task. # You can modify the logic to support different evaluation methods like fuzzy matching, semantic similarity, llm-as-a-judge, etc. 
def exact_match(input_data, output_data, expected_output): return expected_output == output_data def contains_answer(input_data, output_data, expected_output): return expected_output in output_data experiment = LLMObs.experiment( name="generate-capital-with-config", dataset=dataset, task=generate_capital, evaluators=[exact_match, contains_answer], project_name="new-gh-project", config={"model": "gpt-4.1-nano", "temperature": 0}, description="a cool basic experiment with config", ) results = experiment.run(jobs=5) print(experiment.url) ``` we see the experiment in the project "new-gh-project": <img width="1740" height="795" alt="image" src="https://github.com/user-attachments/assets/ef302df4-9ded-4049-aebb-a1074cb717f6" /> and it appears normal https://dddev.datadoghq.com/llm/experiments/02a69ecb-7de5-4569-9b5a-8524e096d0d5?spanId=1743163979865576740 and nothing else in "Onboarding" (from the enable call) <img width="1739" height="942" alt="image" src="https://github.com/user-attachments/assets/1a71fff9-b022-4ca9-9545-72a536342e5e" /> ## Risks None --------- Co-authored-by: Brett Langdon <brett.langdon@datadoghq.com>
1 parent 5f952a1 commit d36d913

6 files changed

+201
-1
lines changed

ddtrace/llmobs/_llmobs.py

Lines changed: 7 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -771,6 +771,7 @@ def experiment(
771771
dataset: Dataset,
772772
evaluators: List[Callable[[DatasetRecordInputType, JSONType, JSONType], JSONType]],
773773
description: str = "",
774+
project_name: Optional[str] = None,
774775
tags: Optional[Dict[str, str]] = None,
775776
config: Optional[ExperimentConfigType] = None,
776777
summary_evaluators: Optional[
@@ -788,9 +789,14 @@ def experiment(
788789
:param dataset: The dataset to run the experiment on, created with LLMObs.pull/create_dataset().
789790
:param evaluators: A list of evaluator functions to evaluate the task output.
790791
Must accept parameters ``input_data``, ``output_data``, and ``expected_output``.
792+
:param project_name: The name of the project to save the experiment to.
791793
:param description: A description of the experiment.
792794
:param tags: A dictionary of string key-value tag pairs to associate with the experiment.
793795
:param config: A configuration dictionary describing the experiment.
796+
:param summary_evaluators: A list of summary evaluator functions to evaluate the task results and evaluations
797+
to produce a single value.
798+
Must accept parameters ``inputs``, ``outputs``, ``expected_outputs``,
799+
``evaluators_results``.
794800
"""
795801
if not callable(task):
796802
raise TypeError("task must be a callable function.")
@@ -825,7 +831,7 @@ def experiment(
825831
task,
826832
dataset,
827833
evaluators,
828-
project_name=cls._project_name,
834+
project_name=project_name or cls._project_name,
829835
tags=tags,
830836
description=description,
831837
config=config,
Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,5 @@
1+
---
2+
upgrade:
3+
- |
4+
LLM Observability: Experiments can now be stored under a different project than the one
5+
defined in ``LLMObs.enable``.
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,49 @@
1+
interactions:
2+
- request:
3+
body: '{"data": {"type": "experiments", "attributes": {"scope": "experiments",
4+
"metrics": [{"metric_source": "custom", "span_id": "123", "trace_id": "456",
5+
"timestamp_ms": 1234, "metric_type": "score", "label": "dummy_evaluator", "score_value":
6+
0, "error": null, "tags": ["ddtrace.version:1.2.3", "experiment_id:0629f6b6-8fa3-4649-948f-6446fee79415"],
7+
"experiment_id": "0629f6b6-8fa3-4649-948f-6446fee79415"}], "tags": ["ddtrace.version:1.2.3",
8+
"experiment_id:0629f6b6-8fa3-4649-948f-6446fee79415"]}}}'
9+
headers:
10+
Accept:
11+
- '*/*'
12+
? !!python/object/apply:multidict._multidict.istr
13+
- Accept-Encoding
14+
: - identity
15+
Connection:
16+
- keep-alive
17+
Content-Length:
18+
- '494'
19+
? !!python/object/apply:multidict._multidict.istr
20+
- Content-Type
21+
: - application/json
22+
User-Agent:
23+
- python-requests/2.32.3
24+
method: POST
25+
uri: https://api.datadoghq.com/api/unstable/llm-obs/v1/experiments/0629f6b6-8fa3-4649-948f-6446fee79415/events
26+
response:
27+
body:
28+
string: ''
29+
headers:
30+
content-length:
31+
- '0'
32+
content-security-policy:
33+
- frame-ancestors 'self'; report-uri https://logs.browser-intake-datadoghq.com/api/v2/logs?dd-api-key=pube4f163c23bbf91c16b8f57f56af9fc58&dd-evp-origin=content-security-policy&ddsource=csp-report&ddtags=site%3Adatadoghq.com
34+
content-type:
35+
- application/vnd.api+json
36+
date:
37+
- Thu, 16 Oct 2025 21:12:54 GMT
38+
strict-transport-security:
39+
- max-age=31536000; includeSubDomains; preload
40+
vary:
41+
- Accept-Encoding
42+
x-content-type-options:
43+
- nosniff
44+
x-frame-options:
45+
- SAMEORIGIN
46+
status:
47+
code: 202
48+
message: Accepted
49+
version: 1
Lines changed: 47 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,47 @@
1+
interactions:
2+
- request:
3+
body: '{"data": {"type": "experiments", "attributes": {"name": "test_experiment",
4+
"description": "", "dataset_id": "0969efc9-f104-45cc-b955-25b329e91293", "project_id":
5+
"c4b49fb5-7b16-46e1-86f0-de5800e8a56c", "dataset_version": 1, "config": {},
6+
"metadata": {"tags": ["ddtrace.version:1.2.3"]}, "ensure_unique": true}}}'
7+
headers:
8+
Accept:
9+
- '*/*'
10+
? !!python/object/apply:multidict._multidict.istr
11+
- Accept-Encoding
12+
: - identity
13+
Connection:
14+
- keep-alive
15+
Content-Length:
16+
- '311'
17+
? !!python/object/apply:multidict._multidict.istr
18+
- Content-Type
19+
: - application/json
20+
User-Agent:
21+
- python-requests/2.32.3
22+
method: POST
23+
uri: https://api.datadoghq.com/api/unstable/llm-obs/v1/experiments
24+
response:
25+
body:
26+
string: '{"data":{"id":"0629f6b6-8fa3-4649-948f-6446fee79415","type":"experiments","attributes":{"author":{"id":"de473b30-eb9f-11e9-a77a-c7405862b8bd"},"config":{},"created_at":"2025-10-16T21:12:54.402992085Z","dataset_id":"0969efc9-f104-45cc-b955-25b329e91293","dataset_version":1,"description":"","experiment":"test_experiment","metadata":{"tags":["ddtrace.version:1.2.3"]},"name":"test_experiment-1760649174402","project_id":"c4b49fb5-7b16-46e1-86f0-de5800e8a56c","updated_at":"2025-10-16T21:12:54.402992167Z"}}}'
27+
headers:
28+
content-length:
29+
- '506'
30+
content-security-policy:
31+
- frame-ancestors 'self'; report-uri https://logs.browser-intake-datadoghq.com/api/v2/logs?dd-api-key=pube4f163c23bbf91c16b8f57f56af9fc58&dd-evp-origin=content-security-policy&ddsource=csp-report&ddtags=site%3Adatadoghq.com
32+
content-type:
33+
- application/vnd.api+json
34+
date:
35+
- Thu, 16 Oct 2025 21:12:54 GMT
36+
strict-transport-security:
37+
- max-age=31536000; includeSubDomains; preload
38+
vary:
39+
- Accept-Encoding
40+
x-content-type-options:
41+
- nosniff
42+
x-frame-options:
43+
- SAMEORIGIN
44+
status:
45+
code: 200
46+
message: OK
47+
version: 1
Lines changed: 45 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,45 @@
1+
interactions:
2+
- request:
3+
body: '{"data": {"type": "projects", "attributes": {"name": "new-different-project",
4+
"description": ""}}}'
5+
headers:
6+
Accept:
7+
- '*/*'
8+
? !!python/object/apply:multidict._multidict.istr
9+
- Accept-Encoding
10+
: - identity
11+
Connection:
12+
- keep-alive
13+
Content-Length:
14+
- '98'
15+
? !!python/object/apply:multidict._multidict.istr
16+
- Content-Type
17+
: - application/json
18+
User-Agent:
19+
- python-requests/2.32.3
20+
method: POST
21+
uri: https://api.datadoghq.com/api/unstable/llm-obs/v1/projects
22+
response:
23+
body:
24+
string: '{"data":{"id":"c4b49fb5-7b16-46e1-86f0-de5800e8a56c","type":"projects","attributes":{"author":{"id":"de473b30-eb9f-11e9-a77a-c7405862b8bd"},"created_at":"2025-10-16T21:04:07.493104Z","name":"new-different-project","updated_at":"2025-10-16T21:04:07.493104Z"}}}'
25+
headers:
26+
content-length:
27+
- '259'
28+
content-security-policy:
29+
- frame-ancestors 'self'; report-uri https://logs.browser-intake-datadoghq.com/api/v2/logs?dd-api-key=pube4f163c23bbf91c16b8f57f56af9fc58&dd-evp-origin=content-security-policy&ddsource=csp-report&ddtags=site%3Adatadoghq.com
30+
content-type:
31+
- application/vnd.api+json
32+
date:
33+
- Thu, 16 Oct 2025 21:12:54 GMT
34+
strict-transport-security:
35+
- max-age=31536000; includeSubDomains; preload
36+
vary:
37+
- Accept-Encoding
38+
x-content-type-options:
39+
- nosniff
40+
x-frame-options:
41+
- SAMEORIGIN
42+
status:
43+
code: 200
44+
message: OK
45+
version: 1

tests/llmobs/test_experiments.py

Lines changed: 48 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1413,6 +1413,54 @@ def test_experiment_run(llmobs, test_dataset_one_record):
14131413
assert exp_result["expected_output"] == {"answer": "Paris"}
14141414
assert exp.url == f"https://app.datadoghq.com/llm/experiments/{exp._id}"
14151415

1416+
project = llmobs._instance._dne_client.project_create_or_get(name="test-project")
1417+
assert project.get("_id") == "f0a6723e-a7e8-4efd-a94a-b892b7b6fbf9"
1418+
assert project.get("name") == "test-project"
1419+
assert exp._project_id == project.get("_id")
1420+
assert exp._project_name == project.get("name")
1421+
1422+
1423+
def test_experiment_run_w_different_project(llmobs, test_dataset_one_record):
1424+
with mock.patch("ddtrace.llmobs._experiment.Experiment._process_record") as mock_process_record:
1425+
# This is to ensure that the eval event post request contains the same span/trace IDs and timestamp.
1426+
mock_process_record.return_value = {
1427+
"idx": 0,
1428+
"span_id": "123",
1429+
"trace_id": "456",
1430+
"timestamp": 1234567890,
1431+
"output": {"prompt": "What is the capital of France?"},
1432+
"metadata": {
1433+
"dataset_record_index": 0,
1434+
"experiment_name": "test_experiment",
1435+
"dataset_name": "test-dataset-123",
1436+
},
1437+
"error": {"message": None, "type": None, "stack": None},
1438+
}
1439+
exp = llmobs.experiment(
1440+
"test_experiment",
1441+
dummy_task,
1442+
test_dataset_one_record,
1443+
[dummy_evaluator],
1444+
project_name="new-different-project",
1445+
)
1446+
exp._tags = {"ddtrace.version": "1.2.3"} # FIXME: this is a hack to set the tags for the experiment
1447+
exp_results = exp.run()
1448+
1449+
assert len(exp_results["summary_evaluations"]) == 0
1450+
assert len(exp_results["rows"]) == 1
1451+
exp_result = exp_results["rows"][0]
1452+
assert exp_result["idx"] == 0
1453+
assert exp_result["input"] == {"prompt": "What is the capital of France?"}
1454+
assert exp_result["output"] == {"prompt": "What is the capital of France?"}
1455+
assert exp_result["expected_output"] == {"answer": "Paris"}
1456+
assert exp.url == f"https://app.datadoghq.com/llm/experiments/{exp._id}"
1457+
1458+
project = llmobs._instance._dne_client.project_create_or_get(name="new-different-project")
1459+
assert project.get("_id") == "c4b49fb5-7b16-46e1-86f0-de5800e8a56c"
1460+
assert project.get("name") == "new-different-project"
1461+
assert exp._project_id == project.get("_id")
1462+
assert exp._project_name == project.get("name")
1463+
14161464

14171465
def test_experiment_run_w_summary(llmobs, test_dataset_one_record):
14181466
with mock.patch("ddtrace.llmobs._experiment.Experiment._process_record") as mock_process_record:

0 commit comments

Comments
 (0)