Skip to content

Commit d36d913

Browse files
feat(llmobs): allow project override when creating experiment (#14923)
## Description this allows an experiment to be saved to a different project from the one defined in `LLMObs.enable` ## Testing with the following script that overrides the project name in `LLMObs.experiment`: ``` import os import math from dotenv import load_dotenv # Load environment variables from the .env file. load_dotenv(override=True) from typing import Dict, Any from ddtrace.llmobs import LLMObs from openai import OpenAI LLMObs.enable(api_key=os.getenv("DD_API_KEY"), app_key=os.getenv("DD_APPLICATION_KEY"), project_name="Onboarding", ml_app="Onboarding-ML-App") import ddtrace print(ddtrace.get_version()) oai_client = OpenAI(api_key=os.getenv("OPENAI_API_KEY")) dataset = LLMObs.pull_dataset("capitals-of-the-world-315rc1") print(dataset.as_dataframe()) print(dataset.url) # the task function will accept a row of input and will manipulate against it using the config provided def generate_capital(input_data: Dict[str, Any], config: Dict[str, Any]) -> str: output = oai_client.chat.completions.create( model=config["model"], messages=[{"role": "user", "content": input_data["question"]}], temperature=config["temperature"] ) return output.choices[0].message.content # Evaluators receive `input_data`, `output_data` (the output to test against), and `expected_output` (ground truth). All of them come automatically from the dataset and the task. # You can modify the logic to support different evaluation methods like fuzzy matching, semantic similarity, llm-as-a-judge, etc. 
def exact_match(input_data, output_data, expected_output): return expected_output == output_data def contains_answer(input_data, output_data, expected_output): return expected_output in output_data experiment = LLMObs.experiment( name="generate-capital-with-config", dataset=dataset, task=generate_capital, evaluators=[exact_match, contains_answer], project_name="new-gh-project", config={"model": "gpt-4.1-nano", "temperature": 0}, description="a cool basic experiment with config", ) results = experiment.run(jobs=5) print(experiment.url) ``` we see the experiment in the project "new-gh-project": <img width="1740" height="795" alt="image" src="https://github.com/user-attachments/assets/ef302df4-9ded-4049-aebb-a1074cb717f6" /> and it appears normal https://dddev.datadoghq.com/llm/experiments/02a69ecb-7de5-4569-9b5a-8524e096d0d5?spanId=1743163979865576740 and nothing else in "Onboarding" (from the enable call) <img width="1739" height="942" alt="image" src="https://github.com/user-attachments/assets/1a71fff9-b022-4ca9-9545-72a536342e5e" /> ## Risks None --------- Co-authored-by: Brett Langdon <brett.langdon@datadoghq.com>
1 parent 5f952a1 commit d36d913

6 files changed

+201
-1
lines changed

ddtrace/llmobs/_llmobs.py

Lines changed: 7 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -771,6 +771,7 @@ def experiment(
771771
dataset: Dataset,
772772
evaluators: List[Callable[[DatasetRecordInputType, JSONType, JSONType], JSONType]],
773773
description: str = "",
774+
project_name: Optional[str] = None,
774775
tags: Optional[Dict[str, str]] = None,
775776
config: Optional[ExperimentConfigType] = None,
776777
summary_evaluators: Optional[
@@ -788,9 +789,14 @@ def experiment(
788789
:param dataset: The dataset to run the experiment on, created with LLMObs.pull/create_dataset().
789790
:param evaluators: A list of evaluator functions to evaluate the task output.
790791
Must accept parameters ``input_data``, ``output_data``, and ``expected_output``.
792+
:param project_name: The name of the project to save the experiment to.
791793
:param description: A description of the experiment.
792794
:param tags: A dictionary of string key-value tag pairs to associate with the experiment.
793795
:param config: A configuration dictionary describing the experiment.
796+
:param summary_evaluators: A list of summary evaluator functions to evaluate the task results and evaluations
797+
to produce a single value.
798+
Must accept parameters ``inputs``, ``outputs``, ``expected_outputs``,
799+
``evaluators_results``.
794800
"""
795801
if not callable(task):
796802
raise TypeError("task must be a callable function.")
@@ -825,7 +831,7 @@ def experiment(
825831
task,
826832
dataset,
827833
evaluators,
828-
project_name=cls._project_name,
834+
project_name=project_name or cls._project_name,
829835
tags=tags,
830836
description=description,
831837
config=config,
Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,5 @@
1+
---
2+
upgrade:
3+
- |
4+
LLM Observability: Experiments can now be stored under a different project than the one
5+
defined in ``LLMObs.enable``.
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,49 @@
1+
interactions:
2+
- request:
3+
body: '{"data": {"type": "experiments", "attributes": {"scope": "experiments",
4+
"metrics": [{"metric_source": "custom", "span_id": "123", "trace_id": "456",
5+
"timestamp_ms": 1234, "metric_type": "score", "label": "dummy_evaluator", "score_value":
6+
0, "error": null, "tags": ["ddtrace.version:1.2.3", "experiment_id:0629f6b6-8fa3-4649-948f-6446fee79415"],
7+
"experiment_id": "0629f6b6-8fa3-4649-948f-6446fee79415"}], "tags": ["ddtrace.version:1.2.3",
8+
"experiment_id:0629f6b6-8fa3-4649-948f-6446fee79415"]}}}'
9+
headers:
10+
Accept:
11+
- '*/*'
12+
? !!python/object/apply:multidict._multidict.istr
13+
- Accept-Encoding
14+
: - identity
15+
Connection:
16+
- keep-alive
17+
Content-Length:
18+
- '494'
19+
? !!python/object/apply:multidict._multidict.istr
20+
- Content-Type
21+
: - application/json
22+
User-Agent:
23+
- python-requests/2.32.3
24+
method: POST
25+
uri: https://api.datadoghq.com/api/unstable/llm-obs/v1/experiments/0629f6b6-8fa3-4649-948f-6446fee79415/events
26+
response:
27+
body:
28+
string: ''
29+
headers:
30+
content-length:
31+
- '0'
32+
content-security-policy:
33+
- frame-ancestors 'self'; report-uri https://logs.browser-intake-datadoghq.com/api/v2/logs?dd-api-key=pube4f163c23bbf91c16b8f57f56af9fc58&dd-evp-origin=content-security-policy&ddsource=csp-report&ddtags=site%3Adatadoghq.com
34+
content-type:
35+
- application/vnd.api+json
36+
date:
37+
- Thu, 16 Oct 2025 21:12:54 GMT
38+
strict-transport-security:
39+
- max-age=31536000; includeSubDomains; preload
40+
vary:
41+
- Accept-Encoding
42+
x-content-type-options:
43+
- nosniff
44+
x-frame-options:
45+
- SAMEORIGIN
46+
status:
47+
code: 202
48+
message: Accepted
49+
version: 1
Lines changed: 47 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,47 @@
1+
interactions:
2+
- request:
3+
body: '{"data": {"type": "experiments", "attributes": {"name": "test_experiment",
4+
"description": "", "dataset_id": "0969efc9-f104-45cc-b955-25b329e91293", "project_id":
5+
"c4b49fb5-7b16-46e1-86f0-de5800e8a56c", "dataset_version": 1, "config": {},
6+
"metadata": {"tags": ["ddtrace.version:1.2.3"]}, "ensure_unique": true}}}'
7+
headers:
8+
Accept:
9+
- '*/*'
10+
? !!python/object/apply:multidict._multidict.istr
11+
- Accept-Encoding
12+
: - identity
13+
Connection:
14+
- keep-alive
15+
Content-Length:
16+
- '311'
17+
? !!python/object/apply:multidict._multidict.istr
18+
- Content-Type
19+
: - application/json
20+
User-Agent:
21+
- python-requests/2.32.3
22+
method: POST
23+
uri: https://api.datadoghq.com/api/unstable/llm-obs/v1/experiments
24+
response:
25+
body:
26+
string: '{"data":{"id":"0629f6b6-8fa3-4649-948f-6446fee79415","type":"experiments","attributes":{"author":{"id":"de473b30-eb9f-11e9-a77a-c7405862b8bd"},"config":{},"created_at":"2025-10-16T21:12:54.402992085Z","dataset_id":"0969efc9-f104-45cc-b955-25b329e91293","dataset_version":1,"description":"","experiment":"test_experiment","metadata":{"tags":["ddtrace.version:1.2.3"]},"name":"test_experiment-1760649174402","project_id":"c4b49fb5-7b16-46e1-86f0-de5800e8a56c","updated_at":"2025-10-16T21:12:54.402992167Z"}}}'
27+
headers:
28+
content-length:
29+
- '506'
30+
content-security-policy:
31+
- frame-ancestors 'self'; report-uri https://logs.browser-intake-datadoghq.com/api/v2/logs?dd-api-key=pube4f163c23bbf91c16b8f57f56af9fc58&dd-evp-origin=content-security-policy&ddsource=csp-report&ddtags=site%3Adatadoghq.com
32+
content-type:
33+
- application/vnd.api+json
34+
date:
35+
- Thu, 16 Oct 2025 21:12:54 GMT
36+
strict-transport-security:
37+
- max-age=31536000; includeSubDomains; preload
38+
vary:
39+
- Accept-Encoding
40+
x-content-type-options:
41+
- nosniff
42+
x-frame-options:
43+
- SAMEORIGIN
44+
status:
45+
code: 200
46+
message: OK
47+
version: 1
Lines changed: 45 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,45 @@
1+
interactions:
2+
- request:
3+
body: '{"data": {"type": "projects", "attributes": {"name": "new-different-project",
4+
"description": ""}}}'
5+
headers:
6+
Accept:
7+
- '*/*'
8+
? !!python/object/apply:multidict._multidict.istr
9+
- Accept-Encoding
10+
: - identity
11+
Connection:
12+
- keep-alive
13+
Content-Length:
14+
- '98'
15+
? !!python/object/apply:multidict._multidict.istr
16+
- Content-Type
17+
: - application/json
18+
User-Agent:
19+
- python-requests/2.32.3
20+
method: POST
21+
uri: https://api.datadoghq.com/api/unstable/llm-obs/v1/projects
22+
response:
23+
body:
24+
string: '{"data":{"id":"c4b49fb5-7b16-46e1-86f0-de5800e8a56c","type":"projects","attributes":{"author":{"id":"de473b30-eb9f-11e9-a77a-c7405862b8bd"},"created_at":"2025-10-16T21:04:07.493104Z","name":"new-different-project","updated_at":"2025-10-16T21:04:07.493104Z"}}}'
25+
headers:
26+
content-length:
27+
- '259'
28+
content-security-policy:
29+
- frame-ancestors 'self'; report-uri https://logs.browser-intake-datadoghq.com/api/v2/logs?dd-api-key=pube4f163c23bbf91c16b8f57f56af9fc58&dd-evp-origin=content-security-policy&ddsource=csp-report&ddtags=site%3Adatadoghq.com
30+
content-type:
31+
- application/vnd.api+json
32+
date:
33+
- Thu, 16 Oct 2025 21:12:54 GMT
34+
strict-transport-security:
35+
- max-age=31536000; includeSubDomains; preload
36+
vary:
37+
- Accept-Encoding
38+
x-content-type-options:
39+
- nosniff
40+
x-frame-options:
41+
- SAMEORIGIN
42+
status:
43+
code: 200
44+
message: OK
45+
version: 1

tests/llmobs/test_experiments.py

Lines changed: 48 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1413,6 +1413,54 @@ def test_experiment_run(llmobs, test_dataset_one_record):
14131413
assert exp_result["expected_output"] == {"answer": "Paris"}
14141414
assert exp.url == f"https://app.datadoghq.com/llm/experiments/{exp._id}"
14151415

1416+
project = llmobs._instance._dne_client.project_create_or_get(name="test-project")
1417+
assert project.get("_id") == "f0a6723e-a7e8-4efd-a94a-b892b7b6fbf9"
1418+
assert project.get("name") == "test-project"
1419+
assert exp._project_id == project.get("_id")
1420+
assert exp._project_name == project.get("name")
1421+
1422+
1423+
def test_experiment_run_w_different_project(llmobs, test_dataset_one_record):
1424+
with mock.patch("ddtrace.llmobs._experiment.Experiment._process_record") as mock_process_record:
1425+
# This is to ensure that the eval event post request contains the same span/trace IDs and timestamp.
1426+
mock_process_record.return_value = {
1427+
"idx": 0,
1428+
"span_id": "123",
1429+
"trace_id": "456",
1430+
"timestamp": 1234567890,
1431+
"output": {"prompt": "What is the capital of France?"},
1432+
"metadata": {
1433+
"dataset_record_index": 0,
1434+
"experiment_name": "test_experiment",
1435+
"dataset_name": "test-dataset-123",
1436+
},
1437+
"error": {"message": None, "type": None, "stack": None},
1438+
}
1439+
exp = llmobs.experiment(
1440+
"test_experiment",
1441+
dummy_task,
1442+
test_dataset_one_record,
1443+
[dummy_evaluator],
1444+
project_name="new-different-project",
1445+
)
1446+
exp._tags = {"ddtrace.version": "1.2.3"} # FIXME: this is a hack to set the tags for the experiment
1447+
exp_results = exp.run()
1448+
1449+
assert len(exp_results["summary_evaluations"]) == 0
1450+
assert len(exp_results["rows"]) == 1
1451+
exp_result = exp_results["rows"][0]
1452+
assert exp_result["idx"] == 0
1453+
assert exp_result["input"] == {"prompt": "What is the capital of France?"}
1454+
assert exp_result["output"] == {"prompt": "What is the capital of France?"}
1455+
assert exp_result["expected_output"] == {"answer": "Paris"}
1456+
assert exp.url == f"https://app.datadoghq.com/llm/experiments/{exp._id}"
1457+
1458+
project = llmobs._instance._dne_client.project_create_or_get(name="new-different-project")
1459+
assert project.get("_id") == "c4b49fb5-7b16-46e1-86f0-de5800e8a56c"
1460+
assert project.get("name") == "new-different-project"
1461+
assert exp._project_id == project.get("_id")
1462+
assert exp._project_name == project.get("name")
1463+
14161464

14171465
def test_experiment_run_w_summary(llmobs, test_dataset_one_record):
14181466
with mock.patch("ddtrace.llmobs._experiment.Experiment._process_record") as mock_process_record:

0 commit comments

Comments
 (0)