From ceac5dc2d62bda0c35adbf2db4fb8ff77cc23ebb Mon Sep 17 00:00:00 2001 From: Sam Brenner Date: Wed, 3 Sep 2025 10:57:53 -0400 Subject: [PATCH 1/7] initial fixes --- ddtrace/llmobs/_writer.py | 3 + ..._llm-obs_v2_eval-metric_post_1218a393.yaml | 47 ++++++++++++ ..._llm-obs_v2_eval-metric_post_3ef3a86e.yaml | 47 ++++++++++++ ..._llm-obs_v2_eval-metric_post_9ef24d1e.yaml | 43 +++++++++++ .../test_llmobs_eval_metric_agent_writer.py | 12 +-- ...est_llmobs_eval_metric_agentless_writer.py | 73 ++++++++++++------- tests/llmobs/test_llmobs_evaluator_runner.py | 1 + 7 files changed, 195 insertions(+), 31 deletions(-) create mode 100644 tests/llmobs/llmobs_cassettes/datadog/datadog_api_intake_llm-obs_v2_eval-metric_post_1218a393.yaml create mode 100644 tests/llmobs/llmobs_cassettes/datadog/datadog_api_intake_llm-obs_v2_eval-metric_post_3ef3a86e.yaml create mode 100644 tests/llmobs/llmobs_cassettes/datadog/datadog_api_intake_llm-obs_v2_eval-metric_post_9ef24d1e.yaml diff --git a/ddtrace/llmobs/_writer.py b/ddtrace/llmobs/_writer.py index 0f98e0c9623..3324dd86882 100644 --- a/ddtrace/llmobs/_writer.py +++ b/ddtrace/llmobs/_writer.py @@ -154,6 +154,9 @@ def __init__( f"{self.AGENTLESS_BASE_URL}.{self._site}" if is_agentless else agent_config.trace_agent_url ) self._endpoint: str = self.ENDPOINT if is_agentless else f"{EVP_PROXY_AGENT_BASE_PATH}{self.ENDPOINT}" + if self._override_url: + self._endpoint = self.ENDPOINT.lstrip("/") + self._headers: Dict[str, str] = {"Content-Type": "application/json"} if is_agentless: self._headers["DD-API-KEY"] = self._api_key diff --git a/tests/llmobs/llmobs_cassettes/datadog/datadog_api_intake_llm-obs_v2_eval-metric_post_1218a393.yaml b/tests/llmobs/llmobs_cassettes/datadog/datadog_api_intake_llm-obs_v2_eval-metric_post_1218a393.yaml new file mode 100644 index 00000000000..caf1dcc5ba1 --- /dev/null +++ b/tests/llmobs/llmobs_cassettes/datadog/datadog_api_intake_llm-obs_v2_eval-metric_post_1218a393.yaml @@ -0,0 +1,47 @@ +interactions: +- request: + body: '{"data": {"type": "evaluation_metric", "attributes": {"metrics": [{"join_on": + {"span": {"span_id": "12345678901", "trace_id": "98765432101"}}, "metric_type": + "categorical", "categorical_value": "very", "label": "toxicity", "ml_app": "dummy-ml-app", + "timestamp_ms": 1756910127022}]}}}' + headers: + Accept: + - '*/*' + ? !!python/object/apply:multidict._multidict.istr + - Accept-Encoding + : - identity + Connection: + - keep-alive + Content-Length: + - '283' + ? !!python/object/apply:multidict._multidict.istr + - Content-Type + : - application/json + User-Agent: + - python-requests/2.32.4 + method: POST + uri: https://api.datadoghq.com/api/intake/llm-obs/v2/eval-metric + response: + body: + string: '{"data":{"id":"1ef94721-392d-4612-ad63-5f3b289c1cd5","type":"evaluation_metric","attributes":{"metrics":[{"id":"-Xbd-WStY2","join_on":{"span":{"trace_id":"98765432101","span_id":"12345678901"}},"timestamp_ms":1756910127022,"ml_app":"dummy-ml-app","metric_type":"categorical","label":"toxicity","categorical_value":"very"}]}}}' + headers: + content-length: + - '325' + content-security-policy: + - frame-ancestors 'self'; report-uri https://logs.browser-intake-datadoghq.com/api/v2/logs?dd-api-key=pube4f163c23bbf91c16b8f57f56af9fc58&dd-evp-origin=content-security-policy&ddsource=csp-report&ddtags=site%3Adatadoghq.com + content-type: + - application/vnd.api+json + date: + - Wed, 03 Sep 2025 14:41:13 GMT + strict-transport-security: + - max-age=31536000; includeSubDomains; preload + vary: + - Accept-Encoding + x-content-type-options: + - nosniff + x-frame-options: + - SAMEORIGIN + status: + code: 202 + message: Accepted +version: 1 diff --git a/tests/llmobs/llmobs_cassettes/datadog/datadog_api_intake_llm-obs_v2_eval-metric_post_3ef3a86e.yaml b/tests/llmobs/llmobs_cassettes/datadog/datadog_api_intake_llm-obs_v2_eval-metric_post_3ef3a86e.yaml new file mode 100644 index 00000000000..fc1c3dddb11 --- /dev/null +++ b/tests/llmobs/llmobs_cassettes/datadog/datadog_api_intake_llm-obs_v2_eval-metric_post_3ef3a86e.yaml @@ -0,0 +1,47 @@ +interactions: +- request: + body: '{"data": {"type": "evaluation_metric", "attributes": {"metrics": [{"join_on": + {"span": {"span_id": "12345678902", "trace_id": "98765432102"}}, "metric_type": + "score", "label": "sentiment", "score_value": 0.9, "ml_app": "dummy-ml-app", + "timestamp_ms": 1756910127022}]}}}' + headers: + Accept: + - '*/*' + ? !!python/object/apply:multidict._multidict.istr + - Accept-Encoding + : - identity + Connection: + - keep-alive + Content-Length: + - '269' + ? !!python/object/apply:multidict._multidict.istr + - Content-Type + : - application/json + User-Agent: + - python-requests/2.32.4 + method: POST + uri: https://api.datadoghq.com/api/intake/llm-obs/v2/eval-metric + response: + body: + string: '{"data":{"id":"c7ca5837-c593-4973-aefc-fe9ccbca1e74","type":"evaluation_metric","attributes":{"metrics":[{"id":"BKrS9Vc9nU","join_on":{"span":{"trace_id":"98765432102","span_id":"12345678902"}},"timestamp_ms":1756910127022,"ml_app":"dummy-ml-app","metric_type":"score","label":"sentiment","score_value":0.9}]}}}' + headers: + content-length: + - '311' + content-security-policy: + - frame-ancestors 'self'; report-uri https://logs.browser-intake-datadoghq.com/api/v2/logs?dd-api-key=pube4f163c23bbf91c16b8f57f56af9fc58&dd-evp-origin=content-security-policy&ddsource=csp-report&ddtags=site%3Adatadoghq.com + content-type: + - application/vnd.api+json + date: + - Wed, 03 Sep 2025 14:45:25 GMT + strict-transport-security: + - max-age=31536000; includeSubDomains; preload + vary: + - Accept-Encoding + x-content-type-options: + - nosniff + x-frame-options: + - SAMEORIGIN + status: + code: 202 + message: Accepted +version: 1 diff --git a/tests/llmobs/llmobs_cassettes/datadog/datadog_api_intake_llm-obs_v2_eval-metric_post_9ef24d1e.yaml b/tests/llmobs/llmobs_cassettes/datadog/datadog_api_intake_llm-obs_v2_eval-metric_post_9ef24d1e.yaml new file mode 100644 index 00000000000..f6ee7c02c32 --- /dev/null +++ b/tests/llmobs/llmobs_cassettes/datadog/datadog_api_intake_llm-obs_v2_eval-metric_post_9ef24d1e.yaml @@ -0,0 +1,43 @@ +interactions: +- request: + body: '{"data": {"type": "evaluation_metric", "attributes": {"metrics": [{"join_on": + {"span": {"span_id": "12345678901", "trace_id": "98765432101"}}, "metric_type": + "categorical", "categorical_value": "wrong-api-key", "label": "api-key", "ml_app": + "dummy-ml-app", "timestamp_ms": 1756910127022}]}}}' + headers: + Accept: + - '*/*' + ? !!python/object/apply:multidict._multidict.istr + - Accept-Encoding + : - identity + Connection: + - keep-alive + Content-Length: + - '291' + ? !!python/object/apply:multidict._multidict.istr + - Content-Type + : - application/json + User-Agent: + - python-requests/2.32.4 + method: POST + uri: https://api.datadoghq.com/api/intake/llm-obs/v2/eval-metric + response: + body: + string: '{"status":"error","code":403,"errors":["Forbidden"],"statuspage":"http://status.datadoghq.com","twitter":"http://twitter.com/datadogops","email":"support@datadoghq.com"}' + headers: + connection: + - close + content-length: + - '169' + content-type: + - application/json + date: + - Wed, 03 Sep 2025 14:39:21 GMT + strict-transport-security: + - max-age=31536000; includeSubDomains; preload + x-content-type-options: + - nosniff + status: + code: 403 + message: Forbidden +version: 1 diff --git a/tests/llmobs/test_llmobs_eval_metric_agent_writer.py b/tests/llmobs/test_llmobs_eval_metric_agent_writer.py index 529d0945859..7a80bda0cf0 100644 --- a/tests/llmobs/test_llmobs_eval_metric_agent_writer.py +++ b/tests/llmobs/test_llmobs_eval_metric_agent_writer.py @@ -44,7 +44,7 @@ def test_buffer_limit(mock_writer_logs): @mock.patch("ddtrace.llmobs._writer.LLMObsEvalMetricWriter._send_payload") def test_send_categorical_metrics(mock_send_payload, mock_writer_logs): llmobs_eval_metric_writer = LLMObsEvalMetricWriter(1, 1, is_agentless=False) - llmobs_eval_metric_writer.enqueue(_categorical_metric_event()) + llmobs_eval_metric_writer.enqueue(_categorical_metric_event(label="toxicity", value="very")) llmobs_eval_metric_writer.periodic() mock_writer_logs.debug.assert_called_with("encoded %d LLMObs %s events to be sent", 1, "evaluation_metric") @@ -52,7 +52,7 @@ def test_send_categorical_metrics(mock_send_payload, mock_writer_logs): @mock.patch("ddtrace.llmobs._writer.LLMObsEvalMetricWriter._send_payload") def test_send_score_metric(mock_send_payload, mock_writer_logs): llmobs_eval_metric_writer = LLMObsEvalMetricWriter(1, 1, is_agentless=False) - llmobs_eval_metric_writer.enqueue(_score_metric_event()) + llmobs_eval_metric_writer.enqueue(_score_metric_event(label="sentiment", value=0.9)) llmobs_eval_metric_writer.periodic() mock_writer_logs.debug.assert_called_with("encoded %d LLMObs %s events to be sent", 1, "evaluation_metric") @@ -63,11 +63,11 @@ def test_send_timed_events(mock_send_payload, mock_writer_logs): llmobs_eval_metric_writer.start() mock_writer_logs.reset_mock() - llmobs_eval_metric_writer.enqueue(_score_metric_event()) + llmobs_eval_metric_writer.enqueue(_score_metric_event(label="sentiment", value=0.9)) time.sleep(0.1) mock_writer_logs.debug.assert_called_with("encoded %d LLMObs %s events to be sent", 1, "evaluation_metric") mock_writer_logs.reset_mock() - llmobs_eval_metric_writer.enqueue(_categorical_metric_event()) + llmobs_eval_metric_writer.enqueue(_categorical_metric_event(label="toxicity", value="very")) time.sleep(0.1) mock_writer_logs.debug.assert_called_with("encoded %d LLMObs %s events to be sent", 1, "evaluation_metric") llmobs_eval_metric_writer.stop() @@ -77,7 +77,7 @@ def test_send_timed_events(mock_send_payload, mock_writer_logs): def test_send_multiple_events(mock_send_payload, mock_writer_logs): llmobs_eval_metric_writer = LLMObsEvalMetricWriter(1, 1, is_agentless=False) mock_writer_logs.reset_mock() - llmobs_eval_metric_writer.enqueue(_score_metric_event()) - llmobs_eval_metric_writer.enqueue(_categorical_metric_event()) + llmobs_eval_metric_writer.enqueue(_score_metric_event(label="sentiment", value=0.9)) + llmobs_eval_metric_writer.enqueue(_categorical_metric_event(label="toxicity", value="very")) llmobs_eval_metric_writer.periodic() mock_writer_logs.debug.assert_called_with("encoded %d LLMObs %s events to be sent", 2, "evaluation_metric") diff --git a/tests/llmobs/test_llmobs_eval_metric_agentless_writer.py b/tests/llmobs/test_llmobs_eval_metric_agentless_writer.py index 56ed4556367..5a9b84394e0 100644 --- a/tests/llmobs/test_llmobs_eval_metric_agentless_writer.py +++ b/tests/llmobs/test_llmobs_eval_metric_agentless_writer.py @@ -10,12 +10,11 @@ DD_SITE = "datad0g.com" -AGENTLESS_URL = "{}.{}".format(AGENTLESS_EVAL_BASE_URL, DD_SITE) INTAKE_ENDPOINT = "https://api.datad0g.com/api/intake/llm-obs/v2/eval-metric" DD_API_KEY = os.getenv("DD_API_KEY", default="") -def _categorical_metric_event(): +def _categorical_metric_event(label: str, value: str): return { "join_on": { "span": { @@ -24,14 +23,14 @@ def _categorical_metric_event(): }, }, "metric_type": "categorical", - "categorical_value": "very", - "label": "toxicity", + "categorical_value": value, + "label": label, "ml_app": "dummy-ml-app", - "timestamp_ms": round(time.time() * 1000), + "timestamp_ms": 1756910127022, } -def _score_metric_event(): +def _score_metric_event(label: str, value: float): return { "join_on": { "span": { @@ -40,10 +39,10 @@ def _score_metric_event(): }, }, "metric_type": "score", - "label": "sentiment", - "score_value": 0.9, + "label": label, + "score_value": value, "ml_app": "dummy-ml-app", - "timestamp_ms": round(time.time() * 1000), + "timestamp_ms": 1756910127022, } @@ -64,15 +63,22 @@ def test_buffer_limit(mock_writer_logs): def test_send_metric_bad_api_key(mock_writer_logs): - llmobs_eval_metric_writer = LLMObsEvalMetricWriter(1, 1, is_agentless=True, _site=DD_SITE, _api_key="") + llmobs_eval_metric_writer = LLMObsEvalMetricWriter( + interval=1, + timeout=1, + is_agentless=True, + _override_url="http://localhost:9126/vcr/datadog/", + _api_key="", + ) + + llmobs_eval_metric_writer.enqueue(_categorical_metric_event(label="api-key", value="wrong-api-key")) - llmobs_eval_metric_writer.enqueue(_categorical_metric_event()) llmobs_eval_metric_writer.periodic() mock_writer_logs.error.assert_called_with( "failed to send %d LLMObs %s events to %s, got response code %d, status: %s", 1, "evaluation_metric", - INTAKE_ENDPOINT, + "http://localhost:9126/vcr/datadog/api/intake/llm-obs/v2/eval-metric", 403, b'{"status":"error","code":403,"errors":["Forbidden"],"statuspage":"http://status.datadoghq.com","twitter":"http://twitter.com/datadogops","email":"support@datadoghq.com"}', # noqa ) @@ -90,39 +96,55 @@ def test_send_metric_no_api_key(mock_writer_logs): ) -@pytest.mark.vcr_logs def test_send_categorical_metric(mock_writer_logs): - llmobs_eval_metric_writer = LLMObsEvalMetricWriter(1, 1, is_agentless=True, _site=DD_SITE, _api_key=DD_API_KEY) - llmobs_eval_metric_writer.enqueue(_categorical_metric_event()) + llmobs_eval_metric_writer = LLMObsEvalMetricWriter( + interval=1, + timeout=1, + is_agentless=True, + _api_key=DD_API_KEY, + _override_url="http://localhost:9126/vcr/datadog/", + ) + llmobs_eval_metric_writer.enqueue(_categorical_metric_event(label="toxicity", value="very")) llmobs_eval_metric_writer.periodic() mock_writer_logs.debug.assert_has_calls( [mock.call("encoded %d LLMObs %s events to be sent", 1, "evaluation_metric")] ) -@pytest.mark.vcr_logs def test_send_score_metric(mock_writer_logs): - llmobs_eval_metric_writer = LLMObsEvalMetricWriter(1, 1, is_agentless=True, _site=DD_SITE, _api_key=DD_API_KEY) - llmobs_eval_metric_writer.enqueue(_score_metric_event()) + llmobs_eval_metric_writer = LLMObsEvalMetricWriter( + interval=1, + timeout=1, + is_agentless=True, + _site=DD_SITE, + _api_key=DD_API_KEY, + _override_url="http://localhost:9126/vcr/datadog/", + ) + llmobs_eval_metric_writer.enqueue(_score_metric_event(label="sentiment", value=0.9)) llmobs_eval_metric_writer.periodic() mock_writer_logs.debug.assert_has_calls( [mock.call("encoded %d LLMObs %s events to be sent", 1, "evaluation_metric")] ) -@pytest.mark.vcr_logs def test_send_timed_events(mock_writer_logs): - llmobs_eval_metric_writer = LLMObsEvalMetricWriter(0.01, 1, is_agentless=True, _site=DD_SITE, _api_key=DD_API_KEY) + llmobs_eval_metric_writer = LLMObsEvalMetricWriter( + interval=0.01, + timeout=1, + is_agentless=True, + _api_key=DD_API_KEY, + _override_url="http://localhost:9126/vcr/datadog/", + ) llmobs_eval_metric_writer.start() mock_writer_logs.reset_mock() - llmobs_eval_metric_writer.enqueue(_score_metric_event()) + llmobs_eval_metric_writer.enqueue(_score_metric_event(label="sentiment", value=0.9)) time.sleep(0.1) mock_writer_logs.debug.assert_has_calls( [mock.call("encoded %d LLMObs %s events to be sent", 1, "evaluation_metric")] ) mock_writer_logs.reset_mock() - llmobs_eval_metric_writer.enqueue(_categorical_metric_event()) + llmobs_eval_metric_writer.enqueue(_categorical_metric_event(label="toxicity", value="very")) time.sleep(0.1) mock_writer_logs.debug.assert_has_calls( [mock.call("encoded %d LLMObs %s events to be sent", 1, "evaluation_metric")] @@ -130,6 +152,7 @@ def test_send_timed_events(mock_writer_logs): llmobs_eval_metric_writer.stop() +# TODO: use vcr proxy for this test @pytest.mark.vcr_logs def test_send_multiple_events(mock_writer_logs): llmobs_eval_metric_writer = LLMObsEvalMetricWriter(1, 1, is_agentless=True, _site=DD_SITE, _api_key=DD_API_KEY) @@ -151,13 +174,13 @@ def test_send_on_exit(mock_writer_logs, run_python_code_in_subprocess): out, err, status, pid = run_python_code_in_subprocess( """ from ddtrace.llmobs._writer import LLMObsEvalMetricWriter -from tests.llmobs.test_llmobs_eval_metric_agentless_writer import _score_metric_event +from tests.llmobs.test_llmobs_eval_metric_agentless_writer import _categorical_metric_event llmobs_eval_metric_writer = LLMObsEvalMetricWriter( - 0.01, 1, is_agentless=True, _site="datad0g.com", _api_key="" + interval=0.01, timeout=1, is_agentless=True, _api_key="", _override_url="http://localhost:9126/vcr/datadog/" ) llmobs_eval_metric_writer.start() -llmobs_eval_metric_writer.enqueue(_score_metric_event()) +llmobs_eval_metric_writer.enqueue(_categorical_metric_event(label="api-key", value="wrong-api-key")) """, env=env, ) diff --git a/tests/llmobs/test_llmobs_evaluator_runner.py b/tests/llmobs/test_llmobs_evaluator_runner.py index 6b5806111c3..ca7331912ac 100644 --- a/tests/llmobs/test_llmobs_evaluator_runner.py +++ b/tests/llmobs/test_llmobs_evaluator_runner.py @@ -92,6 +92,7 @@ def test_evaluator_runner_multiple_evaluators(llmobs, mock_llmobs_eval_metric_wr ] +# TODO: use vcr proxy for this test def test_evaluator_runner_on_exit(mock_writer_logs, run_python_code_in_subprocess): env = os.environ.copy() pypath = [os.path.dirname(os.path.dirname(os.path.dirname(__file__)))] From b7884f698cba17a84b6248dfd6560cfa7e8a3ef8 Mon Sep 17 00:00:00 2001 From: Sam Brenner Date: Wed, 3 Sep 2025 11:12:30 -0400 Subject: [PATCH 2/7] evaluator runner test --- ..._llm-obs_v2_eval-metric_post_42090a9a.yaml | 43 +++++++++++++++++++ tests/llmobs/test_llmobs_evaluator_runner.py | 8 +++- 2 files changed, 49 insertions(+), 2 deletions(-) create mode 100644 tests/llmobs/llmobs_cassettes/datadog/datadog_api_intake_llm-obs_v2_eval-metric_post_42090a9a.yaml diff --git a/tests/llmobs/llmobs_cassettes/datadog/datadog_api_intake_llm-obs_v2_eval-metric_post_42090a9a.yaml b/tests/llmobs/llmobs_cassettes/datadog/datadog_api_intake_llm-obs_v2_eval-metric_post_42090a9a.yaml new file mode 100644 index 00000000000..e9e39472e05 --- /dev/null +++ b/tests/llmobs/llmobs_cassettes/datadog/datadog_api_intake_llm-obs_v2_eval-metric_post_42090a9a.yaml @@ -0,0 +1,43 @@ +interactions: +- request: + body: '{"data": {"type": "evaluation_metric", "attributes": {"metrics": [{"join_on": + {"span": {"span_id": "123", "trace_id": "1234"}}, "label": "dummy", "metric_type": + "score", "timestamp_ms": 1756911917780, "score_value": 1.0, "ml_app": "unnamed-ml-app", + "tags": ["ddtrace.version:3.13.0.dev56+gf40756451.d20250822", "ml_app:unnamed-ml-app"]}]}}}' + headers: + Accept: + - '*/*' + ? !!python/object/apply:multidict._multidict.istr + - Accept-Encoding + : - identity + Connection: + - keep-alive + Content-Length: + - '340' + ? !!python/object/apply:multidict._multidict.istr + - Content-Type + : - application/json + User-Agent: + - python-requests/2.32.4 + method: POST + uri: https://api.datadoghq.com/api/intake/llm-obs/v2/eval-metric + response: + body: + string: '{"status":"error","code":403,"errors":["Forbidden"],"statuspage":"http://status.datadoghq.com","twitter":"http://twitter.com/datadogops","email":"support@datadoghq.com"}' + headers: + connection: + - close + content-length: + - '169' + content-type: + - application/json + date: + - Wed, 03 Sep 2025 15:05:17 GMT + strict-transport-security: + - max-age=31536000; includeSubDomains; preload + x-content-type-options: + - nosniff + status: + code: 403 + message: Forbidden +version: 1 diff --git a/tests/llmobs/test_llmobs_evaluator_runner.py b/tests/llmobs/test_llmobs_evaluator_runner.py index ca7331912ac..37d132a5c02 100644 --- a/tests/llmobs/test_llmobs_evaluator_runner.py +++ b/tests/llmobs/test_llmobs_evaluator_runner.py @@ -92,7 +92,6 @@ def test_evaluator_runner_multiple_evaluators(llmobs, mock_llmobs_eval_metric_wr ] -# TODO: use vcr proxy for this test def test_evaluator_runner_on_exit(mock_writer_logs, run_python_code_in_subprocess): env = os.environ.copy() pypath = [os.path.dirname(os.path.dirname(os.path.dirname(__file__)))] @@ -107,7 +106,12 @@ def test_evaluator_runner_on_exit(mock_writer_logs, run_python_code_in_subproces from ddtrace.llmobs._evaluators.runner import EvaluatorRunner from tests.llmobs._utils import DummyEvaluator -LLMObs.enable(api_key="dummy-api-key", site="datad0g.com", ml_app="unnamed-ml-app", agentless_enabled=True) +LLMObs.enable(api_key="dummy-api-key", ml_app="unnamed-ml-app", agentless_enabled=True) + +LLMObs._instance._llmobs_eval_metric_writer._override_url = "http://localhost:9126/vcr/datadog/" +LLMObs._instance._llmobs_eval_metric_writer._intake = "http://localhost:9126/vcr/datadog/" +LLMObs._instance._llmobs_eval_metric_writer._endpoint = "api/intake/llm-obs/v2/eval-metric" + LLMObs._instance._evaluator_runner.evaluators.append(DummyEvaluator(llmobs_service=LLMObs)) LLMObs._instance._evaluator_runner.start() LLMObs._instance._evaluator_runner.enqueue({"span_id": "123", "trace_id": "1234"}, None) From 6dd1dac3c17ba5d1c0c9d2b6db3d60e40b0df491 Mon Sep 17 00:00:00 2001 From: Sam Brenner Date: Thu, 4 Sep 2025 21:50:33 -0400 Subject: [PATCH 3/7] fmt --- tests/llmobs/test_llmobs_eval_metric_agentless_writer.py | 1 - 1 file changed, 1 deletion(-) diff --git a/tests/llmobs/test_llmobs_eval_metric_agentless_writer.py b/tests/llmobs/test_llmobs_eval_metric_agentless_writer.py index 523f3094250..069de2c48ee 100644 --- a/tests/llmobs/test_llmobs_eval_metric_agentless_writer.py +++ b/tests/llmobs/test_llmobs_eval_metric_agentless_writer.py @@ -4,7 +4,6 @@ import mock import pytest -from ddtrace.llmobs._constants import AGENTLESS_EVAL_BASE_URL from ddtrace.llmobs._writer import LLMObsEvalMetricWriter from tests.utils import override_global_config From a12e63e7e29ce64464b723fa4b25acb4d1931e3b Mon Sep 17 00:00:00 2001 From: Sam Brenner Date: Thu, 4 Sep 2025 21:52:09 -0400 Subject: [PATCH 4/7] remove todo --- tests/llmobs/test_llmobs_eval_metric_agentless_writer.py | 1 - 1 file changed, 1 deletion(-) diff --git a/tests/llmobs/test_llmobs_eval_metric_agentless_writer.py b/tests/llmobs/test_llmobs_eval_metric_agentless_writer.py index 069de2c48ee..28d73ae34d3 100644 --- a/tests/llmobs/test_llmobs_eval_metric_agentless_writer.py +++ b/tests/llmobs/test_llmobs_eval_metric_agentless_writer.py @@ -152,7 +152,6 @@ def test_send_timed_events(mock_writer_logs): llmobs_eval_metric_writer.stop() -# TODO: use vcr proxy for this test @pytest.mark.vcr_logs def test_send_multiple_events(mock_writer_logs): llmobs_eval_metric_writer = LLMObsEvalMetricWriter(1, 1, is_agentless=True, _site=DD_SITE, _api_key=DD_API_KEY) From 48c6385e3b2e4ab524ca7d25e85548fac06cac0d Mon Sep 17 00:00:00 2001 From: Sam Brenner Date: Fri, 5 Sep 2025 08:31:31 -0400 Subject: [PATCH 5/7] fixes --- ddtrace/llmobs/_writer.py | 14 +++++- ..._llm-obs_v2_eval-metric_post_2d529580.yaml | 43 +++++++++++++++++++ ..._llm-obs_v2_eval-metric_post_ca2bfa88.yaml | 43 +++++++++++++++++++ ...est_llmobs_eval_metric_agentless_writer.py | 11 ++--- 4 files changed, 105 insertions(+), 6 deletions(-) create mode 100644 tests/llmobs/llmobs_cassettes/datadog/datadog_api_intake_llm-obs_v2_eval-metric_post_2d529580.yaml create mode 100644 tests/llmobs/llmobs_cassettes/datadog/datadog_api_intake_llm-obs_v2_eval-metric_post_ca2bfa88.yaml diff --git a/ddtrace/llmobs/_writer.py b/ddtrace/llmobs/_writer.py index fbe52a984ec..75521126c33 100644 --- a/ddtrace/llmobs/_writer.py +++ b/ddtrace/llmobs/_writer.py @@ -11,6 +11,7 @@ from typing import Union from typing import cast from urllib.parse import quote +from urllib.parse import urlparse # TypedDict was added to typing in python 3.8 @@ -155,8 +156,19 @@ def __init__( self._intake: str = self._override_url or ( f"{self.AGENTLESS_BASE_URL}.{self._site}" if is_agentless else agent_config.trace_agent_url ) + self._endpoint: str = self.ENDPOINT if is_agentless else f"{EVP_PROXY_AGENT_BASE_PATH}{self.ENDPOINT}" - if self._override_url: + override_url_parsed = urlparse(self._override_url) + if ( + self._override_url + and override_url_parsed.scheme != "unix" + and override_url_parsed.path != "/" + and override_url_parsed.path != "" + ): + # handles cases where the override url includes a base path, ie + # http://localhost:8080/foo/bar and endpoint /buz/baz + # we need to strip the base path from the endpoint so the eventual urljoin works properly + # to form http://localhost:8080/foo/bar/buz/baz self._endpoint = self.ENDPOINT.lstrip("/") self._headers: Dict[str, str] = {"Content-Type": "application/json"} diff --git a/tests/llmobs/llmobs_cassettes/datadog/datadog_api_intake_llm-obs_v2_eval-metric_post_2d529580.yaml b/tests/llmobs/llmobs_cassettes/datadog/datadog_api_intake_llm-obs_v2_eval-metric_post_2d529580.yaml new file mode 100644 index 00000000000..48a5e5ea1d3 --- /dev/null +++ b/tests/llmobs/llmobs_cassettes/datadog/datadog_api_intake_llm-obs_v2_eval-metric_post_2d529580.yaml @@ -0,0 +1,43 @@ +interactions: +- request: + body: '{"data": {"type": "evaluation_metric", "attributes": {"metrics": [{"join_on": + {"span": {"span_id": "123", "trace_id": "1234"}}, "label": "dummy", "metric_type": + "score", "timestamp_ms": 1757074814754, "score_value": 1.0, "ml_app": "unnamed-ml-app", + "tags": ["ddtrace.version:3.13.0.dev56+gf40756451.d20250822", "ml_app:unnamed-ml-app"]}]}}}' + headers: + Accept: + - '*/*' + ? !!python/object/apply:multidict._multidict.istr + - Accept-Encoding + : - identity + Connection: + - keep-alive + Content-Length: + - '340' + ? !!python/object/apply:multidict._multidict.istr + - Content-Type + : - application/json + User-Agent: + - python-requests/2.32.4 + method: POST + uri: https://api.datadoghq.com/api/intake/llm-obs/v2/eval-metric + response: + body: + string: '{"status":"error","code":403,"errors":["Forbidden"],"statuspage":"http://status.datadoghq.com","twitter":"http://twitter.com/datadogops","email":"support@datadoghq.com"}' + headers: + connection: + - close + content-length: + - '169' + content-type: + - application/json + date: + - Fri, 05 Sep 2025 12:20:14 GMT + strict-transport-security: + - max-age=31536000; includeSubDomains; preload + x-content-type-options: + - nosniff + status: + code: 403 + message: Forbidden +version: 1 diff --git a/tests/llmobs/llmobs_cassettes/datadog/datadog_api_intake_llm-obs_v2_eval-metric_post_ca2bfa88.yaml b/tests/llmobs/llmobs_cassettes/datadog/datadog_api_intake_llm-obs_v2_eval-metric_post_ca2bfa88.yaml new file mode 100644 index 00000000000..b8be0daf561 --- /dev/null +++ b/tests/llmobs/llmobs_cassettes/datadog/datadog_api_intake_llm-obs_v2_eval-metric_post_ca2bfa88.yaml @@ -0,0 +1,43 @@ +interactions: +- request: + body: '{"data": {"type": "evaluation_metric", "attributes": {"metrics": [{"join_on": + {"span": {"span_id": "123", "trace_id": "1234"}}, "label": "dummy", "metric_type": + "score", "timestamp_ms": 1757074518879, "score_value": 1.0, "ml_app": "unnamed-ml-app", + "tags": ["ddtrace.version:3.13.0.dev56+gf40756451.d20250822", "ml_app:unnamed-ml-app"]}]}}}' + headers: + Accept: + - '*/*' + ? !!python/object/apply:multidict._multidict.istr + - Accept-Encoding + : - identity + Connection: + - keep-alive + Content-Length: + - '340' + ? !!python/object/apply:multidict._multidict.istr + - Content-Type + : - application/json + User-Agent: + - python-requests/2.32.4 + method: POST + uri: https://api.datadoghq.com/api/intake/llm-obs/v2/eval-metric + response: + body: + string: '{"status":"error","code":403,"errors":["Forbidden"],"statuspage":"http://status.datadoghq.com","twitter":"http://twitter.com/datadogops","email":"support@datadoghq.com"}' + headers: + connection: + - close + content-length: + - '169' + content-type: + - application/json + date: + - Fri, 05 Sep 2025 12:15:18 GMT + strict-transport-security: + - max-age=31536000; includeSubDomains; preload + x-content-type-options: + - nosniff + status: + code: 403 + message: Forbidden +version: 1 diff --git a/tests/llmobs/test_llmobs_eval_metric_agentless_writer.py b/tests/llmobs/test_llmobs_eval_metric_agentless_writer.py index 28d73ae34d3..58171added4 100644 --- a/tests/llmobs/test_llmobs_eval_metric_agentless_writer.py +++ b/tests/llmobs/test_llmobs_eval_metric_agentless_writer.py @@ -5,6 +5,7 @@ import pytest from ddtrace.llmobs._writer import LLMObsEvalMetricWriter +from ddtrace.llmobs._writer import LLMObsEvaluationMetricEvent from tests.utils import override_global_config @@ -13,7 +14,7 @@ DD_API_KEY = os.getenv("DD_API_KEY", default="") -def _categorical_metric_event(label: str, value: str): +def _categorical_metric_event(label: str, value: str) -> LLMObsEvaluationMetricEvent: return { "join_on": { "span": { @@ -29,7 +30,7 @@ def _categorical_metric_event(label: str, value: str): } -def _score_metric_event(label: str, value: float): +def _score_metric_event(label: str, value: float) -> LLMObsEvaluationMetricEvent: return { "join_on": { "span": { @@ -87,7 +88,7 @@ def test_send_metric_bad_api_key(mock_writer_logs): def test_send_metric_no_api_key(mock_writer_logs): with override_global_config(dict(_dd_api_key="")): llmobs_eval_metric_writer = LLMObsEvalMetricWriter(1, 1, is_agentless=True, _site=DD_SITE, _api_key="") - llmobs_eval_metric_writer.enqueue(_categorical_metric_event()) + llmobs_eval_metric_writer.enqueue(_categorical_metric_event(label="toxicity", value="very")) llmobs_eval_metric_writer.periodic() mock_writer_logs.warning.assert_called_with( "A Datadog API key is required for sending data to LLM Observability in agentless mode. " @@ -156,8 +157,8 @@ def test_send_timed_events(mock_writer_logs): def test_send_multiple_events(mock_writer_logs): llmobs_eval_metric_writer = LLMObsEvalMetricWriter(1, 1, is_agentless=True, _site=DD_SITE, _api_key=DD_API_KEY) mock_writer_logs.reset_mock() - llmobs_eval_metric_writer.enqueue(_score_metric_event()) - llmobs_eval_metric_writer.enqueue(_categorical_metric_event()) + llmobs_eval_metric_writer.enqueue(_score_metric_event(label="sentiment", value=0.9)) + llmobs_eval_metric_writer.enqueue(_categorical_metric_event(label="toxicity", value="very")) llmobs_eval_metric_writer.periodic() mock_writer_logs.debug.assert_has_calls( [mock.call("encoded %d LLMObs %s events to be sent", 2, "evaluation_metric")] From b0145f7266ac3240e6ee34cb8bd57d7f3f21bf08 Mon Sep 17 00:00:00 2001 From: Sam Brenner Date: Sat, 6 Sep 2025 21:38:13 -0400 Subject: [PATCH 6/7] review suggestions --- ddtrace/llmobs/_writer.py | 7 +------ tests/llmobs/conftest.py | 8 +++++++- ...test_llmobs_eval_metric_agentless_writer.py | 18 +++++++++--------- 3 files changed, 17 insertions(+), 16 deletions(-) diff --git a/ddtrace/llmobs/_writer.py b/ddtrace/llmobs/_writer.py index 75521126c33..08fda0525f3 100644 --- a/ddtrace/llmobs/_writer.py +++ b/ddtrace/llmobs/_writer.py @@ -159,12 +159,7 @@ def __init__( self._endpoint: str = self.ENDPOINT if is_agentless else f"{EVP_PROXY_AGENT_BASE_PATH}{self.ENDPOINT}" override_url_parsed = urlparse(self._override_url) - if ( - self._override_url - and override_url_parsed.scheme != "unix" - and override_url_parsed.path != "/" - and override_url_parsed.path != "" - ): + if self._override_url and override_url_parsed.scheme != "unix" and override_url_parsed.path not in ("/", ""): # handles cases where the override url includes a base path, ie # http://localhost:8080/foo/bar and endpoint /buz/baz # we need to strip the base path from the endpoint so the eventual urljoin works properly diff --git a/tests/llmobs/conftest.py b/tests/llmobs/conftest.py index 768e5fd9b95..43007811229 100644 --- a/tests/llmobs/conftest.py +++ b/tests/llmobs/conftest.py @@ -253,6 +253,11 @@ def llmobs_enable_opts(): yield {"project_name": "test-project"} +@pytest.fixture +def llmobs_api_proxy_url(): + return "http://localhost:9126/vcr/datadog" + + @pytest.fixture def llmobs( ddtrace_global_config, @@ -261,6 +266,7 @@ def llmobs( llmobs_enable_opts, llmobs_env, llmobs_span_writer, + llmobs_api_proxy_url, mock_llmobs_eval_metric_writer, mock_llmobs_evaluator_runner, ): @@ -274,7 +280,7 @@ def llmobs( llmobs_service.enable(_tracer=tracer, **llmobs_enable_opts) llmobs_service._instance._llmobs_span_writer = llmobs_span_writer llmobs_service._instance._llmobs_span_writer.start() - llmobs_service._instance._dne_client._intake = "http://localhost:9126/vcr/datadog" + llmobs_service._instance._dne_client._intake = llmobs_api_proxy_url yield llmobs_service tracer.shutdown() llmobs_service.disable() diff --git a/tests/llmobs/test_llmobs_eval_metric_agentless_writer.py b/tests/llmobs/test_llmobs_eval_metric_agentless_writer.py index 58171added4..99b27c1f2c2 100644 --- a/tests/llmobs/test_llmobs_eval_metric_agentless_writer.py +++ b/tests/llmobs/test_llmobs_eval_metric_agentless_writer.py @@ -63,12 +63,12 @@ def test_buffer_limit(mock_writer_logs): @pytest.mark.skip(reason="Skipping due to flakiness in hitting the staging endpoint") -def test_send_metric_bad_api_key(mock_writer_logs): +def test_send_metric_bad_api_key(mock_writer_logs, llmobs_api_proxy_url): llmobs_eval_metric_writer = LLMObsEvalMetricWriter( interval=1, timeout=1, is_agentless=True, - _override_url="http://localhost:9126/vcr/datadog/", + _override_url=llmobs_api_proxy_url, _api_key="", ) @@ -79,7 +79,7 @@ def test_send_metric_bad_api_key(mock_writer_logs): "failed to send %d LLMObs %s events to %s, got response code %d, status: %s", 1, "evaluation_metric", - "http://localhost:9126/vcr/datadog/api/intake/llm-obs/v2/eval-metric", + f"{llmobs_api_proxy_url}/api/intake/llm-obs/v2/eval-metric", 403, b'{"status":"error","code":403,"errors":["Forbidden"],"statuspage":"http://status.datadoghq.com","twitter":"http://twitter.com/datadogops","email":"support@datadoghq.com"}', # noqa ) @@ -97,13 +97,13 @@ def test_send_metric_no_api_key(mock_writer_logs): ) -def test_send_categorical_metric(mock_writer_logs): +def test_send_categorical_metric(mock_writer_logs, llmobs_api_proxy_url): llmobs_eval_metric_writer = LLMObsEvalMetricWriter( interval=1, timeout=1, is_agentless=True, _api_key=DD_API_KEY, - _override_url="http://localhost:9126/vcr/datadog/", + _override_url=llmobs_api_proxy_url, ) llmobs_eval_metric_writer.enqueue(_categorical_metric_event(label="toxicity", value="very")) llmobs_eval_metric_writer.periodic() @@ -112,14 +112,14 @@ def test_send_categorical_metric(mock_writer_logs): ) -def test_send_score_metric(mock_writer_logs): +def test_send_score_metric(mock_writer_logs, llmobs_api_proxy_url): llmobs_eval_metric_writer = LLMObsEvalMetricWriter( interval=1, timeout=1, is_agentless=True, _site=DD_SITE, _api_key=DD_API_KEY, - _override_url="http://localhost:9126/vcr/datadog/", + _override_url=llmobs_api_proxy_url, ) llmobs_eval_metric_writer.enqueue(_score_metric_event(label="sentiment", value=0.9)) llmobs_eval_metric_writer.periodic() @@ -128,13 +128,13 @@ def test_send_score_metric(mock_writer_logs): ) -def test_send_timed_events(mock_writer_logs): +def test_send_timed_events(mock_writer_logs, llmobs_api_proxy_url): llmobs_eval_metric_writer = LLMObsEvalMetricWriter( interval=0.01, timeout=1, is_agentless=True, _api_key=DD_API_KEY, - _override_url="http://localhost:9126/vcr/datadog/", + _override_url=llmobs_api_proxy_url, ) llmobs_eval_metric_writer.start() mock_writer_logs.reset_mock() From 8cbe231130ae0815bf8a30ad0d2d96ad0c93030f Mon Sep 17 00:00:00 2001 From: Sam Brenner Date: Sat, 6 Sep 2025 21:44:50 -0400 Subject: [PATCH 7/7] remove newline --- ddtrace/llmobs/_writer.py | 1 - 1 file changed, 1 deletion(-) diff --git a/ddtrace/llmobs/_writer.py b/ddtrace/llmobs/_writer.py index 8d5f431bf1d..aff2cf7e48e 100644 --- a/ddtrace/llmobs/_writer.py +++ b/ddtrace/llmobs/_writer.py @@ -156,7 +156,6 @@ def __init__( self._intake: str = self._override_url or ( f"{self.AGENTLESS_BASE_URL}.{self._site}" if is_agentless else agent_config.trace_agent_url ) - self._endpoint: str = self.ENDPOINT if is_agentless else f"{EVP_PROXY_AGENT_BASE_PATH}{self.ENDPOINT}" override_url_parsed = urlparse(self._override_url) if self._override_url and override_url_parsed.scheme != "unix" and override_url_parsed.path not in ("/", ""):