
Commit d54de92

migrate latency metrics to span events
1 parent 4dd7d2e commit d54de92

12 files changed: +124 / -107 lines changed

ddtrace/contrib/internal/vllm/patch.py

Lines changed: 1 addition & 5 deletions
@@ -14,7 +14,6 @@
 from .extractors import get_model_name
 from .utils import create_span
 from .utils import inject_trace_context
-from .utils import set_latency_metrics


 logger = get_logger(__name__)
@@ -120,14 +119,11 @@ def traced_output_processor_process_outputs(vllm, pin, func, instance, args, kwargs):
     integration.llmobs_set_tags(
         span,
         args=[],
-        kwargs={"request_data": data},
+        kwargs={"request_data": data, "stats": span_info["stats"]},
         response=None,
         operation=operation,
     )

-    if span_info["stats"]:
-        set_latency_metrics(span, span_info["stats"])
-
     span.finish()

     return result

ddtrace/llmobs/_constants.py

Lines changed: 6 additions & 0 deletions
@@ -44,6 +44,12 @@
 CACHE_READ_INPUT_TOKENS_METRIC_KEY = "cache_read_input_tokens"
 BILLABLE_CHARACTER_COUNT_METRIC_KEY = "billable_character_count"

+TIME_TO_FIRST_TOKEN_METRIC_KEY = "time_to_first_token"
+TIME_IN_QUEUE_METRIC_KEY = "time_in_queue"
+TIME_IN_MODEL_PREFILL_METRIC_KEY = "time_in_model_prefill"
+TIME_IN_MODEL_DECODE_METRIC_KEY = "time_in_model_decode"
+TIME_IN_MODEL_INFERENCE_METRIC_KEY = "time_in_model_inference"
+
 EVP_PROXY_AGENT_BASE_PATH = "/evp_proxy/v2"
 EVAL_ENDPOINT = "/api/intake/llm-obs/v2/eval-metric"
 SPAN_ENDPOINT = "/api/v2/llmobs"
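
For orientation, the new metric keys defined above replace, name for name, the vllm.latency.* metrics that this commit stops writing onto the APM span (see the IGNORE_FIELDS and snapshot removals below). The mapping below is an illustrative sketch, not part of the diff; the old model_forward and model_execute fields have no counterpart in this commit.

# Assumed correspondence (illustration only, not part of this commit) between
# the removed vllm.latency.* APM span metrics and the new LLMObs metric keys.
OLD_TO_NEW_LATENCY_KEYS = {
    "vllm.latency.ttft": "time_to_first_token",
    "vllm.latency.queue": "time_in_queue",
    "vllm.latency.prefill": "time_in_model_prefill",
    "vllm.latency.decode": "time_in_model_decode",
    "vllm.latency.inference": "time_in_model_inference",
    # "vllm.latency.model_forward" and "vllm.latency.model_execute" are dropped
    # without a replacement key here.
}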

ddtrace/llmobs/_integrations/vllm.py

Lines changed: 38 additions & 8 deletions
@@ -19,6 +19,11 @@
 from ddtrace.llmobs._constants import OUTPUT_TOKENS_METRIC_KEY
 from ddtrace.llmobs._constants import OUTPUT_VALUE
 from ddtrace.llmobs._constants import SPAN_KIND
+from ddtrace.llmobs._constants import TIME_IN_MODEL_DECODE_METRIC_KEY
+from ddtrace.llmobs._constants import TIME_IN_MODEL_INFERENCE_METRIC_KEY
+from ddtrace.llmobs._constants import TIME_IN_MODEL_PREFILL_METRIC_KEY
+from ddtrace.llmobs._constants import TIME_IN_QUEUE_METRIC_KEY
+from ddtrace.llmobs._constants import TIME_TO_FIRST_TOKEN_METRIC_KEY
 from ddtrace.llmobs._constants import TOTAL_TOKENS_METRIC_KEY
 from ddtrace.llmobs._integrations.base import BaseLLMIntegration
 from ddtrace.llmobs.utils import Document
@@ -59,22 +64,46 @@ def _build_metadata(self, data: RequestData) -> Dict[str, Any]:

         return md

-    def _build_metrics(self, data: RequestData) -> Dict[str, Any]:
-        """Build token metrics from request data."""
+    def _build_metrics(self, data: RequestData, stats=None) -> Dict[str, Any]:
+        """Build token and latency metrics from request data."""
         it = int(data.input_tokens or 0)
         ot = int(data.output_tokens or 0)
-        return {
+        metrics = {
             INPUT_TOKENS_METRIC_KEY: it,
             OUTPUT_TOKENS_METRIC_KEY: ot,
             TOTAL_TOKENS_METRIC_KEY: it + ot,
         }

-    def _build_embedding_context(self, data: RequestData) -> Dict[str, Any]:
+        # Add latency metrics if stats are available
+        if stats:
+            if stats.first_token_latency:
+                metrics[TIME_TO_FIRST_TOKEN_METRIC_KEY] = float(stats.first_token_latency)
+
+            queued = stats.queued_ts
+            scheduled = stats.scheduled_ts
+            first_token = stats.first_token_ts
+            last_token = stats.last_token_ts
+
+            if queued and scheduled:
+                metrics[TIME_IN_QUEUE_METRIC_KEY] = float(scheduled - queued)
+
+            if scheduled and first_token:
+                metrics[TIME_IN_MODEL_PREFILL_METRIC_KEY] = float(first_token - scheduled)
+
+            if first_token and last_token and last_token > first_token:
+                metrics[TIME_IN_MODEL_DECODE_METRIC_KEY] = float(last_token - first_token)
+
+            if scheduled and last_token:
+                metrics[TIME_IN_MODEL_INFERENCE_METRIC_KEY] = float(last_token - scheduled)
+
+        return metrics
+
+    def _build_embedding_context(self, data: RequestData, stats=None) -> Dict[str, Any]:
         """Build LLMObs context for embedding operations."""
         ctx: Dict[str, Any] = {
             SPAN_KIND: "embedding",
             METADATA: self._build_metadata(data),
-            METRICS: self._build_metrics(data),
+            METRICS: self._build_metrics(data, stats),
         }

         docs: List[Document] = []
@@ -94,12 +123,12 @@ def _build_embedding_context(self, data: RequestData) -> Dict[str, Any]:

         return ctx

-    def _build_completion_context(self, data: RequestData) -> Dict[str, Any]:
+    def _build_completion_context(self, data: RequestData, stats=None) -> Dict[str, Any]:
         """Build LLMObs context for completion operations."""
         ctx: Dict[str, Any] = {
             SPAN_KIND: "llm",
             METADATA: self._build_metadata(data),
-            METRICS: self._build_metrics(data),
+            METRICS: self._build_metrics(data, stats),
         }

         if data.prompt:
@@ -123,7 +152,8 @@ def _llmobs_set_tags(
         if data is None:
             return

-        ctx = self._build_embedding_context(data) if operation == "embedding" else self._build_completion_context(data)
+        stats = kwargs.get("stats")
+        ctx = self._build_embedding_context(data, stats) if operation == "embedding" else self._build_completion_context(data, stats)
         ctx[MODEL_NAME] = span.get_tag("vllm.request.model") or ""
         ctx[MODEL_PROVIDER] = span.get_tag("vllm.request.provider") or ""
         span._set_ctx_items(ctx)
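
To make the timestamp arithmetic in _build_metrics concrete, here is a minimal, self-contained sketch. SimpleNamespace stands in for the vLLM per-request stats object and the timestamp values are invented for illustration; only the derivations mirror the code above.

from types import SimpleNamespace

# Hypothetical stats values; attribute names follow the diff above.
stats = SimpleNamespace(
    first_token_latency=0.065,  # seconds from request start to first token
    queued_ts=100.000,          # request enqueued
    scheduled_ts=100.002,       # request scheduled onto the engine
    first_token_ts=100.065,     # first token emitted (end of prefill)
    last_token_ts=100.118,      # last token emitted (end of decode)
)

metrics = {
    "time_to_first_token": float(stats.first_token_latency),
    "time_in_queue": float(stats.scheduled_ts - stats.queued_ts),
    "time_in_model_prefill": float(stats.first_token_ts - stats.scheduled_ts),
    "time_in_model_decode": float(stats.last_token_ts - stats.first_token_ts),
    "time_in_model_inference": float(stats.last_token_ts - stats.scheduled_ts),
}
print(metrics)  # these values land on the LLMObs span event, not on the APM span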

tests/contrib/vllm/test_api_app.py

Lines changed: 9 additions & 7 deletions
@@ -13,13 +13,6 @@


 IGNORE_FIELDS = [
-    "metrics.vllm.latency.ttft",
-    "metrics.vllm.latency.queue",
-    "metrics.vllm.latency.prefill",
-    "metrics.vllm.latency.decode",
-    "metrics.vllm.latency.inference",
-    "metrics.vllm.latency.model_forward",
-    "metrics.vllm.latency.model_execute",
     "meta._dd.p.llmobs_trace_id",
 ]

@@ -88,6 +81,10 @@ def test_rag_parent_child(vllm, llmobs_span_writer):
     assert event["meta"]["metadata"]["num_cached_tokens"] == 0
     assert event["metrics"]["input_tokens"] > 0
     assert event["metrics"]["output_tokens"] == 0
+    assert "time_to_first_token" in event["metrics"]
+    assert "time_in_queue" in event["metrics"]
+    assert "time_in_model_prefill" in event["metrics"]
+    assert "time_in_model_inference" in event["metrics"]
     assert "ml_app:<ml-app-name>" in event["tags"]
     assert "service:tests.contrib.vllm" in event["tags"]

@@ -103,5 +100,10 @@ def test_rag_parent_child(vllm, llmobs_span_writer):
     assert gen_event["meta"]["metadata"]["num_cached_tokens"] == 0
     assert gen_event["metrics"]["input_tokens"] == 27
     assert gen_event["metrics"]["output_tokens"] > 0
+    assert "time_to_first_token" in gen_event["metrics"]
+    assert "time_in_queue" in gen_event["metrics"]
+    assert "time_in_model_prefill" in gen_event["metrics"]
+    assert "time_in_model_decode" in gen_event["metrics"]
+    assert "time_in_model_inference" in gen_event["metrics"]
     assert "ml_app:<ml-app-name>" in gen_event["tags"]
     assert "service:tests.contrib.vllm" in gen_event["tags"]

tests/contrib/vllm/test_vllm_llmobs.py

Lines changed: 56 additions & 14 deletions
@@ -6,15 +6,7 @@
 from ._utils import get_simple_chat_template


-IGNORE_FIELDS = [
-    "metrics.vllm.latency.ttft",
-    "metrics.vllm.latency.queue",
-    "metrics.vllm.latency.prefill",
-    "metrics.vllm.latency.decode",
-    "metrics.vllm.latency.inference",
-    "metrics.vllm.latency.model_forward",
-    "metrics.vllm.latency.model_execute",
-]
+IGNORE_FIELDS = []


 @pytest.mark.snapshot(ignores=IGNORE_FIELDS)
@@ -41,7 +33,16 @@ def test_llmobs_basic(llmobs_events, mock_tracer, opt_125m_llm):
             "finish_reason": "length",
             "num_cached_tokens": 0,
         },
-        token_metrics={"input_tokens": 6, "output_tokens": 8, "total_tokens": 14},
+        token_metrics={
+            "input_tokens": 6,
+            "output_tokens": 8,
+            "total_tokens": 14,
+            "time_to_first_token": mock.ANY,
+            "time_in_queue": mock.ANY,
+            "time_in_model_prefill": mock.ANY,
+            "time_in_model_decode": mock.ANY,
+            "time_in_model_inference": mock.ANY,
+        },
         tags={"ml_app": "<ml-app-name>", "service": "tests.contrib.vllm"},
     )
     assert llmobs_events[0] == expected
@@ -95,7 +96,16 @@ def test_llmobs_chat(llmobs_events, mock_tracer, opt_125m_llm):
             "finish_reason": "length",
             "num_cached_tokens": mock.ANY,
         },
-        token_metrics={"input_tokens": 37, "output_tokens": 16, "total_tokens": 53},
+        token_metrics={
+            "input_tokens": 37,
+            "output_tokens": 16,
+            "total_tokens": 53,
+            "time_to_first_token": mock.ANY,
+            "time_in_queue": mock.ANY,
+            "time_in_model_prefill": mock.ANY,
+            "time_in_model_decode": mock.ANY,
+            "time_in_model_inference": mock.ANY,
+        },
         tags={"ml_app": "<ml-app-name>", "service": "tests.contrib.vllm"},
     )
     assert llmobs_events[0] == expected
@@ -128,7 +138,15 @@ def test_llmobs_classify(llmobs_events, mock_tracer, bge_reranker_llm):
         input_documents=[{"text": prompt}],
         output_value="[1 embedding(s) returned with size 1]",
         metadata={"embedding_dim": 1, "num_cached_tokens": 0},
-        token_metrics={"input_tokens": 7, "output_tokens": 0, "total_tokens": 7},
+        token_metrics={
+            "input_tokens": 7,
+            "output_tokens": 0,
+            "total_tokens": 7,
+            "time_to_first_token": mock.ANY,
+            "time_in_queue": mock.ANY,
+            "time_in_model_prefill": mock.ANY,
+            "time_in_model_inference": mock.ANY,
+        },
         tags={"ml_app": "<ml-app-name>", "service": "tests.contrib.vllm"},
     )
     assert event == expected
@@ -161,7 +179,15 @@ def test_llmobs_embed(llmobs_events, mock_tracer, e5_small_llm):
         input_documents=[{"text": prompt}],
         output_value="[1 embedding(s) returned with size 384]",
         metadata={"embedding_dim": 384, "num_cached_tokens": 0},
-        token_metrics={"input_tokens": 7, "output_tokens": 0, "total_tokens": 7},
+        token_metrics={
+            "input_tokens": 7,
+            "output_tokens": 0,
+            "total_tokens": 7,
+            "time_to_first_token": mock.ANY,
+            "time_in_queue": mock.ANY,
+            "time_in_model_prefill": mock.ANY,
+            "time_in_model_inference": mock.ANY,
+        },
         tags={"ml_app": "<ml-app-name>", "service": "tests.contrib.vllm"},
     )
     assert event == expected
@@ -194,7 +220,15 @@ def test_llmobs_reward(llmobs_events, mock_tracer, bge_reranker_llm):
         input_documents=[{"text": prompt}],
         output_value="[7 embedding(s) returned with size 1024]",
         metadata={"embedding_dim": 1024, "num_cached_tokens": 0},
-        token_metrics={"input_tokens": 7, "output_tokens": 0, "total_tokens": 7},
+        token_metrics={
+            "input_tokens": 7,
+            "output_tokens": 0,
+            "total_tokens": 7,
+            "time_to_first_token": mock.ANY,
+            "time_in_queue": mock.ANY,
+            "time_in_model_prefill": mock.ANY,
+            "time_in_model_inference": mock.ANY,
+        },
         tags={"ml_app": "<ml-app-name>", "service": "tests.contrib.vllm"},
     )
     assert event == expected
@@ -223,11 +257,19 @@ def test_llmobs_score(llmobs_events, mock_tracer, bge_reranker_llm):
             "input_tokens": 19,
            "output_tokens": 0,
            "total_tokens": 19,
+            "time_to_first_token": mock.ANY,
+            "time_in_queue": mock.ANY,
+            "time_in_model_prefill": mock.ANY,
+            "time_in_model_inference": mock.ANY,
         },
         "[0, 4865, 83, 70, 10323, 111, 9942, 32, 2, 2, 581, 10323, 111, 9942, 83, 7270, 5, 2]": {
             "input_tokens": 18,
             "output_tokens": 0,
             "total_tokens": 18,
+            "time_to_first_token": mock.ANY,
+            "time_in_queue": mock.ANY,
+            "time_in_model_prefill": mock.ANY,
+            "time_in_model_inference": mock.ANY,
         },
     }

tests/snapshots/tests.contrib.vllm.test_api_app.test_rag_parent_child.json

Lines changed: 4 additions & 21 deletions
@@ -47,11 +47,7 @@
 "metrics": {
   "_dd.measured": 1,
   "_dd.top_level": 1,
-  "process_id": 5676,
-  "vllm.latency.inference": 0.15978306904435158,
-  "vllm.latency.prefill": 0.15978306904435158,
-  "vllm.latency.queue": 1.8851947970688343e-05,
-  "vllm.latency.ttft": 0.3274271488189697
+  "process_id": 5676
 },
 "duration": 328518565,
 "start": 1759410315481061632
@@ -77,11 +73,7 @@
 "metrics": {
   "_dd.measured": 1,
   "_dd.top_level": 1,
-  "process_id": 5676,
-  "vllm.latency.inference": 0.007373907952569425,
-  "vllm.latency.prefill": 0.007373907952569425,
-  "vllm.latency.queue": 0.00016414199490100145,
-  "vllm.latency.ttft": 0.009606599807739258
+  "process_id": 5676
 },
 "duration": 10421016,
 "start": 1759410315810782464
@@ -107,11 +99,7 @@
 "metrics": {
   "_dd.measured": 1,
   "_dd.top_level": 1,
-  "process_id": 5676,
-  "vllm.latency.inference": 0.007004529004916549,
-  "vllm.latency.prefill": 0.007004529004916549,
-  "vllm.latency.queue": 0.0001580320531502366,
-  "vllm.latency.ttft": 0.009066581726074219
+  "process_id": 5676
 },
 "duration": 9502380,
 "start": 1759410315821967872
@@ -137,12 +125,7 @@
 "metrics": {
   "_dd.measured": 1,
   "_dd.top_level": 1,
-  "process_id": 5676,
-  "vllm.latency.decode": 0.13968857692088932,
-  "vllm.latency.inference": 0.20357014192268252,
-  "vllm.latency.prefill": 0.0638815650017932,
-  "vllm.latency.queue": 1.562107354402542e-05,
-  "vllm.latency.ttft": 0.17212820053100586
+  "process_id": 5676
 },
 "duration": 312517816,
 "start": 1759410333296862208

tests/snapshots/tests.contrib.vllm.test_vllm_llmobs.test_llmobs_basic.json

Lines changed: 1 addition & 6 deletions
@@ -21,12 +21,7 @@
   "_dd.top_level": 1,
   "_dd.tracer_kr": 1.0,
   "_sampling_priority_v1": 1,
-  "process_id": 5676,
-  "vllm.latency.decode": 0.05201620701700449,
-  "vllm.latency.inference": 0.11478733806870878,
-  "vllm.latency.prefill": 0.06277113105170429,
-  "vllm.latency.queue": 0.0001168189337477088,
-  "vllm.latency.ttft": 0.06522464752197266
+  "process_id": 5676
 },
 "duration": 118164202,
 "start": 1759410364206987008

tests/snapshots/tests.contrib.vllm.test_vllm_llmobs.test_llmobs_chat.json

Lines changed: 1 addition & 6 deletions
@@ -21,12 +21,7 @@
   "_dd.top_level": 1,
   "_dd.tracer_kr": 1.0,
   "_sampling_priority_v1": 1,
-  "process_id": 5676,
-  "vllm.latency.decode": 0.09951003501191735,
-  "vllm.latency.inference": 0.10793503595050424,
-  "vllm.latency.prefill": 0.00842500093858689,
-  "vllm.latency.queue": 0.0001842339988797903,
-  "vllm.latency.ttft": 0.010640621185302734
+  "process_id": 5676
 },
 "duration": 110508762,
 "start": 1759410376640131328

tests/snapshots/tests.contrib.vllm.test_vllm_llmobs.test_llmobs_classify.json

Lines changed: 2 additions & 10 deletions
@@ -21,11 +21,7 @@
   "_dd.top_level": 1,
   "_dd.tracer_kr": 1.0,
   "_sampling_priority_v1": 1,
-  "process_id": 5676,
-  "vllm.latency.inference": 0.021056173951365054,
-  "vllm.latency.prefill": 0.021056173951365054,
-  "vllm.latency.queue": 0.00024107703939080238,
-  "vllm.latency.ttft": 0.023837804794311523
+  "process_id": 5676
 },
 "duration": 24355851,
 "start": 1759410376376144128
@@ -53,11 +49,7 @@
   "_dd.top_level": 1,
   "_dd.tracer_kr": 1.0,
   "_sampling_priority_v1": 1,
-  "process_id": 5676,
-  "vllm.latency.inference": 0.021056173951365054,
-  "vllm.latency.prefill": 0.021056173951365054,
-  "vllm.latency.queue": 5.408399738371372e-05,
-  "vllm.latency.ttft": 0.02319478988647461
+  "process_id": 5676
 },
 "duration": 24859074,
 "start": 1759410376376787200
