Skip to content

Commit cf83794

Browse files
feat: DIS-678 kvbm modularity: standalone metrics endpoint (#3433)
Signed-off-by: richardhuo-nv <rihuo@nvidia.com>
1 parent 2d59861 commit cf83794

File tree

16 files changed

+262
-249
lines changed

16 files changed

+262
-249
lines changed

deploy/metrics/grafana_dashboards/grafana-kvbm-dashboard.json

Lines changed: 13 additions & 109 deletions
Original file line numberDiff line numberDiff line change
@@ -118,8 +118,8 @@
118118
"targets": [
119119
{
120120
"disableTextWrap": false,
121-
"editorMode": "builder",
122-
"expr": "dynamo_component_matched_tokens{dynamo_namespace=\"kvbm_connector_leader\"}",
121+
"editorMode": "code",
122+
"expr": "kvbm_matched_tokens",
123123
"fullMetaSearch": false,
124124
"includeNullMetadata": true,
125125
"legendFormat": "__auto",
@@ -227,8 +227,8 @@
227227
"targets": [
228228
{
229229
"disableTextWrap": false,
230-
"editorMode": "builder",
231-
"expr": "dynamo_component_offload_requests{dynamo_namespace=\"kvbm_connector_leader\"}",
230+
"editorMode": "code",
231+
"expr": "kvbm_offload_requests",
232232
"fullMetaSearch": false,
233233
"includeNullMetadata": true,
234234
"legendFormat": "__auto",
@@ -323,8 +323,8 @@
323323
"targets": [
324324
{
325325
"disableTextWrap": false,
326-
"editorMode": "builder",
327-
"expr": "dynamo_component_offload_blocks_d2h{dynamo_namespace=\"kvbm_connector_leader\"}",
326+
"editorMode": "code",
327+
"expr": "kvbm_offload_blocks_d2h",
328328
"fullMetaSearch": false,
329329
"includeNullMetadata": true,
330330
"legendFormat": "__auto",
@@ -336,102 +336,6 @@
336336
"title": "Offload Blocks - Device to Host",
337337
"type": "timeseries"
338338
},
339-
{
340-
"datasource": {
341-
"type": "prometheus",
342-
"uid": "P1809F7CD0C75ACF3"
343-
},
344-
"fieldConfig": {
345-
"defaults": {
346-
"color": {
347-
"mode": "palette-classic"
348-
},
349-
"custom": {
350-
"axisBorderShow": false,
351-
"axisCenteredZero": false,
352-
"axisColorMode": "text",
353-
"axisLabel": "",
354-
"axisPlacement": "auto",
355-
"barAlignment": 0,
356-
"barWidthFactor": 0.6,
357-
"drawStyle": "line",
358-
"fillOpacity": 0,
359-
"gradientMode": "none",
360-
"hideFrom": {
361-
"legend": false,
362-
"tooltip": false,
363-
"viz": false
364-
},
365-
"insertNulls": false,
366-
"lineInterpolation": "linear",
367-
"lineWidth": 1,
368-
"pointSize": 5,
369-
"scaleDistribution": {
370-
"type": "linear"
371-
},
372-
"showPoints": "auto",
373-
"spanNulls": false,
374-
"stacking": {
375-
"group": "A",
376-
"mode": "none"
377-
},
378-
"thresholdsStyle": {
379-
"mode": "off"
380-
}
381-
},
382-
"mappings": [],
383-
"thresholds": {
384-
"mode": "absolute",
385-
"steps": [
386-
{
387-
"color": "green"
388-
},
389-
{
390-
"color": "red",
391-
"value": 80
392-
}
393-
]
394-
}
395-
},
396-
"overrides": []
397-
},
398-
"gridPos": {
399-
"h": 8,
400-
"w": 12,
401-
"x": 0,
402-
"y": 18
403-
},
404-
"id": 1,
405-
"options": {
406-
"legend": {
407-
"calcs": [],
408-
"displayMode": "list",
409-
"placement": "bottom",
410-
"showLegend": true
411-
},
412-
"tooltip": {
413-
"hideZeros": false,
414-
"mode": "single",
415-
"sort": "none"
416-
}
417-
},
418-
"pluginVersion": "12.0.1",
419-
"targets": [
420-
{
421-
"disableTextWrap": false,
422-
"editorMode": "builder",
423-
"expr": "dynamo_component_save_kv_layer_requests{dynamo_namespace=\"kvbm_connector_worker\"}",
424-
"fullMetaSearch": false,
425-
"includeNullMetadata": true,
426-
"legendFormat": "__auto",
427-
"range": true,
428-
"refId": "A",
429-
"useBackend": false
430-
}
431-
],
432-
"title": "Save KV Layer Requests",
433-
"type": "timeseries"
434-
},
435339
{
436340
"collapsed": false,
437341
"gridPos": {
@@ -528,8 +432,8 @@
528432
"targets": [
529433
{
530434
"disableTextWrap": false,
531-
"editorMode": "builder",
532-
"expr": "dynamo_component_onboard_requests{dynamo_namespace=\"kvbm_connector_leader\"}",
435+
"editorMode": "code",
436+
"expr": "kvbm_onboard_requests",
533437
"fullMetaSearch": false,
534438
"includeNullMetadata": true,
535439
"legendFormat": "__auto",
@@ -624,8 +528,8 @@
624528
"targets": [
625529
{
626530
"disableTextWrap": false,
627-
"editorMode": "builder",
628-
"expr": "dynamo_component_onboard_blocks_h2d{dynamo_namespace=\"kvbm_connector_leader\"}",
531+
"editorMode": "code",
532+
"expr": "kvbm_onboard_blocks_h2d",
629533
"fullMetaSearch": false,
630534
"includeNullMetadata": true,
631535
"legendFormat": "__auto",
@@ -720,8 +624,8 @@
720624
"targets": [
721625
{
722626
"disableTextWrap": false,
723-
"editorMode": "builder",
724-
"expr": "dynamo_component_onboard_blocks_d2d{dynamo_namespace=\"kvbm_connector_leader\"}",
627+
"editorMode": "code",
628+
"expr": "kvbm_onboard_blocks_d2d",
725629
"fullMetaSearch": false,
726630
"includeNullMetadata": true,
727631
"legendFormat": "__auto",
@@ -750,4 +654,4 @@
750654
"title": "KVBM Dashboard",
751655
"uid": "3f679257-70a5-402c-92b4-05382337b548",
752656
"version": 7
753-
}
657+
}

deploy/metrics/prometheus.yml

Lines changed: 2 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -59,16 +59,10 @@ scrape_configs:
5959
- targets: ['host.docker.internal:9091'] # metrics aggregation service on host
6060

6161
# KVBM metrics
62-
- job_name: 'kvbm-leader-metrics'
62+
- job_name: 'kvbm-metrics'
6363
scrape_interval: 2s
6464
static_configs:
65-
- targets: ['host.docker.internal:6882']
66-
67-
# KVBM worker related metrics
68-
- job_name: 'kvbm-worker-metrics'
69-
scrape_interval: 2s
70-
static_configs:
71-
- targets: ['host.docker.internal:6881']
65+
- targets: ['host.docker.internal:6880']
7266

7367
# Uncomment to see its own Prometheus metrics
7468
# - job_name: 'prometheus'

docs/guides/run_kvbm_in_trtllm.md

Lines changed: 39 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -109,18 +109,51 @@ Follow below steps to enable metrics collection and view via Grafana dashboard:
109109
# Start the basic services (etcd & natsd), along with Prometheus and Grafana
110110
docker compose -f deploy/docker-compose.yml --profile metrics up -d
111111

112-
# set env var DYN_SYSTEM_ENABLED to true, DYN_SYSTEM_PORT to 6880, DYN_KVBM_SLEEP to 5, when launch via dynamo
113-
# NOTE: Make sure port 6881 (for KVBM worker metrics) and port 6882 (for KVBM leader metrics) are available.
114-
# NOTE: DYN_KVBM_SLEEP is needed to avoid metrics port conflict between KVBM leader and worker
115-
DYN_SYSTEM_ENABLED=true DYN_SYSTEM_PORT=6880 DYN_KVBM_SLEEP=5 \
112+
# Set the env var DYN_KVBM_METRICS to true when launching via dynamo.
113+
# Optionally set DYN_KVBM_METRICS_PORT to choose the /metrics port (default: 6880).
114+
DYN_KVBM_METRICS=true \
116115
python3 -m dynamo.trtllm \
117116
--model-path deepseek-ai/DeepSeek-R1-Distill-Llama-8B \
118117
--served-model-name deepseek-ai/DeepSeek-R1-Distill-Llama-8B \
119118
--extra-engine-args /tmp/kvbm_llm_api_config.yaml &
120119

121120
# Optional: if a firewall blocks the KVBM metrics port, allow it so Prometheus can scrape the metrics
122-
sudo ufw allow 6881/tcp
123-
sudo ufw allow 6882/tcp
121+
sudo ufw allow 6880/tcp
124122
```
125123

126124
View grafana metrics via http://localhost:3001 (default login: dynamo/dynamo) and look for KVBM Dashboard
125+
126+
## Benchmark KVBM
127+
128+
Once the model is loaded and ready, follow the steps below to use LMBenchmark to benchmark KVBM performance:
129+
```bash
130+
git clone https://github.com/LMCache/LMBenchmark.git
131+
132+
# Example: run the synthetic multi-turn chat dataset.
133+
# The arguments passed to the script are: model, endpoint, output file prefix, and QPS.
134+
cd LMBenchmark/synthetic-multi-round-qa
135+
./long_input_short_output_run.sh \
136+
"deepseek-ai/DeepSeek-R1-Distill-Llama-8B" \
137+
"http://localhost:8000" \
138+
"benchmark_kvbm" \
139+
1
140+
141+
# Average TTFT and other performance numbers are printed in the output of the command above
142+
```
143+
More details about how to use LMBenchmark can be found [here](https://github.com/LMCache/LMBenchmark).
144+
145+
`NOTE`: if metrics are enabled as described in the section above, you can observe KV offloading and KV onboarding in the Grafana dashboard.
146+
147+
To compare, you can remove the `kv_connector_config` section from the LLM API config and run `trtllm-serve` with the updated config as the baseline.
148+
```bash
149+
cat > "/tmp/llm_api_config.yaml" <<EOF
150+
backend: pytorch
151+
cuda_graph_config: null
152+
kv_cache_config:
153+
enable_partial_reuse: false
154+
free_gpu_memory_fraction: 0.80
155+
EOF
156+
157+
# run trtllm-serve for the baseline for comparison
158+
trtllm-serve deepseek-ai/DeepSeek-R1-Distill-Llama-8B --host localhost --port 8000 --backend pytorch --extra_llm_api_options /tmp/llm_api_config.yaml &
159+
```

docs/guides/run_kvbm_in_vllm.md

Lines changed: 5 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -77,23 +77,22 @@ Follow below steps to enable metrics collection and view via Grafana dashboard:
7777
# Start the basic services (etcd & natsd), along with Prometheus and Grafana
7878
docker compose -f deploy/docker-compose.yml --profile metrics up -d
7979

80-
# set env var DYN_SYSTEM_ENABLED to true, DYN_SYSTEM_PORT to 6880, DYN_KVBM_SLEEP to 5, when launch via dynamo
81-
# NOTE: Make sure port 6881 (for KVBM worker metrics) and port 6882 (for KVBM leader metrics) are available.
82-
DYN_SYSTEM_ENABLED=true DYN_SYSTEM_PORT=6880 \
80+
# Set the env var DYN_KVBM_METRICS to true when launching via dynamo.
81+
# Optionally set DYN_KVBM_METRICS_PORT to choose the /metrics port (default: 6880).
82+
DYN_KVBM_METRICS=true \
8383
python -m dynamo.vllm \
8484
--model deepseek-ai/DeepSeek-R1-Distill-Llama-8B \
8585
--connector kvbm &
8686

8787
# Optional: if a firewall blocks the KVBM metrics port, allow it so Prometheus can scrape the metrics
88-
sudo ufw allow 6881/tcp
89-
sudo ufw allow 6882/tcp
88+
sudo ufw allow 6880/tcp
9089
```
9190

9291
View grafana metrics via http://localhost:3001 (default login: dynamo/dynamo) and look for KVBM Dashboard
9392

9493
## Benchmark KVBM
9594

96-
Once vllm serve is ready, follow below steps to use LMBenchmark to benchmark KVBM performance:
95+
Once the model is loaded and ready, follow the steps below to use LMBenchmark to benchmark KVBM performance:
9796
```bash
9897
git clone https://github.com/LMCache/LMBenchmark.git
9998

lib/bindings/python/rust/llm/block_manager/vllm/connector/leader.rs

Lines changed: 33 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -5,7 +5,7 @@ pub mod recorder;
55
pub mod slot;
66

77
use super::*;
8-
use dynamo_llm::block_manager::metrics_kvbm::KvbmMetrics;
8+
use dynamo_llm::block_manager::metrics_kvbm::{KvbmMetrics, KvbmMetricsRegistry};
99
use dynamo_runtime::DistributedRuntime;
1010
use slot::{ConnectorSlotManager, SlotError, SlotManager, SlotState};
1111

@@ -15,7 +15,6 @@ use crate::llm::block_manager::{
1515
VllmBlockManager, distributed::KvbmLeader as PyKvbmLeader, vllm::KvbmRequest,
1616
vllm::connector::leader::slot::VllmConnectorSlot,
1717
};
18-
use dynamo_runtime::metrics::prometheus_names::kvbm_connector;
1918

2019
use dynamo_llm::block_manager::{
2120
BasicMetadata, DiskStorage, ImmutableBlock, PinnedStorage,
@@ -103,11 +102,11 @@ impl KvConnectorLeader {
103102
let drt = drt.inner().clone();
104103
let handle: Handle = drt.runtime().primary();
105104

106-
let ns = drt
107-
.namespace(kvbm_connector::KVBM_CONNECTOR_LEADER)
108-
.unwrap();
109-
110-
let kvbm_metrics = KvbmMetrics::new(&ns);
105+
let kvbm_metrics = KvbmMetrics::new(
106+
&KvbmMetricsRegistry::default(),
107+
kvbm_metrics_endpoint_enabled(),
108+
parse_kvbm_metrics_port(),
109+
);
111110
let kvbm_metrics_clone = kvbm_metrics.clone();
112111

113112
let slot_manager_cell = Arc::new(OnceLock::new());
@@ -615,3 +614,30 @@ impl PyKvConnectorLeader {
615614
.map_err(to_pyerr)
616615
}
617616
}
617+
618+
/// Reports whether the KVBM metrics endpoint has been requested via the
/// `DYN_KVBM_METRICS` environment variable.
///
/// Returns `true` only when the variable is set to `1` or `true` (ASCII
/// case-insensitive). Surrounding whitespace is ignored, matching how
/// `DYN_KVBM_METRICS_PORT` is parsed, so values such as `"true\n"` coming from
/// env files are still honored. An unset variable, a non-Unicode value, or any
/// other value yields `false`.
pub fn kvbm_metrics_endpoint_enabled() -> bool {
    std::env::var("DYN_KVBM_METRICS")
        .map(|raw| {
            let flag = raw.trim();
            flag == "1" || flag.eq_ignore_ascii_case("true")
        })
        .unwrap_or(false)
}
623+
624+
pub fn parse_kvbm_metrics_port() -> u16 {
625+
match std::env::var("DYN_KVBM_METRICS_PORT") {
626+
Ok(val) => match val.trim().parse::<u16>() {
627+
Ok(port) => port,
628+
Err(_) => {
629+
tracing::warn!(
630+
"[kvbm] Invalid DYN_KVBM_METRICS_PORT='{}', falling back to 6880",
631+
val
632+
);
633+
6880
634+
}
635+
},
636+
Err(_) => {
637+
tracing::warn!(
638+
"DYN_KVBM_METRICS_PORT not present or couldn’t be interpreted, falling back to 6880"
639+
);
640+
6880
641+
}
642+
}
643+
}

lib/bindings/python/rust/llm/block_manager/vllm/connector/leader/recorder.rs

Lines changed: 5 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -100,11 +100,11 @@ impl KvConnectorLeaderRecorder {
100100
let drt = drt.inner().clone();
101101
let handle: Handle = drt.runtime().primary();
102102

103-
let ns = drt
104-
.namespace(kvbm_connector::KVBM_CONNECTOR_LEADER)
105-
.unwrap();
106-
107-
let kvbm_metrics = KvbmMetrics::new(&ns);
103+
let kvbm_metrics = KvbmMetrics::new(
104+
&KvbmMetricsRegistry::default(),
105+
kvbm_metrics_endpoint_enabled(),
106+
parse_kvbm_metrics_port(),
107+
);
108108
let kvbm_metrics_clone = kvbm_metrics.clone();
109109

110110
let token = CancellationToken::new();

lib/bindings/python/rust/llm/block_manager/vllm/connector/trtllm_leader.rs

Lines changed: 9 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -8,10 +8,12 @@ use crate::llm::block_manager::BlockManagerBuilder;
88
use crate::llm::block_manager::vllm::connector::leader::slot::{
99
ConnectorSlotManager, SlotManager, SlotState,
1010
};
11+
use crate::llm::block_manager::vllm::connector::leader::{
12+
kvbm_metrics_endpoint_enabled, parse_kvbm_metrics_port,
13+
};
1114
use crate::llm::block_manager::{distributed::KvbmLeader as PyKvbmLeader, vllm::KvbmRequest};
1215
use anyhow;
13-
use dynamo_llm::block_manager::metrics_kvbm::KvbmMetrics;
14-
use dynamo_runtime::metrics::prometheus_names::kvbm_connector;
16+
use dynamo_llm::block_manager::metrics_kvbm::{KvbmMetrics, KvbmMetricsRegistry};
1517
use std::collections::HashSet;
1618
use std::sync::{Arc, OnceLock};
1719
use tokio::runtime::Handle;
@@ -76,11 +78,12 @@ impl KvConnectorLeader {
7678
let drt = drt.inner().clone();
7779
let handle: Handle = drt.runtime().primary();
7880

79-
let ns = drt
80-
.namespace(kvbm_connector::KVBM_CONNECTOR_LEADER)
81-
.unwrap();
81+
let kvbm_metrics = KvbmMetrics::new(
82+
&KvbmMetricsRegistry::default(),
83+
kvbm_metrics_endpoint_enabled(),
84+
parse_kvbm_metrics_port(),
85+
);
8286

83-
let kvbm_metrics = KvbmMetrics::new(&ns);
8487
let kvbm_metrics_clone = kvbm_metrics.clone();
8588

8689
let slot_manager_cell = Arc::new(OnceLock::new());

0 commit comments

Comments
 (0)