Skip to content

Commit c9eb6a8

Browse files
feat: expose estimated kv cache hit in dynamo-run (#1246)
Signed-off-by: Hongkuan Zhou <tedzhouhk@gmail.com> Co-authored-by: coderabbitai[bot] <136622811+coderabbitai[bot]@users.noreply.github.com>
1 parent b889948 commit c9eb6a8

File tree

13 files changed

+74
-30
lines changed

13 files changed

+74
-30
lines changed

deploy/sdk/src/dynamo/sdk/lib/config.py

Lines changed: 32 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -24,6 +24,8 @@ class ServiceConfig(dict):
2424
"""Configuration store that inherits from dict for simpler access patterns"""
2525

2626
_instance = None
27+
COMMON_CONFIG_SERVICE = "Common"
28+
COMMON_CONFIG_KEY = "common-configs"
2729

2830
@classmethod
2931
def get_instance(cls):
@@ -49,6 +51,33 @@ def require(self, service_name, key):
4951
raise ValueError(f"{service_name}.{key} must be specified in configuration")
5052
return self[service_name][key]
5153

54+
@classmethod
def get_parsed_config(cls, service_name):
    """Return a service's config as a plain dict with common configs applied.

    Args:
        service_name: Top-level key of the service section to look up in
            the shared config instance.

    Returns:
        A new dict built from a shallow copy of the service's section with
        the ``ServiceArgs`` and ``common-configs`` entries removed. For
        every key listed under the service's ``common-configs``, the value
        from the ``Common`` section is filled in unless the service already
        defines that key itself. An empty dict is returned when the service
        has no section or its section is empty/None.
    """
    instance = cls.get_instance()

    if service_name not in instance:
        return {}

    section = instance[service_name]
    # A section that is present but empty (its value is None or {}) has
    # nothing to parse; calling .copy() on None would raise AttributeError.
    if not section:
        return {}

    # Build a fresh dict so the stored config is never mutated; ServiceArgs
    # is excluded from the parsed view.
    service_config = {
        key: value for key, value in section.items() if key != "ServiceArgs"
    }

    # Fill in the requested common values without overriding anything the
    # service sets explicitly.
    common = instance.get(cls.COMMON_CONFIG_SERVICE)
    common_config_keys = service_config.get(cls.COMMON_CONFIG_KEY)
    if common is not None and common_config_keys is not None:
        for key in common_config_keys:
            if key in common and key not in service_config:
                service_config[key] = common[key]

    # The common-configs list is bookkeeping only; drop it from the result.
    service_config.pop(cls.COMMON_CONFIG_KEY, None)

    return service_config
80+
5281
def as_args(self, service_name, prefix=""):
5382
"""Extract configs as CLI args for a service, with optional prefix filtering.
5483
@@ -57,8 +86,6 @@ def as_args(self, service_name, prefix=""):
5786
the component's `common-configs` setting, and that key has not been overridden by the
5887
component's config.
5988
"""
60-
COMMON_CONFIG_SERVICE = "Common"
61-
COMMON_CONFIG_KEY = "common-configs"
6289

6390
if service_name not in self:
6491
return []
@@ -69,7 +96,7 @@ def add_to_args(args: list[str], key: str, value):
6996
if prefix and not key.startswith(prefix):
7097
return
7198

72-
if key.endswith(COMMON_CONFIG_KEY):
99+
if key.endswith(self.COMMON_CONFIG_KEY):
73100
return
74101

75102
# Strip prefix if needed
@@ -90,8 +117,8 @@ def add_to_args(args: list[str], key: str, value):
90117
if "ServiceArgs" in service_config:
91118
del service_config["ServiceArgs"]
92119

93-
if (common := self.get(COMMON_CONFIG_SERVICE)) is not None and (
94-
common_config_keys := service_config.get(COMMON_CONFIG_KEY)
120+
if (common := self.get(self.COMMON_CONFIG_SERVICE)) is not None and (
121+
common_config_keys := service_config.get(self.COMMON_CONFIG_KEY)
95122
) is not None:
96123
for key in common_config_keys:
97124
if key in common and key not in service_config:

examples/llm/components/frontend.py

Lines changed: 1 addition & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -64,8 +64,7 @@ class Frontend:
6464

6565
def __init__(self):
6666
"""Initialize Frontend service with HTTP server and model configuration."""
67-
config = ServiceConfig.get_instance()
68-
frontend_config = FrontendConfig(**config.get("Frontend", {}))
67+
frontend_config = FrontendConfig(**ServiceConfig.get_parsed_config("Frontend"))
6968
self.frontend_config = frontend_config
7069
self.process = None
7170
self.setup_model()

examples/sglang/components/frontend.py

Lines changed: 1 addition & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -60,8 +60,7 @@ class Frontend:
6060

6161
def __init__(self):
6262
"""Initialize Frontend service with HTTP server and model configuration."""
63-
config = ServiceConfig.get_instance()
64-
frontend_config = FrontendConfig(**config.get("Frontend", {}))
63+
frontend_config = FrontendConfig(**ServiceConfig.get_parsed_config("Frontend"))
6564
self.frontend_config = frontend_config
6665
self.process = None
6766

examples/tensorrt_llm/components/frontend.py

Lines changed: 1 addition & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -61,8 +61,7 @@ class Frontend:
6161
processor = depends(Processor)
6262

6363
def __init__(self):
64-
config = ServiceConfig.get_instance()
65-
frontend_config = FrontendConfig(**config.get("Frontend", {}))
64+
frontend_config = FrontendConfig(**ServiceConfig.get_parsed_config("Frontend"))
6665

6766
# Chat/completions Endpoint
6867
subprocess.run(

examples/vllm_v0/components/frontend.py

Lines changed: 7 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -63,10 +63,13 @@ class Frontend:
6363

6464
def __init__(self):
6565
"""Initialize Frontend service with HTTP server and model configuration."""
66-
config = ServiceConfig.get_instance()
67-
self.frontend_config = FrontendConfig(**config.get("Frontend", {}))
66+
self.frontend_config = FrontendConfig(
67+
**ServiceConfig.get_parsed_config("Frontend")
68+
)
6869
self.process = None
6970

71+
logger.warning(f"Frontend config: {self.frontend_config}")
72+
7073
self.start_ingress_and_processor()
7174

7275
def start_ingress_and_processor(self):
@@ -87,6 +90,8 @@ def start_ingress_and_processor(self):
8790
self.frontend_config.router,
8891
]
8992

93+
logger.info(f"Frontend cmd: {cmd}")
94+
9095
self.process = subprocess.Popen(
9196
cmd,
9297
stdout=None,

examples/vllm_v0/components/worker.py

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -212,7 +212,7 @@ async def generate(self, request: PreprocessedRequest):
212212
prefill_queue_size = await prefill_queue.get_queue_size()
213213
disagg_router_decision = await self.disaggregated_router.prefill_remote(
214214
len(request.token_ids),
215-
0, # TODO: return prefix hit rate from dynamo-run router
215+
request.estimated_prefix_hit_num_blocks * self.engine_args.block_size,
216216
prefill_queue_size,
217217
)
218218
else:
@@ -225,12 +225,12 @@ async def generate(self, request: PreprocessedRequest):
225225
remote_prefill_request_callback=self.get_remote_prefill_request_callback(),
226226
)
227227
logger.info(
228-
f"Prefilling remotely for request {request_id} with length {len(request.token_ids)}"
228+
f"Prefilling remotely for request {request_id} with length {len(request.token_ids)} (estimated prefix hit length {(request.estimated_prefix_hit_num_blocks or 0) * self.engine_args.block_size})"
229229
)
230230
else:
231231
remote_prefill_params = None
232232
logger.info(
233-
f"Prefilling locally for request {request_id} with length {len(request.token_ids)}"
233+
f"Prefilling locally for request {request_id} with length {len(request.token_ids)} (estimated prefix hit length {request.estimated_prefix_hit_num_blocks * self.engine_args.block_size})"
234234
)
235235

236236
sampling_params = SamplingParams(**self.default_sampling_params)

examples/vllm_v0/configs/agg_kv.yaml

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -16,13 +16,13 @@ Common:
1616
model: deepseek-ai/DeepSeek-R1-Distill-Llama-8B
1717
block-size: 64
1818
max-model-len: 16384
19+
router: kv
1920

2021
Frontend:
2122
served_model_name: deepseek-ai/DeepSeek-R1-Distill-Llama-8B
2223
endpoint: dynamo.VllmWorker.generate
2324
port: 8000
24-
router: kv
25-
common-configs: [block-size]
25+
common-configs: [block-size, router]
2626

2727
VllmWorker:
2828
enforce-eager: true
@@ -32,4 +32,4 @@ VllmWorker:
3232
workers: 1
3333
resources:
3434
gpu: '1'
35-
common-configs: [model, block-size, max-model-len]
35+
common-configs: [model, block-size, max-model-len, router]

examples/vllm_v0/configs/disagg_kv.yaml

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -17,13 +17,13 @@ Common:
1717
block-size: 64
1818
max-model-len: 16384
1919
kv-transfer-config: '{"kv_connector":"DynamoNixlConnector"}'
20+
router: kv
2021

2122
Frontend:
2223
served_model_name: deepseek-ai/DeepSeek-R1-Distill-Llama-8B
2324
endpoint: dynamo.VllmWorker.generate
2425
port: 8000
25-
router: kv
26-
common-configs: [block-size]
26+
common-configs: [block-size, router]
2727

2828
VllmWorker:
2929
remote-prefill: true
@@ -35,7 +35,7 @@ VllmWorker:
3535
workers: 1
3636
resources:
3737
gpu: 1
38-
common-configs: [model, block-size, max-model-len, kv-transfer-config]
38+
common-configs: [model, block-size, max-model-len, kv-transfer-config, router]
3939

4040
PrefillWorker:
4141
max-num-batched-tokens: 16384

examples/vllm_v0/utils/protocol.py

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -52,6 +52,7 @@ class PreprocessedRequest(BaseModel):
5252
eos_token_ids: List[TokenIdType] = Field(default_factory=list)
5353
mdc_sum: Optional[str] = None
5454
annotations: List[str] = Field(default_factory=list)
55+
estimated_prefix_hit_num_blocks: Optional[int] = None
5556

5657

5758
class DisaggPreprocessedRequest(BaseModel):

examples/vllm_v1/components/frontend.py

Lines changed: 1 addition & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -62,8 +62,7 @@ class Frontend:
6262

6363
def __init__(self):
6464
"""Initialize Frontend service with HTTP server and model configuration."""
65-
config = ServiceConfig.get_instance()
66-
frontend_config = FrontendConfig(**config.get("Frontend", {}))
65+
frontend_config = FrontendConfig(**ServiceConfig.get_parsed_config("Frontend"))
6766
self.frontend_config = frontend_config
6867
self.process = None
6968

0 commit comments

Comments
 (0)