Commit b853540

[Core][Hybrid allocator + kv connector 1/n] Enable hybrid allocator + KV cache connector (#25712)
Signed-off-by: KuntaiDu <kuntai@uchicago.edu>
Signed-off-by: Kuntai Du <kuntai@uchicago.edu>
1 parent: 56ed760
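
Before this change, VllmConfig.__post_init__ unconditionally disabled the hybrid KV cache manager whenever a kv_transfer_config was present; this commit removes that override so the hybrid allocator and a KV cache connector can run together, and instead opts out explicitly in the tests and launch scripts that still assume the legacy allocator. A minimal usage sketch of the resulting behavior (connector name and config values are illustrative, not taken from this diff):

    # With the global override gone, a KV connector no longer forces the
    # legacy allocator; opting out is now an explicit, per-engine choice.
    from vllm import LLM
    from vllm.config import KVTransferConfig

    llm = LLM(
        model="meta-llama/Llama-3.2-1B-Instruct",  # model name as used in the tests below
        kv_transfer_config=KVTransferConfig(
            kv_connector="SharedStorageConnector",  # illustrative connector choice
            kv_role="kv_both",
        ),
        # Explicit opt-out, as the updated tests below do:
        # disable_hybrid_kv_cache_manager=True,
    )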

File tree

15 files changed: +113 −18 lines


tests/v1/core/test_scheduler.py

Lines changed: 7 additions & 0 deletions

@@ -899,6 +899,7 @@ def test_kv_connector_basic():
     scheduler = create_scheduler(
         enable_prefix_caching=True,
         use_kv_connector=True,
+        disable_hybrid_kv_cache_manager=True,
     )
     NUM_TOTAL_BLOCKS = scheduler.kv_cache_manager.block_pool.get_num_free_blocks()
     BLOCK_SIZE = scheduler.cache_config.block_size
@@ -1024,6 +1025,7 @@ def test_external_prefix_cache_metrics():
     scheduler = create_scheduler(
         enable_prefix_caching=False,
         use_kv_connector=True,
+        disable_hybrid_kv_cache_manager=True,
     )

     # Mock connector to simulate a partial external cache hit
@@ -1088,6 +1090,7 @@ def test_kv_connector_unable_to_allocate():
         use_kv_connector=True,
         block_size=BLOCK_SIZE,
         num_blocks=NUM_BLOCKS,
+        disable_hybrid_kv_cache_manager=True,
     )
     NUM_MATCHED_NEW_TOKENS = BLOCK_SIZE * 2
     scheduler.connector.get_num_new_matched_tokens = Mock(name="method")
@@ -1171,6 +1174,7 @@ def test_kv_connector_handles_preemption():
         use_kv_connector=True,
         block_size=BLOCK_SIZE,
         num_blocks=NUM_BLOCKS,
+        disable_hybrid_kv_cache_manager=True,
     )

     NUM_MATCHED_NEW_TOKENS = BLOCK_SIZE
@@ -1387,6 +1391,7 @@ def create_scheduler_with_priority(
     block_size: int = 16,
     max_model_len: int | None = None,
     num_speculative_tokens: int | None = None,
+    disable_hybrid_kv_cache_manager: bool = False,
 ) -> Scheduler:
     """Create scheduler with priority policy enabled.

@@ -1411,6 +1416,7 @@ def create_scheduler_with_priority(
         disable_chunked_mm_input=disable_chunked_mm_input,
         enable_chunked_prefill=True,
         policy="priority",  # Enable priority scheduling
+        disable_hybrid_kv_cache_manager=disable_hybrid_kv_cache_manager,
     )
     model_config = ModelConfig(
         model=model,
@@ -2018,6 +2024,7 @@ def test_priority_scheduling_preemption_and_resumption_when_out_of_kv():
         num_blocks=5,  # Can hold 64 tokens (first block is null)
         block_size=16,  # Standard block size
         use_kv_connector=True,
+        disable_hybrid_kv_cache_manager=True,
     )

     # Create a request and schedule it

tests/v1/core/utils.py

Lines changed: 2 additions & 0 deletions

@@ -46,6 +46,7 @@ def create_scheduler(
     num_speculative_tokens: int | None = None,
     skip_tokenizer_init: bool = False,
     async_scheduling: bool = False,
+    disable_hybrid_kv_cache_manager: bool = False,
 ) -> Scheduler | AsyncScheduler:
     """Create scheduler under test.

@@ -70,6 +71,7 @@ def create_scheduler(
         disable_chunked_mm_input=disable_chunked_mm_input,
         enable_chunked_prefill=True,
         async_scheduling=async_scheduling,
+        disable_hybrid_kv_cache_manager=disable_hybrid_kv_cache_manager,
     )
     model_config = ModelConfig(
         model=model,
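
The new keyword is forwarded by the helper into the scheduler configuration, so individual tests opt out per call; a hypothetical invocation (mirroring the test_scheduler.py updates above):

    # Hypothetical test snippet: the helper threads the flag through to
    # SchedulerConfig, keeping KV-connector tests on the legacy allocator.
    scheduler = create_scheduler(
        use_kv_connector=True,
        disable_hybrid_kv_cache_manager=True,
    )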

tests/v1/kv_connector/nixl_integration/run_accuracy_test.sh

Lines changed: 2 additions & 0 deletions

@@ -136,6 +136,7 @@ run_tests_for_model() {
     vllm serve $model_name \
     --port $PORT \
     --enforce-eager \
+    --disable-hybrid-kv-cache-manager \
     --gpu-memory-utilization $GPU_MEMORY_UTILIZATION \
     --tensor-parallel-size $PREFILLER_TP_SIZE \
     --kv-transfer-config '$KV_CONFIG'"
@@ -178,6 +179,7 @@ run_tests_for_model() {
     --port $PORT \
     --enforce-eager \
     --gpu-memory-utilization $GPU_MEMORY_UTILIZATION \
+    --disable-hybrid-kv-cache-manager \
     --kv-transfer-config '$KV_CONFIG'"

   # DP-EP attention mode

tests/v1/kv_connector/nixl_integration/run_edge_case_test.sh

Lines changed: 2 additions & 0 deletions

@@ -85,6 +85,7 @@ run_tests_for_model() {
     --port $PREFILL_PORT \
     --enforce-eager \
     --gpu-memory-utilization 0.2 \
+    --disable-hybrid-kv-cache-manager \
     --kv-transfer-config '$KV_CONFIG'"

   if [ -n "$model_args" ]; then
@@ -103,6 +104,7 @@ run_tests_for_model() {
     --port $DECODE_PORT \
     --enforce-eager \
     --gpu-memory-utilization 0.2 \
+    --disable-hybrid-kv-cache-manager \
     --kv-transfer-config '$KV_CONFIG'"

   if [ -n "$model_args" ]; then

tests/v1/kv_connector/unit/test_multi_connector.py

Lines changed: 1 addition & 0 deletions

@@ -114,6 +114,7 @@ def test_multi_shared_storage_connector_consistency():
         enforce_eager=True,
         gpu_memory_utilization=0.5,
         kv_transfer_config=kv_transfer_config,
+        disable_hybrid_kv_cache_manager=True,
     )
     # Run generation - this should trigger saving KV cache
     _ = llm.generate(PROMPTS, SAMPLING_PARAMS)

tests/v1/kv_connector/unit/test_nixl_connector.py

Lines changed: 1 addition & 0 deletions

@@ -932,6 +932,7 @@ def test_abort_timeout_on_prefiller(monkeypatch, distributed_executor_backend):
         "gpu_memory_utilization": 0.5,
         "kv_transfer_config": kv_transfer_config,
         "distributed_executor_backend": distributed_executor_backend,
+        "disable_hybrid_kv_cache_manager": True,
     }

     timeout = 6

tests/v1/kv_connector/unit/test_shared_storage_connector.py

Lines changed: 1 addition & 0 deletions

@@ -132,6 +132,7 @@ def test_shared_storage_connector_hashes(tmp_path):
         enforce_eager=True,
         kv_transfer_config=kv_transfer_config,
         limit_mm_per_prompt={"image": 2},
+        disable_hybrid_kv_cache_manager=True,
     )

     # don't put this import at the top level

tests/v1/kv_connector/unit/utils.py

Lines changed: 3 additions & 0 deletions

@@ -91,6 +91,9 @@ def create_vllm_config(
         max_num_batched_tokens=max_num_batched_tokens,
         max_model_len=max_model_len,
         enable_chunked_prefill=enable_chunked_prefill,
+        # Disable hybrid KV cache manager for testing
+        # Should be removed after we support hybrid KV cache manager-based testing.
+        disable_hybrid_kv_cache_manager=True,
     )
     model_config = ModelConfig(
         model=model,
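
Since the helper now hard-codes the opt-out, connector unit tests always see the legacy single-type allocator; a hypothetical sanity check (the attribute path follows the vllm/config/vllm.py hunk below):

    # Hypothetical check: the flag set by create_vllm_config() lands on
    # SchedulerConfig, the same field that __post_init__ writes its
    # platform and KV-events fallbacks to.
    vllm_config = create_vllm_config()
    assert vllm_config.scheduler_config.disable_hybrid_kv_cache_manager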

tests/v1/kv_offload/test_cpu_offloading.py

Lines changed: 1 addition & 0 deletions

@@ -27,6 +27,7 @@ def test_cpu_offloading(cpu_block_size: int) -> None:
         model="meta-llama/Llama-3.2-1B-Instruct",
         gpu_memory_utilization=0.5,
         kv_transfer_config=kv_transfer_config,
+        disable_hybrid_kv_cache_manager=True,
     )

     prompts = ["Hi " * 100]

vllm/config/vllm.py

Lines changed: 3 additions & 3 deletions

@@ -40,11 +40,14 @@
     from transformers import PretrainedConfig

     from vllm.model_executor.layers.quantization.base_config import QuantizationConfig
+    from vllm.v1.kv_cache_interface import KVCacheConfig
 else:
     PretrainedConfig = Any

     QuantizationConfig = Any

+    KVCacheConfig = Any
+
 logger = init_logger(__name__)


@@ -568,9 +571,6 @@ def __post_init__(self):
         if not current_platform.support_hybrid_kv_cache():
             # Hybrid KV cache manager is not supported on non-GPU platforms.
             self.scheduler_config.disable_hybrid_kv_cache_manager = True
-        if self.kv_transfer_config is not None:
-            # Hybrid KV cache manager is not compatible with KV transfer.
-            self.scheduler_config.disable_hybrid_kv_cache_manager = True
         if self.kv_events_config is not None:
             # Hybrid KV cache manager is not compatible with KV events.
             self.scheduler_config.disable_hybrid_kv_cache_manager = True
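
For quick reference, a condensed paraphrase (not the full method) of the checks that remain in __post_init__ after this hunk:

    # Remaining fallbacks: platform support and KV events still disable the
    # hybrid KV cache manager; kv_transfer_config alone no longer does.
    if not current_platform.support_hybrid_kv_cache():
        self.scheduler_config.disable_hybrid_kv_cache_manager = True
    if self.kv_events_config is not None:
        self.scheduler_config.disable_hybrid_kv_cache_manager = True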
