From d515360ccfde70121e16b5607b1d02a2d6c5c5fb Mon Sep 17 00:00:00 2001 From: Sourashis Roy Date: Sun, 6 Oct 2024 04:13:17 +0000 Subject: [PATCH 1/6] Fix BlockManager V2 when the encoder input is None --- vllm/core/block/block_table.py | 2 -- vllm/core/block_manager_v2.py | 3 ++- vllm/engine/arg_utils.py | 5 ----- 3 files changed, 2 insertions(+), 8 deletions(-) diff --git a/vllm/core/block/block_table.py b/vllm/core/block/block_table.py index a9f4bd871dfd..d10cb29ef4a7 100644 --- a/vllm/core/block/block_table.py +++ b/vllm/core/block/block_table.py @@ -220,7 +220,6 @@ def free(self) -> None: occupied by each block. After freeing all the blocks, the `_blocks` list is set to `None`. """ - assert self._is_allocated for block in self.blocks: self._allocator.free(block) self._blocks.reset() @@ -239,7 +238,6 @@ def physical_block_ids(self) -> List[int]: List[int]: A list of physical block indices for the blocks in the BlockTable. """ - assert self._is_allocated return self._blocks.ids() def get_unseen_token_ids(self, sequence_token_ids: List[int]) -> List[int]: diff --git a/vllm/core/block_manager_v2.py b/vllm/core/block_manager_v2.py index 0fad5fa99daf..2dd5479f54a7 100644 --- a/vllm/core/block_manager_v2.py +++ b/vllm/core/block_manager_v2.py @@ -151,7 +151,8 @@ def _allocate_sequence(self, seq: Sequence) -> BlockTable: block_allocator=self.block_allocator, max_block_sliding_window=self.max_block_sliding_window, ) - block_table.allocate(seq.get_token_ids()) + if (len(seq.get_token_ids()) > 0): + block_table.allocate(seq.get_token_ids()) return block_table diff --git a/vllm/engine/arg_utils.py b/vllm/engine/arg_utils.py index 1623ebb3aa74..cae95d20ca23 100644 --- a/vllm/engine/arg_utils.py +++ b/vllm/engine/arg_utils.py @@ -903,11 +903,6 @@ def create_engine_config(self) -> EngineConfig: "--enable-prefix-caching is currently not " "supported for multimodal models and has been disabled.") self.enable_prefix_caching = False - if model_config.is_encoder_decoder_model: - logger.warning( - "Block Manager v2 does not support encoder-decoder models" - " currently. Using Block Manager v1 as fallback.") - self.use_v2_block_manager = False cache_config = CacheConfig( block_size=self.block_size if self.device != "neuron" else From effc48d6b4b2dfb82fcbdf7f3da5ab43b700c21a Mon Sep 17 00:00:00 2001 From: Sourashis Roy Date: Sun, 6 Oct 2024 18:57:54 +0000 Subject: [PATCH 2/6] Comments --- vllm/core/block_manager_v2.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/vllm/core/block_manager_v2.py b/vllm/core/block_manager_v2.py index 2dd5479f54a7..752faa845ada 100644 --- a/vllm/core/block_manager_v2.py +++ b/vllm/core/block_manager_v2.py @@ -152,6 +152,8 @@ def _allocate_sequence(self, seq: Sequence) -> BlockTable: max_block_sliding_window=self.max_block_sliding_window, ) if (len(seq.get_token_ids()) > 0): + # Add blocks to the block table only if the encoder sequence + # is non empty. block_table.allocate(seq.get_token_ids()) return block_table From 3296228e48b29647f77712d1fa7020bc7214dd30 Mon Sep 17 00:00:00 2001 From: Sourashis Roy Date: Sun, 6 Oct 2024 19:00:28 +0000 Subject: [PATCH 3/6] Comment --- vllm/core/block_manager_v2.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/vllm/core/block_manager_v2.py b/vllm/core/block_manager_v2.py index 752faa845ada..81724a023f84 100644 --- a/vllm/core/block_manager_v2.py +++ b/vllm/core/block_manager_v2.py @@ -152,8 +152,7 @@ def _allocate_sequence(self, seq: Sequence) -> BlockTable: max_block_sliding_window=self.max_block_sliding_window, ) if (len(seq.get_token_ids()) > 0): - # Add blocks to the block table only if the encoder sequence - # is non empty. + # Add blocks to the block table only if the sequence is non empty. block_table.allocate(seq.get_token_ids()) return block_table From 2a484cb24dded46e713cf48476067972bb81bb58 Mon Sep 17 00:00:00 2001 From: sroy745 <142070531+sroy745@users.noreply.github.com> Date: Sun, 6 Oct 2024 12:42:38 -0700 Subject: [PATCH 4/6] Update vllm/core/block_manager_v2.py Co-authored-by: Cody Yu --- vllm/core/block_manager_v2.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/vllm/core/block_manager_v2.py b/vllm/core/block_manager_v2.py index 81724a023f84..c7ee6609306d 100644 --- a/vllm/core/block_manager_v2.py +++ b/vllm/core/block_manager_v2.py @@ -151,7 +151,7 @@ def _allocate_sequence(self, seq: Sequence) -> BlockTable: block_allocator=self.block_allocator, max_block_sliding_window=self.max_block_sliding_window, ) - if (len(seq.get_token_ids()) > 0): + if seq.get_token_ids(): # Add blocks to the block table only if the sequence is non empty. block_table.allocate(seq.get_token_ids()) From 56b446dd751778c2b2169e1527d24f523c7bac3c Mon Sep 17 00:00:00 2001 From: Sourashis Roy Date: Sun, 6 Oct 2024 23:55:56 +0000 Subject: [PATCH 5/6] Dummy --- vllm/core/block_manager_v2.py | 1 + 1 file changed, 1 insertion(+) diff --git a/vllm/core/block_manager_v2.py b/vllm/core/block_manager_v2.py index c7ee6609306d..1cd507126e3a 100644 --- a/vllm/core/block_manager_v2.py +++ b/vllm/core/block_manager_v2.py @@ -151,6 +151,7 @@ def _allocate_sequence(self, seq: Sequence) -> BlockTable: block_allocator=self.block_allocator, max_block_sliding_window=self.max_block_sliding_window, ) + if seq.get_token_ids(): # Add blocks to the block table only if the sequence is non empty. block_table.allocate(seq.get_token_ids()) From 27047a8d4087ed2245c7018e924b9f8496426991 Mon Sep 17 00:00:00 2001 From: Sourashis Roy Date: Sun, 6 Oct 2024 23:56:19 +0000 Subject: [PATCH 6/6] Dummy --- vllm/core/block_manager_v2.py | 1 - 1 file changed, 1 deletion(-) diff --git a/vllm/core/block_manager_v2.py b/vllm/core/block_manager_v2.py index 1cd507126e3a..c7ee6609306d 100644 --- a/vllm/core/block_manager_v2.py +++ b/vllm/core/block_manager_v2.py @@ -151,7 +151,6 @@ def _allocate_sequence(self, seq: Sequence) -> BlockTable: block_allocator=self.block_allocator, max_block_sliding_window=self.max_block_sliding_window, ) - if seq.get_token_ids(): # Add blocks to the block table only if the sequence is non empty. block_table.allocate(seq.get_token_ids())