From d1a91aa81e68ee29db6e0f66f387b4ec1aeb1c95 Mon Sep 17 00:00:00 2001 From: Sage Moore Date: Fri, 2 Feb 2024 15:42:42 -0500 Subject: [PATCH 01/79] init --- tests/prefix_caching/test_prefix_caching.py | 1 - vllm/core/block_manager.py | 79 ++++++++++----------- vllm/core/scheduler.py | 5 -- vllm/engine/llm_engine.py | 13 +--- vllm/sequence.py | 12 ++-- vllm/worker/model_runner.py | 8 +-- 6 files changed, 47 insertions(+), 71 deletions(-) diff --git a/tests/prefix_caching/test_prefix_caching.py b/tests/prefix_caching/test_prefix_caching.py index 1e301bedfc21e..dded5e1b0f7a4 100644 --- a/tests/prefix_caching/test_prefix_caching.py +++ b/tests/prefix_caching/test_prefix_caching.py @@ -38,4 +38,3 @@ def test_prefix_caching( outputs_without_prefix, outputs_with_prefix): assert (output_without_prefix.outputs[0].token_ids == output_with_prefix.outputs[0].token_ids) - assert len(llm.llm_engine.scheduler.prefix_pool.prefixes) == 1 diff --git a/vllm/core/block_manager.py b/vllm/core/block_manager.py index 7f91051f03ac1..7b6bbe6b60a4a 100644 --- a/vllm/core/block_manager.py +++ b/vllm/core/block_manager.py @@ -1,6 +1,7 @@ """A block manager that manages token blocks.""" import enum -from typing import Dict, List, Optional, Set, Tuple +from collections import deque +from typing import Dict, List, Optional, Set, Tuple, Deque from vllm.block import BlockTable, PhysicalTokenBlock from vllm.sequence import Sequence, SequenceGroup, SequenceStatus @@ -25,22 +26,38 @@ def __init__( self.block_size = block_size self.num_blocks = num_blocks + self.current_num_blocks = 0 + self.table: Dict[int, PhysicalTokenBlock] = {} # Initialize the free blocks. - self.free_blocks: BlockTable = [] - for i in range(num_blocks): - block = PhysicalTokenBlock(device=device, - block_number=i, - block_size=block_size) - self.free_blocks.append(block) + self.free_blocks: Deque[PhysicalTokenBlock] = deque() - def allocate(self) -> PhysicalTokenBlock: - if not self.free_blocks: - raise ValueError("Out of memory! No free blocks are available.") + def evict(self) -> PhysicalTokenBlock: + assert (len(self.free_blocks)) + # Find the block in the main hash table block = self.free_blocks.pop() - block.ref_count = 1 + key = list(self.table.keys())[list(self.table.values()).index()] + del self.table[key] + return block + + def allocate_block(self) -> PhysicalTokenBlock: + if self.current_num_blocks == self.num_blocks: + return self.evict() + block = PhysicalTokenBlock(device=self.device, + block_number=self.current_num_blocks, + block_size=self.block_size) + self.current_num_blocks += 1 + return block + + def allocate(self, i: int) -> PhysicalTokenBlock: + if i not in self.table: + self.table[i] = self.allocate_block() + block = self.table[i] + block.ref_count += 1 + # print(f"REFCOUNT ON ALLOCTION: {block}") return block def free(self, block: PhysicalTokenBlock) -> None: + # print(f"FREEING: {block}") if block.ref_count == 0: raise ValueError(f"Double free! 
{block} is already freed.") block.ref_count -= 1 @@ -48,7 +65,7 @@ def free(self, block: PhysicalTokenBlock) -> None: self.free_blocks.append(block) def get_num_free_blocks(self) -> int: - return len(self.free_blocks) + return self.num_blocks - self.current_num_blocks class AllocStatus(enum.Enum): @@ -103,9 +120,6 @@ def can_allocate(self, seq_group: SequenceGroup) -> AllocStatus: seq = seq_group.get_seqs(status=SequenceStatus.WAITING)[0] num_required_blocks = len(seq.logical_token_blocks) - if seq_group.prefix is not None and seq_group.prefix.allocated: - num_required_blocks -= seq_group.prefix.get_num_blocks() - if self.block_sliding_window is not None: num_required_blocks = min(num_required_blocks, self.block_sliding_window) @@ -129,36 +143,17 @@ def allocate(self, seq_group: SequenceGroup) -> None: num_prompt_blocks = len(seq.logical_token_blocks) block_table: BlockTable = [] - prefix_block_table: BlockTable = [] - num_prefix_blocks = 0 - - prefix = seq_group.prefix - if prefix is not None and prefix.allocated: - # Prefix has already been allocated. Use the existing block table. - num_prompt_blocks -= prefix.get_num_blocks() - for block in prefix.block_table: - block.ref_count += seq_group.num_seqs() - block_table.append(block) for logical_idx in range(num_prompt_blocks): if (self.block_sliding_window is not None and logical_idx >= self.block_sliding_window): block = block_table[logical_idx % self.block_sliding_window] else: - block = self.gpu_allocator.allocate() + block = self.gpu_allocator.allocate(seq.hash(logical_idx)) # Set the reference counts of the token blocks. - block.ref_count = seq_group.num_seqs() + # block.ref_count = seq_group.num_seqs() block_table.append(block) - if prefix is not None and not prefix.allocated: - # Allocate blocks for the prefix, we will compute the prefix's - # KV cache in this run. - num_prefix_blocks = prefix.get_num_blocks() - prefix_block_table = block_table[:num_prefix_blocks] - for block in prefix_block_table: - block.ref_count += 1 - prefix.set_block_table(prefix_block_table) - # Assign the block table for each sequence. for seq in seq_group.get_seqs(status=SequenceStatus.WAITING): self.block_tables[seq.seq_id] = block_table.copy() @@ -184,7 +179,8 @@ def append_slot(self, seq: Sequence) -> Optional[Tuple[int, int]]: else: # The sequence has a new logical block. # Allocate a new physical block. - block = self.gpu_allocator.allocate() + block = self.gpu_allocator.allocate( + seq.hash(len(logical_blocks) - 1)) block_table.append(block) return None @@ -197,7 +193,8 @@ def append_slot(self, seq: Sequence) -> Optional[Tuple[int, int]]: else: # The last block is shared with other sequences. # Copy on Write: Allocate a new block and copy the tokens. - new_block = self.gpu_allocator.allocate() + new_block = self.gpu_allocator.allocate( + seq.hash(len(logical_blocks) - 1)) block_table[-1] = new_block self.gpu_allocator.free(last_block) return last_block.block_number, new_block.block_number @@ -251,7 +248,8 @@ def swap_in(self, seq_group: SequenceGroup) -> Dict[int, int]: gpu_block = mapping[cpu_block] gpu_block.ref_count += 1 else: - gpu_block = self.gpu_allocator.allocate() + gpu_block = self.gpu_allocator.allocate( + seq.hash(len(seq.logical_blocks) - 1)) mapping[cpu_block] = gpu_block new_block_table.append(gpu_block) # Free the CPU block swapped in to GPU. 
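The hunks above replace free-list popping with a content-addressed lookup: allocate() now takes the hash of the tokens a logical block holds (seq.hash(logical_idx)), so a second sequence whose prompt shares that prefix gets the existing physical block back and only bumps its reference count. A minimal standalone sketch of the idea, using toy names rather than the real vLLM classes:

from dataclasses import dataclass
from typing import Dict

@dataclass
class ToyBlock:
    number: int
    ref_count: int = 0

class ToyHashedAllocator:
    """One physical block per distinct content hash."""

    def __init__(self) -> None:
        self.table: Dict[int, ToyBlock] = {}  # content hash -> physical block

    def allocate(self, block_hash: int) -> ToyBlock:
        # Cache hit: hand back the existing block instead of a fresh one.
        if block_hash not in self.table:
            self.table[block_hash] = ToyBlock(number=len(self.table))
        block = self.table[block_hash]
        block.ref_count += 1
        return block

allocator = ToyHashedAllocator()
h = hash((101, 102, 103, 104))      # stands in for Sequence.hash(logical_idx)
a = allocator.allocate(h)
b = allocator.allocate(h)           # second sequence, same prompt prefix
assert a is b and a.ref_count == 2  # shared prefix -> shared physical block

The real allocator additionally has to respect num_blocks and evict something once the table is full, which is what the eviction commits below build up.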
@@ -286,7 +284,8 @@ def swap_out(self, seq_group: SequenceGroup) -> Dict[int, int]: cpu_block = mapping[gpu_block] cpu_block.ref_count += 1 else: - cpu_block = self.cpu_allocator.allocate() + cpu_block = self.cpu_allocator.allocate( + seq.hash(len(seq.logical_blocks) - 1)) mapping[gpu_block] = cpu_block new_block_table.append(cpu_block) # Free the GPU block swapped out to CPU. diff --git a/vllm/core/scheduler.py b/vllm/core/scheduler.py index 4fdf9ec341cfd..213f9bb9cf30c 100644 --- a/vllm/core/scheduler.py +++ b/vllm/core/scheduler.py @@ -10,7 +10,6 @@ from vllm.logger import init_logger from vllm.sequence import (Sequence, SequenceData, SequenceGroup, SequenceGroupMetadata, SequenceStatus) -from vllm.prefix import PrefixPool logger = init_logger(__name__) @@ -97,9 +96,6 @@ def __init__( num_cpu_blocks=self.cache_config.num_cpu_blocks, sliding_window=self.cache_config.sliding_window) - # Create the prefix pool to cache the prefixes. - self.prefix_pool = PrefixPool(self.cache_config.block_size) - # Sequence groups in the WAITING state. self.waiting: Deque[SequenceGroup] = deque() # Sequence groups in the RUNNING state. @@ -383,7 +379,6 @@ def schedule(self) -> Tuple[List[SequenceGroupMetadata], SchedulerOutputs]: sampling_params=seq_group.sampling_params, block_tables=block_tables, lora_request=seq_group.lora_request, - prefix=seq_group.prefix, ) seq_group_metadata_list.append(seq_group_metadata) return seq_group_metadata_list, scheduler_outputs diff --git a/vllm/engine/llm_engine.py b/vllm/engine/llm_engine.py index 0dedc232292dd..8258eba4453cb 100644 --- a/vllm/engine/llm_engine.py +++ b/vllm/engine/llm_engine.py @@ -438,14 +438,9 @@ def add_request( seq = Sequence(seq_id, prompt, prompt_token_ids, block_size, lora_request) - # Check whether the input specifies prefix - prefix = self.scheduler.prefix_pool.add_or_get_prefix( - prompt_token_ids[:prefix_pos], lora_request.lora_int_id - if lora_request else 0) if prefix_pos is not None else None - # Create the sequence group. seq_group = SequenceGroup(request_id, [seq], sampling_params, - arrival_time, lora_request, prefix) + arrival_time, lora_request) # Add the sequence group to the scheduler. self.scheduler.add_seq_group(seq_group) @@ -720,12 +715,6 @@ def _process_model_outputs( request_output = RequestOutput.from_seq_group(seq_group) request_outputs.append(request_output) - # Update prefix state, now all the uncomputed prefixes are computed. - for seq_group in scheduled_seq_groups: - if (seq_group.prefix is not None and seq_group.prefix.allocated - and not seq_group.prefix.computed): - seq_group.prefix.computed = True - if self.log_stats: # Log the system stats. 
self._log_system_stats(scheduler_outputs.prompt_run, diff --git a/vllm/sequence.py b/vllm/sequence.py index d28627f47498f..ad50873e338b0 100644 --- a/vllm/sequence.py +++ b/vllm/sequence.py @@ -4,7 +4,6 @@ from typing import Dict, List, Optional, Union from vllm.block import LogicalTokenBlock -from vllm.prefix import Prefix from vllm.sampling_params import SamplingParams from vllm.lora.request import LoRARequest @@ -142,6 +141,12 @@ def __init__( def lora_int_id(self) -> int: return self.lora_request.lora_int_id if self.lora_request else 0 + def hash(self, logical_idx: int) -> int: + num_tokens = (logical_idx * self.block_size) + ( + self.block_size - + self.logical_token_blocks[logical_idx].get_num_empty_slots()) + return hash(tuple(self.data.get_token_ids()[0:num_tokens])) + def _append_logical_block(self) -> None: block = LogicalTokenBlock( block_number=len(self.logical_token_blocks), @@ -248,14 +253,12 @@ def __init__( sampling_params: SamplingParams, arrival_time: float, lora_request: Optional[LoRARequest] = None, - prefix: Optional[Prefix] = None, ) -> None: self.request_id = request_id self.seqs_dict = {seq.seq_id: seq for seq in seqs} self.sampling_params = sampling_params self.arrival_time = arrival_time self.lora_request = lora_request - self.prefix: Optional[Prefix] = prefix self.prompt_logprobs: Optional[PromptLogprobs] = None @property @@ -354,7 +357,6 @@ class SequenceGroupMetadata: block_tables: The block tables. (Seq id -> list of physical block numbers) lora_request: LoRA request. - prefix: The prefix of the prompt of the sequence group. """ def __init__( @@ -365,7 +367,6 @@ def __init__( sampling_params: SamplingParams, block_tables: Dict[int, List[int]], lora_request: Optional[LoRARequest] = None, - prefix: Optional[Prefix] = None, ) -> None: self.request_id = request_id self.is_prompt = is_prompt @@ -373,7 +374,6 @@ def __init__( self.sampling_params = sampling_params self.block_tables = block_tables self.lora_request = lora_request - self.prefix = prefix @property def lora_int_id(self) -> int: diff --git a/vllm/worker/model_runner.py b/vllm/worker/model_runner.py index 985115613e044..590eaab77901b 100644 --- a/vllm/worker/model_runner.py +++ b/vllm/worker/model_runner.py @@ -117,13 +117,7 @@ def _prepare_prompt( prompt_len = len(prompt_tokens) prompt_lens.append(prompt_len) prefix_len = 0 - prefix = seq_group_metadata.prefix - if prefix is not None and prefix.computed: - prefix_len = prefix.get_length() - prompt_tokens = prompt_tokens[prefix_len:] - prefix_block_tables.append(prefix.get_block_numbers()) - else: - prefix_block_tables.append([]) + prefix_block_tables.append([]) # actual prompt lens context_lens.append(prefix_len) subquery_lens.append(prompt_len - prefix_len) From ec211305e4f0f16ae680c9223b402973dd380422 Mon Sep 17 00:00:00 2001 From: ElizaWszola Date: Mon, 5 Feb 2024 09:02:42 -0500 Subject: [PATCH 02/79] Move evictor and eviction policy to a separate class --- vllm/core/block_manager.py | 48 +++++++++++++++++++++++++++++++------- 1 file changed, 39 insertions(+), 9 deletions(-) diff --git a/vllm/core/block_manager.py b/vllm/core/block_manager.py index 7b6bbe6b60a4a..2c5c445a08aef 100644 --- a/vllm/core/block_manager.py +++ b/vllm/core/block_manager.py @@ -7,6 +7,40 @@ from vllm.sequence import Sequence, SequenceGroup, SequenceStatus from vllm.utils import Device +class EvictionPolicy(enum.Enum): + """Enum for eviction policy used by BlockAllocator.""" + LRU = enum.auto() + + +class Evictor: + """Evicts physical blocks on cache based on eviction 
policy.""" + + def __init__( + self, + eviction_policy: EvictionPolicy = EvictionPolicy.LRU + ) -> None: + self.eviction_policy = eviction_policy + + # Initialize the free blocks. + self.free_blocks: Deque[PhysicalTokenBlock] = deque() + + def evict( + self, + table: Dict[int, PhysicalTokenBlock] + ) -> PhysicalTokenBlock: + match(self.eviction_policy): + case EvictionPolicy.LRU: + assert (len(self.free_blocks)) + # Find the block in the main hash table + block = self.free_blocks.pop() + key = list(table.keys())[list(table.values()).index()] + del table[key] + return block + case _: + raise ValueError(f"Unknown cache eviction policy: {self.eviction_policy}") + + def return_block(self, block: PhysicalTokenBlock) -> None: + self.free_blocks.append(block) class BlockAllocator: """Manages free physical token blocks for a device. @@ -21,23 +55,19 @@ def __init__( device: Device, block_size: int, num_blocks: int, + eviction_policy: EvictionPolicy = EvictionPolicy.LRU ) -> None: self.device = device self.block_size = block_size self.num_blocks = num_blocks + self.evictor = Evictor(eviction_policy) + self.current_num_blocks = 0 self.table: Dict[int, PhysicalTokenBlock] = {} - # Initialize the free blocks. - self.free_blocks: Deque[PhysicalTokenBlock] = deque() def evict(self) -> PhysicalTokenBlock: - assert (len(self.free_blocks)) - # Find the block in the main hash table - block = self.free_blocks.pop() - key = list(self.table.keys())[list(self.table.values()).index()] - del self.table[key] - return block + return self.evictor.evict(self.table) def allocate_block(self) -> PhysicalTokenBlock: if self.current_num_blocks == self.num_blocks: @@ -62,7 +92,7 @@ def free(self, block: PhysicalTokenBlock) -> None: raise ValueError(f"Double free! {block} is already freed.") block.ref_count -= 1 if block.ref_count == 0: - self.free_blocks.append(block) + self.evictor.return_block(block) def get_num_free_blocks(self) -> int: return self.num_blocks - self.current_num_blocks From 73ab52cbf8dd53eeaec642e830b0089cf73812e3 Mon Sep 17 00:00:00 2001 From: ElizaWszola Date: Mon, 5 Feb 2024 09:14:39 -0500 Subject: [PATCH 03/79] format, replace match with if-else --- vllm/core/block_manager.py | 48 +++++++++++++++++--------------------- 1 file changed, 22 insertions(+), 26 deletions(-) diff --git a/vllm/core/block_manager.py b/vllm/core/block_manager.py index 2c5c445a08aef..c182f702fa20b 100644 --- a/vllm/core/block_manager.py +++ b/vllm/core/block_manager.py @@ -7,41 +7,39 @@ from vllm.sequence import Sequence, SequenceGroup, SequenceStatus from vllm.utils import Device + class EvictionPolicy(enum.Enum): """Enum for eviction policy used by BlockAllocator.""" LRU = enum.auto() class Evictor: - """Evicts physical blocks on cache based on eviction policy.""" + """Evicts physical blocks from cache based on eviction policy.""" - def __init__( - self, - eviction_policy: EvictionPolicy = EvictionPolicy.LRU - ) -> None: + def __init__(self, + eviction_policy: EvictionPolicy = EvictionPolicy.LRU) -> None: self.eviction_policy = eviction_policy # Initialize the free blocks. 
self.free_blocks: Deque[PhysicalTokenBlock] = deque() - def evict( - self, - table: Dict[int, PhysicalTokenBlock] - ) -> PhysicalTokenBlock: - match(self.eviction_policy): - case EvictionPolicy.LRU: - assert (len(self.free_blocks)) - # Find the block in the main hash table - block = self.free_blocks.pop() - key = list(table.keys())[list(table.values()).index()] - del table[key] - return block - case _: - raise ValueError(f"Unknown cache eviction policy: {self.eviction_policy}") + def evict(self, table: Dict[int, + PhysicalTokenBlock]) -> PhysicalTokenBlock: + if self.eviction_policy == EvictionPolicy.LRU: + assert (len(self.free_blocks)) + # Find the block in the main hash table + block = self.free_blocks.pop() + key = list(table.keys())[list(table.values()).index()] + del table[key] + return block + else: + raise ValueError( + f"Unknown cache eviction policy: {self.eviction_policy}") def return_block(self, block: PhysicalTokenBlock) -> None: self.free_blocks.append(block) + class BlockAllocator: """Manages free physical token blocks for a device. @@ -50,13 +48,11 @@ class BlockAllocator: the reference count becomes zero, the block is added back to the free list. """ - def __init__( - self, - device: Device, - block_size: int, - num_blocks: int, - eviction_policy: EvictionPolicy = EvictionPolicy.LRU - ) -> None: + def __init__(self, + device: Device, + block_size: int, + num_blocks: int, + eviction_policy: EvictionPolicy = EvictionPolicy.LRU) -> None: self.device = device self.block_size = block_size self.num_blocks = num_blocks From 76b5290e6a3e32c17f8a0610605aee31ed45583c Mon Sep 17 00:00:00 2001 From: Sage Moore Date: Mon, 5 Feb 2024 10:32:54 -0500 Subject: [PATCH 04/79] shore up some of the eviction logic --- vllm/block.py | 3 +++ vllm/core/block_manager.py | 48 ++++++++++++++++++++++---------------- 2 files changed, 31 insertions(+), 20 deletions(-) diff --git a/vllm/block.py b/vllm/block.py index 5fe39ed47b2ff..d57173eb8a80b 100644 --- a/vllm/block.py +++ b/vllm/block.py @@ -55,6 +55,7 @@ def __init__( device: Device, block_number: int, block_size: int, + block_hash: int, ) -> None: self.device = device self.block_number = block_number @@ -62,6 +63,8 @@ def __init__( self.ref_count = 0 + self.block_hash = block_hash + def __repr__(self) -> str: return (f'PhysicalTokenBlock(device={self.device}, ' f'block_number={self.block_number}, ' diff --git a/vllm/core/block_manager.py b/vllm/core/block_manager.py index c182f702fa20b..b41ee6c9dc60d 100644 --- a/vllm/core/block_manager.py +++ b/vllm/core/block_manager.py @@ -23,21 +23,26 @@ def __init__(self, # Initialize the free blocks. 
self.free_blocks: Deque[PhysicalTokenBlock] = deque() - def evict(self, table: Dict[int, - PhysicalTokenBlock]) -> PhysicalTokenBlock: - if self.eviction_policy == EvictionPolicy.LRU: - assert (len(self.free_blocks)) - # Find the block in the main hash table - block = self.free_blocks.pop() - key = list(table.keys())[list(table.values()).index()] - del table[key] - return block - else: - raise ValueError( - f"Unknown cache eviction policy: {self.eviction_policy}") + def evict( + self, + table: Dict[int, PhysicalTokenBlock] + ) -> PhysicalTokenBlock: + if self.eviction_policy == EvictionPolicy.LRU: + assert (len(self.free_blocks)) + # Find the block in the main hash table + block = self.free_blocks.pop() + + # Continue poping blocks until we find one with a ref_count of 0 + while block.ref_count != 0: + block = self.free_blocks.pop() + + del table[block.block_hash] + return block + else: + raise ValueError(f"Unknown cache eviction policy: {self.eviction_policy}") def return_block(self, block: PhysicalTokenBlock) -> None: - self.free_blocks.append(block) + self.free_blocks.appendleft(block) class BlockAllocator: @@ -65,19 +70,22 @@ def __init__(self, def evict(self) -> PhysicalTokenBlock: return self.evictor.evict(self.table) - def allocate_block(self) -> PhysicalTokenBlock: + def allocate_block(self, block_hash: int) -> PhysicalTokenBlock: if self.current_num_blocks == self.num_blocks: - return self.evict() + block = self.evict() + block.block_hash = block_hash + return block block = PhysicalTokenBlock(device=self.device, block_number=self.current_num_blocks, - block_size=self.block_size) + block_size=self.block_size, + block_hash = block_hash) self.current_num_blocks += 1 return block - def allocate(self, i: int) -> PhysicalTokenBlock: - if i not in self.table: - self.table[i] = self.allocate_block() - block = self.table[i] + def allocate(self, block_hash: int) -> PhysicalTokenBlock: + if block_hash not in self.table: + self.table[block_hash] = self.allocate_block(block_hash) + block = self.table[block_hash] block.ref_count += 1 # print(f"REFCOUNT ON ALLOCTION: {block}") return block From fb9132bb0b6d1f315ad9862b1c534a8491ccd0ab Mon Sep 17 00:00:00 2001 From: Sage Moore Date: Mon, 5 Feb 2024 10:46:41 -0500 Subject: [PATCH 05/79] autoformat --- vllm/core/block_manager.py | 33 ++++++++++++++++----------------- 1 file changed, 16 insertions(+), 17 deletions(-) diff --git a/vllm/core/block_manager.py b/vllm/core/block_manager.py index b41ee6c9dc60d..11053a9998872 100644 --- a/vllm/core/block_manager.py +++ b/vllm/core/block_manager.py @@ -23,23 +23,22 @@ def __init__(self, # Initialize the free blocks. 
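With this commit the free deque starts behaving like an LRU list: freed blocks are pushed onto the left with appendleft() while evict() keeps taking from the right with pop(), so the block freed longest ago becomes the eviction candidate, and blocks whose ref_count has gone back up are popped past. A two-line illustration of that ordering, separate from the patch itself:

from collections import deque

free_blocks = deque()
free_blocks.appendleft("freed first")
free_blocks.appendleft("freed second")
assert free_blocks.pop() == "freed first"  # oldest freed block comes out first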
self.free_blocks: Deque[PhysicalTokenBlock] = deque() - def evict( - self, - table: Dict[int, PhysicalTokenBlock] - ) -> PhysicalTokenBlock: - if self.eviction_policy == EvictionPolicy.LRU: - assert (len(self.free_blocks)) - # Find the block in the main hash table + def evict(self, table: Dict[int, + PhysicalTokenBlock]) -> PhysicalTokenBlock: + if self.eviction_policy == EvictionPolicy.LRU: + assert (len(self.free_blocks)) + # Find the block in the main hash table + block = self.free_blocks.pop() + + # Continue poping blocks until we find one with a ref_count of 0 + while block.ref_count != 0: block = self.free_blocks.pop() - # Continue poping blocks until we find one with a ref_count of 0 - while block.ref_count != 0: - block = self.free_blocks.pop() - - del table[block.block_hash] - return block - else: - raise ValueError(f"Unknown cache eviction policy: {self.eviction_policy}") + del table[block.block_hash] + return block + else: + raise ValueError( + f"Unknown cache eviction policy: {self.eviction_policy}") def return_block(self, block: PhysicalTokenBlock) -> None: self.free_blocks.appendleft(block) @@ -77,8 +76,8 @@ def allocate_block(self, block_hash: int) -> PhysicalTokenBlock: return block block = PhysicalTokenBlock(device=self.device, block_number=self.current_num_blocks, - block_size=self.block_size, - block_hash = block_hash) + block_size=self.block_size, + block_hash=block_hash) self.current_num_blocks += 1 return block From c84bbdaa3dfc3574a1ec207f34a70537c6383b02 Mon Sep 17 00:00:00 2001 From: ElizaWszola Date: Tue, 6 Feb 2024 02:42:51 -0500 Subject: [PATCH 06/79] Test block hashing --- tests/test_cache_block_hashing.py | 80 +++++++++++++++++++++++++++++++ 1 file changed, 80 insertions(+) create mode 100644 tests/test_cache_block_hashing.py diff --git a/tests/test_cache_block_hashing.py b/tests/test_cache_block_hashing.py new file mode 100644 index 0000000000000..b36db3ce506ac --- /dev/null +++ b/tests/test_cache_block_hashing.py @@ -0,0 +1,80 @@ +"""Test hashing of cache blocks. + +Run `pytest tests/test_cache_block_hashing.py`. +""" +import pytest + +from vllm.transformers_utils.tokenizer import TokenizerGroup +from vllm.sequence import Sequence + +# Make two prefixes with different first blocks. +prefix_start = [("You are an expert"), ("You are a")] +prefix_common = ( + " school principal, skilled in effectively managing " + "faculty and staff. Draft 10-15 questions for a potential first grade " + "Head Teacher for my K-12, all-girls', independent school that emphasizes " + "community, joyful discovery, and life-long learning. The candidate is " + "coming in for a first-round panel interview for a 8th grade Math " + "teaching role. They have 5 years of previous teaching experience " + "as an assistant teacher at a co-ed, public school with experience " + "in middle school math teaching. Based on this, fulfill " + "the following: ") +prefixes = [start + prefix_common for start in prefix_start] + +# Sample prompts. +sample_prompts = [ + "Hello, my name is", + "The president of the United States is", + "The capital of France is", + "The future of AI is" +] + +# Helper function. 
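Before the helper and the test body below, it is worth spelling out the contract the assertions rely on: for a full block, Sequence.hash(i) hashes all tokens up to the end of block i, so prompts that share a prefix share hashes for the shared blocks and diverge from the first differing block onwards. A toy illustration with made-up token ids rather than real tokenizer output:

tokens_a = [1, 2, 3, 4, 5, 6, 7, 8]  # prompt A
tokens_b = [1, 2, 3, 4, 9, 9, 9, 9]  # prompt B, same first block only

def toy_block_hash(tokens, logical_idx, block_size=4):
    # Hash everything up to the end of block `logical_idx`, mirroring what
    # Sequence.hash does for full blocks.
    return hash(tuple(tokens[:(logical_idx + 1) * block_size]))

assert toy_block_hash(tokens_a, 0) == toy_block_hash(tokens_b, 0)  # shared prefix
assert toy_block_hash(tokens_a, 1) != toy_block_hash(tokens_b, 1)  # diverged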
+def flatten_2d(l): + return [lss for ls in l for lss in ls] + +@pytest.mark.parametrize("model", ["facebook/opt-125m"]) +@pytest.mark.parametrize("block_size", [16]) +@pytest.mark.parametrize("max_num_seqs", [256]) +def test_auto_prefix_caching( + model: str, + block_size: int, + max_num_seqs: int +): + + tokenizer = TokenizerGroup( + tokenizer_id="facebook/opt-125m", + enable_lora=False, + max_num_seqs=max_num_seqs, + max_input_length=None, + ) + + hashes = [] + + for prefix in prefixes: + hashes.append([]) + prompts = [prefix + prompt for prompt in sample_prompts] + seq_id = 0 + for prompt in prompts: + hashes[-1].append([]) + prompt_token_ids = tokenizer.encode(prompt) + seq = Sequence(seq_id, prompt, prompt_token_ids, block_size) + + num_blocks = len(prompt_token_ids) // block_size + for idx in range(num_blocks): + hashes[-1][-1].append(seq.hash(idx)) + + seq_id += 1 + + # Check that hashes made with two prefixes with different first blocks are + # different everywhere. + for hash0, hash1 in zip(flatten_2d(hashes[0]), flatten_2d(hashes[1])): + assert (hash0 != hash1) + + # Check that hashes of different prompts made with the same prefix are the + # same until the hashes that contain the prompt. + for hash_pref in hashes: + same_hashes = [tuple(h[:-1]) for h in hash_pref] + different_hashes = [h[-1] for h in hash_pref] + assert(len(set(same_hashes)) == 1) + assert(len(set(different_hashes)) == len(different_hashes)) From be146c070bae51b508df2e4a537e63532069b004 Mon Sep 17 00:00:00 2001 From: ElizaWszola Date: Tue, 6 Feb 2024 02:47:37 -0500 Subject: [PATCH 07/79] Format --- tests/test_cache_block_hashing.py | 26 +++++++++++--------------- 1 file changed, 11 insertions(+), 15 deletions(-) diff --git a/tests/test_cache_block_hashing.py b/tests/test_cache_block_hashing.py index b36db3ce506ac..f4eb90378eb0b 100644 --- a/tests/test_cache_block_hashing.py +++ b/tests/test_cache_block_hashing.py @@ -23,24 +23,20 @@ # Sample prompts. sample_prompts = [ - "Hello, my name is", - "The president of the United States is", - "The capital of France is", - "The future of AI is" + "Hello, my name is", "The president of the United States is", + "The capital of France is", "The future of AI is" ] + # Helper function. 
-def flatten_2d(l): - return [lss for ls in l for lss in ls] +def flatten_2d(li): + return [lss for ls in li for lss in ls] + @pytest.mark.parametrize("model", ["facebook/opt-125m"]) @pytest.mark.parametrize("block_size", [16]) @pytest.mark.parametrize("max_num_seqs", [256]) -def test_auto_prefix_caching( - model: str, - block_size: int, - max_num_seqs: int -): +def test_auto_prefix_caching(model: str, block_size: int, max_num_seqs: int): tokenizer = TokenizerGroup( tokenizer_id="facebook/opt-125m", @@ -59,11 +55,11 @@ def test_auto_prefix_caching( hashes[-1].append([]) prompt_token_ids = tokenizer.encode(prompt) seq = Sequence(seq_id, prompt, prompt_token_ids, block_size) - + num_blocks = len(prompt_token_ids) // block_size for idx in range(num_blocks): hashes[-1][-1].append(seq.hash(idx)) - + seq_id += 1 # Check that hashes made with two prefixes with different first blocks are @@ -76,5 +72,5 @@ def test_auto_prefix_caching( for hash_pref in hashes: same_hashes = [tuple(h[:-1]) for h in hash_pref] different_hashes = [h[-1] for h in hash_pref] - assert(len(set(same_hashes)) == 1) - assert(len(set(different_hashes)) == len(different_hashes)) + assert (len(set(same_hashes)) == 1) + assert (len(set(different_hashes)) == len(different_hashes)) From 063d2fb9394823be4428094dfa80ff1df18b0826 Mon Sep 17 00:00:00 2001 From: Sage Moore Date: Tue, 6 Feb 2024 09:11:39 -0500 Subject: [PATCH 08/79] added block allocator tests --- tests/prefix_caching/test_prefix_caching.py | 75 +++++++++++++++++++++ 1 file changed, 75 insertions(+) diff --git a/tests/prefix_caching/test_prefix_caching.py b/tests/prefix_caching/test_prefix_caching.py index dded5e1b0f7a4..e98fb67fbe5d5 100644 --- a/tests/prefix_caching/test_prefix_caching.py +++ b/tests/prefix_caching/test_prefix_caching.py @@ -5,6 +5,8 @@ import pytest from vllm import LLM, SamplingParams +from vllm.core.block_manager import BlockAllocator +from vllm.utils import Device prefix = ( "You are an expert school principal, skilled in effectively managing " @@ -18,6 +20,14 @@ "the following paragraph: ") +def allocate_all_blocks(block_allocator, num_blocks): + blocks = [] + for i in range(num_blocks): + # use i as the block_hash + blocks.append(block_allocator.allocate(i)) + return blocks + + @pytest.mark.parametrize("model", ["facebook/opt-125m"]) @pytest.mark.parametrize("max_tokens", [16]) def test_prefix_caching( @@ -38,3 +48,68 @@ def test_prefix_caching( outputs_without_prefix, outputs_with_prefix): assert (output_without_prefix.outputs[0].token_ids == output_with_prefix.outputs[0].token_ids) + + +@pytest.mark.parametrize("block_size", [16]) +@pytest.mark.parametrize("num_blocks", [16]) +def test_block_allocator( + block_size: int, + num_blocks: int, +): + block_hash = 1 + block_allocator = BlockAllocator(Device.CPU, block_size, num_blocks) + + # Allocate two PysicalTokenBlocks with the same hash and check that they are the same PhysicalTokenBlock + first_block = block_allocator.allocate(block_hash) + second_block = block_allocator.allocate(block_hash) + assert (first_block == second_block) + assert (second_block.ref_count == 2) + + # Free the first_block and confirm that the ref_count is correctly decremented on the second block + block_allocator.free(first_block) + assert (second_block.ref_count == 1) + + # Free the second block and confirm that the block ends up on the free list + block_allocator.free(second_block) + assert (len(block_allocator.evictor.free_blocks) == 1) + free_block = block_allocator.evictor.free_blocks[0] + assert (free_block == 
second_block) + + # Reallocate the first block and confirm that, even after the block had its ref_count go to 0, we still get the same block back + first_block = block_allocator.allocate(block_hash) + assert (first_block == second_block) + assert (first_block.block_hash == block_hash) + + +@pytest.mark.parametrize("num_blocks", [16]) +def test_eviction(num_blocks: int, ): + block_size = 16 + block_allocator = BlockAllocator(Device.CPU, block_size, num_blocks) + blocks = [] + + for i in range(num_blocks): + # use i as the block_hash + blocks.append(block_allocator.allocate(i)) + + #Free all blocks + for block in blocks: + block_allocator.free(block) + + # Allocate a new block and confirm that it's the first block freed. I.E The Least Recently Used block + new_block_hash = block_size + new_block = block_allocator.allocate(new_block_hash) + assert (new_block == blocks[0]) + assert (new_block.block_hash == new_block_hash) + + # Reallocate the second in blocks to remove it from the free list + realloc_block_hash = 1 + realloc_block = block_allocator.allocate(realloc_block_hash) + assert (realloc_block == blocks[realloc_block_hash]) + assert (realloc_block.block_hash == realloc_block_hash) + + # Allocate a new block and confirm that it's not the realloc_block, since the realloc_block shouldn't be in the free list + new_block_hash = block_size + 1 + new_block = block_allocator.allocate(new_block_hash) + assert (realloc_block != new_block) + assert (new_block.block_hash == new_block_hash) + assert (new_block.block_number == 2) From 15099d2efa73ff5a0cf157c710c66c3ef01b1b0f Mon Sep 17 00:00:00 2001 From: Sage Moore Date: Wed, 7 Feb 2024 15:17:37 -0500 Subject: [PATCH 09/79] added timestamps to the PhysicalTokenBlock and updated the eviction logic --- tests/prefix_caching/test_prefix_caching.py | 2 +- vllm/block.py | 3 + vllm/core/block_manager.py | 86 ++++++++++++++------- vllm/core/scheduler.py | 40 ++++++---- 4 files changed, 85 insertions(+), 46 deletions(-) diff --git a/tests/prefix_caching/test_prefix_caching.py b/tests/prefix_caching/test_prefix_caching.py index e98fb67fbe5d5..798d9f1973df5 100644 --- a/tests/prefix_caching/test_prefix_caching.py +++ b/tests/prefix_caching/test_prefix_caching.py @@ -72,7 +72,7 @@ def test_block_allocator( # Free the second block and confirm that the block ends up on the free list block_allocator.free(second_block) assert (len(block_allocator.evictor.free_blocks) == 1) - free_block = block_allocator.evictor.free_blocks[0] + free_block = block_allocator.evictor.free_blocks[block_hash] assert (free_block == second_block) # Reallocate the first block and confirm that, even after the block had its ref_count go to 0, we still get the same block back diff --git a/vllm/block.py b/vllm/block.py index d57173eb8a80b..13a4d4bb067f5 100644 --- a/vllm/block.py +++ b/vllm/block.py @@ -65,6 +65,9 @@ def __init__( self.block_hash = block_hash + #TODO: is this a good default? 
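On the TODO above: in the hunks that follow, last_accessed is stamped with time.monotonic() at the moment a block's reference count drops to zero, and eviction then picks the unreferenced block with the oldest stamp, so a zero default simply marks a block that has never been freed. A rough standalone sketch of that selection rule, with illustrative classes rather than the real ones:

from dataclasses import dataclass

@dataclass
class ToyBlock:
    block_hash: int
    ref_count: int
    last_accessed: float

def pick_lru_victim(table):
    # Only blocks nobody references any more are evictable; among those,
    # evict the one that was freed the longest time ago.
    candidates = [b for b in table.values() if b.ref_count == 0]
    assert candidates, "no evictable block"
    return min(candidates, key=lambda b: b.last_accessed)

table = {
    1: ToyBlock(1, ref_count=0, last_accessed=10.0),
    2: ToyBlock(2, ref_count=1, last_accessed=1.0),   # still referenced
    3: ToyBlock(3, ref_count=0, last_accessed=5.0),
}
assert pick_lru_victim(table).block_hash == 3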
+ self.last_accessed = 0 + def __repr__(self) -> str: return (f'PhysicalTokenBlock(device={self.device}, ' f'block_number={self.block_number}, ' diff --git a/vllm/core/block_manager.py b/vllm/core/block_manager.py index 11053a9998872..552ddee4b0e35 100644 --- a/vllm/core/block_manager.py +++ b/vllm/core/block_manager.py @@ -1,7 +1,7 @@ """A block manager that manages token blocks.""" import enum -from collections import deque -from typing import Dict, List, Optional, Set, Tuple, Deque +from time import monotonic +from typing import Dict, List, Optional, Set, Tuple from vllm.block import BlockTable, PhysicalTokenBlock from vllm.sequence import Sequence, SequenceGroup, SequenceStatus @@ -21,27 +21,42 @@ def __init__(self, self.eviction_policy = eviction_policy # Initialize the free blocks. - self.free_blocks: Deque[PhysicalTokenBlock] = deque() + self.free_blocks: Dict[int, PhysicalTokenBlock] = {} def evict(self, table: Dict[int, PhysicalTokenBlock]) -> PhysicalTokenBlock: if self.eviction_policy == EvictionPolicy.LRU: - assert (len(self.free_blocks)) - # Find the block in the main hash table - block = self.free_blocks.pop() - - # Continue poping blocks until we find one with a ref_count of 0 - while block.ref_count != 0: - block = self.free_blocks.pop() - - del table[block.block_hash] - return block + all_blocks: List[PhysicalTokenBlock] = list( + self.free_blocks.values()) + assert (len(all_blocks) > 0) + + # Find lowest timestamp + lowest_timestamp = all_blocks[0].last_accessed + for block in all_blocks: + assert (block.last_accessed != 0) + if block.last_accessed < lowest_timestamp: + lowest_timestamp = block.last_accessed + + # Find all blocks with the lowest timestamp + eviction_candidates: List[PhysicalTokenBlock] = [] + for block in all_blocks: + if block.last_accessed == lowest_timestamp: + eviction_candidates.append(block) + + # Arbitrarily evict the first candidate + # TODO: Evict based on the number of prefix tokens in the block + assert (len(eviction_candidates) > 0) + evicted_block = eviction_candidates[0] + del table[evicted_block.block_hash] + del self.free_blocks[evicted_block.block_hash] + + return evicted_block else: raise ValueError( f"Unknown cache eviction policy: {self.eviction_policy}") def return_block(self, block: PhysicalTokenBlock) -> None: - self.free_blocks.appendleft(block) + self.free_blocks[block.block_hash] = block class BlockAllocator: @@ -85,16 +100,23 @@ def allocate(self, block_hash: int) -> PhysicalTokenBlock: if block_hash not in self.table: self.table[block_hash] = self.allocate_block(block_hash) block = self.table[block_hash] + if self.evictor.free_blocks.get(block_hash) is not None: + del self.evictor.free_blocks[block_hash] + block.ref_count += 1 # print(f"REFCOUNT ON ALLOCTION: {block}") return block - def free(self, block: PhysicalTokenBlock) -> None: - # print(f"FREEING: {block}") + def free(self, + block: PhysicalTokenBlock, + now: Optional[int] = None) -> None: if block.ref_count == 0: raise ValueError(f"Double free! 
{block} is already freed.") block.ref_count -= 1 if block.ref_count == 0: + if now is None: + now = monotonic() + block.last_accessed = now self.evictor.return_block(block) def get_num_free_blocks(self) -> int: @@ -198,7 +220,9 @@ def can_append_slot(self, seq_group: SequenceGroup) -> bool: num_seqs = seq_group.num_seqs(status=SequenceStatus.RUNNING) return num_seqs <= num_free_gpu_blocks - def append_slot(self, seq: Sequence) -> Optional[Tuple[int, int]]: + def append_slot(self, + seq: Sequence, + now: Optional[float] = None) -> Optional[Tuple[int, int]]: """Allocate a physical slot for a new token.""" logical_blocks = seq.logical_token_blocks block_table = self.block_tables[seq.seq_id] @@ -229,7 +253,7 @@ def append_slot(self, seq: Sequence) -> Optional[Tuple[int, int]]: new_block = self.gpu_allocator.allocate( seq.hash(len(logical_blocks) - 1)) block_table[-1] = new_block - self.gpu_allocator.free(last_block) + self.gpu_allocator.free(last_block, now) return last_block.block_number, new_block.block_number def fork(self, parent_seq: Sequence, child_seq: Sequence) -> None: @@ -261,7 +285,9 @@ def can_swap_in(self, seq_group: SequenceGroup) -> bool: num_required_blocks = len(blocks) + num_swapped_seqs return num_free_blocks - num_required_blocks >= self.watermark_blocks - def swap_in(self, seq_group: SequenceGroup) -> Dict[int, int]: + def swap_in(self, + seq_group: SequenceGroup, + now: Optional[float] = None) -> Dict[int, int]: # CPU block -> GPU block. if seq_group.prefix is not None: # make sure to swap in the prefix first @@ -286,7 +312,7 @@ def swap_in(self, seq_group: SequenceGroup) -> Dict[int, int]: mapping[cpu_block] = gpu_block new_block_table.append(gpu_block) # Free the CPU block swapped in to GPU. - self.cpu_allocator.free(cpu_block) + self.cpu_allocator.free(cpu_block, now) self.block_tables[seq.seq_id] = new_block_table block_number_mapping = { @@ -299,7 +325,9 @@ def can_swap_out(self, seq_group: SequenceGroup) -> bool: blocks = self._get_physical_blocks(seq_group) return len(blocks) <= self.cpu_allocator.get_num_free_blocks() - def swap_out(self, seq_group: SequenceGroup) -> Dict[int, int]: + def swap_out(self, + seq_group: SequenceGroup, + now: Optional[float] = None) -> Dict[int, int]: # GPU block -> CPU block. mapping: Dict[PhysicalTokenBlock, PhysicalTokenBlock] = {} for seq in seq_group.get_seqs(status=SequenceStatus.RUNNING): @@ -310,7 +338,7 @@ def swap_out(self, seq_group: SequenceGroup) -> Dict[int, int]: if (seq_group.prefix is not None and gpu_block in seq_group.prefix.block_table): # NOTE: We do not swap out the prefix blocks for now. - self.gpu_allocator.free(gpu_block) + self.gpu_allocator.free(gpu_block, now) continue if gpu_block in mapping: @@ -322,7 +350,7 @@ def swap_out(self, seq_group: SequenceGroup) -> Dict[int, int]: mapping[gpu_block] = cpu_block new_block_table.append(cpu_block) # Free the GPU block swapped out to CPU. 
- self.gpu_allocator.free(gpu_block) + self.gpu_allocator.free(gpu_block, now) self.block_tables[seq.seq_id] = new_block_table block_number_mapping = { @@ -331,19 +359,21 @@ def swap_out(self, seq_group: SequenceGroup) -> Dict[int, int]: } return block_number_mapping - def _free_block_table(self, block_table: BlockTable) -> None: + def _free_block_table(self, + block_table: BlockTable, + now: Optional[float] = None) -> None: for block in set(block_table): if block.device == Device.GPU: - self.gpu_allocator.free(block) + self.gpu_allocator.free(block, now) else: - self.cpu_allocator.free(block) + self.cpu_allocator.free(block, now) - def free(self, seq: Sequence) -> None: + def free(self, seq: Sequence, now: Optional[float] = None) -> None: if seq.seq_id not in self.block_tables: # Already freed or haven't been scheduled yet. return block_table = self.block_tables[seq.seq_id] - self._free_block_table(block_table) + self._free_block_table(block_table, now) del self.block_tables[seq.seq_id] def reset(self) -> None: diff --git a/vllm/core/scheduler.py b/vllm/core/scheduler.py index 213f9bb9cf30c..158fec4d8d123 100644 --- a/vllm/core/scheduler.py +++ b/vllm/core/scheduler.py @@ -127,6 +127,8 @@ def abort_seq_group(self, request_id: Union[str, Iterable[str]]) -> None: if isinstance(request_id, str): request_id = (request_id, ) request_ids = set(request_id) + + now = time.monotonic() for state_queue in [self.waiting, self.running, self.swapped]: aborted_groups: List[SequenceGroup] = [] for seq_group in state_queue: @@ -145,7 +147,7 @@ def abort_seq_group(self, request_id: Union[str, Iterable[str]]) -> None: if seq.is_finished(): continue seq.status = SequenceStatus.FINISHED_ABORTED - self.free_seq(seq) + self.free_seq(seq, now) def has_unfinished_seqs(self) -> bool: return self.waiting or self.running or self.swapped @@ -279,17 +281,18 @@ def _schedule(self) -> SchedulerOutputs: if self.running: # Preempt the lowest-priority sequence groups. victim_seq_group = self.running.pop() - self._preempt(victim_seq_group, blocks_to_swap_out) + self._preempt(victim_seq_group, blocks_to_swap_out, None, + now) preempted.append(victim_seq_group) else: # No other sequence groups can be preempted. # Preempt the current sequence group. - self._preempt(seq_group, blocks_to_swap_out) + self._preempt(seq_group, blocks_to_swap_out, None, now) preempted.append(seq_group) break else: # Append new slots to the sequence group. 
- self._append_slot(seq_group, blocks_to_copy) + self._append_slot(seq_group, blocks_to_copy, now) running.append(seq_group) self.running = running @@ -331,7 +334,7 @@ def _schedule(self) -> SchedulerOutputs: if lora_int_id > 0: curr_loras.add(lora_int_id) self.swapped.popleft() - self._swap_in(seq_group, blocks_to_swap_in) + self._swap_in(seq_group, blocks_to_swap_in, now) self._append_slot(seq_group, blocks_to_copy) num_curr_seqs += num_new_seqs self.running.append(seq_group) @@ -386,8 +389,8 @@ def schedule(self) -> Tuple[List[SequenceGroupMetadata], SchedulerOutputs]: def fork_seq(self, parent_seq: Sequence, child_seq: Sequence) -> None: self.block_manager.fork(parent_seq, child_seq) - def free_seq(self, seq: Sequence) -> None: - self.block_manager.free(seq) + def free_seq(self, seq: Sequence, now: Optional[float] = None) -> None: + self.block_manager.free(seq, now) def free_finished_seq_groups(self) -> None: self.running = deque(seq_group for seq_group in self.running @@ -402,9 +405,10 @@ def _append_slot( self, seq_group: SequenceGroup, blocks_to_copy: Dict[int, List[int]], + now: Optional[float] = None, ) -> None: for seq in seq_group.get_seqs(status=SequenceStatus.RUNNING): - ret = self.block_manager.append_slot(seq) + ret = self.block_manager.append_slot(seq, now) if ret is not None: src_block, dst_block = ret if src_block in blocks_to_copy: @@ -417,6 +421,7 @@ def _preempt( seq_group: SequenceGroup, blocks_to_swap_out: Dict[int, int], preemption_mode: Optional[PreemptionMode] = None, + now: Optional[float] = None, ) -> None: # If preemption mode is not specified, we determine the mode as follows: # We use recomputation by default since it incurs lower overhead than @@ -435,7 +440,7 @@ def _preempt( else: preemption_mode = PreemptionMode.SWAP if preemption_mode == PreemptionMode.RECOMPUTE: - self._preempt_by_recompute(seq_group) + self._preempt_by_recompute(seq_group, now) elif preemption_mode == PreemptionMode.SWAP: self._preempt_by_swap(seq_group, blocks_to_swap_out) else: @@ -444,12 +449,13 @@ def _preempt( def _preempt_by_recompute( self, seq_group: SequenceGroup, + now: Optional[float] = None, ) -> None: seqs = seq_group.get_seqs(status=SequenceStatus.RUNNING) assert len(seqs) == 1 for seq in seqs: seq.status = SequenceStatus.WAITING - self.block_manager.free(seq) + self.block_manager.free(seq, now) # NOTE: For FCFS, we insert the preempted sequence group to the front # of the waiting queue. self.waiting.appendleft(seq_group) @@ -466,24 +472,24 @@ def _swap_in( self, seq_group: SequenceGroup, blocks_to_swap_in: Dict[int, int], + now: Optional[float] = None, ) -> None: - mapping = self.block_manager.swap_in(seq_group) + mapping = self.block_manager.swap_in(seq_group, now) blocks_to_swap_in.update(mapping) for seq in seq_group.get_seqs(status=SequenceStatus.SWAPPED): seq.status = SequenceStatus.RUNNING - def _swap_out( - self, - seq_group: SequenceGroup, - blocks_to_swap_out: Dict[int, int], - ) -> None: + def _swap_out(self, + seq_group: SequenceGroup, + blocks_to_swap_out: Dict[int, int], + now: Optional[float] = None) -> None: if not self.block_manager.can_swap_out(seq_group): # FIXME(woosuk): Abort the sequence group instead of aborting the # entire engine. raise RuntimeError( "Aborted due to the lack of CPU swap space. 
Please increase " "the swap space to avoid this error.") - mapping = self.block_manager.swap_out(seq_group) + mapping = self.block_manager.swap_out(seq_group, now) blocks_to_swap_out.update(mapping) for seq in seq_group.get_seqs(status=SequenceStatus.RUNNING): seq.status = SequenceStatus.SWAPPED From 9411e06088ed30fa70025b2a0d11835e436288f9 Mon Sep 17 00:00:00 2001 From: Sage Moore Date: Wed, 7 Feb 2024 17:30:55 -0500 Subject: [PATCH 10/79] Delete the free hash table from the evictor class --- tests/prefix_caching/test_prefix_caching.py | 5 +--- vllm/core/block_manager.py | 28 ++++++--------------- vllm/sequence.py | 1 + 3 files changed, 9 insertions(+), 25 deletions(-) diff --git a/tests/prefix_caching/test_prefix_caching.py b/tests/prefix_caching/test_prefix_caching.py index 798d9f1973df5..9fe77b57f3fb9 100644 --- a/tests/prefix_caching/test_prefix_caching.py +++ b/tests/prefix_caching/test_prefix_caching.py @@ -69,11 +69,8 @@ def test_block_allocator( block_allocator.free(first_block) assert (second_block.ref_count == 1) - # Free the second block and confirm that the block ends up on the free list + # Free the second block block_allocator.free(second_block) - assert (len(block_allocator.evictor.free_blocks) == 1) - free_block = block_allocator.evictor.free_blocks[block_hash] - assert (free_block == second_block) # Reallocate the first block and confirm that, even after the block had its ref_count go to 0, we still get the same block back first_block = block_allocator.allocate(block_hash) diff --git a/vllm/core/block_manager.py b/vllm/core/block_manager.py index 552ddee4b0e35..80f6ebc4e4b28 100644 --- a/vllm/core/block_manager.py +++ b/vllm/core/block_manager.py @@ -20,27 +20,23 @@ def __init__(self, eviction_policy: EvictionPolicy = EvictionPolicy.LRU) -> None: self.eviction_policy = eviction_policy - # Initialize the free blocks. - self.free_blocks: Dict[int, PhysicalTokenBlock] = {} - def evict(self, table: Dict[int, PhysicalTokenBlock]) -> PhysicalTokenBlock: if self.eviction_policy == EvictionPolicy.LRU: all_blocks: List[PhysicalTokenBlock] = list( - self.free_blocks.values()) + table.values()) assert (len(all_blocks) > 0) # Find lowest timestamp - lowest_timestamp = all_blocks[0].last_accessed + lowest_timestamp = monotonic() for block in all_blocks: - assert (block.last_accessed != 0) - if block.last_accessed < lowest_timestamp: + if block.ref_count == 0 and block.last_accessed < lowest_timestamp: lowest_timestamp = block.last_accessed # Find all blocks with the lowest timestamp eviction_candidates: List[PhysicalTokenBlock] = [] for block in all_blocks: - if block.last_accessed == lowest_timestamp: + if block.ref_count == 0 and block.last_accessed == lowest_timestamp: eviction_candidates.append(block) # Arbitrarily evict the first candidate @@ -48,17 +44,12 @@ def evict(self, table: Dict[int, assert (len(eviction_candidates) > 0) evicted_block = eviction_candidates[0] del table[evicted_block.block_hash] - del self.free_blocks[evicted_block.block_hash] return evicted_block else: raise ValueError( f"Unknown cache eviction policy: {self.eviction_policy}") - def return_block(self, block: PhysicalTokenBlock) -> None: - self.free_blocks[block.block_hash] = block - - class BlockAllocator: """Manages free physical token blocks for a device. 
@@ -100,9 +91,6 @@ def allocate(self, block_hash: int) -> PhysicalTokenBlock: if block_hash not in self.table: self.table[block_hash] = self.allocate_block(block_hash) block = self.table[block_hash] - if self.evictor.free_blocks.get(block_hash) is not None: - del self.evictor.free_blocks[block_hash] - block.ref_count += 1 # print(f"REFCOUNT ON ALLOCTION: {block}") return block @@ -113,11 +101,9 @@ def free(self, if block.ref_count == 0: raise ValueError(f"Double free! {block} is already freed.") block.ref_count -= 1 - if block.ref_count == 0: - if now is None: - now = monotonic() - block.last_accessed = now - self.evictor.return_block(block) + if now is None: + now = monotonic() + block.last_accessed = now def get_num_free_blocks(self) -> int: return self.num_blocks - self.current_num_blocks diff --git a/vllm/sequence.py b/vllm/sequence.py index 2134e4f872c67..b3fcd0303c4f1 100644 --- a/vllm/sequence.py +++ b/vllm/sequence.py @@ -141,6 +141,7 @@ def lora_int_id(self) -> int: return self.lora_request.lora_int_id if self.lora_request else 0 def hash(self, logical_idx: int) -> int: + # Compute the number of tokens in the sequence num_tokens = (logical_idx * self.block_size) + ( self.block_size - self.logical_token_blocks[logical_idx].get_num_empty_slots()) From 359b82901f04c3137175fa3ba66b6fcb102f182c Mon Sep 17 00:00:00 2001 From: Sage Moore Date: Wed, 7 Feb 2024 17:36:00 -0500 Subject: [PATCH 11/79] Remove the evictor class in favor of eviction free functions --- vllm/core/block_manager.py | 64 +++++++++++++++++--------------------- 1 file changed, 28 insertions(+), 36 deletions(-) diff --git a/vllm/core/block_manager.py b/vllm/core/block_manager.py index 80f6ebc4e4b28..62bded080ae4a 100644 --- a/vllm/core/block_manager.py +++ b/vllm/core/block_manager.py @@ -13,42 +13,30 @@ class EvictionPolicy(enum.Enum): LRU = enum.auto() -class Evictor: - """Evicts physical blocks from cache based on eviction policy.""" +def lru_eviction(table: Dict[int, PhysicalTokenBlock]) -> PhysicalTokenBlock: + all_blocks: List[PhysicalTokenBlock] = list(table.values()) + assert (len(all_blocks) > 0) - def __init__(self, - eviction_policy: EvictionPolicy = EvictionPolicy.LRU) -> None: - self.eviction_policy = eviction_policy + # Find lowest timestamp + lowest_timestamp = monotonic() + for block in all_blocks: + if block.ref_count == 0 and block.last_accessed < lowest_timestamp: + lowest_timestamp = block.last_accessed + + # Find all blocks with the lowest timestamp + eviction_candidates: List[PhysicalTokenBlock] = [] + for block in all_blocks: + if block.ref_count == 0 and block.last_accessed == lowest_timestamp: + eviction_candidates.append(block) + + # Arbitrarily evict the first candidate + # TODO: Evict based on the number of prefix tokens in the block + assert (len(eviction_candidates) > 0) + evicted_block = eviction_candidates[0] + del table[evicted_block.block_hash] + + return evicted_block - def evict(self, table: Dict[int, - PhysicalTokenBlock]) -> PhysicalTokenBlock: - if self.eviction_policy == EvictionPolicy.LRU: - all_blocks: List[PhysicalTokenBlock] = list( - table.values()) - assert (len(all_blocks) > 0) - - # Find lowest timestamp - lowest_timestamp = monotonic() - for block in all_blocks: - if block.ref_count == 0 and block.last_accessed < lowest_timestamp: - lowest_timestamp = block.last_accessed - - # Find all blocks with the lowest timestamp - eviction_candidates: List[PhysicalTokenBlock] = [] - for block in all_blocks: - if block.ref_count == 0 and block.last_accessed == lowest_timestamp: - 
eviction_candidates.append(block) - - # Arbitrarily evict the first candidate - # TODO: Evict based on the number of prefix tokens in the block - assert (len(eviction_candidates) > 0) - evicted_block = eviction_candidates[0] - del table[evicted_block.block_hash] - - return evicted_block - else: - raise ValueError( - f"Unknown cache eviction policy: {self.eviction_policy}") class BlockAllocator: """Manages free physical token blocks for a device. @@ -67,13 +55,17 @@ def __init__(self, self.block_size = block_size self.num_blocks = num_blocks - self.evictor = Evictor(eviction_policy) + self.eviction_policy = eviction_policy self.current_num_blocks = 0 self.table: Dict[int, PhysicalTokenBlock] = {} def evict(self) -> PhysicalTokenBlock: - return self.evictor.evict(self.table) + if self.eviction_policy == EvictionPolicy.LRU: + return lru_eviction(self.table) + else: + raise ValueError( + f"Unknown cache eviction policy: {self.eviction_policy}") def allocate_block(self, block_hash: int) -> PhysicalTokenBlock: if self.current_num_blocks == self.num_blocks: From c9b0be6fcd20bea452cc2e136beb8fa1d35d9c05 Mon Sep 17 00:00:00 2001 From: ElizaWszola Date: Thu, 8 Feb 2024 03:12:18 -0500 Subject: [PATCH 12/79] debugging in progress --- tests/test_cache_block_hashing.py | 4 ++++ vllm/core/block_manager.py | 11 +++++++++++ vllm/model_executor/weight_utils.py | 2 +- vllm/prefix.py | 1 + vllm/sequence.py | 3 +++ vllm/worker/model_runner.py | 4 ++++ 6 files changed, 24 insertions(+), 1 deletion(-) diff --git a/tests/test_cache_block_hashing.py b/tests/test_cache_block_hashing.py index f4eb90378eb0b..ea8559508b481 100644 --- a/tests/test_cache_block_hashing.py +++ b/tests/test_cache_block_hashing.py @@ -60,6 +60,10 @@ def test_auto_prefix_caching(model: str, block_size: int, max_num_seqs: int): for idx in range(num_blocks): hashes[-1][-1].append(seq.hash(idx)) + # Check that we can't hash incomplete blocks + with pytest.raises(ValueError) as e: + _ = seq.hash(num_blocks + 1) + seq_id += 1 # Check that hashes made with two prefixes with different first blocks are diff --git a/vllm/core/block_manager.py b/vllm/core/block_manager.py index 11053a9998872..943fa58f637d5 100644 --- a/vllm/core/block_manager.py +++ b/vllm/core/block_manager.py @@ -173,6 +173,8 @@ def allocate(self, seq_group: SequenceGroup) -> None: seq = seq_group.get_seqs(status=SequenceStatus.WAITING)[0] # Allocate new physical token blocks that will store the prompt tokens. + # num_prompt_blocks = seq.get_prompt_len() // self.block_size + num_prompt_blocks = len(seq.logical_token_blocks) block_table: BlockTable = [] @@ -182,11 +184,16 @@ def allocate(self, seq_group: SequenceGroup) -> None: and logical_idx >= self.block_sliding_window): block = block_table[logical_idx % self.block_sliding_window] else: + print(f"hash allocate {logical_idx}") block = self.gpu_allocator.allocate(seq.hash(logical_idx)) # Set the reference counts of the token blocks. # block.ref_count = seq_group.num_seqs() block_table.append(block) + # Append incomplete block to seq if any + # if num_prompt_blocks * self.block_size < seq.get_prompt_len(): + # # TODO + # Assign the block table for each sequence. for seq in seq_group.get_seqs(status=SequenceStatus.WAITING): self.block_tables[seq.seq_id] = block_table.copy() @@ -212,6 +219,7 @@ def append_slot(self, seq: Sequence) -> Optional[Tuple[int, int]]: else: # The sequence has a new logical block. # Allocate a new physical block. 
+ print("hash append_slot 1") block = self.gpu_allocator.allocate( seq.hash(len(logical_blocks) - 1)) block_table.append(block) @@ -226,6 +234,7 @@ def append_slot(self, seq: Sequence) -> Optional[Tuple[int, int]]: else: # The last block is shared with other sequences. # Copy on Write: Allocate a new block and copy the tokens. + print("hash append_slot 2") new_block = self.gpu_allocator.allocate( seq.hash(len(logical_blocks) - 1)) block_table[-1] = new_block @@ -281,6 +290,7 @@ def swap_in(self, seq_group: SequenceGroup) -> Dict[int, int]: gpu_block = mapping[cpu_block] gpu_block.ref_count += 1 else: + print("hash swap_in 1") gpu_block = self.gpu_allocator.allocate( seq.hash(len(seq.logical_blocks) - 1)) mapping[cpu_block] = gpu_block @@ -317,6 +327,7 @@ def swap_out(self, seq_group: SequenceGroup) -> Dict[int, int]: cpu_block = mapping[gpu_block] cpu_block.ref_count += 1 else: + print("hash swap_in 2") cpu_block = self.cpu_allocator.allocate( seq.hash(len(seq.logical_blocks) - 1)) mapping[gpu_block] = cpu_block diff --git a/vllm/model_executor/weight_utils.py b/vllm/model_executor/weight_utils.py index 3570366887e78..a00062b8ddd1d 100644 --- a/vllm/model_executor/weight_utils.py +++ b/vllm/model_executor/weight_utils.py @@ -28,7 +28,7 @@ def __init__(self, *args, **kwargs): def get_lock(model_name_or_path: str, cache_dir: Optional[str] = None): - lock_dir = cache_dir if cache_dir is not None else "/tmp" + lock_dir = cache_dir if cache_dir is not None else "~/vllm_cache" lock_file_name = model_name_or_path.replace("/", "-") + ".lock" lock = filelock.FileLock(os.path.join(lock_dir, lock_file_name)) return lock diff --git a/vllm/prefix.py b/vllm/prefix.py index 5b6e8e4b92be6..4b780161a5278 100644 --- a/vllm/prefix.py +++ b/vllm/prefix.py @@ -74,6 +74,7 @@ def _truncate_token_ids(self, token_ids: Sequence[int]) -> Tuple[int]: new_length = len(token_ids) // self.block_size * self.block_size return tuple(token_ids[:new_length]) + # TODO clean this up? 
It's not used anywhere now def add_or_get_prefix(self, token_ids: Sequence[int], lora_int_id: int) -> Optional[Prefix]: token_ids = self._truncate_token_ids(token_ids) diff --git a/vllm/sequence.py b/vllm/sequence.py index 2134e4f872c67..ce092900483c1 100644 --- a/vllm/sequence.py +++ b/vllm/sequence.py @@ -144,6 +144,9 @@ def hash(self, logical_idx: int) -> int: num_tokens = (logical_idx * self.block_size) + ( self.block_size - self.logical_token_blocks[logical_idx].get_num_empty_slots()) + # num_tokens = logical_idx * self.block_size + self.block_size + # if num_tokens > len(self.data.get_token_ids()): + # raise ValueError(f"Can't hash incomplete block (block {logical_idx} needs hashing {num_tokens} tokens, but only {len(self.data.get_token_ids())} are present).") return hash(tuple(self.data.get_token_ids()[0:num_tokens])) def _append_logical_block(self) -> None: diff --git a/vllm/worker/model_runner.py b/vllm/worker/model_runner.py index 5908d577e1a28..907208d065c36 100644 --- a/vllm/worker/model_runner.py +++ b/vllm/worker/model_runner.py @@ -172,6 +172,8 @@ def _prepare_prompt( slot_mapping[-1].append(_PAD_SLOT_ID) continue + print(block_table) + print(f"prepare {i}") block_number = block_table[i // self.block_size] block_offset = i % self.block_size slot = block_number * self.block_size + block_offset @@ -200,6 +202,8 @@ def _prepare_prompt( context_lens_tensor = torch.tensor(context_lens, dtype=torch.int, device=self.device) + + print("prefix block tables:", prefix_block_tables) # Prepare prefix block tables max_prompt_block_table_len = max(len(t) for t in prefix_block_tables) block_tables = _make_tensor_with_pad( From 6218d1a7f2bbd691530708f86fd3708941f898cf Mon Sep 17 00:00:00 2001 From: ElizaWszola Date: Thu, 8 Feb 2024 09:46:16 -0500 Subject: [PATCH 13/79] partial block support --- tests/test_cache_block_hashing.py | 4 ---- vllm/core/block_manager.py | 35 +++++++++++++++++------------ vllm/model_executor/weight_utils.py | 2 +- vllm/sequence.py | 11 ++++----- vllm/worker/model_runner.py | 4 ---- 5 files changed, 26 insertions(+), 30 deletions(-) diff --git a/tests/test_cache_block_hashing.py b/tests/test_cache_block_hashing.py index ea8559508b481..f4eb90378eb0b 100644 --- a/tests/test_cache_block_hashing.py +++ b/tests/test_cache_block_hashing.py @@ -60,10 +60,6 @@ def test_auto_prefix_caching(model: str, block_size: int, max_num_seqs: int): for idx in range(num_blocks): hashes[-1][-1].append(seq.hash(idx)) - # Check that we can't hash incomplete blocks - with pytest.raises(ValueError) as e: - _ = seq.hash(num_blocks + 1) - seq_id += 1 # Check that hashes made with two prefixes with different first blocks are diff --git a/vllm/core/block_manager.py b/vllm/core/block_manager.py index d59fae743449f..04cd314c19de7 100644 --- a/vllm/core/block_manager.py +++ b/vllm/core/block_manager.py @@ -100,6 +100,12 @@ def free(self, def get_num_free_blocks(self) -> int: return self.num_blocks - self.current_num_blocks + def update_hash(self, block_hash: int, block: PhysicalTokenBlock) -> None: + old_hash = block.block_hash + del self.table[old_hash] + self.table[block_hash] = block + block.block_hash = block_hash + class AllocStatus(enum.Enum): """Result for BlockSpaceManager.can_allocate @@ -146,6 +152,7 @@ def __init__( num_cpu_blocks) # Mapping: seq_id -> BlockTable. 
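The partial_block_table introduced just below exists because the last block of a sequence cannot be shared by content hash until it is full: its hash would change with every appended token. While it is filling, the block is tracked per sequence in partial_block_table; once append_slot sees it become full, the block is re-keyed with the real content hash via update_hash. A toy illustration of why the hash only stabilizes at that point, using made-up token ids:

tokens = [7, 8, 9]                       # a block of size 4, one slot still empty
hash_while_partial = hash(tuple(tokens))
tokens.append(10)                        # the block becomes full
hash_when_full = hash(tuple(tokens))
assert hash_while_partial != hash_when_full  # identity only stable once full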
self.block_tables: Dict[int, BlockTable] = {} + self.partial_block_table: Dict[int, PhysicalTokenBlock] = {} def can_allocate(self, seq_group: SequenceGroup) -> AllocStatus: # FIXME(woosuk): Here we assume that all sequences in the group share @@ -173,7 +180,6 @@ def allocate(self, seq_group: SequenceGroup) -> None: seq = seq_group.get_seqs(status=SequenceStatus.WAITING)[0] # Allocate new physical token blocks that will store the prompt tokens. - # num_prompt_blocks = seq.get_prompt_len() // self.block_size num_prompt_blocks = len(seq.logical_token_blocks) @@ -184,16 +190,14 @@ def allocate(self, seq_group: SequenceGroup) -> None: and logical_idx >= self.block_sliding_window): block = block_table[logical_idx % self.block_sliding_window] else: - print(f"hash allocate {logical_idx}") block = self.gpu_allocator.allocate(seq.hash(logical_idx)) + if logical_idx * self.block_size + self.block_size > len( + seq.data.get_token_ids()): + self.partial_block_table[seq.seq_id] = block # Set the reference counts of the token blocks. # block.ref_count = seq_group.num_seqs() block_table.append(block) - # Append incomplete block to seq if any - # if num_prompt_blocks * self.block_size < seq.get_prompt_len(): - # # TODO - # Assign the block table for each sequence. for seq in seq_group.get_seqs(status=SequenceStatus.WAITING): self.block_tables[seq.seq_id] = block_table.copy() @@ -212,6 +216,7 @@ def append_slot(self, logical_blocks = seq.logical_token_blocks block_table = self.block_tables[seq.seq_id] + # If we need to allocate a new physical block if len(block_table) < len(logical_blocks): if (self.block_sliding_window and len(block_table) >= self.block_sliding_window): @@ -221,10 +226,10 @@ def append_slot(self, else: # The sequence has a new logical block. # Allocate a new physical block. - print("hash append_slot 1") - block = self.gpu_allocator.allocate( - seq.hash(len(logical_blocks) - 1)) - block_table.append(block) + assert (seq.seq_id not in self.partial_block_table) + self.partial_block_table[ + seq.seq_id] = self.gpu_allocator.allocate(seq.seq_id) + block_table.append(self.partial_block_table[seq.seq_id]) return None # We want to append the token to the last physical block. @@ -232,13 +237,17 @@ def append_slot(self, assert last_block.device == Device.GPU if last_block.ref_count == 1: # Not shared with other sequences. Appendable. - return None + if len(seq.data.get_token_ids()) % seq.block_size == 0: + del self.partial_block_table[seq.seq_id] + new_hash = seq.hash(len(logical_blocks) - 1) + self.gpu_allocator.update_hash(new_hash, last_block) + return None else: # The last block is shared with other sequences. # Copy on Write: Allocate a new block and copy the tokens. 
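# Minimal sketch of the copy-on-write decision made in append_slot above,
# using a tiny stand-in block type rather than the real PhysicalTokenBlock.
# If the last block is only referenced by this sequence it can be written in
# place; otherwise a fresh block is allocated and the caller is told to copy
# src -> dst before appending.
from dataclasses import dataclass
from typing import Callable, Optional, Tuple

@dataclass
class _Block:
    block_number: int
    ref_count: int = 1

def copy_on_write(last_block: _Block,
                  allocate: Callable[[], _Block]) -> Optional[Tuple[int, int]]:
    if last_block.ref_count == 1:
        return None                      # appendable in place, no copy needed
    new_block = allocate()               # hypothetical allocator callback
    last_block.ref_count -= 1            # this sequence releases the shared block
    return last_block.block_number, new_block.block_number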
- print("hash append_slot 2") new_block = self.gpu_allocator.allocate( seq.hash(len(logical_blocks) - 1)) + assert (new_block.ref_count == 1) block_table[-1] = new_block self.gpu_allocator.free(last_block, now) return last_block.block_number, new_block.block_number @@ -294,7 +303,6 @@ def swap_in(self, gpu_block = mapping[cpu_block] gpu_block.ref_count += 1 else: - print("hash swap_in 1") gpu_block = self.gpu_allocator.allocate( seq.hash(len(seq.logical_blocks) - 1)) mapping[cpu_block] = gpu_block @@ -333,7 +341,6 @@ def swap_out(self, cpu_block = mapping[gpu_block] cpu_block.ref_count += 1 else: - print("hash swap_in 2") cpu_block = self.cpu_allocator.allocate( seq.hash(len(seq.logical_blocks) - 1)) mapping[gpu_block] = cpu_block diff --git a/vllm/model_executor/weight_utils.py b/vllm/model_executor/weight_utils.py index a00062b8ddd1d..3570366887e78 100644 --- a/vllm/model_executor/weight_utils.py +++ b/vllm/model_executor/weight_utils.py @@ -28,7 +28,7 @@ def __init__(self, *args, **kwargs): def get_lock(model_name_or_path: str, cache_dir: Optional[str] = None): - lock_dir = cache_dir if cache_dir is not None else "~/vllm_cache" + lock_dir = cache_dir if cache_dir is not None else "/tmp" lock_file_name = model_name_or_path.replace("/", "-") + ".lock" lock = filelock.FileLock(os.path.join(lock_dir, lock_file_name)) return lock diff --git a/vllm/sequence.py b/vllm/sequence.py index 8e505e437105b..295c7e51b5a01 100644 --- a/vllm/sequence.py +++ b/vllm/sequence.py @@ -142,13 +142,10 @@ def lora_int_id(self) -> int: def hash(self, logical_idx: int) -> int: # Compute the number of tokens in the sequence - num_tokens = (logical_idx * self.block_size) + ( - self.block_size - - self.logical_token_blocks[logical_idx].get_num_empty_slots()) - # num_tokens = logical_idx * self.block_size + self.block_size - # if num_tokens > len(self.data.get_token_ids()): - # raise ValueError(f"Can't hash incomplete block (block {logical_idx} needs hashing {num_tokens} tokens, but only {len(self.data.get_token_ids())} are present).") - return hash(tuple(self.data.get_token_ids()[0:num_tokens])) + num_tokens = logical_idx * self.block_size + self.block_size + return hash( + tuple(self.data.get_token_ids() + [0:min(num_tokens, len(self.data.get_token_ids()))])) def _append_logical_block(self) -> None: block = LogicalTokenBlock( diff --git a/vllm/worker/model_runner.py b/vllm/worker/model_runner.py index 907208d065c36..5908d577e1a28 100644 --- a/vllm/worker/model_runner.py +++ b/vllm/worker/model_runner.py @@ -172,8 +172,6 @@ def _prepare_prompt( slot_mapping[-1].append(_PAD_SLOT_ID) continue - print(block_table) - print(f"prepare {i}") block_number = block_table[i // self.block_size] block_offset = i % self.block_size slot = block_number * self.block_size + block_offset @@ -202,8 +200,6 @@ def _prepare_prompt( context_lens_tensor = torch.tensor(context_lens, dtype=torch.int, device=self.device) - - print("prefix block tables:", prefix_block_tables) # Prepare prefix block tables max_prompt_block_table_len = max(len(t) for t in prefix_block_tables) block_tables = _make_tensor_with_pad( From b35819d3204541ed8ba5fbf14f253302f248f659 Mon Sep 17 00:00:00 2001 From: Sage Moore Date: Thu, 8 Feb 2024 16:08:50 -0500 Subject: [PATCH 14/79] Move PhysicalTokenBlock.last_accessed updates to the block_manager/scheduler --- vllm/core/block_manager.py | 45 ++++++++++++++------------------- vllm/core/scheduler.py | 52 ++++++++++++++------------------------ 2 files changed, 38 insertions(+), 59 deletions(-) diff --git 
a/vllm/core/block_manager.py b/vllm/core/block_manager.py index 04cd314c19de7..e9f94deca425d 100644 --- a/vllm/core/block_manager.py +++ b/vllm/core/block_manager.py @@ -87,15 +87,10 @@ def allocate(self, block_hash: int) -> PhysicalTokenBlock: # print(f"REFCOUNT ON ALLOCTION: {block}") return block - def free(self, - block: PhysicalTokenBlock, - now: Optional[int] = None) -> None: + def free(self, block: PhysicalTokenBlock) -> None: if block.ref_count == 0: raise ValueError(f"Double free! {block} is already freed.") block.ref_count -= 1 - if now is None: - now = monotonic() - block.last_accessed = now def get_num_free_blocks(self) -> int: return self.num_blocks - self.current_num_blocks @@ -209,9 +204,7 @@ def can_append_slot(self, seq_group: SequenceGroup) -> bool: num_seqs = seq_group.num_seqs(status=SequenceStatus.RUNNING) return num_seqs <= num_free_gpu_blocks - def append_slot(self, - seq: Sequence, - now: Optional[float] = None) -> Optional[Tuple[int, int]]: + def append_slot(self, seq: Sequence) -> Optional[Tuple[int, int]]: """Allocate a physical slot for a new token.""" logical_blocks = seq.logical_token_blocks block_table = self.block_tables[seq.seq_id] @@ -249,7 +242,7 @@ def append_slot(self, seq.hash(len(logical_blocks) - 1)) assert (new_block.ref_count == 1) block_table[-1] = new_block - self.gpu_allocator.free(last_block, now) + self.gpu_allocator.free(last_block) return last_block.block_number, new_block.block_number def fork(self, parent_seq: Sequence, child_seq: Sequence) -> None: @@ -281,9 +274,7 @@ def can_swap_in(self, seq_group: SequenceGroup) -> bool: num_required_blocks = len(blocks) + num_swapped_seqs return num_free_blocks - num_required_blocks >= self.watermark_blocks - def swap_in(self, - seq_group: SequenceGroup, - now: Optional[float] = None) -> Dict[int, int]: + def swap_in(self, seq_group: SequenceGroup) -> Dict[int, int]: # CPU block -> GPU block. if seq_group.prefix is not None: # make sure to swap in the prefix first @@ -308,7 +299,7 @@ def swap_in(self, mapping[cpu_block] = gpu_block new_block_table.append(gpu_block) # Free the CPU block swapped in to GPU. - self.cpu_allocator.free(cpu_block, now) + self.cpu_allocator.free(cpu_block) self.block_tables[seq.seq_id] = new_block_table block_number_mapping = { @@ -321,9 +312,7 @@ def can_swap_out(self, seq_group: SequenceGroup) -> bool: blocks = self._get_physical_blocks(seq_group) return len(blocks) <= self.cpu_allocator.get_num_free_blocks() - def swap_out(self, - seq_group: SequenceGroup, - now: Optional[float] = None) -> Dict[int, int]: + def swap_out(self, seq_group: SequenceGroup) -> Dict[int, int]: # GPU block -> CPU block. mapping: Dict[PhysicalTokenBlock, PhysicalTokenBlock] = {} for seq in seq_group.get_seqs(status=SequenceStatus.RUNNING): @@ -334,7 +323,7 @@ def swap_out(self, if (seq_group.prefix is not None and gpu_block in seq_group.prefix.block_table): # NOTE: We do not swap out the prefix blocks for now. - self.gpu_allocator.free(gpu_block, now) + self.gpu_allocator.free(gpu_block) continue if gpu_block in mapping: @@ -346,7 +335,7 @@ def swap_out(self, mapping[gpu_block] = cpu_block new_block_table.append(cpu_block) # Free the GPU block swapped out to CPU. 
- self.gpu_allocator.free(gpu_block, now) + self.gpu_allocator.free(gpu_block) self.block_tables[seq.seq_id] = new_block_table block_number_mapping = { @@ -355,21 +344,19 @@ def swap_out(self, } return block_number_mapping - def _free_block_table(self, - block_table: BlockTable, - now: Optional[float] = None) -> None: + def _free_block_table(self, block_table: BlockTable) -> None: for block in set(block_table): if block.device == Device.GPU: - self.gpu_allocator.free(block, now) + self.gpu_allocator.free(block) else: - self.cpu_allocator.free(block, now) + self.cpu_allocator.free(block) - def free(self, seq: Sequence, now: Optional[float] = None) -> None: + def free(self, seq: Sequence) -> None: if seq.seq_id not in self.block_tables: # Already freed or haven't been scheduled yet. return block_table = self.block_tables[seq.seq_id] - self._free_block_table(block_table, now) + self._free_block_table(block_table) del self.block_tables[seq.seq_id] def reset(self) -> None: @@ -386,3 +373,9 @@ def get_num_free_gpu_blocks(self) -> int: def get_num_free_cpu_blocks(self) -> int: return self.cpu_allocator.get_num_free_blocks() + + def access_all_blocks_in_seq(self, seq: Sequence, + access_time: float) -> None: + block_table = self.block_tables[seq.seq_id] + for block in block_table: + block.last_accessed = access_time diff --git a/vllm/core/scheduler.py b/vllm/core/scheduler.py index 158fec4d8d123..4e92634ea76c9 100644 --- a/vllm/core/scheduler.py +++ b/vllm/core/scheduler.py @@ -128,7 +128,6 @@ def abort_seq_group(self, request_id: Union[str, Iterable[str]]) -> None: request_id = (request_id, ) request_ids = set(request_id) - now = time.monotonic() for state_queue in [self.waiting, self.running, self.swapped]: aborted_groups: List[SequenceGroup] = [] for seq_group in state_queue: @@ -147,7 +146,7 @@ def abort_seq_group(self, request_id: Union[str, Iterable[str]]) -> None: if seq.is_finished(): continue seq.status = SequenceStatus.FINISHED_ABORTED - self.free_seq(seq, now) + self.free_seq(seq) def has_unfinished_seqs(self) -> bool: return self.waiting or self.running or self.swapped @@ -292,7 +291,7 @@ def _schedule(self) -> SchedulerOutputs: break else: # Append new slots to the sequence group. - self._append_slot(seq_group, blocks_to_copy, now) + self._append_slot(seq_group, blocks_to_copy) running.append(seq_group) self.running = running @@ -334,7 +333,7 @@ def _schedule(self) -> SchedulerOutputs: if lora_int_id > 0: curr_loras.add(lora_int_id) self.swapped.popleft() - self._swap_in(seq_group, blocks_to_swap_in, now) + self._swap_in(seq_group, blocks_to_swap_in) self._append_slot(seq_group, blocks_to_copy) num_curr_seqs += num_new_seqs self.running.append(seq_group) @@ -365,6 +364,7 @@ def schedule(self) -> Tuple[List[SequenceGroupMetadata], SchedulerOutputs]: # such as self.running, self.swapped, and self.waiting. scheduler_outputs = self._schedule() + now = time.monotonic() # Create input data structures. 
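# Sketch of how the access_all_blocks_in_seq() helper added above is meant to
# be used: when the scheduler builds a batch, every block of each scheduled
# sequence is stamped with the batch's monotonic timestamp, which the LRU
# eviction policy can compare later. The classes here are simplified
# stand-ins, not the real block manager.
import time
from dataclasses import dataclass, field
from typing import Dict, List

@dataclass
class _Block:
    last_accessed: float = 0.0

@dataclass
class _BlockManager:
    block_tables: Dict[int, List[_Block]] = field(default_factory=dict)

    def access_all_blocks_in_seq(self, seq_id: int, access_time: float) -> None:
        for block in self.block_tables[seq_id]:
            block.last_accessed = access_time

manager = _BlockManager(block_tables={0: [_Block(), _Block()]})
manager.access_all_blocks_in_seq(0, time.monotonic())
assert all(block.last_accessed > 0 for block in manager.block_tables[0])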
seq_group_metadata_list: List[SequenceGroupMetadata] = [] for seq_group in scheduler_outputs.scheduled_seq_groups: @@ -374,6 +374,7 @@ def schedule(self) -> Tuple[List[SequenceGroupMetadata], SchedulerOutputs]: seq_id = seq.seq_id seq_data[seq_id] = seq.data block_tables[seq_id] = self.block_manager.get_block_table(seq) + self.block_manager.access_all_blocks_in_seq(seq, now) seq_group_metadata = SequenceGroupMetadata( request_id=seq_group.request_id, @@ -389,8 +390,8 @@ def schedule(self) -> Tuple[List[SequenceGroupMetadata], SchedulerOutputs]: def fork_seq(self, parent_seq: Sequence, child_seq: Sequence) -> None: self.block_manager.fork(parent_seq, child_seq) - def free_seq(self, seq: Sequence, now: Optional[float] = None) -> None: - self.block_manager.free(seq, now) + def free_seq(self, seq: Sequence) -> None: + self.block_manager.free(seq) def free_finished_seq_groups(self) -> None: self.running = deque(seq_group for seq_group in self.running @@ -401,14 +402,10 @@ def _allocate(self, seq_group: SequenceGroup) -> None: for seq in seq_group.get_seqs(status=SequenceStatus.WAITING): seq.status = SequenceStatus.RUNNING - def _append_slot( - self, - seq_group: SequenceGroup, - blocks_to_copy: Dict[int, List[int]], - now: Optional[float] = None, - ) -> None: + def _append_slot(self, seq_group: SequenceGroup, + blocks_to_copy: Dict[int, List[int]]) -> None: for seq in seq_group.get_seqs(status=SequenceStatus.RUNNING): - ret = self.block_manager.append_slot(seq, now) + ret = self.block_manager.append_slot(seq) if ret is not None: src_block, dst_block = ret if src_block in blocks_to_copy: @@ -421,7 +418,6 @@ def _preempt( seq_group: SequenceGroup, blocks_to_swap_out: Dict[int, int], preemption_mode: Optional[PreemptionMode] = None, - now: Optional[float] = None, ) -> None: # If preemption mode is not specified, we determine the mode as follows: # We use recomputation by default since it incurs lower overhead than @@ -440,22 +436,18 @@ def _preempt( else: preemption_mode = PreemptionMode.SWAP if preemption_mode == PreemptionMode.RECOMPUTE: - self._preempt_by_recompute(seq_group, now) + self._preempt_by_recompute(seq_group) elif preemption_mode == PreemptionMode.SWAP: self._preempt_by_swap(seq_group, blocks_to_swap_out) else: raise AssertionError("Invalid preemption mode.") - def _preempt_by_recompute( - self, - seq_group: SequenceGroup, - now: Optional[float] = None, - ) -> None: + def _preempt_by_recompute(self, seq_group: SequenceGroup) -> None: seqs = seq_group.get_seqs(status=SequenceStatus.RUNNING) assert len(seqs) == 1 for seq in seqs: seq.status = SequenceStatus.WAITING - self.block_manager.free(seq, now) + self.block_manager.free(seq) # NOTE: For FCFS, we insert the preempted sequence group to the front # of the waiting queue. 
self.waiting.appendleft(seq_group) @@ -468,28 +460,22 @@ def _preempt_by_swap( self._swap_out(seq_group, blocks_to_swap_out) self.swapped.append(seq_group) - def _swap_in( - self, - seq_group: SequenceGroup, - blocks_to_swap_in: Dict[int, int], - now: Optional[float] = None, - ) -> None: - mapping = self.block_manager.swap_in(seq_group, now) + def _swap_in(self, seq_group: SequenceGroup, + blocks_to_swap_in: Dict[int, int]) -> None: + mapping = self.block_manager.swap_in(seq_group) blocks_to_swap_in.update(mapping) for seq in seq_group.get_seqs(status=SequenceStatus.SWAPPED): seq.status = SequenceStatus.RUNNING - def _swap_out(self, - seq_group: SequenceGroup, - blocks_to_swap_out: Dict[int, int], - now: Optional[float] = None) -> None: + def _swap_out(self, seq_group: SequenceGroup, + blocks_to_swap_out: Dict[int, int]) -> None: if not self.block_manager.can_swap_out(seq_group): # FIXME(woosuk): Abort the sequence group instead of aborting the # entire engine. raise RuntimeError( "Aborted due to the lack of CPU swap space. Please increase " "the swap space to avoid this error.") - mapping = self.block_manager.swap_out(seq_group, now) + mapping = self.block_manager.swap_out(seq_group) blocks_to_swap_out.update(mapping) for seq in seq_group.get_seqs(status=SequenceStatus.RUNNING): seq.status = SequenceStatus.SWAPPED From 38c1fc63c2f7dec06c363b52b0aa5609a18f184f Mon Sep 17 00:00:00 2001 From: Sage Moore Date: Thu, 8 Feb 2024 16:29:32 -0500 Subject: [PATCH 15/79] Remove overly aggressive assert --- vllm/core/block_manager.py | 1 - 1 file changed, 1 deletion(-) diff --git a/vllm/core/block_manager.py b/vllm/core/block_manager.py index e9f94deca425d..8e7375ebfad11 100644 --- a/vllm/core/block_manager.py +++ b/vllm/core/block_manager.py @@ -240,7 +240,6 @@ def append_slot(self, seq: Sequence) -> Optional[Tuple[int, int]]: # Copy on Write: Allocate a new block and copy the tokens. new_block = self.gpu_allocator.allocate( seq.hash(len(logical_blocks) - 1)) - assert (new_block.ref_count == 1) block_table[-1] = new_block self.gpu_allocator.free(last_block) return last_block.block_number, new_block.block_number From b3e73f5538ce289c427d5502e65bec2b546cfbf3 Mon Sep 17 00:00:00 2001 From: Sage Moore Date: Thu, 8 Feb 2024 16:40:12 -0500 Subject: [PATCH 16/79] minor refactoring --- vllm/block.py | 4 ++-- vllm/core/block_manager.py | 1 + vllm/core/scheduler.py | 6 ++---- 3 files changed, 5 insertions(+), 6 deletions(-) diff --git a/vllm/block.py b/vllm/block.py index 13a4d4bb067f5..bdae7f1a82902 100644 --- a/vllm/block.py +++ b/vllm/block.py @@ -2,6 +2,7 @@ from typing import List from vllm.utils import Device +from time import monotonic _BLANK_TOKEN_ID = -1 @@ -65,8 +66,7 @@ def __init__( self.block_hash = block_hash - #TODO: is this a good default? - self.last_accessed = 0 + self.last_accessed = monotonic def __repr__(self) -> str: return (f'PhysicalTokenBlock(device={self.device}, ' diff --git a/vllm/core/block_manager.py b/vllm/core/block_manager.py index 8e7375ebfad11..cc1706b0d78d7 100644 --- a/vllm/core/block_manager.py +++ b/vllm/core/block_manager.py @@ -92,6 +92,7 @@ def free(self, block: PhysicalTokenBlock) -> None: raise ValueError(f"Double free! {block} is already freed.") block.ref_count -= 1 + # TODO: Should this account for the number of blocks with a ref count of 0? 
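# One possible answer to the TODO above, written out as an assumption rather
# than what the patch series ultimately adopts: count both never-allocated
# blocks and cached blocks whose ref_count has dropped to zero, since the
# latter can be evicted on demand and are therefore "free" in practice.
from dataclasses import dataclass
from typing import Dict

@dataclass
class _Block:
    ref_count: int = 0

def num_free_blocks(num_blocks: int, current_num_blocks: int,
                    table: Dict[int, _Block]) -> int:
    evictable = sum(1 for block in table.values() if block.ref_count == 0)
    return (num_blocks - current_num_blocks) + evictable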
def get_num_free_blocks(self) -> int: return self.num_blocks - self.current_num_blocks diff --git a/vllm/core/scheduler.py b/vllm/core/scheduler.py index 4e92634ea76c9..a26af0bf127b0 100644 --- a/vllm/core/scheduler.py +++ b/vllm/core/scheduler.py @@ -127,7 +127,6 @@ def abort_seq_group(self, request_id: Union[str, Iterable[str]]) -> None: if isinstance(request_id, str): request_id = (request_id, ) request_ids = set(request_id) - for state_queue in [self.waiting, self.running, self.swapped]: aborted_groups: List[SequenceGroup] = [] for seq_group in state_queue: @@ -280,13 +279,12 @@ def _schedule(self) -> SchedulerOutputs: if self.running: # Preempt the lowest-priority sequence groups. victim_seq_group = self.running.pop() - self._preempt(victim_seq_group, blocks_to_swap_out, None, - now) + self._preempt(victim_seq_group, blocks_to_swap_out) preempted.append(victim_seq_group) else: # No other sequence groups can be preempted. # Preempt the current sequence group. - self._preempt(seq_group, blocks_to_swap_out, None, now) + self._preempt(seq_group, blocks_to_swap_out) preempted.append(seq_group) break else: From 48624d9dfed8eb09ca089f0c239676238f22689d Mon Sep 17 00:00:00 2001 From: ElizaWszola Date: Fri, 9 Feb 2024 02:08:19 -0500 Subject: [PATCH 17/79] Add prefix len to eviction strategy --- tests/prefix_caching/test_prefix_caching.py | 16 ++++---- vllm/block.py | 3 ++ vllm/core/block_manager.py | 41 +++++++++++++++------ vllm/core/scheduler.py | 3 +- vllm/engine/llm_engine.py | 2 +- vllm/sequence.py | 10 ++++- 6 files changed, 53 insertions(+), 22 deletions(-) diff --git a/tests/prefix_caching/test_prefix_caching.py b/tests/prefix_caching/test_prefix_caching.py index 9fe77b57f3fb9..e40ea9927bf22 100644 --- a/tests/prefix_caching/test_prefix_caching.py +++ b/tests/prefix_caching/test_prefix_caching.py @@ -24,7 +24,7 @@ def allocate_all_blocks(block_allocator, num_blocks): blocks = [] for i in range(num_blocks): # use i as the block_hash - blocks.append(block_allocator.allocate(i)) + blocks.append(block_allocator.allocate(i, 0)) return blocks @@ -60,8 +60,8 @@ def test_block_allocator( block_allocator = BlockAllocator(Device.CPU, block_size, num_blocks) # Allocate two PysicalTokenBlocks with the same hash and check that they are the same PhysicalTokenBlock - first_block = block_allocator.allocate(block_hash) - second_block = block_allocator.allocate(block_hash) + first_block = block_allocator.allocate(block_hash, 0) + second_block = block_allocator.allocate(block_hash, 0) assert (first_block == second_block) assert (second_block.ref_count == 2) @@ -73,7 +73,7 @@ def test_block_allocator( block_allocator.free(second_block) # Reallocate the first block and confirm that, even after the block had its ref_count go to 0, we still get the same block back - first_block = block_allocator.allocate(block_hash) + first_block = block_allocator.allocate(block_hash, 0) assert (first_block == second_block) assert (first_block.block_hash == block_hash) @@ -86,7 +86,7 @@ def test_eviction(num_blocks: int, ): for i in range(num_blocks): # use i as the block_hash - blocks.append(block_allocator.allocate(i)) + blocks.append(block_allocator.allocate(i, 0)) #Free all blocks for block in blocks: @@ -94,19 +94,19 @@ def test_eviction(num_blocks: int, ): # Allocate a new block and confirm that it's the first block freed. 
I.E The Least Recently Used block new_block_hash = block_size - new_block = block_allocator.allocate(new_block_hash) + new_block = block_allocator.allocate(new_block_hash, 0) assert (new_block == blocks[0]) assert (new_block.block_hash == new_block_hash) # Reallocate the second in blocks to remove it from the free list realloc_block_hash = 1 - realloc_block = block_allocator.allocate(realloc_block_hash) + realloc_block = block_allocator.allocate(realloc_block_hash, 0) assert (realloc_block == blocks[realloc_block_hash]) assert (realloc_block.block_hash == realloc_block_hash) # Allocate a new block and confirm that it's not the realloc_block, since the realloc_block shouldn't be in the free list new_block_hash = block_size + 1 - new_block = block_allocator.allocate(new_block_hash) + new_block = block_allocator.allocate(new_block_hash, 0) assert (realloc_block != new_block) assert (new_block.block_hash == new_block_hash) assert (new_block.block_number == 2) diff --git a/vllm/block.py b/vllm/block.py index 13a4d4bb067f5..9796178c5a60e 100644 --- a/vllm/block.py +++ b/vllm/block.py @@ -56,6 +56,7 @@ def __init__( block_number: int, block_size: int, block_hash: int, + prefix_len: int, ) -> None: self.device = device self.block_number = block_number @@ -68,6 +69,8 @@ def __init__( #TODO: is this a good default? self.last_accessed = 0 + self.prefix_len = prefix_len + def __repr__(self) -> str: return (f'PhysicalTokenBlock(device={self.device}, ' f'block_number={self.block_number}, ' diff --git a/vllm/core/block_manager.py b/vllm/core/block_manager.py index 04cd314c19de7..d6ec475337472 100644 --- a/vllm/core/block_manager.py +++ b/vllm/core/block_manager.py @@ -24,13 +24,24 @@ def lru_eviction(table: Dict[int, PhysicalTokenBlock]) -> PhysicalTokenBlock: lowest_timestamp = block.last_accessed # Find all blocks with the lowest timestamp - eviction_candidates: List[PhysicalTokenBlock] = [] + least_recent: List[PhysicalTokenBlock] = [] for block in all_blocks: if block.ref_count == 0 and block.last_accessed == lowest_timestamp: + least_recent.append(block) + + # Find highest prefix count per block + highest_prefix_count = 0 + for block in least_recent: + if block.ref_count == 0 and block.prefix_len > highest_prefix_count: + highest_prefix_count = block.prefix_len + + # Find all blocks with the lowest timestamp + eviction_candidates: List[PhysicalTokenBlock] = [] + for block in least_recent: + if block.ref_count == 0 and block.prefix_len == highest_prefix_count: eviction_candidates.append(block) # Arbitrarily evict the first candidate - # TODO: Evict based on the number of prefix tokens in the block assert (len(eviction_candidates) > 0) evicted_block = eviction_candidates[0] del table[evicted_block.block_hash] @@ -67,7 +78,8 @@ def evict(self) -> PhysicalTokenBlock: raise ValueError( f"Unknown cache eviction policy: {self.eviction_policy}") - def allocate_block(self, block_hash: int) -> PhysicalTokenBlock: + def allocate_block(self, block_hash: int, + prefix_len: int) -> PhysicalTokenBlock: if self.current_num_blocks == self.num_blocks: block = self.evict() block.block_hash = block_hash @@ -75,13 +87,15 @@ def allocate_block(self, block_hash: int) -> PhysicalTokenBlock: block = PhysicalTokenBlock(device=self.device, block_number=self.current_num_blocks, block_size=self.block_size, - block_hash=block_hash) + block_hash=block_hash, + prefix_len=prefix_len) self.current_num_blocks += 1 return block - def allocate(self, block_hash: int) -> PhysicalTokenBlock: + def allocate(self, block_hash: int, 
prefix_len: int) -> PhysicalTokenBlock: if block_hash not in self.table: - self.table[block_hash] = self.allocate_block(block_hash) + self.table[block_hash] = self.allocate_block( + block_hash, prefix_len) block = self.table[block_hash] block.ref_count += 1 # print(f"REFCOUNT ON ALLOCTION: {block}") @@ -190,7 +204,8 @@ def allocate(self, seq_group: SequenceGroup) -> None: and logical_idx >= self.block_sliding_window): block = block_table[logical_idx % self.block_sliding_window] else: - block = self.gpu_allocator.allocate(seq.hash(logical_idx)) + block = self.gpu_allocator.allocate(seq.hash(logical_idx), + seq_group.get_prefix_len()) if logical_idx * self.block_size + self.block_size > len( seq.data.get_token_ids()): self.partial_block_table[seq.seq_id] = block @@ -211,6 +226,7 @@ def can_append_slot(self, seq_group: SequenceGroup) -> bool: def append_slot(self, seq: Sequence, + prefix_len: int, now: Optional[float] = None) -> Optional[Tuple[int, int]]: """Allocate a physical slot for a new token.""" logical_blocks = seq.logical_token_blocks @@ -228,7 +244,8 @@ def append_slot(self, # Allocate a new physical block. assert (seq.seq_id not in self.partial_block_table) self.partial_block_table[ - seq.seq_id] = self.gpu_allocator.allocate(seq.seq_id) + seq.seq_id] = self.gpu_allocator.allocate( + seq.seq_id, prefix_len) block_table.append(self.partial_block_table[seq.seq_id]) return None @@ -246,7 +263,7 @@ def append_slot(self, # The last block is shared with other sequences. # Copy on Write: Allocate a new block and copy the tokens. new_block = self.gpu_allocator.allocate( - seq.hash(len(logical_blocks) - 1)) + seq.hash(len(logical_blocks) - 1), prefix_len) assert (new_block.ref_count == 1) block_table[-1] = new_block self.gpu_allocator.free(last_block, now) @@ -304,7 +321,8 @@ def swap_in(self, gpu_block.ref_count += 1 else: gpu_block = self.gpu_allocator.allocate( - seq.hash(len(seq.logical_blocks) - 1)) + seq.hash(len(seq.logical_blocks) - 1), + seq_group.get_prefix_len()) mapping[cpu_block] = gpu_block new_block_table.append(gpu_block) # Free the CPU block swapped in to GPU. @@ -342,7 +360,8 @@ def swap_out(self, cpu_block.ref_count += 1 else: cpu_block = self.cpu_allocator.allocate( - seq.hash(len(seq.logical_blocks) - 1)) + seq.hash(len(seq.logical_blocks) - 1), + seq_group.get_prefix_len()) mapping[gpu_block] = cpu_block new_block_table.append(cpu_block) # Free the GPU block swapped out to CPU. diff --git a/vllm/core/scheduler.py b/vllm/core/scheduler.py index 158fec4d8d123..912a9f1320d88 100644 --- a/vllm/core/scheduler.py +++ b/vllm/core/scheduler.py @@ -408,7 +408,8 @@ def _append_slot( now: Optional[float] = None, ) -> None: for seq in seq_group.get_seqs(status=SequenceStatus.RUNNING): - ret = self.block_manager.append_slot(seq, now) + ret = self.block_manager.append_slot(seq, now, + seq_group.get_prefix_len()) if ret is not None: src_block, dst_block = ret if src_block in blocks_to_copy: diff --git a/vllm/engine/llm_engine.py b/vllm/engine/llm_engine.py index 8c84e1dee1fff..5317874827357 100644 --- a/vllm/engine/llm_engine.py +++ b/vllm/engine/llm_engine.py @@ -451,7 +451,7 @@ def add_request( # Create the sequence group. seq_group = SequenceGroup(request_id, [seq], sampling_params, - arrival_time, lora_request) + arrival_time, lora_request, prefix_pos) # Add the sequence group to the scheduler. 
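# Miniature view of how the prefix_pos passed above flows downstream: the
# sequence group stores it, get_prefix_len() (added in the sequence.py hunk
# below) normalizes None to 0, and the block manager forwards that value as
# prefix_len when allocating blocks. This stand-in class is illustrative, not
# the real SequenceGroup.
from dataclasses import dataclass
from typing import Optional

@dataclass
class _SeqGroup:
    prefix_pos: Optional[int] = None

    def get_prefix_len(self) -> int:
        return self.prefix_pos if self.prefix_pos is not None else 0

assert _SeqGroup().get_prefix_len() == 0
assert _SeqGroup(prefix_pos=32).get_prefix_len() == 32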
self.scheduler.add_seq_group(seq_group) diff --git a/vllm/sequence.py b/vllm/sequence.py index 295c7e51b5a01..f9bb3eb24fc93 100644 --- a/vllm/sequence.py +++ b/vllm/sequence.py @@ -243,7 +243,7 @@ class SequenceGroup: sampling_params: The sampling parameters used to generate the outputs. arrival_time: The arrival time of the request. lora_request: LoRA request. - prefix: The prefix of the prompt of the sequence group. + prefix_pos: The end of prefix of the prompt of the sequence group. """ def __init__( @@ -253,6 +253,7 @@ def __init__( sampling_params: SamplingParams, arrival_time: float, lora_request: Optional[LoRARequest] = None, + prefix_pos: Optional[int] = None, ) -> None: self.request_id = request_id self.seqs_dict = {seq.seq_id: seq for seq in seqs} @@ -260,6 +261,7 @@ def __init__( self.arrival_time = arrival_time self.last_token_time = arrival_time self.lora_request = lora_request + self.prefix_pos: Optional[int] = prefix_pos self.prompt_logprobs: Optional[PromptLogprobs] = None @property @@ -347,6 +349,9 @@ def remove(self, seq_id: int) -> None: def is_finished(self) -> bool: return all(seq.is_finished() for seq in self.get_seqs()) + def get_prefix_len(self) -> int: + return self.prefix_pos if self.prefix_pos is not None else 0 + def __repr__(self) -> str: return (f"SequenceGroup(request_id={self.request_id}, " f"sampling_params={self.sampling_params}, " @@ -364,6 +369,7 @@ class SequenceGroupMetadata: block_tables: The block tables. (Seq id -> list of physical block numbers) lora_request: LoRA request. + prefix_pos: The end of prefix of the prompt of the sequence group. """ def __init__( @@ -374,6 +380,7 @@ def __init__( sampling_params: SamplingParams, block_tables: Dict[int, List[int]], lora_request: Optional[LoRARequest] = None, + prefix_pos: Optional[int] = None, ) -> None: self.request_id = request_id self.is_prompt = is_prompt @@ -381,6 +388,7 @@ def __init__( self.sampling_params = sampling_params self.block_tables = block_tables self.lora_request = lora_request + self.prefix_pos = prefix_pos @property def lora_int_id(self) -> int: From bb471f2b397d9f0f28c682e0957a202cf595c6b2 Mon Sep 17 00:00:00 2001 From: Sage Moore Date: Fri, 9 Feb 2024 09:02:35 -0500 Subject: [PATCH 18/79] fixed a few bugs in the partial block management code --- vllm/core/block_manager.py | 46 +++++++++++++++++++++++++++----------- 1 file changed, 33 insertions(+), 13 deletions(-) diff --git a/vllm/core/block_manager.py b/vllm/core/block_manager.py index e73a71979cfd9..0c0cb8e469eeb 100644 --- a/vllm/core/block_manager.py +++ b/vllm/core/block_manager.py @@ -220,8 +220,25 @@ def can_append_slot(self, seq_group: SequenceGroup) -> bool: num_seqs = seq_group.num_seqs(status=SequenceStatus.RUNNING) return num_seqs <= num_free_gpu_blocks - def append_slot(self, seq: Sequence, - prefix_len: int) -> Optional[Tuple[int, int]]: + def replace_partial_block(self, seq: Sequence, block: PhysicalTokenBlock, + old_block: PhysicalTokenBlock): + # If there's something already in the partial block table, delete it + block_hash: int = seq.seq_id + if block_hash in self.partial_block_table: + assert self.partial_block_table[block_hash] == old_block + del self.partial_block_table[block_hash] + + self.partial_block_table[block_hash] = block + + def promote_partial_block(self, seq: Sequence, block: PhysicalTokenBlock): + # Delete the block from the partial table, but don't decrement the ref count + del self.partial_block_table[seq.seq_id] + + # Compute a new hash for the block so that it can be shared by other Sequences + 
new_hash = seq.hash(len(seq.logical_token_blocks) - 1) + self.gpu_allocator.update_hash(new_hash, block) + + def append_slot(self, seq: Sequence, prefix_len: int) -> Optional[Tuple[int, int]]: """Allocate a physical slot for a new token.""" logical_blocks = seq.logical_token_blocks block_table = self.block_tables[seq.seq_id] @@ -237,10 +254,10 @@ def append_slot(self, seq: Sequence, # The sequence has a new logical block. # Allocate a new physical block. assert (seq.seq_id not in self.partial_block_table) - self.partial_block_table[ - seq.seq_id] = self.gpu_allocator.allocate( - seq.seq_id, prefix_len) - block_table.append(self.partial_block_table[seq.seq_id]) + new_block = self.gpu_allocator.allocate(seq.seq_id, prefix_len) + self.partial_block_table[seq.seq_id] = new_block + assert (new_block.ref_count == 1) + block_table.append(new_block) return None # We want to append the token to the last physical block. @@ -248,17 +265,20 @@ def append_slot(self, seq: Sequence, assert last_block.device == Device.GPU if last_block.ref_count == 1: # Not shared with other sequences. Appendable. - if len(seq.data.get_token_ids()) % seq.block_size == 0: - del self.partial_block_table[seq.seq_id] - new_hash = seq.hash(len(logical_blocks) - 1) - self.gpu_allocator.update_hash(new_hash, last_block) - return None + + # If the last block is now complete, promote it to a full block so that it can be shared + should_promote_partial_block = len( + seq.data.get_token_ids()) % seq.block_size == 0 + if should_promote_partial_block: + self.promote_partial_block(seq, last_block) + return None else: # The last block is shared with other sequences. # Copy on Write: Allocate a new block and copy the tokens. - new_block = self.gpu_allocator.allocate( - seq.hash(len(logical_blocks) - 1), prefix_len) + new_block = self.gpu_allocator.allocate(seq.seq_id, prefix_len) + self.replace_partial_block(seq, new_block, last_block) block_table[-1] = new_block + assert (new_block.ref_count == 1) self.gpu_allocator.free(last_block) return last_block.block_number, new_block.block_number From 5d5db121f97e85c3966cbd790ce5b521415e90f9 Mon Sep 17 00:00:00 2001 From: Sage Moore Date: Fri, 9 Feb 2024 09:10:19 -0500 Subject: [PATCH 19/79] auto format --- vllm/core/block_manager.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/vllm/core/block_manager.py b/vllm/core/block_manager.py index 0c0cb8e469eeb..5849b33d53ae1 100644 --- a/vllm/core/block_manager.py +++ b/vllm/core/block_manager.py @@ -238,7 +238,8 @@ def promote_partial_block(self, seq: Sequence, block: PhysicalTokenBlock): new_hash = seq.hash(len(seq.logical_token_blocks) - 1) self.gpu_allocator.update_hash(new_hash, block) - def append_slot(self, seq: Sequence, prefix_len: int) -> Optional[Tuple[int, int]]: + def append_slot(self, seq: Sequence, + prefix_len: int) -> Optional[Tuple[int, int]]: """Allocate a physical slot for a new token.""" logical_blocks = seq.logical_token_blocks block_table = self.block_tables[seq.seq_id] From ffbddd9f7d0f54b678db8da52380338ac21ba9ac Mon Sep 17 00:00:00 2001 From: Sage Moore Date: Fri, 9 Feb 2024 15:52:27 -0500 Subject: [PATCH 20/79] fix fork/cow mechanisms so that they work with partial blocks --- vllm/core/block_manager.py | 26 ++++++++++++++++++-------- 1 file changed, 18 insertions(+), 8 deletions(-) diff --git a/vllm/core/block_manager.py b/vllm/core/block_manager.py index 5849b33d53ae1..9393859005725 100644 --- a/vllm/core/block_manager.py +++ b/vllm/core/block_manager.py @@ -194,7 +194,6 @@ def allocate(self, seq_group: 
SequenceGroup) -> None: num_prompt_blocks = len(seq.logical_token_blocks) block_table: BlockTable = [] - for logical_idx in range(num_prompt_blocks): if (self.block_sliding_window is not None and logical_idx >= self.block_sliding_window): @@ -210,8 +209,12 @@ def allocate(self, seq_group: SequenceGroup) -> None: block_table.append(block) # Assign the block table for each sequence. + first_id = seq.seq_id for seq in seq_group.get_seqs(status=SequenceStatus.WAITING): self.block_tables[seq.seq_id] = block_table.copy() + if first_id in self.partial_block_table and first_id != seq.seq_id: + self.partial_block_table[ + seq.seq_id] = self.partial_block_table[first_id] def can_append_slot(self, seq_group: SequenceGroup) -> bool: # Simple heuristic: If there is at least one free block @@ -243,7 +246,6 @@ def append_slot(self, seq: Sequence, """Allocate a physical slot for a new token.""" logical_blocks = seq.logical_token_blocks block_table = self.block_tables[seq.seq_id] - # If we need to allocate a new physical block if len(block_table) < len(logical_blocks): if (self.block_sliding_window @@ -255,7 +257,8 @@ def append_slot(self, seq: Sequence, # The sequence has a new logical block. # Allocate a new physical block. assert (seq.seq_id not in self.partial_block_table) - new_block = self.gpu_allocator.allocate(seq.seq_id, prefix_len) + new_block = self.gpu_allocator.allocate( + monotonic(), prefix_len) self.partial_block_table[seq.seq_id] = new_block assert (new_block.ref_count == 1) block_table.append(new_block) @@ -266,18 +269,22 @@ def append_slot(self, seq: Sequence, assert last_block.device == Device.GPU if last_block.ref_count == 1: # Not shared with other sequences. Appendable. - # If the last block is now complete, promote it to a full block so that it can be shared - should_promote_partial_block = len( - seq.data.get_token_ids()) % seq.block_size == 0 + should_promote_partial_block = (len( + seq.data.get_token_ids())) % seq.block_size == 0 if should_promote_partial_block: self.promote_partial_block(seq, last_block) return None else: # The last block is shared with other sequences. # Copy on Write: Allocate a new block and copy the tokens. 
- new_block = self.gpu_allocator.allocate(seq.seq_id, prefix_len) - self.replace_partial_block(seq, new_block, last_block) + new_block = self.gpu_allocator.allocate(monotonic(), prefix_len) + should_promote_partial_block = (len( + seq.data.get_token_ids())) % seq.block_size == 0 + if not should_promote_partial_block: + self.replace_partial_block(seq, new_block, last_block) + else: + del self.partial_block_table[seq.seq_id] block_table[-1] = new_block assert (new_block.ref_count == 1) self.gpu_allocator.free(last_block) @@ -290,6 +297,9 @@ def fork(self, parent_seq: Sequence, child_seq: Sequence) -> None: self.block_tables[child_seq.seq_id] = src_block_table.copy() for block in src_block_table: block.ref_count += 1 + if parent_seq.seq_id in self.partial_block_table: + self.partial_block_table[ + child_seq.seq_id] = self.partial_block_table[parent_seq.seq_id] def _get_physical_blocks( self, seq_group: SequenceGroup) -> List[PhysicalTokenBlock]: From 1f7fe4279a89295b0dffc7a64cd20eb230efe9f4 Mon Sep 17 00:00:00 2001 From: Sage Moore Date: Mon, 12 Feb 2024 08:37:55 -0500 Subject: [PATCH 21/79] replace the partial block table with a simpler promotion mechanism --- vllm/core/block_manager.py | 47 ++++++++------------------------------ 1 file changed, 10 insertions(+), 37 deletions(-) diff --git a/vllm/core/block_manager.py b/vllm/core/block_manager.py index 9393859005725..998c00338abd1 100644 --- a/vllm/core/block_manager.py +++ b/vllm/core/block_manager.py @@ -162,7 +162,6 @@ def __init__( num_cpu_blocks) # Mapping: seq_id -> BlockTable. self.block_tables: Dict[int, BlockTable] = {} - self.partial_block_table: Dict[int, PhysicalTokenBlock] = {} def can_allocate(self, seq_group: SequenceGroup) -> AllocStatus: # FIXME(woosuk): Here we assume that all sequences in the group share @@ -201,20 +200,13 @@ def allocate(self, seq_group: SequenceGroup) -> None: else: block = self.gpu_allocator.allocate(seq.hash(logical_idx), seq_group.get_prefix_len()) - if logical_idx * self.block_size + self.block_size > len( - seq.data.get_token_ids()): - self.partial_block_table[seq.seq_id] = block # Set the reference counts of the token blocks. # block.ref_count = seq_group.num_seqs() block_table.append(block) # Assign the block table for each sequence. 
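# Toy version of the prompt-allocation loop above: each logical block is
# looked up (or created) in the allocator's table by its content hash, so two
# prompts sharing a prefix reuse the same physical blocks and only bump their
# reference counts. The dict-based allocator here is a stand-in for
# BlockAllocator, not its real implementation.
from dataclasses import dataclass
from typing import Dict, List

@dataclass
class _Block:
    block_hash: int
    ref_count: int = 0

class _Allocator:
    def __init__(self) -> None:
        self.table: Dict[int, _Block] = {}

    def allocate(self, block_hash: int) -> _Block:
        block = self.table.setdefault(block_hash, _Block(block_hash))
        block.ref_count += 1
        return block

def allocate_prompt(allocator: _Allocator,
                    block_hashes: List[int]) -> List[_Block]:
    return [allocator.allocate(h) for h in block_hashes]

alloc = _Allocator()
table_a = allocate_prompt(alloc, [101, 202])
table_b = allocate_prompt(alloc, [101, 303])  # same first block, different second
assert table_a[0] is table_b[0] and table_a[0].ref_count == 2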
- first_id = seq.seq_id for seq in seq_group.get_seqs(status=SequenceStatus.WAITING): self.block_tables[seq.seq_id] = block_table.copy() - if first_id in self.partial_block_table and first_id != seq.seq_id: - self.partial_block_table[ - seq.seq_id] = self.partial_block_table[first_id] def can_append_slot(self, seq_group: SequenceGroup) -> bool: # Simple heuristic: If there is at least one free block @@ -223,24 +215,14 @@ def can_append_slot(self, seq_group: SequenceGroup) -> bool: num_seqs = seq_group.num_seqs(status=SequenceStatus.RUNNING) return num_seqs <= num_free_gpu_blocks - def replace_partial_block(self, seq: Sequence, block: PhysicalTokenBlock, - old_block: PhysicalTokenBlock): - # If there's something already in the partial block table, delete it - block_hash: int = seq.seq_id - if block_hash in self.partial_block_table: - assert self.partial_block_table[block_hash] == old_block - del self.partial_block_table[block_hash] - - self.partial_block_table[block_hash] = block - - def promote_partial_block(self, seq: Sequence, block: PhysicalTokenBlock): - # Delete the block from the partial table, but don't decrement the ref count - del self.partial_block_table[seq.seq_id] - + def promote_last_block(self, seq: Sequence, block: PhysicalTokenBlock): # Compute a new hash for the block so that it can be shared by other Sequences new_hash = seq.hash(len(seq.logical_token_blocks) - 1) self.gpu_allocator.update_hash(new_hash, block) + def should_promote_last_block(self, seq: Sequence) -> bool: + return (len(seq.data.get_token_ids())) % seq.block_size == 0 + def append_slot(self, seq: Sequence, prefix_len: int) -> Optional[Tuple[int, int]]: """Allocate a physical slot for a new token.""" @@ -256,10 +238,8 @@ def append_slot(self, seq: Sequence, else: # The sequence has a new logical block. # Allocate a new physical block. - assert (seq.seq_id not in self.partial_block_table) new_block = self.gpu_allocator.allocate( monotonic(), prefix_len) - self.partial_block_table[seq.seq_id] = new_block assert (new_block.ref_count == 1) block_table.append(new_block) return None @@ -267,24 +247,20 @@ def append_slot(self, seq: Sequence, # We want to append the token to the last physical block. last_block = block_table[-1] assert last_block.device == Device.GPU + should_promote_last_block = self.should_promote_last_block(seq) if last_block.ref_count == 1: # Not shared with other sequences. Appendable. # If the last block is now complete, promote it to a full block so that it can be shared - should_promote_partial_block = (len( - seq.data.get_token_ids())) % seq.block_size == 0 - if should_promote_partial_block: - self.promote_partial_block(seq, last_block) + if (should_promote_last_block): + self.promote_last_block(seq, last_block) return None else: # The last block is shared with other sequences. # Copy on Write: Allocate a new block and copy the tokens. 
new_block = self.gpu_allocator.allocate(monotonic(), prefix_len) - should_promote_partial_block = (len( - seq.data.get_token_ids())) % seq.block_size == 0 - if not should_promote_partial_block: - self.replace_partial_block(seq, new_block, last_block) - else: - del self.partial_block_table[seq.seq_id] + + if (should_promote_last_block): + self.promote_last_block(seq, new_block) block_table[-1] = new_block assert (new_block.ref_count == 1) self.gpu_allocator.free(last_block) @@ -297,9 +273,6 @@ def fork(self, parent_seq: Sequence, child_seq: Sequence) -> None: self.block_tables[child_seq.seq_id] = src_block_table.copy() for block in src_block_table: block.ref_count += 1 - if parent_seq.seq_id in self.partial_block_table: - self.partial_block_table[ - child_seq.seq_id] = self.partial_block_table[parent_seq.seq_id] def _get_physical_blocks( self, seq_group: SequenceGroup) -> List[PhysicalTokenBlock]: From 7ab75d759e97c08cfe3cec712c8b4336d36a28b5 Mon Sep 17 00:00:00 2001 From: Sage Moore Date: Mon, 12 Feb 2024 09:15:37 -0500 Subject: [PATCH 22/79] clean up the BlockSpaceManager a bit --- vllm/core/block_manager.py | 44 +++++++++++++++++++++++++++----------- 1 file changed, 31 insertions(+), 13 deletions(-) diff --git a/vllm/core/block_manager.py b/vllm/core/block_manager.py index 998c00338abd1..b2e8424b69f4c 100644 --- a/vllm/core/block_manager.py +++ b/vllm/core/block_manager.py @@ -92,7 +92,11 @@ def allocate_block(self, block_hash: int, self.current_num_blocks += 1 return block - def allocate(self, block_hash: int, prefix_len: int) -> PhysicalTokenBlock: + def allocate(self, + block_hash: Optional[int] = None, + prefix_len: int = 0) -> PhysicalTokenBlock: + if block_hash is None: + block_hash = monotonic() if block_hash not in self.table: self.table[block_hash] = self.allocate_block( block_hash, prefix_len) @@ -215,14 +219,32 @@ def can_append_slot(self, seq_group: SequenceGroup) -> bool: num_seqs = seq_group.num_seqs(status=SequenceStatus.RUNNING) return num_seqs <= num_free_gpu_blocks - def promote_last_block(self, seq: Sequence, block: PhysicalTokenBlock): + def _promote_last_block(self, seq: Sequence, block: PhysicalTokenBlock): # Compute a new hash for the block so that it can be shared by other Sequences new_hash = seq.hash(len(seq.logical_token_blocks) - 1) + + # TODO: What if the hash already exists in the table? If it does, we can free and use that block? 
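# One way the TODO above could be resolved, sketched as an assumption (a later
# patch in this series adds a contains_block() lookup for exactly this case):
# if the full block's content hash is already cached, take a reference on the
# cached block and release the private one instead of re-keying it.
from dataclasses import dataclass
from typing import Dict

@dataclass
class _Block:
    block_hash: int
    ref_count: int = 1

def promote(table: Dict[int, _Block], block: _Block, new_hash: int) -> _Block:
    if new_hash in table:             # full block already cached elsewhere
        cached = table[new_hash]
        cached.ref_count += 1
        block.ref_count -= 1          # drop the now-duplicate private block
        return cached
    del table[block.block_hash]       # otherwise simply re-key this block
    table[new_hash] = block
    block.block_hash = new_hash
    return block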
self.gpu_allocator.update_hash(new_hash, block) - def should_promote_last_block(self, seq: Sequence) -> bool: + def _should_promote_last_block(self, seq: Sequence) -> bool: return (len(seq.data.get_token_ids())) % seq.block_size == 0 + def _maybe_promote_last_block(self, seq: Sequence, + last_block: PhysicalTokenBlock) -> None: + if self._should_promote_last_block(seq): + self._promote_last_block(seq, last_block) + + def _allocate_last_physical_block(self, seq: Sequence, + prefix_len: int) -> PhysicalTokenBlock: + block_hash: Optional[int] = None + if (self._should_promote_last_block(seq)): + block_hash = seq.hash(len(seq.logical_token_blocks) - 1) + new_block = self.gpu_allocator.allocate(block_hash, + prefix_len=prefix_len) + + assert (new_block.ref_count == 1) + return new_block + def append_slot(self, seq: Sequence, prefix_len: int) -> Optional[Tuple[int, int]]: """Allocate a physical slot for a new token.""" @@ -230,6 +252,9 @@ def append_slot(self, seq: Sequence, block_table = self.block_tables[seq.seq_id] # If we need to allocate a new physical block if len(block_table) < len(logical_blocks): + # Currently this code only supports adding one physical block + assert len(block_table) == len(logical_blocks) - 1 + if (self.block_sliding_window and len(block_table) >= self.block_sliding_window): # re-use a block @@ -238,31 +263,24 @@ def append_slot(self, seq: Sequence, else: # The sequence has a new logical block. # Allocate a new physical block. - new_block = self.gpu_allocator.allocate( - monotonic(), prefix_len) - assert (new_block.ref_count == 1) + new_block = self.allocate_last_physical_block(seq, prefix_len) block_table.append(new_block) return None # We want to append the token to the last physical block. last_block = block_table[-1] assert last_block.device == Device.GPU - should_promote_last_block = self.should_promote_last_block(seq) if last_block.ref_count == 1: # Not shared with other sequences. Appendable. # If the last block is now complete, promote it to a full block so that it can be shared - if (should_promote_last_block): - self.promote_last_block(seq, last_block) + self.maybe_promote_last_block(seq, last_block) return None else: # The last block is shared with other sequences. # Copy on Write: Allocate a new block and copy the tokens. 
- new_block = self.gpu_allocator.allocate(monotonic(), prefix_len) + new_block = self.allocate_last_physical_block(seq, prefix_len) - if (should_promote_last_block): - self.promote_last_block(seq, new_block) block_table[-1] = new_block - assert (new_block.ref_count == 1) self.gpu_allocator.free(last_block) return last_block.block_number, new_block.block_number From ca3e288d1f6dafbb7805f9b9dd2048c056a001f9 Mon Sep 17 00:00:00 2001 From: Sage Moore Date: Mon, 12 Feb 2024 09:18:20 -0500 Subject: [PATCH 23/79] fix minor typos --- vllm/core/block_manager.py | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/vllm/core/block_manager.py b/vllm/core/block_manager.py index b2e8424b69f4c..d065219c2e93c 100644 --- a/vllm/core/block_manager.py +++ b/vllm/core/block_manager.py @@ -230,12 +230,12 @@ def _should_promote_last_block(self, seq: Sequence) -> bool: return (len(seq.data.get_token_ids())) % seq.block_size == 0 def _maybe_promote_last_block(self, seq: Sequence, - last_block: PhysicalTokenBlock) -> None: + last_block: PhysicalTokenBlock) -> None: if self._should_promote_last_block(seq): self._promote_last_block(seq, last_block) def _allocate_last_physical_block(self, seq: Sequence, - prefix_len: int) -> PhysicalTokenBlock: + prefix_len: int) -> PhysicalTokenBlock: block_hash: Optional[int] = None if (self._should_promote_last_block(seq)): block_hash = seq.hash(len(seq.logical_token_blocks) - 1) @@ -263,7 +263,7 @@ def append_slot(self, seq: Sequence, else: # The sequence has a new logical block. # Allocate a new physical block. - new_block = self.allocate_last_physical_block(seq, prefix_len) + new_block = self._allocate_last_physical_block(seq, prefix_len) block_table.append(new_block) return None @@ -273,12 +273,12 @@ def append_slot(self, seq: Sequence, if last_block.ref_count == 1: # Not shared with other sequences. Appendable. # If the last block is now complete, promote it to a full block so that it can be shared - self.maybe_promote_last_block(seq, last_block) + self._maybe_promote_last_block(seq, last_block) return None else: # The last block is shared with other sequences. # Copy on Write: Allocate a new block and copy the tokens. - new_block = self.allocate_last_physical_block(seq, prefix_len) + new_block = self._allocate_last_physical_block(seq, prefix_len) block_table[-1] = new_block self.gpu_allocator.free(last_block) From ecf389dcf8fd91084072074564ce2291db2f8734 Mon Sep 17 00:00:00 2001 From: Sage Moore Date: Mon, 12 Feb 2024 09:19:21 -0500 Subject: [PATCH 24/79] minor name change --- vllm/core/block_manager.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/vllm/core/block_manager.py b/vllm/core/block_manager.py index d065219c2e93c..81d2e92183d35 100644 --- a/vllm/core/block_manager.py +++ b/vllm/core/block_manager.py @@ -226,18 +226,18 @@ def _promote_last_block(self, seq: Sequence, block: PhysicalTokenBlock): # TODO: What if the hash already exists in the table? If it does, we can free and use that block? 
self.gpu_allocator.update_hash(new_hash, block) - def _should_promote_last_block(self, seq: Sequence) -> bool: + def _is_last_block_full(self, seq: Sequence) -> bool: return (len(seq.data.get_token_ids())) % seq.block_size == 0 def _maybe_promote_last_block(self, seq: Sequence, last_block: PhysicalTokenBlock) -> None: - if self._should_promote_last_block(seq): + if self._is_last_block_full(seq): self._promote_last_block(seq, last_block) def _allocate_last_physical_block(self, seq: Sequence, prefix_len: int) -> PhysicalTokenBlock: block_hash: Optional[int] = None - if (self._should_promote_last_block(seq)): + if (self._is_last_block_full(seq)): block_hash = seq.hash(len(seq.logical_token_blocks) - 1) new_block = self.gpu_allocator.allocate(block_hash, prefix_len=prefix_len) From 427566a7a75651fc40b8fe8dc3cef103e1d718af Mon Sep 17 00:00:00 2001 From: Sage Moore Date: Mon, 12 Feb 2024 09:40:04 -0500 Subject: [PATCH 25/79] update assert --- vllm/core/block_manager.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/vllm/core/block_manager.py b/vllm/core/block_manager.py index 81d2e92183d35..76774bdb7f850 100644 --- a/vllm/core/block_manager.py +++ b/vllm/core/block_manager.py @@ -242,7 +242,8 @@ def _allocate_last_physical_block(self, seq: Sequence, new_block = self.gpu_allocator.allocate(block_hash, prefix_len=prefix_len) - assert (new_block.ref_count == 1) + if block_hash is None: + assert (new_block.ref_count == 1) return new_block def append_slot(self, seq: Sequence, From a3431bbf63627814df966926e711828d880c8836 Mon Sep 17 00:00:00 2001 From: ElizaWszola Date: Mon, 12 Feb 2024 10:45:10 -0500 Subject: [PATCH 26/79] fix swap_in and swap_out --- vllm/core/block_manager.py | 49 +++++++++++++++++++++++++++----------- 1 file changed, 35 insertions(+), 14 deletions(-) diff --git a/vllm/core/block_manager.py b/vllm/core/block_manager.py index 76774bdb7f850..bd80dbb3648c5 100644 --- a/vllm/core/block_manager.py +++ b/vllm/core/block_manager.py @@ -229,23 +229,43 @@ def _promote_last_block(self, seq: Sequence, block: PhysicalTokenBlock): def _is_last_block_full(self, seq: Sequence) -> bool: return (len(seq.data.get_token_ids())) % seq.block_size == 0 + def _is_last_block(self, seq: Sequence, index: int) -> bool: + return index == len(seq.logical_token_blocks) - 1 + + def _is_block_full(self, seq: Sequence, index: int) -> bool: + return not self._is_last_block(seq, + index) or self._is_last_block_full(seq) + def _maybe_promote_last_block(self, seq: Sequence, last_block: PhysicalTokenBlock) -> None: if self._is_last_block_full(seq): self._promote_last_block(seq, last_block) - def _allocate_last_physical_block(self, seq: Sequence, - prefix_len: int) -> PhysicalTokenBlock: + def _allocate_physical_block(self, + seq: Sequence, + index: int, + prefix_len: int, + use_gpu: bool = True) -> PhysicalTokenBlock: block_hash: Optional[int] = None - if (self._is_last_block_full(seq)): - block_hash = seq.hash(len(seq.logical_token_blocks) - 1) - new_block = self.gpu_allocator.allocate(block_hash, - prefix_len=prefix_len) + if (self._is_block_full(seq, index)): + block_hash = seq.hash(index) + if use_gpu: + new_block = self.gpu_allocator.allocate(block_hash, + prefix_len=prefix_len) + else: + new_block = self.cpu_allocator.allocate(block_hash, + prefix_len=prefix_len) if block_hash is None: assert (new_block.ref_count == 1) return new_block + def _allocate_last_physical_block(self, seq: Sequence, + prefix_len: int) -> PhysicalTokenBlock: + return self._allocate_physical_block(seq, + 
len(seq.logical_token_blocks) - 1, + prefix_len) + def append_slot(self, seq: Sequence, prefix_len: int) -> Optional[Tuple[int, int]]: """Allocate a physical slot for a new token.""" @@ -329,14 +349,15 @@ def swap_in(self, seq_group: SequenceGroup) -> Dict[int, int]: new_block_table.append(block) block.ref_count += 1 - for cpu_block in block_table: + # Assumption that len(block_table) == len(logical_blocks) + for i in range(len(block_table)): + cpu_block = block_table[i] if cpu_block in mapping: gpu_block = mapping[cpu_block] gpu_block.ref_count += 1 else: - gpu_block = self.gpu_allocator.allocate( - seq.hash(len(seq.logical_blocks) - 1), - seq_group.get_prefix_len()) + gpu_block = self._allocate_physical_block( + seq, i, seq_group.get_prefix_len()) mapping[cpu_block] = gpu_block new_block_table.append(gpu_block) # Free the CPU block swapped in to GPU. @@ -360,7 +381,8 @@ def swap_out(self, seq_group: SequenceGroup) -> Dict[int, int]: new_block_table: BlockTable = [] block_table = self.block_tables[seq.seq_id] - for gpu_block in block_table: + for i in range(len(block_table)): + gpu_block = block_table[i] if (seq_group.prefix is not None and gpu_block in seq_group.prefix.block_table): # NOTE: We do not swap out the prefix blocks for now. @@ -371,9 +393,8 @@ def swap_out(self, seq_group: SequenceGroup) -> Dict[int, int]: cpu_block = mapping[gpu_block] cpu_block.ref_count += 1 else: - cpu_block = self.cpu_allocator.allocate( - seq.hash(len(seq.logical_blocks) - 1), - seq_group.get_prefix_len()) + cpu_block = self._allocate_physical_block( + seq, i, seq_group.get_prefix_len(), use_gpu=False) mapping[gpu_block] = cpu_block new_block_table.append(cpu_block) # Free the GPU block swapped out to CPU. From dedc9c0b0fe713ef978e0bbe608041d4bb3a3c94 Mon Sep 17 00:00:00 2001 From: Sage Moore Date: Mon, 12 Feb 2024 11:30:52 -0500 Subject: [PATCH 27/79] remove dead code in BlockSpaceManager --- vllm/core/block_manager.py | 10 ---------- 1 file changed, 10 deletions(-) diff --git a/vllm/core/block_manager.py b/vllm/core/block_manager.py index bd80dbb3648c5..71b61e12e5261 100644 --- a/vllm/core/block_manager.py +++ b/vllm/core/block_manager.py @@ -336,10 +336,6 @@ def can_swap_in(self, seq_group: SequenceGroup) -> bool: def swap_in(self, seq_group: SequenceGroup) -> Dict[int, int]: # CPU block -> GPU block. - if seq_group.prefix is not None: - # make sure to swap in the prefix first - assert seq_group.prefix.allocated and seq_group.prefix.computed - mapping: Dict[PhysicalTokenBlock, PhysicalTokenBlock] = {} for seq in seq_group.get_seqs(status=SequenceStatus.SWAPPED): new_block_table: BlockTable = [] @@ -383,12 +379,6 @@ def swap_out(self, seq_group: SequenceGroup) -> Dict[int, int]: for i in range(len(block_table)): gpu_block = block_table[i] - if (seq_group.prefix is not None - and gpu_block in seq_group.prefix.block_table): - # NOTE: We do not swap out the prefix blocks for now. 
- self.gpu_allocator.free(gpu_block) - continue - if gpu_block in mapping: cpu_block = mapping[gpu_block] cpu_block.ref_count += 1 From 86299a457b9dc4e8ba57580e50d4fda425416996 Mon Sep 17 00:00:00 2001 From: Sage Moore Date: Mon, 12 Feb 2024 12:05:00 -0500 Subject: [PATCH 28/79] refactor swap_in/swap_out in BlockSpaceManager --- vllm/core/block_manager.py | 42 ++++++++++++-------------------------- 1 file changed, 13 insertions(+), 29 deletions(-) diff --git a/vllm/core/block_manager.py b/vllm/core/block_manager.py index 71b61e12e5261..a67982e659515 100644 --- a/vllm/core/block_manager.py +++ b/vllm/core/block_manager.py @@ -241,31 +241,18 @@ def _maybe_promote_last_block(self, seq: Sequence, if self._is_last_block_full(seq): self._promote_last_block(seq, last_block) - def _allocate_physical_block(self, - seq: Sequence, - index: int, - prefix_len: int, - use_gpu: bool = True) -> PhysicalTokenBlock: + def _allocate_last_physical_block(self, seq: Sequence, + prefix_len: int) -> PhysicalTokenBlock: block_hash: Optional[int] = None - if (self._is_block_full(seq, index)): - block_hash = seq.hash(index) - if use_gpu: - new_block = self.gpu_allocator.allocate(block_hash, - prefix_len=prefix_len) - else: - new_block = self.cpu_allocator.allocate(block_hash, - prefix_len=prefix_len) - + logical_idx = len(seq.logical_token_blocks) - 1 + if (self._is_block_full(seq, logical_idx)): + block_hash = seq.hash(logical_idx) + new_block = self.gpu_allocator.allocate(block_hash, + prefix_len=prefix_len) if block_hash is None: assert (new_block.ref_count == 1) return new_block - def _allocate_last_physical_block(self, seq: Sequence, - prefix_len: int) -> PhysicalTokenBlock: - return self._allocate_physical_block(seq, - len(seq.logical_token_blocks) - 1, - prefix_len) - def append_slot(self, seq: Sequence, prefix_len: int) -> Optional[Tuple[int, int]]: """Allocate a physical slot for a new token.""" @@ -345,15 +332,13 @@ def swap_in(self, seq_group: SequenceGroup) -> Dict[int, int]: new_block_table.append(block) block.ref_count += 1 - # Assumption that len(block_table) == len(logical_blocks) - for i in range(len(block_table)): - cpu_block = block_table[i] + for cpu_block in block_table: if cpu_block in mapping: gpu_block = mapping[cpu_block] gpu_block.ref_count += 1 else: - gpu_block = self._allocate_physical_block( - seq, i, seq_group.get_prefix_len()) + gpu_block = self.gpu_allocator.allocate( + cpu_block.block_hash, cpu_block.prefix_len) mapping[cpu_block] = gpu_block new_block_table.append(gpu_block) # Free the CPU block swapped in to GPU. @@ -377,14 +362,13 @@ def swap_out(self, seq_group: SequenceGroup) -> Dict[int, int]: new_block_table: BlockTable = [] block_table = self.block_tables[seq.seq_id] - for i in range(len(block_table)): - gpu_block = block_table[i] + for gpu_block in block_table: if gpu_block in mapping: cpu_block = mapping[gpu_block] cpu_block.ref_count += 1 else: - cpu_block = self._allocate_physical_block( - seq, i, seq_group.get_prefix_len(), use_gpu=False) + cpu_block = self.cpu_allocator.allocate( + gpu_block.block_hash, gpu_block.prefix_len) mapping[gpu_block] = cpu_block new_block_table.append(cpu_block) # Free the GPU block swapped out to CPU. 
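The idea behind the swap refactor above, shown as a small standalone sketch rather than part of the patch itself: once blocks carry their own content hash and prefix length, swap_in/swap_out can allocate the destination copy directly from the source block's metadata instead of recomputing hashes from the sequence. ToyBlock/ToyAllocator below are hypothetical stand-ins, not the real vLLM classes.

from dataclasses import dataclass
from typing import Dict, List


@dataclass
class ToyBlock:
    device: str
    block_hash: int
    prefix_len: int
    ref_count: int = 0


class ToyAllocator:
    def __init__(self, device: str) -> None:
        self.device = device
        self.table: Dict[int, ToyBlock] = {}

    def allocate(self, block_hash: int, prefix_len: int) -> ToyBlock:
        # Blocks are keyed by content hash: allocating the same hash again
        # returns the existing block with a bumped reference count.
        if block_hash not in self.table:
            self.table[block_hash] = ToyBlock(self.device, block_hash,
                                              prefix_len)
        block = self.table[block_hash]
        block.ref_count += 1
        return block

    def free(self, block: ToyBlock) -> None:
        assert block.ref_count > 0
        block.ref_count -= 1


def swap_in(cpu_alloc: ToyAllocator, gpu_alloc: ToyAllocator,
            cpu_block_table: List[ToyBlock]) -> List[ToyBlock]:
    # The hash and prefix length travel with the block, so the GPU copy is
    # allocated straight from the CPU block's metadata and the CPU copy is
    # released afterwards.
    gpu_block_table = []
    for cpu_block in cpu_block_table:
        gpu_block = gpu_alloc.allocate(cpu_block.block_hash,
                                       cpu_block.prefix_len)
        gpu_block_table.append(gpu_block)
        cpu_alloc.free(cpu_block)
    return gpu_block_table

Because the allocator deduplicates on the hash, swapping the same logical block in from two sequences simply increments the ref count on one physical block, which is the behavior the real swap_in/swap_out paths rely on.
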
From 614a197e20c258c788156788d7637fc5a410b916 Mon Sep 17 00:00:00 2001 From: Sage Moore Date: Mon, 12 Feb 2024 17:10:09 -0500 Subject: [PATCH 29/79] Update the partial block promotion logic to account for the full version already being in the cache --- vllm/core/block_manager.py | 43 ++++++++++++++++++++++---------------- 1 file changed, 25 insertions(+), 18 deletions(-) diff --git a/vllm/core/block_manager.py b/vllm/core/block_manager.py index a67982e659515..ef6e47cb89f44 100644 --- a/vllm/core/block_manager.py +++ b/vllm/core/block_manager.py @@ -102,7 +102,6 @@ def allocate(self, block_hash, prefix_len) block = self.table[block_hash] block.ref_count += 1 - # print(f"REFCOUNT ON ALLOCTION: {block}") return block def free(self, block: PhysicalTokenBlock) -> None: @@ -114,7 +113,11 @@ def free(self, block: PhysicalTokenBlock) -> None: def get_num_free_blocks(self) -> int: return self.num_blocks - self.current_num_blocks - def update_hash(self, block_hash: int, block: PhysicalTokenBlock) -> None: + def contains_block(self, block_hash: int) -> bool: + return block_hash in self.table + + def update_hash(self, block_hash: int, block: PhysicalTokenBlock): + assert (not self.contains_block(block_hash)) old_hash = block.block_hash del self.table[old_hash] self.table[block_hash] = block @@ -204,8 +207,6 @@ def allocate(self, seq_group: SequenceGroup) -> None: else: block = self.gpu_allocator.allocate(seq.hash(logical_idx), seq_group.get_prefix_len()) - # Set the reference counts of the token blocks. - # block.ref_count = seq_group.num_seqs() block_table.append(block) # Assign the block table for each sequence. @@ -219,12 +220,19 @@ def can_append_slot(self, seq_group: SequenceGroup) -> bool: num_seqs = seq_group.num_seqs(status=SequenceStatus.RUNNING) return num_seqs <= num_free_gpu_blocks - def _promote_last_block(self, seq: Sequence, block: PhysicalTokenBlock): + def _promote_last_block( + self, seq: Sequence, + last_block: PhysicalTokenBlock) -> PhysicalTokenBlock: # Compute a new hash for the block so that it can be shared by other Sequences new_hash = seq.hash(len(seq.logical_token_blocks) - 1) - # TODO: What if the hash already exists in the table? If it does, we can free and use that block? 
- self.gpu_allocator.update_hash(new_hash, block) + # if new_hash is already in the cached table, then free last_block and return the cached version + if self.gpu_allocator.contains_block(new_hash): + self.gpu_allocator.free(last_block) + return self.gpu_allocator.allocate(new_hash) + else: + self.gpu_allocator.update_hash(new_hash, last_block) + return last_block def _is_last_block_full(self, seq: Sequence) -> bool: return (len(seq.data.get_token_ids())) % seq.block_size == 0 @@ -232,21 +240,19 @@ def _is_last_block_full(self, seq: Sequence) -> bool: def _is_last_block(self, seq: Sequence, index: int) -> bool: return index == len(seq.logical_token_blocks) - 1 - def _is_block_full(self, seq: Sequence, index: int) -> bool: - return not self._is_last_block(seq, - index) or self._is_last_block_full(seq) - - def _maybe_promote_last_block(self, seq: Sequence, - last_block: PhysicalTokenBlock) -> None: + def _maybe_promote_last_block( + self, seq: Sequence, + last_block: PhysicalTokenBlock) -> PhysicalTokenBlock: if self._is_last_block_full(seq): - self._promote_last_block(seq, last_block) + return self._promote_last_block(seq, last_block) + else: + return last_block def _allocate_last_physical_block(self, seq: Sequence, prefix_len: int) -> PhysicalTokenBlock: block_hash: Optional[int] = None - logical_idx = len(seq.logical_token_blocks) - 1 - if (self._is_block_full(seq, logical_idx)): - block_hash = seq.hash(logical_idx) + if (self._is_last_block_full(seq)): + block_hash = seq.hash(len(seq.logical_token_blocks) - 1) new_block = self.gpu_allocator.allocate(block_hash, prefix_len=prefix_len) if block_hash is None: @@ -281,7 +287,8 @@ def append_slot(self, seq: Sequence, if last_block.ref_count == 1: # Not shared with other sequences. Appendable. # If the last block is now complete, promote it to a full block so that it can be shared - self._maybe_promote_last_block(seq, last_block) + new_block = self._maybe_promote_last_block(seq, last_block) + block_table[-1] = new_block return None else: # The last block is shared with other sequences. 
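The promotion logic above only works because the content hash used throughout this series covers every token from the start of the sequence through the end of the given block, so two sequences that share a prompt produce identical hashes for their full prompt blocks. A small standalone illustration with a toy block size and made-up token ids (not the real vLLM code):

BLOCK_SIZE = 4


def block_hash(token_ids, logical_idx, block_size=BLOCK_SIZE):
    # Hash the token prefix that ends at the last token of this block,
    # mirroring Sequence.hash in this series.
    num_tokens = (logical_idx + 1) * block_size
    return hash(tuple(token_ids[:num_tokens]))


shared_prompt = [1, 2, 3, 4, 5, 6, 7, 8]
seq_a = shared_prompt + [100]  # same prompt, different generated tokens
seq_b = shared_prompt + [200]

# Full prompt blocks hash identically, so they can resolve to one physical
# block in the allocator's table.
assert block_hash(seq_a, 0) == block_hash(seq_b, 0)
assert block_hash(seq_a, 1) == block_hash(seq_b, 1)

Until the last block fills up it is keyed by a placeholder id rather than a content hash, which is why the promotion step either re-keys it under its real hash or, if an identical full block is already cached, frees it and shares the cached copy instead.
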
From 0f8547423565a72e470f7355bb7e223701765e1b Mon Sep 17 00:00:00 2001 From: ElizaWszola Date: Tue, 13 Feb 2024 07:55:12 -0500 Subject: [PATCH 30/79] remove min from sequence hash --- vllm/sequence.py | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/vllm/sequence.py b/vllm/sequence.py index f9bb3eb24fc93..2c72be85fc520 100644 --- a/vllm/sequence.py +++ b/vllm/sequence.py @@ -143,9 +143,7 @@ def lora_int_id(self) -> int: def hash(self, logical_idx: int) -> int: # Compute the number of tokens in the sequence num_tokens = logical_idx * self.block_size + self.block_size - return hash( - tuple(self.data.get_token_ids() - [0:min(num_tokens, len(self.data.get_token_ids()))])) + return hash(tuple(self.data.get_token_ids()[0:num_tokens])) def _append_logical_block(self) -> None: block = LogicalTokenBlock( From 9672b20fe6c9a7fc46c9e8eeb7f3f2f9b07570ee Mon Sep 17 00:00:00 2001 From: Sage Moore Date: Tue, 13 Feb 2024 09:43:26 -0500 Subject: [PATCH 31/79] Remove prefix.py --- vllm/prefix.py | 88 -------------------------------------------------- 1 file changed, 88 deletions(-) delete mode 100644 vllm/prefix.py diff --git a/vllm/prefix.py b/vllm/prefix.py deleted file mode 100644 index 4b780161a5278..0000000000000 --- a/vllm/prefix.py +++ /dev/null @@ -1,88 +0,0 @@ -from typing import Dict, List, Sequence, Tuple, Optional - -from vllm.block import BlockTable - - -class Prefix: - """Data and states associated with a prefix of prompt tokens for multiple - sequence groups. - - NOTE: This feature is experimental and may be replaced with automatic - prefix caching in the future. - - Args: - token_ids: The token ids of the prefix. - block_size: The block size of the executed model. - """ - - def __init__( - self, - token_ids: Sequence[int], - block_size: int, - ) -> None: - self.token_ids = tuple(token_ids) - self.block_size = block_size - self.length = len(token_ids) - self.hash = hash(token_ids) - assert self.length % block_size == 0 - self.block_table: Optional[BlockTable] = None - self.computed = False - - @property - def allocated(self) -> bool: - return self.block_table is not None - - def get_num_blocks(self) -> int: - return self.length // self.block_size - - def get_block_numbers(self) -> List[int]: - return [block.block_number for block in self.block_table] - - def get_length(self) -> int: - return self.length - - def __hash__(self) -> int: - return self.hash - - def set_block_table(self, block_table: BlockTable) -> None: - self.block_table = block_table.copy() - - -class PrefixPool: - """Manages all the prompt prefixes. - - NOTE: This feature is experimental and may be replaced with automatic - prefix caching in the future. - - Args: - block_size: The block size of the executed model. - - Attributes: - prefixes: A list of all the prefixes. - block_size: The block size of the executed model. - """ - - def __init__( - self, - block_size: int, - ) -> None: - # TODO(zhuohan): Add a capacity limit to the prefix pool. - self.prefixes: Dict[int, Prefix] = {} - self.block_size = block_size - - def _truncate_token_ids(self, token_ids: Sequence[int]) -> Tuple[int]: - new_length = len(token_ids) // self.block_size * self.block_size - return tuple(token_ids[:new_length]) - - # TODO clean this up? It's not used anywhere now - def add_or_get_prefix(self, token_ids: Sequence[int], - lora_int_id: int) -> Optional[Prefix]: - token_ids = self._truncate_token_ids(token_ids) - if len(token_ids) == 0: - # Prefix is empty. 
- return None - prefix = Prefix(token_ids, self.block_size) - prefix_hash = hash((prefix, lora_int_id)) - if prefix_hash not in self.prefixes: - self.prefixes[prefix_hash] = prefix - return self.prefixes[prefix_hash] From 6044c2b39462c4ffaecdf37420ade26439ad9fdc Mon Sep 17 00:00:00 2001 From: Sage Moore Date: Tue, 13 Feb 2024 09:56:20 -0500 Subject: [PATCH 32/79] misc formatting --- vllm/core/block_manager.py | 45 +++++++++++++++++++++++++++----------- vllm/core/scheduler.py | 26 ++++++++++++++++------ 2 files changed, 51 insertions(+), 20 deletions(-) diff --git a/vllm/core/block_manager.py b/vllm/core/block_manager.py index ef6e47cb89f44..1ecb7127232d8 100644 --- a/vllm/core/block_manager.py +++ b/vllm/core/block_manager.py @@ -196,7 +196,6 @@ def allocate(self, seq_group: SequenceGroup) -> None: seq = seq_group.get_seqs(status=SequenceStatus.WAITING)[0] # Allocate new physical token blocks that will store the prompt tokens. - num_prompt_blocks = len(seq.logical_token_blocks) block_table: BlockTable = [] @@ -221,8 +220,10 @@ def can_append_slot(self, seq_group: SequenceGroup) -> bool: return num_seqs <= num_free_gpu_blocks def _promote_last_block( - self, seq: Sequence, - last_block: PhysicalTokenBlock) -> PhysicalTokenBlock: + self, + seq: Sequence, + last_block: PhysicalTokenBlock, + ) -> PhysicalTokenBlock: # Compute a new hash for the block so that it can be shared by other Sequences new_hash = seq.hash(len(seq.logical_token_blocks) - 1) @@ -234,22 +235,34 @@ def _promote_last_block( self.gpu_allocator.update_hash(new_hash, last_block) return last_block - def _is_last_block_full(self, seq: Sequence) -> bool: + def _is_last_block_full( + self, + seq: Sequence, + ) -> bool: return (len(seq.data.get_token_ids())) % seq.block_size == 0 - def _is_last_block(self, seq: Sequence, index: int) -> bool: + def _is_last_block( + self, + seq: Sequence, + index: int, + ) -> bool: return index == len(seq.logical_token_blocks) - 1 def _maybe_promote_last_block( - self, seq: Sequence, - last_block: PhysicalTokenBlock) -> PhysicalTokenBlock: + self, + seq: Sequence, + last_block: PhysicalTokenBlock, + ) -> PhysicalTokenBlock: if self._is_last_block_full(seq): return self._promote_last_block(seq, last_block) else: return last_block - def _allocate_last_physical_block(self, seq: Sequence, - prefix_len: int) -> PhysicalTokenBlock: + def _allocate_last_physical_block( + self, + seq: Sequence, + prefix_len: int, + ) -> PhysicalTokenBlock: block_hash: Optional[int] = None if (self._is_last_block_full(seq)): block_hash = seq.hash(len(seq.logical_token_blocks) - 1) @@ -259,8 +272,11 @@ def _allocate_last_physical_block(self, seq: Sequence, assert (new_block.ref_count == 1) return new_block - def append_slot(self, seq: Sequence, - prefix_len: int) -> Optional[Tuple[int, int]]: + def append_slot( + self, + seq: Sequence, + prefix_len: int, + ) -> Optional[Tuple[int, int]]: """Allocate a physical slot for a new token.""" logical_blocks = seq.logical_token_blocks block_table = self.block_tables[seq.seq_id] @@ -418,8 +434,11 @@ def get_num_free_gpu_blocks(self) -> int: def get_num_free_cpu_blocks(self) -> int: return self.cpu_allocator.get_num_free_blocks() - def access_all_blocks_in_seq(self, seq: Sequence, - access_time: float) -> None: + def access_all_blocks_in_seq( + self, + seq: Sequence, + access_time: float, + ) -> None: block_table = self.block_tables[seq.seq_id] for block in block_table: block.last_accessed = access_time diff --git a/vllm/core/scheduler.py b/vllm/core/scheduler.py index 
cd08940c49a0b..b12457afa85b9 100644 --- a/vllm/core/scheduler.py +++ b/vllm/core/scheduler.py @@ -400,8 +400,11 @@ def _allocate(self, seq_group: SequenceGroup) -> None: for seq in seq_group.get_seqs(status=SequenceStatus.WAITING): seq.status = SequenceStatus.RUNNING - def _append_slot(self, seq_group: SequenceGroup, - blocks_to_copy: Dict[int, List[int]]) -> None: + def _append_slot( + self, + seq_group: SequenceGroup, + blocks_to_copy: Dict[int, List[int]], + ) -> None: for seq in seq_group.get_seqs(status=SequenceStatus.RUNNING): ret = self.block_manager.append_slot(seq, seq_group.get_prefix_len()) @@ -441,7 +444,10 @@ def _preempt( else: raise AssertionError("Invalid preemption mode.") - def _preempt_by_recompute(self, seq_group: SequenceGroup) -> None: + def _preempt_by_recompute( + self, + seq_group: SequenceGroup, + ) -> None: seqs = seq_group.get_seqs(status=SequenceStatus.RUNNING) assert len(seqs) == 1 for seq in seqs: @@ -459,15 +465,21 @@ def _preempt_by_swap( self._swap_out(seq_group, blocks_to_swap_out) self.swapped.append(seq_group) - def _swap_in(self, seq_group: SequenceGroup, - blocks_to_swap_in: Dict[int, int]) -> None: + def _swap_in( + self, + seq_group: SequenceGroup, + blocks_to_swap_in: Dict[int, int], + ) -> None: mapping = self.block_manager.swap_in(seq_group) blocks_to_swap_in.update(mapping) for seq in seq_group.get_seqs(status=SequenceStatus.SWAPPED): seq.status = SequenceStatus.RUNNING - def _swap_out(self, seq_group: SequenceGroup, - blocks_to_swap_out: Dict[int, int]) -> None: + def _swap_out( + self, + seq_group: SequenceGroup, + blocks_to_swap_out: Dict[int, int], + ) -> None: if not self.block_manager.can_swap_out(seq_group): # FIXME(woosuk): Abort the sequence group instead of aborting the # entire engine. From 9f7ae9f1ab3f973652f273a463702e8437d823f1 Mon Sep 17 00:00:00 2001 From: ElizaWszola Date: Tue, 13 Feb 2024 12:09:40 -0500 Subject: [PATCH 33/79] bring back free table --- vllm/core/block_manager.py | 39 ++++++++++++++++++++++++++------------ 1 file changed, 27 insertions(+), 12 deletions(-) diff --git a/vllm/core/block_manager.py b/vllm/core/block_manager.py index 1ecb7127232d8..ba8415d110c79 100644 --- a/vllm/core/block_manager.py +++ b/vllm/core/block_manager.py @@ -13,38 +13,41 @@ class EvictionPolicy(enum.Enum): LRU = enum.auto() -def lru_eviction(table: Dict[int, PhysicalTokenBlock]) -> PhysicalTokenBlock: - all_blocks: List[PhysicalTokenBlock] = list(table.values()) - assert (len(all_blocks) > 0) +def lru_eviction(free_table: Dict[int, PhysicalTokenBlock]) -> PhysicalTokenBlock: + free_blocks: List[PhysicalTokenBlock] = list(free_table.values()) + if len(free_blocks) == 0: + raise ValueError("No usable cache memory left") # Find lowest timestamp lowest_timestamp = monotonic() - for block in all_blocks: - if block.ref_count == 0 and block.last_accessed < lowest_timestamp: + for block in free_blocks: + if block.last_accessed < lowest_timestamp: lowest_timestamp = block.last_accessed # Find all blocks with the lowest timestamp least_recent: List[PhysicalTokenBlock] = [] - for block in all_blocks: - if block.ref_count == 0 and block.last_accessed == lowest_timestamp: + for block in free_blocks: + if block.last_accessed == lowest_timestamp: least_recent.append(block) # Find highest prefix count per block highest_prefix_count = 0 for block in least_recent: - if block.ref_count == 0 and block.prefix_len > highest_prefix_count: + if block.prefix_len > highest_prefix_count: highest_prefix_count = block.prefix_len # Find all blocks with the lowest 
timestamp eviction_candidates: List[PhysicalTokenBlock] = [] for block in least_recent: - if block.ref_count == 0 and block.prefix_len == highest_prefix_count: + if block.prefix_len == highest_prefix_count: eviction_candidates.append(block) # Arbitrarily evict the first candidate - assert (len(eviction_candidates) > 0) + if len(eviction_candidates) == 0: + raise ValueError("No usable cache memory left") + evicted_block = eviction_candidates[0] - del table[evicted_block.block_hash] + del free_table[evicted_block.block_hash] return evicted_block @@ -70,10 +73,11 @@ def __init__(self, self.current_num_blocks = 0 self.table: Dict[int, PhysicalTokenBlock] = {} + self.free_table: Dict[int, PhysicalTokenBlock] = {} def evict(self) -> PhysicalTokenBlock: if self.eviction_policy == EvictionPolicy.LRU: - return lru_eviction(self.table) + return lru_eviction(self.free_table) else: raise ValueError( f"Unknown cache eviction policy: {self.eviction_policy}") @@ -83,6 +87,7 @@ def allocate_block(self, block_hash: int, if self.current_num_blocks == self.num_blocks: block = self.evict() block.block_hash = block_hash + block.prefix_len = prefix_len return block block = PhysicalTokenBlock(device=self.device, block_number=self.current_num_blocks, @@ -97,6 +102,13 @@ def allocate(self, prefix_len: int = 0) -> PhysicalTokenBlock: if block_hash is None: block_hash = monotonic() + if block_hash in self.free_table: + assert block_hash not in self.table + block = self.free_table[block_hash] + self.table[block_hash] = block + block.ref_count += 1 + del self.free_table[block_hash] + return block if block_hash not in self.table: self.table[block_hash] = self.allocate_block( block_hash, prefix_len) @@ -108,6 +120,9 @@ def free(self, block: PhysicalTokenBlock) -> None: if block.ref_count == 0: raise ValueError(f"Double free! {block} is already freed.") block.ref_count -= 1 + if block.ref_count == 0: + self.free_table[block.block_hash] = block + del self.table[block.block_hash] # TODO: Should this account for the number of blocks with a ref count of 0? 
def get_num_free_blocks(self) -> int: From 66551303da18d5e4ac3d5d92e3bd1cea0680c9ac Mon Sep 17 00:00:00 2001 From: ElizaWszola Date: Tue, 13 Feb 2024 12:10:13 -0500 Subject: [PATCH 34/79] format --- vllm/core/block_manager.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/vllm/core/block_manager.py b/vllm/core/block_manager.py index ba8415d110c79..9db130949f2ad 100644 --- a/vllm/core/block_manager.py +++ b/vllm/core/block_manager.py @@ -13,7 +13,8 @@ class EvictionPolicy(enum.Enum): LRU = enum.auto() -def lru_eviction(free_table: Dict[int, PhysicalTokenBlock]) -> PhysicalTokenBlock: +def lru_eviction( + free_table: Dict[int, PhysicalTokenBlock]) -> PhysicalTokenBlock: free_blocks: List[PhysicalTokenBlock] = list(free_table.values()) if len(free_blocks) == 0: raise ValueError("No usable cache memory left") From 1d6f0a03be761e9aed5a2c6f1d8870b101db8ec1 Mon Sep 17 00:00:00 2001 From: Sage Moore Date: Tue, 13 Feb 2024 14:41:17 -0500 Subject: [PATCH 35/79] update get_num_free_blocks to account for blocks in free table --- vllm/core/block_manager.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/vllm/core/block_manager.py b/vllm/core/block_manager.py index 9db130949f2ad..883a88fddb2e7 100644 --- a/vllm/core/block_manager.py +++ b/vllm/core/block_manager.py @@ -125,9 +125,8 @@ def free(self, block: PhysicalTokenBlock) -> None: self.free_table[block.block_hash] = block del self.table[block.block_hash] - # TODO: Should this account for the number of blocks with a ref count of 0? def get_num_free_blocks(self) -> int: - return self.num_blocks - self.current_num_blocks + return self.num_blocks - self.current_num_blocks + len(self.free_table) def contains_block(self, block_hash: int) -> bool: return block_hash in self.table From 0ca5c43685b287652ecece544faa5518cf368e81 Mon Sep 17 00:00:00 2001 From: Sage Moore Date: Tue, 13 Feb 2024 15:02:55 -0500 Subject: [PATCH 36/79] add some more asserts to BlockAllocator --- vllm/core/block_manager.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/vllm/core/block_manager.py b/vllm/core/block_manager.py index 883a88fddb2e7..0f4f33e42ebee 100644 --- a/vllm/core/block_manager.py +++ b/vllm/core/block_manager.py @@ -106,6 +106,7 @@ def allocate(self, if block_hash in self.free_table: assert block_hash not in self.table block = self.free_table[block_hash] + assert block.ref_count == 0 self.table[block_hash] = block block.ref_count += 1 del self.free_table[block_hash] @@ -122,6 +123,7 @@ def free(self, block: PhysicalTokenBlock) -> None: raise ValueError(f"Double free! 
{block} is already freed.") block.ref_count -= 1 if block.ref_count == 0: + assert block.block_hash not in self.free_table self.free_table[block.block_hash] = block del self.table[block.block_hash] From 7d6444d4125f47c97d0633d4539fa72181e51bf0 Mon Sep 17 00:00:00 2001 From: ElizaWszola Date: Tue, 13 Feb 2024 21:49:57 -0500 Subject: [PATCH 37/79] contains_block() now looks at both table and free_table + a couple asserts --- vllm/core/block_manager.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/vllm/core/block_manager.py b/vllm/core/block_manager.py index 0f4f33e42ebee..88d3ca96758a6 100644 --- a/vllm/core/block_manager.py +++ b/vllm/core/block_manager.py @@ -110,11 +110,13 @@ def allocate(self, self.table[block_hash] = block block.ref_count += 1 del self.free_table[block_hash] + assert block.block_hash == block_hash return block if block_hash not in self.table: self.table[block_hash] = self.allocate_block( block_hash, prefix_len) block = self.table[block_hash] + assert block.block_hash == block_hash block.ref_count += 1 return block @@ -131,7 +133,7 @@ def get_num_free_blocks(self) -> int: return self.num_blocks - self.current_num_blocks + len(self.free_table) def contains_block(self, block_hash: int) -> bool: - return block_hash in self.table + return block_hash in self.table or block_hash in self.free_table def update_hash(self, block_hash: int, block: PhysicalTokenBlock): assert (not self.contains_block(block_hash)) From 47754236324221ee0e57eb2bdd9fccdcf270194b Mon Sep 17 00:00:00 2001 From: Sage Moore Date: Wed, 14 Feb 2024 16:58:45 -0500 Subject: [PATCH 38/79] updated semantics of prefix length in block --- vllm/core/block_manager.py | 10 +++++++--- vllm/sequence.py | 7 +++++++ 2 files changed, 14 insertions(+), 3 deletions(-) diff --git a/vllm/core/block_manager.py b/vllm/core/block_manager.py index 88d3ca96758a6..225019ed87880 100644 --- a/vllm/core/block_manager.py +++ b/vllm/core/block_manager.py @@ -223,8 +223,10 @@ def allocate(self, seq_group: SequenceGroup) -> None: and logical_idx >= self.block_sliding_window): block = block_table[logical_idx % self.block_sliding_window] else: - block = self.gpu_allocator.allocate(seq.hash(logical_idx), - seq_group.get_prefix_len()) + block = self.gpu_allocator.allocate( + seq.hash(logical_idx), + seq.prefix_len_of_block(logical_idx, + seq_group.get_prefix_len())) block_table.append(block) # Assign the block table for each sequence. 
@@ -285,8 +287,10 @@ def _allocate_last_physical_block( block_hash: Optional[int] = None if (self._is_last_block_full(seq)): block_hash = seq.hash(len(seq.logical_token_blocks) - 1) + block_prefix_len = seq.prefix_len_of_block( + len(seq.logical_token_blocks) - 1, prefix_len) new_block = self.gpu_allocator.allocate(block_hash, - prefix_len=prefix_len) + prefix_len=block_prefix_len) if block_hash is None: assert (new_block.ref_count == 1) return new_block diff --git a/vllm/sequence.py b/vllm/sequence.py index 2c72be85fc520..8c4dc0f72a529 100644 --- a/vllm/sequence.py +++ b/vllm/sequence.py @@ -145,6 +145,13 @@ def hash(self, logical_idx: int) -> int: num_tokens = logical_idx * self.block_size + self.block_size return hash(tuple(self.data.get_token_ids()[0:num_tokens])) + def prefix_len_of_block(self, logical_idx: int, full_prefix_len: int): + num_tokens = logical_idx * self.block_size + self.block_size + if num_tokens > full_prefix_len: + return full_prefix_len + else: + return num_tokens + def _append_logical_block(self) -> None: block = LogicalTokenBlock( block_number=len(self.logical_token_blocks), From 5cfee5fd7435952fa9ad75abf60eb2a8ed8496f2 Mon Sep 17 00:00:00 2001 From: ElizaWszola Date: Thu, 15 Feb 2024 03:01:03 -0500 Subject: [PATCH 39/79] bring back prefix block tables --- vllm/core/block_manager.py | 8 ++++++++ vllm/core/scheduler.py | 2 ++ vllm/sequence.py | 4 ++++ vllm/worker/model_runner.py | 8 +++++++- 4 files changed, 21 insertions(+), 1 deletion(-) diff --git a/vllm/core/block_manager.py b/vllm/core/block_manager.py index 88d3ca96758a6..16a5d7f9d91ae 100644 --- a/vllm/core/block_manager.py +++ b/vllm/core/block_manager.py @@ -227,6 +227,14 @@ def allocate(self, seq_group: SequenceGroup) -> None: seq_group.get_prefix_len()) block_table.append(block) + #TODO add block ref_counts for each block in prefix? + if seq_group.prefix_pos is not None and seq_group.prefix_pos > 0 and seq_group.prefix_block_nums is None: + num_prefix_blocks = seq_group.prefix_pos // self.block_size + prefix_block_table = block_table[:num_prefix_blocks] + seq_group.prefix_block_nums = [ + block.block_number for block in prefix_block_table + ] + # Assign the block table for each sequence. 
for seq in seq_group.get_seqs(status=SequenceStatus.WAITING): self.block_tables[seq.seq_id] = block_table.copy() diff --git a/vllm/core/scheduler.py b/vllm/core/scheduler.py index b12457afa85b9..75b1bd8b5a64c 100644 --- a/vllm/core/scheduler.py +++ b/vllm/core/scheduler.py @@ -381,6 +381,8 @@ def schedule(self) -> Tuple[List[SequenceGroupMetadata], SchedulerOutputs]: sampling_params=seq_group.sampling_params, block_tables=block_tables, lora_request=seq_group.lora_request, + prefix_pos=seq_group.prefix_pos, + prefix_block_nums=seq_group.prefix_block_nums, ) seq_group_metadata_list.append(seq_group_metadata) return seq_group_metadata_list, scheduler_outputs diff --git a/vllm/sequence.py b/vllm/sequence.py index 2c72be85fc520..1d15d29cd30af 100644 --- a/vllm/sequence.py +++ b/vllm/sequence.py @@ -252,6 +252,7 @@ def __init__( arrival_time: float, lora_request: Optional[LoRARequest] = None, prefix_pos: Optional[int] = None, + prefix_block_nums: Optional[List[int]] = None, ) -> None: self.request_id = request_id self.seqs_dict = {seq.seq_id: seq for seq in seqs} @@ -261,6 +262,7 @@ def __init__( self.lora_request = lora_request self.prefix_pos: Optional[int] = prefix_pos self.prompt_logprobs: Optional[PromptLogprobs] = None + self.prefix_block_nums = prefix_block_nums @property def prompt(self) -> str: @@ -379,6 +381,7 @@ def __init__( block_tables: Dict[int, List[int]], lora_request: Optional[LoRARequest] = None, prefix_pos: Optional[int] = None, + prefix_block_nums: Optional[List[int]] = None, ) -> None: self.request_id = request_id self.is_prompt = is_prompt @@ -387,6 +390,7 @@ def __init__( self.block_tables = block_tables self.lora_request = lora_request self.prefix_pos = prefix_pos + self.prefix_block_nums = prefix_block_nums @property def lora_int_id(self) -> int: diff --git a/vllm/worker/model_runner.py b/vllm/worker/model_runner.py index 5908d577e1a28..ef7b8d1e8f2f6 100644 --- a/vllm/worker/model_runner.py +++ b/vllm/worker/model_runner.py @@ -125,7 +125,13 @@ def _prepare_prompt( prompt_len = len(prompt_tokens) prompt_lens.append(prompt_len) prefix_len = 0 - prefix_block_tables.append([]) + prefix_block_nums = seq_group_metadata.prefix_block_nums + if prefix_block_nums is not None: + prefix_len = seq_group_metadata.prefix_pos + prompt_tokens = prompt_tokens[prefix_len:] + prefix_block_tables.append(prefix_block_nums) + else: + prefix_block_tables.append([]) # actual prompt lens context_lens.append(prefix_len) subquery_lens.append(prompt_len - prefix_len) From 492507175d937688eb0f129b770e59b616527c62 Mon Sep 17 00:00:00 2001 From: ElizaWszola Date: Thu, 15 Feb 2024 03:18:21 -0500 Subject: [PATCH 40/79] Nits (style) --- vllm/block.py | 7 ++----- vllm/core/block_manager.py | 2 +- 2 files changed, 3 insertions(+), 6 deletions(-) diff --git a/vllm/block.py b/vllm/block.py index c34591e6ad236..da05d0d3f4f4d 100644 --- a/vllm/block.py +++ b/vllm/block.py @@ -62,15 +62,12 @@ def __init__( self.device = device self.block_number = block_number self.block_size = block_size - - self.ref_count = 0 - self.block_hash = block_hash + self.prefix_len = prefix_len + self.ref_count = 0 self.last_accessed = monotonic() - self.prefix_len = prefix_len - def __repr__(self) -> str: return (f'PhysicalTokenBlock(device={self.device}, ' f'block_number={self.block_number}, ' diff --git a/vllm/core/block_manager.py b/vllm/core/block_manager.py index 66021a8fb6117..602fad22418b8 100644 --- a/vllm/core/block_manager.py +++ b/vllm/core/block_manager.py @@ -139,8 +139,8 @@ def update_hash(self, block_hash: int, block: 
PhysicalTokenBlock): assert (not self.contains_block(block_hash)) old_hash = block.block_hash del self.table[old_hash] - self.table[block_hash] = block block.block_hash = block_hash + self.table[block_hash] = block class AllocStatus(enum.Enum): From 4fba5f9c28fe6c89e2abfdf239324a8ab6a07557 Mon Sep 17 00:00:00 2001 From: ElizaWszola Date: Thu, 15 Feb 2024 07:11:56 -0500 Subject: [PATCH 41/79] delete comment --- vllm/core/block_manager.py | 1 - 1 file changed, 1 deletion(-) diff --git a/vllm/core/block_manager.py b/vllm/core/block_manager.py index 602fad22418b8..90d74e95ae5ac 100644 --- a/vllm/core/block_manager.py +++ b/vllm/core/block_manager.py @@ -229,7 +229,6 @@ def allocate(self, seq_group: SequenceGroup) -> None: seq_group.get_prefix_len())) block_table.append(block) - #TODO add block ref_counts for each block in prefix? if seq_group.prefix_pos is not None and seq_group.prefix_pos > 0 and seq_group.prefix_block_nums is None: num_prefix_blocks = seq_group.prefix_pos // self.block_size prefix_block_table = block_table[:num_prefix_blocks] From 46c62e4789f613002da576936327f4ab2cd09daf Mon Sep 17 00:00:00 2001 From: Sage Moore Date: Thu, 15 Feb 2024 10:43:44 -0500 Subject: [PATCH 42/79] Added computed_block_nums --- vllm/block.py | 3 +++ vllm/core/block_manager.py | 38 ++++++++++++++++++++++++++++++------- vllm/core/scheduler.py | 6 +++++- vllm/engine/llm_engine.py | 4 ++++ vllm/sequence.py | 6 ++---- vllm/worker/model_runner.py | 9 ++++++--- 6 files changed, 51 insertions(+), 15 deletions(-) diff --git a/vllm/block.py b/vllm/block.py index da05d0d3f4f4d..5be24a6b4f88e 100644 --- a/vllm/block.py +++ b/vllm/block.py @@ -68,6 +68,9 @@ def __init__( self.ref_count = 0 self.last_accessed = monotonic() + self.computed = False + + # TODO: update this def __repr__(self) -> str: return (f'PhysicalTokenBlock(device={self.device}, ' f'block_number={self.block_number}, ' diff --git a/vllm/core/block_manager.py b/vllm/core/block_manager.py index 90d74e95ae5ac..f5bbeb0cffbc3 100644 --- a/vllm/core/block_manager.py +++ b/vllm/core/block_manager.py @@ -50,6 +50,7 @@ def lru_eviction( evicted_block = eviction_candidates[0] del free_table[evicted_block.block_hash] + evicted_block.computed = False return evicted_block @@ -229,13 +230,6 @@ def allocate(self, seq_group: SequenceGroup) -> None: seq_group.get_prefix_len())) block_table.append(block) - if seq_group.prefix_pos is not None and seq_group.prefix_pos > 0 and seq_group.prefix_block_nums is None: - num_prefix_blocks = seq_group.prefix_pos // self.block_size - prefix_block_table = block_table[:num_prefix_blocks] - seq_group.prefix_block_nums = [ - block.block_number for block in prefix_block_table - ] - # Assign the block table for each sequence. 
for seq in seq_group.get_seqs(status=SequenceStatus.WAITING): self.block_tables[seq.seq_id] = block_table.copy() @@ -472,3 +466,33 @@ def access_all_blocks_in_seq( block_table = self.block_tables[seq.seq_id] for block in block_table: block.last_accessed = access_time + + def compute_all_blocks_in_seq(self, seq: Sequence): + if seq.seq_id not in self.block_tables: + return + block_table = self.block_tables[seq.seq_id] + for block in block_table: + block.computed = True + + def get_all_computed_block_ids_2(self, seq: Sequence): + block_ids: List[int] = [] + if seq.seq_id not in self.block_tables: + return block_ids + block_table = self.block_tables[seq.seq_id] + # We want to get the first n contiguous completed blocks + for block in block_table: + if block.computed: + block_ids.append(block.block_number) + else: + return block_ids + return block_ids + + def get_all_computed_block_ids(self, + seq_group: SequenceGroup) -> List[int]: + + return self.get_all_computed_block_ids_2( + next(iter(seq_group.seqs_dict.values()))) + + def mark_blocks_as_computed(self, seq_group: SequenceGroup): + for seq in seq_group.seqs_dict.values(): + self.compute_all_blocks_in_seq(seq) diff --git a/vllm/core/scheduler.py b/vllm/core/scheduler.py index 75b1bd8b5a64c..ba80ee204c010 100644 --- a/vllm/core/scheduler.py +++ b/vllm/core/scheduler.py @@ -382,7 +382,8 @@ def schedule(self) -> Tuple[List[SequenceGroupMetadata], SchedulerOutputs]: block_tables=block_tables, lora_request=seq_group.lora_request, prefix_pos=seq_group.prefix_pos, - prefix_block_nums=seq_group.prefix_block_nums, + computed_block_nums=self.block_manager. + get_all_computed_block_ids(seq_group), ) seq_group_metadata_list.append(seq_group_metadata) return seq_group_metadata_list, scheduler_outputs @@ -492,3 +493,6 @@ def _swap_out( blocks_to_swap_out.update(mapping) for seq in seq_group.get_seqs(status=SequenceStatus.RUNNING): seq.status = SequenceStatus.SWAPPED + + def mark_blocks_as_computed(self, seq_group: SequenceGroup): + self.block_manager.mark_blocks_as_computed(seq_group) diff --git a/vllm/engine/llm_engine.py b/vllm/engine/llm_engine.py index 5317874827357..a3e672e067bf6 100644 --- a/vllm/engine/llm_engine.py +++ b/vllm/engine/llm_engine.py @@ -712,6 +712,10 @@ def _process_model_outputs( scheduler_outputs: SchedulerOutputs) -> List[RequestOutput]: # Update the scheduled sequence groups with the model outputs. 
scheduled_seq_groups = scheduler_outputs.scheduled_seq_groups + + for seq_group in scheduled_seq_groups: + self.scheduler.mark_blocks_as_computed(seq_group) + for seq_group, outputs in zip(scheduled_seq_groups, output): self._process_sequence_group_outputs(seq_group, outputs) diff --git a/vllm/sequence.py b/vllm/sequence.py index c03e9fde816ff..bc39f317b0277 100644 --- a/vllm/sequence.py +++ b/vllm/sequence.py @@ -259,7 +259,6 @@ def __init__( arrival_time: float, lora_request: Optional[LoRARequest] = None, prefix_pos: Optional[int] = None, - prefix_block_nums: Optional[List[int]] = None, ) -> None: self.request_id = request_id self.seqs_dict = {seq.seq_id: seq for seq in seqs} @@ -269,7 +268,6 @@ def __init__( self.lora_request = lora_request self.prefix_pos: Optional[int] = prefix_pos self.prompt_logprobs: Optional[PromptLogprobs] = None - self.prefix_block_nums = prefix_block_nums @property def prompt(self) -> str: @@ -388,7 +386,7 @@ def __init__( block_tables: Dict[int, List[int]], lora_request: Optional[LoRARequest] = None, prefix_pos: Optional[int] = None, - prefix_block_nums: Optional[List[int]] = None, + computed_block_nums: Optional[List[int]] = None, ) -> None: self.request_id = request_id self.is_prompt = is_prompt @@ -397,7 +395,7 @@ def __init__( self.block_tables = block_tables self.lora_request = lora_request self.prefix_pos = prefix_pos - self.prefix_block_nums = prefix_block_nums + self.computed_block_nums = computed_block_nums @property def lora_int_id(self) -> int: diff --git a/vllm/worker/model_runner.py b/vllm/worker/model_runner.py index ef7b8d1e8f2f6..382b9feb6b85c 100644 --- a/vllm/worker/model_runner.py +++ b/vllm/worker/model_runner.py @@ -125,11 +125,14 @@ def _prepare_prompt( prompt_len = len(prompt_tokens) prompt_lens.append(prompt_len) prefix_len = 0 - prefix_block_nums = seq_group_metadata.prefix_block_nums - if prefix_block_nums is not None: + + # NOTE: This only works for oooooooxxx style attention. 
+ computed_block_nums = seq_group_metadata.computed_block_nums + if computed_block_nums is not None and len( + computed_block_nums) > 0: prefix_len = seq_group_metadata.prefix_pos prompt_tokens = prompt_tokens[prefix_len:] - prefix_block_tables.append(prefix_block_nums) + prefix_block_tables.append(computed_block_nums) else: prefix_block_tables.append([]) # actual prompt lens From 38b34d82031e2b904a0c7aae64f3195ccbe20e19 Mon Sep 17 00:00:00 2001 From: ElizaWszola Date: Thu, 15 Feb 2024 11:17:34 -0500 Subject: [PATCH 43/79] pythonize get_all_computed_block_ids --- vllm/core/block_manager.py | 25 +++++++++++++------------ 1 file changed, 13 insertions(+), 12 deletions(-) diff --git a/vllm/core/block_manager.py b/vllm/core/block_manager.py index f5bbeb0cffbc3..5f278fdfa5953 100644 --- a/vllm/core/block_manager.py +++ b/vllm/core/block_manager.py @@ -1,5 +1,7 @@ """A block manager that manages token blocks.""" import enum +from itertools import takewhile +from os.path import commonprefix from time import monotonic from typing import Dict, List, Optional, Set, Tuple @@ -474,24 +476,23 @@ def compute_all_blocks_in_seq(self, seq: Sequence): for block in block_table: block.computed = True - def get_all_computed_block_ids_2(self, seq: Sequence): - block_ids: List[int] = [] + def get_all_computed_block_ids_seq(self, seq: Sequence) -> List[int]: if seq.seq_id not in self.block_tables: - return block_ids + return [] block_table = self.block_tables[seq.seq_id] # We want to get the first n contiguous completed blocks - for block in block_table: - if block.computed: - block_ids.append(block.block_number) - else: - return block_ids - return block_ids + return [ + block.block_number + for block in takewhile(lambda block: block.computed, block_table) + ] def get_all_computed_block_ids(self, seq_group: SequenceGroup) -> List[int]: - - return self.get_all_computed_block_ids_2( - next(iter(seq_group.seqs_dict.values()))) + ids_list = [ + self.get_all_computed_block_ids_seq(seq) + for seq in iter(seq_group.seqs_dict.values()) + ] + return commonprefix([ids for ids in ids_list if ids != []]) def mark_blocks_as_computed(self, seq_group: SequenceGroup): for seq in seq_group.seqs_dict.values(): From ba97f8026c827f626d7247fb3c458c9b75c5f4e5 Mon Sep 17 00:00:00 2001 From: Sage Moore Date: Thu, 15 Feb 2024 11:47:17 -0500 Subject: [PATCH 44/79] account for prefix_len=0 in _prepare_prompt --- vllm/sequence.py | 3 +++ vllm/worker/model_runner.py | 2 +- 2 files changed, 4 insertions(+), 1 deletion(-) diff --git a/vllm/sequence.py b/vllm/sequence.py index bc39f317b0277..2ec3d974fa087 100644 --- a/vllm/sequence.py +++ b/vllm/sequence.py @@ -397,6 +397,9 @@ def __init__( self.prefix_pos = prefix_pos self.computed_block_nums = computed_block_nums + def get_prefix_len(self) -> int: + return self.prefix_pos if self.prefix_pos is not None else 0 + @property def lora_int_id(self) -> int: return self.lora_request.lora_int_id if self.lora_request else 0 diff --git a/vllm/worker/model_runner.py b/vllm/worker/model_runner.py index 382b9feb6b85c..069c05a7c6402 100644 --- a/vllm/worker/model_runner.py +++ b/vllm/worker/model_runner.py @@ -130,7 +130,7 @@ def _prepare_prompt( computed_block_nums = seq_group_metadata.computed_block_nums if computed_block_nums is not None and len( computed_block_nums) > 0: - prefix_len = seq_group_metadata.prefix_pos + prefix_len = seq_group_metadata.get_prefix_len() prompt_tokens = prompt_tokens[prefix_len:] prefix_block_tables.append(computed_block_nums) else: From 
fe37722b0825fcbb20883eded201a609207ec9c5 Mon Sep 17 00:00:00 2001 From: Sage Moore Date: Thu, 15 Feb 2024 13:11:14 -0500 Subject: [PATCH 45/79] attempt to fix build --- vllm/block.py | 5 ++++- vllm/core/block_manager.py | 3 ++- vllm/core/scheduler.py | 2 +- 3 files changed, 7 insertions(+), 3 deletions(-) diff --git a/vllm/block.py b/vllm/block.py index 5be24a6b4f88e..e5f16e1bf611e 100644 --- a/vllm/block.py +++ b/vllm/block.py @@ -74,7 +74,10 @@ def __init__( def __repr__(self) -> str: return (f'PhysicalTokenBlock(device={self.device}, ' f'block_number={self.block_number}, ' - f'ref_count={self.ref_count})') + f'prefix_len={self.prefix_len}, ' + f'ref_count={self.ref_count}, ' + f'last_accessed={self.last_accessed}, ' + f'computed={self.computed})') # Mapping: logical block number -> physical block. diff --git a/vllm/core/block_manager.py b/vllm/core/block_manager.py index 5f278fdfa5953..c4de36e3c668d 100644 --- a/vllm/core/block_manager.py +++ b/vllm/core/block_manager.py @@ -475,6 +475,7 @@ def compute_all_blocks_in_seq(self, seq: Sequence): block_table = self.block_tables[seq.seq_id] for block in block_table: block.computed = True + block_table[-1].computed = False def get_all_computed_block_ids_seq(self, seq: Sequence) -> List[int]: if seq.seq_id not in self.block_tables: @@ -486,7 +487,7 @@ def get_all_computed_block_ids_seq(self, seq: Sequence) -> List[int]: for block in takewhile(lambda block: block.computed, block_table) ] - def get_all_computed_block_ids(self, + def get_common_computed_block_ids(self, seq_group: SequenceGroup) -> List[int]: ids_list = [ self.get_all_computed_block_ids_seq(seq) diff --git a/vllm/core/scheduler.py b/vllm/core/scheduler.py index ba80ee204c010..acf2a59d65bb3 100644 --- a/vllm/core/scheduler.py +++ b/vllm/core/scheduler.py @@ -383,7 +383,7 @@ def schedule(self) -> Tuple[List[SequenceGroupMetadata], SchedulerOutputs]: lora_request=seq_group.lora_request, prefix_pos=seq_group.prefix_pos, computed_block_nums=self.block_manager. 
- get_all_computed_block_ids(seq_group), + get_common_computed_block_ids(seq_group), ) seq_group_metadata_list.append(seq_group_metadata) return seq_group_metadata_list, scheduler_outputs From 28f4ad26f494faefeeba58b043ce16fd6f53fb45 Mon Sep 17 00:00:00 2001 From: Sage Moore Date: Thu, 15 Feb 2024 13:12:27 -0500 Subject: [PATCH 46/79] attempt to fix build --- vllm/core/block_manager.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/vllm/core/block_manager.py b/vllm/core/block_manager.py index c4de36e3c668d..9e01f1aa43643 100644 --- a/vllm/core/block_manager.py +++ b/vllm/core/block_manager.py @@ -488,7 +488,7 @@ def get_all_computed_block_ids_seq(self, seq: Sequence) -> List[int]: ] def get_common_computed_block_ids(self, - seq_group: SequenceGroup) -> List[int]: + seq_group: SequenceGroup) -> List[int]: ids_list = [ self.get_all_computed_block_ids_seq(seq) for seq in iter(seq_group.seqs_dict.values()) From bff30a79632e9da6adb7ec02c9c73407f102709b Mon Sep 17 00:00:00 2001 From: Sage Moore Date: Thu, 15 Feb 2024 13:19:27 -0500 Subject: [PATCH 47/79] cap computed blocks to prefix length --- vllm/core/block_manager.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/vllm/core/block_manager.py b/vllm/core/block_manager.py index 9e01f1aa43643..3565ca449a5a7 100644 --- a/vllm/core/block_manager.py +++ b/vllm/core/block_manager.py @@ -493,7 +493,8 @@ def get_common_computed_block_ids(self, self.get_all_computed_block_ids_seq(seq) for seq in iter(seq_group.seqs_dict.values()) ] - return commonprefix([ids for ids in ids_list if ids != []]) + cp = commonprefix([ids for ids in ids_list if ids != []]) + return cp[:seq_group.get_prefix_len() // 16] def mark_blocks_as_computed(self, seq_group: SequenceGroup): for seq in seq_group.seqs_dict.values(): From e829c34578a4f02dd2cea0ad49c359a86ea2ba33 Mon Sep 17 00:00:00 2001 From: Sage Moore Date: Thu, 15 Feb 2024 13:28:01 -0500 Subject: [PATCH 48/79] misc fixes --- vllm/core/block_manager.py | 2 +- vllm/core/scheduler.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/vllm/core/block_manager.py b/vllm/core/block_manager.py index 3565ca449a5a7..f4865de9f42c2 100644 --- a/vllm/core/block_manager.py +++ b/vllm/core/block_manager.py @@ -494,7 +494,7 @@ def get_common_computed_block_ids(self, for seq in iter(seq_group.seqs_dict.values()) ] cp = commonprefix([ids for ids in ids_list if ids != []]) - return cp[:seq_group.get_prefix_len() // 16] + return cp[:seq_group.get_prefix_len()] def mark_blocks_as_computed(self, seq_group: SequenceGroup): for seq in seq_group.seqs_dict.values(): diff --git a/vllm/core/scheduler.py b/vllm/core/scheduler.py index acf2a59d65bb3..3225542b7fc25 100644 --- a/vllm/core/scheduler.py +++ b/vllm/core/scheduler.py @@ -381,7 +381,7 @@ def schedule(self) -> Tuple[List[SequenceGroupMetadata], SchedulerOutputs]: sampling_params=seq_group.sampling_params, block_tables=block_tables, lora_request=seq_group.lora_request, - prefix_pos=seq_group.prefix_pos, + prefix_pos=(seq_group.prefix_pos // 16) * 16, computed_block_nums=self.block_manager. 
get_common_computed_block_ids(seq_group), ) From 7f78ad4cc135965c64fe6f1c559f997762a15d2a Mon Sep 17 00:00:00 2001 From: Sage Moore Date: Thu, 15 Feb 2024 13:29:10 -0500 Subject: [PATCH 49/79] typo --- vllm/core/block_manager.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/vllm/core/block_manager.py b/vllm/core/block_manager.py index f4865de9f42c2..ddc9e7f5d7ee0 100644 --- a/vllm/core/block_manager.py +++ b/vllm/core/block_manager.py @@ -494,7 +494,7 @@ def get_common_computed_block_ids(self, for seq in iter(seq_group.seqs_dict.values()) ] cp = commonprefix([ids for ids in ids_list if ids != []]) - return cp[:seq_group.get_prefix_len()] + return cp def mark_blocks_as_computed(self, seq_group: SequenceGroup): for seq in seq_group.seqs_dict.values(): From 18da5e6de5e30395df759d2e8ecc37eeba22393d Mon Sep 17 00:00:00 2001 From: Sage Moore Date: Thu, 15 Feb 2024 13:32:22 -0500 Subject: [PATCH 50/79] account for none --- vllm/core/scheduler.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/vllm/core/scheduler.py b/vllm/core/scheduler.py index 3225542b7fc25..f708befffae24 100644 --- a/vllm/core/scheduler.py +++ b/vllm/core/scheduler.py @@ -381,7 +381,7 @@ def schedule(self) -> Tuple[List[SequenceGroupMetadata], SchedulerOutputs]: sampling_params=seq_group.sampling_params, block_tables=block_tables, lora_request=seq_group.lora_request, - prefix_pos=(seq_group.prefix_pos // 16) * 16, + prefix_pos=(seq_group.get_prefix_len() // 16) * 16, computed_block_nums=self.block_manager. get_common_computed_block_ids(seq_group), ) From 49357be8f041bd5c5189e9f9e98e7d0679275d38 Mon Sep 17 00:00:00 2001 From: Sage Moore Date: Thu, 15 Feb 2024 13:44:15 -0500 Subject: [PATCH 51/79] block manager refactoring --- vllm/core/block_manager.py | 15 ++++++++++----- vllm/core/scheduler.py | 2 +- 2 files changed, 11 insertions(+), 6 deletions(-) diff --git a/vllm/core/block_manager.py b/vllm/core/block_manager.py index ddc9e7f5d7ee0..9603f6e6f23e5 100644 --- a/vllm/core/block_manager.py +++ b/vllm/core/block_manager.py @@ -469,13 +469,17 @@ def access_all_blocks_in_seq( for block in block_table: block.last_accessed = access_time - def compute_all_blocks_in_seq(self, seq: Sequence): + def compute_all_blocks_in_seq(self, seq: Sequence, + max_computed_blocks: int): if seq.seq_id not in self.block_tables: return block_table = self.block_tables[seq.seq_id] + counter = 0 for block in block_table: + if counter >= max_computed_blocks: + return block.computed = True - block_table[-1].computed = False + counter += 1 def get_all_computed_block_ids_seq(self, seq: Sequence) -> List[int]: if seq.seq_id not in self.block_tables: @@ -493,9 +497,10 @@ def get_common_computed_block_ids(self, self.get_all_computed_block_ids_seq(seq) for seq in iter(seq_group.seqs_dict.values()) ] - cp = commonprefix([ids for ids in ids_list if ids != []]) - return cp + return commonprefix([ids for ids in ids_list if ids != []]) def mark_blocks_as_computed(self, seq_group: SequenceGroup): for seq in seq_group.seqs_dict.values(): - self.compute_all_blocks_in_seq(seq) + self.compute_all_blocks_in_seq( + seq, + seq_group.get_prefix_len() // seq.block_size) diff --git a/vllm/core/scheduler.py b/vllm/core/scheduler.py index f708befffae24..acf2a59d65bb3 100644 --- a/vllm/core/scheduler.py +++ b/vllm/core/scheduler.py @@ -381,7 +381,7 @@ def schedule(self) -> Tuple[List[SequenceGroupMetadata], SchedulerOutputs]: sampling_params=seq_group.sampling_params, block_tables=block_tables, lora_request=seq_group.lora_request, - 
prefix_pos=(seq_group.get_prefix_len() // 16) * 16, + prefix_pos=seq_group.prefix_pos, computed_block_nums=self.block_manager. get_common_computed_block_ids(seq_group), ) From ea4ec9d57004bee4762e8534962a1687e0606f80 Mon Sep 17 00:00:00 2001 From: Sage Moore Date: Thu, 15 Feb 2024 14:17:06 -0500 Subject: [PATCH 52/79] clamp prefix length down to a multiple of block size --- vllm/core/scheduler.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/vllm/core/scheduler.py b/vllm/core/scheduler.py index acf2a59d65bb3..ef7a8c8bb81fb 100644 --- a/vllm/core/scheduler.py +++ b/vllm/core/scheduler.py @@ -381,7 +381,7 @@ def schedule(self) -> Tuple[List[SequenceGroupMetadata], SchedulerOutputs]: sampling_params=seq_group.sampling_params, block_tables=block_tables, lora_request=seq_group.lora_request, - prefix_pos=seq_group.prefix_pos, + prefix_pos=seq_group.get_prefix_len() // 16, computed_block_nums=self.block_manager. get_common_computed_block_ids(seq_group), ) From f5fa2de9b4aa6a282f28ed168fd18942730ba865 Mon Sep 17 00:00:00 2001 From: Sage Moore Date: Thu, 15 Feb 2024 15:02:22 -0500 Subject: [PATCH 53/79] minor prefix length fix --- vllm/core/scheduler.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/vllm/core/scheduler.py b/vllm/core/scheduler.py index ef7a8c8bb81fb..f708befffae24 100644 --- a/vllm/core/scheduler.py +++ b/vllm/core/scheduler.py @@ -381,7 +381,7 @@ def schedule(self) -> Tuple[List[SequenceGroupMetadata], SchedulerOutputs]: sampling_params=seq_group.sampling_params, block_tables=block_tables, lora_request=seq_group.lora_request, - prefix_pos=seq_group.get_prefix_len() // 16, + prefix_pos=(seq_group.get_prefix_len() // 16) * 16, computed_block_nums=self.block_manager. get_common_computed_block_ids(seq_group), ) From 704aa47edce334593dba90b85e5bb07d1bddb947 Mon Sep 17 00:00:00 2001 From: Sage Moore Date: Fri, 16 Feb 2024 08:15:11 -0500 Subject: [PATCH 54/79] replace 16 with block size --- vllm/core/scheduler.py | 7 ++++++- vllm/sequence.py | 4 ++++ 2 files changed, 10 insertions(+), 1 deletion(-) diff --git a/vllm/core/scheduler.py b/vllm/core/scheduler.py index f708befffae24..1698d6d15f694 100644 --- a/vllm/core/scheduler.py +++ b/vllm/core/scheduler.py @@ -368,6 +368,11 @@ def schedule(self) -> Tuple[List[SequenceGroupMetadata], SchedulerOutputs]: for seq_group in scheduler_outputs.scheduled_seq_groups: seq_data: Dict[int, SequenceData] = {} block_tables: Dict[int, List[int]] = {} + + # Round the prefix position down to the last full block + rounded_prefix_pos = (seq_group.get_prefix_len() // + seq_group.block_size) * seq_group.block_size + for seq in seq_group.get_seqs(status=SequenceStatus.RUNNING): seq_id = seq.seq_id seq_data[seq_id] = seq.data @@ -381,7 +386,7 @@ def schedule(self) -> Tuple[List[SequenceGroupMetadata], SchedulerOutputs]: sampling_params=seq_group.sampling_params, block_tables=block_tables, lora_request=seq_group.lora_request, - prefix_pos=(seq_group.get_prefix_len() // 16) * 16, + prefix_pos=rounded_prefix_pos, computed_block_nums=self.block_manager. get_common_computed_block_ids(seq_group), ) diff --git a/vllm/sequence.py b/vllm/sequence.py index 2ec3d974fa087..ad5c399f3a642 100644 --- a/vllm/sequence.py +++ b/vllm/sequence.py @@ -281,6 +281,10 @@ def prompt_token_ids(self) -> List[int]: # We use the prompt of an arbitrary sequence. 
return next(iter(self.seqs_dict.values())).data.prompt_token_ids + @property + def block_size(self) -> int: + return next(iter(self.seqs_dict.values())).block_size + @property def lora_int_id(self) -> int: return self.lora_request.lora_int_id if self.lora_request else 0 From 8771b3f82c65d6b3f967fcd32a462a5108227519 Mon Sep 17 00:00:00 2001 From: ElizaWszola Date: Wed, 21 Feb 2024 08:44:15 -0500 Subject: [PATCH 55/79] First round of feedback changes --- vllm/core/block_manager.py | 23 ++++++++++++----------- 1 file changed, 12 insertions(+), 11 deletions(-) diff --git a/vllm/core/block_manager.py b/vllm/core/block_manager.py index 9603f6e6f23e5..7a5a6aad28777 100644 --- a/vllm/core/block_manager.py +++ b/vllm/core/block_manager.py @@ -1,6 +1,6 @@ """A block manager that manages token blocks.""" import enum -from itertools import takewhile +from itertools import takewhile, count from os.path import commonprefix from time import monotonic from typing import Dict, List, Optional, Set, Tuple @@ -39,17 +39,16 @@ def lru_eviction( if block.prefix_len > highest_prefix_count: highest_prefix_count = block.prefix_len - # Find all blocks with the lowest timestamp - eviction_candidates: List[PhysicalTokenBlock] = [] + evicted_block: Optional[PhysicalTokenBlock] = None + + # Find the first block with the lowest timestamp for block in least_recent: if block.prefix_len == highest_prefix_count: - eviction_candidates.append(block) + evicted_block = block + break - # Arbitrarily evict the first candidate - if len(eviction_candidates) == 0: - raise ValueError("No usable cache memory left") + assert evicted_block is not None - evicted_block = eviction_candidates[0] del free_table[evicted_block.block_hash] evicted_block.computed = False @@ -72,13 +71,14 @@ def __init__(self, self.device = device self.block_size = block_size self.num_blocks = num_blocks - self.eviction_policy = eviction_policy self.current_num_blocks = 0 self.table: Dict[int, PhysicalTokenBlock] = {} self.free_table: Dict[int, PhysicalTokenBlock] = {} + self.default_hash_ctr = count() + def evict(self) -> PhysicalTokenBlock: if self.eviction_policy == EvictionPolicy.LRU: return lru_eviction(self.free_table) @@ -105,7 +105,7 @@ def allocate(self, block_hash: Optional[int] = None, prefix_len: int = 0) -> PhysicalTokenBlock: if block_hash is None: - block_hash = monotonic() + block_hash = next(self.default_hash_ctr) if block_hash in self.free_table: assert block_hash not in self.table block = self.free_table[block_hash] @@ -263,7 +263,8 @@ def _is_last_block_full( self, seq: Sequence, ) -> bool: - return (len(seq.data.get_token_ids())) % seq.block_size == 0 + token_ids_len = len(seq.data.get_token_ids()) + return token_ids_len > 0 and token_ids_len % seq.block_size == 0 def _is_last_block( self, From 2dba195be91fb7d8fdb4ceb060853d592a68b8fa Mon Sep 17 00:00:00 2001 From: Sage Moore Date: Wed, 21 Feb 2024 15:23:12 -0500 Subject: [PATCH 56/79] added a flag to disable automatic prefix caching --- docs/source/models/engine_args.rst | 4 ++++ vllm/config.py | 2 ++ vllm/core/block_manager.py | 4 ---- vllm/engine/arg_utils.py | 8 +++++++- vllm/engine/llm_engine.py | 7 +++++-- 5 files changed, 18 insertions(+), 7 deletions(-) diff --git a/docs/source/models/engine_args.rst b/docs/source/models/engine_args.rst index d89b795149501..945e315d663fd 100644 --- a/docs/source/models/engine_args.rst +++ b/docs/source/models/engine_args.rst @@ -81,6 +81,10 @@ Below, you can find an explanation of every engine argument for vLLM: Token block size for contiguous chunks 
of tokens. +.. option:: --disable-prefix-caching + + Disables automatic prefix caching + .. option:: --seed Random seed for operations. diff --git a/vllm/config.py b/vllm/config.py index 0b8a2a27f6d43..95466f84780af 100644 --- a/vllm/config.py +++ b/vllm/config.py @@ -295,12 +295,14 @@ def __init__( swap_space: int, cache_dtype: str, sliding_window: Optional[int] = None, + disable_prefix_caching: bool = False, ) -> None: self.block_size = block_size self.gpu_memory_utilization = gpu_memory_utilization self.swap_space_bytes = swap_space * _GB self.cache_dtype = cache_dtype self.sliding_window = sliding_window + self.disable_prefix_caching = disable_prefix_caching self._verify_args() self._verify_cache_dtype() diff --git a/vllm/core/block_manager.py b/vllm/core/block_manager.py index 7a5a6aad28777..a3596ef02f99a 100644 --- a/vllm/core/block_manager.py +++ b/vllm/core/block_manager.py @@ -377,10 +377,6 @@ def swap_in(self, seq_group: SequenceGroup) -> Dict[int, int]: for seq in seq_group.get_seqs(status=SequenceStatus.SWAPPED): new_block_table: BlockTable = [] block_table = self.block_tables[seq.seq_id] - if seq_group.prefix is not None: - for block in seq_group.prefix.block_table: - new_block_table.append(block) - block.ref_count += 1 for cpu_block in block_table: if cpu_block in mapping: diff --git a/vllm/engine/arg_utils.py b/vllm/engine/arg_utils.py index 8ac0157151d8e..1e0729ba7fb23 100644 --- a/vllm/engine/arg_utils.py +++ b/vllm/engine/arg_utils.py @@ -25,6 +25,7 @@ class EngineArgs: tensor_parallel_size: int = 1 max_parallel_loading_workers: Optional[int] = None block_size: int = 16 + disable_prefix_caching: bool = False swap_space: int = 4 # GiB gpu_memory_utilization: float = 0.90 max_num_batched_tokens: Optional[int] = None @@ -173,6 +174,10 @@ def add_cli_args( default=EngineArgs.block_size, choices=[8, 16, 32], help='token block size') + parser.add_argument('--disable-prefix-caching', + action='store_true', + help='Disables automatic prefix caching') + # TODO(woosuk): Support fine-grained seeds (e.g., seed per request). parser.add_argument('--seed', type=int, @@ -296,7 +301,8 @@ def create_engine_configs( cache_config = CacheConfig(self.block_size, self.gpu_memory_utilization, self.swap_space, self.kv_cache_dtype, - model_config.get_sliding_window()) + model_config.get_sliding_window(), + self.disable_prefix_caching) parallel_config = ParallelConfig(self.pipeline_parallel_size, self.tensor_parallel_size, self.worker_use_ray, diff --git a/vllm/engine/llm_engine.py b/vllm/engine/llm_engine.py index 97aefa35a0426..0a7d5bc71e8f0 100644 --- a/vllm/engine/llm_engine.py +++ b/vllm/engine/llm_engine.py @@ -726,8 +726,11 @@ def _process_model_outputs( # Update the scheduled sequence groups with the model outputs. 
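Note on the flag plumbing above: the boolean travels unchanged from the argparse option through EngineArgs into CacheConfig, and the engine later consults it before marking blocks as computed. A condensed standalone sketch of that flow (toy classes, not the real vLLM ones):

import argparse
from dataclasses import dataclass

@dataclass
class CacheConfig:
    block_size: int
    disable_prefix_caching: bool = False

def make_cache_config(argv=None) -> CacheConfig:
    parser = argparse.ArgumentParser()
    parser.add_argument('--block-size', type=int, default=16)
    # store_true mirrors the new --disable-prefix-caching engine flag
    parser.add_argument('--disable-prefix-caching', action='store_true')
    args = parser.parse_args(argv)
    return CacheConfig(block_size=args.block_size,
                       disable_prefix_caching=args.disable_prefix_caching)

print(make_cache_config(['--disable-prefix-caching']))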
scheduled_seq_groups = scheduler_outputs.scheduled_seq_groups - for seq_group in scheduled_seq_groups: - self.scheduler.mark_blocks_as_computed(seq_group) + # If atomatic prefix caching is disabled, all previously computed blocks + # will be recomputed + if not self.cache_config.disable_prefix_caching: + for seq_group in scheduled_seq_groups: + self.scheduler.mark_blocks_as_computed(seq_group) for seq_group, outputs in zip(scheduled_seq_groups, output): self._process_sequence_group_outputs(seq_group, outputs) From ba01fa8f53c5a3059d07a160684e52520280a39d Mon Sep 17 00:00:00 2001 From: Michael Goin Date: Wed, 21 Feb 2024 16:07:00 -0500 Subject: [PATCH 57/79] Update vllm/engine/llm_engine.py --- vllm/engine/llm_engine.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/vllm/engine/llm_engine.py b/vllm/engine/llm_engine.py index 0a7d5bc71e8f0..4a3e288482502 100644 --- a/vllm/engine/llm_engine.py +++ b/vllm/engine/llm_engine.py @@ -726,7 +726,7 @@ def _process_model_outputs( # Update the scheduled sequence groups with the model outputs. scheduled_seq_groups = scheduler_outputs.scheduled_seq_groups - # If atomatic prefix caching is disabled, all previously computed blocks + # If automatic prefix caching is disabled, all previously computed blocks # will be recomputed if not self.cache_config.disable_prefix_caching: for seq_group in scheduled_seq_groups: From 2914b5ab98207aba6aa49c77ec54b81dd32cfe3c Mon Sep 17 00:00:00 2001 From: ElizaWszola Date: Fri, 23 Feb 2024 05:41:51 -0500 Subject: [PATCH 58/79] remove explicit prefix pos --- examples/offline_inference_with_prefix.py | 11 +---- tests/prefix_caching/test_prefix_caching.py | 6 +-- tests/test_cache_block_hashing.py | 2 +- vllm/block.py | 6 +-- vllm/core/block_manager.py | 49 +++++++++------------ vllm/core/scheduler.py | 8 +--- vllm/engine/async_llm_engine.py | 14 +----- vllm/engine/llm_engine.py | 8 +--- vllm/entrypoints/api_server.py | 6 +-- vllm/entrypoints/llm.py | 14 +----- vllm/sequence.py | 22 ++------- vllm/worker/model_runner.py | 20 ++++----- 12 files changed, 48 insertions(+), 118 deletions(-) diff --git a/examples/offline_inference_with_prefix.py b/examples/offline_inference_with_prefix.py index 8ccfb1ceea731..1aa718b88907c 100644 --- a/examples/offline_inference_with_prefix.py +++ b/examples/offline_inference_with_prefix.py @@ -37,20 +37,13 @@ print("-" * 80) -# -1 since the last token can change when concatenating prompts. -prefix_pos = len(llm.llm_engine.tokenizer.encode(prefix)) - 1 - # The llm.generate call will batch all prompts and send the batch at once if resources allow. # The prefix will only be cached after the first batch is processed, so we need to call generate once # to calculate the prefix and cache it. -outputs = llm.generate(generating_prompts[0], - sampling_params, - prefix_pos=[prefix_pos]) +outputs = llm.generate(generating_prompts[0], sampling_params) # Subsequent batches can leverage the cached prefix -outputs = llm.generate(generating_prompts, - sampling_params, - prefix_pos=[prefix_pos] * len(generating_prompts)) +outputs = llm.generate(generating_prompts, sampling_params) # Print the outputs. 
You should see the same outputs as before for output in outputs: diff --git a/tests/prefix_caching/test_prefix_caching.py b/tests/prefix_caching/test_prefix_caching.py index e40ea9927bf22..ffa6fc8f91f15 100644 --- a/tests/prefix_caching/test_prefix_caching.py +++ b/tests/prefix_caching/test_prefix_caching.py @@ -36,14 +36,10 @@ def test_prefix_caching( max_tokens: int, ): llm = LLM(model=model) - # -1 since the last token can change when concatenating prompts. - prefix_pos = len(llm.llm_engine.tokenizer.encode(prefix)) - 1 prompts = [prefix + prompt for prompt in example_prompts] sampling_params = SamplingParams(temperature=0.0, max_tokens=max_tokens) outputs_without_prefix = llm.generate(prompts, sampling_params) - outputs_with_prefix = llm.generate(prompts, - sampling_params, - prefix_pos=[prefix_pos] * len(prompts)) + outputs_with_prefix = llm.generate(prompts, sampling_params) for output_without_prefix, output_with_prefix in zip( outputs_without_prefix, outputs_with_prefix): assert (output_without_prefix.outputs[0].token_ids == diff --git a/tests/test_cache_block_hashing.py b/tests/test_cache_block_hashing.py index f4eb90378eb0b..7c4ade7f8c8ed 100644 --- a/tests/test_cache_block_hashing.py +++ b/tests/test_cache_block_hashing.py @@ -58,7 +58,7 @@ def test_auto_prefix_caching(model: str, block_size: int, max_num_seqs: int): num_blocks = len(prompt_token_ids) // block_size for idx in range(num_blocks): - hashes[-1][-1].append(seq.hash(idx)) + hashes[-1][-1].append(seq.hash_of_block(idx)) seq_id += 1 diff --git a/vllm/block.py b/vllm/block.py index e5f16e1bf611e..4fc54f918554b 100644 --- a/vllm/block.py +++ b/vllm/block.py @@ -57,13 +57,13 @@ def __init__( block_number: int, block_size: int, block_hash: int, - prefix_len: int, + num_hashed_tokens: int, ) -> None: self.device = device self.block_number = block_number self.block_size = block_size self.block_hash = block_hash - self.prefix_len = prefix_len + self.num_hashed_tokens = num_hashed_tokens self.ref_count = 0 self.last_accessed = monotonic() @@ -74,7 +74,7 @@ def __init__( def __repr__(self) -> str: return (f'PhysicalTokenBlock(device={self.device}, ' f'block_number={self.block_number}, ' - f'prefix_len={self.prefix_len}, ' + f'num_hashed_tokens={self.num_hashed_tokens}, ' f'ref_count={self.ref_count}, ' f'last_accessed={self.last_accessed}, ' f'computed={self.computed})') diff --git a/vllm/core/block_manager.py b/vllm/core/block_manager.py index a3596ef02f99a..a2bf2b75e5046 100644 --- a/vllm/core/block_manager.py +++ b/vllm/core/block_manager.py @@ -34,16 +34,16 @@ def lru_eviction( least_recent.append(block) # Find highest prefix count per block - highest_prefix_count = 0 + highest_num_hashed_tokens = 0 for block in least_recent: - if block.prefix_len > highest_prefix_count: - highest_prefix_count = block.prefix_len + if block.num_hashed_tokens > highest_num_hashed_tokens: + highest_num_hashed_tokens = block.num_hashed_tokens evicted_block: Optional[PhysicalTokenBlock] = None # Find the first block with the lowest timestamp for block in least_recent: - if block.prefix_len == highest_prefix_count: + if block.num_hashed_tokens == highest_num_hashed_tokens: evicted_block = block break @@ -87,23 +87,23 @@ def evict(self) -> PhysicalTokenBlock: f"Unknown cache eviction policy: {self.eviction_policy}") def allocate_block(self, block_hash: int, - prefix_len: int) -> PhysicalTokenBlock: + num_hashed_tokens: int) -> PhysicalTokenBlock: if self.current_num_blocks == self.num_blocks: block = self.evict() block.block_hash = block_hash - 
block.prefix_len = prefix_len + block.num_hashed_tokens = num_hashed_tokens return block block = PhysicalTokenBlock(device=self.device, block_number=self.current_num_blocks, block_size=self.block_size, block_hash=block_hash, - prefix_len=prefix_len) + num_hashed_tokens=num_hashed_tokens) self.current_num_blocks += 1 return block def allocate(self, block_hash: Optional[int] = None, - prefix_len: int = 0) -> PhysicalTokenBlock: + num_hashed_tokens: int = 0) -> PhysicalTokenBlock: if block_hash is None: block_hash = next(self.default_hash_ctr) if block_hash in self.free_table: @@ -117,7 +117,7 @@ def allocate(self, return block if block_hash not in self.table: self.table[block_hash] = self.allocate_block( - block_hash, prefix_len) + block_hash, num_hashed_tokens) block = self.table[block_hash] assert block.block_hash == block_hash block.ref_count += 1 @@ -227,9 +227,8 @@ def allocate(self, seq_group: SequenceGroup) -> None: block = block_table[logical_idx % self.block_sliding_window] else: block = self.gpu_allocator.allocate( - seq.hash(logical_idx), - seq.prefix_len_of_block(logical_idx, - seq_group.get_prefix_len())) + seq.hash_of_block(logical_idx), + seq.num_hashed_tokens_of_block(logical_idx)) block_table.append(block) # Assign the block table for each sequence. @@ -249,7 +248,7 @@ def _promote_last_block( last_block: PhysicalTokenBlock, ) -> PhysicalTokenBlock: # Compute a new hash for the block so that it can be shared by other Sequences - new_hash = seq.hash(len(seq.logical_token_blocks) - 1) + new_hash = seq.hash_of_block(len(seq.logical_token_blocks) - 1) # if new_hash is already in the cached table, then free last_block and return the cached version if self.gpu_allocator.contains_block(new_hash): @@ -286,15 +285,13 @@ def _maybe_promote_last_block( def _allocate_last_physical_block( self, seq: Sequence, - prefix_len: int, ) -> PhysicalTokenBlock: block_hash: Optional[int] = None if (self._is_last_block_full(seq)): - block_hash = seq.hash(len(seq.logical_token_blocks) - 1) - block_prefix_len = seq.prefix_len_of_block( - len(seq.logical_token_blocks) - 1, prefix_len) - new_block = self.gpu_allocator.allocate(block_hash, - prefix_len=block_prefix_len) + block_hash = seq.hash_of_block(len(seq.logical_token_blocks) - 1) + num_hashed_tokens = seq.num_hashed_tokens_of_block( + len(seq.logical_token_blocks) - 1) + new_block = self.gpu_allocator.allocate(block_hash, num_hashed_tokens) if block_hash is None: assert (new_block.ref_count == 1) return new_block @@ -302,7 +299,6 @@ def _allocate_last_physical_block( def append_slot( self, seq: Sequence, - prefix_len: int, ) -> Optional[Tuple[int, int]]: """Allocate a physical slot for a new token.""" logical_blocks = seq.logical_token_blocks @@ -320,7 +316,7 @@ def append_slot( else: # The sequence has a new logical block. # Allocate a new physical block. - new_block = self._allocate_last_physical_block(seq, prefix_len) + new_block = self._allocate_last_physical_block(seq) block_table.append(new_block) return None @@ -336,7 +332,7 @@ def append_slot( else: # The last block is shared with other sequences. # Copy on Write: Allocate a new block and copy the tokens. 
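Throughout this hunk a block's identity is its content hash, which covers every token from the start of the sequence up to and including that block, so two sequences sharing a prompt prefix produce identical hashes for the shared blocks. A self-contained sketch of that scheme (hypothetical free functions that mirror, but do not copy, the Sequence methods):

from typing import List

def num_hashed_tokens_of_block(logical_idx: int, block_size: int) -> int:
    # Tokens covered by logical blocks 0..logical_idx inclusive.
    return (logical_idx + 1) * block_size

def hash_of_block(token_ids: List[int], logical_idx: int, block_size: int) -> int:
    # Hash the full prefix ending at this block, not just the block's own
    # tokens, so equal prefixes collide and can share physical blocks.
    num_tokens = num_hashed_tokens_of_block(logical_idx, block_size)
    return hash(tuple(token_ids[:num_tokens]))

tokens = list(range(40))
assert hash_of_block(tokens, 0, 16) == hash_of_block(tokens[:16] + [99] * 24, 0, 16)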
- new_block = self._allocate_last_physical_block(seq, prefix_len) + new_block = self._allocate_last_physical_block(seq) block_table[-1] = new_block self.gpu_allocator.free(last_block) @@ -384,7 +380,7 @@ def swap_in(self, seq_group: SequenceGroup) -> Dict[int, int]: gpu_block.ref_count += 1 else: gpu_block = self.gpu_allocator.allocate( - cpu_block.block_hash, cpu_block.prefix_len) + cpu_block.block_hash, cpu_block.num_hashed_tokens) mapping[cpu_block] = gpu_block new_block_table.append(gpu_block) # Free the CPU block swapped in to GPU. @@ -414,7 +410,7 @@ def swap_out(self, seq_group: SequenceGroup) -> Dict[int, int]: cpu_block.ref_count += 1 else: cpu_block = self.cpu_allocator.allocate( - gpu_block.block_hash, gpu_block.prefix_len) + gpu_block.block_hash, gpu_block.num_hashed_tokens) mapping[gpu_block] = cpu_block new_block_table.append(cpu_block) # Free the GPU block swapped out to CPU. @@ -498,6 +494,5 @@ def get_common_computed_block_ids(self, def mark_blocks_as_computed(self, seq_group: SequenceGroup): for seq in seq_group.seqs_dict.values(): - self.compute_all_blocks_in_seq( - seq, - seq_group.get_prefix_len() // seq.block_size) + self.compute_all_blocks_in_seq(seq, + seq.get_len() // seq.block_size) diff --git a/vllm/core/scheduler.py b/vllm/core/scheduler.py index 5f1644fcda945..38e470e20acae 100644 --- a/vllm/core/scheduler.py +++ b/vllm/core/scheduler.py @@ -372,10 +372,6 @@ def schedule(self) -> Tuple[List[SequenceGroupMetadata], SchedulerOutputs]: seq_data: Dict[int, SequenceData] = {} block_tables: Dict[int, List[int]] = {} - # Round the prefix position down to the last full block - rounded_prefix_pos = (seq_group.get_prefix_len() // - seq_group.block_size) * seq_group.block_size - for seq in seq_group.get_seqs(status=SequenceStatus.RUNNING): seq_id = seq.seq_id seq_data[seq_id] = seq.data @@ -389,7 +385,6 @@ def schedule(self) -> Tuple[List[SequenceGroupMetadata], SchedulerOutputs]: sampling_params=seq_group.sampling_params, block_tables=block_tables, lora_request=seq_group.lora_request, - prefix_pos=rounded_prefix_pos, computed_block_nums=self.block_manager. 
get_common_computed_block_ids(seq_group), state=seq_group.state, @@ -418,8 +413,7 @@ def _append_slot( blocks_to_copy: Dict[int, List[int]], ) -> None: for seq in seq_group.get_seqs(status=SequenceStatus.RUNNING): - ret = self.block_manager.append_slot(seq, - seq_group.get_prefix_len()) + ret = self.block_manager.append_slot(seq) if ret is not None: src_block, dst_block = ret if src_block in blocks_to_copy: diff --git a/vllm/engine/async_llm_engine.py b/vllm/engine/async_llm_engine.py index 7cba654602779..605aa1bb6bd8d 100644 --- a/vllm/engine/async_llm_engine.py +++ b/vllm/engine/async_llm_engine.py @@ -225,7 +225,6 @@ async def add_request_async( prompt_token_ids: Optional[List[int]] = None, arrival_time: Optional[float] = None, lora_request: Optional[LoRARequest] = None, - prefix_pos: Optional[int] = None, ) -> None: if lora_request is not None and not self.lora_config: raise ValueError(f"Got lora_request {lora_request} but LoRA is " @@ -245,7 +244,6 @@ async def add_request_async( sampling_params=sampling_params, arrival_time=arrival_time, lora_request=lora_request, - prefix_pos=prefix_pos, ) async def _run_workers_async( @@ -419,7 +417,6 @@ async def add_request( prompt_token_ids: Optional[List[int]] = None, arrival_time: Optional[float] = None, lora_request: Optional[LoRARequest] = None, - prefix_pos: Optional[int] = None, ) -> AsyncStream: if self.log_requests: shortened_prompt = prompt @@ -432,7 +429,6 @@ async def add_request( max_log_len] logger.info(f"Received request {request_id}: " f"prompt: {shortened_prompt!r}, " - f"prefix_pos: {prefix_pos}," f"sampling_params: {sampling_params}, " f"prompt_token_ids: {shortened_token_ids}, " f"lora_request: {lora_request}.") @@ -469,8 +465,7 @@ async def add_request( sampling_params=sampling_params, prompt_token_ids=prompt_token_ids, arrival_time=arrival_time, - lora_request=lora_request, - prefix_pos=prefix_pos) + lora_request=lora_request) return stream @@ -481,7 +476,6 @@ async def generate( request_id: str, prompt_token_ids: Optional[List[int]] = None, lora_request: Optional[LoRARequest] = None, - prefix_pos: Optional[int] = None, ) -> AsyncIterator[RequestOutput]: """Generate outputs for a request. @@ -497,11 +491,6 @@ async def generate( prompt_token_ids: The token IDs of the prompt. If None, we use the tokenizer to convert the prompts to token IDs. lora_request: LoRA request to use for generation, if any. - prefix_pos: If not None, we use the given position as the prefix - position for each prompt. We will cache the prefix's KV - cache and reuse it for the next request with the same prefix. - This is an experimental feature, and may be replaced with - automatic prefix caching in the future. Yields: The output `RequestOutput` objects from the LLMEngine for the @@ -562,7 +551,6 @@ async def generate( prompt_token_ids=prompt_token_ids, arrival_time=arrival_time, lora_request=lora_request, - prefix_pos=prefix_pos, ) async for request_output in stream: diff --git a/vllm/engine/llm_engine.py b/vllm/engine/llm_engine.py index b71846e1b466f..e126f0b12c06f 100644 --- a/vllm/engine/llm_engine.py +++ b/vllm/engine/llm_engine.py @@ -395,7 +395,6 @@ def add_request( prompt_token_ids: Optional[List[int]] = None, arrival_time: Optional[float] = None, lora_request: Optional[LoRARequest] = None, - prefix_pos: Optional[int] = None, ) -> None: """Add a request to the engine's request pool. @@ -412,11 +411,6 @@ def add_request( use the tokenizer to convert the prompts to token IDs. arrival_time: The arrival time of the request. 
If None, we use the current monotonic time. - prefix_pos: If not None, we use the given position as the prefix - position for each prompt. We will cache the prefix's KV - cache and reuse it for the next request with the same prefix. - This is an experimental feature, and may be replaced with - automatic prefix caching in the future. Details: - Set arrival_time to the current time if it is None. @@ -464,7 +458,7 @@ def add_request( # Create the sequence group. seq_group = SequenceGroup(request_id, [seq], sampling_params, - arrival_time, lora_request, prefix_pos) + arrival_time, lora_request) # Add the sequence group to the scheduler. self.scheduler.add_seq_group(seq_group) diff --git a/vllm/entrypoints/api_server.py b/vllm/entrypoints/api_server.py index e7af2c6db5e4c..1eb4ab8b06b64 100644 --- a/vllm/entrypoints/api_server.py +++ b/vllm/entrypoints/api_server.py @@ -39,15 +39,11 @@ async def generate(request: Request) -> Response: """ request_dict = await request.json() prompt = request_dict.pop("prompt") - prefix_pos = request_dict.pop("prefix_pos", None) stream = request_dict.pop("stream", False) sampling_params = SamplingParams(**request_dict) request_id = random_uuid() - results_generator = engine.generate(prompt, - sampling_params, - request_id, - prefix_pos=prefix_pos) + results_generator = engine.generate(prompt, sampling_params, request_id) # Streaming case async def stream_results() -> AsyncGenerator[bytes, None]: diff --git a/vllm/entrypoints/llm.py b/vllm/entrypoints/llm.py index fc82018d18eb6..62f1d172377f6 100644 --- a/vllm/entrypoints/llm.py +++ b/vllm/entrypoints/llm.py @@ -124,7 +124,6 @@ def generate( prompts: Optional[Union[str, List[str]]] = None, sampling_params: Optional[SamplingParams] = None, prompt_token_ids: Optional[List[List[int]]] = None, - prefix_pos: Optional[Union[int, List[int]]] = None, use_tqdm: bool = True, lora_request: Optional[LoRARequest] = None, ) -> List[RequestOutput]: @@ -140,11 +139,6 @@ def generate( None, we use the default sampling parameters. prompt_token_ids: A list of token IDs for the prompts. If None, we use the tokenizer to convert the prompts to token IDs. - prefix_pos: If not None, we use the given position as the prefix - position for each prompt. We will cache the prefix's KV - cache and reuse it for the next request with the same prefix. - This is an experimental feature, and may be replaced with - automatic prefix caching in the future. use_tqdm: Whether to use tqdm to display the progress bar. lora_request: LoRA request to use for generation, if any. @@ -171,14 +165,12 @@ def generate( prompt_token_ids) for i in range(num_requests): prompt = prompts[i] if prompts is not None else None - prefix_pos_i = prefix_pos[i] if prefix_pos is not None else None token_ids = None if prompt_token_ids is None else prompt_token_ids[ i] self._add_request(prompt, sampling_params, token_ids, - lora_request=lora_request, - prefix_pos=prefix_pos_i) + lora_request=lora_request) return self._run_engine(use_tqdm) def _add_request( @@ -187,15 +179,13 @@ def _add_request( sampling_params: SamplingParams, prompt_token_ids: Optional[List[int]], lora_request: Optional[LoRARequest] = None, - prefix_pos: Optional[int] = None, ) -> None: request_id = str(next(self.request_counter)) self.llm_engine.add_request(request_id, prompt, sampling_params, prompt_token_ids, - lora_request=lora_request, - prefix_pos=prefix_pos) + lora_request=lora_request) def _run_engine(self, use_tqdm: bool) -> List[RequestOutput]: # Initialize tqdm. 
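With the explicit prefix position removed everywhere above, the caller-visible contract reduces to "repeat the shared prefix and the cache handles the rest". A hedged usage sketch as of this patch (model and prompts are placeholders; later patches in the series make the cache opt-in via enable_prefix_caching):

from vllm import LLM, SamplingParams

prefix = "You are an expert school principal. "
questions = ["Draft one interview question.", "Draft another interview question."]
prompts = [prefix + q for q in questions]

llm = LLM(model="facebook/opt-125m")
params = SamplingParams(temperature=0.0, max_tokens=16)

# The first call computes and caches the prefix blocks; subsequent calls
# with the same prefix reuse them automatically, with no prefix_pos argument.
warmup = llm.generate(prompts[:1], params)
outputs = llm.generate(prompts, params)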
diff --git a/vllm/sequence.py b/vllm/sequence.py index 23f51b04a985f..1a7dc86718a8e 100644 --- a/vllm/sequence.py +++ b/vllm/sequence.py @@ -160,17 +160,13 @@ def __init__( def lora_int_id(self) -> int: return self.lora_request.lora_int_id if self.lora_request else 0 - def hash(self, logical_idx: int) -> int: + def hash_of_block(self, logical_idx: int) -> int: # Compute the number of tokens in the sequence num_tokens = logical_idx * self.block_size + self.block_size return hash(tuple(self.data.get_token_ids()[0:num_tokens])) - def prefix_len_of_block(self, logical_idx: int, full_prefix_len: int): - num_tokens = logical_idx * self.block_size + self.block_size - if num_tokens > full_prefix_len: - return full_prefix_len - else: - return num_tokens + def num_hashed_tokens_of_block(self, logical_idx: int): + return logical_idx * self.block_size + self.block_size def _append_logical_block(self) -> None: block = LogicalTokenBlock( @@ -276,7 +272,6 @@ class SequenceGroup: sampling_params: The sampling parameters used to generate the outputs. arrival_time: The arrival time of the request. lora_request: LoRA request. - prefix_pos: The end of prefix of the prompt of the sequence group. """ def __init__( @@ -286,7 +281,6 @@ def __init__( sampling_params: SamplingParams, arrival_time: float, lora_request: Optional[LoRARequest] = None, - prefix_pos: Optional[int] = None, ) -> None: self.request_id = request_id self.seqs_dict = {seq.seq_id: seq for seq in seqs} @@ -297,7 +291,6 @@ def __init__( first_token_time=None, time_in_queue=None) self.lora_request = lora_request - self.prefix_pos: Optional[int] = prefix_pos self.prompt_logprobs: Optional[PromptLogprobs] = None self.state = SequenceGroupState() @@ -405,9 +398,6 @@ def remove(self, seq_id: int) -> None: def is_finished(self) -> bool: return all(seq.is_finished() for seq in self.get_seqs()) - def get_prefix_len(self) -> int: - return self.prefix_pos if self.prefix_pos is not None else 0 - def __repr__(self) -> str: return (f"SequenceGroup(request_id={self.request_id}, " f"sampling_params={self.sampling_params}, " @@ -426,7 +416,6 @@ class SequenceGroupMetadata: numbers) state: Internal state tied to this sequence group. lora_request: LoRA request. - prefix_pos: The end of prefix of the prompt of the sequence group. """ def __init__( @@ -437,7 +426,6 @@ def __init__( sampling_params: SamplingParams, block_tables: Dict[int, List[int]], lora_request: Optional[LoRARequest] = None, - prefix_pos: Optional[int] = None, computed_block_nums: Optional[List[int]] = None, state: Optional[SequenceGroupState] = None, ) -> None: @@ -447,13 +435,9 @@ def __init__( self.sampling_params = sampling_params self.block_tables = block_tables self.lora_request = lora_request - self.prefix_pos = prefix_pos self.computed_block_nums = computed_block_nums self.state = SequenceGroupState() if state is None else state - def get_prefix_len(self) -> int: - return self.prefix_pos if self.prefix_pos is not None else 0 - @property def lora_int_id(self) -> int: return self.lora_request.lora_int_id if self.lora_request else 0 diff --git a/vllm/worker/model_runner.py b/vllm/worker/model_runner.py index 9ac54cdf36fa1..54e5350e68cf9 100644 --- a/vllm/worker/model_runner.py +++ b/vllm/worker/model_runner.py @@ -138,36 +138,36 @@ def _prepare_prompt( prompt_tokens = seq_data.get_token_ids() prompt_len = len(prompt_tokens) prompt_lens.append(prompt_len) - prefix_len = 0 + computed_len = 0 # NOTE: This only works for oooooooxxx style attention. 
computed_block_nums = seq_group_metadata.computed_block_nums if computed_block_nums is not None and len( computed_block_nums) > 0: - prefix_len = seq_group_metadata.get_prefix_len() - prompt_tokens = prompt_tokens[prefix_len:] + computed_len = len(computed_block_nums) * self.block_size + prompt_tokens = prompt_tokens[computed_len:] prefix_block_tables.append(computed_block_nums) else: prefix_block_tables.append([]) # actual prompt lens - context_lens.append(prefix_len) - subquery_lens.append(prompt_len - prefix_len) + context_lens.append(computed_len) + subquery_lens.append(prompt_len - computed_len) input_tokens.append(prompt_tokens) # NOTE(woosuk): Here we assume that the first token in the prompt # is always the first token in the sequence. input_positions.append( - list(range(prefix_len, prefix_len + len(prompt_tokens)))) + list(range(computed_len, computed_len + len(prompt_tokens)))) lora_id = seq_group_metadata.lora_int_id if lora_id > 0: lora_requests.add(seq_group_metadata.lora_request) - lora_index_mapping.append([lora_id] * (prompt_len - prefix_len)) + lora_index_mapping.append([lora_id] * (prompt_len - computed_len)) lora_prompt_mapping.extend( [lora_id] * - (prompt_len - prefix_len + (prompt_len - computed_len if seq_group_metadata.sampling_params.prompt_logprobs else 1)) if seq_group_metadata.block_tables is None: @@ -186,11 +186,11 @@ def _prepare_prompt( # mapping will be [-1, -1, 2, 3, 4, 5, 6, 7, 0, 1]. start_idx = 0 if self.sliding_window is not None: - assert prefix_len == 0, ( + assert computed_len == 0, ( "Prefix caching is currently not supported with " "sliding window attention") start_idx = max(0, prompt_len - self.sliding_window) - for i in range(prefix_len, prompt_len): + for i in range(computed_len, prompt_len): if i < start_idx: slot_mapping[-1].append(_PAD_SLOT_ID) continue From bd235fdac55edd39a888ecb84157f2d38846c099 Mon Sep 17 00:00:00 2001 From: ElizaWszola Date: Fri, 23 Feb 2024 06:26:48 -0500 Subject: [PATCH 59/79] remove assert for sliding window, check what will happen --- vllm/worker/model_runner.py | 3 --- 1 file changed, 3 deletions(-) diff --git a/vllm/worker/model_runner.py b/vllm/worker/model_runner.py index 54e5350e68cf9..80d72d59c7e46 100644 --- a/vllm/worker/model_runner.py +++ b/vllm/worker/model_runner.py @@ -186,9 +186,6 @@ def _prepare_prompt( # mapping will be [-1, -1, 2, 3, 4, 5, 6, 7, 0, 1]. 
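The arithmetic above is the prompt-side saving in a nutshell: the number of already-computed blocks times the block size gives how many leading prompt tokens can be skipped. A plain-Python illustration (not the real ModelRunner, which additionally builds slot mappings and LoRA indices):

from typing import List, Optional, Tuple

def split_prompt(prompt_tokens: List[int],
                 computed_block_nums: Optional[List[int]],
                 block_size: int) -> Tuple[int, List[int], List[int]]:
    # Leading tokens whose KV cache already exists are dropped from the
    # model input; only the uncached tail of the prompt is run this step.
    computed_len = 0
    if computed_block_nums:
        computed_len = len(computed_block_nums) * block_size
    remaining = prompt_tokens[computed_len:]
    positions = list(range(computed_len, computed_len + len(remaining)))
    return computed_len, remaining, positions

# A 40-token prompt whose first two 16-token blocks are already cached:
computed_len, remaining, positions = split_prompt(list(range(40)), [0, 1], 16)
print(computed_len, len(remaining), positions[0])  # 32 8 32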
start_idx = 0 if self.sliding_window is not None: - assert computed_len == 0, ( - "Prefix caching is currently not supported with " - "sliding window attention") start_idx = max(0, prompt_len - self.sliding_window) for i in range(computed_len, prompt_len): if i < start_idx: From ba382d93ca65461b6dc4d9ece908bc70055ff695 Mon Sep 17 00:00:00 2001 From: ElizaWszola Date: Fri, 23 Feb 2024 06:35:15 -0500 Subject: [PATCH 60/79] Try the other way around --- vllm/worker/model_runner.py | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/vllm/worker/model_runner.py b/vllm/worker/model_runner.py index 80d72d59c7e46..979a2503595bb 100644 --- a/vllm/worker/model_runner.py +++ b/vllm/worker/model_runner.py @@ -144,7 +144,9 @@ def _prepare_prompt( computed_block_nums = seq_group_metadata.computed_block_nums if computed_block_nums is not None and len( computed_block_nums) > 0: - computed_len = len(computed_block_nums) * self.block_size + # Prefix is not supported with sliding_window + if self.sliding_window is None: + computed_len = len(computed_block_nums) * self.block_size prompt_tokens = prompt_tokens[computed_len:] prefix_block_tables.append(computed_block_nums) else: @@ -186,6 +188,9 @@ def _prepare_prompt( # mapping will be [-1, -1, 2, 3, 4, 5, 6, 7, 0, 1]. start_idx = 0 if self.sliding_window is not None: + assert computed_len == 0, ( + "Prefix caching is currently not supported with " + "sliding window attention") start_idx = max(0, prompt_len - self.sliding_window) for i in range(computed_len, prompt_len): if i < start_idx: From 660007f4178c024adb599aa3f562f7e97adf08d4 Mon Sep 17 00:00:00 2001 From: ElizaWszola Date: Fri, 23 Feb 2024 08:36:34 -0500 Subject: [PATCH 61/79] Delete redundant prefix caching test --- tests/prefix_caching/test_prefix_caching.py | 38 --------------------- 1 file changed, 38 deletions(-) diff --git a/tests/prefix_caching/test_prefix_caching.py b/tests/prefix_caching/test_prefix_caching.py index ffa6fc8f91f15..de4a19df89097 100644 --- a/tests/prefix_caching/test_prefix_caching.py +++ b/tests/prefix_caching/test_prefix_caching.py @@ -4,47 +4,9 @@ """ import pytest -from vllm import LLM, SamplingParams from vllm.core.block_manager import BlockAllocator from vllm.utils import Device -prefix = ( - "You are an expert school principal, skilled in effectively managing " - "faculty and staff. Draft 10-15 questions for a potential first grade " - "Head Teacher for my K-12, all-girls', independent school that emphasizes " - "community, joyful discovery, and life-long learning. The candidate is " - "coming in for a first-round panel interview for a 8th grade Math " - "teaching role. They have 5 years of previous teaching experience " - "as an assistant teacher at a co-ed, public school with experience " - "in middle school math teaching. 
Based on these information, fulfill " - "the following paragraph: ") - - -def allocate_all_blocks(block_allocator, num_blocks): - blocks = [] - for i in range(num_blocks): - # use i as the block_hash - blocks.append(block_allocator.allocate(i, 0)) - return blocks - - -@pytest.mark.parametrize("model", ["facebook/opt-125m"]) -@pytest.mark.parametrize("max_tokens", [16]) -def test_prefix_caching( - example_prompts, - model: str, - max_tokens: int, -): - llm = LLM(model=model) - prompts = [prefix + prompt for prompt in example_prompts] - sampling_params = SamplingParams(temperature=0.0, max_tokens=max_tokens) - outputs_without_prefix = llm.generate(prompts, sampling_params) - outputs_with_prefix = llm.generate(prompts, sampling_params) - for output_without_prefix, output_with_prefix in zip( - outputs_without_prefix, outputs_with_prefix): - assert (output_without_prefix.outputs[0].token_ids == - output_with_prefix.outputs[0].token_ids) - @pytest.mark.parametrize("block_size", [16]) @pytest.mark.parametrize("num_blocks", [16]) From f74f67df87f8a69f4cd7f64c3cb326479b60b409 Mon Sep 17 00:00:00 2001 From: ElizaWszola Date: Fri, 23 Feb 2024 11:55:48 -0500 Subject: [PATCH 62/79] Don't add last block to --- vllm/core/block_manager.py | 11 ++++++----- 1 file changed, 6 insertions(+), 5 deletions(-) diff --git a/vllm/core/block_manager.py b/vllm/core/block_manager.py index a2bf2b75e5046..91ada347e5722 100644 --- a/vllm/core/block_manager.py +++ b/vllm/core/block_manager.py @@ -462,12 +462,12 @@ def access_all_blocks_in_seq( for block in block_table: block.last_accessed = access_time - def compute_all_blocks_in_seq(self, seq: Sequence, - max_computed_blocks: int): + def compute_all_blocks_in_seq(self, seq: Sequence): if seq.seq_id not in self.block_tables: return block_table = self.block_tables[seq.seq_id] counter = 0 + max_computed_blocks = seq.get_len() // seq.block_size for block in block_table: if counter >= max_computed_blocks: return @@ -478,10 +478,12 @@ def get_all_computed_block_ids_seq(self, seq: Sequence) -> List[int]: if seq.seq_id not in self.block_tables: return [] block_table = self.block_tables[seq.seq_id] + last_block = block_table[-1] # We want to get the first n contiguous completed blocks + # We exclude the last block because it's most likely not cached yet return [ block.block_number - for block in takewhile(lambda block: block.computed, block_table) + for block in takewhile(lambda block: block.computed and block != last_block, block_table) ] def get_common_computed_block_ids(self, @@ -494,5 +496,4 @@ def get_common_computed_block_ids(self, def mark_blocks_as_computed(self, seq_group: SequenceGroup): for seq in seq_group.seqs_dict.values(): - self.compute_all_blocks_in_seq(seq, - seq.get_len() // seq.block_size) + self.compute_all_blocks_in_seq(seq) From 093cb1cbb5a4d2ef07133d7a707d9b9d5c5fe05b Mon Sep 17 00:00:00 2001 From: ElizaWszola Date: Fri, 23 Feb 2024 11:58:50 -0500 Subject: [PATCH 63/79] Format --- vllm/core/block_manager.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/vllm/core/block_manager.py b/vllm/core/block_manager.py index 91ada347e5722..35b05e7bd5960 100644 --- a/vllm/core/block_manager.py +++ b/vllm/core/block_manager.py @@ -482,8 +482,9 @@ def get_all_computed_block_ids_seq(self, seq: Sequence) -> List[int]: # We want to get the first n contiguous completed blocks # We exclude the last block because it's most likely not cached yet return [ - block.block_number - for block in takewhile(lambda block: block.computed and block != last_block, 
block_table) + block.block_number for block in takewhile( + lambda block: block.computed and block != last_block, + block_table) ] def get_common_computed_block_ids(self, From d459d15bdff469b9558ad7b31f3c85097d72e28a Mon Sep 17 00:00:00 2001 From: Sage Moore Date: Fri, 23 Feb 2024 12:26:48 -0500 Subject: [PATCH 64/79] refactored the eviction logic into a separate class --- vllm/core/block_manager.py | 112 ++++++++++++++----------------------- vllm/core/scheduler.py | 3 +- vllm/engine/llm_engine.py | 7 +-- 3 files changed, 45 insertions(+), 77 deletions(-) diff --git a/vllm/core/block_manager.py b/vllm/core/block_manager.py index 35b05e7bd5960..ae744599dabf4 100644 --- a/vllm/core/block_manager.py +++ b/vllm/core/block_manager.py @@ -2,57 +2,12 @@ import enum from itertools import takewhile, count from os.path import commonprefix -from time import monotonic from typing import Dict, List, Optional, Set, Tuple from vllm.block import BlockTable, PhysicalTokenBlock from vllm.sequence import Sequence, SequenceGroup, SequenceStatus from vllm.utils import Device - - -class EvictionPolicy(enum.Enum): - """Enum for eviction policy used by BlockAllocator.""" - LRU = enum.auto() - - -def lru_eviction( - free_table: Dict[int, PhysicalTokenBlock]) -> PhysicalTokenBlock: - free_blocks: List[PhysicalTokenBlock] = list(free_table.values()) - if len(free_blocks) == 0: - raise ValueError("No usable cache memory left") - - # Find lowest timestamp - lowest_timestamp = monotonic() - for block in free_blocks: - if block.last_accessed < lowest_timestamp: - lowest_timestamp = block.last_accessed - - # Find all blocks with the lowest timestamp - least_recent: List[PhysicalTokenBlock] = [] - for block in free_blocks: - if block.last_accessed == lowest_timestamp: - least_recent.append(block) - - # Find highest prefix count per block - highest_num_hashed_tokens = 0 - for block in least_recent: - if block.num_hashed_tokens > highest_num_hashed_tokens: - highest_num_hashed_tokens = block.num_hashed_tokens - - evicted_block: Optional[PhysicalTokenBlock] = None - - # Find the first block with the lowest timestamp - for block in least_recent: - if block.num_hashed_tokens == highest_num_hashed_tokens: - evicted_block = block - break - - assert evicted_block is not None - - del free_table[evicted_block.block_hash] - - evicted_block.computed = False - return evicted_block +from vllm.core.evictor import Evictor, EvictionPolicy, make_evictor class BlockAllocator: @@ -67,29 +22,27 @@ def __init__(self, device: Device, block_size: int, num_blocks: int, - eviction_policy: EvictionPolicy = EvictionPolicy.LRU) -> None: + eviction_policy: EvictionPolicy = EvictionPolicy.LRU, + disable_caching: bool = False) -> None: self.device = device self.block_size = block_size self.num_blocks = num_blocks - self.eviction_policy = eviction_policy + self.disable_caching = disable_caching self.current_num_blocks = 0 self.table: Dict[int, PhysicalTokenBlock] = {} - self.free_table: Dict[int, PhysicalTokenBlock] = {} - self.default_hash_ctr = count() + # Switch over to FIFO eviction when caching is disabled + if self.disable_caching: + eviction_policy = EvictionPolicy.FIFO + self.evictor: Evictor = make_evictor(eviction_policy) - def evict(self) -> PhysicalTokenBlock: - if self.eviction_policy == EvictionPolicy.LRU: - return lru_eviction(self.free_table) - else: - raise ValueError( - f"Unknown cache eviction policy: {self.eviction_policy}") + self.default_hash_ctr = count() def allocate_block(self, block_hash: int, num_hashed_tokens: int) -> 
PhysicalTokenBlock: if self.current_num_blocks == self.num_blocks: - block = self.evict() + block = self.evictor.evict() block.block_hash = block_hash block.num_hashed_tokens = num_hashed_tokens return block @@ -104,15 +57,21 @@ def allocate_block(self, block_hash: int, def allocate(self, block_hash: Optional[int] = None, num_hashed_tokens: int = 0) -> PhysicalTokenBlock: + # If caching is disabled, just allocate a new block and return it + if self.disable_caching: + block = self.allocate_block(next(self.default_hash_ctr), + num_hashed_tokens) + block.ref_count += 1 + return block + if block_hash is None: block_hash = next(self.default_hash_ctr) - if block_hash in self.free_table: + if block_hash in self.evictor: assert block_hash not in self.table - block = self.free_table[block_hash] + block = self.evictor.remove(block_hash) assert block.ref_count == 0 self.table[block_hash] = block block.ref_count += 1 - del self.free_table[block_hash] assert block.block_hash == block_hash return block if block_hash not in self.table: @@ -128,22 +87,28 @@ def free(self, block: PhysicalTokenBlock) -> None: raise ValueError(f"Double free! {block} is already freed.") block.ref_count -= 1 if block.ref_count == 0: - assert block.block_hash not in self.free_table - self.free_table[block.block_hash] = block - del self.table[block.block_hash] + assert block.block_hash not in self.evictor + self.evictor.append(block) + + # If caching is enabled, remove the block from the table + if not self.disable_caching: + del self.table[block.block_hash] def get_num_free_blocks(self) -> int: - return self.num_blocks - self.current_num_blocks + len(self.free_table) + return self.num_blocks - self.current_num_blocks + self.evictor.num_blocks def contains_block(self, block_hash: int) -> bool: - return block_hash in self.table or block_hash in self.free_table + return block_hash in self.table or block_hash in self.evictor def update_hash(self, block_hash: int, block: PhysicalTokenBlock): assert (not self.contains_block(block_hash)) old_hash = block.block_hash - del self.table[old_hash] block.block_hash = block_hash - self.table[block_hash] = block + + # If caching is enabled, update the table + if not self.disable_caching: + del self.table[old_hash] + self.table[block_hash] = block class AllocStatus(enum.Enum): @@ -170,6 +135,7 @@ def __init__( num_cpu_blocks: int, watermark: float = 0.01, sliding_window: Optional[int] = None, + disable_caching: bool = False, ) -> None: self.block_size = block_size self.num_total_gpu_blocks = num_gpu_blocks @@ -185,10 +151,14 @@ def __init__( assert watermark >= 0.0 self.watermark_blocks = int(watermark * num_gpu_blocks) - self.gpu_allocator = BlockAllocator(Device.GPU, block_size, - num_gpu_blocks) - self.cpu_allocator = BlockAllocator(Device.CPU, block_size, - num_cpu_blocks) + self.gpu_allocator = BlockAllocator(Device.GPU, + block_size, + num_gpu_blocks, + disable_caching=disable_caching) + self.cpu_allocator = BlockAllocator(Device.CPU, + block_size, + num_cpu_blocks, + disable_caching=disable_caching) # Mapping: seq_id -> BlockTable. 
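The refactor above keeps the allocator's own bookkeeping, a hash-keyed table of live blocks plus reference counts, and delegates only the eviction decision to the new Evictor. A condensed, self-contained model of that allocate/free cycle (greatly simplified: no devices, no hash promotion, arbitrary eviction):

from dataclasses import dataclass
from typing import Dict

@dataclass
class Block:
    block_hash: int
    ref_count: int = 0

class TinyCachedAllocator:
    def __init__(self, num_blocks: int) -> None:
        self.num_blocks = num_blocks
        self.current_num_blocks = 0
        self.cached: Dict[int, Block] = {}  # live blocks, ref_count >= 1
        self.free: Dict[int, Block] = {}    # freed but still reusable

    def allocate(self, block_hash: int) -> Block:
        if block_hash in self.free:                 # revive a freed block
            block = self.free.pop(block_hash)
        elif block_hash in self.cached:             # share an existing block
            block = self.cached[block_hash]
        elif self.current_num_blocks < self.num_blocks:
            block = Block(block_hash)               # mint a brand new block
            self.current_num_blocks += 1
        else:                                       # evict some free block
            if not self.free:
                raise ValueError("No usable cache memory left")
            _, block = self.free.popitem()
            block.block_hash = block_hash
        block.ref_count += 1
        self.cached[block_hash] = block
        return block

    def free_block(self, block: Block) -> None:
        if block.ref_count == 0:
            raise ValueError(f"Double free! {block} is already freed.")
        block.ref_count -= 1
        if block.ref_count == 0:
            del self.cached[block.block_hash]
            self.free[block.block_hash] = block

a = TinyCachedAllocator(num_blocks=2)
b1 = a.allocate(111)
b2 = a.allocate(111)   # same hash: same physical block, ref_count becomes 2
assert b1 is b2 and b1.ref_count == 2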
self.block_tables: Dict[int, BlockTable] = {} diff --git a/vllm/core/scheduler.py b/vllm/core/scheduler.py index 38e470e20acae..fd8086a7adda5 100644 --- a/vllm/core/scheduler.py +++ b/vllm/core/scheduler.py @@ -94,7 +94,8 @@ def __init__( block_size=self.cache_config.block_size, num_gpu_blocks=self.cache_config.num_gpu_blocks, num_cpu_blocks=self.cache_config.num_cpu_blocks, - sliding_window=self.cache_config.sliding_window) + sliding_window=self.cache_config.sliding_window, + disable_caching=self.cache_config.disable_prefix_caching) # Sequence groups in the WAITING state. self.waiting: Deque[SequenceGroup] = deque() diff --git a/vllm/engine/llm_engine.py b/vllm/engine/llm_engine.py index e126f0b12c06f..145ef27ea8320 100644 --- a/vllm/engine/llm_engine.py +++ b/vllm/engine/llm_engine.py @@ -721,11 +721,8 @@ def _process_model_outputs( # Update the scheduled sequence groups with the model outputs. scheduled_seq_groups = scheduler_outputs.scheduled_seq_groups - # If automatic prefix caching is disabled, all previously computed blocks - # will be recomputed - if not self.cache_config.disable_prefix_caching: - for seq_group in scheduled_seq_groups: - self.scheduler.mark_blocks_as_computed(seq_group) + for seq_group in scheduled_seq_groups: + self.scheduler.mark_blocks_as_computed(seq_group) for seq_group, outputs in zip(scheduled_seq_groups, output): self._process_sequence_group_outputs(seq_group, outputs) From fea6789a8d4fb487ae6865df4a2d83a6fa4ab6b8 Mon Sep 17 00:00:00 2001 From: Sage Moore Date: Fri, 23 Feb 2024 12:55:12 -0500 Subject: [PATCH 65/79] minor fixes --- vllm/block.py | 4 +- vllm/core/evictor.py | 138 +++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 140 insertions(+), 2 deletions(-) create mode 100644 vllm/core/evictor.py diff --git a/vllm/block.py b/vllm/block.py index 4fc54f918554b..b8e4aa828496b 100644 --- a/vllm/block.py +++ b/vllm/block.py @@ -2,10 +2,10 @@ from typing import List from vllm.utils import Device -from time import monotonic _BLANK_TOKEN_ID = -1 +DEFAULT_LAST_ACCESSED_TIME = -1 class LogicalTokenBlock: """A block that stores a contiguous chunk of tokens from left to right. @@ -66,7 +66,7 @@ def __init__( self.num_hashed_tokens = num_hashed_tokens self.ref_count = 0 - self.last_accessed = monotonic() + self.last_accessed = DEFAULT_LAST_ACCESSED_TIME self.computed = False diff --git a/vllm/core/evictor.py b/vllm/core/evictor.py new file mode 100644 index 0000000000000..9c8e74a27d9da --- /dev/null +++ b/vllm/core/evictor.py @@ -0,0 +1,138 @@ +import enum +from typing import Dict, List, Optional +from abc import ABC, abstractmethod, abstractproperty + +from vllm.block import PhysicalTokenBlock, DEFAULT_LAST_ACCESSED_TIME + +class EvictionPolicy(enum.Enum): + """Enum for eviction policy used by make_evictor to instantiate the correct + Evictor subclass. 
+ """ + LRU = enum.auto() + FIFO = enum.auto() + + +class Evictor(ABC): + """ + """ + + @abstractmethod + def evict(self) -> PhysicalTokenBlock: + pass + + @abstractmethod + def __contains__(self, block_hash: int) -> bool: + pass + + @abstractmethod + def append(self, block: PhysicalTokenBlock): + pass + + @abstractmethod + def remove(self, block_hash: int) -> PhysicalTokenBlock: + pass + + @abstractproperty + def num_blocks(self) -> int: + pass + + +class LRUEvictor(Evictor): + def __init__(self): + self.free_table: Dict[int, PhysicalTokenBlock] = {} + + def __contains__(self, block_hash: int) -> bool: + return block_hash in self.free_table + + def evict(self) -> PhysicalTokenBlock: + free_blocks: List[PhysicalTokenBlock] = list(self.free_table.values()) + if len(free_blocks) == 0: + raise ValueError("No usable cache memory left") + + # Find lowest timestamp + lowest_timestamp = DEFAULT_LAST_ACCESSED_TIME + for block in free_blocks: + if block.last_accessed < lowest_timestamp: + lowest_timestamp = block.last_accessed + + # Find all blocks with the lowest timestamp + least_recent: List[PhysicalTokenBlock] = [] + for block in free_blocks: + if block.last_accessed == lowest_timestamp: + least_recent.append(block) + + # Find highest prefix count per block + highest_num_hashed_tokens = 0 + for block in least_recent: + if block.num_hashed_tokens > highest_num_hashed_tokens: + highest_num_hashed_tokens = block.num_hashed_tokens + + evicted_block: Optional[PhysicalTokenBlock] = None + + # Find the first block with the lowest timestamp + for block in least_recent: + if block.num_hashed_tokens == highest_num_hashed_tokens: + evicted_block = block + break + + assert evicted_block is not None + + del self.free_table[evicted_block.block_hash] + + evicted_block.computed = False + return evicted_block + + def append(self, block: PhysicalTokenBlock): + self.free_table[block.block_hash] = block + + def remove(self, block_hash: int) -> PhysicalTokenBlock: + if not block_hash in self.free_table: + raise AssertionError( + "Attempting to remove block that's not in the evictor") + block: PhysicalTokenBlock = self.free_table[block_hash] + del self.free_table[block_hash] + return block + + @property + def num_blocks(self) -> int: + return len(self.free_table) + + +class FIFOEvictor(Evictor): + """Evicts in a first-in-first-out order""" + + def __init__(self): + self.free_list: List[PhysicalTokenBlock] = [] + + def __contains__(self, block_hash: int) -> bool: + return any(block_hash == free_block.block_hash + for free_block in self.free_list) + + def evict(self) -> PhysicalTokenBlock: + if len(self.free_list) == 0: + raise ValueError("No usable cache memory left") + return self.free_list.popleft() + + def append(self, block: PhysicalTokenBlock): + self.free_list.append(block) + + def remove(self, block_hash: int) -> PhysicalTokenBlock: + for free_block in self.free_list: + if block_hash == free_block.block_hash: + self.free_list.remove(free_block) + return free_block + raise AssertionError( + "Attempting to remove block that's not in the evictor") + + @property + def num_blocks(self) -> int: + return len(self.free_list) + + +def make_evictor(eviction_policy: EvictionPolicy) -> Evictor: + if eviction_policy == EvictionPolicy.LRU: + return LRUEvictor() + elif eviction_policy == EvictionPolicy.FIFO: + return FIFOEvictor() + else: + raise ValueError(f"Unknown cache eviction policy: {eviction_policy}") \ No newline at end of file From 052c29452b7d1dbffb938ff3ee0667f8ab4aab51 Mon Sep 17 00:00:00 2001 From: Sage Moore 
Date: Fri, 23 Feb 2024 13:01:53 -0500 Subject: [PATCH 66/79] format evictor file --- vllm/block.py | 2 +- vllm/core/evictor.py | 12 +++++++----- vllm/core/scheduler.py | 1 - 3 files changed, 8 insertions(+), 7 deletions(-) diff --git a/vllm/block.py b/vllm/block.py index b8e4aa828496b..2cc6b947f2255 100644 --- a/vllm/block.py +++ b/vllm/block.py @@ -7,6 +7,7 @@ DEFAULT_LAST_ACCESSED_TIME = -1 + class LogicalTokenBlock: """A block that stores a contiguous chunk of tokens from left to right. @@ -70,7 +71,6 @@ def __init__( self.computed = False - # TODO: update this def __repr__(self) -> str: return (f'PhysicalTokenBlock(device={self.device}, ' f'block_number={self.block_number}, ' diff --git a/vllm/core/evictor.py b/vllm/core/evictor.py index 9c8e74a27d9da..d42e52065a3b2 100644 --- a/vllm/core/evictor.py +++ b/vllm/core/evictor.py @@ -4,6 +4,7 @@ from vllm.block import PhysicalTokenBlock, DEFAULT_LAST_ACCESSED_TIME + class EvictionPolicy(enum.Enum): """Enum for eviction policy used by make_evictor to instantiate the correct Evictor subclass. @@ -38,6 +39,7 @@ def num_blocks(self) -> int: class LRUEvictor(Evictor): + def __init__(self): self.free_table: Dict[int, PhysicalTokenBlock] = {} @@ -86,9 +88,9 @@ def append(self, block: PhysicalTokenBlock): self.free_table[block.block_hash] = block def remove(self, block_hash: int) -> PhysicalTokenBlock: - if not block_hash in self.free_table: + if block_hash not in self.free_table: raise AssertionError( - "Attempting to remove block that's not in the evictor") + "Attempting to remove block that's not in the evictor") block: PhysicalTokenBlock = self.free_table[block_hash] del self.free_table[block_hash] return block @@ -100,7 +102,7 @@ def num_blocks(self) -> int: class FIFOEvictor(Evictor): """Evicts in a first-in-first-out order""" - + def __init__(self): self.free_list: List[PhysicalTokenBlock] = [] @@ -123,7 +125,7 @@ def remove(self, block_hash: int) -> PhysicalTokenBlock: return free_block raise AssertionError( "Attempting to remove block that's not in the evictor") - + @property def num_blocks(self) -> int: return len(self.free_list) @@ -135,4 +137,4 @@ def make_evictor(eviction_policy: EvictionPolicy) -> Evictor: elif eviction_policy == EvictionPolicy.FIFO: return FIFOEvictor() else: - raise ValueError(f"Unknown cache eviction policy: {eviction_policy}") \ No newline at end of file + raise ValueError(f"Unknown cache eviction policy: {eviction_policy}") diff --git a/vllm/core/scheduler.py b/vllm/core/scheduler.py index fd8086a7adda5..741a6ed69d838 100644 --- a/vllm/core/scheduler.py +++ b/vllm/core/scheduler.py @@ -364,7 +364,6 @@ def schedule(self) -> Tuple[List[SequenceGroupMetadata], SchedulerOutputs]: scheduler_outputs = self._schedule() now = time.time() - now = time.monotonic() # Create input data structures. 
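For the LRU policy in the evictor introduced above, the victim is the free block with the oldest last_accessed stamp, and ties are broken toward the block covering the most hashed tokens. A quick standalone check of that rule (toy data, not the vLLM classes):

from dataclasses import dataclass
from typing import List

@dataclass
class FreeBlock:
    block_hash: int
    last_accessed: float
    num_hashed_tokens: int

def pick_lru_victim(free_blocks: List[FreeBlock]) -> FreeBlock:
    # Oldest access time first; among equally old blocks, evict the one
    # that hashes the longest prefix.
    oldest = min(b.last_accessed for b in free_blocks)
    candidates = [b for b in free_blocks if b.last_accessed == oldest]
    return max(candidates, key=lambda b: b.num_hashed_tokens)

blocks = [FreeBlock(1, 10.0, 16), FreeBlock(2, 10.0, 48), FreeBlock(3, 12.0, 64)]
assert pick_lru_victim(blocks).block_hash == 2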
seq_group_metadata_list: List[SequenceGroupMetadata] = [] for seq_group in scheduler_outputs.scheduled_seq_groups: From e26cd8e3c5c99bb1705278fa71377a070b4b8fc1 Mon Sep 17 00:00:00 2001 From: Sage Moore Date: Fri, 23 Feb 2024 13:24:01 -0500 Subject: [PATCH 67/79] added documentation to the evictor class --- vllm/core/evictor.py | 26 ++++++++++++++++++++++---- 1 file changed, 22 insertions(+), 4 deletions(-) diff --git a/vllm/core/evictor.py b/vllm/core/evictor.py index d42e52065a3b2..1fe1286d3b2d3 100644 --- a/vllm/core/evictor.py +++ b/vllm/core/evictor.py @@ -14,23 +14,35 @@ class EvictionPolicy(enum.Enum): class Evictor(ABC): - """ + """The Evictor subclasses should be used by the BlockAllocator class to + handle eviction of freed PhysicalTokenBlocks. """ @abstractmethod - def evict(self) -> PhysicalTokenBlock: + def __init__(self): pass @abstractmethod def __contains__(self, block_hash: int) -> bool: pass + @abstractmethod + def evict(self) -> PhysicalTokenBlock: + """Runs the eviction algorithm and returns the evicted block""" + pass + @abstractmethod def append(self, block: PhysicalTokenBlock): + """Adds block to the evictor, making it a candidate for eviction""" pass @abstractmethod def remove(self, block_hash: int) -> PhysicalTokenBlock: + """Simply removes the block with the hash value block_hash from the + evictor. Caller is responsible for making sure that block_hash is contained + in the evictor before calling remove. Should be used to "bring back" blocks + that have been freed but not evicted yet. + """ pass @abstractproperty @@ -39,6 +51,12 @@ def num_blocks(self) -> int: class LRUEvictor(Evictor): + """Evicts in a least-recently-used order using the last_accessed timestamp + that's recorded in the PhysicalTokenBlock. If there are multiple blocks with + the same last_accessed time, then the one with the largest num_hashed_tokens + will be evicted. If two blocks each have the lowest last_accessed time and + highest num_hashed_tokens value, then one will be chose arbitrarily + """ def __init__(self): self.free_table: Dict[int, PhysicalTokenBlock] = {} @@ -89,7 +107,7 @@ def append(self, block: PhysicalTokenBlock): def remove(self, block_hash: int) -> PhysicalTokenBlock: if block_hash not in self.free_table: - raise AssertionError( + raise ValueError( "Attempting to remove block that's not in the evictor") block: PhysicalTokenBlock = self.free_table[block_hash] del self.free_table[block_hash] @@ -123,7 +141,7 @@ def remove(self, block_hash: int) -> PhysicalTokenBlock: if block_hash == free_block.block_hash: self.free_list.remove(free_block) return free_block - raise AssertionError( + raise ValueError( "Attempting to remove block that's not in the evictor") @property From 2335360d4fa3adeacbfae7ec159b3e9cbb98cc3e Mon Sep 17 00:00:00 2001 From: Sage Moore Date: Fri, 23 Feb 2024 14:26:41 -0500 Subject: [PATCH 68/79] delete newline --- vllm/block.py | 1 - 1 file changed, 1 deletion(-) diff --git a/vllm/block.py b/vllm/block.py index 2cc6b947f2255..a43da8ac6777b 100644 --- a/vllm/block.py +++ b/vllm/block.py @@ -7,7 +7,6 @@ DEFAULT_LAST_ACCESSED_TIME = -1 - class LogicalTokenBlock: """A block that stores a contiguous chunk of tokens from left to right. 
From d66154c558a5ba50539a7e1b5d592b20aa305ff4 Mon Sep 17 00:00:00 2001 From: Sage Moore Date: Fri, 23 Feb 2024 14:30:57 -0500 Subject: [PATCH 69/79] format --- vllm/block.py | 1 + 1 file changed, 1 insertion(+) diff --git a/vllm/block.py b/vllm/block.py index a43da8ac6777b..2cc6b947f2255 100644 --- a/vllm/block.py +++ b/vllm/block.py @@ -7,6 +7,7 @@ DEFAULT_LAST_ACCESSED_TIME = -1 + class LogicalTokenBlock: """A block that stores a contiguous chunk of tokens from left to right. From 6a3843968f1daccd0888755bad3e954ebeef98e3 Mon Sep 17 00:00:00 2001 From: ElizaWszola Date: Wed, 28 Feb 2024 08:06:02 -0500 Subject: [PATCH 70/79] Fix timestamp in eviction policy --- vllm/core/evictor.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/vllm/core/evictor.py b/vllm/core/evictor.py index 1fe1286d3b2d3..8c6d520b4f53a 100644 --- a/vllm/core/evictor.py +++ b/vllm/core/evictor.py @@ -2,7 +2,7 @@ from typing import Dict, List, Optional from abc import ABC, abstractmethod, abstractproperty -from vllm.block import PhysicalTokenBlock, DEFAULT_LAST_ACCESSED_TIME +from vllm.block import PhysicalTokenBlock class EvictionPolicy(enum.Enum): @@ -70,7 +70,7 @@ def evict(self) -> PhysicalTokenBlock: raise ValueError("No usable cache memory left") # Find lowest timestamp - lowest_timestamp = DEFAULT_LAST_ACCESSED_TIME + lowest_timestamp = free_blocks[0].last_accessed for block in free_blocks: if block.last_accessed < lowest_timestamp: lowest_timestamp = block.last_accessed From a449eb67f35b6f310459e9b6c046eabf980a6e91 Mon Sep 17 00:00:00 2001 From: Sage Moore Date: Thu, 29 Feb 2024 14:01:53 +0000 Subject: [PATCH 71/79] addressing review comments --- vllm/config.py | 4 ++-- vllm/core/block_manager.py | 48 +++++++++++++++++++------------------- vllm/core/evictor.py | 36 ++++++++++++++-------------- vllm/core/scheduler.py | 2 +- vllm/engine/arg_utils.py | 4 ++-- vllm/sequence.py | 4 +++- 6 files changed, 50 insertions(+), 48 deletions(-) diff --git a/vllm/config.py b/vllm/config.py index e3b08bdc72c67..03f8cbd0c3d29 100644 --- a/vllm/config.py +++ b/vllm/config.py @@ -295,14 +295,14 @@ def __init__( swap_space: int, cache_dtype: str, sliding_window: Optional[int] = None, - disable_prefix_caching: bool = False, + enable_prefix_caching: bool = False, ) -> None: self.block_size = block_size self.gpu_memory_utilization = gpu_memory_utilization self.swap_space_bytes = swap_space * _GB self.cache_dtype = cache_dtype self.sliding_window = sliding_window - self.disable_prefix_caching = disable_prefix_caching + self.enable_prefix_caching = enable_prefix_caching self._verify_args() self._verify_cache_dtype() diff --git a/vllm/core/block_manager.py b/vllm/core/block_manager.py index 31e58fab916ec..1905dec232595 100644 --- a/vllm/core/block_manager.py +++ b/vllm/core/block_manager.py @@ -23,17 +23,17 @@ def __init__(self, block_size: int, num_blocks: int, eviction_policy: EvictionPolicy = EvictionPolicy.LRU, - disable_caching: bool = False) -> None: + enable_caching: bool = False) -> None: self.device = device self.block_size = block_size self.num_blocks = num_blocks - self.disable_caching = disable_caching + self.enable_caching = enable_caching self.current_num_blocks = 0 - self.table: Dict[int, PhysicalTokenBlock] = {} + self.cached_blocks: Dict[int, PhysicalTokenBlock] = {} # Switch over to FIFO eviction when caching is disabled - if self.disable_caching: + if not self.enable_caching: eviction_policy = EvictionPolicy.FIFO self.evictor: Evictor = make_evictor(eviction_policy) @@ -58,7 +58,7 @@ def 
allocate(self, block_hash: Optional[int] = None, num_hashed_tokens: int = 0) -> PhysicalTokenBlock: # If caching is disabled, just allocate a new block and return it - if self.disable_caching: + if not self.enable_caching: block = self.allocate_block(next(self.default_hash_ctr), num_hashed_tokens) block.ref_count += 1 @@ -67,17 +67,17 @@ def allocate(self, if block_hash is None: block_hash = next(self.default_hash_ctr) if block_hash in self.evictor: - assert block_hash not in self.table + assert block_hash not in self.cached_blocks block = self.evictor.remove(block_hash) assert block.ref_count == 0 - self.table[block_hash] = block + self.cached_blocks[block_hash] = block block.ref_count += 1 assert block.block_hash == block_hash return block - if block_hash not in self.table: - self.table[block_hash] = self.allocate_block( + if block_hash not in self.cached_blocks: + self.cached_blocks[block_hash] = self.allocate_block( block_hash, num_hashed_tokens) - block = self.table[block_hash] + block = self.cached_blocks[block_hash] assert block.block_hash == block_hash block.ref_count += 1 return block @@ -88,27 +88,27 @@ def free(self, block: PhysicalTokenBlock) -> None: block.ref_count -= 1 if block.ref_count == 0: assert block.block_hash not in self.evictor - self.evictor.append(block) + self.evictor.add(block) - # If caching is enabled, remove the block from the table - if not self.disable_caching: - del self.table[block.block_hash] + # If caching is enabled, remove the block from the cached_blocks + if self.enable_caching: + del self.cached_blocks[block.block_hash] def get_num_free_blocks(self) -> int: return self.num_blocks - self.current_num_blocks + self.evictor.num_blocks def contains_block(self, block_hash: int) -> bool: - return block_hash in self.table or block_hash in self.evictor + return block_hash in self.cached_blocks or block_hash in self.evictor def update_hash(self, block_hash: int, block: PhysicalTokenBlock): - assert (not self.contains_block(block_hash)) + assert not self.contains_block(block_hash) old_hash = block.block_hash block.block_hash = block_hash - # If caching is enabled, update the table - if not self.disable_caching: - del self.table[old_hash] - self.table[block_hash] = block + # If caching is enabled, update the cached_blocks + if self.enable_caching: + del self.cached_blocks[old_hash] + self.cached_blocks[block_hash] = block class AllocStatus(enum.Enum): @@ -135,7 +135,7 @@ def __init__( num_cpu_blocks: int, watermark: float = 0.01, sliding_window: Optional[int] = None, - disable_caching: bool = False, + enable_caching: bool = False, ) -> None: self.block_size = block_size self.num_total_gpu_blocks = num_gpu_blocks @@ -154,11 +154,11 @@ def __init__( self.gpu_allocator = BlockAllocator(Device.GPU, block_size, num_gpu_blocks, - disable_caching=disable_caching) + enable_caching=enable_caching) self.cpu_allocator = BlockAllocator(Device.CPU, block_size, num_cpu_blocks, - disable_caching=disable_caching) + enable_caching=enable_caching) # Mapping: seq_id -> BlockTable. 
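One detail of the allocate() path above: when caching is disabled, or a block has no content hash yet, its hash is drawn from an itertools.count() counter, so such blocks can never collide with a cached block or be shared. A tiny sketch of that split (an assumed, simplified helper, not the allocator itself):

from itertools import count
from typing import Optional

class BlockHashSource:
    def __init__(self, enable_caching: bool) -> None:
        self.enable_caching = enable_caching
        self._default_hash_ctr = count()  # unique ids for uncached blocks

    def hash_for_block(self, content_hash: Optional[int] = None) -> int:
        if not self.enable_caching or content_hash is None:
            return next(self._default_hash_ctr)
        return content_hash

src = BlockHashSource(enable_caching=False)
assert src.hash_for_block(1234) != src.hash_for_block(1234)  # never shared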
self.block_tables: Dict[int, BlockTable] = {} @@ -263,7 +263,7 @@ def _allocate_last_physical_block( len(seq.logical_token_blocks) - 1) new_block = self.gpu_allocator.allocate(block_hash, num_hashed_tokens) if block_hash is None: - assert (new_block.ref_count == 1) + assert new_block.ref_count == 1 return new_block def append_slot( diff --git a/vllm/core/evictor.py b/vllm/core/evictor.py index 8c6d520b4f53a..62757c74922cf 100644 --- a/vllm/core/evictor.py +++ b/vllm/core/evictor.py @@ -32,7 +32,7 @@ def evict(self) -> PhysicalTokenBlock: pass @abstractmethod - def append(self, block: PhysicalTokenBlock): + def add(self, block: PhysicalTokenBlock): """Adds block to the evictor, making it a candidate for eviction""" pass @@ -64,6 +64,7 @@ def __init__(self): def __contains__(self, block_hash: int) -> bool: return block_hash in self.free_table + # TODO: The performance of this evict function can be optimized further. def evict(self) -> PhysicalTokenBlock: free_blocks: List[PhysicalTokenBlock] = list(self.free_table.values()) if len(free_blocks) == 0: @@ -102,7 +103,7 @@ def evict(self) -> PhysicalTokenBlock: evicted_block.computed = False return evicted_block - def append(self, block: PhysicalTokenBlock): + def add(self, block: PhysicalTokenBlock): self.free_table[block.block_hash] = block def remove(self, block_hash: int) -> PhysicalTokenBlock: @@ -118,41 +119,40 @@ def num_blocks(self) -> int: return len(self.free_table) -class FIFOEvictor(Evictor): +class RandomEvictor(Evictor): """Evicts in a first-in-first-out order""" def __init__(self): - self.free_list: List[PhysicalTokenBlock] = [] + self.free_table: Dict[int, PhysicalTokenBlock] = {} def __contains__(self, block_hash: int) -> bool: - return any(block_hash == free_block.block_hash - for free_block in self.free_list) + return block_hash in self.free_table def evict(self) -> PhysicalTokenBlock: - if len(self.free_list) == 0: + if len(self.free_table) == 0: raise ValueError("No usable cache memory left") - return self.free_list.popleft() + return next(iter(self.free_table.values())) - def append(self, block: PhysicalTokenBlock): - self.free_list.append(block) + def add(self, block: PhysicalTokenBlock): + self.free_table[block.block_hash] = block def remove(self, block_hash: int) -> PhysicalTokenBlock: - for free_block in self.free_list: - if block_hash == free_block.block_hash: - self.free_list.remove(free_block) - return free_block - raise ValueError( - "Attempting to remove block that's not in the evictor") + if block_hash not in self.free_table: + raise ValueError( + "Attempting to remove block that's not in the evictor") + block: PhysicalTokenBlock = self.free_table[block_hash] + del self.free_table[block_hash] + return block @property def num_blocks(self) -> int: - return len(self.free_list) + return len(self.free_table) def make_evictor(eviction_policy: EvictionPolicy) -> Evictor: if eviction_policy == EvictionPolicy.LRU: return LRUEvictor() elif eviction_policy == EvictionPolicy.FIFO: - return FIFOEvictor() + return RandomEvictor() else: raise ValueError(f"Unknown cache eviction policy: {eviction_policy}") diff --git a/vllm/core/scheduler.py b/vllm/core/scheduler.py index 78cf9324585fb..1ae58f525b0fb 100644 --- a/vllm/core/scheduler.py +++ b/vllm/core/scheduler.py @@ -95,7 +95,7 @@ def __init__( num_gpu_blocks=self.cache_config.num_gpu_blocks, num_cpu_blocks=self.cache_config.num_cpu_blocks, sliding_window=self.cache_config.sliding_window, - disable_caching=self.cache_config.disable_prefix_caching) + 
enable_caching=self.cache_config.enable_prefix_caching) # Sequence groups in the WAITING state. self.waiting: Deque[SequenceGroup] = deque() diff --git a/vllm/engine/arg_utils.py b/vllm/engine/arg_utils.py index ac70310cd2a9a..55d5b1c0c6a80 100644 --- a/vllm/engine/arg_utils.py +++ b/vllm/engine/arg_utils.py @@ -25,7 +25,7 @@ class EngineArgs: tensor_parallel_size: int = 1 max_parallel_loading_workers: Optional[int] = None block_size: int = 16 - disable_prefix_caching: bool = False + enable_prefix_caching: bool = False swap_space: int = 4 # GiB gpu_memory_utilization: float = 0.90 max_num_batched_tokens: Optional[int] = None @@ -302,7 +302,7 @@ def create_engine_configs( self.gpu_memory_utilization, self.swap_space, self.kv_cache_dtype, model_config.get_sliding_window(), - self.disable_prefix_caching) + self.enable_prefix_caching) parallel_config = ParallelConfig(self.pipeline_parallel_size, self.tensor_parallel_size, self.worker_use_ray, diff --git a/vllm/sequence.py b/vllm/sequence.py index 1a7dc86718a8e..122960035e505 100644 --- a/vllm/sequence.py +++ b/vllm/sequence.py @@ -160,9 +160,11 @@ def __init__( def lora_int_id(self) -> int: return self.lora_request.lora_int_id if self.lora_request else 0 + # TODO The current hashing function is O(L^2). We should optimize this in + # the future. def hash_of_block(self, logical_idx: int) -> int: # Compute the number of tokens in the sequence - num_tokens = logical_idx * self.block_size + self.block_size + num_tokens = self.num_hashed_tokens_of_block(logical_idx) return hash(tuple(self.data.get_token_ids()[0:num_tokens])) def num_hashed_tokens_of_block(self, logical_idx: int): From 30708b891ffc379f6c54d302eaee30f5e3245b6d Mon Sep 17 00:00:00 2001 From: Sage Moore Date: Thu, 29 Feb 2024 14:58:30 +0000 Subject: [PATCH 72/79] minor evictor fix --- benchmarks/benchmark_throughput.py | 8 +++++++- vllm/core/evictor.py | 5 ++++- 2 files changed, 11 insertions(+), 2 deletions(-) diff --git a/benchmarks/benchmark_throughput.py b/benchmarks/benchmark_throughput.py index 1ad502526c97c..be774e7fc5584 100644 --- a/benchmarks/benchmark_throughput.py +++ b/benchmarks/benchmark_throughput.py @@ -73,6 +73,7 @@ def run_vllm( enforce_eager: bool, kv_cache_dtype: str, device: str, + enable_prefix_caching: bool, ) -> float: from vllm import LLM, SamplingParams llm = LLM( @@ -87,6 +88,7 @@ def run_vllm( enforce_eager=enforce_eager, kv_cache_dtype=kv_cache_dtype, device=device, + enable_prefix_caching=enable_prefix_caching ) # Add the requests to the engine. 
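# ---------------------------------------------------------------------------
# Editor's note (illustration only, not part of the patch series): the
# hash_of_block() change in vllm/sequence.py above makes block i's hash cover
# every token from the start of the sequence through the end of block i, via
# num_hashed_tokens_of_block(). The standalone sketch below shows the idea
# with a toy function; toy_hash_of_block, BLOCK_SIZE, prompt_a and prompt_b
# are hypothetical names, not vLLM APIs.
from typing import List

BLOCK_SIZE = 4

def toy_hash_of_block(token_ids: List[int], logical_idx: int) -> int:
    # Hash all tokens from position 0 through the end of block `logical_idx`,
    # which is what the prefix-caching hash in the diff above is built on.
    num_tokens = (logical_idx + 1) * BLOCK_SIZE
    return hash(tuple(token_ids[:num_tokens]))

prompt_a = [1, 2, 3, 4, 5, 6, 7, 8]  # two full blocks
prompt_b = [1, 2, 3, 4, 9, 9, 9, 9]  # shares only the first block with prompt_a

# A shared prefix yields an identical hash for block 0, so that physical block
# can be reused across sequences; the prompts diverge in block 1, so those
# hashes differ.
assert toy_hash_of_block(prompt_a, 0) == toy_hash_of_block(prompt_b, 0)
assert toy_hash_of_block(prompt_a, 1) != toy_hash_of_block(prompt_b, 1)

# Hashing block i touches (i + 1) * BLOCK_SIZE tokens, so hashing every block
# of an L-token sequence is O(L^2) overall -- the TODO added in
# vllm/sequence.py flags exactly this as a future optimization.
# ---------------------------------------------------------------------------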
@@ -211,7 +213,7 @@ def main(args: argparse.Namespace): args.seed, args.n, args.use_beam_search, args.trust_remote_code, args.dtype, args.max_model_len, args.enforce_eager, - args.kv_cache_dtype, args.device) + args.kv_cache_dtype, args.device, args.enable_prefix_caching) elif args.backend == "hf": assert args.tensor_parallel_size == 1 elapsed_time = run_hf(requests, args.model, tokenizer, args.n, @@ -302,6 +304,10 @@ def main(args: argparse.Namespace): default="cuda", choices=["cuda"], help='device type for vLLM execution, supporting CUDA only currently.') + parser.add_argument( + "--enable_prefix_caching", + action='store_true' + ) args = parser.parse_args() if args.tokenizer is None: args.tokenizer = args.model diff --git a/vllm/core/evictor.py b/vllm/core/evictor.py index 62757c74922cf..b538ea574b604 100644 --- a/vllm/core/evictor.py +++ b/vllm/core/evictor.py @@ -131,7 +131,10 @@ def __contains__(self, block_hash: int) -> bool: def evict(self) -> PhysicalTokenBlock: if len(self.free_table) == 0: raise ValueError("No usable cache memory left") - return next(iter(self.free_table.values())) + evicted_block = next(iter(self.free_table.values())) + evicted_block.computed = False + del self.free_table[evicted_block.block_hash] + return evicted_block def add(self, block: PhysicalTokenBlock): self.free_table[block.block_hash] = block From 4e996602cdee3bc7a13a046743db8a5cd9e29611 Mon Sep 17 00:00:00 2001 From: Sage Moore Date: Thu, 29 Feb 2024 16:08:10 +0000 Subject: [PATCH 73/79] format --- benchmarks/benchmark_throughput.py | 34 +++++++++++++----------------- 1 file changed, 15 insertions(+), 19 deletions(-) diff --git a/benchmarks/benchmark_throughput.py b/benchmarks/benchmark_throughput.py index be774e7fc5584..51c1a6540a451 100644 --- a/benchmarks/benchmark_throughput.py +++ b/benchmarks/benchmark_throughput.py @@ -76,20 +76,18 @@ def run_vllm( enable_prefix_caching: bool, ) -> float: from vllm import LLM, SamplingParams - llm = LLM( - model=model, - tokenizer=tokenizer, - quantization=quantization, - tensor_parallel_size=tensor_parallel_size, - seed=seed, - trust_remote_code=trust_remote_code, - dtype=dtype, - max_model_len=max_model_len, - enforce_eager=enforce_eager, - kv_cache_dtype=kv_cache_dtype, - device=device, - enable_prefix_caching=enable_prefix_caching - ) + llm = LLM(model=model, + tokenizer=tokenizer, + quantization=quantization, + tensor_parallel_size=tensor_parallel_size, + seed=seed, + trust_remote_code=trust_remote_code, + dtype=dtype, + max_model_len=max_model_len, + enforce_eager=enforce_eager, + kv_cache_dtype=kv_cache_dtype, + device=device, + enable_prefix_caching=enable_prefix_caching) # Add the requests to the engine. 
for prompt, _, output_len in requests: @@ -213,7 +211,8 @@ def main(args: argparse.Namespace): args.seed, args.n, args.use_beam_search, args.trust_remote_code, args.dtype, args.max_model_len, args.enforce_eager, - args.kv_cache_dtype, args.device, args.enable_prefix_caching) + args.kv_cache_dtype, args.device, + args.enable_prefix_caching) elif args.backend == "hf": assert args.tensor_parallel_size == 1 elapsed_time = run_hf(requests, args.model, tokenizer, args.n, @@ -304,10 +303,7 @@ def main(args: argparse.Namespace): default="cuda", choices=["cuda"], help='device type for vLLM execution, supporting CUDA only currently.') - parser.add_argument( - "--enable_prefix_caching", - action='store_true' - ) + parser.add_argument("--enable_prefix_caching", action='store_true') args = parser.parse_args() if args.tokenizer is None: args.tokenizer = args.model From 5b4413b132bcd731cb7f826361a28fa069207941 Mon Sep 17 00:00:00 2001 From: ElizaWszola Date: Thu, 29 Feb 2024 12:12:55 -0500 Subject: [PATCH 74/79] More protection against sliding window --- vllm/worker/model_runner.py | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/vllm/worker/model_runner.py b/vllm/worker/model_runner.py index 979a2503595bb..01b5bbeec024c 100644 --- a/vllm/worker/model_runner.py +++ b/vllm/worker/model_runner.py @@ -143,10 +143,9 @@ def _prepare_prompt( # NOTE: This only works for oooooooxxx style attention. computed_block_nums = seq_group_metadata.computed_block_nums if computed_block_nums is not None and len( - computed_block_nums) > 0: + computed_block_nums) > 0 and self.sliding_window is None: # Prefix is not supported with sliding_window - if self.sliding_window is None: - computed_len = len(computed_block_nums) * self.block_size + computed_len = len(computed_block_nums) * self.block_size prompt_tokens = prompt_tokens[computed_len:] prefix_block_tables.append(computed_block_nums) else: From 7d17304e4b5450de7c65a11a7f95b33e733889ba Mon Sep 17 00:00:00 2001 From: ElizaWszola Date: Thu, 29 Feb 2024 12:22:49 -0500 Subject: [PATCH 75/79] Change automatic prefix caching arg to enable in arg utils --- docs/source/models/engine_args.rst | 4 ++-- vllm/engine/arg_utils.py | 4 ++-- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/docs/source/models/engine_args.rst b/docs/source/models/engine_args.rst index 945e315d663fd..9f5f672ae4f34 100644 --- a/docs/source/models/engine_args.rst +++ b/docs/source/models/engine_args.rst @@ -81,9 +81,9 @@ Below, you can find an explanation of every engine argument for vLLM: Token block size for contiguous chunks of tokens. -.. option:: --disable-prefix-caching +.. option:: --enable-prefix-caching - Disables automatic prefix caching + Enables automatic prefix caching .. 
option:: --seed diff --git a/vllm/engine/arg_utils.py b/vllm/engine/arg_utils.py index 55d5b1c0c6a80..59ecfaab77803 100644 --- a/vllm/engine/arg_utils.py +++ b/vllm/engine/arg_utils.py @@ -175,9 +175,9 @@ def add_cli_args( choices=[8, 16, 32], help='token block size') - parser.add_argument('--disable-prefix-caching', + parser.add_argument('--enable-prefix-caching', action='store_true', - help='Disables automatic prefix caching') + help='Enables automatic prefix caching') parser.add_argument('--seed', type=int, From 6358bf0ab9bb1f4525dd3f23f41ea6579038b7c8 Mon Sep 17 00:00:00 2001 From: Sage Moore Date: Thu, 29 Feb 2024 20:32:37 +0000 Subject: [PATCH 76/79] fix minor BlockAllocator update_hash bug --- vllm/core/block_manager.py | 9 ++++----- 1 file changed, 4 insertions(+), 5 deletions(-) diff --git a/vllm/core/block_manager.py b/vllm/core/block_manager.py index 1905dec232595..01b3a678c65ff 100644 --- a/vllm/core/block_manager.py +++ b/vllm/core/block_manager.py @@ -101,12 +101,11 @@ def contains_block(self, block_hash: int) -> bool: return block_hash in self.cached_blocks or block_hash in self.evictor def update_hash(self, block_hash: int, block: PhysicalTokenBlock): - assert not self.contains_block(block_hash) - old_hash = block.block_hash - block.block_hash = block_hash - - # If caching is enabled, update the cached_blocks + # If caching is enabled, update the hash of block and the cached_blocks dictionary. if self.enable_caching: + assert not self.contains_block(block_hash) + old_hash = block.block_hash + block.block_hash = block_hash del self.cached_blocks[old_hash] self.cached_blocks[block_hash] = block From b9fbb666b9e04ae660cfa96eeb806af2042fee1f Mon Sep 17 00:00:00 2001 From: Sage Moore Date: Thu, 29 Feb 2024 20:34:02 +0000 Subject: [PATCH 77/79] fix test_prefix_caching test --- tests/prefix_caching/test_prefix_caching.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tests/prefix_caching/test_prefix_caching.py b/tests/prefix_caching/test_prefix_caching.py index de4a19df89097..1a614619f5594 100644 --- a/tests/prefix_caching/test_prefix_caching.py +++ b/tests/prefix_caching/test_prefix_caching.py @@ -15,7 +15,7 @@ def test_block_allocator( num_blocks: int, ): block_hash = 1 - block_allocator = BlockAllocator(Device.CPU, block_size, num_blocks) + block_allocator = BlockAllocator(Device.CPU, block_size, num_blocks, enable_caching=True) # Allocate two PysicalTokenBlocks with the same hash and check that they are the same PhysicalTokenBlock first_block = block_allocator.allocate(block_hash, 0) @@ -39,7 +39,7 @@ def test_block_allocator( @pytest.mark.parametrize("num_blocks", [16]) def test_eviction(num_blocks: int, ): block_size = 16 - block_allocator = BlockAllocator(Device.CPU, block_size, num_blocks) + block_allocator = BlockAllocator(Device.CPU, block_size, num_blocks, enable_caching=True) blocks = [] for i in range(num_blocks): From 4ce8ceb863389998ad6fede573a51a2cab6202c9 Mon Sep 17 00:00:00 2001 From: Sage Moore Date: Fri, 1 Mar 2024 01:54:37 +0000 Subject: [PATCH 78/79] fix minor perf regression --- tests/prefix_caching/test_prefix_caching.py | 10 ++++++++-- vllm/engine/llm_engine.py | 7 +++++-- 2 files changed, 13 insertions(+), 4 deletions(-) diff --git a/tests/prefix_caching/test_prefix_caching.py b/tests/prefix_caching/test_prefix_caching.py index 1a614619f5594..7ef8dde7bb8f6 100644 --- a/tests/prefix_caching/test_prefix_caching.py +++ b/tests/prefix_caching/test_prefix_caching.py @@ -15,7 +15,10 @@ def test_block_allocator( num_blocks: int, ): 
block_hash = 1 - block_allocator = BlockAllocator(Device.CPU, block_size, num_blocks, enable_caching=True) + block_allocator = BlockAllocator(Device.CPU, + block_size, + num_blocks, + enable_caching=True) # Allocate two PysicalTokenBlocks with the same hash and check that they are the same PhysicalTokenBlock first_block = block_allocator.allocate(block_hash, 0) @@ -39,7 +42,10 @@ def test_block_allocator( @pytest.mark.parametrize("num_blocks", [16]) def test_eviction(num_blocks: int, ): block_size = 16 - block_allocator = BlockAllocator(Device.CPU, block_size, num_blocks, enable_caching=True) + block_allocator = BlockAllocator(Device.CPU, + block_size, + num_blocks, + enable_caching=True) blocks = [] for i in range(num_blocks): diff --git a/vllm/engine/llm_engine.py b/vllm/engine/llm_engine.py index 316478eafadf3..4d7731e3f8ca3 100644 --- a/vllm/engine/llm_engine.py +++ b/vllm/engine/llm_engine.py @@ -722,8 +722,11 @@ def _process_model_outputs( # Update the scheduled sequence groups with the model outputs. scheduled_seq_groups = scheduler_outputs.scheduled_seq_groups - for seq_group in scheduled_seq_groups: - self.scheduler.mark_blocks_as_computed(seq_group) + # If prefix caching is enabled, mark all blocks in the sequence groups + # as completed so that future requests don't attempt to recompute them + if self.cache_config.enable_prefix_caching: + for seq_group in scheduled_seq_groups: + self.scheduler.mark_blocks_as_computed(seq_group) for seq_group, outputs in zip(scheduled_seq_groups, output): self._process_sequence_group_outputs(seq_group, outputs) From 11126ab599e281e619d21a3be82d9e087ffcd201 Mon Sep 17 00:00:00 2001 From: ElizaWszola Date: Fri, 1 Mar 2024 02:33:28 -0500 Subject: [PATCH 79/79] Only mark last prefix block as computed, assume no computed blocks with caching disabled --- vllm/core/block_manager.py | 44 ++++++++++++++++++++------------------ 1 file changed, 23 insertions(+), 21 deletions(-) diff --git a/vllm/core/block_manager.py b/vllm/core/block_manager.py index 01b3a678c65ff..08d519ab767a9 100644 --- a/vllm/core/block_manager.py +++ b/vllm/core/block_manager.py @@ -1,6 +1,6 @@ """A block manager that manages token blocks.""" import enum -from itertools import takewhile, count +from itertools import count from os.path import commonprefix from typing import Dict, List, Optional, Set, Tuple @@ -149,6 +149,8 @@ def __init__( self.watermark = watermark assert watermark >= 0.0 + self.enable_caching = enable_caching + self.watermark_blocks = int(watermark * num_gpu_blocks) self.gpu_allocator = BlockAllocator(Device.GPU, block_size, @@ -431,39 +433,39 @@ def access_all_blocks_in_seq( for block in block_table: block.last_accessed = access_time - def compute_all_blocks_in_seq(self, seq: Sequence): + def compute_last_full_block_in_seq(self, seq: Sequence): if seq.seq_id not in self.block_tables: return + max_full_block = seq.get_len() // seq.block_size - 1 block_table = self.block_tables[seq.seq_id] - counter = 0 - max_computed_blocks = seq.get_len() // seq.block_size - for block in block_table: - if counter >= max_computed_blocks: - return - block.computed = True - counter += 1 + if max_full_block == -1: + return + block_table[max_full_block].computed = True - def get_all_computed_block_ids_seq(self, seq: Sequence) -> List[int]: + def get_all_block_ids_till_computed(self, seq: Sequence) -> List[int]: if seq.seq_id not in self.block_tables: return [] block_table = self.block_tables[seq.seq_id] - last_block = block_table[-1] - # We want to get the first n contiguous completed 
blocks - # We exclude the last block because it's most likely not cached yet - return [ - block.block_number for block in takewhile( - lambda block: block.computed and block != last_block, - block_table) - ] + for block_idx in reversed(range(len(block_table))): + if block_table[block_idx].computed: + return [b.block_number for b in block_table[:block_idx + 1]] + return [] + # Can return non-empty result only with prefix caching enabled. def get_common_computed_block_ids(self, seq_group: SequenceGroup) -> List[int]: + if not self.enable_caching: + return [] + ids_list = [ - self.get_all_computed_block_ids_seq(seq) + self.get_all_block_ids_till_computed(seq) for seq in iter(seq_group.seqs_dict.values()) ] return commonprefix([ids for ids in ids_list if ids != []]) + # We only mark the last full block because with prefix caching, + # all blocks until the marked one are guaranteed to be computed. def mark_blocks_as_computed(self, seq_group: SequenceGroup): - for seq in seq_group.seqs_dict.values(): - self.compute_all_blocks_in_seq(seq) + if self.enable_caching: + for seq in seq_group.seqs_dict.values(): + self.compute_last_full_block_in_seq(seq)
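
As a closing illustration of the allocator semantics these patches converge on, here is a minimal usage sketch modeled on tests/prefix_caching/test_prefix_caching.py. It is an editorial addendum, not part of the patch series, and the import paths are assumptions inferred from the diffs (BlockAllocator is defined in vllm/core/block_manager.py; Device is assumed to live in vllm.utils).

    from vllm.core.block_manager import BlockAllocator
    from vllm.utils import Device  # assumed location of the Device enum

    allocator = BlockAllocator(Device.CPU,
                               block_size=16,
                               num_blocks=16,
                               enable_caching=True)

    # Two allocations with the same content hash share one physical block.
    block_hash = 1
    first = allocator.allocate(block_hash, 0)
    second = allocator.allocate(block_hash, 0)
    assert first is second
    assert first.ref_count == 2

    # Freeing both references keeps the block around as an eviction candidate
    # rather than discarding it, so a later request with the same hash gets
    # the cached block back instead of a fresh one.
    allocator.free(first)
    allocator.free(second)
    assert allocator.contains_block(block_hash)
    assert allocator.allocate(block_hash, 0) is first

With enable_caching=False the allocator instead hands out fresh blocks keyed by an internal counter and uses the non-LRU evictor, matching the behavior before automatic prefix caching.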