From d1a91aa81e68ee29db6e0f66f387b4ec1aeb1c95 Mon Sep 17 00:00:00 2001 From: Sage Moore Date: Fri, 2 Feb 2024 15:42:42 -0500 Subject: [PATCH 01/79] init --- tests/prefix_caching/test_prefix_caching.py | 1 - vllm/core/block_manager.py | 79 ++++++++++----------- vllm/core/scheduler.py | 5 -- vllm/engine/llm_engine.py | 13 +--- vllm/sequence.py | 12 ++-- vllm/worker/model_runner.py | 8 +-- 6 files changed, 47 insertions(+), 71 deletions(-) diff --git a/tests/prefix_caching/test_prefix_caching.py b/tests/prefix_caching/test_prefix_caching.py index 1e301bedfc21e..dded5e1b0f7a4 100644 --- a/tests/prefix_caching/test_prefix_caching.py +++ b/tests/prefix_caching/test_prefix_caching.py @@ -38,4 +38,3 @@ def test_prefix_caching( outputs_without_prefix, outputs_with_prefix): assert (output_without_prefix.outputs[0].token_ids == output_with_prefix.outputs[0].token_ids) - assert len(llm.llm_engine.scheduler.prefix_pool.prefixes) == 1 diff --git a/vllm/core/block_manager.py b/vllm/core/block_manager.py index 7f91051f03ac1..7b6bbe6b60a4a 100644 --- a/vllm/core/block_manager.py +++ b/vllm/core/block_manager.py @@ -1,6 +1,7 @@ """A block manager that manages token blocks.""" import enum -from typing import Dict, List, Optional, Set, Tuple +from collections import deque +from typing import Dict, List, Optional, Set, Tuple, Deque from vllm.block import BlockTable, PhysicalTokenBlock from vllm.sequence import Sequence, SequenceGroup, SequenceStatus @@ -25,22 +26,38 @@ def __init__( self.block_size = block_size self.num_blocks = num_blocks + self.current_num_blocks = 0 + self.table: Dict[int, PhysicalTokenBlock] = {} # Initialize the free blocks. - self.free_blocks: BlockTable = [] - for i in range(num_blocks): - block = PhysicalTokenBlock(device=device, - block_number=i, - block_size=block_size) - self.free_blocks.append(block) + self.free_blocks: Deque[PhysicalTokenBlock] = deque() - def allocate(self) -> PhysicalTokenBlock: - if not self.free_blocks: - raise ValueError("Out of memory! No free blocks are available.") + def evict(self) -> PhysicalTokenBlock: + assert (len(self.free_blocks)) + # Find the block in the main hash table block = self.free_blocks.pop() - block.ref_count = 1 + key = list(self.table.keys())[list(self.table.values()).index()] + del self.table[key] + return block + + def allocate_block(self) -> PhysicalTokenBlock: + if self.current_num_blocks == self.num_blocks: + return self.evict() + block = PhysicalTokenBlock(device=self.device, + block_number=self.current_num_blocks, + block_size=self.block_size) + self.current_num_blocks += 1 + return block + + def allocate(self, i: int) -> PhysicalTokenBlock: + if i not in self.table: + self.table[i] = self.allocate_block() + block = self.table[i] + block.ref_count += 1 + # print(f"REFCOUNT ON ALLOCTION: {block}") return block def free(self, block: PhysicalTokenBlock) -> None: + # print(f"FREEING: {block}") if block.ref_count == 0: raise ValueError(f"Double free! 
{block} is already freed.") block.ref_count -= 1 @@ -48,7 +65,7 @@ def free(self, block: PhysicalTokenBlock) -> None: self.free_blocks.append(block) def get_num_free_blocks(self) -> int: - return len(self.free_blocks) + return self.num_blocks - self.current_num_blocks class AllocStatus(enum.Enum): @@ -103,9 +120,6 @@ def can_allocate(self, seq_group: SequenceGroup) -> AllocStatus: seq = seq_group.get_seqs(status=SequenceStatus.WAITING)[0] num_required_blocks = len(seq.logical_token_blocks) - if seq_group.prefix is not None and seq_group.prefix.allocated: - num_required_blocks -= seq_group.prefix.get_num_blocks() - if self.block_sliding_window is not None: num_required_blocks = min(num_required_blocks, self.block_sliding_window) @@ -129,36 +143,17 @@ def allocate(self, seq_group: SequenceGroup) -> None: num_prompt_blocks = len(seq.logical_token_blocks) block_table: BlockTable = [] - prefix_block_table: BlockTable = [] - num_prefix_blocks = 0 - - prefix = seq_group.prefix - if prefix is not None and prefix.allocated: - # Prefix has already been allocated. Use the existing block table. - num_prompt_blocks -= prefix.get_num_blocks() - for block in prefix.block_table: - block.ref_count += seq_group.num_seqs() - block_table.append(block) for logical_idx in range(num_prompt_blocks): if (self.block_sliding_window is not None and logical_idx >= self.block_sliding_window): block = block_table[logical_idx % self.block_sliding_window] else: - block = self.gpu_allocator.allocate() + block = self.gpu_allocator.allocate(seq.hash(logical_idx)) # Set the reference counts of the token blocks. - block.ref_count = seq_group.num_seqs() + # block.ref_count = seq_group.num_seqs() block_table.append(block) - if prefix is not None and not prefix.allocated: - # Allocate blocks for the prefix, we will compute the prefix's - # KV cache in this run. - num_prefix_blocks = prefix.get_num_blocks() - prefix_block_table = block_table[:num_prefix_blocks] - for block in prefix_block_table: - block.ref_count += 1 - prefix.set_block_table(prefix_block_table) - # Assign the block table for each sequence. for seq in seq_group.get_seqs(status=SequenceStatus.WAITING): self.block_tables[seq.seq_id] = block_table.copy() @@ -184,7 +179,8 @@ def append_slot(self, seq: Sequence) -> Optional[Tuple[int, int]]: else: # The sequence has a new logical block. # Allocate a new physical block. - block = self.gpu_allocator.allocate() + block = self.gpu_allocator.allocate( + seq.hash(len(logical_blocks) - 1)) block_table.append(block) return None @@ -197,7 +193,8 @@ def append_slot(self, seq: Sequence) -> Optional[Tuple[int, int]]: else: # The last block is shared with other sequences. # Copy on Write: Allocate a new block and copy the tokens. - new_block = self.gpu_allocator.allocate() + new_block = self.gpu_allocator.allocate( + seq.hash(len(logical_blocks) - 1)) block_table[-1] = new_block self.gpu_allocator.free(last_block) return last_block.block_number, new_block.block_number @@ -251,7 +248,8 @@ def swap_in(self, seq_group: SequenceGroup) -> Dict[int, int]: gpu_block = mapping[cpu_block] gpu_block.ref_count += 1 else: - gpu_block = self.gpu_allocator.allocate() + gpu_block = self.gpu_allocator.allocate( + seq.hash(len(seq.logical_blocks) - 1)) mapping[cpu_block] = gpu_block new_block_table.append(gpu_block) # Free the CPU block swapped in to GPU. 
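The hunks above replace free-list popping with a content-addressed lookup: allocate() now takes the hash of the tokens a logical block holds (seq.hash(logical_idx)), so a second sequence whose prompt shares that prefix gets the existing physical block back and only bumps its reference count. A minimal standalone sketch of the idea, using toy names rather than the real vLLM classes:

from dataclasses import dataclass
from typing import Dict

@dataclass
class ToyBlock:
    number: int
    ref_count: int = 0

class ToyHashedAllocator:
    """One physical block per distinct content hash."""

    def __init__(self) -> None:
        self.table: Dict[int, ToyBlock] = {}  # content hash -> physical block

    def allocate(self, block_hash: int) -> ToyBlock:
        # Cache hit: hand back the existing block instead of a fresh one.
        if block_hash not in self.table:
            self.table[block_hash] = ToyBlock(number=len(self.table))
        block = self.table[block_hash]
        block.ref_count += 1
        return block

allocator = ToyHashedAllocator()
h = hash((101, 102, 103, 104))      # stands in for Sequence.hash(logical_idx)
a = allocator.allocate(h)
b = allocator.allocate(h)           # second sequence, same prompt prefix
assert a is b and a.ref_count == 2  # shared prefix -> shared physical block

The real allocator additionally has to respect num_blocks and evict something once the table is full, which is what the eviction commits below build up.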
@@ -286,7 +284,8 @@ def swap_out(self, seq_group: SequenceGroup) -> Dict[int, int]: cpu_block = mapping[gpu_block] cpu_block.ref_count += 1 else: - cpu_block = self.cpu_allocator.allocate() + cpu_block = self.cpu_allocator.allocate( + seq.hash(len(seq.logical_blocks) - 1)) mapping[gpu_block] = cpu_block new_block_table.append(cpu_block) # Free the GPU block swapped out to CPU. diff --git a/vllm/core/scheduler.py b/vllm/core/scheduler.py index 4fdf9ec341cfd..213f9bb9cf30c 100644 --- a/vllm/core/scheduler.py +++ b/vllm/core/scheduler.py @@ -10,7 +10,6 @@ from vllm.logger import init_logger from vllm.sequence import (Sequence, SequenceData, SequenceGroup, SequenceGroupMetadata, SequenceStatus) -from vllm.prefix import PrefixPool logger = init_logger(__name__) @@ -97,9 +96,6 @@ def __init__( num_cpu_blocks=self.cache_config.num_cpu_blocks, sliding_window=self.cache_config.sliding_window) - # Create the prefix pool to cache the prefixes. - self.prefix_pool = PrefixPool(self.cache_config.block_size) - # Sequence groups in the WAITING state. self.waiting: Deque[SequenceGroup] = deque() # Sequence groups in the RUNNING state. @@ -383,7 +379,6 @@ def schedule(self) -> Tuple[List[SequenceGroupMetadata], SchedulerOutputs]: sampling_params=seq_group.sampling_params, block_tables=block_tables, lora_request=seq_group.lora_request, - prefix=seq_group.prefix, ) seq_group_metadata_list.append(seq_group_metadata) return seq_group_metadata_list, scheduler_outputs diff --git a/vllm/engine/llm_engine.py b/vllm/engine/llm_engine.py index 0dedc232292dd..8258eba4453cb 100644 --- a/vllm/engine/llm_engine.py +++ b/vllm/engine/llm_engine.py @@ -438,14 +438,9 @@ def add_request( seq = Sequence(seq_id, prompt, prompt_token_ids, block_size, lora_request) - # Check whether the input specifies prefix - prefix = self.scheduler.prefix_pool.add_or_get_prefix( - prompt_token_ids[:prefix_pos], lora_request.lora_int_id - if lora_request else 0) if prefix_pos is not None else None - # Create the sequence group. seq_group = SequenceGroup(request_id, [seq], sampling_params, - arrival_time, lora_request, prefix) + arrival_time, lora_request) # Add the sequence group to the scheduler. self.scheduler.add_seq_group(seq_group) @@ -720,12 +715,6 @@ def _process_model_outputs( request_output = RequestOutput.from_seq_group(seq_group) request_outputs.append(request_output) - # Update prefix state, now all the uncomputed prefixes are computed. - for seq_group in scheduled_seq_groups: - if (seq_group.prefix is not None and seq_group.prefix.allocated - and not seq_group.prefix.computed): - seq_group.prefix.computed = True - if self.log_stats: # Log the system stats. 
self._log_system_stats(scheduler_outputs.prompt_run, diff --git a/vllm/sequence.py b/vllm/sequence.py index d28627f47498f..ad50873e338b0 100644 --- a/vllm/sequence.py +++ b/vllm/sequence.py @@ -4,7 +4,6 @@ from typing import Dict, List, Optional, Union from vllm.block import LogicalTokenBlock -from vllm.prefix import Prefix from vllm.sampling_params import SamplingParams from vllm.lora.request import LoRARequest @@ -142,6 +141,12 @@ def __init__( def lora_int_id(self) -> int: return self.lora_request.lora_int_id if self.lora_request else 0 + def hash(self, logical_idx: int) -> int: + num_tokens = (logical_idx * self.block_size) + ( + self.block_size - + self.logical_token_blocks[logical_idx].get_num_empty_slots()) + return hash(tuple(self.data.get_token_ids()[0:num_tokens])) + def _append_logical_block(self) -> None: block = LogicalTokenBlock( block_number=len(self.logical_token_blocks), @@ -248,14 +253,12 @@ def __init__( sampling_params: SamplingParams, arrival_time: float, lora_request: Optional[LoRARequest] = None, - prefix: Optional[Prefix] = None, ) -> None: self.request_id = request_id self.seqs_dict = {seq.seq_id: seq for seq in seqs} self.sampling_params = sampling_params self.arrival_time = arrival_time self.lora_request = lora_request - self.prefix: Optional[Prefix] = prefix self.prompt_logprobs: Optional[PromptLogprobs] = None @property @@ -354,7 +357,6 @@ class SequenceGroupMetadata: block_tables: The block tables. (Seq id -> list of physical block numbers) lora_request: LoRA request. - prefix: The prefix of the prompt of the sequence group. """ def __init__( @@ -365,7 +367,6 @@ def __init__( sampling_params: SamplingParams, block_tables: Dict[int, List[int]], lora_request: Optional[LoRARequest] = None, - prefix: Optional[Prefix] = None, ) -> None: self.request_id = request_id self.is_prompt = is_prompt @@ -373,7 +374,6 @@ def __init__( self.sampling_params = sampling_params self.block_tables = block_tables self.lora_request = lora_request - self.prefix = prefix @property def lora_int_id(self) -> int: diff --git a/vllm/worker/model_runner.py b/vllm/worker/model_runner.py index 985115613e044..590eaab77901b 100644 --- a/vllm/worker/model_runner.py +++ b/vllm/worker/model_runner.py @@ -117,13 +117,7 @@ def _prepare_prompt( prompt_len = len(prompt_tokens) prompt_lens.append(prompt_len) prefix_len = 0 - prefix = seq_group_metadata.prefix - if prefix is not None and prefix.computed: - prefix_len = prefix.get_length() - prompt_tokens = prompt_tokens[prefix_len:] - prefix_block_tables.append(prefix.get_block_numbers()) - else: - prefix_block_tables.append([]) + prefix_block_tables.append([]) # actual prompt lens context_lens.append(prefix_len) subquery_lens.append(prompt_len - prefix_len) From ec211305e4f0f16ae680c9223b402973dd380422 Mon Sep 17 00:00:00 2001 From: ElizaWszola Date: Mon, 5 Feb 2024 09:02:42 -0500 Subject: [PATCH 02/79] Move evictor and eviction policy to a separate class --- vllm/core/block_manager.py | 48 +++++++++++++++++++++++++++++++------- 1 file changed, 39 insertions(+), 9 deletions(-) diff --git a/vllm/core/block_manager.py b/vllm/core/block_manager.py index 7b6bbe6b60a4a..2c5c445a08aef 100644 --- a/vllm/core/block_manager.py +++ b/vllm/core/block_manager.py @@ -7,6 +7,40 @@ from vllm.sequence import Sequence, SequenceGroup, SequenceStatus from vllm.utils import Device +class EvictionPolicy(enum.Enum): + """Enum for eviction policy used by BlockAllocator.""" + LRU = enum.auto() + + +class Evictor: + """Evicts physical blocks on cache based on eviction 
policy.""" + + def __init__( + self, + eviction_policy: EvictionPolicy = EvictionPolicy.LRU + ) -> None: + self.eviction_policy = eviction_policy + + # Initialize the free blocks. + self.free_blocks: Deque[PhysicalTokenBlock] = deque() + + def evict( + self, + table: Dict[int, PhysicalTokenBlock] + ) -> PhysicalTokenBlock: + match(self.eviction_policy): + case EvictionPolicy.LRU: + assert (len(self.free_blocks)) + # Find the block in the main hash table + block = self.free_blocks.pop() + key = list(table.keys())[list(table.values()).index()] + del table[key] + return block + case _: + raise ValueError(f"Unknown cache eviction policy: {self.eviction_policy}") + + def return_block(self, block: PhysicalTokenBlock) -> None: + self.free_blocks.append(block) class BlockAllocator: """Manages free physical token blocks for a device. @@ -21,23 +55,19 @@ def __init__( device: Device, block_size: int, num_blocks: int, + eviction_policy: EvictionPolicy = EvictionPolicy.LRU ) -> None: self.device = device self.block_size = block_size self.num_blocks = num_blocks + self.evictor = Evictor(eviction_policy) + self.current_num_blocks = 0 self.table: Dict[int, PhysicalTokenBlock] = {} - # Initialize the free blocks. - self.free_blocks: Deque[PhysicalTokenBlock] = deque() def evict(self) -> PhysicalTokenBlock: - assert (len(self.free_blocks)) - # Find the block in the main hash table - block = self.free_blocks.pop() - key = list(self.table.keys())[list(self.table.values()).index()] - del self.table[key] - return block + return self.evictor.evict(self.table) def allocate_block(self) -> PhysicalTokenBlock: if self.current_num_blocks == self.num_blocks: @@ -62,7 +92,7 @@ def free(self, block: PhysicalTokenBlock) -> None: raise ValueError(f"Double free! {block} is already freed.") block.ref_count -= 1 if block.ref_count == 0: - self.free_blocks.append(block) + self.evictor.return_block(block) def get_num_free_blocks(self) -> int: return self.num_blocks - self.current_num_blocks From 73ab52cbf8dd53eeaec642e830b0089cf73812e3 Mon Sep 17 00:00:00 2001 From: ElizaWszola Date: Mon, 5 Feb 2024 09:14:39 -0500 Subject: [PATCH 03/79] format, replace match with if-else --- vllm/core/block_manager.py | 48 +++++++++++++++++--------------------- 1 file changed, 22 insertions(+), 26 deletions(-) diff --git a/vllm/core/block_manager.py b/vllm/core/block_manager.py index 2c5c445a08aef..c182f702fa20b 100644 --- a/vllm/core/block_manager.py +++ b/vllm/core/block_manager.py @@ -7,41 +7,39 @@ from vllm.sequence import Sequence, SequenceGroup, SequenceStatus from vllm.utils import Device + class EvictionPolicy(enum.Enum): """Enum for eviction policy used by BlockAllocator.""" LRU = enum.auto() class Evictor: - """Evicts physical blocks on cache based on eviction policy.""" + """Evicts physical blocks from cache based on eviction policy.""" - def __init__( - self, - eviction_policy: EvictionPolicy = EvictionPolicy.LRU - ) -> None: + def __init__(self, + eviction_policy: EvictionPolicy = EvictionPolicy.LRU) -> None: self.eviction_policy = eviction_policy # Initialize the free blocks. 
self.free_blocks: Deque[PhysicalTokenBlock] = deque() - def evict( - self, - table: Dict[int, PhysicalTokenBlock] - ) -> PhysicalTokenBlock: - match(self.eviction_policy): - case EvictionPolicy.LRU: - assert (len(self.free_blocks)) - # Find the block in the main hash table - block = self.free_blocks.pop() - key = list(table.keys())[list(table.values()).index()] - del table[key] - return block - case _: - raise ValueError(f"Unknown cache eviction policy: {self.eviction_policy}") + def evict(self, table: Dict[int, + PhysicalTokenBlock]) -> PhysicalTokenBlock: + if self.eviction_policy == EvictionPolicy.LRU: + assert (len(self.free_blocks)) + # Find the block in the main hash table + block = self.free_blocks.pop() + key = list(table.keys())[list(table.values()).index()] + del table[key] + return block + else: + raise ValueError( + f"Unknown cache eviction policy: {self.eviction_policy}") def return_block(self, block: PhysicalTokenBlock) -> None: self.free_blocks.append(block) + class BlockAllocator: """Manages free physical token blocks for a device. @@ -50,13 +48,11 @@ class BlockAllocator: the reference count becomes zero, the block is added back to the free list. """ - def __init__( - self, - device: Device, - block_size: int, - num_blocks: int, - eviction_policy: EvictionPolicy = EvictionPolicy.LRU - ) -> None: + def __init__(self, + device: Device, + block_size: int, + num_blocks: int, + eviction_policy: EvictionPolicy = EvictionPolicy.LRU) -> None: self.device = device self.block_size = block_size self.num_blocks = num_blocks From 76b5290e6a3e32c17f8a0610605aee31ed45583c Mon Sep 17 00:00:00 2001 From: Sage Moore Date: Mon, 5 Feb 2024 10:32:54 -0500 Subject: [PATCH 04/79] shore up some of the eviction logic --- vllm/block.py | 3 +++ vllm/core/block_manager.py | 48 ++++++++++++++++++++++---------------- 2 files changed, 31 insertions(+), 20 deletions(-) diff --git a/vllm/block.py b/vllm/block.py index 5fe39ed47b2ff..d57173eb8a80b 100644 --- a/vllm/block.py +++ b/vllm/block.py @@ -55,6 +55,7 @@ def __init__( device: Device, block_number: int, block_size: int, + block_hash: int, ) -> None: self.device = device self.block_number = block_number @@ -62,6 +63,8 @@ def __init__( self.ref_count = 0 + self.block_hash = block_hash + def __repr__(self) -> str: return (f'PhysicalTokenBlock(device={self.device}, ' f'block_number={self.block_number}, ' diff --git a/vllm/core/block_manager.py b/vllm/core/block_manager.py index c182f702fa20b..b41ee6c9dc60d 100644 --- a/vllm/core/block_manager.py +++ b/vllm/core/block_manager.py @@ -23,21 +23,26 @@ def __init__(self, # Initialize the free blocks. 
self.free_blocks: Deque[PhysicalTokenBlock] = deque() - def evict(self, table: Dict[int, - PhysicalTokenBlock]) -> PhysicalTokenBlock: - if self.eviction_policy == EvictionPolicy.LRU: - assert (len(self.free_blocks)) - # Find the block in the main hash table - block = self.free_blocks.pop() - key = list(table.keys())[list(table.values()).index()] - del table[key] - return block - else: - raise ValueError( - f"Unknown cache eviction policy: {self.eviction_policy}") + def evict( + self, + table: Dict[int, PhysicalTokenBlock] + ) -> PhysicalTokenBlock: + if self.eviction_policy == EvictionPolicy.LRU: + assert (len(self.free_blocks)) + # Find the block in the main hash table + block = self.free_blocks.pop() + + # Continue poping blocks until we find one with a ref_count of 0 + while block.ref_count != 0: + block = self.free_blocks.pop() + + del table[block.block_hash] + return block + else: + raise ValueError(f"Unknown cache eviction policy: {self.eviction_policy}") def return_block(self, block: PhysicalTokenBlock) -> None: - self.free_blocks.append(block) + self.free_blocks.appendleft(block) class BlockAllocator: @@ -65,19 +70,22 @@ def __init__(self, def evict(self) -> PhysicalTokenBlock: return self.evictor.evict(self.table) - def allocate_block(self) -> PhysicalTokenBlock: + def allocate_block(self, block_hash: int) -> PhysicalTokenBlock: if self.current_num_blocks == self.num_blocks: - return self.evict() + block = self.evict() + block.block_hash = block_hash + return block block = PhysicalTokenBlock(device=self.device, block_number=self.current_num_blocks, - block_size=self.block_size) + block_size=self.block_size, + block_hash = block_hash) self.current_num_blocks += 1 return block - def allocate(self, i: int) -> PhysicalTokenBlock: - if i not in self.table: - self.table[i] = self.allocate_block() - block = self.table[i] + def allocate(self, block_hash: int) -> PhysicalTokenBlock: + if block_hash not in self.table: + self.table[block_hash] = self.allocate_block(block_hash) + block = self.table[block_hash] block.ref_count += 1 # print(f"REFCOUNT ON ALLOCTION: {block}") return block From fb9132bb0b6d1f315ad9862b1c534a8491ccd0ab Mon Sep 17 00:00:00 2001 From: Sage Moore Date: Mon, 5 Feb 2024 10:46:41 -0500 Subject: [PATCH 05/79] autoformat --- vllm/core/block_manager.py | 33 ++++++++++++++++----------------- 1 file changed, 16 insertions(+), 17 deletions(-) diff --git a/vllm/core/block_manager.py b/vllm/core/block_manager.py index b41ee6c9dc60d..11053a9998872 100644 --- a/vllm/core/block_manager.py +++ b/vllm/core/block_manager.py @@ -23,23 +23,22 @@ def __init__(self, # Initialize the free blocks. 
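With this commit the free deque starts behaving like an LRU list: freed blocks are pushed onto the left with appendleft() while evict() keeps taking from the right with pop(), so the block freed longest ago becomes the eviction candidate, and blocks whose ref_count has gone back up are popped past. A two-line illustration of that ordering, separate from the patch itself:

from collections import deque

free_blocks = deque()
free_blocks.appendleft("freed first")
free_blocks.appendleft("freed second")
assert free_blocks.pop() == "freed first"  # oldest freed block comes out first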
self.free_blocks: Deque[PhysicalTokenBlock] = deque() - def evict( - self, - table: Dict[int, PhysicalTokenBlock] - ) -> PhysicalTokenBlock: - if self.eviction_policy == EvictionPolicy.LRU: - assert (len(self.free_blocks)) - # Find the block in the main hash table + def evict(self, table: Dict[int, + PhysicalTokenBlock]) -> PhysicalTokenBlock: + if self.eviction_policy == EvictionPolicy.LRU: + assert (len(self.free_blocks)) + # Find the block in the main hash table + block = self.free_blocks.pop() + + # Continue poping blocks until we find one with a ref_count of 0 + while block.ref_count != 0: block = self.free_blocks.pop() - # Continue poping blocks until we find one with a ref_count of 0 - while block.ref_count != 0: - block = self.free_blocks.pop() - - del table[block.block_hash] - return block - else: - raise ValueError(f"Unknown cache eviction policy: {self.eviction_policy}") + del table[block.block_hash] + return block + else: + raise ValueError( + f"Unknown cache eviction policy: {self.eviction_policy}") def return_block(self, block: PhysicalTokenBlock) -> None: self.free_blocks.appendleft(block) @@ -77,8 +76,8 @@ def allocate_block(self, block_hash: int) -> PhysicalTokenBlock: return block block = PhysicalTokenBlock(device=self.device, block_number=self.current_num_blocks, - block_size=self.block_size, - block_hash = block_hash) + block_size=self.block_size, + block_hash=block_hash) self.current_num_blocks += 1 return block From c84bbdaa3dfc3574a1ec207f34a70537c6383b02 Mon Sep 17 00:00:00 2001 From: ElizaWszola Date: Tue, 6 Feb 2024 02:42:51 -0500 Subject: [PATCH 06/79] Test block hashing --- tests/test_cache_block_hashing.py | 80 +++++++++++++++++++++++++++++++ 1 file changed, 80 insertions(+) create mode 100644 tests/test_cache_block_hashing.py diff --git a/tests/test_cache_block_hashing.py b/tests/test_cache_block_hashing.py new file mode 100644 index 0000000000000..b36db3ce506ac --- /dev/null +++ b/tests/test_cache_block_hashing.py @@ -0,0 +1,80 @@ +"""Test hashing of cache blocks. + +Run `pytest tests/test_cache_block_hashing.py`. +""" +import pytest + +from vllm.transformers_utils.tokenizer import TokenizerGroup +from vllm.sequence import Sequence + +# Make two prefixes with different first blocks. +prefix_start = [("You are an expert"), ("You are a")] +prefix_common = ( + " school principal, skilled in effectively managing " + "faculty and staff. Draft 10-15 questions for a potential first grade " + "Head Teacher for my K-12, all-girls', independent school that emphasizes " + "community, joyful discovery, and life-long learning. The candidate is " + "coming in for a first-round panel interview for a 8th grade Math " + "teaching role. They have 5 years of previous teaching experience " + "as an assistant teacher at a co-ed, public school with experience " + "in middle school math teaching. Based on this, fulfill " + "the following: ") +prefixes = [start + prefix_common for start in prefix_start] + +# Sample prompts. +sample_prompts = [ + "Hello, my name is", + "The president of the United States is", + "The capital of France is", + "The future of AI is" +] + +# Helper function. 
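Before the helper and the test body below, it is worth spelling out the contract the assertions rely on: for a full block, Sequence.hash(i) hashes all tokens up to the end of block i, so prompts that share a prefix share hashes for the shared blocks and diverge from the first differing block onwards. A toy illustration with made-up token ids rather than real tokenizer output:

tokens_a = [1, 2, 3, 4, 5, 6, 7, 8]  # prompt A
tokens_b = [1, 2, 3, 4, 9, 9, 9, 9]  # prompt B, same first block only

def toy_block_hash(tokens, logical_idx, block_size=4):
    # Hash everything up to the end of block `logical_idx`, mirroring what
    # Sequence.hash does for full blocks.
    return hash(tuple(tokens[:(logical_idx + 1) * block_size]))

assert toy_block_hash(tokens_a, 0) == toy_block_hash(tokens_b, 0)  # shared prefix
assert toy_block_hash(tokens_a, 1) != toy_block_hash(tokens_b, 1)  # diverged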
+def flatten_2d(l): + return [lss for ls in l for lss in ls] + +@pytest.mark.parametrize("model", ["facebook/opt-125m"]) +@pytest.mark.parametrize("block_size", [16]) +@pytest.mark.parametrize("max_num_seqs", [256]) +def test_auto_prefix_caching( + model: str, + block_size: int, + max_num_seqs: int +): + + tokenizer = TokenizerGroup( + tokenizer_id="facebook/opt-125m", + enable_lora=False, + max_num_seqs=max_num_seqs, + max_input_length=None, + ) + + hashes = [] + + for prefix in prefixes: + hashes.append([]) + prompts = [prefix + prompt for prompt in sample_prompts] + seq_id = 0 + for prompt in prompts: + hashes[-1].append([]) + prompt_token_ids = tokenizer.encode(prompt) + seq = Sequence(seq_id, prompt, prompt_token_ids, block_size) + + num_blocks = len(prompt_token_ids) // block_size + for idx in range(num_blocks): + hashes[-1][-1].append(seq.hash(idx)) + + seq_id += 1 + + # Check that hashes made with two prefixes with different first blocks are + # different everywhere. + for hash0, hash1 in zip(flatten_2d(hashes[0]), flatten_2d(hashes[1])): + assert (hash0 != hash1) + + # Check that hashes of different prompts made with the same prefix are the + # same until the hashes that contain the prompt. + for hash_pref in hashes: + same_hashes = [tuple(h[:-1]) for h in hash_pref] + different_hashes = [h[-1] for h in hash_pref] + assert(len(set(same_hashes)) == 1) + assert(len(set(different_hashes)) == len(different_hashes)) From be146c070bae51b508df2e4a537e63532069b004 Mon Sep 17 00:00:00 2001 From: ElizaWszola Date: Tue, 6 Feb 2024 02:47:37 -0500 Subject: [PATCH 07/79] Format --- tests/test_cache_block_hashing.py | 26 +++++++++++--------------- 1 file changed, 11 insertions(+), 15 deletions(-) diff --git a/tests/test_cache_block_hashing.py b/tests/test_cache_block_hashing.py index b36db3ce506ac..f4eb90378eb0b 100644 --- a/tests/test_cache_block_hashing.py +++ b/tests/test_cache_block_hashing.py @@ -23,24 +23,20 @@ # Sample prompts. sample_prompts = [ - "Hello, my name is", - "The president of the United States is", - "The capital of France is", - "The future of AI is" + "Hello, my name is", "The president of the United States is", + "The capital of France is", "The future of AI is" ] + # Helper function. 
-def flatten_2d(l): - return [lss for ls in l for lss in ls] +def flatten_2d(li): + return [lss for ls in li for lss in ls] + @pytest.mark.parametrize("model", ["facebook/opt-125m"]) @pytest.mark.parametrize("block_size", [16]) @pytest.mark.parametrize("max_num_seqs", [256]) -def test_auto_prefix_caching( - model: str, - block_size: int, - max_num_seqs: int -): +def test_auto_prefix_caching(model: str, block_size: int, max_num_seqs: int): tokenizer = TokenizerGroup( tokenizer_id="facebook/opt-125m", @@ -59,11 +55,11 @@ def test_auto_prefix_caching( hashes[-1].append([]) prompt_token_ids = tokenizer.encode(prompt) seq = Sequence(seq_id, prompt, prompt_token_ids, block_size) - + num_blocks = len(prompt_token_ids) // block_size for idx in range(num_blocks): hashes[-1][-1].append(seq.hash(idx)) - + seq_id += 1 # Check that hashes made with two prefixes with different first blocks are @@ -76,5 +72,5 @@ def test_auto_prefix_caching( for hash_pref in hashes: same_hashes = [tuple(h[:-1]) for h in hash_pref] different_hashes = [h[-1] for h in hash_pref] - assert(len(set(same_hashes)) == 1) - assert(len(set(different_hashes)) == len(different_hashes)) + assert (len(set(same_hashes)) == 1) + assert (len(set(different_hashes)) == len(different_hashes)) From 063d2fb9394823be4428094dfa80ff1df18b0826 Mon Sep 17 00:00:00 2001 From: Sage Moore Date: Tue, 6 Feb 2024 09:11:39 -0500 Subject: [PATCH 08/79] added block allocator tests --- tests/prefix_caching/test_prefix_caching.py | 75 +++++++++++++++++++++ 1 file changed, 75 insertions(+) diff --git a/tests/prefix_caching/test_prefix_caching.py b/tests/prefix_caching/test_prefix_caching.py index dded5e1b0f7a4..e98fb67fbe5d5 100644 --- a/tests/prefix_caching/test_prefix_caching.py +++ b/tests/prefix_caching/test_prefix_caching.py @@ -5,6 +5,8 @@ import pytest from vllm import LLM, SamplingParams +from vllm.core.block_manager import BlockAllocator +from vllm.utils import Device prefix = ( "You are an expert school principal, skilled in effectively managing " @@ -18,6 +20,14 @@ "the following paragraph: ") +def allocate_all_blocks(block_allocator, num_blocks): + blocks = [] + for i in range(num_blocks): + # use i as the block_hash + blocks.append(block_allocator.allocate(i)) + return blocks + + @pytest.mark.parametrize("model", ["facebook/opt-125m"]) @pytest.mark.parametrize("max_tokens", [16]) def test_prefix_caching( @@ -38,3 +48,68 @@ def test_prefix_caching( outputs_without_prefix, outputs_with_prefix): assert (output_without_prefix.outputs[0].token_ids == output_with_prefix.outputs[0].token_ids) + + +@pytest.mark.parametrize("block_size", [16]) +@pytest.mark.parametrize("num_blocks", [16]) +def test_block_allocator( + block_size: int, + num_blocks: int, +): + block_hash = 1 + block_allocator = BlockAllocator(Device.CPU, block_size, num_blocks) + + # Allocate two PysicalTokenBlocks with the same hash and check that they are the same PhysicalTokenBlock + first_block = block_allocator.allocate(block_hash) + second_block = block_allocator.allocate(block_hash) + assert (first_block == second_block) + assert (second_block.ref_count == 2) + + # Free the first_block and confirm that the ref_count is correctly decremented on the second block + block_allocator.free(first_block) + assert (second_block.ref_count == 1) + + # Free the second block and confirm that the block ends up on the free list + block_allocator.free(second_block) + assert (len(block_allocator.evictor.free_blocks) == 1) + free_block = block_allocator.evictor.free_blocks[0] + assert (free_block == 
second_block) + + # Reallocate the first block and confirm that, even after the block had its ref_count go to 0, we still get the same block back + first_block = block_allocator.allocate(block_hash) + assert (first_block == second_block) + assert (first_block.block_hash == block_hash) + + +@pytest.mark.parametrize("num_blocks", [16]) +def test_eviction(num_blocks: int, ): + block_size = 16 + block_allocator = BlockAllocator(Device.CPU, block_size, num_blocks) + blocks = [] + + for i in range(num_blocks): + # use i as the block_hash + blocks.append(block_allocator.allocate(i)) + + #Free all blocks + for block in blocks: + block_allocator.free(block) + + # Allocate a new block and confirm that it's the first block freed. I.E The Least Recently Used block + new_block_hash = block_size + new_block = block_allocator.allocate(new_block_hash) + assert (new_block == blocks[0]) + assert (new_block.block_hash == new_block_hash) + + # Reallocate the second in blocks to remove it from the free list + realloc_block_hash = 1 + realloc_block = block_allocator.allocate(realloc_block_hash) + assert (realloc_block == blocks[realloc_block_hash]) + assert (realloc_block.block_hash == realloc_block_hash) + + # Allocate a new block and confirm that it's not the realloc_block, since the realloc_block shouldn't be in the free list + new_block_hash = block_size + 1 + new_block = block_allocator.allocate(new_block_hash) + assert (realloc_block != new_block) + assert (new_block.block_hash == new_block_hash) + assert (new_block.block_number == 2) From 15099d2efa73ff5a0cf157c710c66c3ef01b1b0f Mon Sep 17 00:00:00 2001 From: Sage Moore Date: Wed, 7 Feb 2024 15:17:37 -0500 Subject: [PATCH 09/79] added timestamps to the PhysicalTokenBlock and updated the eviction logic --- tests/prefix_caching/test_prefix_caching.py | 2 +- vllm/block.py | 3 + vllm/core/block_manager.py | 86 ++++++++++++++------- vllm/core/scheduler.py | 40 ++++++---- 4 files changed, 85 insertions(+), 46 deletions(-) diff --git a/tests/prefix_caching/test_prefix_caching.py b/tests/prefix_caching/test_prefix_caching.py index e98fb67fbe5d5..798d9f1973df5 100644 --- a/tests/prefix_caching/test_prefix_caching.py +++ b/tests/prefix_caching/test_prefix_caching.py @@ -72,7 +72,7 @@ def test_block_allocator( # Free the second block and confirm that the block ends up on the free list block_allocator.free(second_block) assert (len(block_allocator.evictor.free_blocks) == 1) - free_block = block_allocator.evictor.free_blocks[0] + free_block = block_allocator.evictor.free_blocks[block_hash] assert (free_block == second_block) # Reallocate the first block and confirm that, even after the block had its ref_count go to 0, we still get the same block back diff --git a/vllm/block.py b/vllm/block.py index d57173eb8a80b..13a4d4bb067f5 100644 --- a/vllm/block.py +++ b/vllm/block.py @@ -65,6 +65,9 @@ def __init__( self.block_hash = block_hash + #TODO: is this a good default? 
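On the TODO above: in the hunks that follow, last_accessed is stamped with time.monotonic() at the moment a block's reference count drops to zero, and eviction then picks the unreferenced block with the oldest stamp, so a zero default simply marks a block that has never been freed. A rough standalone sketch of that selection rule, with illustrative classes rather than the real ones:

from dataclasses import dataclass

@dataclass
class ToyBlock:
    block_hash: int
    ref_count: int
    last_accessed: float

def pick_lru_victim(table):
    # Only blocks nobody references any more are evictable; among those,
    # evict the one that was freed the longest time ago.
    candidates = [b for b in table.values() if b.ref_count == 0]
    assert candidates, "no evictable block"
    return min(candidates, key=lambda b: b.last_accessed)

table = {
    1: ToyBlock(1, ref_count=0, last_accessed=10.0),
    2: ToyBlock(2, ref_count=1, last_accessed=1.0),   # still referenced
    3: ToyBlock(3, ref_count=0, last_accessed=5.0),
}
assert pick_lru_victim(table).block_hash == 3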
+ self.last_accessed = 0 + def __repr__(self) -> str: return (f'PhysicalTokenBlock(device={self.device}, ' f'block_number={self.block_number}, ' diff --git a/vllm/core/block_manager.py b/vllm/core/block_manager.py index 11053a9998872..552ddee4b0e35 100644 --- a/vllm/core/block_manager.py +++ b/vllm/core/block_manager.py @@ -1,7 +1,7 @@ """A block manager that manages token blocks.""" import enum -from collections import deque -from typing import Dict, List, Optional, Set, Tuple, Deque +from time import monotonic +from typing import Dict, List, Optional, Set, Tuple from vllm.block import BlockTable, PhysicalTokenBlock from vllm.sequence import Sequence, SequenceGroup, SequenceStatus @@ -21,27 +21,42 @@ def __init__(self, self.eviction_policy = eviction_policy # Initialize the free blocks. - self.free_blocks: Deque[PhysicalTokenBlock] = deque() + self.free_blocks: Dict[int, PhysicalTokenBlock] = {} def evict(self, table: Dict[int, PhysicalTokenBlock]) -> PhysicalTokenBlock: if self.eviction_policy == EvictionPolicy.LRU: - assert (len(self.free_blocks)) - # Find the block in the main hash table - block = self.free_blocks.pop() - - # Continue poping blocks until we find one with a ref_count of 0 - while block.ref_count != 0: - block = self.free_blocks.pop() - - del table[block.block_hash] - return block + all_blocks: List[PhysicalTokenBlock] = list( + self.free_blocks.values()) + assert (len(all_blocks) > 0) + + # Find lowest timestamp + lowest_timestamp = all_blocks[0].last_accessed + for block in all_blocks: + assert (block.last_accessed != 0) + if block.last_accessed < lowest_timestamp: + lowest_timestamp = block.last_accessed + + # Find all blocks with the lowest timestamp + eviction_candidates: List[PhysicalTokenBlock] = [] + for block in all_blocks: + if block.last_accessed == lowest_timestamp: + eviction_candidates.append(block) + + # Arbitrarily evict the first candidate + # TODO: Evict based on the number of prefix tokens in the block + assert (len(eviction_candidates) > 0) + evicted_block = eviction_candidates[0] + del table[evicted_block.block_hash] + del self.free_blocks[evicted_block.block_hash] + + return evicted_block else: raise ValueError( f"Unknown cache eviction policy: {self.eviction_policy}") def return_block(self, block: PhysicalTokenBlock) -> None: - self.free_blocks.appendleft(block) + self.free_blocks[block.block_hash] = block class BlockAllocator: @@ -85,16 +100,23 @@ def allocate(self, block_hash: int) -> PhysicalTokenBlock: if block_hash not in self.table: self.table[block_hash] = self.allocate_block(block_hash) block = self.table[block_hash] + if self.evictor.free_blocks.get(block_hash) is not None: + del self.evictor.free_blocks[block_hash] + block.ref_count += 1 # print(f"REFCOUNT ON ALLOCTION: {block}") return block - def free(self, block: PhysicalTokenBlock) -> None: - # print(f"FREEING: {block}") + def free(self, + block: PhysicalTokenBlock, + now: Optional[int] = None) -> None: if block.ref_count == 0: raise ValueError(f"Double free! 
{block} is already freed.") block.ref_count -= 1 if block.ref_count == 0: + if now is None: + now = monotonic() + block.last_accessed = now self.evictor.return_block(block) def get_num_free_blocks(self) -> int: @@ -198,7 +220,9 @@ def can_append_slot(self, seq_group: SequenceGroup) -> bool: num_seqs = seq_group.num_seqs(status=SequenceStatus.RUNNING) return num_seqs <= num_free_gpu_blocks - def append_slot(self, seq: Sequence) -> Optional[Tuple[int, int]]: + def append_slot(self, + seq: Sequence, + now: Optional[float] = None) -> Optional[Tuple[int, int]]: """Allocate a physical slot for a new token.""" logical_blocks = seq.logical_token_blocks block_table = self.block_tables[seq.seq_id] @@ -229,7 +253,7 @@ def append_slot(self, seq: Sequence) -> Optional[Tuple[int, int]]: new_block = self.gpu_allocator.allocate( seq.hash(len(logical_blocks) - 1)) block_table[-1] = new_block - self.gpu_allocator.free(last_block) + self.gpu_allocator.free(last_block, now) return last_block.block_number, new_block.block_number def fork(self, parent_seq: Sequence, child_seq: Sequence) -> None: @@ -261,7 +285,9 @@ def can_swap_in(self, seq_group: SequenceGroup) -> bool: num_required_blocks = len(blocks) + num_swapped_seqs return num_free_blocks - num_required_blocks >= self.watermark_blocks - def swap_in(self, seq_group: SequenceGroup) -> Dict[int, int]: + def swap_in(self, + seq_group: SequenceGroup, + now: Optional[float] = None) -> Dict[int, int]: # CPU block -> GPU block. if seq_group.prefix is not None: # make sure to swap in the prefix first @@ -286,7 +312,7 @@ def swap_in(self, seq_group: SequenceGroup) -> Dict[int, int]: mapping[cpu_block] = gpu_block new_block_table.append(gpu_block) # Free the CPU block swapped in to GPU. - self.cpu_allocator.free(cpu_block) + self.cpu_allocator.free(cpu_block, now) self.block_tables[seq.seq_id] = new_block_table block_number_mapping = { @@ -299,7 +325,9 @@ def can_swap_out(self, seq_group: SequenceGroup) -> bool: blocks = self._get_physical_blocks(seq_group) return len(blocks) <= self.cpu_allocator.get_num_free_blocks() - def swap_out(self, seq_group: SequenceGroup) -> Dict[int, int]: + def swap_out(self, + seq_group: SequenceGroup, + now: Optional[float] = None) -> Dict[int, int]: # GPU block -> CPU block. mapping: Dict[PhysicalTokenBlock, PhysicalTokenBlock] = {} for seq in seq_group.get_seqs(status=SequenceStatus.RUNNING): @@ -310,7 +338,7 @@ def swap_out(self, seq_group: SequenceGroup) -> Dict[int, int]: if (seq_group.prefix is not None and gpu_block in seq_group.prefix.block_table): # NOTE: We do not swap out the prefix blocks for now. - self.gpu_allocator.free(gpu_block) + self.gpu_allocator.free(gpu_block, now) continue if gpu_block in mapping: @@ -322,7 +350,7 @@ def swap_out(self, seq_group: SequenceGroup) -> Dict[int, int]: mapping[gpu_block] = cpu_block new_block_table.append(cpu_block) # Free the GPU block swapped out to CPU. 
- self.gpu_allocator.free(gpu_block) + self.gpu_allocator.free(gpu_block, now) self.block_tables[seq.seq_id] = new_block_table block_number_mapping = { @@ -331,19 +359,21 @@ def swap_out(self, seq_group: SequenceGroup) -> Dict[int, int]: } return block_number_mapping - def _free_block_table(self, block_table: BlockTable) -> None: + def _free_block_table(self, + block_table: BlockTable, + now: Optional[float] = None) -> None: for block in set(block_table): if block.device == Device.GPU: - self.gpu_allocator.free(block) + self.gpu_allocator.free(block, now) else: - self.cpu_allocator.free(block) + self.cpu_allocator.free(block, now) - def free(self, seq: Sequence) -> None: + def free(self, seq: Sequence, now: Optional[float] = None) -> None: if seq.seq_id not in self.block_tables: # Already freed or haven't been scheduled yet. return block_table = self.block_tables[seq.seq_id] - self._free_block_table(block_table) + self._free_block_table(block_table, now) del self.block_tables[seq.seq_id] def reset(self) -> None: diff --git a/vllm/core/scheduler.py b/vllm/core/scheduler.py index 213f9bb9cf30c..158fec4d8d123 100644 --- a/vllm/core/scheduler.py +++ b/vllm/core/scheduler.py @@ -127,6 +127,8 @@ def abort_seq_group(self, request_id: Union[str, Iterable[str]]) -> None: if isinstance(request_id, str): request_id = (request_id, ) request_ids = set(request_id) + + now = time.monotonic() for state_queue in [self.waiting, self.running, self.swapped]: aborted_groups: List[SequenceGroup] = [] for seq_group in state_queue: @@ -145,7 +147,7 @@ def abort_seq_group(self, request_id: Union[str, Iterable[str]]) -> None: if seq.is_finished(): continue seq.status = SequenceStatus.FINISHED_ABORTED - self.free_seq(seq) + self.free_seq(seq, now) def has_unfinished_seqs(self) -> bool: return self.waiting or self.running or self.swapped @@ -279,17 +281,18 @@ def _schedule(self) -> SchedulerOutputs: if self.running: # Preempt the lowest-priority sequence groups. victim_seq_group = self.running.pop() - self._preempt(victim_seq_group, blocks_to_swap_out) + self._preempt(victim_seq_group, blocks_to_swap_out, None, + now) preempted.append(victim_seq_group) else: # No other sequence groups can be preempted. # Preempt the current sequence group. - self._preempt(seq_group, blocks_to_swap_out) + self._preempt(seq_group, blocks_to_swap_out, None, now) preempted.append(seq_group) break else: # Append new slots to the sequence group. 
- self._append_slot(seq_group, blocks_to_copy) + self._append_slot(seq_group, blocks_to_copy, now) running.append(seq_group) self.running = running @@ -331,7 +334,7 @@ def _schedule(self) -> SchedulerOutputs: if lora_int_id > 0: curr_loras.add(lora_int_id) self.swapped.popleft() - self._swap_in(seq_group, blocks_to_swap_in) + self._swap_in(seq_group, blocks_to_swap_in, now) self._append_slot(seq_group, blocks_to_copy) num_curr_seqs += num_new_seqs self.running.append(seq_group) @@ -386,8 +389,8 @@ def schedule(self) -> Tuple[List[SequenceGroupMetadata], SchedulerOutputs]: def fork_seq(self, parent_seq: Sequence, child_seq: Sequence) -> None: self.block_manager.fork(parent_seq, child_seq) - def free_seq(self, seq: Sequence) -> None: - self.block_manager.free(seq) + def free_seq(self, seq: Sequence, now: Optional[float] = None) -> None: + self.block_manager.free(seq, now) def free_finished_seq_groups(self) -> None: self.running = deque(seq_group for seq_group in self.running @@ -402,9 +405,10 @@ def _append_slot( self, seq_group: SequenceGroup, blocks_to_copy: Dict[int, List[int]], + now: Optional[float] = None, ) -> None: for seq in seq_group.get_seqs(status=SequenceStatus.RUNNING): - ret = self.block_manager.append_slot(seq) + ret = self.block_manager.append_slot(seq, now) if ret is not None: src_block, dst_block = ret if src_block in blocks_to_copy: @@ -417,6 +421,7 @@ def _preempt( seq_group: SequenceGroup, blocks_to_swap_out: Dict[int, int], preemption_mode: Optional[PreemptionMode] = None, + now: Optional[float] = None, ) -> None: # If preemption mode is not specified, we determine the mode as follows: # We use recomputation by default since it incurs lower overhead than @@ -435,7 +440,7 @@ def _preempt( else: preemption_mode = PreemptionMode.SWAP if preemption_mode == PreemptionMode.RECOMPUTE: - self._preempt_by_recompute(seq_group) + self._preempt_by_recompute(seq_group, now) elif preemption_mode == PreemptionMode.SWAP: self._preempt_by_swap(seq_group, blocks_to_swap_out) else: @@ -444,12 +449,13 @@ def _preempt( def _preempt_by_recompute( self, seq_group: SequenceGroup, + now: Optional[float] = None, ) -> None: seqs = seq_group.get_seqs(status=SequenceStatus.RUNNING) assert len(seqs) == 1 for seq in seqs: seq.status = SequenceStatus.WAITING - self.block_manager.free(seq) + self.block_manager.free(seq, now) # NOTE: For FCFS, we insert the preempted sequence group to the front # of the waiting queue. self.waiting.appendleft(seq_group) @@ -466,24 +472,24 @@ def _swap_in( self, seq_group: SequenceGroup, blocks_to_swap_in: Dict[int, int], + now: Optional[float] = None, ) -> None: - mapping = self.block_manager.swap_in(seq_group) + mapping = self.block_manager.swap_in(seq_group, now) blocks_to_swap_in.update(mapping) for seq in seq_group.get_seqs(status=SequenceStatus.SWAPPED): seq.status = SequenceStatus.RUNNING - def _swap_out( - self, - seq_group: SequenceGroup, - blocks_to_swap_out: Dict[int, int], - ) -> None: + def _swap_out(self, + seq_group: SequenceGroup, + blocks_to_swap_out: Dict[int, int], + now: Optional[float] = None) -> None: if not self.block_manager.can_swap_out(seq_group): # FIXME(woosuk): Abort the sequence group instead of aborting the # entire engine. raise RuntimeError( "Aborted due to the lack of CPU swap space. 
Please increase " "the swap space to avoid this error.") - mapping = self.block_manager.swap_out(seq_group) + mapping = self.block_manager.swap_out(seq_group, now) blocks_to_swap_out.update(mapping) for seq in seq_group.get_seqs(status=SequenceStatus.RUNNING): seq.status = SequenceStatus.SWAPPED From 9411e06088ed30fa70025b2a0d11835e436288f9 Mon Sep 17 00:00:00 2001 From: Sage Moore Date: Wed, 7 Feb 2024 17:30:55 -0500 Subject: [PATCH 10/79] Delete the free hash table from the evictor class --- tests/prefix_caching/test_prefix_caching.py | 5 +--- vllm/core/block_manager.py | 28 ++++++--------------- vllm/sequence.py | 1 + 3 files changed, 9 insertions(+), 25 deletions(-) diff --git a/tests/prefix_caching/test_prefix_caching.py b/tests/prefix_caching/test_prefix_caching.py index 798d9f1973df5..9fe77b57f3fb9 100644 --- a/tests/prefix_caching/test_prefix_caching.py +++ b/tests/prefix_caching/test_prefix_caching.py @@ -69,11 +69,8 @@ def test_block_allocator( block_allocator.free(first_block) assert (second_block.ref_count == 1) - # Free the second block and confirm that the block ends up on the free list + # Free the second block block_allocator.free(second_block) - assert (len(block_allocator.evictor.free_blocks) == 1) - free_block = block_allocator.evictor.free_blocks[block_hash] - assert (free_block == second_block) # Reallocate the first block and confirm that, even after the block had its ref_count go to 0, we still get the same block back first_block = block_allocator.allocate(block_hash) diff --git a/vllm/core/block_manager.py b/vllm/core/block_manager.py index 552ddee4b0e35..80f6ebc4e4b28 100644 --- a/vllm/core/block_manager.py +++ b/vllm/core/block_manager.py @@ -20,27 +20,23 @@ def __init__(self, eviction_policy: EvictionPolicy = EvictionPolicy.LRU) -> None: self.eviction_policy = eviction_policy - # Initialize the free blocks. - self.free_blocks: Dict[int, PhysicalTokenBlock] = {} - def evict(self, table: Dict[int, PhysicalTokenBlock]) -> PhysicalTokenBlock: if self.eviction_policy == EvictionPolicy.LRU: all_blocks: List[PhysicalTokenBlock] = list( - self.free_blocks.values()) + table.values()) assert (len(all_blocks) > 0) # Find lowest timestamp - lowest_timestamp = all_blocks[0].last_accessed + lowest_timestamp = monotonic() for block in all_blocks: - assert (block.last_accessed != 0) - if block.last_accessed < lowest_timestamp: + if block.ref_count == 0 and block.last_accessed < lowest_timestamp: lowest_timestamp = block.last_accessed # Find all blocks with the lowest timestamp eviction_candidates: List[PhysicalTokenBlock] = [] for block in all_blocks: - if block.last_accessed == lowest_timestamp: + if block.ref_count == 0 and block.last_accessed == lowest_timestamp: eviction_candidates.append(block) # Arbitrarily evict the first candidate @@ -48,17 +44,12 @@ def evict(self, table: Dict[int, assert (len(eviction_candidates) > 0) evicted_block = eviction_candidates[0] del table[evicted_block.block_hash] - del self.free_blocks[evicted_block.block_hash] return evicted_block else: raise ValueError( f"Unknown cache eviction policy: {self.eviction_policy}") - def return_block(self, block: PhysicalTokenBlock) -> None: - self.free_blocks[block.block_hash] = block - - class BlockAllocator: """Manages free physical token blocks for a device. 
@@ -100,9 +91,6 @@ def allocate(self, block_hash: int) -> PhysicalTokenBlock: if block_hash not in self.table: self.table[block_hash] = self.allocate_block(block_hash) block = self.table[block_hash] - if self.evictor.free_blocks.get(block_hash) is not None: - del self.evictor.free_blocks[block_hash] - block.ref_count += 1 # print(f"REFCOUNT ON ALLOCTION: {block}") return block @@ -113,11 +101,9 @@ def free(self, if block.ref_count == 0: raise ValueError(f"Double free! {block} is already freed.") block.ref_count -= 1 - if block.ref_count == 0: - if now is None: - now = monotonic() - block.last_accessed = now - self.evictor.return_block(block) + if now is None: + now = monotonic() + block.last_accessed = now def get_num_free_blocks(self) -> int: return self.num_blocks - self.current_num_blocks diff --git a/vllm/sequence.py b/vllm/sequence.py index 2134e4f872c67..b3fcd0303c4f1 100644 --- a/vllm/sequence.py +++ b/vllm/sequence.py @@ -141,6 +141,7 @@ def lora_int_id(self) -> int: return self.lora_request.lora_int_id if self.lora_request else 0 def hash(self, logical_idx: int) -> int: + # Compute the number of tokens in the sequence num_tokens = (logical_idx * self.block_size) + ( self.block_size - self.logical_token_blocks[logical_idx].get_num_empty_slots()) From 359b82901f04c3137175fa3ba66b6fcb102f182c Mon Sep 17 00:00:00 2001 From: Sage Moore Date: Wed, 7 Feb 2024 17:36:00 -0500 Subject: [PATCH 11/79] Remove the evictor class in favor of eviction free functions --- vllm/core/block_manager.py | 64 +++++++++++++++++--------------------- 1 file changed, 28 insertions(+), 36 deletions(-) diff --git a/vllm/core/block_manager.py b/vllm/core/block_manager.py index 80f6ebc4e4b28..62bded080ae4a 100644 --- a/vllm/core/block_manager.py +++ b/vllm/core/block_manager.py @@ -13,42 +13,30 @@ class EvictionPolicy(enum.Enum): LRU = enum.auto() -class Evictor: - """Evicts physical blocks from cache based on eviction policy.""" +def lru_eviction(table: Dict[int, PhysicalTokenBlock]) -> PhysicalTokenBlock: + all_blocks: List[PhysicalTokenBlock] = list(table.values()) + assert (len(all_blocks) > 0) - def __init__(self, - eviction_policy: EvictionPolicy = EvictionPolicy.LRU) -> None: - self.eviction_policy = eviction_policy + # Find lowest timestamp + lowest_timestamp = monotonic() + for block in all_blocks: + if block.ref_count == 0 and block.last_accessed < lowest_timestamp: + lowest_timestamp = block.last_accessed + + # Find all blocks with the lowest timestamp + eviction_candidates: List[PhysicalTokenBlock] = [] + for block in all_blocks: + if block.ref_count == 0 and block.last_accessed == lowest_timestamp: + eviction_candidates.append(block) + + # Arbitrarily evict the first candidate + # TODO: Evict based on the number of prefix tokens in the block + assert (len(eviction_candidates) > 0) + evicted_block = eviction_candidates[0] + del table[evicted_block.block_hash] + + return evicted_block - def evict(self, table: Dict[int, - PhysicalTokenBlock]) -> PhysicalTokenBlock: - if self.eviction_policy == EvictionPolicy.LRU: - all_blocks: List[PhysicalTokenBlock] = list( - table.values()) - assert (len(all_blocks) > 0) - - # Find lowest timestamp - lowest_timestamp = monotonic() - for block in all_blocks: - if block.ref_count == 0 and block.last_accessed < lowest_timestamp: - lowest_timestamp = block.last_accessed - - # Find all blocks with the lowest timestamp - eviction_candidates: List[PhysicalTokenBlock] = [] - for block in all_blocks: - if block.ref_count == 0 and block.last_accessed == lowest_timestamp: - 
eviction_candidates.append(block) - - # Arbitrarily evict the first candidate - # TODO: Evict based on the number of prefix tokens in the block - assert (len(eviction_candidates) > 0) - evicted_block = eviction_candidates[0] - del table[evicted_block.block_hash] - - return evicted_block - else: - raise ValueError( - f"Unknown cache eviction policy: {self.eviction_policy}") class BlockAllocator: """Manages free physical token blocks for a device. @@ -67,13 +55,17 @@ def __init__(self, self.block_size = block_size self.num_blocks = num_blocks - self.evictor = Evictor(eviction_policy) + self.eviction_policy = eviction_policy self.current_num_blocks = 0 self.table: Dict[int, PhysicalTokenBlock] = {} def evict(self) -> PhysicalTokenBlock: - return self.evictor.evict(self.table) + if self.eviction_policy == EvictionPolicy.LRU: + return lru_eviction(self.table) + else: + raise ValueError( + f"Unknown cache eviction policy: {self.eviction_policy}") def allocate_block(self, block_hash: int) -> PhysicalTokenBlock: if self.current_num_blocks == self.num_blocks: From c9b0be6fcd20bea452cc2e136beb8fa1d35d9c05 Mon Sep 17 00:00:00 2001 From: ElizaWszola Date: Thu, 8 Feb 2024 03:12:18 -0500 Subject: [PATCH 12/79] debugging in progress --- tests/test_cache_block_hashing.py | 4 ++++ vllm/core/block_manager.py | 11 +++++++++++ vllm/model_executor/weight_utils.py | 2 +- vllm/prefix.py | 1 + vllm/sequence.py | 3 +++ vllm/worker/model_runner.py | 4 ++++ 6 files changed, 24 insertions(+), 1 deletion(-) diff --git a/tests/test_cache_block_hashing.py b/tests/test_cache_block_hashing.py index f4eb90378eb0b..ea8559508b481 100644 --- a/tests/test_cache_block_hashing.py +++ b/tests/test_cache_block_hashing.py @@ -60,6 +60,10 @@ def test_auto_prefix_caching(model: str, block_size: int, max_num_seqs: int): for idx in range(num_blocks): hashes[-1][-1].append(seq.hash(idx)) + # Check that we can't hash incomplete blocks + with pytest.raises(ValueError) as e: + _ = seq.hash(num_blocks + 1) + seq_id += 1 # Check that hashes made with two prefixes with different first blocks are diff --git a/vllm/core/block_manager.py b/vllm/core/block_manager.py index 11053a9998872..943fa58f637d5 100644 --- a/vllm/core/block_manager.py +++ b/vllm/core/block_manager.py @@ -173,6 +173,8 @@ def allocate(self, seq_group: SequenceGroup) -> None: seq = seq_group.get_seqs(status=SequenceStatus.WAITING)[0] # Allocate new physical token blocks that will store the prompt tokens. + # num_prompt_blocks = seq.get_prompt_len() // self.block_size + num_prompt_blocks = len(seq.logical_token_blocks) block_table: BlockTable = [] @@ -182,11 +184,16 @@ def allocate(self, seq_group: SequenceGroup) -> None: and logical_idx >= self.block_sliding_window): block = block_table[logical_idx % self.block_sliding_window] else: + print(f"hash allocate {logical_idx}") block = self.gpu_allocator.allocate(seq.hash(logical_idx)) # Set the reference counts of the token blocks. # block.ref_count = seq_group.num_seqs() block_table.append(block) + # Append incomplete block to seq if any + # if num_prompt_blocks * self.block_size < seq.get_prompt_len(): + # # TODO + # Assign the block table for each sequence. for seq in seq_group.get_seqs(status=SequenceStatus.WAITING): self.block_tables[seq.seq_id] = block_table.copy() @@ -212,6 +219,7 @@ def append_slot(self, seq: Sequence) -> Optional[Tuple[int, int]]: else: # The sequence has a new logical block. # Allocate a new physical block. 
+ print("hash append_slot 1") block = self.gpu_allocator.allocate( seq.hash(len(logical_blocks) - 1)) block_table.append(block) @@ -226,6 +234,7 @@ def append_slot(self, seq: Sequence) -> Optional[Tuple[int, int]]: else: # The last block is shared with other sequences. # Copy on Write: Allocate a new block and copy the tokens. + print("hash append_slot 2") new_block = self.gpu_allocator.allocate( seq.hash(len(logical_blocks) - 1)) block_table[-1] = new_block @@ -281,6 +290,7 @@ def swap_in(self, seq_group: SequenceGroup) -> Dict[int, int]: gpu_block = mapping[cpu_block] gpu_block.ref_count += 1 else: + print("hash swap_in 1") gpu_block = self.gpu_allocator.allocate( seq.hash(len(seq.logical_blocks) - 1)) mapping[cpu_block] = gpu_block @@ -317,6 +327,7 @@ def swap_out(self, seq_group: SequenceGroup) -> Dict[int, int]: cpu_block = mapping[gpu_block] cpu_block.ref_count += 1 else: + print("hash swap_in 2") cpu_block = self.cpu_allocator.allocate( seq.hash(len(seq.logical_blocks) - 1)) mapping[gpu_block] = cpu_block diff --git a/vllm/model_executor/weight_utils.py b/vllm/model_executor/weight_utils.py index 3570366887e78..a00062b8ddd1d 100644 --- a/vllm/model_executor/weight_utils.py +++ b/vllm/model_executor/weight_utils.py @@ -28,7 +28,7 @@ def __init__(self, *args, **kwargs): def get_lock(model_name_or_path: str, cache_dir: Optional[str] = None): - lock_dir = cache_dir if cache_dir is not None else "/tmp" + lock_dir = cache_dir if cache_dir is not None else "~/vllm_cache" lock_file_name = model_name_or_path.replace("/", "-") + ".lock" lock = filelock.FileLock(os.path.join(lock_dir, lock_file_name)) return lock diff --git a/vllm/prefix.py b/vllm/prefix.py index 5b6e8e4b92be6..4b780161a5278 100644 --- a/vllm/prefix.py +++ b/vllm/prefix.py @@ -74,6 +74,7 @@ def _truncate_token_ids(self, token_ids: Sequence[int]) -> Tuple[int]: new_length = len(token_ids) // self.block_size * self.block_size return tuple(token_ids[:new_length]) + # TODO clean this up? 
It's not used anywhere now def add_or_get_prefix(self, token_ids: Sequence[int], lora_int_id: int) -> Optional[Prefix]: token_ids = self._truncate_token_ids(token_ids) diff --git a/vllm/sequence.py b/vllm/sequence.py index 2134e4f872c67..ce092900483c1 100644 --- a/vllm/sequence.py +++ b/vllm/sequence.py @@ -144,6 +144,9 @@ def hash(self, logical_idx: int) -> int: num_tokens = (logical_idx * self.block_size) + ( self.block_size - self.logical_token_blocks[logical_idx].get_num_empty_slots()) + # num_tokens = logical_idx * self.block_size + self.block_size + # if num_tokens > len(self.data.get_token_ids()): + # raise ValueError(f"Can't hash incomplete block (block {logical_idx} needs hashing {num_tokens} tokens, but only {len(self.data.get_token_ids())} are present).") return hash(tuple(self.data.get_token_ids()[0:num_tokens])) def _append_logical_block(self) -> None: diff --git a/vllm/worker/model_runner.py b/vllm/worker/model_runner.py index 5908d577e1a28..907208d065c36 100644 --- a/vllm/worker/model_runner.py +++ b/vllm/worker/model_runner.py @@ -172,6 +172,8 @@ def _prepare_prompt( slot_mapping[-1].append(_PAD_SLOT_ID) continue + print(block_table) + print(f"prepare {i}") block_number = block_table[i // self.block_size] block_offset = i % self.block_size slot = block_number * self.block_size + block_offset @@ -200,6 +202,8 @@ def _prepare_prompt( context_lens_tensor = torch.tensor(context_lens, dtype=torch.int, device=self.device) + + print("prefix block tables:", prefix_block_tables) # Prepare prefix block tables max_prompt_block_table_len = max(len(t) for t in prefix_block_tables) block_tables = _make_tensor_with_pad( From 6218d1a7f2bbd691530708f86fd3708941f898cf Mon Sep 17 00:00:00 2001 From: ElizaWszola Date: Thu, 8 Feb 2024 09:46:16 -0500 Subject: [PATCH 13/79] partial block support --- tests/test_cache_block_hashing.py | 4 ---- vllm/core/block_manager.py | 35 +++++++++++++++++------------ vllm/model_executor/weight_utils.py | 2 +- vllm/sequence.py | 11 ++++----- vllm/worker/model_runner.py | 4 ---- 5 files changed, 26 insertions(+), 30 deletions(-) diff --git a/tests/test_cache_block_hashing.py b/tests/test_cache_block_hashing.py index ea8559508b481..f4eb90378eb0b 100644 --- a/tests/test_cache_block_hashing.py +++ b/tests/test_cache_block_hashing.py @@ -60,10 +60,6 @@ def test_auto_prefix_caching(model: str, block_size: int, max_num_seqs: int): for idx in range(num_blocks): hashes[-1][-1].append(seq.hash(idx)) - # Check that we can't hash incomplete blocks - with pytest.raises(ValueError) as e: - _ = seq.hash(num_blocks + 1) - seq_id += 1 # Check that hashes made with two prefixes with different first blocks are diff --git a/vllm/core/block_manager.py b/vllm/core/block_manager.py index d59fae743449f..04cd314c19de7 100644 --- a/vllm/core/block_manager.py +++ b/vllm/core/block_manager.py @@ -100,6 +100,12 @@ def free(self, def get_num_free_blocks(self) -> int: return self.num_blocks - self.current_num_blocks + def update_hash(self, block_hash: int, block: PhysicalTokenBlock) -> None: + old_hash = block.block_hash + del self.table[old_hash] + self.table[block_hash] = block + block.block_hash = block_hash + class AllocStatus(enum.Enum): """Result for BlockSpaceManager.can_allocate @@ -146,6 +152,7 @@ def __init__( num_cpu_blocks) # Mapping: seq_id -> BlockTable. 
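The partial_block_table introduced just below exists because the last block of a sequence cannot be shared by content hash until it is full: its hash would change with every appended token. While it is filling, the block is tracked per sequence in partial_block_table; once append_slot sees it become full, the block is re-keyed with the real content hash via update_hash. A toy illustration of why the hash only stabilizes at that point, using made-up token ids:

tokens = [7, 8, 9]                       # a block of size 4, one slot still empty
hash_while_partial = hash(tuple(tokens))
tokens.append(10)                        # the block becomes full
hash_when_full = hash(tuple(tokens))
assert hash_while_partial != hash_when_full  # identity only stable once full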
self.block_tables: Dict[int, BlockTable] = {} + self.partial_block_table: Dict[int, PhysicalTokenBlock] = {} def can_allocate(self, seq_group: SequenceGroup) -> AllocStatus: # FIXME(woosuk): Here we assume that all sequences in the group share @@ -173,7 +180,6 @@ def allocate(self, seq_group: SequenceGroup) -> None: seq = seq_group.get_seqs(status=SequenceStatus.WAITING)[0] # Allocate new physical token blocks that will store the prompt tokens. - # num_prompt_blocks = seq.get_prompt_len() // self.block_size num_prompt_blocks = len(seq.logical_token_blocks) @@ -184,16 +190,14 @@ def allocate(self, seq_group: SequenceGroup) -> None: and logical_idx >= self.block_sliding_window): block = block_table[logical_idx % self.block_sliding_window] else: - print(f"hash allocate {logical_idx}") block = self.gpu_allocator.allocate(seq.hash(logical_idx)) + if logical_idx * self.block_size + self.block_size > len( + seq.data.get_token_ids()): + self.partial_block_table[seq.seq_id] = block # Set the reference counts of the token blocks. # block.ref_count = seq_group.num_seqs() block_table.append(block) - # Append incomplete block to seq if any - # if num_prompt_blocks * self.block_size < seq.get_prompt_len(): - # # TODO - # Assign the block table for each sequence. for seq in seq_group.get_seqs(status=SequenceStatus.WAITING): self.block_tables[seq.seq_id] = block_table.copy() @@ -212,6 +216,7 @@ def append_slot(self, logical_blocks = seq.logical_token_blocks block_table = self.block_tables[seq.seq_id] + # If we need to allocate a new physical block if len(block_table) < len(logical_blocks): if (self.block_sliding_window and len(block_table) >= self.block_sliding_window): @@ -221,10 +226,10 @@ def append_slot(self, else: # The sequence has a new logical block. # Allocate a new physical block. - print("hash append_slot 1") - block = self.gpu_allocator.allocate( - seq.hash(len(logical_blocks) - 1)) - block_table.append(block) + assert (seq.seq_id not in self.partial_block_table) + self.partial_block_table[ + seq.seq_id] = self.gpu_allocator.allocate(seq.seq_id) + block_table.append(self.partial_block_table[seq.seq_id]) return None # We want to append the token to the last physical block. @@ -232,13 +237,17 @@ def append_slot(self, assert last_block.device == Device.GPU if last_block.ref_count == 1: # Not shared with other sequences. Appendable. - return None + if len(seq.data.get_token_ids()) % seq.block_size == 0: + del self.partial_block_table[seq.seq_id] + new_hash = seq.hash(len(logical_blocks) - 1) + self.gpu_allocator.update_hash(new_hash, last_block) + return None else: # The last block is shared with other sequences. # Copy on Write: Allocate a new block and copy the tokens. 
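# Minimal sketch of the copy-on-write decision made in append_slot above,
# using a tiny stand-in block type rather than the real PhysicalTokenBlock.
# If the last block is only referenced by this sequence it can be written in
# place; otherwise a fresh block is allocated and the caller is told to copy
# src -> dst before appending.
from dataclasses import dataclass
from typing import Callable, Optional, Tuple

@dataclass
class _Block:
    block_number: int
    ref_count: int = 1

def copy_on_write(last_block: _Block,
                  allocate: Callable[[], _Block]) -> Optional[Tuple[int, int]]:
    if last_block.ref_count == 1:
        return None                      # appendable in place, no copy needed
    new_block = allocate()               # hypothetical allocator callback
    last_block.ref_count -= 1            # this sequence releases the shared block
    return last_block.block_number, new_block.block_number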
- print("hash append_slot 2") new_block = self.gpu_allocator.allocate( seq.hash(len(logical_blocks) - 1)) + assert (new_block.ref_count == 1) block_table[-1] = new_block self.gpu_allocator.free(last_block, now) return last_block.block_number, new_block.block_number @@ -294,7 +303,6 @@ def swap_in(self, gpu_block = mapping[cpu_block] gpu_block.ref_count += 1 else: - print("hash swap_in 1") gpu_block = self.gpu_allocator.allocate( seq.hash(len(seq.logical_blocks) - 1)) mapping[cpu_block] = gpu_block @@ -333,7 +341,6 @@ def swap_out(self, cpu_block = mapping[gpu_block] cpu_block.ref_count += 1 else: - print("hash swap_in 2") cpu_block = self.cpu_allocator.allocate( seq.hash(len(seq.logical_blocks) - 1)) mapping[gpu_block] = cpu_block diff --git a/vllm/model_executor/weight_utils.py b/vllm/model_executor/weight_utils.py index a00062b8ddd1d..3570366887e78 100644 --- a/vllm/model_executor/weight_utils.py +++ b/vllm/model_executor/weight_utils.py @@ -28,7 +28,7 @@ def __init__(self, *args, **kwargs): def get_lock(model_name_or_path: str, cache_dir: Optional[str] = None): - lock_dir = cache_dir if cache_dir is not None else "~/vllm_cache" + lock_dir = cache_dir if cache_dir is not None else "/tmp" lock_file_name = model_name_or_path.replace("/", "-") + ".lock" lock = filelock.FileLock(os.path.join(lock_dir, lock_file_name)) return lock diff --git a/vllm/sequence.py b/vllm/sequence.py index 8e505e437105b..295c7e51b5a01 100644 --- a/vllm/sequence.py +++ b/vllm/sequence.py @@ -142,13 +142,10 @@ def lora_int_id(self) -> int: def hash(self, logical_idx: int) -> int: # Compute the number of tokens in the sequence - num_tokens = (logical_idx * self.block_size) + ( - self.block_size - - self.logical_token_blocks[logical_idx].get_num_empty_slots()) - # num_tokens = logical_idx * self.block_size + self.block_size - # if num_tokens > len(self.data.get_token_ids()): - # raise ValueError(f"Can't hash incomplete block (block {logical_idx} needs hashing {num_tokens} tokens, but only {len(self.data.get_token_ids())} are present).") - return hash(tuple(self.data.get_token_ids()[0:num_tokens])) + num_tokens = logical_idx * self.block_size + self.block_size + return hash( + tuple(self.data.get_token_ids() + [0:min(num_tokens, len(self.data.get_token_ids()))])) def _append_logical_block(self) -> None: block = LogicalTokenBlock( diff --git a/vllm/worker/model_runner.py b/vllm/worker/model_runner.py index 907208d065c36..5908d577e1a28 100644 --- a/vllm/worker/model_runner.py +++ b/vllm/worker/model_runner.py @@ -172,8 +172,6 @@ def _prepare_prompt( slot_mapping[-1].append(_PAD_SLOT_ID) continue - print(block_table) - print(f"prepare {i}") block_number = block_table[i // self.block_size] block_offset = i % self.block_size slot = block_number * self.block_size + block_offset @@ -202,8 +200,6 @@ def _prepare_prompt( context_lens_tensor = torch.tensor(context_lens, dtype=torch.int, device=self.device) - - print("prefix block tables:", prefix_block_tables) # Prepare prefix block tables max_prompt_block_table_len = max(len(t) for t in prefix_block_tables) block_tables = _make_tensor_with_pad( From b35819d3204541ed8ba5fbf14f253302f248f659 Mon Sep 17 00:00:00 2001 From: Sage Moore Date: Thu, 8 Feb 2024 16:08:50 -0500 Subject: [PATCH 14/79] Move PhysicalTokenBlock.last_accessed updates to the block_manager/scheduler --- vllm/core/block_manager.py | 45 ++++++++++++++------------------- vllm/core/scheduler.py | 52 ++++++++++++++------------------------ 2 files changed, 38 insertions(+), 59 deletions(-) diff --git 
a/vllm/core/block_manager.py b/vllm/core/block_manager.py index 04cd314c19de7..e9f94deca425d 100644 --- a/vllm/core/block_manager.py +++ b/vllm/core/block_manager.py @@ -87,15 +87,10 @@ def allocate(self, block_hash: int) -> PhysicalTokenBlock: # print(f"REFCOUNT ON ALLOCTION: {block}") return block - def free(self, - block: PhysicalTokenBlock, - now: Optional[int] = None) -> None: + def free(self, block: PhysicalTokenBlock) -> None: if block.ref_count == 0: raise ValueError(f"Double free! {block} is already freed.") block.ref_count -= 1 - if now is None: - now = monotonic() - block.last_accessed = now def get_num_free_blocks(self) -> int: return self.num_blocks - self.current_num_blocks @@ -209,9 +204,7 @@ def can_append_slot(self, seq_group: SequenceGroup) -> bool: num_seqs = seq_group.num_seqs(status=SequenceStatus.RUNNING) return num_seqs <= num_free_gpu_blocks - def append_slot(self, - seq: Sequence, - now: Optional[float] = None) -> Optional[Tuple[int, int]]: + def append_slot(self, seq: Sequence) -> Optional[Tuple[int, int]]: """Allocate a physical slot for a new token.""" logical_blocks = seq.logical_token_blocks block_table = self.block_tables[seq.seq_id] @@ -249,7 +242,7 @@ def append_slot(self, seq.hash(len(logical_blocks) - 1)) assert (new_block.ref_count == 1) block_table[-1] = new_block - self.gpu_allocator.free(last_block, now) + self.gpu_allocator.free(last_block) return last_block.block_number, new_block.block_number def fork(self, parent_seq: Sequence, child_seq: Sequence) -> None: @@ -281,9 +274,7 @@ def can_swap_in(self, seq_group: SequenceGroup) -> bool: num_required_blocks = len(blocks) + num_swapped_seqs return num_free_blocks - num_required_blocks >= self.watermark_blocks - def swap_in(self, - seq_group: SequenceGroup, - now: Optional[float] = None) -> Dict[int, int]: + def swap_in(self, seq_group: SequenceGroup) -> Dict[int, int]: # CPU block -> GPU block. if seq_group.prefix is not None: # make sure to swap in the prefix first @@ -308,7 +299,7 @@ def swap_in(self, mapping[cpu_block] = gpu_block new_block_table.append(gpu_block) # Free the CPU block swapped in to GPU. - self.cpu_allocator.free(cpu_block, now) + self.cpu_allocator.free(cpu_block) self.block_tables[seq.seq_id] = new_block_table block_number_mapping = { @@ -321,9 +312,7 @@ def can_swap_out(self, seq_group: SequenceGroup) -> bool: blocks = self._get_physical_blocks(seq_group) return len(blocks) <= self.cpu_allocator.get_num_free_blocks() - def swap_out(self, - seq_group: SequenceGroup, - now: Optional[float] = None) -> Dict[int, int]: + def swap_out(self, seq_group: SequenceGroup) -> Dict[int, int]: # GPU block -> CPU block. mapping: Dict[PhysicalTokenBlock, PhysicalTokenBlock] = {} for seq in seq_group.get_seqs(status=SequenceStatus.RUNNING): @@ -334,7 +323,7 @@ def swap_out(self, if (seq_group.prefix is not None and gpu_block in seq_group.prefix.block_table): # NOTE: We do not swap out the prefix blocks for now. - self.gpu_allocator.free(gpu_block, now) + self.gpu_allocator.free(gpu_block) continue if gpu_block in mapping: @@ -346,7 +335,7 @@ def swap_out(self, mapping[gpu_block] = cpu_block new_block_table.append(cpu_block) # Free the GPU block swapped out to CPU. 
- self.gpu_allocator.free(gpu_block, now) + self.gpu_allocator.free(gpu_block) self.block_tables[seq.seq_id] = new_block_table block_number_mapping = { @@ -355,21 +344,19 @@ def swap_out(self, } return block_number_mapping - def _free_block_table(self, - block_table: BlockTable, - now: Optional[float] = None) -> None: + def _free_block_table(self, block_table: BlockTable) -> None: for block in set(block_table): if block.device == Device.GPU: - self.gpu_allocator.free(block, now) + self.gpu_allocator.free(block) else: - self.cpu_allocator.free(block, now) + self.cpu_allocator.free(block) - def free(self, seq: Sequence, now: Optional[float] = None) -> None: + def free(self, seq: Sequence) -> None: if seq.seq_id not in self.block_tables: # Already freed or haven't been scheduled yet. return block_table = self.block_tables[seq.seq_id] - self._free_block_table(block_table, now) + self._free_block_table(block_table) del self.block_tables[seq.seq_id] def reset(self) -> None: @@ -386,3 +373,9 @@ def get_num_free_gpu_blocks(self) -> int: def get_num_free_cpu_blocks(self) -> int: return self.cpu_allocator.get_num_free_blocks() + + def access_all_blocks_in_seq(self, seq: Sequence, + access_time: float) -> None: + block_table = self.block_tables[seq.seq_id] + for block in block_table: + block.last_accessed = access_time diff --git a/vllm/core/scheduler.py b/vllm/core/scheduler.py index 158fec4d8d123..4e92634ea76c9 100644 --- a/vllm/core/scheduler.py +++ b/vllm/core/scheduler.py @@ -128,7 +128,6 @@ def abort_seq_group(self, request_id: Union[str, Iterable[str]]) -> None: request_id = (request_id, ) request_ids = set(request_id) - now = time.monotonic() for state_queue in [self.waiting, self.running, self.swapped]: aborted_groups: List[SequenceGroup] = [] for seq_group in state_queue: @@ -147,7 +146,7 @@ def abort_seq_group(self, request_id: Union[str, Iterable[str]]) -> None: if seq.is_finished(): continue seq.status = SequenceStatus.FINISHED_ABORTED - self.free_seq(seq, now) + self.free_seq(seq) def has_unfinished_seqs(self) -> bool: return self.waiting or self.running or self.swapped @@ -292,7 +291,7 @@ def _schedule(self) -> SchedulerOutputs: break else: # Append new slots to the sequence group. - self._append_slot(seq_group, blocks_to_copy, now) + self._append_slot(seq_group, blocks_to_copy) running.append(seq_group) self.running = running @@ -334,7 +333,7 @@ def _schedule(self) -> SchedulerOutputs: if lora_int_id > 0: curr_loras.add(lora_int_id) self.swapped.popleft() - self._swap_in(seq_group, blocks_to_swap_in, now) + self._swap_in(seq_group, blocks_to_swap_in) self._append_slot(seq_group, blocks_to_copy) num_curr_seqs += num_new_seqs self.running.append(seq_group) @@ -365,6 +364,7 @@ def schedule(self) -> Tuple[List[SequenceGroupMetadata], SchedulerOutputs]: # such as self.running, self.swapped, and self.waiting. scheduler_outputs = self._schedule() + now = time.monotonic() # Create input data structures. 
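# Sketch of how the access_all_blocks_in_seq() helper added above is meant to
# be used: when the scheduler builds a batch, every block of each scheduled
# sequence is stamped with the batch's monotonic timestamp, which the LRU
# eviction policy can compare later. The classes here are simplified
# stand-ins, not the real block manager.
import time
from dataclasses import dataclass, field
from typing import Dict, List

@dataclass
class _Block:
    last_accessed: float = 0.0

@dataclass
class _BlockManager:
    block_tables: Dict[int, List[_Block]] = field(default_factory=dict)

    def access_all_blocks_in_seq(self, seq_id: int, access_time: float) -> None:
        for block in self.block_tables[seq_id]:
            block.last_accessed = access_time

manager = _BlockManager(block_tables={0: [_Block(), _Block()]})
manager.access_all_blocks_in_seq(0, time.monotonic())
assert all(block.last_accessed > 0 for block in manager.block_tables[0])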
seq_group_metadata_list: List[SequenceGroupMetadata] = [] for seq_group in scheduler_outputs.scheduled_seq_groups: @@ -374,6 +374,7 @@ def schedule(self) -> Tuple[List[SequenceGroupMetadata], SchedulerOutputs]: seq_id = seq.seq_id seq_data[seq_id] = seq.data block_tables[seq_id] = self.block_manager.get_block_table(seq) + self.block_manager.access_all_blocks_in_seq(seq, now) seq_group_metadata = SequenceGroupMetadata( request_id=seq_group.request_id, @@ -389,8 +390,8 @@ def schedule(self) -> Tuple[List[SequenceGroupMetadata], SchedulerOutputs]: def fork_seq(self, parent_seq: Sequence, child_seq: Sequence) -> None: self.block_manager.fork(parent_seq, child_seq) - def free_seq(self, seq: Sequence, now: Optional[float] = None) -> None: - self.block_manager.free(seq, now) + def free_seq(self, seq: Sequence) -> None: + self.block_manager.free(seq) def free_finished_seq_groups(self) -> None: self.running = deque(seq_group for seq_group in self.running @@ -401,14 +402,10 @@ def _allocate(self, seq_group: SequenceGroup) -> None: for seq in seq_group.get_seqs(status=SequenceStatus.WAITING): seq.status = SequenceStatus.RUNNING - def _append_slot( - self, - seq_group: SequenceGroup, - blocks_to_copy: Dict[int, List[int]], - now: Optional[float] = None, - ) -> None: + def _append_slot(self, seq_group: SequenceGroup, + blocks_to_copy: Dict[int, List[int]]) -> None: for seq in seq_group.get_seqs(status=SequenceStatus.RUNNING): - ret = self.block_manager.append_slot(seq, now) + ret = self.block_manager.append_slot(seq) if ret is not None: src_block, dst_block = ret if src_block in blocks_to_copy: @@ -421,7 +418,6 @@ def _preempt( seq_group: SequenceGroup, blocks_to_swap_out: Dict[int, int], preemption_mode: Optional[PreemptionMode] = None, - now: Optional[float] = None, ) -> None: # If preemption mode is not specified, we determine the mode as follows: # We use recomputation by default since it incurs lower overhead than @@ -440,22 +436,18 @@ def _preempt( else: preemption_mode = PreemptionMode.SWAP if preemption_mode == PreemptionMode.RECOMPUTE: - self._preempt_by_recompute(seq_group, now) + self._preempt_by_recompute(seq_group) elif preemption_mode == PreemptionMode.SWAP: self._preempt_by_swap(seq_group, blocks_to_swap_out) else: raise AssertionError("Invalid preemption mode.") - def _preempt_by_recompute( - self, - seq_group: SequenceGroup, - now: Optional[float] = None, - ) -> None: + def _preempt_by_recompute(self, seq_group: SequenceGroup) -> None: seqs = seq_group.get_seqs(status=SequenceStatus.RUNNING) assert len(seqs) == 1 for seq in seqs: seq.status = SequenceStatus.WAITING - self.block_manager.free(seq, now) + self.block_manager.free(seq) # NOTE: For FCFS, we insert the preempted sequence group to the front # of the waiting queue. 
self.waiting.appendleft(seq_group) @@ -468,28 +460,22 @@ def _preempt_by_swap( self._swap_out(seq_group, blocks_to_swap_out) self.swapped.append(seq_group) - def _swap_in( - self, - seq_group: SequenceGroup, - blocks_to_swap_in: Dict[int, int], - now: Optional[float] = None, - ) -> None: - mapping = self.block_manager.swap_in(seq_group, now) + def _swap_in(self, seq_group: SequenceGroup, + blocks_to_swap_in: Dict[int, int]) -> None: + mapping = self.block_manager.swap_in(seq_group) blocks_to_swap_in.update(mapping) for seq in seq_group.get_seqs(status=SequenceStatus.SWAPPED): seq.status = SequenceStatus.RUNNING - def _swap_out(self, - seq_group: SequenceGroup, - blocks_to_swap_out: Dict[int, int], - now: Optional[float] = None) -> None: + def _swap_out(self, seq_group: SequenceGroup, + blocks_to_swap_out: Dict[int, int]) -> None: if not self.block_manager.can_swap_out(seq_group): # FIXME(woosuk): Abort the sequence group instead of aborting the # entire engine. raise RuntimeError( "Aborted due to the lack of CPU swap space. Please increase " "the swap space to avoid this error.") - mapping = self.block_manager.swap_out(seq_group, now) + mapping = self.block_manager.swap_out(seq_group) blocks_to_swap_out.update(mapping) for seq in seq_group.get_seqs(status=SequenceStatus.RUNNING): seq.status = SequenceStatus.SWAPPED From 38c1fc63c2f7dec06c363b52b0aa5609a18f184f Mon Sep 17 00:00:00 2001 From: Sage Moore Date: Thu, 8 Feb 2024 16:29:32 -0500 Subject: [PATCH 15/79] Remove overly aggressive assert --- vllm/core/block_manager.py | 1 - 1 file changed, 1 deletion(-) diff --git a/vllm/core/block_manager.py b/vllm/core/block_manager.py index e9f94deca425d..8e7375ebfad11 100644 --- a/vllm/core/block_manager.py +++ b/vllm/core/block_manager.py @@ -240,7 +240,6 @@ def append_slot(self, seq: Sequence) -> Optional[Tuple[int, int]]: # Copy on Write: Allocate a new block and copy the tokens. new_block = self.gpu_allocator.allocate( seq.hash(len(logical_blocks) - 1)) - assert (new_block.ref_count == 1) block_table[-1] = new_block self.gpu_allocator.free(last_block) return last_block.block_number, new_block.block_number From b3e73f5538ce289c427d5502e65bec2b546cfbf3 Mon Sep 17 00:00:00 2001 From: Sage Moore Date: Thu, 8 Feb 2024 16:40:12 -0500 Subject: [PATCH 16/79] minor refactoring --- vllm/block.py | 4 ++-- vllm/core/block_manager.py | 1 + vllm/core/scheduler.py | 6 ++---- 3 files changed, 5 insertions(+), 6 deletions(-) diff --git a/vllm/block.py b/vllm/block.py index 13a4d4bb067f5..bdae7f1a82902 100644 --- a/vllm/block.py +++ b/vllm/block.py @@ -2,6 +2,7 @@ from typing import List from vllm.utils import Device +from time import monotonic _BLANK_TOKEN_ID = -1 @@ -65,8 +66,7 @@ def __init__( self.block_hash = block_hash - #TODO: is this a good default? - self.last_accessed = 0 + self.last_accessed = monotonic def __repr__(self) -> str: return (f'PhysicalTokenBlock(device={self.device}, ' diff --git a/vllm/core/block_manager.py b/vllm/core/block_manager.py index 8e7375ebfad11..cc1706b0d78d7 100644 --- a/vllm/core/block_manager.py +++ b/vllm/core/block_manager.py @@ -92,6 +92,7 @@ def free(self, block: PhysicalTokenBlock) -> None: raise ValueError(f"Double free! {block} is already freed.") block.ref_count -= 1 + # TODO: Should this account for the number of blocks with a ref count of 0? 
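# One possible answer to the TODO above, written out as an assumption rather
# than what the patch series ultimately adopts: count both never-allocated
# blocks and cached blocks whose ref_count has dropped to zero, since the
# latter can be evicted on demand and are therefore "free" in practice.
from dataclasses import dataclass
from typing import Dict

@dataclass
class _Block:
    ref_count: int = 0

def num_free_blocks(num_blocks: int, current_num_blocks: int,
                    table: Dict[int, _Block]) -> int:
    evictable = sum(1 for block in table.values() if block.ref_count == 0)
    return (num_blocks - current_num_blocks) + evictable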
def get_num_free_blocks(self) -> int: return self.num_blocks - self.current_num_blocks diff --git a/vllm/core/scheduler.py b/vllm/core/scheduler.py index 4e92634ea76c9..a26af0bf127b0 100644 --- a/vllm/core/scheduler.py +++ b/vllm/core/scheduler.py @@ -127,7 +127,6 @@ def abort_seq_group(self, request_id: Union[str, Iterable[str]]) -> None: if isinstance(request_id, str): request_id = (request_id, ) request_ids = set(request_id) - for state_queue in [self.waiting, self.running, self.swapped]: aborted_groups: List[SequenceGroup] = [] for seq_group in state_queue: @@ -280,13 +279,12 @@ def _schedule(self) -> SchedulerOutputs: if self.running: # Preempt the lowest-priority sequence groups. victim_seq_group = self.running.pop() - self._preempt(victim_seq_group, blocks_to_swap_out, None, - now) + self._preempt(victim_seq_group, blocks_to_swap_out) preempted.append(victim_seq_group) else: # No other sequence groups can be preempted. # Preempt the current sequence group. - self._preempt(seq_group, blocks_to_swap_out, None, now) + self._preempt(seq_group, blocks_to_swap_out) preempted.append(seq_group) break else: From 48624d9dfed8eb09ca089f0c239676238f22689d Mon Sep 17 00:00:00 2001 From: ElizaWszola Date: Fri, 9 Feb 2024 02:08:19 -0500 Subject: [PATCH 17/79] Add prefix len to eviction strategy --- tests/prefix_caching/test_prefix_caching.py | 16 ++++---- vllm/block.py | 3 ++ vllm/core/block_manager.py | 41 +++++++++++++++------ vllm/core/scheduler.py | 3 +- vllm/engine/llm_engine.py | 2 +- vllm/sequence.py | 10 ++++- 6 files changed, 53 insertions(+), 22 deletions(-) diff --git a/tests/prefix_caching/test_prefix_caching.py b/tests/prefix_caching/test_prefix_caching.py index 9fe77b57f3fb9..e40ea9927bf22 100644 --- a/tests/prefix_caching/test_prefix_caching.py +++ b/tests/prefix_caching/test_prefix_caching.py @@ -24,7 +24,7 @@ def allocate_all_blocks(block_allocator, num_blocks): blocks = [] for i in range(num_blocks): # use i as the block_hash - blocks.append(block_allocator.allocate(i)) + blocks.append(block_allocator.allocate(i, 0)) return blocks @@ -60,8 +60,8 @@ def test_block_allocator( block_allocator = BlockAllocator(Device.CPU, block_size, num_blocks) # Allocate two PysicalTokenBlocks with the same hash and check that they are the same PhysicalTokenBlock - first_block = block_allocator.allocate(block_hash) - second_block = block_allocator.allocate(block_hash) + first_block = block_allocator.allocate(block_hash, 0) + second_block = block_allocator.allocate(block_hash, 0) assert (first_block == second_block) assert (second_block.ref_count == 2) @@ -73,7 +73,7 @@ def test_block_allocator( block_allocator.free(second_block) # Reallocate the first block and confirm that, even after the block had its ref_count go to 0, we still get the same block back - first_block = block_allocator.allocate(block_hash) + first_block = block_allocator.allocate(block_hash, 0) assert (first_block == second_block) assert (first_block.block_hash == block_hash) @@ -86,7 +86,7 @@ def test_eviction(num_blocks: int, ): for i in range(num_blocks): # use i as the block_hash - blocks.append(block_allocator.allocate(i)) + blocks.append(block_allocator.allocate(i, 0)) #Free all blocks for block in blocks: @@ -94,19 +94,19 @@ def test_eviction(num_blocks: int, ): # Allocate a new block and confirm that it's the first block freed. 
I.E The Least Recently Used block new_block_hash = block_size - new_block = block_allocator.allocate(new_block_hash) + new_block = block_allocator.allocate(new_block_hash, 0) assert (new_block == blocks[0]) assert (new_block.block_hash == new_block_hash) # Reallocate the second in blocks to remove it from the free list realloc_block_hash = 1 - realloc_block = block_allocator.allocate(realloc_block_hash) + realloc_block = block_allocator.allocate(realloc_block_hash, 0) assert (realloc_block == blocks[realloc_block_hash]) assert (realloc_block.block_hash == realloc_block_hash) # Allocate a new block and confirm that it's not the realloc_block, since the realloc_block shouldn't be in the free list new_block_hash = block_size + 1 - new_block = block_allocator.allocate(new_block_hash) + new_block = block_allocator.allocate(new_block_hash, 0) assert (realloc_block != new_block) assert (new_block.block_hash == new_block_hash) assert (new_block.block_number == 2) diff --git a/vllm/block.py b/vllm/block.py index 13a4d4bb067f5..9796178c5a60e 100644 --- a/vllm/block.py +++ b/vllm/block.py @@ -56,6 +56,7 @@ def __init__( block_number: int, block_size: int, block_hash: int, + prefix_len: int, ) -> None: self.device = device self.block_number = block_number @@ -68,6 +69,8 @@ def __init__( #TODO: is this a good default? self.last_accessed = 0 + self.prefix_len = prefix_len + def __repr__(self) -> str: return (f'PhysicalTokenBlock(device={self.device}, ' f'block_number={self.block_number}, ' diff --git a/vllm/core/block_manager.py b/vllm/core/block_manager.py index 04cd314c19de7..d6ec475337472 100644 --- a/vllm/core/block_manager.py +++ b/vllm/core/block_manager.py @@ -24,13 +24,24 @@ def lru_eviction(table: Dict[int, PhysicalTokenBlock]) -> PhysicalTokenBlock: lowest_timestamp = block.last_accessed # Find all blocks with the lowest timestamp - eviction_candidates: List[PhysicalTokenBlock] = [] + least_recent: List[PhysicalTokenBlock] = [] for block in all_blocks: if block.ref_count == 0 and block.last_accessed == lowest_timestamp: + least_recent.append(block) + + # Find highest prefix count per block + highest_prefix_count = 0 + for block in least_recent: + if block.ref_count == 0 and block.prefix_len > highest_prefix_count: + highest_prefix_count = block.prefix_len + + # Find all blocks with the lowest timestamp + eviction_candidates: List[PhysicalTokenBlock] = [] + for block in least_recent: + if block.ref_count == 0 and block.prefix_len == highest_prefix_count: eviction_candidates.append(block) # Arbitrarily evict the first candidate - # TODO: Evict based on the number of prefix tokens in the block assert (len(eviction_candidates) > 0) evicted_block = eviction_candidates[0] del table[evicted_block.block_hash] @@ -67,7 +78,8 @@ def evict(self) -> PhysicalTokenBlock: raise ValueError( f"Unknown cache eviction policy: {self.eviction_policy}") - def allocate_block(self, block_hash: int) -> PhysicalTokenBlock: + def allocate_block(self, block_hash: int, + prefix_len: int) -> PhysicalTokenBlock: if self.current_num_blocks == self.num_blocks: block = self.evict() block.block_hash = block_hash @@ -75,13 +87,15 @@ def allocate_block(self, block_hash: int) -> PhysicalTokenBlock: block = PhysicalTokenBlock(device=self.device, block_number=self.current_num_blocks, block_size=self.block_size, - block_hash=block_hash) + block_hash=block_hash, + prefix_len=prefix_len) self.current_num_blocks += 1 return block - def allocate(self, block_hash: int) -> PhysicalTokenBlock: + def allocate(self, block_hash: int, 
prefix_len: int) -> PhysicalTokenBlock: if block_hash not in self.table: - self.table[block_hash] = self.allocate_block(block_hash) + self.table[block_hash] = self.allocate_block( + block_hash, prefix_len) block = self.table[block_hash] block.ref_count += 1 # print(f"REFCOUNT ON ALLOCTION: {block}") @@ -190,7 +204,8 @@ def allocate(self, seq_group: SequenceGroup) -> None: and logical_idx >= self.block_sliding_window): block = block_table[logical_idx % self.block_sliding_window] else: - block = self.gpu_allocator.allocate(seq.hash(logical_idx)) + block = self.gpu_allocator.allocate(seq.hash(logical_idx), + seq_group.get_prefix_len()) if logical_idx * self.block_size + self.block_size > len( seq.data.get_token_ids()): self.partial_block_table[seq.seq_id] = block @@ -211,6 +226,7 @@ def can_append_slot(self, seq_group: SequenceGroup) -> bool: def append_slot(self, seq: Sequence, + prefix_len: int, now: Optional[float] = None) -> Optional[Tuple[int, int]]: """Allocate a physical slot for a new token.""" logical_blocks = seq.logical_token_blocks @@ -228,7 +244,8 @@ def append_slot(self, # Allocate a new physical block. assert (seq.seq_id not in self.partial_block_table) self.partial_block_table[ - seq.seq_id] = self.gpu_allocator.allocate(seq.seq_id) + seq.seq_id] = self.gpu_allocator.allocate( + seq.seq_id, prefix_len) block_table.append(self.partial_block_table[seq.seq_id]) return None @@ -246,7 +263,7 @@ def append_slot(self, # The last block is shared with other sequences. # Copy on Write: Allocate a new block and copy the tokens. new_block = self.gpu_allocator.allocate( - seq.hash(len(logical_blocks) - 1)) + seq.hash(len(logical_blocks) - 1), prefix_len) assert (new_block.ref_count == 1) block_table[-1] = new_block self.gpu_allocator.free(last_block, now) @@ -304,7 +321,8 @@ def swap_in(self, gpu_block.ref_count += 1 else: gpu_block = self.gpu_allocator.allocate( - seq.hash(len(seq.logical_blocks) - 1)) + seq.hash(len(seq.logical_blocks) - 1), + seq_group.get_prefix_len()) mapping[cpu_block] = gpu_block new_block_table.append(gpu_block) # Free the CPU block swapped in to GPU. @@ -342,7 +360,8 @@ def swap_out(self, cpu_block.ref_count += 1 else: cpu_block = self.cpu_allocator.allocate( - seq.hash(len(seq.logical_blocks) - 1)) + seq.hash(len(seq.logical_blocks) - 1), + seq_group.get_prefix_len()) mapping[gpu_block] = cpu_block new_block_table.append(cpu_block) # Free the GPU block swapped out to CPU. diff --git a/vllm/core/scheduler.py b/vllm/core/scheduler.py index 158fec4d8d123..912a9f1320d88 100644 --- a/vllm/core/scheduler.py +++ b/vllm/core/scheduler.py @@ -408,7 +408,8 @@ def _append_slot( now: Optional[float] = None, ) -> None: for seq in seq_group.get_seqs(status=SequenceStatus.RUNNING): - ret = self.block_manager.append_slot(seq, now) + ret = self.block_manager.append_slot(seq, now, + seq_group.get_prefix_len()) if ret is not None: src_block, dst_block = ret if src_block in blocks_to_copy: diff --git a/vllm/engine/llm_engine.py b/vllm/engine/llm_engine.py index 8c84e1dee1fff..5317874827357 100644 --- a/vllm/engine/llm_engine.py +++ b/vllm/engine/llm_engine.py @@ -451,7 +451,7 @@ def add_request( # Create the sequence group. seq_group = SequenceGroup(request_id, [seq], sampling_params, - arrival_time, lora_request) + arrival_time, lora_request, prefix_pos) # Add the sequence group to the scheduler. 
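# Miniature view of how the prefix_pos passed above flows downstream: the
# sequence group stores it, get_prefix_len() (added in the sequence.py hunk
# below) normalizes None to 0, and the block manager forwards that value as
# prefix_len when allocating blocks. This stand-in class is illustrative, not
# the real SequenceGroup.
from dataclasses import dataclass
from typing import Optional

@dataclass
class _SeqGroup:
    prefix_pos: Optional[int] = None

    def get_prefix_len(self) -> int:
        return self.prefix_pos if self.prefix_pos is not None else 0

assert _SeqGroup().get_prefix_len() == 0
assert _SeqGroup(prefix_pos=32).get_prefix_len() == 32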
self.scheduler.add_seq_group(seq_group) diff --git a/vllm/sequence.py b/vllm/sequence.py index 295c7e51b5a01..f9bb3eb24fc93 100644 --- a/vllm/sequence.py +++ b/vllm/sequence.py @@ -243,7 +243,7 @@ class SequenceGroup: sampling_params: The sampling parameters used to generate the outputs. arrival_time: The arrival time of the request. lora_request: LoRA request. - prefix: The prefix of the prompt of the sequence group. + prefix_pos: The end of prefix of the prompt of the sequence group. """ def __init__( @@ -253,6 +253,7 @@ def __init__( sampling_params: SamplingParams, arrival_time: float, lora_request: Optional[LoRARequest] = None, + prefix_pos: Optional[int] = None, ) -> None: self.request_id = request_id self.seqs_dict = {seq.seq_id: seq for seq in seqs} @@ -260,6 +261,7 @@ def __init__( self.arrival_time = arrival_time self.last_token_time = arrival_time self.lora_request = lora_request + self.prefix_pos: Optional[int] = prefix_pos self.prompt_logprobs: Optional[PromptLogprobs] = None @property @@ -347,6 +349,9 @@ def remove(self, seq_id: int) -> None: def is_finished(self) -> bool: return all(seq.is_finished() for seq in self.get_seqs()) + def get_prefix_len(self) -> int: + return self.prefix_pos if self.prefix_pos is not None else 0 + def __repr__(self) -> str: return (f"SequenceGroup(request_id={self.request_id}, " f"sampling_params={self.sampling_params}, " @@ -364,6 +369,7 @@ class SequenceGroupMetadata: block_tables: The block tables. (Seq id -> list of physical block numbers) lora_request: LoRA request. + prefix_pos: The end of prefix of the prompt of the sequence group. """ def __init__( @@ -374,6 +380,7 @@ def __init__( sampling_params: SamplingParams, block_tables: Dict[int, List[int]], lora_request: Optional[LoRARequest] = None, + prefix_pos: Optional[int] = None, ) -> None: self.request_id = request_id self.is_prompt = is_prompt @@ -381,6 +388,7 @@ def __init__( self.sampling_params = sampling_params self.block_tables = block_tables self.lora_request = lora_request + self.prefix_pos = prefix_pos @property def lora_int_id(self) -> int: From bb471f2b397d9f0f28c682e0957a202cf595c6b2 Mon Sep 17 00:00:00 2001 From: Sage Moore Date: Fri, 9 Feb 2024 09:02:35 -0500 Subject: [PATCH 18/79] fixed a few bugs in the partial block management code --- vllm/core/block_manager.py | 46 +++++++++++++++++++++++++++----------- 1 file changed, 33 insertions(+), 13 deletions(-) diff --git a/vllm/core/block_manager.py b/vllm/core/block_manager.py index e73a71979cfd9..0c0cb8e469eeb 100644 --- a/vllm/core/block_manager.py +++ b/vllm/core/block_manager.py @@ -220,8 +220,25 @@ def can_append_slot(self, seq_group: SequenceGroup) -> bool: num_seqs = seq_group.num_seqs(status=SequenceStatus.RUNNING) return num_seqs <= num_free_gpu_blocks - def append_slot(self, seq: Sequence, - prefix_len: int) -> Optional[Tuple[int, int]]: + def replace_partial_block(self, seq: Sequence, block: PhysicalTokenBlock, + old_block: PhysicalTokenBlock): + # If there's something already in the partial block table, delete it + block_hash: int = seq.seq_id + if block_hash in self.partial_block_table: + assert self.partial_block_table[block_hash] == old_block + del self.partial_block_table[block_hash] + + self.partial_block_table[block_hash] = block + + def promote_partial_block(self, seq: Sequence, block: PhysicalTokenBlock): + # Delete the block from the partial table, but don't decrement the ref count + del self.partial_block_table[seq.seq_id] + + # Compute a new hash for the block so that it can be shared by other Sequences + 
new_hash = seq.hash(len(seq.logical_token_blocks) - 1) + self.gpu_allocator.update_hash(new_hash, block) + + def append_slot(self, seq: Sequence, prefix_len: int) -> Optional[Tuple[int, int]]: """Allocate a physical slot for a new token.""" logical_blocks = seq.logical_token_blocks block_table = self.block_tables[seq.seq_id] @@ -237,10 +254,10 @@ def append_slot(self, seq: Sequence, # The sequence has a new logical block. # Allocate a new physical block. assert (seq.seq_id not in self.partial_block_table) - self.partial_block_table[ - seq.seq_id] = self.gpu_allocator.allocate( - seq.seq_id, prefix_len) - block_table.append(self.partial_block_table[seq.seq_id]) + new_block = self.gpu_allocator.allocate(seq.seq_id, prefix_len) + self.partial_block_table[seq.seq_id] = new_block + assert (new_block.ref_count == 1) + block_table.append(new_block) return None # We want to append the token to the last physical block. @@ -248,17 +265,20 @@ def append_slot(self, seq: Sequence, assert last_block.device == Device.GPU if last_block.ref_count == 1: # Not shared with other sequences. Appendable. - if len(seq.data.get_token_ids()) % seq.block_size == 0: - del self.partial_block_table[seq.seq_id] - new_hash = seq.hash(len(logical_blocks) - 1) - self.gpu_allocator.update_hash(new_hash, last_block) - return None + + # If the last block is now complete, promote it to a full block so that it can be shared + should_promote_partial_block = len( + seq.data.get_token_ids()) % seq.block_size == 0 + if should_promote_partial_block: + self.promote_partial_block(seq, last_block) + return None else: # The last block is shared with other sequences. # Copy on Write: Allocate a new block and copy the tokens. - new_block = self.gpu_allocator.allocate( - seq.hash(len(logical_blocks) - 1), prefix_len) + new_block = self.gpu_allocator.allocate(seq.seq_id, prefix_len) + self.replace_partial_block(seq, new_block, last_block) block_table[-1] = new_block + assert (new_block.ref_count == 1) self.gpu_allocator.free(last_block) return last_block.block_number, new_block.block_number From 5d5db121f97e85c3966cbd790ce5b521415e90f9 Mon Sep 17 00:00:00 2001 From: Sage Moore Date: Fri, 9 Feb 2024 09:10:19 -0500 Subject: [PATCH 19/79] auto format --- vllm/core/block_manager.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/vllm/core/block_manager.py b/vllm/core/block_manager.py index 0c0cb8e469eeb..5849b33d53ae1 100644 --- a/vllm/core/block_manager.py +++ b/vllm/core/block_manager.py @@ -238,7 +238,8 @@ def promote_partial_block(self, seq: Sequence, block: PhysicalTokenBlock): new_hash = seq.hash(len(seq.logical_token_blocks) - 1) self.gpu_allocator.update_hash(new_hash, block) - def append_slot(self, seq: Sequence, prefix_len: int) -> Optional[Tuple[int, int]]: + def append_slot(self, seq: Sequence, + prefix_len: int) -> Optional[Tuple[int, int]]: """Allocate a physical slot for a new token.""" logical_blocks = seq.logical_token_blocks block_table = self.block_tables[seq.seq_id] From ffbddd9f7d0f54b678db8da52380338ac21ba9ac Mon Sep 17 00:00:00 2001 From: Sage Moore Date: Fri, 9 Feb 2024 15:52:27 -0500 Subject: [PATCH 20/79] fix fork/cow mechanisms so that they work with partial blocks --- vllm/core/block_manager.py | 26 ++++++++++++++++++-------- 1 file changed, 18 insertions(+), 8 deletions(-) diff --git a/vllm/core/block_manager.py b/vllm/core/block_manager.py index 5849b33d53ae1..9393859005725 100644 --- a/vllm/core/block_manager.py +++ b/vllm/core/block_manager.py @@ -194,7 +194,6 @@ def allocate(self, seq_group: 
SequenceGroup) -> None: num_prompt_blocks = len(seq.logical_token_blocks) block_table: BlockTable = [] - for logical_idx in range(num_prompt_blocks): if (self.block_sliding_window is not None and logical_idx >= self.block_sliding_window): @@ -210,8 +209,12 @@ def allocate(self, seq_group: SequenceGroup) -> None: block_table.append(block) # Assign the block table for each sequence. + first_id = seq.seq_id for seq in seq_group.get_seqs(status=SequenceStatus.WAITING): self.block_tables[seq.seq_id] = block_table.copy() + if first_id in self.partial_block_table and first_id != seq.seq_id: + self.partial_block_table[ + seq.seq_id] = self.partial_block_table[first_id] def can_append_slot(self, seq_group: SequenceGroup) -> bool: # Simple heuristic: If there is at least one free block @@ -243,7 +246,6 @@ def append_slot(self, seq: Sequence, """Allocate a physical slot for a new token.""" logical_blocks = seq.logical_token_blocks block_table = self.block_tables[seq.seq_id] - # If we need to allocate a new physical block if len(block_table) < len(logical_blocks): if (self.block_sliding_window @@ -255,7 +257,8 @@ def append_slot(self, seq: Sequence, # The sequence has a new logical block. # Allocate a new physical block. assert (seq.seq_id not in self.partial_block_table) - new_block = self.gpu_allocator.allocate(seq.seq_id, prefix_len) + new_block = self.gpu_allocator.allocate( + monotonic(), prefix_len) self.partial_block_table[seq.seq_id] = new_block assert (new_block.ref_count == 1) block_table.append(new_block) @@ -266,18 +269,22 @@ def append_slot(self, seq: Sequence, assert last_block.device == Device.GPU if last_block.ref_count == 1: # Not shared with other sequences. Appendable. - # If the last block is now complete, promote it to a full block so that it can be shared - should_promote_partial_block = len( - seq.data.get_token_ids()) % seq.block_size == 0 + should_promote_partial_block = (len( + seq.data.get_token_ids())) % seq.block_size == 0 if should_promote_partial_block: self.promote_partial_block(seq, last_block) return None else: # The last block is shared with other sequences. # Copy on Write: Allocate a new block and copy the tokens. 
- new_block = self.gpu_allocator.allocate(seq.seq_id, prefix_len) - self.replace_partial_block(seq, new_block, last_block) + new_block = self.gpu_allocator.allocate(monotonic(), prefix_len) + should_promote_partial_block = (len( + seq.data.get_token_ids())) % seq.block_size == 0 + if not should_promote_partial_block: + self.replace_partial_block(seq, new_block, last_block) + else: + del self.partial_block_table[seq.seq_id] block_table[-1] = new_block assert (new_block.ref_count == 1) self.gpu_allocator.free(last_block) @@ -290,6 +297,9 @@ def fork(self, parent_seq: Sequence, child_seq: Sequence) -> None: self.block_tables[child_seq.seq_id] = src_block_table.copy() for block in src_block_table: block.ref_count += 1 + if parent_seq.seq_id in self.partial_block_table: + self.partial_block_table[ + child_seq.seq_id] = self.partial_block_table[parent_seq.seq_id] def _get_physical_blocks( self, seq_group: SequenceGroup) -> List[PhysicalTokenBlock]: From 1f7fe4279a89295b0dffc7a64cd20eb230efe9f4 Mon Sep 17 00:00:00 2001 From: Sage Moore Date: Mon, 12 Feb 2024 08:37:55 -0500 Subject: [PATCH 21/79] replace the partial block table with a simpler promotion mechanism --- vllm/core/block_manager.py | 47 ++++++++------------------------------ 1 file changed, 10 insertions(+), 37 deletions(-) diff --git a/vllm/core/block_manager.py b/vllm/core/block_manager.py index 9393859005725..998c00338abd1 100644 --- a/vllm/core/block_manager.py +++ b/vllm/core/block_manager.py @@ -162,7 +162,6 @@ def __init__( num_cpu_blocks) # Mapping: seq_id -> BlockTable. self.block_tables: Dict[int, BlockTable] = {} - self.partial_block_table: Dict[int, PhysicalTokenBlock] = {} def can_allocate(self, seq_group: SequenceGroup) -> AllocStatus: # FIXME(woosuk): Here we assume that all sequences in the group share @@ -201,20 +200,13 @@ def allocate(self, seq_group: SequenceGroup) -> None: else: block = self.gpu_allocator.allocate(seq.hash(logical_idx), seq_group.get_prefix_len()) - if logical_idx * self.block_size + self.block_size > len( - seq.data.get_token_ids()): - self.partial_block_table[seq.seq_id] = block # Set the reference counts of the token blocks. # block.ref_count = seq_group.num_seqs() block_table.append(block) # Assign the block table for each sequence. 
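# Toy version of the prompt-allocation loop above: each logical block is
# looked up (or created) in the allocator's table by its content hash, so two
# prompts sharing a prefix reuse the same physical blocks and only bump their
# reference counts. The dict-based allocator here is a stand-in for
# BlockAllocator, not its real implementation.
from dataclasses import dataclass
from typing import Dict, List

@dataclass
class _Block:
    block_hash: int
    ref_count: int = 0

class _Allocator:
    def __init__(self) -> None:
        self.table: Dict[int, _Block] = {}

    def allocate(self, block_hash: int) -> _Block:
        block = self.table.setdefault(block_hash, _Block(block_hash))
        block.ref_count += 1
        return block

def allocate_prompt(allocator: _Allocator,
                    block_hashes: List[int]) -> List[_Block]:
    return [allocator.allocate(h) for h in block_hashes]

alloc = _Allocator()
table_a = allocate_prompt(alloc, [101, 202])
table_b = allocate_prompt(alloc, [101, 303])  # same first block, different second
assert table_a[0] is table_b[0] and table_a[0].ref_count == 2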
- first_id = seq.seq_id for seq in seq_group.get_seqs(status=SequenceStatus.WAITING): self.block_tables[seq.seq_id] = block_table.copy() - if first_id in self.partial_block_table and first_id != seq.seq_id: - self.partial_block_table[ - seq.seq_id] = self.partial_block_table[first_id] def can_append_slot(self, seq_group: SequenceGroup) -> bool: # Simple heuristic: If there is at least one free block @@ -223,24 +215,14 @@ def can_append_slot(self, seq_group: SequenceGroup) -> bool: num_seqs = seq_group.num_seqs(status=SequenceStatus.RUNNING) return num_seqs <= num_free_gpu_blocks - def replace_partial_block(self, seq: Sequence, block: PhysicalTokenBlock, - old_block: PhysicalTokenBlock): - # If there's something already in the partial block table, delete it - block_hash: int = seq.seq_id - if block_hash in self.partial_block_table: - assert self.partial_block_table[block_hash] == old_block - del self.partial_block_table[block_hash] - - self.partial_block_table[block_hash] = block - - def promote_partial_block(self, seq: Sequence, block: PhysicalTokenBlock): - # Delete the block from the partial table, but don't decrement the ref count - del self.partial_block_table[seq.seq_id] - + def promote_last_block(self, seq: Sequence, block: PhysicalTokenBlock): # Compute a new hash for the block so that it can be shared by other Sequences new_hash = seq.hash(len(seq.logical_token_blocks) - 1) self.gpu_allocator.update_hash(new_hash, block) + def should_promote_last_block(self, seq: Sequence) -> bool: + return (len(seq.data.get_token_ids())) % seq.block_size == 0 + def append_slot(self, seq: Sequence, prefix_len: int) -> Optional[Tuple[int, int]]: """Allocate a physical slot for a new token.""" @@ -256,10 +238,8 @@ def append_slot(self, seq: Sequence, else: # The sequence has a new logical block. # Allocate a new physical block. - assert (seq.seq_id not in self.partial_block_table) new_block = self.gpu_allocator.allocate( monotonic(), prefix_len) - self.partial_block_table[seq.seq_id] = new_block assert (new_block.ref_count == 1) block_table.append(new_block) return None @@ -267,24 +247,20 @@ def append_slot(self, seq: Sequence, # We want to append the token to the last physical block. last_block = block_table[-1] assert last_block.device == Device.GPU + should_promote_last_block = self.should_promote_last_block(seq) if last_block.ref_count == 1: # Not shared with other sequences. Appendable. # If the last block is now complete, promote it to a full block so that it can be shared - should_promote_partial_block = (len( - seq.data.get_token_ids())) % seq.block_size == 0 - if should_promote_partial_block: - self.promote_partial_block(seq, last_block) + if (should_promote_last_block): + self.promote_last_block(seq, last_block) return None else: # The last block is shared with other sequences. # Copy on Write: Allocate a new block and copy the tokens. 
new_block = self.gpu_allocator.allocate(monotonic(), prefix_len) - should_promote_partial_block = (len( - seq.data.get_token_ids())) % seq.block_size == 0 - if not should_promote_partial_block: - self.replace_partial_block(seq, new_block, last_block) - else: - del self.partial_block_table[seq.seq_id] + + if (should_promote_last_block): + self.promote_last_block(seq, new_block) block_table[-1] = new_block assert (new_block.ref_count == 1) self.gpu_allocator.free(last_block) @@ -297,9 +273,6 @@ def fork(self, parent_seq: Sequence, child_seq: Sequence) -> None: self.block_tables[child_seq.seq_id] = src_block_table.copy() for block in src_block_table: block.ref_count += 1 - if parent_seq.seq_id in self.partial_block_table: - self.partial_block_table[ - child_seq.seq_id] = self.partial_block_table[parent_seq.seq_id] def _get_physical_blocks( self, seq_group: SequenceGroup) -> List[PhysicalTokenBlock]: From 7ab75d759e97c08cfe3cec712c8b4336d36a28b5 Mon Sep 17 00:00:00 2001 From: Sage Moore Date: Mon, 12 Feb 2024 09:15:37 -0500 Subject: [PATCH 22/79] clean up the BlockSpaceManager a bit --- vllm/core/block_manager.py | 44 +++++++++++++++++++++++++++----------- 1 file changed, 31 insertions(+), 13 deletions(-) diff --git a/vllm/core/block_manager.py b/vllm/core/block_manager.py index 998c00338abd1..b2e8424b69f4c 100644 --- a/vllm/core/block_manager.py +++ b/vllm/core/block_manager.py @@ -92,7 +92,11 @@ def allocate_block(self, block_hash: int, self.current_num_blocks += 1 return block - def allocate(self, block_hash: int, prefix_len: int) -> PhysicalTokenBlock: + def allocate(self, + block_hash: Optional[int] = None, + prefix_len: int = 0) -> PhysicalTokenBlock: + if block_hash is None: + block_hash = monotonic() if block_hash not in self.table: self.table[block_hash] = self.allocate_block( block_hash, prefix_len) @@ -215,14 +219,32 @@ def can_append_slot(self, seq_group: SequenceGroup) -> bool: num_seqs = seq_group.num_seqs(status=SequenceStatus.RUNNING) return num_seqs <= num_free_gpu_blocks - def promote_last_block(self, seq: Sequence, block: PhysicalTokenBlock): + def _promote_last_block(self, seq: Sequence, block: PhysicalTokenBlock): # Compute a new hash for the block so that it can be shared by other Sequences new_hash = seq.hash(len(seq.logical_token_blocks) - 1) + + # TODO: What if the hash already exists in the table? If it does, we can free and use that block? 
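# One way the TODO above could be resolved, sketched as an assumption (a later
# patch in this series adds a contains_block() lookup for exactly this case):
# if the full block's content hash is already cached, take a reference on the
# cached block and release the private one instead of re-keying it.
from dataclasses import dataclass
from typing import Dict

@dataclass
class _Block:
    block_hash: int
    ref_count: int = 1

def promote(table: Dict[int, _Block], block: _Block, new_hash: int) -> _Block:
    if new_hash in table:             # full block already cached elsewhere
        cached = table[new_hash]
        cached.ref_count += 1
        block.ref_count -= 1          # drop the now-duplicate private block
        return cached
    del table[block.block_hash]       # otherwise simply re-key this block
    table[new_hash] = block
    block.block_hash = new_hash
    return block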
self.gpu_allocator.update_hash(new_hash, block) - def should_promote_last_block(self, seq: Sequence) -> bool: + def _should_promote_last_block(self, seq: Sequence) -> bool: return (len(seq.data.get_token_ids())) % seq.block_size == 0 + def _maybe_promote_last_block(self, seq: Sequence, + last_block: PhysicalTokenBlock) -> None: + if self._should_promote_last_block(seq): + self._promote_last_block(seq, last_block) + + def _allocate_last_physical_block(self, seq: Sequence, + prefix_len: int) -> PhysicalTokenBlock: + block_hash: Optional[int] = None + if (self._should_promote_last_block(seq)): + block_hash = seq.hash(len(seq.logical_token_blocks) - 1) + new_block = self.gpu_allocator.allocate(block_hash, + prefix_len=prefix_len) + + assert (new_block.ref_count == 1) + return new_block + def append_slot(self, seq: Sequence, prefix_len: int) -> Optional[Tuple[int, int]]: """Allocate a physical slot for a new token.""" @@ -230,6 +252,9 @@ def append_slot(self, seq: Sequence, block_table = self.block_tables[seq.seq_id] # If we need to allocate a new physical block if len(block_table) < len(logical_blocks): + # Currently this code only supports adding one physical block + assert len(block_table) == len(logical_blocks) - 1 + if (self.block_sliding_window and len(block_table) >= self.block_sliding_window): # re-use a block @@ -238,31 +263,24 @@ def append_slot(self, seq: Sequence, else: # The sequence has a new logical block. # Allocate a new physical block. - new_block = self.gpu_allocator.allocate( - monotonic(), prefix_len) - assert (new_block.ref_count == 1) + new_block = self.allocate_last_physical_block(seq, prefix_len) block_table.append(new_block) return None # We want to append the token to the last physical block. last_block = block_table[-1] assert last_block.device == Device.GPU - should_promote_last_block = self.should_promote_last_block(seq) if last_block.ref_count == 1: # Not shared with other sequences. Appendable. # If the last block is now complete, promote it to a full block so that it can be shared - if (should_promote_last_block): - self.promote_last_block(seq, last_block) + self.maybe_promote_last_block(seq, last_block) return None else: # The last block is shared with other sequences. # Copy on Write: Allocate a new block and copy the tokens. 
- new_block = self.gpu_allocator.allocate(monotonic(), prefix_len) + new_block = self.allocate_last_physical_block(seq, prefix_len) - if (should_promote_last_block): - self.promote_last_block(seq, new_block) block_table[-1] = new_block - assert (new_block.ref_count == 1) self.gpu_allocator.free(last_block) return last_block.block_number, new_block.block_number From ca3e288d1f6dafbb7805f9b9dd2048c056a001f9 Mon Sep 17 00:00:00 2001 From: Sage Moore Date: Mon, 12 Feb 2024 09:18:20 -0500 Subject: [PATCH 23/79] fix minor typos --- vllm/core/block_manager.py | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/vllm/core/block_manager.py b/vllm/core/block_manager.py index b2e8424b69f4c..d065219c2e93c 100644 --- a/vllm/core/block_manager.py +++ b/vllm/core/block_manager.py @@ -230,12 +230,12 @@ def _should_promote_last_block(self, seq: Sequence) -> bool: return (len(seq.data.get_token_ids())) % seq.block_size == 0 def _maybe_promote_last_block(self, seq: Sequence, - last_block: PhysicalTokenBlock) -> None: + last_block: PhysicalTokenBlock) -> None: if self._should_promote_last_block(seq): self._promote_last_block(seq, last_block) def _allocate_last_physical_block(self, seq: Sequence, - prefix_len: int) -> PhysicalTokenBlock: + prefix_len: int) -> PhysicalTokenBlock: block_hash: Optional[int] = None if (self._should_promote_last_block(seq)): block_hash = seq.hash(len(seq.logical_token_blocks) - 1) @@ -263,7 +263,7 @@ def append_slot(self, seq: Sequence, else: # The sequence has a new logical block. # Allocate a new physical block. - new_block = self.allocate_last_physical_block(seq, prefix_len) + new_block = self._allocate_last_physical_block(seq, prefix_len) block_table.append(new_block) return None @@ -273,12 +273,12 @@ def append_slot(self, seq: Sequence, if last_block.ref_count == 1: # Not shared with other sequences. Appendable. # If the last block is now complete, promote it to a full block so that it can be shared - self.maybe_promote_last_block(seq, last_block) + self._maybe_promote_last_block(seq, last_block) return None else: # The last block is shared with other sequences. # Copy on Write: Allocate a new block and copy the tokens. - new_block = self.allocate_last_physical_block(seq, prefix_len) + new_block = self._allocate_last_physical_block(seq, prefix_len) block_table[-1] = new_block self.gpu_allocator.free(last_block) From ecf389dcf8fd91084072074564ce2291db2f8734 Mon Sep 17 00:00:00 2001 From: Sage Moore Date: Mon, 12 Feb 2024 09:19:21 -0500 Subject: [PATCH 24/79] minor name change --- vllm/core/block_manager.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/vllm/core/block_manager.py b/vllm/core/block_manager.py index d065219c2e93c..81d2e92183d35 100644 --- a/vllm/core/block_manager.py +++ b/vllm/core/block_manager.py @@ -226,18 +226,18 @@ def _promote_last_block(self, seq: Sequence, block: PhysicalTokenBlock): # TODO: What if the hash already exists in the table? If it does, we can free and use that block? 
self.gpu_allocator.update_hash(new_hash, block) - def _should_promote_last_block(self, seq: Sequence) -> bool: + def _is_last_block_full(self, seq: Sequence) -> bool: return (len(seq.data.get_token_ids())) % seq.block_size == 0 def _maybe_promote_last_block(self, seq: Sequence, last_block: PhysicalTokenBlock) -> None: - if self._should_promote_last_block(seq): + if self._is_last_block_full(seq): self._promote_last_block(seq, last_block) def _allocate_last_physical_block(self, seq: Sequence, prefix_len: int) -> PhysicalTokenBlock: block_hash: Optional[int] = None - if (self._should_promote_last_block(seq)): + if (self._is_last_block_full(seq)): block_hash = seq.hash(len(seq.logical_token_blocks) - 1) new_block = self.gpu_allocator.allocate(block_hash, prefix_len=prefix_len) From 427566a7a75651fc40b8fe8dc3cef103e1d718af Mon Sep 17 00:00:00 2001 From: Sage Moore Date: Mon, 12 Feb 2024 09:40:04 -0500 Subject: [PATCH 25/79] update assert --- vllm/core/block_manager.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/vllm/core/block_manager.py b/vllm/core/block_manager.py index 81d2e92183d35..76774bdb7f850 100644 --- a/vllm/core/block_manager.py +++ b/vllm/core/block_manager.py @@ -242,7 +242,8 @@ def _allocate_last_physical_block(self, seq: Sequence, new_block = self.gpu_allocator.allocate(block_hash, prefix_len=prefix_len) - assert (new_block.ref_count == 1) + if block_hash is None: + assert (new_block.ref_count == 1) return new_block def append_slot(self, seq: Sequence, From a3431bbf63627814df966926e711828d880c8836 Mon Sep 17 00:00:00 2001 From: ElizaWszola Date: Mon, 12 Feb 2024 10:45:10 -0500 Subject: [PATCH 26/79] fix swap_in and swap_out --- vllm/core/block_manager.py | 49 +++++++++++++++++++++++++++----------- 1 file changed, 35 insertions(+), 14 deletions(-) diff --git a/vllm/core/block_manager.py b/vllm/core/block_manager.py index 76774bdb7f850..bd80dbb3648c5 100644 --- a/vllm/core/block_manager.py +++ b/vllm/core/block_manager.py @@ -229,23 +229,43 @@ def _promote_last_block(self, seq: Sequence, block: PhysicalTokenBlock): def _is_last_block_full(self, seq: Sequence) -> bool: return (len(seq.data.get_token_ids())) % seq.block_size == 0 + def _is_last_block(self, seq: Sequence, index: int) -> bool: + return index == len(seq.logical_token_blocks) - 1 + + def _is_block_full(self, seq: Sequence, index: int) -> bool: + return not self._is_last_block(seq, + index) or self._is_last_block_full(seq) + def _maybe_promote_last_block(self, seq: Sequence, last_block: PhysicalTokenBlock) -> None: if self._is_last_block_full(seq): self._promote_last_block(seq, last_block) - def _allocate_last_physical_block(self, seq: Sequence, - prefix_len: int) -> PhysicalTokenBlock: + def _allocate_physical_block(self, + seq: Sequence, + index: int, + prefix_len: int, + use_gpu: bool = True) -> PhysicalTokenBlock: block_hash: Optional[int] = None - if (self._is_last_block_full(seq)): - block_hash = seq.hash(len(seq.logical_token_blocks) - 1) - new_block = self.gpu_allocator.allocate(block_hash, - prefix_len=prefix_len) + if (self._is_block_full(seq, index)): + block_hash = seq.hash(index) + if use_gpu: + new_block = self.gpu_allocator.allocate(block_hash, + prefix_len=prefix_len) + else: + new_block = self.cpu_allocator.allocate(block_hash, + prefix_len=prefix_len) if block_hash is None: assert (new_block.ref_count == 1) return new_block + def _allocate_last_physical_block(self, seq: Sequence, + prefix_len: int) -> PhysicalTokenBlock: + return self._allocate_physical_block(seq, + 
len(seq.logical_token_blocks) - 1, + prefix_len) + def append_slot(self, seq: Sequence, prefix_len: int) -> Optional[Tuple[int, int]]: """Allocate a physical slot for a new token.""" @@ -329,14 +349,15 @@ def swap_in(self, seq_group: SequenceGroup) -> Dict[int, int]: new_block_table.append(block) block.ref_count += 1 - for cpu_block in block_table: + # Assumption that len(block_table) == len(logical_blocks) + for i in range(len(block_table)): + cpu_block = block_table[i] if cpu_block in mapping: gpu_block = mapping[cpu_block] gpu_block.ref_count += 1 else: - gpu_block = self.gpu_allocator.allocate( - seq.hash(len(seq.logical_blocks) - 1), - seq_group.get_prefix_len()) + gpu_block = self._allocate_physical_block( + seq, i, seq_group.get_prefix_len()) mapping[cpu_block] = gpu_block new_block_table.append(gpu_block) # Free the CPU block swapped in to GPU. @@ -360,7 +381,8 @@ def swap_out(self, seq_group: SequenceGroup) -> Dict[int, int]: new_block_table: BlockTable = [] block_table = self.block_tables[seq.seq_id] - for gpu_block in block_table: + for i in range(len(block_table)): + gpu_block = block_table[i] if (seq_group.prefix is not None and gpu_block in seq_group.prefix.block_table): # NOTE: We do not swap out the prefix blocks for now. @@ -371,9 +393,8 @@ def swap_out(self, seq_group: SequenceGroup) -> Dict[int, int]: cpu_block = mapping[gpu_block] cpu_block.ref_count += 1 else: - cpu_block = self.cpu_allocator.allocate( - seq.hash(len(seq.logical_blocks) - 1), - seq_group.get_prefix_len()) + cpu_block = self._allocate_physical_block( + seq, i, seq_group.get_prefix_len(), use_gpu=False) mapping[gpu_block] = cpu_block new_block_table.append(cpu_block) # Free the GPU block swapped out to CPU. From dedc9c0b0fe713ef978e0bbe608041d4bb3a3c94 Mon Sep 17 00:00:00 2001 From: Sage Moore Date: Mon, 12 Feb 2024 11:30:52 -0500 Subject: [PATCH 27/79] remove dead code in BlockSpaceManager --- vllm/core/block_manager.py | 10 ---------- 1 file changed, 10 deletions(-) diff --git a/vllm/core/block_manager.py b/vllm/core/block_manager.py index bd80dbb3648c5..71b61e12e5261 100644 --- a/vllm/core/block_manager.py +++ b/vllm/core/block_manager.py @@ -336,10 +336,6 @@ def can_swap_in(self, seq_group: SequenceGroup) -> bool: def swap_in(self, seq_group: SequenceGroup) -> Dict[int, int]: # CPU block -> GPU block. - if seq_group.prefix is not None: - # make sure to swap in the prefix first - assert seq_group.prefix.allocated and seq_group.prefix.computed - mapping: Dict[PhysicalTokenBlock, PhysicalTokenBlock] = {} for seq in seq_group.get_seqs(status=SequenceStatus.SWAPPED): new_block_table: BlockTable = [] @@ -383,12 +379,6 @@ def swap_out(self, seq_group: SequenceGroup) -> Dict[int, int]: for i in range(len(block_table)): gpu_block = block_table[i] - if (seq_group.prefix is not None - and gpu_block in seq_group.prefix.block_table): - # NOTE: We do not swap out the prefix blocks for now. 
- self.gpu_allocator.free(gpu_block) - continue - if gpu_block in mapping: cpu_block = mapping[gpu_block] cpu_block.ref_count += 1 From 86299a457b9dc4e8ba57580e50d4fda425416996 Mon Sep 17 00:00:00 2001 From: Sage Moore Date: Mon, 12 Feb 2024 12:05:00 -0500 Subject: [PATCH 28/79] refactor swap_in/swap_out in BlockSpaceManager --- vllm/core/block_manager.py | 42 ++++++++++++-------------------------- 1 file changed, 13 insertions(+), 29 deletions(-) diff --git a/vllm/core/block_manager.py b/vllm/core/block_manager.py index 71b61e12e5261..a67982e659515 100644 --- a/vllm/core/block_manager.py +++ b/vllm/core/block_manager.py @@ -241,31 +241,18 @@ def _maybe_promote_last_block(self, seq: Sequence, if self._is_last_block_full(seq): self._promote_last_block(seq, last_block) - def _allocate_physical_block(self, - seq: Sequence, - index: int, - prefix_len: int, - use_gpu: bool = True) -> PhysicalTokenBlock: + def _allocate_last_physical_block(self, seq: Sequence, + prefix_len: int) -> PhysicalTokenBlock: block_hash: Optional[int] = None - if (self._is_block_full(seq, index)): - block_hash = seq.hash(index) - if use_gpu: - new_block = self.gpu_allocator.allocate(block_hash, - prefix_len=prefix_len) - else: - new_block = self.cpu_allocator.allocate(block_hash, - prefix_len=prefix_len) - + logical_idx = len(seq.logical_token_blocks) - 1 + if (self._is_block_full(seq, logical_idx)): + block_hash = seq.hash(logical_idx) + new_block = self.gpu_allocator.allocate(block_hash, + prefix_len=prefix_len) if block_hash is None: assert (new_block.ref_count == 1) return new_block - def _allocate_last_physical_block(self, seq: Sequence, - prefix_len: int) -> PhysicalTokenBlock: - return self._allocate_physical_block(seq, - len(seq.logical_token_blocks) - 1, - prefix_len) - def append_slot(self, seq: Sequence, prefix_len: int) -> Optional[Tuple[int, int]]: """Allocate a physical slot for a new token.""" @@ -345,15 +332,13 @@ def swap_in(self, seq_group: SequenceGroup) -> Dict[int, int]: new_block_table.append(block) block.ref_count += 1 - # Assumption that len(block_table) == len(logical_blocks) - for i in range(len(block_table)): - cpu_block = block_table[i] + for cpu_block in block_table: if cpu_block in mapping: gpu_block = mapping[cpu_block] gpu_block.ref_count += 1 else: - gpu_block = self._allocate_physical_block( - seq, i, seq_group.get_prefix_len()) + gpu_block = self.gpu_allocator.allocate( + cpu_block.block_hash, cpu_block.prefix_len) mapping[cpu_block] = gpu_block new_block_table.append(gpu_block) # Free the CPU block swapped in to GPU. @@ -377,14 +362,13 @@ def swap_out(self, seq_group: SequenceGroup) -> Dict[int, int]: new_block_table: BlockTable = [] block_table = self.block_tables[seq.seq_id] - for i in range(len(block_table)): - gpu_block = block_table[i] + for gpu_block in block_table: if gpu_block in mapping: cpu_block = mapping[gpu_block] cpu_block.ref_count += 1 else: - cpu_block = self._allocate_physical_block( - seq, i, seq_group.get_prefix_len(), use_gpu=False) + cpu_block = self.cpu_allocator.allocate( + gpu_block.block_hash, gpu_block.prefix_len) mapping[gpu_block] = cpu_block new_block_table.append(cpu_block) # Free the GPU block swapped out to CPU. 
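The idea behind the swap refactor above, shown as a small standalone sketch rather than part of the patch itself: once blocks carry their own content hash and prefix length, swap_in/swap_out can allocate the destination copy directly from the source block's metadata instead of recomputing hashes from the sequence. ToyBlock/ToyAllocator below are hypothetical stand-ins, not the real vLLM classes.

from dataclasses import dataclass
from typing import Dict, List


@dataclass
class ToyBlock:
    device: str
    block_hash: int
    prefix_len: int
    ref_count: int = 0


class ToyAllocator:
    def __init__(self, device: str) -> None:
        self.device = device
        self.table: Dict[int, ToyBlock] = {}

    def allocate(self, block_hash: int, prefix_len: int) -> ToyBlock:
        # Blocks are keyed by content hash: allocating the same hash again
        # returns the existing block with a bumped reference count.
        if block_hash not in self.table:
            self.table[block_hash] = ToyBlock(self.device, block_hash,
                                              prefix_len)
        block = self.table[block_hash]
        block.ref_count += 1
        return block

    def free(self, block: ToyBlock) -> None:
        assert block.ref_count > 0
        block.ref_count -= 1


def swap_in(cpu_alloc: ToyAllocator, gpu_alloc: ToyAllocator,
            cpu_block_table: List[ToyBlock]) -> List[ToyBlock]:
    # The hash and prefix length travel with the block, so the GPU copy is
    # allocated straight from the CPU block's metadata and the CPU copy is
    # released afterwards.
    gpu_block_table = []
    for cpu_block in cpu_block_table:
        gpu_block = gpu_alloc.allocate(cpu_block.block_hash,
                                       cpu_block.prefix_len)
        gpu_block_table.append(gpu_block)
        cpu_alloc.free(cpu_block)
    return gpu_block_table

Because the allocator deduplicates on the hash, swapping the same logical block in from two sequences simply increments the ref count on one physical block, which is the behavior the real swap_in/swap_out paths rely on.
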
From 614a197e20c258c788156788d7637fc5a410b916 Mon Sep 17 00:00:00 2001 From: Sage Moore Date: Mon, 12 Feb 2024 17:10:09 -0500 Subject: [PATCH 29/79] Update the partial block promotion logic to account for the full version already being in the cache --- vllm/core/block_manager.py | 43 ++++++++++++++++++++++---------------- 1 file changed, 25 insertions(+), 18 deletions(-) diff --git a/vllm/core/block_manager.py b/vllm/core/block_manager.py index a67982e659515..ef6e47cb89f44 100644 --- a/vllm/core/block_manager.py +++ b/vllm/core/block_manager.py @@ -102,7 +102,6 @@ def allocate(self, block_hash, prefix_len) block = self.table[block_hash] block.ref_count += 1 - # print(f"REFCOUNT ON ALLOCTION: {block}") return block def free(self, block: PhysicalTokenBlock) -> None: @@ -114,7 +113,11 @@ def free(self, block: PhysicalTokenBlock) -> None: def get_num_free_blocks(self) -> int: return self.num_blocks - self.current_num_blocks - def update_hash(self, block_hash: int, block: PhysicalTokenBlock) -> None: + def contains_block(self, block_hash: int) -> bool: + return block_hash in self.table + + def update_hash(self, block_hash: int, block: PhysicalTokenBlock): + assert (not self.contains_block(block_hash)) old_hash = block.block_hash del self.table[old_hash] self.table[block_hash] = block @@ -204,8 +207,6 @@ def allocate(self, seq_group: SequenceGroup) -> None: else: block = self.gpu_allocator.allocate(seq.hash(logical_idx), seq_group.get_prefix_len()) - # Set the reference counts of the token blocks. - # block.ref_count = seq_group.num_seqs() block_table.append(block) # Assign the block table for each sequence. @@ -219,12 +220,19 @@ def can_append_slot(self, seq_group: SequenceGroup) -> bool: num_seqs = seq_group.num_seqs(status=SequenceStatus.RUNNING) return num_seqs <= num_free_gpu_blocks - def _promote_last_block(self, seq: Sequence, block: PhysicalTokenBlock): + def _promote_last_block( + self, seq: Sequence, + last_block: PhysicalTokenBlock) -> PhysicalTokenBlock: # Compute a new hash for the block so that it can be shared by other Sequences new_hash = seq.hash(len(seq.logical_token_blocks) - 1) - # TODO: What if the hash already exists in the table? If it does, we can free and use that block? 
- self.gpu_allocator.update_hash(new_hash, block) + # if new_hash is already in the cached table, then free last_block and return the cached version + if self.gpu_allocator.contains_block(new_hash): + self.gpu_allocator.free(last_block) + return self.gpu_allocator.allocate(new_hash) + else: + self.gpu_allocator.update_hash(new_hash, last_block) + return last_block def _is_last_block_full(self, seq: Sequence) -> bool: return (len(seq.data.get_token_ids())) % seq.block_size == 0 @@ -232,21 +240,19 @@ def _is_last_block_full(self, seq: Sequence) -> bool: def _is_last_block(self, seq: Sequence, index: int) -> bool: return index == len(seq.logical_token_blocks) - 1 - def _is_block_full(self, seq: Sequence, index: int) -> bool: - return not self._is_last_block(seq, - index) or self._is_last_block_full(seq) - - def _maybe_promote_last_block(self, seq: Sequence, - last_block: PhysicalTokenBlock) -> None: + def _maybe_promote_last_block( + self, seq: Sequence, + last_block: PhysicalTokenBlock) -> PhysicalTokenBlock: if self._is_last_block_full(seq): - self._promote_last_block(seq, last_block) + return self._promote_last_block(seq, last_block) + else: + return last_block def _allocate_last_physical_block(self, seq: Sequence, prefix_len: int) -> PhysicalTokenBlock: block_hash: Optional[int] = None - logical_idx = len(seq.logical_token_blocks) - 1 - if (self._is_block_full(seq, logical_idx)): - block_hash = seq.hash(logical_idx) + if (self._is_last_block_full(seq)): + block_hash = seq.hash(len(seq.logical_token_blocks) - 1) new_block = self.gpu_allocator.allocate(block_hash, prefix_len=prefix_len) if block_hash is None: @@ -281,7 +287,8 @@ def append_slot(self, seq: Sequence, if last_block.ref_count == 1: # Not shared with other sequences. Appendable. # If the last block is now complete, promote it to a full block so that it can be shared - self._maybe_promote_last_block(seq, last_block) + new_block = self._maybe_promote_last_block(seq, last_block) + block_table[-1] = new_block return None else: # The last block is shared with other sequences. 
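The promotion logic above only works because the content hash used throughout this series covers every token from the start of the sequence through the end of the given block, so two sequences that share a prompt produce identical hashes for their full prompt blocks. A small standalone illustration with a toy block size and made-up token ids (not the real vLLM code):

BLOCK_SIZE = 4


def block_hash(token_ids, logical_idx, block_size=BLOCK_SIZE):
    # Hash the token prefix that ends at the last token of this block,
    # mirroring Sequence.hash in this series.
    num_tokens = (logical_idx + 1) * block_size
    return hash(tuple(token_ids[:num_tokens]))


shared_prompt = [1, 2, 3, 4, 5, 6, 7, 8]
seq_a = shared_prompt + [100]  # same prompt, different generated tokens
seq_b = shared_prompt + [200]

# Full prompt blocks hash identically, so they can resolve to one physical
# block in the allocator's table.
assert block_hash(seq_a, 0) == block_hash(seq_b, 0)
assert block_hash(seq_a, 1) == block_hash(seq_b, 1)

Until the last block fills up it is keyed by a placeholder id rather than a content hash, which is why the promotion step either re-keys it under its real hash or, if an identical full block is already cached, frees it and shares the cached copy instead.
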
From 0f8547423565a72e470f7355bb7e223701765e1b Mon Sep 17 00:00:00 2001 From: ElizaWszola Date: Tue, 13 Feb 2024 07:55:12 -0500 Subject: [PATCH 30/79] remove min from sequence hash --- vllm/sequence.py | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/vllm/sequence.py b/vllm/sequence.py index f9bb3eb24fc93..2c72be85fc520 100644 --- a/vllm/sequence.py +++ b/vllm/sequence.py @@ -143,9 +143,7 @@ def lora_int_id(self) -> int: def hash(self, logical_idx: int) -> int: # Compute the number of tokens in the sequence num_tokens = logical_idx * self.block_size + self.block_size - return hash( - tuple(self.data.get_token_ids() - [0:min(num_tokens, len(self.data.get_token_ids()))])) + return hash(tuple(self.data.get_token_ids()[0:num_tokens])) def _append_logical_block(self) -> None: block = LogicalTokenBlock( From 9672b20fe6c9a7fc46c9e8eeb7f3f2f9b07570ee Mon Sep 17 00:00:00 2001 From: Sage Moore Date: Tue, 13 Feb 2024 09:43:26 -0500 Subject: [PATCH 31/79] Remove prefix.py --- vllm/prefix.py | 88 -------------------------------------------------- 1 file changed, 88 deletions(-) delete mode 100644 vllm/prefix.py diff --git a/vllm/prefix.py b/vllm/prefix.py deleted file mode 100644 index 4b780161a5278..0000000000000 --- a/vllm/prefix.py +++ /dev/null @@ -1,88 +0,0 @@ -from typing import Dict, List, Sequence, Tuple, Optional - -from vllm.block import BlockTable - - -class Prefix: - """Data and states associated with a prefix of prompt tokens for multiple - sequence groups. - - NOTE: This feature is experimental and may be replaced with automatic - prefix caching in the future. - - Args: - token_ids: The token ids of the prefix. - block_size: The block size of the executed model. - """ - - def __init__( - self, - token_ids: Sequence[int], - block_size: int, - ) -> None: - self.token_ids = tuple(token_ids) - self.block_size = block_size - self.length = len(token_ids) - self.hash = hash(token_ids) - assert self.length % block_size == 0 - self.block_table: Optional[BlockTable] = None - self.computed = False - - @property - def allocated(self) -> bool: - return self.block_table is not None - - def get_num_blocks(self) -> int: - return self.length // self.block_size - - def get_block_numbers(self) -> List[int]: - return [block.block_number for block in self.block_table] - - def get_length(self) -> int: - return self.length - - def __hash__(self) -> int: - return self.hash - - def set_block_table(self, block_table: BlockTable) -> None: - self.block_table = block_table.copy() - - -class PrefixPool: - """Manages all the prompt prefixes. - - NOTE: This feature is experimental and may be replaced with automatic - prefix caching in the future. - - Args: - block_size: The block size of the executed model. - - Attributes: - prefixes: A list of all the prefixes. - block_size: The block size of the executed model. - """ - - def __init__( - self, - block_size: int, - ) -> None: - # TODO(zhuohan): Add a capacity limit to the prefix pool. - self.prefixes: Dict[int, Prefix] = {} - self.block_size = block_size - - def _truncate_token_ids(self, token_ids: Sequence[int]) -> Tuple[int]: - new_length = len(token_ids) // self.block_size * self.block_size - return tuple(token_ids[:new_length]) - - # TODO clean this up? It's not used anywhere now - def add_or_get_prefix(self, token_ids: Sequence[int], - lora_int_id: int) -> Optional[Prefix]: - token_ids = self._truncate_token_ids(token_ids) - if len(token_ids) == 0: - # Prefix is empty. 
- return None - prefix = Prefix(token_ids, self.block_size) - prefix_hash = hash((prefix, lora_int_id)) - if prefix_hash not in self.prefixes: - self.prefixes[prefix_hash] = prefix - return self.prefixes[prefix_hash] From 6044c2b39462c4ffaecdf37420ade26439ad9fdc Mon Sep 17 00:00:00 2001 From: Sage Moore Date: Tue, 13 Feb 2024 09:56:20 -0500 Subject: [PATCH 32/79] misc formatting --- vllm/core/block_manager.py | 45 +++++++++++++++++++++++++++----------- vllm/core/scheduler.py | 26 ++++++++++++++++------ 2 files changed, 51 insertions(+), 20 deletions(-) diff --git a/vllm/core/block_manager.py b/vllm/core/block_manager.py index ef6e47cb89f44..1ecb7127232d8 100644 --- a/vllm/core/block_manager.py +++ b/vllm/core/block_manager.py @@ -196,7 +196,6 @@ def allocate(self, seq_group: SequenceGroup) -> None: seq = seq_group.get_seqs(status=SequenceStatus.WAITING)[0] # Allocate new physical token blocks that will store the prompt tokens. - num_prompt_blocks = len(seq.logical_token_blocks) block_table: BlockTable = [] @@ -221,8 +220,10 @@ def can_append_slot(self, seq_group: SequenceGroup) -> bool: return num_seqs <= num_free_gpu_blocks def _promote_last_block( - self, seq: Sequence, - last_block: PhysicalTokenBlock) -> PhysicalTokenBlock: + self, + seq: Sequence, + last_block: PhysicalTokenBlock, + ) -> PhysicalTokenBlock: # Compute a new hash for the block so that it can be shared by other Sequences new_hash = seq.hash(len(seq.logical_token_blocks) - 1) @@ -234,22 +235,34 @@ def _promote_last_block( self.gpu_allocator.update_hash(new_hash, last_block) return last_block - def _is_last_block_full(self, seq: Sequence) -> bool: + def _is_last_block_full( + self, + seq: Sequence, + ) -> bool: return (len(seq.data.get_token_ids())) % seq.block_size == 0 - def _is_last_block(self, seq: Sequence, index: int) -> bool: + def _is_last_block( + self, + seq: Sequence, + index: int, + ) -> bool: return index == len(seq.logical_token_blocks) - 1 def _maybe_promote_last_block( - self, seq: Sequence, - last_block: PhysicalTokenBlock) -> PhysicalTokenBlock: + self, + seq: Sequence, + last_block: PhysicalTokenBlock, + ) -> PhysicalTokenBlock: if self._is_last_block_full(seq): return self._promote_last_block(seq, last_block) else: return last_block - def _allocate_last_physical_block(self, seq: Sequence, - prefix_len: int) -> PhysicalTokenBlock: + def _allocate_last_physical_block( + self, + seq: Sequence, + prefix_len: int, + ) -> PhysicalTokenBlock: block_hash: Optional[int] = None if (self._is_last_block_full(seq)): block_hash = seq.hash(len(seq.logical_token_blocks) - 1) @@ -259,8 +272,11 @@ def _allocate_last_physical_block(self, seq: Sequence, assert (new_block.ref_count == 1) return new_block - def append_slot(self, seq: Sequence, - prefix_len: int) -> Optional[Tuple[int, int]]: + def append_slot( + self, + seq: Sequence, + prefix_len: int, + ) -> Optional[Tuple[int, int]]: """Allocate a physical slot for a new token.""" logical_blocks = seq.logical_token_blocks block_table = self.block_tables[seq.seq_id] @@ -418,8 +434,11 @@ def get_num_free_gpu_blocks(self) -> int: def get_num_free_cpu_blocks(self) -> int: return self.cpu_allocator.get_num_free_blocks() - def access_all_blocks_in_seq(self, seq: Sequence, - access_time: float) -> None: + def access_all_blocks_in_seq( + self, + seq: Sequence, + access_time: float, + ) -> None: block_table = self.block_tables[seq.seq_id] for block in block_table: block.last_accessed = access_time diff --git a/vllm/core/scheduler.py b/vllm/core/scheduler.py index 
cd08940c49a0b..b12457afa85b9 100644 --- a/vllm/core/scheduler.py +++ b/vllm/core/scheduler.py @@ -400,8 +400,11 @@ def _allocate(self, seq_group: SequenceGroup) -> None: for seq in seq_group.get_seqs(status=SequenceStatus.WAITING): seq.status = SequenceStatus.RUNNING - def _append_slot(self, seq_group: SequenceGroup, - blocks_to_copy: Dict[int, List[int]]) -> None: + def _append_slot( + self, + seq_group: SequenceGroup, + blocks_to_copy: Dict[int, List[int]], + ) -> None: for seq in seq_group.get_seqs(status=SequenceStatus.RUNNING): ret = self.block_manager.append_slot(seq, seq_group.get_prefix_len()) @@ -441,7 +444,10 @@ def _preempt( else: raise AssertionError("Invalid preemption mode.") - def _preempt_by_recompute(self, seq_group: SequenceGroup) -> None: + def _preempt_by_recompute( + self, + seq_group: SequenceGroup, + ) -> None: seqs = seq_group.get_seqs(status=SequenceStatus.RUNNING) assert len(seqs) == 1 for seq in seqs: @@ -459,15 +465,21 @@ def _preempt_by_swap( self._swap_out(seq_group, blocks_to_swap_out) self.swapped.append(seq_group) - def _swap_in(self, seq_group: SequenceGroup, - blocks_to_swap_in: Dict[int, int]) -> None: + def _swap_in( + self, + seq_group: SequenceGroup, + blocks_to_swap_in: Dict[int, int], + ) -> None: mapping = self.block_manager.swap_in(seq_group) blocks_to_swap_in.update(mapping) for seq in seq_group.get_seqs(status=SequenceStatus.SWAPPED): seq.status = SequenceStatus.RUNNING - def _swap_out(self, seq_group: SequenceGroup, - blocks_to_swap_out: Dict[int, int]) -> None: + def _swap_out( + self, + seq_group: SequenceGroup, + blocks_to_swap_out: Dict[int, int], + ) -> None: if not self.block_manager.can_swap_out(seq_group): # FIXME(woosuk): Abort the sequence group instead of aborting the # entire engine. From 9f7ae9f1ab3f973652f273a463702e8437d823f1 Mon Sep 17 00:00:00 2001 From: ElizaWszola Date: Tue, 13 Feb 2024 12:09:40 -0500 Subject: [PATCH 33/79] bring back free table --- vllm/core/block_manager.py | 39 ++++++++++++++++++++++++++------------ 1 file changed, 27 insertions(+), 12 deletions(-) diff --git a/vllm/core/block_manager.py b/vllm/core/block_manager.py index 1ecb7127232d8..ba8415d110c79 100644 --- a/vllm/core/block_manager.py +++ b/vllm/core/block_manager.py @@ -13,38 +13,41 @@ class EvictionPolicy(enum.Enum): LRU = enum.auto() -def lru_eviction(table: Dict[int, PhysicalTokenBlock]) -> PhysicalTokenBlock: - all_blocks: List[PhysicalTokenBlock] = list(table.values()) - assert (len(all_blocks) > 0) +def lru_eviction(free_table: Dict[int, PhysicalTokenBlock]) -> PhysicalTokenBlock: + free_blocks: List[PhysicalTokenBlock] = list(free_table.values()) + if len(free_blocks) == 0: + raise ValueError("No usable cache memory left") # Find lowest timestamp lowest_timestamp = monotonic() - for block in all_blocks: - if block.ref_count == 0 and block.last_accessed < lowest_timestamp: + for block in free_blocks: + if block.last_accessed < lowest_timestamp: lowest_timestamp = block.last_accessed # Find all blocks with the lowest timestamp least_recent: List[PhysicalTokenBlock] = [] - for block in all_blocks: - if block.ref_count == 0 and block.last_accessed == lowest_timestamp: + for block in free_blocks: + if block.last_accessed == lowest_timestamp: least_recent.append(block) # Find highest prefix count per block highest_prefix_count = 0 for block in least_recent: - if block.ref_count == 0 and block.prefix_len > highest_prefix_count: + if block.prefix_len > highest_prefix_count: highest_prefix_count = block.prefix_len # Find all blocks with the lowest 
timestamp eviction_candidates: List[PhysicalTokenBlock] = [] for block in least_recent: - if block.ref_count == 0 and block.prefix_len == highest_prefix_count: + if block.prefix_len == highest_prefix_count: eviction_candidates.append(block) # Arbitrarily evict the first candidate - assert (len(eviction_candidates) > 0) + if len(eviction_candidates) == 0: + raise ValueError("No usable cache memory left") + evicted_block = eviction_candidates[0] - del table[evicted_block.block_hash] + del free_table[evicted_block.block_hash] return evicted_block @@ -70,10 +73,11 @@ def __init__(self, self.current_num_blocks = 0 self.table: Dict[int, PhysicalTokenBlock] = {} + self.free_table: Dict[int, PhysicalTokenBlock] = {} def evict(self) -> PhysicalTokenBlock: if self.eviction_policy == EvictionPolicy.LRU: - return lru_eviction(self.table) + return lru_eviction(self.free_table) else: raise ValueError( f"Unknown cache eviction policy: {self.eviction_policy}") @@ -83,6 +87,7 @@ def allocate_block(self, block_hash: int, if self.current_num_blocks == self.num_blocks: block = self.evict() block.block_hash = block_hash + block.prefix_len = prefix_len return block block = PhysicalTokenBlock(device=self.device, block_number=self.current_num_blocks, @@ -97,6 +102,13 @@ def allocate(self, prefix_len: int = 0) -> PhysicalTokenBlock: if block_hash is None: block_hash = monotonic() + if block_hash in self.free_table: + assert block_hash not in self.table + block = self.free_table[block_hash] + self.table[block_hash] = block + block.ref_count += 1 + del self.free_table[block_hash] + return block if block_hash not in self.table: self.table[block_hash] = self.allocate_block( block_hash, prefix_len) @@ -108,6 +120,9 @@ def free(self, block: PhysicalTokenBlock) -> None: if block.ref_count == 0: raise ValueError(f"Double free! {block} is already freed.") block.ref_count -= 1 + if block.ref_count == 0: + self.free_table[block.block_hash] = block + del self.table[block.block_hash] # TODO: Should this account for the number of blocks with a ref count of 0? 
def get_num_free_blocks(self) -> int: From 66551303da18d5e4ac3d5d92e3bd1cea0680c9ac Mon Sep 17 00:00:00 2001 From: ElizaWszola Date: Tue, 13 Feb 2024 12:10:13 -0500 Subject: [PATCH 34/79] format --- vllm/core/block_manager.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/vllm/core/block_manager.py b/vllm/core/block_manager.py index ba8415d110c79..9db130949f2ad 100644 --- a/vllm/core/block_manager.py +++ b/vllm/core/block_manager.py @@ -13,7 +13,8 @@ class EvictionPolicy(enum.Enum): LRU = enum.auto() -def lru_eviction(free_table: Dict[int, PhysicalTokenBlock]) -> PhysicalTokenBlock: +def lru_eviction( + free_table: Dict[int, PhysicalTokenBlock]) -> PhysicalTokenBlock: free_blocks: List[PhysicalTokenBlock] = list(free_table.values()) if len(free_blocks) == 0: raise ValueError("No usable cache memory left") From 1d6f0a03be761e9aed5a2c6f1d8870b101db8ec1 Mon Sep 17 00:00:00 2001 From: Sage Moore Date: Tue, 13 Feb 2024 14:41:17 -0500 Subject: [PATCH 35/79] update get_num_free_blocks to account for blocks in free table --- vllm/core/block_manager.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/vllm/core/block_manager.py b/vllm/core/block_manager.py index 9db130949f2ad..883a88fddb2e7 100644 --- a/vllm/core/block_manager.py +++ b/vllm/core/block_manager.py @@ -125,9 +125,8 @@ def free(self, block: PhysicalTokenBlock) -> None: self.free_table[block.block_hash] = block del self.table[block.block_hash] - # TODO: Should this account for the number of blocks with a ref count of 0? def get_num_free_blocks(self) -> int: - return self.num_blocks - self.current_num_blocks + return self.num_blocks - self.current_num_blocks + len(self.free_table) def contains_block(self, block_hash: int) -> bool: return block_hash in self.table From 0ca5c43685b287652ecece544faa5518cf368e81 Mon Sep 17 00:00:00 2001 From: Sage Moore Date: Tue, 13 Feb 2024 15:02:55 -0500 Subject: [PATCH 36/79] add some more asserts to BlockAllocator --- vllm/core/block_manager.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/vllm/core/block_manager.py b/vllm/core/block_manager.py index 883a88fddb2e7..0f4f33e42ebee 100644 --- a/vllm/core/block_manager.py +++ b/vllm/core/block_manager.py @@ -106,6 +106,7 @@ def allocate(self, if block_hash in self.free_table: assert block_hash not in self.table block = self.free_table[block_hash] + assert block.ref_count == 0 self.table[block_hash] = block block.ref_count += 1 del self.free_table[block_hash] @@ -122,6 +123,7 @@ def free(self, block: PhysicalTokenBlock) -> None: raise ValueError(f"Double free! 
{block} is already freed.") block.ref_count -= 1 if block.ref_count == 0: + assert block.block_hash not in self.free_table self.free_table[block.block_hash] = block del self.table[block.block_hash] From 7d6444d4125f47c97d0633d4539fa72181e51bf0 Mon Sep 17 00:00:00 2001 From: ElizaWszola Date: Tue, 13 Feb 2024 21:49:57 -0500 Subject: [PATCH 37/79] contains_block() now looks at both table and free_table + a couple asserts --- vllm/core/block_manager.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/vllm/core/block_manager.py b/vllm/core/block_manager.py index 0f4f33e42ebee..88d3ca96758a6 100644 --- a/vllm/core/block_manager.py +++ b/vllm/core/block_manager.py @@ -110,11 +110,13 @@ def allocate(self, self.table[block_hash] = block block.ref_count += 1 del self.free_table[block_hash] + assert block.block_hash == block_hash return block if block_hash not in self.table: self.table[block_hash] = self.allocate_block( block_hash, prefix_len) block = self.table[block_hash] + assert block.block_hash == block_hash block.ref_count += 1 return block @@ -131,7 +133,7 @@ def get_num_free_blocks(self) -> int: return self.num_blocks - self.current_num_blocks + len(self.free_table) def contains_block(self, block_hash: int) -> bool: - return block_hash in self.table + return block_hash in self.table or block_hash in self.free_table def update_hash(self, block_hash: int, block: PhysicalTokenBlock): assert (not self.contains_block(block_hash)) From 47754236324221ee0e57eb2bdd9fccdcf270194b Mon Sep 17 00:00:00 2001 From: Sage Moore Date: Wed, 14 Feb 2024 16:58:45 -0500 Subject: [PATCH 38/79] updated semantics of prefix length in block --- vllm/core/block_manager.py | 10 +++++++--- vllm/sequence.py | 7 +++++++ 2 files changed, 14 insertions(+), 3 deletions(-) diff --git a/vllm/core/block_manager.py b/vllm/core/block_manager.py index 88d3ca96758a6..225019ed87880 100644 --- a/vllm/core/block_manager.py +++ b/vllm/core/block_manager.py @@ -223,8 +223,10 @@ def allocate(self, seq_group: SequenceGroup) -> None: and logical_idx >= self.block_sliding_window): block = block_table[logical_idx % self.block_sliding_window] else: - block = self.gpu_allocator.allocate(seq.hash(logical_idx), - seq_group.get_prefix_len()) + block = self.gpu_allocator.allocate( + seq.hash(logical_idx), + seq.prefix_len_of_block(logical_idx, + seq_group.get_prefix_len())) block_table.append(block) # Assign the block table for each sequence. 
@@ -285,8 +287,10 @@ def _allocate_last_physical_block( block_hash: Optional[int] = None if (self._is_last_block_full(seq)): block_hash = seq.hash(len(seq.logical_token_blocks) - 1) + block_prefix_len = seq.prefix_len_of_block( + len(seq.logical_token_blocks) - 1, prefix_len) new_block = self.gpu_allocator.allocate(block_hash, - prefix_len=prefix_len) + prefix_len=block_prefix_len) if block_hash is None: assert (new_block.ref_count == 1) return new_block diff --git a/vllm/sequence.py b/vllm/sequence.py index 2c72be85fc520..8c4dc0f72a529 100644 --- a/vllm/sequence.py +++ b/vllm/sequence.py @@ -145,6 +145,13 @@ def hash(self, logical_idx: int) -> int: num_tokens = logical_idx * self.block_size + self.block_size return hash(tuple(self.data.get_token_ids()[0:num_tokens])) + def prefix_len_of_block(self, logical_idx: int, full_prefix_len: int): + num_tokens = logical_idx * self.block_size + self.block_size + if num_tokens > full_prefix_len: + return full_prefix_len + else: + return num_tokens + def _append_logical_block(self) -> None: block = LogicalTokenBlock( block_number=len(self.logical_token_blocks), From 5cfee5fd7435952fa9ad75abf60eb2a8ed8496f2 Mon Sep 17 00:00:00 2001 From: ElizaWszola Date: Thu, 15 Feb 2024 03:01:03 -0500 Subject: [PATCH 39/79] bring back prefix block tables --- vllm/core/block_manager.py | 8 ++++++++ vllm/core/scheduler.py | 2 ++ vllm/sequence.py | 4 ++++ vllm/worker/model_runner.py | 8 +++++++- 4 files changed, 21 insertions(+), 1 deletion(-) diff --git a/vllm/core/block_manager.py b/vllm/core/block_manager.py index 88d3ca96758a6..16a5d7f9d91ae 100644 --- a/vllm/core/block_manager.py +++ b/vllm/core/block_manager.py @@ -227,6 +227,14 @@ def allocate(self, seq_group: SequenceGroup) -> None: seq_group.get_prefix_len()) block_table.append(block) + #TODO add block ref_counts for each block in prefix? + if seq_group.prefix_pos is not None and seq_group.prefix_pos > 0 and seq_group.prefix_block_nums is None: + num_prefix_blocks = seq_group.prefix_pos // self.block_size + prefix_block_table = block_table[:num_prefix_blocks] + seq_group.prefix_block_nums = [ + block.block_number for block in prefix_block_table + ] + # Assign the block table for each sequence. 
for seq in seq_group.get_seqs(status=SequenceStatus.WAITING): self.block_tables[seq.seq_id] = block_table.copy() diff --git a/vllm/core/scheduler.py b/vllm/core/scheduler.py index b12457afa85b9..75b1bd8b5a64c 100644 --- a/vllm/core/scheduler.py +++ b/vllm/core/scheduler.py @@ -381,6 +381,8 @@ def schedule(self) -> Tuple[List[SequenceGroupMetadata], SchedulerOutputs]: sampling_params=seq_group.sampling_params, block_tables=block_tables, lora_request=seq_group.lora_request, + prefix_pos=seq_group.prefix_pos, + prefix_block_nums=seq_group.prefix_block_nums, ) seq_group_metadata_list.append(seq_group_metadata) return seq_group_metadata_list, scheduler_outputs diff --git a/vllm/sequence.py b/vllm/sequence.py index 2c72be85fc520..1d15d29cd30af 100644 --- a/vllm/sequence.py +++ b/vllm/sequence.py @@ -252,6 +252,7 @@ def __init__( arrival_time: float, lora_request: Optional[LoRARequest] = None, prefix_pos: Optional[int] = None, + prefix_block_nums: Optional[List[int]] = None, ) -> None: self.request_id = request_id self.seqs_dict = {seq.seq_id: seq for seq in seqs} @@ -261,6 +262,7 @@ def __init__( self.lora_request = lora_request self.prefix_pos: Optional[int] = prefix_pos self.prompt_logprobs: Optional[PromptLogprobs] = None + self.prefix_block_nums = prefix_block_nums @property def prompt(self) -> str: @@ -379,6 +381,7 @@ def __init__( block_tables: Dict[int, List[int]], lora_request: Optional[LoRARequest] = None, prefix_pos: Optional[int] = None, + prefix_block_nums: Optional[List[int]] = None, ) -> None: self.request_id = request_id self.is_prompt = is_prompt @@ -387,6 +390,7 @@ def __init__( self.block_tables = block_tables self.lora_request = lora_request self.prefix_pos = prefix_pos + self.prefix_block_nums = prefix_block_nums @property def lora_int_id(self) -> int: diff --git a/vllm/worker/model_runner.py b/vllm/worker/model_runner.py index 5908d577e1a28..ef7b8d1e8f2f6 100644 --- a/vllm/worker/model_runner.py +++ b/vllm/worker/model_runner.py @@ -125,7 +125,13 @@ def _prepare_prompt( prompt_len = len(prompt_tokens) prompt_lens.append(prompt_len) prefix_len = 0 - prefix_block_tables.append([]) + prefix_block_nums = seq_group_metadata.prefix_block_nums + if prefix_block_nums is not None: + prefix_len = seq_group_metadata.prefix_pos + prompt_tokens = prompt_tokens[prefix_len:] + prefix_block_tables.append(prefix_block_nums) + else: + prefix_block_tables.append([]) # actual prompt lens context_lens.append(prefix_len) subquery_lens.append(prompt_len - prefix_len) From 492507175d937688eb0f129b770e59b616527c62 Mon Sep 17 00:00:00 2001 From: ElizaWszola Date: Thu, 15 Feb 2024 03:18:21 -0500 Subject: [PATCH 40/79] Nits (style) --- vllm/block.py | 7 ++----- vllm/core/block_manager.py | 2 +- 2 files changed, 3 insertions(+), 6 deletions(-) diff --git a/vllm/block.py b/vllm/block.py index c34591e6ad236..da05d0d3f4f4d 100644 --- a/vllm/block.py +++ b/vllm/block.py @@ -62,15 +62,12 @@ def __init__( self.device = device self.block_number = block_number self.block_size = block_size - - self.ref_count = 0 - self.block_hash = block_hash + self.prefix_len = prefix_len + self.ref_count = 0 self.last_accessed = monotonic() - self.prefix_len = prefix_len - def __repr__(self) -> str: return (f'PhysicalTokenBlock(device={self.device}, ' f'block_number={self.block_number}, ' diff --git a/vllm/core/block_manager.py b/vllm/core/block_manager.py index 66021a8fb6117..602fad22418b8 100644 --- a/vllm/core/block_manager.py +++ b/vllm/core/block_manager.py @@ -139,8 +139,8 @@ def update_hash(self, block_hash: int, block: 
PhysicalTokenBlock): assert (not self.contains_block(block_hash)) old_hash = block.block_hash del self.table[old_hash] - self.table[block_hash] = block block.block_hash = block_hash + self.table[block_hash] = block class AllocStatus(enum.Enum): From 4fba5f9c28fe6c89e2abfdf239324a8ab6a07557 Mon Sep 17 00:00:00 2001 From: ElizaWszola Date: Thu, 15 Feb 2024 07:11:56 -0500 Subject: [PATCH 41/79] delete comment --- vllm/core/block_manager.py | 1 - 1 file changed, 1 deletion(-) diff --git a/vllm/core/block_manager.py b/vllm/core/block_manager.py index 602fad22418b8..90d74e95ae5ac 100644 --- a/vllm/core/block_manager.py +++ b/vllm/core/block_manager.py @@ -229,7 +229,6 @@ def allocate(self, seq_group: SequenceGroup) -> None: seq_group.get_prefix_len())) block_table.append(block) - #TODO add block ref_counts for each block in prefix? if seq_group.prefix_pos is not None and seq_group.prefix_pos > 0 and seq_group.prefix_block_nums is None: num_prefix_blocks = seq_group.prefix_pos // self.block_size prefix_block_table = block_table[:num_prefix_blocks] From 46c62e4789f613002da576936327f4ab2cd09daf Mon Sep 17 00:00:00 2001 From: Sage Moore Date: Thu, 15 Feb 2024 10:43:44 -0500 Subject: [PATCH 42/79] Added computed_block_nums --- vllm/block.py | 3 +++ vllm/core/block_manager.py | 38 ++++++++++++++++++++++++++++++------- vllm/core/scheduler.py | 6 +++++- vllm/engine/llm_engine.py | 4 ++++ vllm/sequence.py | 6 ++---- vllm/worker/model_runner.py | 9 ++++++--- 6 files changed, 51 insertions(+), 15 deletions(-) diff --git a/vllm/block.py b/vllm/block.py index da05d0d3f4f4d..5be24a6b4f88e 100644 --- a/vllm/block.py +++ b/vllm/block.py @@ -68,6 +68,9 @@ def __init__( self.ref_count = 0 self.last_accessed = monotonic() + self.computed = False + + # TODO: update this def __repr__(self) -> str: return (f'PhysicalTokenBlock(device={self.device}, ' f'block_number={self.block_number}, ' diff --git a/vllm/core/block_manager.py b/vllm/core/block_manager.py index 90d74e95ae5ac..f5bbeb0cffbc3 100644 --- a/vllm/core/block_manager.py +++ b/vllm/core/block_manager.py @@ -50,6 +50,7 @@ def lru_eviction( evicted_block = eviction_candidates[0] del free_table[evicted_block.block_hash] + evicted_block.computed = False return evicted_block @@ -229,13 +230,6 @@ def allocate(self, seq_group: SequenceGroup) -> None: seq_group.get_prefix_len())) block_table.append(block) - if seq_group.prefix_pos is not None and seq_group.prefix_pos > 0 and seq_group.prefix_block_nums is None: - num_prefix_blocks = seq_group.prefix_pos // self.block_size - prefix_block_table = block_table[:num_prefix_blocks] - seq_group.prefix_block_nums = [ - block.block_number for block in prefix_block_table - ] - # Assign the block table for each sequence. 
for seq in seq_group.get_seqs(status=SequenceStatus.WAITING): self.block_tables[seq.seq_id] = block_table.copy() @@ -472,3 +466,33 @@ def access_all_blocks_in_seq( block_table = self.block_tables[seq.seq_id] for block in block_table: block.last_accessed = access_time + + def compute_all_blocks_in_seq(self, seq: Sequence): + if seq.seq_id not in self.block_tables: + return + block_table = self.block_tables[seq.seq_id] + for block in block_table: + block.computed = True + + def get_all_computed_block_ids_2(self, seq: Sequence): + block_ids: List[int] = [] + if seq.seq_id not in self.block_tables: + return block_ids + block_table = self.block_tables[seq.seq_id] + # We want to get the first n contiguous completed blocks + for block in block_table: + if block.computed: + block_ids.append(block.block_number) + else: + return block_ids + return block_ids + + def get_all_computed_block_ids(self, + seq_group: SequenceGroup) -> List[int]: + + return self.get_all_computed_block_ids_2( + next(iter(seq_group.seqs_dict.values()))) + + def mark_blocks_as_computed(self, seq_group: SequenceGroup): + for seq in seq_group.seqs_dict.values(): + self.compute_all_blocks_in_seq(seq) diff --git a/vllm/core/scheduler.py b/vllm/core/scheduler.py index 75b1bd8b5a64c..ba80ee204c010 100644 --- a/vllm/core/scheduler.py +++ b/vllm/core/scheduler.py @@ -382,7 +382,8 @@ def schedule(self) -> Tuple[List[SequenceGroupMetadata], SchedulerOutputs]: block_tables=block_tables, lora_request=seq_group.lora_request, prefix_pos=seq_group.prefix_pos, - prefix_block_nums=seq_group.prefix_block_nums, + computed_block_nums=self.block_manager. + get_all_computed_block_ids(seq_group), ) seq_group_metadata_list.append(seq_group_metadata) return seq_group_metadata_list, scheduler_outputs @@ -492,3 +493,6 @@ def _swap_out( blocks_to_swap_out.update(mapping) for seq in seq_group.get_seqs(status=SequenceStatus.RUNNING): seq.status = SequenceStatus.SWAPPED + + def mark_blocks_as_computed(self, seq_group: SequenceGroup): + self.block_manager.mark_blocks_as_computed(seq_group) diff --git a/vllm/engine/llm_engine.py b/vllm/engine/llm_engine.py index 5317874827357..a3e672e067bf6 100644 --- a/vllm/engine/llm_engine.py +++ b/vllm/engine/llm_engine.py @@ -712,6 +712,10 @@ def _process_model_outputs( scheduler_outputs: SchedulerOutputs) -> List[RequestOutput]: # Update the scheduled sequence groups with the model outputs. 
scheduled_seq_groups = scheduler_outputs.scheduled_seq_groups + + for seq_group in scheduled_seq_groups: + self.scheduler.mark_blocks_as_computed(seq_group) + for seq_group, outputs in zip(scheduled_seq_groups, output): self._process_sequence_group_outputs(seq_group, outputs) diff --git a/vllm/sequence.py b/vllm/sequence.py index c03e9fde816ff..bc39f317b0277 100644 --- a/vllm/sequence.py +++ b/vllm/sequence.py @@ -259,7 +259,6 @@ def __init__( arrival_time: float, lora_request: Optional[LoRARequest] = None, prefix_pos: Optional[int] = None, - prefix_block_nums: Optional[List[int]] = None, ) -> None: self.request_id = request_id self.seqs_dict = {seq.seq_id: seq for seq in seqs} @@ -269,7 +268,6 @@ def __init__( self.lora_request = lora_request self.prefix_pos: Optional[int] = prefix_pos self.prompt_logprobs: Optional[PromptLogprobs] = None - self.prefix_block_nums = prefix_block_nums @property def prompt(self) -> str: @@ -388,7 +386,7 @@ def __init__( block_tables: Dict[int, List[int]], lora_request: Optional[LoRARequest] = None, prefix_pos: Optional[int] = None, - prefix_block_nums: Optional[List[int]] = None, + computed_block_nums: Optional[List[int]] = None, ) -> None: self.request_id = request_id self.is_prompt = is_prompt @@ -397,7 +395,7 @@ def __init__( self.block_tables = block_tables self.lora_request = lora_request self.prefix_pos = prefix_pos - self.prefix_block_nums = prefix_block_nums + self.computed_block_nums = computed_block_nums @property def lora_int_id(self) -> int: diff --git a/vllm/worker/model_runner.py b/vllm/worker/model_runner.py index ef7b8d1e8f2f6..382b9feb6b85c 100644 --- a/vllm/worker/model_runner.py +++ b/vllm/worker/model_runner.py @@ -125,11 +125,14 @@ def _prepare_prompt( prompt_len = len(prompt_tokens) prompt_lens.append(prompt_len) prefix_len = 0 - prefix_block_nums = seq_group_metadata.prefix_block_nums - if prefix_block_nums is not None: + + # NOTE: This only works for oooooooxxx style attention. 
+ computed_block_nums = seq_group_metadata.computed_block_nums + if computed_block_nums is not None and len( + computed_block_nums) > 0: prefix_len = seq_group_metadata.prefix_pos prompt_tokens = prompt_tokens[prefix_len:] - prefix_block_tables.append(prefix_block_nums) + prefix_block_tables.append(computed_block_nums) else: prefix_block_tables.append([]) # actual prompt lens From 38b34d82031e2b904a0c7aae64f3195ccbe20e19 Mon Sep 17 00:00:00 2001 From: ElizaWszola Date: Thu, 15 Feb 2024 11:17:34 -0500 Subject: [PATCH 43/79] pythonize get_all_computed_block_ids --- vllm/core/block_manager.py | 25 +++++++++++++------------ 1 file changed, 13 insertions(+), 12 deletions(-) diff --git a/vllm/core/block_manager.py b/vllm/core/block_manager.py index f5bbeb0cffbc3..5f278fdfa5953 100644 --- a/vllm/core/block_manager.py +++ b/vllm/core/block_manager.py @@ -1,5 +1,7 @@ """A block manager that manages token blocks.""" import enum +from itertools import takewhile +from os.path import commonprefix from time import monotonic from typing import Dict, List, Optional, Set, Tuple @@ -474,24 +476,23 @@ def compute_all_blocks_in_seq(self, seq: Sequence): for block in block_table: block.computed = True - def get_all_computed_block_ids_2(self, seq: Sequence): - block_ids: List[int] = [] + def get_all_computed_block_ids_seq(self, seq: Sequence) -> List[int]: if seq.seq_id not in self.block_tables: - return block_ids + return [] block_table = self.block_tables[seq.seq_id] # We want to get the first n contiguous completed blocks - for block in block_table: - if block.computed: - block_ids.append(block.block_number) - else: - return block_ids - return block_ids + return [ + block.block_number + for block in takewhile(lambda block: block.computed, block_table) + ] def get_all_computed_block_ids(self, seq_group: SequenceGroup) -> List[int]: - - return self.get_all_computed_block_ids_2( - next(iter(seq_group.seqs_dict.values()))) + ids_list = [ + self.get_all_computed_block_ids_seq(seq) + for seq in iter(seq_group.seqs_dict.values()) + ] + return commonprefix([ids for ids in ids_list if ids != []]) def mark_blocks_as_computed(self, seq_group: SequenceGroup): for seq in seq_group.seqs_dict.values(): From ba97f8026c827f626d7247fb3c458c9b75c5f4e5 Mon Sep 17 00:00:00 2001 From: Sage Moore Date: Thu, 15 Feb 2024 11:47:17 -0500 Subject: [PATCH 44/79] account for prefix_len=0 in _prepare_prompt --- vllm/sequence.py | 3 +++ vllm/worker/model_runner.py | 2 +- 2 files changed, 4 insertions(+), 1 deletion(-) diff --git a/vllm/sequence.py b/vllm/sequence.py index bc39f317b0277..2ec3d974fa087 100644 --- a/vllm/sequence.py +++ b/vllm/sequence.py @@ -397,6 +397,9 @@ def __init__( self.prefix_pos = prefix_pos self.computed_block_nums = computed_block_nums + def get_prefix_len(self) -> int: + return self.prefix_pos if self.prefix_pos is not None else 0 + @property def lora_int_id(self) -> int: return self.lora_request.lora_int_id if self.lora_request else 0 diff --git a/vllm/worker/model_runner.py b/vllm/worker/model_runner.py index 382b9feb6b85c..069c05a7c6402 100644 --- a/vllm/worker/model_runner.py +++ b/vllm/worker/model_runner.py @@ -130,7 +130,7 @@ def _prepare_prompt( computed_block_nums = seq_group_metadata.computed_block_nums if computed_block_nums is not None and len( computed_block_nums) > 0: - prefix_len = seq_group_metadata.prefix_pos + prefix_len = seq_group_metadata.get_prefix_len() prompt_tokens = prompt_tokens[prefix_len:] prefix_block_tables.append(computed_block_nums) else: From 
fe37722b0825fcbb20883eded201a609207ec9c5 Mon Sep 17 00:00:00 2001 From: Sage Moore Date: Thu, 15 Feb 2024 13:11:14 -0500 Subject: [PATCH 45/79] attempt to fix build --- vllm/block.py | 5 ++++- vllm/core/block_manager.py | 3 ++- vllm/core/scheduler.py | 2 +- 3 files changed, 7 insertions(+), 3 deletions(-) diff --git a/vllm/block.py b/vllm/block.py index 5be24a6b4f88e..e5f16e1bf611e 100644 --- a/vllm/block.py +++ b/vllm/block.py @@ -74,7 +74,10 @@ def __init__( def __repr__(self) -> str: return (f'PhysicalTokenBlock(device={self.device}, ' f'block_number={self.block_number}, ' - f'ref_count={self.ref_count})') + f'prefix_len={self.prefix_len}, ' + f'ref_count={self.ref_count}, ' + f'last_accessed={self.last_accessed}, ' + f'computed={self.computed})') # Mapping: logical block number -> physical block. diff --git a/vllm/core/block_manager.py b/vllm/core/block_manager.py index 5f278fdfa5953..c4de36e3c668d 100644 --- a/vllm/core/block_manager.py +++ b/vllm/core/block_manager.py @@ -475,6 +475,7 @@ def compute_all_blocks_in_seq(self, seq: Sequence): block_table = self.block_tables[seq.seq_id] for block in block_table: block.computed = True + block_table[-1].computed = False def get_all_computed_block_ids_seq(self, seq: Sequence) -> List[int]: if seq.seq_id not in self.block_tables: @@ -486,7 +487,7 @@ def get_all_computed_block_ids_seq(self, seq: Sequence) -> List[int]: for block in takewhile(lambda block: block.computed, block_table) ] - def get_all_computed_block_ids(self, + def get_common_computed_block_ids(self, seq_group: SequenceGroup) -> List[int]: ids_list = [ self.get_all_computed_block_ids_seq(seq) diff --git a/vllm/core/scheduler.py b/vllm/core/scheduler.py index ba80ee204c010..acf2a59d65bb3 100644 --- a/vllm/core/scheduler.py +++ b/vllm/core/scheduler.py @@ -383,7 +383,7 @@ def schedule(self) -> Tuple[List[SequenceGroupMetadata], SchedulerOutputs]: lora_request=seq_group.lora_request, prefix_pos=seq_group.prefix_pos, computed_block_nums=self.block_manager. 
- get_all_computed_block_ids(seq_group), + get_common_computed_block_ids(seq_group), ) seq_group_metadata_list.append(seq_group_metadata) return seq_group_metadata_list, scheduler_outputs From 28f4ad26f494faefeeba58b043ce16fd6f53fb45 Mon Sep 17 00:00:00 2001 From: Sage Moore Date: Thu, 15 Feb 2024 13:12:27 -0500 Subject: [PATCH 46/79] attempt to fix build --- vllm/core/block_manager.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/vllm/core/block_manager.py b/vllm/core/block_manager.py index c4de36e3c668d..9e01f1aa43643 100644 --- a/vllm/core/block_manager.py +++ b/vllm/core/block_manager.py @@ -488,7 +488,7 @@ def get_all_computed_block_ids_seq(self, seq: Sequence) -> List[int]: ] def get_common_computed_block_ids(self, - seq_group: SequenceGroup) -> List[int]: + seq_group: SequenceGroup) -> List[int]: ids_list = [ self.get_all_computed_block_ids_seq(seq) for seq in iter(seq_group.seqs_dict.values()) From bff30a79632e9da6adb7ec02c9c73407f102709b Mon Sep 17 00:00:00 2001 From: Sage Moore Date: Thu, 15 Feb 2024 13:19:27 -0500 Subject: [PATCH 47/79] cap computed blocks to prefix length --- vllm/core/block_manager.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/vllm/core/block_manager.py b/vllm/core/block_manager.py index 9e01f1aa43643..3565ca449a5a7 100644 --- a/vllm/core/block_manager.py +++ b/vllm/core/block_manager.py @@ -493,7 +493,8 @@ def get_common_computed_block_ids(self, self.get_all_computed_block_ids_seq(seq) for seq in iter(seq_group.seqs_dict.values()) ] - return commonprefix([ids for ids in ids_list if ids != []]) + cp = commonprefix([ids for ids in ids_list if ids != []]) + return cp[:seq_group.get_prefix_len() // 16] def mark_blocks_as_computed(self, seq_group: SequenceGroup): for seq in seq_group.seqs_dict.values(): From e829c34578a4f02dd2cea0ad49c359a86ea2ba33 Mon Sep 17 00:00:00 2001 From: Sage Moore Date: Thu, 15 Feb 2024 13:28:01 -0500 Subject: [PATCH 48/79] misc fixes --- vllm/core/block_manager.py | 2 +- vllm/core/scheduler.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/vllm/core/block_manager.py b/vllm/core/block_manager.py index 3565ca449a5a7..f4865de9f42c2 100644 --- a/vllm/core/block_manager.py +++ b/vllm/core/block_manager.py @@ -494,7 +494,7 @@ def get_common_computed_block_ids(self, for seq in iter(seq_group.seqs_dict.values()) ] cp = commonprefix([ids for ids in ids_list if ids != []]) - return cp[:seq_group.get_prefix_len() // 16] + return cp[:seq_group.get_prefix_len()] def mark_blocks_as_computed(self, seq_group: SequenceGroup): for seq in seq_group.seqs_dict.values(): diff --git a/vllm/core/scheduler.py b/vllm/core/scheduler.py index acf2a59d65bb3..3225542b7fc25 100644 --- a/vllm/core/scheduler.py +++ b/vllm/core/scheduler.py @@ -381,7 +381,7 @@ def schedule(self) -> Tuple[List[SequenceGroupMetadata], SchedulerOutputs]: sampling_params=seq_group.sampling_params, block_tables=block_tables, lora_request=seq_group.lora_request, - prefix_pos=seq_group.prefix_pos, + prefix_pos=(seq_group.prefix_pos // 16) * 16, computed_block_nums=self.block_manager. 
get_common_computed_block_ids(seq_group), ) From 7f78ad4cc135965c64fe6f1c559f997762a15d2a Mon Sep 17 00:00:00 2001 From: Sage Moore Date: Thu, 15 Feb 2024 13:29:10 -0500 Subject: [PATCH 49/79] typo --- vllm/core/block_manager.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/vllm/core/block_manager.py b/vllm/core/block_manager.py index f4865de9f42c2..ddc9e7f5d7ee0 100644 --- a/vllm/core/block_manager.py +++ b/vllm/core/block_manager.py @@ -494,7 +494,7 @@ def get_common_computed_block_ids(self, for seq in iter(seq_group.seqs_dict.values()) ] cp = commonprefix([ids for ids in ids_list if ids != []]) - return cp[:seq_group.get_prefix_len()] + return cp def mark_blocks_as_computed(self, seq_group: SequenceGroup): for seq in seq_group.seqs_dict.values(): From 18da5e6de5e30395df759d2e8ecc37eeba22393d Mon Sep 17 00:00:00 2001 From: Sage Moore Date: Thu, 15 Feb 2024 13:32:22 -0500 Subject: [PATCH 50/79] account for none --- vllm/core/scheduler.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/vllm/core/scheduler.py b/vllm/core/scheduler.py index 3225542b7fc25..f708befffae24 100644 --- a/vllm/core/scheduler.py +++ b/vllm/core/scheduler.py @@ -381,7 +381,7 @@ def schedule(self) -> Tuple[List[SequenceGroupMetadata], SchedulerOutputs]: sampling_params=seq_group.sampling_params, block_tables=block_tables, lora_request=seq_group.lora_request, - prefix_pos=(seq_group.prefix_pos // 16) * 16, + prefix_pos=(seq_group.get_prefix_len() // 16) * 16, computed_block_nums=self.block_manager. get_common_computed_block_ids(seq_group), ) From 49357be8f041bd5c5189e9f9e98e7d0679275d38 Mon Sep 17 00:00:00 2001 From: Sage Moore Date: Thu, 15 Feb 2024 13:44:15 -0500 Subject: [PATCH 51/79] block manager refactoring --- vllm/core/block_manager.py | 15 ++++++++++----- vllm/core/scheduler.py | 2 +- 2 files changed, 11 insertions(+), 6 deletions(-) diff --git a/vllm/core/block_manager.py b/vllm/core/block_manager.py index ddc9e7f5d7ee0..9603f6e6f23e5 100644 --- a/vllm/core/block_manager.py +++ b/vllm/core/block_manager.py @@ -469,13 +469,17 @@ def access_all_blocks_in_seq( for block in block_table: block.last_accessed = access_time - def compute_all_blocks_in_seq(self, seq: Sequence): + def compute_all_blocks_in_seq(self, seq: Sequence, + max_computed_blocks: int): if seq.seq_id not in self.block_tables: return block_table = self.block_tables[seq.seq_id] + counter = 0 for block in block_table: + if counter >= max_computed_blocks: + return block.computed = True - block_table[-1].computed = False + counter += 1 def get_all_computed_block_ids_seq(self, seq: Sequence) -> List[int]: if seq.seq_id not in self.block_tables: @@ -493,9 +497,10 @@ def get_common_computed_block_ids(self, self.get_all_computed_block_ids_seq(seq) for seq in iter(seq_group.seqs_dict.values()) ] - cp = commonprefix([ids for ids in ids_list if ids != []]) - return cp + return commonprefix([ids for ids in ids_list if ids != []]) def mark_blocks_as_computed(self, seq_group: SequenceGroup): for seq in seq_group.seqs_dict.values(): - self.compute_all_blocks_in_seq(seq) + self.compute_all_blocks_in_seq( + seq, + seq_group.get_prefix_len() // seq.block_size) diff --git a/vllm/core/scheduler.py b/vllm/core/scheduler.py index f708befffae24..acf2a59d65bb3 100644 --- a/vllm/core/scheduler.py +++ b/vllm/core/scheduler.py @@ -381,7 +381,7 @@ def schedule(self) -> Tuple[List[SequenceGroupMetadata], SchedulerOutputs]: sampling_params=seq_group.sampling_params, block_tables=block_tables, lora_request=seq_group.lora_request, - 
prefix_pos=(seq_group.get_prefix_len() // 16) * 16, + prefix_pos=seq_group.prefix_pos, computed_block_nums=self.block_manager. get_common_computed_block_ids(seq_group), ) From ea4ec9d57004bee4762e8534962a1687e0606f80 Mon Sep 17 00:00:00 2001 From: Sage Moore Date: Thu, 15 Feb 2024 14:17:06 -0500 Subject: [PATCH 52/79] clamp prefix length down to a multiple of block size --- vllm/core/scheduler.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/vllm/core/scheduler.py b/vllm/core/scheduler.py index acf2a59d65bb3..ef7a8c8bb81fb 100644 --- a/vllm/core/scheduler.py +++ b/vllm/core/scheduler.py @@ -381,7 +381,7 @@ def schedule(self) -> Tuple[List[SequenceGroupMetadata], SchedulerOutputs]: sampling_params=seq_group.sampling_params, block_tables=block_tables, lora_request=seq_group.lora_request, - prefix_pos=seq_group.prefix_pos, + prefix_pos=seq_group.get_prefix_len() // 16, computed_block_nums=self.block_manager. get_common_computed_block_ids(seq_group), ) From f5fa2de9b4aa6a282f28ed168fd18942730ba865 Mon Sep 17 00:00:00 2001 From: Sage Moore Date: Thu, 15 Feb 2024 15:02:22 -0500 Subject: [PATCH 53/79] minor prefix length fix --- vllm/core/scheduler.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/vllm/core/scheduler.py b/vllm/core/scheduler.py index ef7a8c8bb81fb..f708befffae24 100644 --- a/vllm/core/scheduler.py +++ b/vllm/core/scheduler.py @@ -381,7 +381,7 @@ def schedule(self) -> Tuple[List[SequenceGroupMetadata], SchedulerOutputs]: sampling_params=seq_group.sampling_params, block_tables=block_tables, lora_request=seq_group.lora_request, - prefix_pos=seq_group.get_prefix_len() // 16, + prefix_pos=(seq_group.get_prefix_len() // 16) * 16, computed_block_nums=self.block_manager. get_common_computed_block_ids(seq_group), ) From 704aa47edce334593dba90b85e5bb07d1bddb947 Mon Sep 17 00:00:00 2001 From: Sage Moore Date: Fri, 16 Feb 2024 08:15:11 -0500 Subject: [PATCH 54/79] replace 16 with block size --- vllm/core/scheduler.py | 7 ++++++- vllm/sequence.py | 4 ++++ 2 files changed, 10 insertions(+), 1 deletion(-) diff --git a/vllm/core/scheduler.py b/vllm/core/scheduler.py index f708befffae24..1698d6d15f694 100644 --- a/vllm/core/scheduler.py +++ b/vllm/core/scheduler.py @@ -368,6 +368,11 @@ def schedule(self) -> Tuple[List[SequenceGroupMetadata], SchedulerOutputs]: for seq_group in scheduler_outputs.scheduled_seq_groups: seq_data: Dict[int, SequenceData] = {} block_tables: Dict[int, List[int]] = {} + + # Round the prefix position down to the last full block + rounded_prefix_pos = (seq_group.get_prefix_len() // + seq_group.block_size) * seq_group.block_size + for seq in seq_group.get_seqs(status=SequenceStatus.RUNNING): seq_id = seq.seq_id seq_data[seq_id] = seq.data @@ -381,7 +386,7 @@ def schedule(self) -> Tuple[List[SequenceGroupMetadata], SchedulerOutputs]: sampling_params=seq_group.sampling_params, block_tables=block_tables, lora_request=seq_group.lora_request, - prefix_pos=(seq_group.get_prefix_len() // 16) * 16, + prefix_pos=rounded_prefix_pos, computed_block_nums=self.block_manager. get_common_computed_block_ids(seq_group), ) diff --git a/vllm/sequence.py b/vllm/sequence.py index 2ec3d974fa087..ad5c399f3a642 100644 --- a/vllm/sequence.py +++ b/vllm/sequence.py @@ -281,6 +281,10 @@ def prompt_token_ids(self) -> List[int]: # We use the prompt of an arbitrary sequence. 
return next(iter(self.seqs_dict.values())).data.prompt_token_ids + @property + def block_size(self) -> int: + return next(iter(self.seqs_dict.values())).block_size + @property def lora_int_id(self) -> int: return self.lora_request.lora_int_id if self.lora_request else 0 From 8771b3f82c65d6b3f967fcd32a462a5108227519 Mon Sep 17 00:00:00 2001 From: ElizaWszola Date: Wed, 21 Feb 2024 08:44:15 -0500 Subject: [PATCH 55/79] First round of feedback changes --- vllm/core/block_manager.py | 23 ++++++++++++----------- 1 file changed, 12 insertions(+), 11 deletions(-) diff --git a/vllm/core/block_manager.py b/vllm/core/block_manager.py index 9603f6e6f23e5..7a5a6aad28777 100644 --- a/vllm/core/block_manager.py +++ b/vllm/core/block_manager.py @@ -1,6 +1,6 @@ """A block manager that manages token blocks.""" import enum -from itertools import takewhile +from itertools import takewhile, count from os.path import commonprefix from time import monotonic from typing import Dict, List, Optional, Set, Tuple @@ -39,17 +39,16 @@ def lru_eviction( if block.prefix_len > highest_prefix_count: highest_prefix_count = block.prefix_len - # Find all blocks with the lowest timestamp - eviction_candidates: List[PhysicalTokenBlock] = [] + evicted_block: Optional[PhysicalTokenBlock] = None + + # Find the first block with the lowest timestamp for block in least_recent: if block.prefix_len == highest_prefix_count: - eviction_candidates.append(block) + evicted_block = block + break - # Arbitrarily evict the first candidate - if len(eviction_candidates) == 0: - raise ValueError("No usable cache memory left") + assert evicted_block is not None - evicted_block = eviction_candidates[0] del free_table[evicted_block.block_hash] evicted_block.computed = False @@ -72,13 +71,14 @@ def __init__(self, self.device = device self.block_size = block_size self.num_blocks = num_blocks - self.eviction_policy = eviction_policy self.current_num_blocks = 0 self.table: Dict[int, PhysicalTokenBlock] = {} self.free_table: Dict[int, PhysicalTokenBlock] = {} + self.default_hash_ctr = count() + def evict(self) -> PhysicalTokenBlock: if self.eviction_policy == EvictionPolicy.LRU: return lru_eviction(self.free_table) @@ -105,7 +105,7 @@ def allocate(self, block_hash: Optional[int] = None, prefix_len: int = 0) -> PhysicalTokenBlock: if block_hash is None: - block_hash = monotonic() + block_hash = next(self.default_hash_ctr) if block_hash in self.free_table: assert block_hash not in self.table block = self.free_table[block_hash] @@ -263,7 +263,8 @@ def _is_last_block_full( self, seq: Sequence, ) -> bool: - return (len(seq.data.get_token_ids())) % seq.block_size == 0 + token_ids_len = len(seq.data.get_token_ids()) + return token_ids_len > 0 and token_ids_len % seq.block_size == 0 def _is_last_block( self, From 2dba195be91fb7d8fdb4ceb060853d592a68b8fa Mon Sep 17 00:00:00 2001 From: Sage Moore Date: Wed, 21 Feb 2024 15:23:12 -0500 Subject: [PATCH 56/79] added a flag to disable automatic prefix caching --- docs/source/models/engine_args.rst | 4 ++++ vllm/config.py | 2 ++ vllm/core/block_manager.py | 4 ---- vllm/engine/arg_utils.py | 8 +++++++- vllm/engine/llm_engine.py | 7 +++++-- 5 files changed, 18 insertions(+), 7 deletions(-) diff --git a/docs/source/models/engine_args.rst b/docs/source/models/engine_args.rst index d89b795149501..945e315d663fd 100644 --- a/docs/source/models/engine_args.rst +++ b/docs/source/models/engine_args.rst @@ -81,6 +81,10 @@ Below, you can find an explanation of every engine argument for vLLM: Token block size for contiguous chunks 
of tokens. +.. option:: --disable-prefix-caching + + Disables automatic prefix caching + .. option:: --seed Random seed for operations. diff --git a/vllm/config.py b/vllm/config.py index 0b8a2a27f6d43..95466f84780af 100644 --- a/vllm/config.py +++ b/vllm/config.py @@ -295,12 +295,14 @@ def __init__( swap_space: int, cache_dtype: str, sliding_window: Optional[int] = None, + disable_prefix_caching: bool = False, ) -> None: self.block_size = block_size self.gpu_memory_utilization = gpu_memory_utilization self.swap_space_bytes = swap_space * _GB self.cache_dtype = cache_dtype self.sliding_window = sliding_window + self.disable_prefix_caching = disable_prefix_caching self._verify_args() self._verify_cache_dtype() diff --git a/vllm/core/block_manager.py b/vllm/core/block_manager.py index 7a5a6aad28777..a3596ef02f99a 100644 --- a/vllm/core/block_manager.py +++ b/vllm/core/block_manager.py @@ -377,10 +377,6 @@ def swap_in(self, seq_group: SequenceGroup) -> Dict[int, int]: for seq in seq_group.get_seqs(status=SequenceStatus.SWAPPED): new_block_table: BlockTable = [] block_table = self.block_tables[seq.seq_id] - if seq_group.prefix is not None: - for block in seq_group.prefix.block_table: - new_block_table.append(block) - block.ref_count += 1 for cpu_block in block_table: if cpu_block in mapping: diff --git a/vllm/engine/arg_utils.py b/vllm/engine/arg_utils.py index 8ac0157151d8e..1e0729ba7fb23 100644 --- a/vllm/engine/arg_utils.py +++ b/vllm/engine/arg_utils.py @@ -25,6 +25,7 @@ class EngineArgs: tensor_parallel_size: int = 1 max_parallel_loading_workers: Optional[int] = None block_size: int = 16 + disable_prefix_caching: bool = False swap_space: int = 4 # GiB gpu_memory_utilization: float = 0.90 max_num_batched_tokens: Optional[int] = None @@ -173,6 +174,10 @@ def add_cli_args( default=EngineArgs.block_size, choices=[8, 16, 32], help='token block size') + parser.add_argument('--disable-prefix-caching', + action='store_true', + help='Disables automatic prefix caching') + # TODO(woosuk): Support fine-grained seeds (e.g., seed per request). parser.add_argument('--seed', type=int, @@ -296,7 +301,8 @@ def create_engine_configs( cache_config = CacheConfig(self.block_size, self.gpu_memory_utilization, self.swap_space, self.kv_cache_dtype, - model_config.get_sliding_window()) + model_config.get_sliding_window(), + self.disable_prefix_caching) parallel_config = ParallelConfig(self.pipeline_parallel_size, self.tensor_parallel_size, self.worker_use_ray, diff --git a/vllm/engine/llm_engine.py b/vllm/engine/llm_engine.py index 97aefa35a0426..0a7d5bc71e8f0 100644 --- a/vllm/engine/llm_engine.py +++ b/vllm/engine/llm_engine.py @@ -726,8 +726,11 @@ def _process_model_outputs( # Update the scheduled sequence groups with the model outputs. 
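Note on the flag plumbing above: the boolean travels unchanged from the argparse option through EngineArgs into CacheConfig, and the engine later consults it before marking blocks as computed. A condensed standalone sketch of that flow (toy classes, not the real vLLM ones):

import argparse
from dataclasses import dataclass

@dataclass
class CacheConfig:
    block_size: int
    disable_prefix_caching: bool = False

def make_cache_config(argv=None) -> CacheConfig:
    parser = argparse.ArgumentParser()
    parser.add_argument('--block-size', type=int, default=16)
    # store_true mirrors the new --disable-prefix-caching engine flag
    parser.add_argument('--disable-prefix-caching', action='store_true')
    args = parser.parse_args(argv)
    return CacheConfig(block_size=args.block_size,
                       disable_prefix_caching=args.disable_prefix_caching)

print(make_cache_config(['--disable-prefix-caching']))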
scheduled_seq_groups = scheduler_outputs.scheduled_seq_groups - for seq_group in scheduled_seq_groups: - self.scheduler.mark_blocks_as_computed(seq_group) + # If atomatic prefix caching is disabled, all previously computed blocks + # will be recomputed + if not self.cache_config.disable_prefix_caching: + for seq_group in scheduled_seq_groups: + self.scheduler.mark_blocks_as_computed(seq_group) for seq_group, outputs in zip(scheduled_seq_groups, output): self._process_sequence_group_outputs(seq_group, outputs) From ba01fa8f53c5a3059d07a160684e52520280a39d Mon Sep 17 00:00:00 2001 From: Michael Goin Date: Wed, 21 Feb 2024 16:07:00 -0500 Subject: [PATCH 57/79] Update vllm/engine/llm_engine.py --- vllm/engine/llm_engine.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/vllm/engine/llm_engine.py b/vllm/engine/llm_engine.py index 0a7d5bc71e8f0..4a3e288482502 100644 --- a/vllm/engine/llm_engine.py +++ b/vllm/engine/llm_engine.py @@ -726,7 +726,7 @@ def _process_model_outputs( # Update the scheduled sequence groups with the model outputs. scheduled_seq_groups = scheduler_outputs.scheduled_seq_groups - # If atomatic prefix caching is disabled, all previously computed blocks + # If automatic prefix caching is disabled, all previously computed blocks # will be recomputed if not self.cache_config.disable_prefix_caching: for seq_group in scheduled_seq_groups: From 2914b5ab98207aba6aa49c77ec54b81dd32cfe3c Mon Sep 17 00:00:00 2001 From: ElizaWszola Date: Fri, 23 Feb 2024 05:41:51 -0500 Subject: [PATCH 58/79] remove explicit prefix pos --- examples/offline_inference_with_prefix.py | 11 +---- tests/prefix_caching/test_prefix_caching.py | 6 +-- tests/test_cache_block_hashing.py | 2 +- vllm/block.py | 6 +-- vllm/core/block_manager.py | 49 +++++++++------------ vllm/core/scheduler.py | 8 +--- vllm/engine/async_llm_engine.py | 14 +----- vllm/engine/llm_engine.py | 8 +--- vllm/entrypoints/api_server.py | 6 +-- vllm/entrypoints/llm.py | 14 +----- vllm/sequence.py | 22 ++------- vllm/worker/model_runner.py | 20 ++++----- 12 files changed, 48 insertions(+), 118 deletions(-) diff --git a/examples/offline_inference_with_prefix.py b/examples/offline_inference_with_prefix.py index 8ccfb1ceea731..1aa718b88907c 100644 --- a/examples/offline_inference_with_prefix.py +++ b/examples/offline_inference_with_prefix.py @@ -37,20 +37,13 @@ print("-" * 80) -# -1 since the last token can change when concatenating prompts. -prefix_pos = len(llm.llm_engine.tokenizer.encode(prefix)) - 1 - # The llm.generate call will batch all prompts and send the batch at once if resources allow. # The prefix will only be cached after the first batch is processed, so we need to call generate once # to calculate the prefix and cache it. -outputs = llm.generate(generating_prompts[0], - sampling_params, - prefix_pos=[prefix_pos]) +outputs = llm.generate(generating_prompts[0], sampling_params) # Subsequent batches can leverage the cached prefix -outputs = llm.generate(generating_prompts, - sampling_params, - prefix_pos=[prefix_pos] * len(generating_prompts)) +outputs = llm.generate(generating_prompts, sampling_params) # Print the outputs. 
You should see the same outputs as before for output in outputs: diff --git a/tests/prefix_caching/test_prefix_caching.py b/tests/prefix_caching/test_prefix_caching.py index e40ea9927bf22..ffa6fc8f91f15 100644 --- a/tests/prefix_caching/test_prefix_caching.py +++ b/tests/prefix_caching/test_prefix_caching.py @@ -36,14 +36,10 @@ def test_prefix_caching( max_tokens: int, ): llm = LLM(model=model) - # -1 since the last token can change when concatenating prompts. - prefix_pos = len(llm.llm_engine.tokenizer.encode(prefix)) - 1 prompts = [prefix + prompt for prompt in example_prompts] sampling_params = SamplingParams(temperature=0.0, max_tokens=max_tokens) outputs_without_prefix = llm.generate(prompts, sampling_params) - outputs_with_prefix = llm.generate(prompts, - sampling_params, - prefix_pos=[prefix_pos] * len(prompts)) + outputs_with_prefix = llm.generate(prompts, sampling_params) for output_without_prefix, output_with_prefix in zip( outputs_without_prefix, outputs_with_prefix): assert (output_without_prefix.outputs[0].token_ids == diff --git a/tests/test_cache_block_hashing.py b/tests/test_cache_block_hashing.py index f4eb90378eb0b..7c4ade7f8c8ed 100644 --- a/tests/test_cache_block_hashing.py +++ b/tests/test_cache_block_hashing.py @@ -58,7 +58,7 @@ def test_auto_prefix_caching(model: str, block_size: int, max_num_seqs: int): num_blocks = len(prompt_token_ids) // block_size for idx in range(num_blocks): - hashes[-1][-1].append(seq.hash(idx)) + hashes[-1][-1].append(seq.hash_of_block(idx)) seq_id += 1 diff --git a/vllm/block.py b/vllm/block.py index e5f16e1bf611e..4fc54f918554b 100644 --- a/vllm/block.py +++ b/vllm/block.py @@ -57,13 +57,13 @@ def __init__( block_number: int, block_size: int, block_hash: int, - prefix_len: int, + num_hashed_tokens: int, ) -> None: self.device = device self.block_number = block_number self.block_size = block_size self.block_hash = block_hash - self.prefix_len = prefix_len + self.num_hashed_tokens = num_hashed_tokens self.ref_count = 0 self.last_accessed = monotonic() @@ -74,7 +74,7 @@ def __init__( def __repr__(self) -> str: return (f'PhysicalTokenBlock(device={self.device}, ' f'block_number={self.block_number}, ' - f'prefix_len={self.prefix_len}, ' + f'num_hashed_tokens={self.num_hashed_tokens}, ' f'ref_count={self.ref_count}, ' f'last_accessed={self.last_accessed}, ' f'computed={self.computed})') diff --git a/vllm/core/block_manager.py b/vllm/core/block_manager.py index a3596ef02f99a..a2bf2b75e5046 100644 --- a/vllm/core/block_manager.py +++ b/vllm/core/block_manager.py @@ -34,16 +34,16 @@ def lru_eviction( least_recent.append(block) # Find highest prefix count per block - highest_prefix_count = 0 + highest_num_hashed_tokens = 0 for block in least_recent: - if block.prefix_len > highest_prefix_count: - highest_prefix_count = block.prefix_len + if block.num_hashed_tokens > highest_num_hashed_tokens: + highest_num_hashed_tokens = block.num_hashed_tokens evicted_block: Optional[PhysicalTokenBlock] = None # Find the first block with the lowest timestamp for block in least_recent: - if block.prefix_len == highest_prefix_count: + if block.num_hashed_tokens == highest_num_hashed_tokens: evicted_block = block break @@ -87,23 +87,23 @@ def evict(self) -> PhysicalTokenBlock: f"Unknown cache eviction policy: {self.eviction_policy}") def allocate_block(self, block_hash: int, - prefix_len: int) -> PhysicalTokenBlock: + num_hashed_tokens: int) -> PhysicalTokenBlock: if self.current_num_blocks == self.num_blocks: block = self.evict() block.block_hash = block_hash - 
block.prefix_len = prefix_len + block.num_hashed_tokens = num_hashed_tokens return block block = PhysicalTokenBlock(device=self.device, block_number=self.current_num_blocks, block_size=self.block_size, block_hash=block_hash, - prefix_len=prefix_len) + num_hashed_tokens=num_hashed_tokens) self.current_num_blocks += 1 return block def allocate(self, block_hash: Optional[int] = None, - prefix_len: int = 0) -> PhysicalTokenBlock: + num_hashed_tokens: int = 0) -> PhysicalTokenBlock: if block_hash is None: block_hash = next(self.default_hash_ctr) if block_hash in self.free_table: @@ -117,7 +117,7 @@ def allocate(self, return block if block_hash not in self.table: self.table[block_hash] = self.allocate_block( - block_hash, prefix_len) + block_hash, num_hashed_tokens) block = self.table[block_hash] assert block.block_hash == block_hash block.ref_count += 1 @@ -227,9 +227,8 @@ def allocate(self, seq_group: SequenceGroup) -> None: block = block_table[logical_idx % self.block_sliding_window] else: block = self.gpu_allocator.allocate( - seq.hash(logical_idx), - seq.prefix_len_of_block(logical_idx, - seq_group.get_prefix_len())) + seq.hash_of_block(logical_idx), + seq.num_hashed_tokens_of_block(logical_idx)) block_table.append(block) # Assign the block table for each sequence. @@ -249,7 +248,7 @@ def _promote_last_block( last_block: PhysicalTokenBlock, ) -> PhysicalTokenBlock: # Compute a new hash for the block so that it can be shared by other Sequences - new_hash = seq.hash(len(seq.logical_token_blocks) - 1) + new_hash = seq.hash_of_block(len(seq.logical_token_blocks) - 1) # if new_hash is already in the cached table, then free last_block and return the cached version if self.gpu_allocator.contains_block(new_hash): @@ -286,15 +285,13 @@ def _maybe_promote_last_block( def _allocate_last_physical_block( self, seq: Sequence, - prefix_len: int, ) -> PhysicalTokenBlock: block_hash: Optional[int] = None if (self._is_last_block_full(seq)): - block_hash = seq.hash(len(seq.logical_token_blocks) - 1) - block_prefix_len = seq.prefix_len_of_block( - len(seq.logical_token_blocks) - 1, prefix_len) - new_block = self.gpu_allocator.allocate(block_hash, - prefix_len=block_prefix_len) + block_hash = seq.hash_of_block(len(seq.logical_token_blocks) - 1) + num_hashed_tokens = seq.num_hashed_tokens_of_block( + len(seq.logical_token_blocks) - 1) + new_block = self.gpu_allocator.allocate(block_hash, num_hashed_tokens) if block_hash is None: assert (new_block.ref_count == 1) return new_block @@ -302,7 +299,6 @@ def _allocate_last_physical_block( def append_slot( self, seq: Sequence, - prefix_len: int, ) -> Optional[Tuple[int, int]]: """Allocate a physical slot for a new token.""" logical_blocks = seq.logical_token_blocks @@ -320,7 +316,7 @@ def append_slot( else: # The sequence has a new logical block. # Allocate a new physical block. - new_block = self._allocate_last_physical_block(seq, prefix_len) + new_block = self._allocate_last_physical_block(seq) block_table.append(new_block) return None @@ -336,7 +332,7 @@ def append_slot( else: # The last block is shared with other sequences. # Copy on Write: Allocate a new block and copy the tokens. 
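Throughout this hunk a block's identity is its content hash, which covers every token from the start of the sequence up to and including that block, so two sequences sharing a prompt prefix produce identical hashes for the shared blocks. A self-contained sketch of that scheme (hypothetical free functions that mirror, but do not copy, the Sequence methods):

from typing import List

def num_hashed_tokens_of_block(logical_idx: int, block_size: int) -> int:
    # Tokens covered by logical blocks 0..logical_idx inclusive.
    return (logical_idx + 1) * block_size

def hash_of_block(token_ids: List[int], logical_idx: int, block_size: int) -> int:
    # Hash the full prefix ending at this block, not just the block's own
    # tokens, so equal prefixes collide and can share physical blocks.
    num_tokens = num_hashed_tokens_of_block(logical_idx, block_size)
    return hash(tuple(token_ids[:num_tokens]))

tokens = list(range(40))
assert hash_of_block(tokens, 0, 16) == hash_of_block(tokens[:16] + [99] * 24, 0, 16)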
- new_block = self._allocate_last_physical_block(seq, prefix_len) + new_block = self._allocate_last_physical_block(seq) block_table[-1] = new_block self.gpu_allocator.free(last_block) @@ -384,7 +380,7 @@ def swap_in(self, seq_group: SequenceGroup) -> Dict[int, int]: gpu_block.ref_count += 1 else: gpu_block = self.gpu_allocator.allocate( - cpu_block.block_hash, cpu_block.prefix_len) + cpu_block.block_hash, cpu_block.num_hashed_tokens) mapping[cpu_block] = gpu_block new_block_table.append(gpu_block) # Free the CPU block swapped in to GPU. @@ -414,7 +410,7 @@ def swap_out(self, seq_group: SequenceGroup) -> Dict[int, int]: cpu_block.ref_count += 1 else: cpu_block = self.cpu_allocator.allocate( - gpu_block.block_hash, gpu_block.prefix_len) + gpu_block.block_hash, gpu_block.num_hashed_tokens) mapping[gpu_block] = cpu_block new_block_table.append(cpu_block) # Free the GPU block swapped out to CPU. @@ -498,6 +494,5 @@ def get_common_computed_block_ids(self, def mark_blocks_as_computed(self, seq_group: SequenceGroup): for seq in seq_group.seqs_dict.values(): - self.compute_all_blocks_in_seq( - seq, - seq_group.get_prefix_len() // seq.block_size) + self.compute_all_blocks_in_seq(seq, + seq.get_len() // seq.block_size) diff --git a/vllm/core/scheduler.py b/vllm/core/scheduler.py index 5f1644fcda945..38e470e20acae 100644 --- a/vllm/core/scheduler.py +++ b/vllm/core/scheduler.py @@ -372,10 +372,6 @@ def schedule(self) -> Tuple[List[SequenceGroupMetadata], SchedulerOutputs]: seq_data: Dict[int, SequenceData] = {} block_tables: Dict[int, List[int]] = {} - # Round the prefix position down to the last full block - rounded_prefix_pos = (seq_group.get_prefix_len() // - seq_group.block_size) * seq_group.block_size - for seq in seq_group.get_seqs(status=SequenceStatus.RUNNING): seq_id = seq.seq_id seq_data[seq_id] = seq.data @@ -389,7 +385,6 @@ def schedule(self) -> Tuple[List[SequenceGroupMetadata], SchedulerOutputs]: sampling_params=seq_group.sampling_params, block_tables=block_tables, lora_request=seq_group.lora_request, - prefix_pos=rounded_prefix_pos, computed_block_nums=self.block_manager. 
get_common_computed_block_ids(seq_group), state=seq_group.state, @@ -418,8 +413,7 @@ def _append_slot( blocks_to_copy: Dict[int, List[int]], ) -> None: for seq in seq_group.get_seqs(status=SequenceStatus.RUNNING): - ret = self.block_manager.append_slot(seq, - seq_group.get_prefix_len()) + ret = self.block_manager.append_slot(seq) if ret is not None: src_block, dst_block = ret if src_block in blocks_to_copy: diff --git a/vllm/engine/async_llm_engine.py b/vllm/engine/async_llm_engine.py index 7cba654602779..605aa1bb6bd8d 100644 --- a/vllm/engine/async_llm_engine.py +++ b/vllm/engine/async_llm_engine.py @@ -225,7 +225,6 @@ async def add_request_async( prompt_token_ids: Optional[List[int]] = None, arrival_time: Optional[float] = None, lora_request: Optional[LoRARequest] = None, - prefix_pos: Optional[int] = None, ) -> None: if lora_request is not None and not self.lora_config: raise ValueError(f"Got lora_request {lora_request} but LoRA is " @@ -245,7 +244,6 @@ async def add_request_async( sampling_params=sampling_params, arrival_time=arrival_time, lora_request=lora_request, - prefix_pos=prefix_pos, ) async def _run_workers_async( @@ -419,7 +417,6 @@ async def add_request( prompt_token_ids: Optional[List[int]] = None, arrival_time: Optional[float] = None, lora_request: Optional[LoRARequest] = None, - prefix_pos: Optional[int] = None, ) -> AsyncStream: if self.log_requests: shortened_prompt = prompt @@ -432,7 +429,6 @@ async def add_request( max_log_len] logger.info(f"Received request {request_id}: " f"prompt: {shortened_prompt!r}, " - f"prefix_pos: {prefix_pos}," f"sampling_params: {sampling_params}, " f"prompt_token_ids: {shortened_token_ids}, " f"lora_request: {lora_request}.") @@ -469,8 +465,7 @@ async def add_request( sampling_params=sampling_params, prompt_token_ids=prompt_token_ids, arrival_time=arrival_time, - lora_request=lora_request, - prefix_pos=prefix_pos) + lora_request=lora_request) return stream @@ -481,7 +476,6 @@ async def generate( request_id: str, prompt_token_ids: Optional[List[int]] = None, lora_request: Optional[LoRARequest] = None, - prefix_pos: Optional[int] = None, ) -> AsyncIterator[RequestOutput]: """Generate outputs for a request. @@ -497,11 +491,6 @@ async def generate( prompt_token_ids: The token IDs of the prompt. If None, we use the tokenizer to convert the prompts to token IDs. lora_request: LoRA request to use for generation, if any. - prefix_pos: If not None, we use the given position as the prefix - position for each prompt. We will cache the prefix's KV - cache and reuse it for the next request with the same prefix. - This is an experimental feature, and may be replaced with - automatic prefix caching in the future. Yields: The output `RequestOutput` objects from the LLMEngine for the @@ -562,7 +551,6 @@ async def generate( prompt_token_ids=prompt_token_ids, arrival_time=arrival_time, lora_request=lora_request, - prefix_pos=prefix_pos, ) async for request_output in stream: diff --git a/vllm/engine/llm_engine.py b/vllm/engine/llm_engine.py index b71846e1b466f..e126f0b12c06f 100644 --- a/vllm/engine/llm_engine.py +++ b/vllm/engine/llm_engine.py @@ -395,7 +395,6 @@ def add_request( prompt_token_ids: Optional[List[int]] = None, arrival_time: Optional[float] = None, lora_request: Optional[LoRARequest] = None, - prefix_pos: Optional[int] = None, ) -> None: """Add a request to the engine's request pool. @@ -412,11 +411,6 @@ def add_request( use the tokenizer to convert the prompts to token IDs. arrival_time: The arrival time of the request. 
If None, we use the current monotonic time. - prefix_pos: If not None, we use the given position as the prefix - position for each prompt. We will cache the prefix's KV - cache and reuse it for the next request with the same prefix. - This is an experimental feature, and may be replaced with - automatic prefix caching in the future. Details: - Set arrival_time to the current time if it is None. @@ -464,7 +458,7 @@ def add_request( # Create the sequence group. seq_group = SequenceGroup(request_id, [seq], sampling_params, - arrival_time, lora_request, prefix_pos) + arrival_time, lora_request) # Add the sequence group to the scheduler. self.scheduler.add_seq_group(seq_group) diff --git a/vllm/entrypoints/api_server.py b/vllm/entrypoints/api_server.py index e7af2c6db5e4c..1eb4ab8b06b64 100644 --- a/vllm/entrypoints/api_server.py +++ b/vllm/entrypoints/api_server.py @@ -39,15 +39,11 @@ async def generate(request: Request) -> Response: """ request_dict = await request.json() prompt = request_dict.pop("prompt") - prefix_pos = request_dict.pop("prefix_pos", None) stream = request_dict.pop("stream", False) sampling_params = SamplingParams(**request_dict) request_id = random_uuid() - results_generator = engine.generate(prompt, - sampling_params, - request_id, - prefix_pos=prefix_pos) + results_generator = engine.generate(prompt, sampling_params, request_id) # Streaming case async def stream_results() -> AsyncGenerator[bytes, None]: diff --git a/vllm/entrypoints/llm.py b/vllm/entrypoints/llm.py index fc82018d18eb6..62f1d172377f6 100644 --- a/vllm/entrypoints/llm.py +++ b/vllm/entrypoints/llm.py @@ -124,7 +124,6 @@ def generate( prompts: Optional[Union[str, List[str]]] = None, sampling_params: Optional[SamplingParams] = None, prompt_token_ids: Optional[List[List[int]]] = None, - prefix_pos: Optional[Union[int, List[int]]] = None, use_tqdm: bool = True, lora_request: Optional[LoRARequest] = None, ) -> List[RequestOutput]: @@ -140,11 +139,6 @@ def generate( None, we use the default sampling parameters. prompt_token_ids: A list of token IDs for the prompts. If None, we use the tokenizer to convert the prompts to token IDs. - prefix_pos: If not None, we use the given position as the prefix - position for each prompt. We will cache the prefix's KV - cache and reuse it for the next request with the same prefix. - This is an experimental feature, and may be replaced with - automatic prefix caching in the future. use_tqdm: Whether to use tqdm to display the progress bar. lora_request: LoRA request to use for generation, if any. @@ -171,14 +165,12 @@ def generate( prompt_token_ids) for i in range(num_requests): prompt = prompts[i] if prompts is not None else None - prefix_pos_i = prefix_pos[i] if prefix_pos is not None else None token_ids = None if prompt_token_ids is None else prompt_token_ids[ i] self._add_request(prompt, sampling_params, token_ids, - lora_request=lora_request, - prefix_pos=prefix_pos_i) + lora_request=lora_request) return self._run_engine(use_tqdm) def _add_request( @@ -187,15 +179,13 @@ def _add_request( sampling_params: SamplingParams, prompt_token_ids: Optional[List[int]], lora_request: Optional[LoRARequest] = None, - prefix_pos: Optional[int] = None, ) -> None: request_id = str(next(self.request_counter)) self.llm_engine.add_request(request_id, prompt, sampling_params, prompt_token_ids, - lora_request=lora_request, - prefix_pos=prefix_pos) + lora_request=lora_request) def _run_engine(self, use_tqdm: bool) -> List[RequestOutput]: # Initialize tqdm. 
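With the explicit prefix position removed everywhere above, the caller-visible contract reduces to "repeat the shared prefix and the cache handles the rest". A hedged usage sketch as of this patch (model and prompts are placeholders; later patches in the series make the cache opt-in via enable_prefix_caching):

from vllm import LLM, SamplingParams

prefix = "You are an expert school principal. "
questions = ["Draft one interview question.", "Draft another interview question."]
prompts = [prefix + q for q in questions]

llm = LLM(model="facebook/opt-125m")
params = SamplingParams(temperature=0.0, max_tokens=16)

# The first call computes and caches the prefix blocks; subsequent calls
# with the same prefix reuse them automatically, with no prefix_pos argument.
warmup = llm.generate(prompts[:1], params)
outputs = llm.generate(prompts, params)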
diff --git a/vllm/sequence.py b/vllm/sequence.py index 23f51b04a985f..1a7dc86718a8e 100644 --- a/vllm/sequence.py +++ b/vllm/sequence.py @@ -160,17 +160,13 @@ def __init__( def lora_int_id(self) -> int: return self.lora_request.lora_int_id if self.lora_request else 0 - def hash(self, logical_idx: int) -> int: + def hash_of_block(self, logical_idx: int) -> int: # Compute the number of tokens in the sequence num_tokens = logical_idx * self.block_size + self.block_size return hash(tuple(self.data.get_token_ids()[0:num_tokens])) - def prefix_len_of_block(self, logical_idx: int, full_prefix_len: int): - num_tokens = logical_idx * self.block_size + self.block_size - if num_tokens > full_prefix_len: - return full_prefix_len - else: - return num_tokens + def num_hashed_tokens_of_block(self, logical_idx: int): + return logical_idx * self.block_size + self.block_size def _append_logical_block(self) -> None: block = LogicalTokenBlock( @@ -276,7 +272,6 @@ class SequenceGroup: sampling_params: The sampling parameters used to generate the outputs. arrival_time: The arrival time of the request. lora_request: LoRA request. - prefix_pos: The end of prefix of the prompt of the sequence group. """ def __init__( @@ -286,7 +281,6 @@ def __init__( sampling_params: SamplingParams, arrival_time: float, lora_request: Optional[LoRARequest] = None, - prefix_pos: Optional[int] = None, ) -> None: self.request_id = request_id self.seqs_dict = {seq.seq_id: seq for seq in seqs} @@ -297,7 +291,6 @@ def __init__( first_token_time=None, time_in_queue=None) self.lora_request = lora_request - self.prefix_pos: Optional[int] = prefix_pos self.prompt_logprobs: Optional[PromptLogprobs] = None self.state = SequenceGroupState() @@ -405,9 +398,6 @@ def remove(self, seq_id: int) -> None: def is_finished(self) -> bool: return all(seq.is_finished() for seq in self.get_seqs()) - def get_prefix_len(self) -> int: - return self.prefix_pos if self.prefix_pos is not None else 0 - def __repr__(self) -> str: return (f"SequenceGroup(request_id={self.request_id}, " f"sampling_params={self.sampling_params}, " @@ -426,7 +416,6 @@ class SequenceGroupMetadata: numbers) state: Internal state tied to this sequence group. lora_request: LoRA request. - prefix_pos: The end of prefix of the prompt of the sequence group. """ def __init__( @@ -437,7 +426,6 @@ def __init__( sampling_params: SamplingParams, block_tables: Dict[int, List[int]], lora_request: Optional[LoRARequest] = None, - prefix_pos: Optional[int] = None, computed_block_nums: Optional[List[int]] = None, state: Optional[SequenceGroupState] = None, ) -> None: @@ -447,13 +435,9 @@ def __init__( self.sampling_params = sampling_params self.block_tables = block_tables self.lora_request = lora_request - self.prefix_pos = prefix_pos self.computed_block_nums = computed_block_nums self.state = SequenceGroupState() if state is None else state - def get_prefix_len(self) -> int: - return self.prefix_pos if self.prefix_pos is not None else 0 - @property def lora_int_id(self) -> int: return self.lora_request.lora_int_id if self.lora_request else 0 diff --git a/vllm/worker/model_runner.py b/vllm/worker/model_runner.py index 9ac54cdf36fa1..54e5350e68cf9 100644 --- a/vllm/worker/model_runner.py +++ b/vllm/worker/model_runner.py @@ -138,36 +138,36 @@ def _prepare_prompt( prompt_tokens = seq_data.get_token_ids() prompt_len = len(prompt_tokens) prompt_lens.append(prompt_len) - prefix_len = 0 + computed_len = 0 # NOTE: This only works for oooooooxxx style attention. 
computed_block_nums = seq_group_metadata.computed_block_nums if computed_block_nums is not None and len( computed_block_nums) > 0: - prefix_len = seq_group_metadata.get_prefix_len() - prompt_tokens = prompt_tokens[prefix_len:] + computed_len = len(computed_block_nums) * self.block_size + prompt_tokens = prompt_tokens[computed_len:] prefix_block_tables.append(computed_block_nums) else: prefix_block_tables.append([]) # actual prompt lens - context_lens.append(prefix_len) - subquery_lens.append(prompt_len - prefix_len) + context_lens.append(computed_len) + subquery_lens.append(prompt_len - computed_len) input_tokens.append(prompt_tokens) # NOTE(woosuk): Here we assume that the first token in the prompt # is always the first token in the sequence. input_positions.append( - list(range(prefix_len, prefix_len + len(prompt_tokens)))) + list(range(computed_len, computed_len + len(prompt_tokens)))) lora_id = seq_group_metadata.lora_int_id if lora_id > 0: lora_requests.add(seq_group_metadata.lora_request) - lora_index_mapping.append([lora_id] * (prompt_len - prefix_len)) + lora_index_mapping.append([lora_id] * (prompt_len - computed_len)) lora_prompt_mapping.extend( [lora_id] * - (prompt_len - prefix_len + (prompt_len - computed_len if seq_group_metadata.sampling_params.prompt_logprobs else 1)) if seq_group_metadata.block_tables is None: @@ -186,11 +186,11 @@ def _prepare_prompt( # mapping will be [-1, -1, 2, 3, 4, 5, 6, 7, 0, 1]. start_idx = 0 if self.sliding_window is not None: - assert prefix_len == 0, ( + assert computed_len == 0, ( "Prefix caching is currently not supported with " "sliding window attention") start_idx = max(0, prompt_len - self.sliding_window) - for i in range(prefix_len, prompt_len): + for i in range(computed_len, prompt_len): if i < start_idx: slot_mapping[-1].append(_PAD_SLOT_ID) continue From bd235fdac55edd39a888ecb84157f2d38846c099 Mon Sep 17 00:00:00 2001 From: ElizaWszola Date: Fri, 23 Feb 2024 06:26:48 -0500 Subject: [PATCH 59/79] remove assert for sliding window, check what will happen --- vllm/worker/model_runner.py | 3 --- 1 file changed, 3 deletions(-) diff --git a/vllm/worker/model_runner.py b/vllm/worker/model_runner.py index 54e5350e68cf9..80d72d59c7e46 100644 --- a/vllm/worker/model_runner.py +++ b/vllm/worker/model_runner.py @@ -186,9 +186,6 @@ def _prepare_prompt( # mapping will be [-1, -1, 2, 3, 4, 5, 6, 7, 0, 1]. 
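The arithmetic above is the prompt-side saving in a nutshell: the number of already-computed blocks times the block size gives how many leading prompt tokens can be skipped. A plain-Python illustration (not the real ModelRunner, which additionally builds slot mappings and LoRA indices):

from typing import List, Optional, Tuple

def split_prompt(prompt_tokens: List[int],
                 computed_block_nums: Optional[List[int]],
                 block_size: int) -> Tuple[int, List[int], List[int]]:
    # Leading tokens whose KV cache already exists are dropped from the
    # model input; only the uncached tail of the prompt is run this step.
    computed_len = 0
    if computed_block_nums:
        computed_len = len(computed_block_nums) * block_size
    remaining = prompt_tokens[computed_len:]
    positions = list(range(computed_len, computed_len + len(remaining)))
    return computed_len, remaining, positions

# A 40-token prompt whose first two 16-token blocks are already cached:
computed_len, remaining, positions = split_prompt(list(range(40)), [0, 1], 16)
print(computed_len, len(remaining), positions[0])  # 32 8 32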
start_idx = 0 if self.sliding_window is not None: - assert computed_len == 0, ( - "Prefix caching is currently not supported with " - "sliding window attention") start_idx = max(0, prompt_len - self.sliding_window) for i in range(computed_len, prompt_len): if i < start_idx: From ba382d93ca65461b6dc4d9ece908bc70055ff695 Mon Sep 17 00:00:00 2001 From: ElizaWszola Date: Fri, 23 Feb 2024 06:35:15 -0500 Subject: [PATCH 60/79] Try the other way around --- vllm/worker/model_runner.py | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/vllm/worker/model_runner.py b/vllm/worker/model_runner.py index 80d72d59c7e46..979a2503595bb 100644 --- a/vllm/worker/model_runner.py +++ b/vllm/worker/model_runner.py @@ -144,7 +144,9 @@ def _prepare_prompt( computed_block_nums = seq_group_metadata.computed_block_nums if computed_block_nums is not None and len( computed_block_nums) > 0: - computed_len = len(computed_block_nums) * self.block_size + # Prefix is not supported with sliding_window + if self.sliding_window is None: + computed_len = len(computed_block_nums) * self.block_size prompt_tokens = prompt_tokens[computed_len:] prefix_block_tables.append(computed_block_nums) else: @@ -186,6 +188,9 @@ def _prepare_prompt( # mapping will be [-1, -1, 2, 3, 4, 5, 6, 7, 0, 1]. start_idx = 0 if self.sliding_window is not None: + assert computed_len == 0, ( + "Prefix caching is currently not supported with " + "sliding window attention") start_idx = max(0, prompt_len - self.sliding_window) for i in range(computed_len, prompt_len): if i < start_idx: From 660007f4178c024adb599aa3f562f7e97adf08d4 Mon Sep 17 00:00:00 2001 From: ElizaWszola Date: Fri, 23 Feb 2024 08:36:34 -0500 Subject: [PATCH 61/79] Delete redundant prefix caching test --- tests/prefix_caching/test_prefix_caching.py | 38 --------------------- 1 file changed, 38 deletions(-) diff --git a/tests/prefix_caching/test_prefix_caching.py b/tests/prefix_caching/test_prefix_caching.py index ffa6fc8f91f15..de4a19df89097 100644 --- a/tests/prefix_caching/test_prefix_caching.py +++ b/tests/prefix_caching/test_prefix_caching.py @@ -4,47 +4,9 @@ """ import pytest -from vllm import LLM, SamplingParams from vllm.core.block_manager import BlockAllocator from vllm.utils import Device -prefix = ( - "You are an expert school principal, skilled in effectively managing " - "faculty and staff. Draft 10-15 questions for a potential first grade " - "Head Teacher for my K-12, all-girls', independent school that emphasizes " - "community, joyful discovery, and life-long learning. The candidate is " - "coming in for a first-round panel interview for a 8th grade Math " - "teaching role. They have 5 years of previous teaching experience " - "as an assistant teacher at a co-ed, public school with experience " - "in middle school math teaching. 
Based on these information, fulfill " - "the following paragraph: ") - - -def allocate_all_blocks(block_allocator, num_blocks): - blocks = [] - for i in range(num_blocks): - # use i as the block_hash - blocks.append(block_allocator.allocate(i, 0)) - return blocks - - -@pytest.mark.parametrize("model", ["facebook/opt-125m"]) -@pytest.mark.parametrize("max_tokens", [16]) -def test_prefix_caching( - example_prompts, - model: str, - max_tokens: int, -): - llm = LLM(model=model) - prompts = [prefix + prompt for prompt in example_prompts] - sampling_params = SamplingParams(temperature=0.0, max_tokens=max_tokens) - outputs_without_prefix = llm.generate(prompts, sampling_params) - outputs_with_prefix = llm.generate(prompts, sampling_params) - for output_without_prefix, output_with_prefix in zip( - outputs_without_prefix, outputs_with_prefix): - assert (output_without_prefix.outputs[0].token_ids == - output_with_prefix.outputs[0].token_ids) - @pytest.mark.parametrize("block_size", [16]) @pytest.mark.parametrize("num_blocks", [16]) From f74f67df87f8a69f4cd7f64c3cb326479b60b409 Mon Sep 17 00:00:00 2001 From: ElizaWszola Date: Fri, 23 Feb 2024 11:55:48 -0500 Subject: [PATCH 62/79] Don't add last block to --- vllm/core/block_manager.py | 11 ++++++----- 1 file changed, 6 insertions(+), 5 deletions(-) diff --git a/vllm/core/block_manager.py b/vllm/core/block_manager.py index a2bf2b75e5046..91ada347e5722 100644 --- a/vllm/core/block_manager.py +++ b/vllm/core/block_manager.py @@ -462,12 +462,12 @@ def access_all_blocks_in_seq( for block in block_table: block.last_accessed = access_time - def compute_all_blocks_in_seq(self, seq: Sequence, - max_computed_blocks: int): + def compute_all_blocks_in_seq(self, seq: Sequence): if seq.seq_id not in self.block_tables: return block_table = self.block_tables[seq.seq_id] counter = 0 + max_computed_blocks = seq.get_len() // seq.block_size for block in block_table: if counter >= max_computed_blocks: return @@ -478,10 +478,12 @@ def get_all_computed_block_ids_seq(self, seq: Sequence) -> List[int]: if seq.seq_id not in self.block_tables: return [] block_table = self.block_tables[seq.seq_id] + last_block = block_table[-1] # We want to get the first n contiguous completed blocks + # We exclude the last block because it's most likely not cached yet return [ block.block_number - for block in takewhile(lambda block: block.computed, block_table) + for block in takewhile(lambda block: block.computed and block != last_block, block_table) ] def get_common_computed_block_ids(self, @@ -494,5 +496,4 @@ def get_common_computed_block_ids(self, def mark_blocks_as_computed(self, seq_group: SequenceGroup): for seq in seq_group.seqs_dict.values(): - self.compute_all_blocks_in_seq(seq, - seq.get_len() // seq.block_size) + self.compute_all_blocks_in_seq(seq) From 093cb1cbb5a4d2ef07133d7a707d9b9d5c5fe05b Mon Sep 17 00:00:00 2001 From: ElizaWszola Date: Fri, 23 Feb 2024 11:58:50 -0500 Subject: [PATCH 63/79] Format --- vllm/core/block_manager.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/vllm/core/block_manager.py b/vllm/core/block_manager.py index 91ada347e5722..35b05e7bd5960 100644 --- a/vllm/core/block_manager.py +++ b/vllm/core/block_manager.py @@ -482,8 +482,9 @@ def get_all_computed_block_ids_seq(self, seq: Sequence) -> List[int]: # We want to get the first n contiguous completed blocks # We exclude the last block because it's most likely not cached yet return [ - block.block_number - for block in takewhile(lambda block: block.computed and block != last_block, 
block_table) + block.block_number for block in takewhile( + lambda block: block.computed and block != last_block, + block_table) ] def get_common_computed_block_ids(self, From d459d15bdff469b9558ad7b31f3c85097d72e28a Mon Sep 17 00:00:00 2001 From: Sage Moore Date: Fri, 23 Feb 2024 12:26:48 -0500 Subject: [PATCH 64/79] refactored the eviction logic into a separate class --- vllm/core/block_manager.py | 112 ++++++++++++++----------------------- vllm/core/scheduler.py | 3 +- vllm/engine/llm_engine.py | 7 +-- 3 files changed, 45 insertions(+), 77 deletions(-) diff --git a/vllm/core/block_manager.py b/vllm/core/block_manager.py index 35b05e7bd5960..ae744599dabf4 100644 --- a/vllm/core/block_manager.py +++ b/vllm/core/block_manager.py @@ -2,57 +2,12 @@ import enum from itertools import takewhile, count from os.path import commonprefix -from time import monotonic from typing import Dict, List, Optional, Set, Tuple from vllm.block import BlockTable, PhysicalTokenBlock from vllm.sequence import Sequence, SequenceGroup, SequenceStatus from vllm.utils import Device - - -class EvictionPolicy(enum.Enum): - """Enum for eviction policy used by BlockAllocator.""" - LRU = enum.auto() - - -def lru_eviction( - free_table: Dict[int, PhysicalTokenBlock]) -> PhysicalTokenBlock: - free_blocks: List[PhysicalTokenBlock] = list(free_table.values()) - if len(free_blocks) == 0: - raise ValueError("No usable cache memory left") - - # Find lowest timestamp - lowest_timestamp = monotonic() - for block in free_blocks: - if block.last_accessed < lowest_timestamp: - lowest_timestamp = block.last_accessed - - # Find all blocks with the lowest timestamp - least_recent: List[PhysicalTokenBlock] = [] - for block in free_blocks: - if block.last_accessed == lowest_timestamp: - least_recent.append(block) - - # Find highest prefix count per block - highest_num_hashed_tokens = 0 - for block in least_recent: - if block.num_hashed_tokens > highest_num_hashed_tokens: - highest_num_hashed_tokens = block.num_hashed_tokens - - evicted_block: Optional[PhysicalTokenBlock] = None - - # Find the first block with the lowest timestamp - for block in least_recent: - if block.num_hashed_tokens == highest_num_hashed_tokens: - evicted_block = block - break - - assert evicted_block is not None - - del free_table[evicted_block.block_hash] - - evicted_block.computed = False - return evicted_block +from vllm.core.evictor import Evictor, EvictionPolicy, make_evictor class BlockAllocator: @@ -67,29 +22,27 @@ def __init__(self, device: Device, block_size: int, num_blocks: int, - eviction_policy: EvictionPolicy = EvictionPolicy.LRU) -> None: + eviction_policy: EvictionPolicy = EvictionPolicy.LRU, + disable_caching: bool = False) -> None: self.device = device self.block_size = block_size self.num_blocks = num_blocks - self.eviction_policy = eviction_policy + self.disable_caching = disable_caching self.current_num_blocks = 0 self.table: Dict[int, PhysicalTokenBlock] = {} - self.free_table: Dict[int, PhysicalTokenBlock] = {} - self.default_hash_ctr = count() + # Switch over to FIFO eviction when caching is disabled + if self.disable_caching: + eviction_policy = EvictionPolicy.FIFO + self.evictor: Evictor = make_evictor(eviction_policy) - def evict(self) -> PhysicalTokenBlock: - if self.eviction_policy == EvictionPolicy.LRU: - return lru_eviction(self.free_table) - else: - raise ValueError( - f"Unknown cache eviction policy: {self.eviction_policy}") + self.default_hash_ctr = count() def allocate_block(self, block_hash: int, num_hashed_tokens: int) -> 
PhysicalTokenBlock: if self.current_num_blocks == self.num_blocks: - block = self.evict() + block = self.evictor.evict() block.block_hash = block_hash block.num_hashed_tokens = num_hashed_tokens return block @@ -104,15 +57,21 @@ def allocate_block(self, block_hash: int, def allocate(self, block_hash: Optional[int] = None, num_hashed_tokens: int = 0) -> PhysicalTokenBlock: + # If caching is disabled, just allocate a new block and return it + if self.disable_caching: + block = self.allocate_block(next(self.default_hash_ctr), + num_hashed_tokens) + block.ref_count += 1 + return block + if block_hash is None: block_hash = next(self.default_hash_ctr) - if block_hash in self.free_table: + if block_hash in self.evictor: assert block_hash not in self.table - block = self.free_table[block_hash] + block = self.evictor.remove(block_hash) assert block.ref_count == 0 self.table[block_hash] = block block.ref_count += 1 - del self.free_table[block_hash] assert block.block_hash == block_hash return block if block_hash not in self.table: @@ -128,22 +87,28 @@ def free(self, block: PhysicalTokenBlock) -> None: raise ValueError(f"Double free! {block} is already freed.") block.ref_count -= 1 if block.ref_count == 0: - assert block.block_hash not in self.free_table - self.free_table[block.block_hash] = block - del self.table[block.block_hash] + assert block.block_hash not in self.evictor + self.evictor.append(block) + + # If caching is enabled, remove the block from the table + if not self.disable_caching: + del self.table[block.block_hash] def get_num_free_blocks(self) -> int: - return self.num_blocks - self.current_num_blocks + len(self.free_table) + return self.num_blocks - self.current_num_blocks + self.evictor.num_blocks def contains_block(self, block_hash: int) -> bool: - return block_hash in self.table or block_hash in self.free_table + return block_hash in self.table or block_hash in self.evictor def update_hash(self, block_hash: int, block: PhysicalTokenBlock): assert (not self.contains_block(block_hash)) old_hash = block.block_hash - del self.table[old_hash] block.block_hash = block_hash - self.table[block_hash] = block + + # If caching is enabled, update the table + if not self.disable_caching: + del self.table[old_hash] + self.table[block_hash] = block class AllocStatus(enum.Enum): @@ -170,6 +135,7 @@ def __init__( num_cpu_blocks: int, watermark: float = 0.01, sliding_window: Optional[int] = None, + disable_caching: bool = False, ) -> None: self.block_size = block_size self.num_total_gpu_blocks = num_gpu_blocks @@ -185,10 +151,14 @@ def __init__( assert watermark >= 0.0 self.watermark_blocks = int(watermark * num_gpu_blocks) - self.gpu_allocator = BlockAllocator(Device.GPU, block_size, - num_gpu_blocks) - self.cpu_allocator = BlockAllocator(Device.CPU, block_size, - num_cpu_blocks) + self.gpu_allocator = BlockAllocator(Device.GPU, + block_size, + num_gpu_blocks, + disable_caching=disable_caching) + self.cpu_allocator = BlockAllocator(Device.CPU, + block_size, + num_cpu_blocks, + disable_caching=disable_caching) # Mapping: seq_id -> BlockTable. 
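The refactor above keeps the allocator's own bookkeeping, a hash-keyed table of live blocks plus reference counts, and delegates only the eviction decision to the new Evictor. A condensed, self-contained model of that allocate/free cycle (greatly simplified: no devices, no hash promotion, arbitrary eviction):

from dataclasses import dataclass
from typing import Dict

@dataclass
class Block:
    block_hash: int
    ref_count: int = 0

class TinyCachedAllocator:
    def __init__(self, num_blocks: int) -> None:
        self.num_blocks = num_blocks
        self.current_num_blocks = 0
        self.cached: Dict[int, Block] = {}  # live blocks, ref_count >= 1
        self.free: Dict[int, Block] = {}    # freed but still reusable

    def allocate(self, block_hash: int) -> Block:
        if block_hash in self.free:                 # revive a freed block
            block = self.free.pop(block_hash)
        elif block_hash in self.cached:             # share an existing block
            block = self.cached[block_hash]
        elif self.current_num_blocks < self.num_blocks:
            block = Block(block_hash)               # mint a brand new block
            self.current_num_blocks += 1
        else:                                       # evict some free block
            if not self.free:
                raise ValueError("No usable cache memory left")
            _, block = self.free.popitem()
            block.block_hash = block_hash
        block.ref_count += 1
        self.cached[block_hash] = block
        return block

    def free_block(self, block: Block) -> None:
        if block.ref_count == 0:
            raise ValueError(f"Double free! {block} is already freed.")
        block.ref_count -= 1
        if block.ref_count == 0:
            del self.cached[block.block_hash]
            self.free[block.block_hash] = block

a = TinyCachedAllocator(num_blocks=2)
b1 = a.allocate(111)
b2 = a.allocate(111)   # same hash: same physical block, ref_count becomes 2
assert b1 is b2 and b1.ref_count == 2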
self.block_tables: Dict[int, BlockTable] = {} diff --git a/vllm/core/scheduler.py b/vllm/core/scheduler.py index 38e470e20acae..fd8086a7adda5 100644 --- a/vllm/core/scheduler.py +++ b/vllm/core/scheduler.py @@ -94,7 +94,8 @@ def __init__( block_size=self.cache_config.block_size, num_gpu_blocks=self.cache_config.num_gpu_blocks, num_cpu_blocks=self.cache_config.num_cpu_blocks, - sliding_window=self.cache_config.sliding_window) + sliding_window=self.cache_config.sliding_window, + disable_caching=self.cache_config.disable_prefix_caching) # Sequence groups in the WAITING state. self.waiting: Deque[SequenceGroup] = deque() diff --git a/vllm/engine/llm_engine.py b/vllm/engine/llm_engine.py index e126f0b12c06f..145ef27ea8320 100644 --- a/vllm/engine/llm_engine.py +++ b/vllm/engine/llm_engine.py @@ -721,11 +721,8 @@ def _process_model_outputs( # Update the scheduled sequence groups with the model outputs. scheduled_seq_groups = scheduler_outputs.scheduled_seq_groups - # If automatic prefix caching is disabled, all previously computed blocks - # will be recomputed - if not self.cache_config.disable_prefix_caching: - for seq_group in scheduled_seq_groups: - self.scheduler.mark_blocks_as_computed(seq_group) + for seq_group in scheduled_seq_groups: + self.scheduler.mark_blocks_as_computed(seq_group) for seq_group, outputs in zip(scheduled_seq_groups, output): self._process_sequence_group_outputs(seq_group, outputs) From fea6789a8d4fb487ae6865df4a2d83a6fa4ab6b8 Mon Sep 17 00:00:00 2001 From: Sage Moore Date: Fri, 23 Feb 2024 12:55:12 -0500 Subject: [PATCH 65/79] minor fixes --- vllm/block.py | 4 +- vllm/core/evictor.py | 138 +++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 140 insertions(+), 2 deletions(-) create mode 100644 vllm/core/evictor.py diff --git a/vllm/block.py b/vllm/block.py index 4fc54f918554b..b8e4aa828496b 100644 --- a/vllm/block.py +++ b/vllm/block.py @@ -2,10 +2,10 @@ from typing import List from vllm.utils import Device -from time import monotonic _BLANK_TOKEN_ID = -1 +DEFAULT_LAST_ACCESSED_TIME = -1 class LogicalTokenBlock: """A block that stores a contiguous chunk of tokens from left to right. @@ -66,7 +66,7 @@ def __init__( self.num_hashed_tokens = num_hashed_tokens self.ref_count = 0 - self.last_accessed = monotonic() + self.last_accessed = DEFAULT_LAST_ACCESSED_TIME self.computed = False diff --git a/vllm/core/evictor.py b/vllm/core/evictor.py new file mode 100644 index 0000000000000..9c8e74a27d9da --- /dev/null +++ b/vllm/core/evictor.py @@ -0,0 +1,138 @@ +import enum +from typing import Dict, List, Optional +from abc import ABC, abstractmethod, abstractproperty + +from vllm.block import PhysicalTokenBlock, DEFAULT_LAST_ACCESSED_TIME + +class EvictionPolicy(enum.Enum): + """Enum for eviction policy used by make_evictor to instantiate the correct + Evictor subclass. 
+ """ + LRU = enum.auto() + FIFO = enum.auto() + + +class Evictor(ABC): + """ + """ + + @abstractmethod + def evict(self) -> PhysicalTokenBlock: + pass + + @abstractmethod + def __contains__(self, block_hash: int) -> bool: + pass + + @abstractmethod + def append(self, block: PhysicalTokenBlock): + pass + + @abstractmethod + def remove(self, block_hash: int) -> PhysicalTokenBlock: + pass + + @abstractproperty + def num_blocks(self) -> int: + pass + + +class LRUEvictor(Evictor): + def __init__(self): + self.free_table: Dict[int, PhysicalTokenBlock] = {} + + def __contains__(self, block_hash: int) -> bool: + return block_hash in self.free_table + + def evict(self) -> PhysicalTokenBlock: + free_blocks: List[PhysicalTokenBlock] = list(self.free_table.values()) + if len(free_blocks) == 0: + raise ValueError("No usable cache memory left") + + # Find lowest timestamp + lowest_timestamp = DEFAULT_LAST_ACCESSED_TIME + for block in free_blocks: + if block.last_accessed < lowest_timestamp: + lowest_timestamp = block.last_accessed + + # Find all blocks with the lowest timestamp + least_recent: List[PhysicalTokenBlock] = [] + for block in free_blocks: + if block.last_accessed == lowest_timestamp: + least_recent.append(block) + + # Find highest prefix count per block + highest_num_hashed_tokens = 0 + for block in least_recent: + if block.num_hashed_tokens > highest_num_hashed_tokens: + highest_num_hashed_tokens = block.num_hashed_tokens + + evicted_block: Optional[PhysicalTokenBlock] = None + + # Find the first block with the lowest timestamp + for block in least_recent: + if block.num_hashed_tokens == highest_num_hashed_tokens: + evicted_block = block + break + + assert evicted_block is not None + + del self.free_table[evicted_block.block_hash] + + evicted_block.computed = False + return evicted_block + + def append(self, block: PhysicalTokenBlock): + self.free_table[block.block_hash] = block + + def remove(self, block_hash: int) -> PhysicalTokenBlock: + if not block_hash in self.free_table: + raise AssertionError( + "Attempting to remove block that's not in the evictor") + block: PhysicalTokenBlock = self.free_table[block_hash] + del self.free_table[block_hash] + return block + + @property + def num_blocks(self) -> int: + return len(self.free_table) + + +class FIFOEvictor(Evictor): + """Evicts in a first-in-first-out order""" + + def __init__(self): + self.free_list: List[PhysicalTokenBlock] = [] + + def __contains__(self, block_hash: int) -> bool: + return any(block_hash == free_block.block_hash + for free_block in self.free_list) + + def evict(self) -> PhysicalTokenBlock: + if len(self.free_list) == 0: + raise ValueError("No usable cache memory left") + return self.free_list.popleft() + + def append(self, block: PhysicalTokenBlock): + self.free_list.append(block) + + def remove(self, block_hash: int) -> PhysicalTokenBlock: + for free_block in self.free_list: + if block_hash == free_block.block_hash: + self.free_list.remove(free_block) + return free_block + raise AssertionError( + "Attempting to remove block that's not in the evictor") + + @property + def num_blocks(self) -> int: + return len(self.free_list) + + +def make_evictor(eviction_policy: EvictionPolicy) -> Evictor: + if eviction_policy == EvictionPolicy.LRU: + return LRUEvictor() + elif eviction_policy == EvictionPolicy.FIFO: + return FIFOEvictor() + else: + raise ValueError(f"Unknown cache eviction policy: {eviction_policy}") \ No newline at end of file From 052c29452b7d1dbffb938ff3ee0667f8ab4aab51 Mon Sep 17 00:00:00 2001 From: Sage Moore 
Date: Fri, 23 Feb 2024 13:01:53 -0500 Subject: [PATCH 66/79] format evictor file --- vllm/block.py | 2 +- vllm/core/evictor.py | 12 +++++++----- vllm/core/scheduler.py | 1 - 3 files changed, 8 insertions(+), 7 deletions(-) diff --git a/vllm/block.py b/vllm/block.py index b8e4aa828496b..2cc6b947f2255 100644 --- a/vllm/block.py +++ b/vllm/block.py @@ -7,6 +7,7 @@ DEFAULT_LAST_ACCESSED_TIME = -1 + class LogicalTokenBlock: """A block that stores a contiguous chunk of tokens from left to right. @@ -70,7 +71,6 @@ def __init__( self.computed = False - # TODO: update this def __repr__(self) -> str: return (f'PhysicalTokenBlock(device={self.device}, ' f'block_number={self.block_number}, ' diff --git a/vllm/core/evictor.py b/vllm/core/evictor.py index 9c8e74a27d9da..d42e52065a3b2 100644 --- a/vllm/core/evictor.py +++ b/vllm/core/evictor.py @@ -4,6 +4,7 @@ from vllm.block import PhysicalTokenBlock, DEFAULT_LAST_ACCESSED_TIME + class EvictionPolicy(enum.Enum): """Enum for eviction policy used by make_evictor to instantiate the correct Evictor subclass. @@ -38,6 +39,7 @@ def num_blocks(self) -> int: class LRUEvictor(Evictor): + def __init__(self): self.free_table: Dict[int, PhysicalTokenBlock] = {} @@ -86,9 +88,9 @@ def append(self, block: PhysicalTokenBlock): self.free_table[block.block_hash] = block def remove(self, block_hash: int) -> PhysicalTokenBlock: - if not block_hash in self.free_table: + if block_hash not in self.free_table: raise AssertionError( - "Attempting to remove block that's not in the evictor") + "Attempting to remove block that's not in the evictor") block: PhysicalTokenBlock = self.free_table[block_hash] del self.free_table[block_hash] return block @@ -100,7 +102,7 @@ def num_blocks(self) -> int: class FIFOEvictor(Evictor): """Evicts in a first-in-first-out order""" - + def __init__(self): self.free_list: List[PhysicalTokenBlock] = [] @@ -123,7 +125,7 @@ def remove(self, block_hash: int) -> PhysicalTokenBlock: return free_block raise AssertionError( "Attempting to remove block that's not in the evictor") - + @property def num_blocks(self) -> int: return len(self.free_list) @@ -135,4 +137,4 @@ def make_evictor(eviction_policy: EvictionPolicy) -> Evictor: elif eviction_policy == EvictionPolicy.FIFO: return FIFOEvictor() else: - raise ValueError(f"Unknown cache eviction policy: {eviction_policy}") \ No newline at end of file + raise ValueError(f"Unknown cache eviction policy: {eviction_policy}") diff --git a/vllm/core/scheduler.py b/vllm/core/scheduler.py index fd8086a7adda5..741a6ed69d838 100644 --- a/vllm/core/scheduler.py +++ b/vllm/core/scheduler.py @@ -364,7 +364,6 @@ def schedule(self) -> Tuple[List[SequenceGroupMetadata], SchedulerOutputs]: scheduler_outputs = self._schedule() now = time.time() - now = time.monotonic() # Create input data structures. 
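For the LRU policy in the evictor introduced above, the victim is the free block with the oldest last_accessed stamp, and ties are broken toward the block covering the most hashed tokens. A quick standalone check of that rule (toy data, not the vLLM classes):

from dataclasses import dataclass
from typing import List

@dataclass
class FreeBlock:
    block_hash: int
    last_accessed: float
    num_hashed_tokens: int

def pick_lru_victim(free_blocks: List[FreeBlock]) -> FreeBlock:
    # Oldest access time first; among equally old blocks, evict the one
    # that hashes the longest prefix.
    oldest = min(b.last_accessed for b in free_blocks)
    candidates = [b for b in free_blocks if b.last_accessed == oldest]
    return max(candidates, key=lambda b: b.num_hashed_tokens)

blocks = [FreeBlock(1, 10.0, 16), FreeBlock(2, 10.0, 48), FreeBlock(3, 12.0, 64)]
assert pick_lru_victim(blocks).block_hash == 2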
seq_group_metadata_list: List[SequenceGroupMetadata] = [] for seq_group in scheduler_outputs.scheduled_seq_groups: From e26cd8e3c5c99bb1705278fa71377a070b4b8fc1 Mon Sep 17 00:00:00 2001 From: Sage Moore Date: Fri, 23 Feb 2024 13:24:01 -0500 Subject: [PATCH 67/79] added documentation to the evictor class --- vllm/core/evictor.py | 26 ++++++++++++++++++++++---- 1 file changed, 22 insertions(+), 4 deletions(-) diff --git a/vllm/core/evictor.py b/vllm/core/evictor.py index d42e52065a3b2..1fe1286d3b2d3 100644 --- a/vllm/core/evictor.py +++ b/vllm/core/evictor.py @@ -14,23 +14,35 @@ class EvictionPolicy(enum.Enum): class Evictor(ABC): - """ + """The Evictor subclasses should be used by the BlockAllocator class to + handle eviction of freed PhysicalTokenBlocks. """ @abstractmethod - def evict(self) -> PhysicalTokenBlock: + def __init__(self): pass @abstractmethod def __contains__(self, block_hash: int) -> bool: pass + @abstractmethod + def evict(self) -> PhysicalTokenBlock: + """Runs the eviction algorithm and returns the evicted block""" + pass + @abstractmethod def append(self, block: PhysicalTokenBlock): + """Adds block to the evictor, making it a candidate for eviction""" pass @abstractmethod def remove(self, block_hash: int) -> PhysicalTokenBlock: + """Simply removes the block with the hash value block_hash from the + evictor. Caller is responsible for making sure that block_hash is contained + in the evictor before calling remove. Should be used to "bring back" blocks + that have been freed but not evicted yet. + """ pass @abstractproperty @@ -39,6 +51,12 @@ def num_blocks(self) -> int: class LRUEvictor(Evictor): + """Evicts in a least-recently-used order using the last_accessed timestamp + that's recorded in the PhysicalTokenBlock. If there are multiple blocks with + the same last_accessed time, then the one with the largest num_hashed_tokens + will be evicted. If two blocks each have the lowest last_accessed time and + highest num_hashed_tokens value, then one will be chose arbitrarily + """ def __init__(self): self.free_table: Dict[int, PhysicalTokenBlock] = {} @@ -89,7 +107,7 @@ def append(self, block: PhysicalTokenBlock): def remove(self, block_hash: int) -> PhysicalTokenBlock: if block_hash not in self.free_table: - raise AssertionError( + raise ValueError( "Attempting to remove block that's not in the evictor") block: PhysicalTokenBlock = self.free_table[block_hash] del self.free_table[block_hash] @@ -123,7 +141,7 @@ def remove(self, block_hash: int) -> PhysicalTokenBlock: if block_hash == free_block.block_hash: self.free_list.remove(free_block) return free_block - raise AssertionError( + raise ValueError( "Attempting to remove block that's not in the evictor") @property From 2335360d4fa3adeacbfae7ec159b3e9cbb98cc3e Mon Sep 17 00:00:00 2001 From: Sage Moore Date: Fri, 23 Feb 2024 14:26:41 -0500 Subject: [PATCH 68/79] delete newline --- vllm/block.py | 1 - 1 file changed, 1 deletion(-) diff --git a/vllm/block.py b/vllm/block.py index 2cc6b947f2255..a43da8ac6777b 100644 --- a/vllm/block.py +++ b/vllm/block.py @@ -7,7 +7,6 @@ DEFAULT_LAST_ACCESSED_TIME = -1 - class LogicalTokenBlock: """A block that stores a contiguous chunk of tokens from left to right. 
From d66154c558a5ba50539a7e1b5d592b20aa305ff4 Mon Sep 17 00:00:00 2001 From: Sage Moore Date: Fri, 23 Feb 2024 14:30:57 -0500 Subject: [PATCH 69/79] format --- vllm/block.py | 1 + 1 file changed, 1 insertion(+) diff --git a/vllm/block.py b/vllm/block.py index a43da8ac6777b..2cc6b947f2255 100644 --- a/vllm/block.py +++ b/vllm/block.py @@ -7,6 +7,7 @@ DEFAULT_LAST_ACCESSED_TIME = -1 + class LogicalTokenBlock: """A block that stores a contiguous chunk of tokens from left to right. From 6a3843968f1daccd0888755bad3e954ebeef98e3 Mon Sep 17 00:00:00 2001 From: ElizaWszola Date: Wed, 28 Feb 2024 08:06:02 -0500 Subject: [PATCH 70/79] Fix timestamp in eviction policy --- vllm/core/evictor.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/vllm/core/evictor.py b/vllm/core/evictor.py index 1fe1286d3b2d3..8c6d520b4f53a 100644 --- a/vllm/core/evictor.py +++ b/vllm/core/evictor.py @@ -2,7 +2,7 @@ from typing import Dict, List, Optional from abc import ABC, abstractmethod, abstractproperty -from vllm.block import PhysicalTokenBlock, DEFAULT_LAST_ACCESSED_TIME +from vllm.block import PhysicalTokenBlock class EvictionPolicy(enum.Enum): @@ -70,7 +70,7 @@ def evict(self) -> PhysicalTokenBlock: raise ValueError("No usable cache memory left") # Find lowest timestamp - lowest_timestamp = DEFAULT_LAST_ACCESSED_TIME + lowest_timestamp = free_blocks[0].last_accessed for block in free_blocks: if block.last_accessed < lowest_timestamp: lowest_timestamp = block.last_accessed From a449eb67f35b6f310459e9b6c046eabf980a6e91 Mon Sep 17 00:00:00 2001 From: Sage Moore Date: Thu, 29 Feb 2024 14:01:53 +0000 Subject: [PATCH 71/79] addressing review comments --- vllm/config.py | 4 ++-- vllm/core/block_manager.py | 48 +++++++++++++++++++------------------- vllm/core/evictor.py | 36 ++++++++++++++-------------- vllm/core/scheduler.py | 2 +- vllm/engine/arg_utils.py | 4 ++-- vllm/sequence.py | 4 +++- 6 files changed, 50 insertions(+), 48 deletions(-) diff --git a/vllm/config.py b/vllm/config.py index e3b08bdc72c67..03f8cbd0c3d29 100644 --- a/vllm/config.py +++ b/vllm/config.py @@ -295,14 +295,14 @@ def __init__( swap_space: int, cache_dtype: str, sliding_window: Optional[int] = None, - disable_prefix_caching: bool = False, + enable_prefix_caching: bool = False, ) -> None: self.block_size = block_size self.gpu_memory_utilization = gpu_memory_utilization self.swap_space_bytes = swap_space * _GB self.cache_dtype = cache_dtype self.sliding_window = sliding_window - self.disable_prefix_caching = disable_prefix_caching + self.enable_prefix_caching = enable_prefix_caching self._verify_args() self._verify_cache_dtype() diff --git a/vllm/core/block_manager.py b/vllm/core/block_manager.py index 31e58fab916ec..1905dec232595 100644 --- a/vllm/core/block_manager.py +++ b/vllm/core/block_manager.py @@ -23,17 +23,17 @@ def __init__(self, block_size: int, num_blocks: int, eviction_policy: EvictionPolicy = EvictionPolicy.LRU, - disable_caching: bool = False) -> None: + enable_caching: bool = False) -> None: self.device = device self.block_size = block_size self.num_blocks = num_blocks - self.disable_caching = disable_caching + self.enable_caching = enable_caching self.current_num_blocks = 0 - self.table: Dict[int, PhysicalTokenBlock] = {} + self.cached_blocks: Dict[int, PhysicalTokenBlock] = {} # Switch over to FIFO eviction when caching is disabled - if self.disable_caching: + if not self.enable_caching: eviction_policy = EvictionPolicy.FIFO self.evictor: Evictor = make_evictor(eviction_policy) @@ -58,7 +58,7 @@ def 
allocate(self, block_hash: Optional[int] = None, num_hashed_tokens: int = 0) -> PhysicalTokenBlock: # If caching is disabled, just allocate a new block and return it - if self.disable_caching: + if not self.enable_caching: block = self.allocate_block(next(self.default_hash_ctr), num_hashed_tokens) block.ref_count += 1 @@ -67,17 +67,17 @@ def allocate(self, if block_hash is None: block_hash = next(self.default_hash_ctr) if block_hash in self.evictor: - assert block_hash not in self.table + assert block_hash not in self.cached_blocks block = self.evictor.remove(block_hash) assert block.ref_count == 0 - self.table[block_hash] = block + self.cached_blocks[block_hash] = block block.ref_count += 1 assert block.block_hash == block_hash return block - if block_hash not in self.table: - self.table[block_hash] = self.allocate_block( + if block_hash not in self.cached_blocks: + self.cached_blocks[block_hash] = self.allocate_block( block_hash, num_hashed_tokens) - block = self.table[block_hash] + block = self.cached_blocks[block_hash] assert block.block_hash == block_hash block.ref_count += 1 return block @@ -88,27 +88,27 @@ def free(self, block: PhysicalTokenBlock) -> None: block.ref_count -= 1 if block.ref_count == 0: assert block.block_hash not in self.evictor - self.evictor.append(block) + self.evictor.add(block) - # If caching is enabled, remove the block from the table - if not self.disable_caching: - del self.table[block.block_hash] + # If caching is enabled, remove the block from the cached_blocks + if self.enable_caching: + del self.cached_blocks[block.block_hash] def get_num_free_blocks(self) -> int: return self.num_blocks - self.current_num_blocks + self.evictor.num_blocks def contains_block(self, block_hash: int) -> bool: - return block_hash in self.table or block_hash in self.evictor + return block_hash in self.cached_blocks or block_hash in self.evictor def update_hash(self, block_hash: int, block: PhysicalTokenBlock): - assert (not self.contains_block(block_hash)) + assert not self.contains_block(block_hash) old_hash = block.block_hash block.block_hash = block_hash - # If caching is enabled, update the table - if not self.disable_caching: - del self.table[old_hash] - self.table[block_hash] = block + # If caching is enabled, update the cached_blocks + if self.enable_caching: + del self.cached_blocks[old_hash] + self.cached_blocks[block_hash] = block class AllocStatus(enum.Enum): @@ -135,7 +135,7 @@ def __init__( num_cpu_blocks: int, watermark: float = 0.01, sliding_window: Optional[int] = None, - disable_caching: bool = False, + enable_caching: bool = False, ) -> None: self.block_size = block_size self.num_total_gpu_blocks = num_gpu_blocks @@ -154,11 +154,11 @@ def __init__( self.gpu_allocator = BlockAllocator(Device.GPU, block_size, num_gpu_blocks, - disable_caching=disable_caching) + enable_caching=enable_caching) self.cpu_allocator = BlockAllocator(Device.CPU, block_size, num_cpu_blocks, - disable_caching=disable_caching) + enable_caching=enable_caching) # Mapping: seq_id -> BlockTable. 
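One detail of the allocate() path above: when caching is disabled, or a block has no content hash yet, its hash is drawn from an itertools.count() counter, so such blocks can never collide with a cached block or be shared. A tiny sketch of that split (an assumed, simplified helper, not the allocator itself):

from itertools import count
from typing import Optional

class BlockHashSource:
    def __init__(self, enable_caching: bool) -> None:
        self.enable_caching = enable_caching
        self._default_hash_ctr = count()  # unique ids for uncached blocks

    def hash_for_block(self, content_hash: Optional[int] = None) -> int:
        if not self.enable_caching or content_hash is None:
            return next(self._default_hash_ctr)
        return content_hash

src = BlockHashSource(enable_caching=False)
assert src.hash_for_block(1234) != src.hash_for_block(1234)  # never shared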
self.block_tables: Dict[int, BlockTable] = {} @@ -263,7 +263,7 @@ def _allocate_last_physical_block( len(seq.logical_token_blocks) - 1) new_block = self.gpu_allocator.allocate(block_hash, num_hashed_tokens) if block_hash is None: - assert (new_block.ref_count == 1) + assert new_block.ref_count == 1 return new_block def append_slot( diff --git a/vllm/core/evictor.py b/vllm/core/evictor.py index 8c6d520b4f53a..62757c74922cf 100644 --- a/vllm/core/evictor.py +++ b/vllm/core/evictor.py @@ -32,7 +32,7 @@ def evict(self) -> PhysicalTokenBlock: pass @abstractmethod - def append(self, block: PhysicalTokenBlock): + def add(self, block: PhysicalTokenBlock): """Adds block to the evictor, making it a candidate for eviction""" pass @@ -64,6 +64,7 @@ def __init__(self): def __contains__(self, block_hash: int) -> bool: return block_hash in self.free_table + # TODO: The performance of this evict function can be optimized further. def evict(self) -> PhysicalTokenBlock: free_blocks: List[PhysicalTokenBlock] = list(self.free_table.values()) if len(free_blocks) == 0: @@ -102,7 +103,7 @@ def evict(self) -> PhysicalTokenBlock: evicted_block.computed = False return evicted_block - def append(self, block: PhysicalTokenBlock): + def add(self, block: PhysicalTokenBlock): self.free_table[block.block_hash] = block def remove(self, block_hash: int) -> PhysicalTokenBlock: @@ -118,41 +119,40 @@ def num_blocks(self) -> int: return len(self.free_table) -class FIFOEvictor(Evictor): +class RandomEvictor(Evictor): """Evicts in a first-in-first-out order""" def __init__(self): - self.free_list: List[PhysicalTokenBlock] = [] + self.free_table: Dict[int, PhysicalTokenBlock] = {} def __contains__(self, block_hash: int) -> bool: - return any(block_hash == free_block.block_hash - for free_block in self.free_list) + return block_hash in self.free_table def evict(self) -> PhysicalTokenBlock: - if len(self.free_list) == 0: + if len(self.free_table) == 0: raise ValueError("No usable cache memory left") - return self.free_list.popleft() + return next(iter(self.free_table.values())) - def append(self, block: PhysicalTokenBlock): - self.free_list.append(block) + def add(self, block: PhysicalTokenBlock): + self.free_table[block.block_hash] = block def remove(self, block_hash: int) -> PhysicalTokenBlock: - for free_block in self.free_list: - if block_hash == free_block.block_hash: - self.free_list.remove(free_block) - return free_block - raise ValueError( - "Attempting to remove block that's not in the evictor") + if block_hash not in self.free_table: + raise ValueError( + "Attempting to remove block that's not in the evictor") + block: PhysicalTokenBlock = self.free_table[block_hash] + del self.free_table[block_hash] + return block @property def num_blocks(self) -> int: - return len(self.free_list) + return len(self.free_table) def make_evictor(eviction_policy: EvictionPolicy) -> Evictor: if eviction_policy == EvictionPolicy.LRU: return LRUEvictor() elif eviction_policy == EvictionPolicy.FIFO: - return FIFOEvictor() + return RandomEvictor() else: raise ValueError(f"Unknown cache eviction policy: {eviction_policy}") diff --git a/vllm/core/scheduler.py b/vllm/core/scheduler.py index 78cf9324585fb..1ae58f525b0fb 100644 --- a/vllm/core/scheduler.py +++ b/vllm/core/scheduler.py @@ -95,7 +95,7 @@ def __init__( num_gpu_blocks=self.cache_config.num_gpu_blocks, num_cpu_blocks=self.cache_config.num_cpu_blocks, sliding_window=self.cache_config.sliding_window, - disable_caching=self.cache_config.disable_prefix_caching) + 
enable_caching=self.cache_config.enable_prefix_caching) # Sequence groups in the WAITING state. self.waiting: Deque[SequenceGroup] = deque() diff --git a/vllm/engine/arg_utils.py b/vllm/engine/arg_utils.py index ac70310cd2a9a..55d5b1c0c6a80 100644 --- a/vllm/engine/arg_utils.py +++ b/vllm/engine/arg_utils.py @@ -25,7 +25,7 @@ class EngineArgs: tensor_parallel_size: int = 1 max_parallel_loading_workers: Optional[int] = None block_size: int = 16 - disable_prefix_caching: bool = False + enable_prefix_caching: bool = False swap_space: int = 4 # GiB gpu_memory_utilization: float = 0.90 max_num_batched_tokens: Optional[int] = None @@ -302,7 +302,7 @@ def create_engine_configs( self.gpu_memory_utilization, self.swap_space, self.kv_cache_dtype, model_config.get_sliding_window(), - self.disable_prefix_caching) + self.enable_prefix_caching) parallel_config = ParallelConfig(self.pipeline_parallel_size, self.tensor_parallel_size, self.worker_use_ray, diff --git a/vllm/sequence.py b/vllm/sequence.py index 1a7dc86718a8e..122960035e505 100644 --- a/vllm/sequence.py +++ b/vllm/sequence.py @@ -160,9 +160,11 @@ def __init__( def lora_int_id(self) -> int: return self.lora_request.lora_int_id if self.lora_request else 0 + # TODO The current hashing function is O(L^2). We should optimize this in + # the future. def hash_of_block(self, logical_idx: int) -> int: # Compute the number of tokens in the sequence - num_tokens = logical_idx * self.block_size + self.block_size + num_tokens = self.num_hashed_tokens_of_block(logical_idx) return hash(tuple(self.data.get_token_ids()[0:num_tokens])) def num_hashed_tokens_of_block(self, logical_idx: int): From 30708b891ffc379f6c54d302eaee30f5e3245b6d Mon Sep 17 00:00:00 2001 From: Sage Moore Date: Thu, 29 Feb 2024 14:58:30 +0000 Subject: [PATCH 72/79] minor evictor fix --- benchmarks/benchmark_throughput.py | 8 +++++++- vllm/core/evictor.py | 5 ++++- 2 files changed, 11 insertions(+), 2 deletions(-) diff --git a/benchmarks/benchmark_throughput.py b/benchmarks/benchmark_throughput.py index 1ad502526c97c..be774e7fc5584 100644 --- a/benchmarks/benchmark_throughput.py +++ b/benchmarks/benchmark_throughput.py @@ -73,6 +73,7 @@ def run_vllm( enforce_eager: bool, kv_cache_dtype: str, device: str, + enable_prefix_caching: bool, ) -> float: from vllm import LLM, SamplingParams llm = LLM( @@ -87,6 +88,7 @@ def run_vllm( enforce_eager=enforce_eager, kv_cache_dtype=kv_cache_dtype, device=device, + enable_prefix_caching=enable_prefix_caching ) # Add the requests to the engine. 
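# ---------------------------------------------------------------------------
# Editor's note (illustration only, not part of the patch series): the
# hash_of_block() change in vllm/sequence.py above makes block i's hash cover
# every token from the start of the sequence through the end of block i, via
# num_hashed_tokens_of_block(). The standalone sketch below shows the idea
# with a toy function; toy_hash_of_block, BLOCK_SIZE, prompt_a and prompt_b
# are hypothetical names, not vLLM APIs.
from typing import List

BLOCK_SIZE = 4

def toy_hash_of_block(token_ids: List[int], logical_idx: int) -> int:
    # Hash all tokens from position 0 through the end of block `logical_idx`,
    # which is what the prefix-caching hash in the diff above is built on.
    num_tokens = (logical_idx + 1) * BLOCK_SIZE
    return hash(tuple(token_ids[:num_tokens]))

prompt_a = [1, 2, 3, 4, 5, 6, 7, 8]  # two full blocks
prompt_b = [1, 2, 3, 4, 9, 9, 9, 9]  # shares only the first block with prompt_a

# A shared prefix yields an identical hash for block 0, so that physical block
# can be reused across sequences; the prompts diverge in block 1, so those
# hashes differ.
assert toy_hash_of_block(prompt_a, 0) == toy_hash_of_block(prompt_b, 0)
assert toy_hash_of_block(prompt_a, 1) != toy_hash_of_block(prompt_b, 1)

# Hashing block i touches (i + 1) * BLOCK_SIZE tokens, so hashing every block
# of an L-token sequence is O(L^2) overall -- the TODO added in
# vllm/sequence.py flags exactly this as a future optimization.
# ---------------------------------------------------------------------------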
@@ -211,7 +213,7 @@ def main(args: argparse.Namespace): args.seed, args.n, args.use_beam_search, args.trust_remote_code, args.dtype, args.max_model_len, args.enforce_eager, - args.kv_cache_dtype, args.device) + args.kv_cache_dtype, args.device, args.enable_prefix_caching) elif args.backend == "hf": assert args.tensor_parallel_size == 1 elapsed_time = run_hf(requests, args.model, tokenizer, args.n, @@ -302,6 +304,10 @@ def main(args: argparse.Namespace): default="cuda", choices=["cuda"], help='device type for vLLM execution, supporting CUDA only currently.') + parser.add_argument( + "--enable_prefix_caching", + action='store_true' + ) args = parser.parse_args() if args.tokenizer is None: args.tokenizer = args.model diff --git a/vllm/core/evictor.py b/vllm/core/evictor.py index 62757c74922cf..b538ea574b604 100644 --- a/vllm/core/evictor.py +++ b/vllm/core/evictor.py @@ -131,7 +131,10 @@ def __contains__(self, block_hash: int) -> bool: def evict(self) -> PhysicalTokenBlock: if len(self.free_table) == 0: raise ValueError("No usable cache memory left") - return next(iter(self.free_table.values())) + evicted_block = next(iter(self.free_table.values())) + evicted_block.computed = False + del self.free_table[evicted_block.block_hash] + return evicted_block def add(self, block: PhysicalTokenBlock): self.free_table[block.block_hash] = block From 4e996602cdee3bc7a13a046743db8a5cd9e29611 Mon Sep 17 00:00:00 2001 From: Sage Moore Date: Thu, 29 Feb 2024 16:08:10 +0000 Subject: [PATCH 73/79] format --- benchmarks/benchmark_throughput.py | 34 +++++++++++++----------------- 1 file changed, 15 insertions(+), 19 deletions(-) diff --git a/benchmarks/benchmark_throughput.py b/benchmarks/benchmark_throughput.py index be774e7fc5584..51c1a6540a451 100644 --- a/benchmarks/benchmark_throughput.py +++ b/benchmarks/benchmark_throughput.py @@ -76,20 +76,18 @@ def run_vllm( enable_prefix_caching: bool, ) -> float: from vllm import LLM, SamplingParams - llm = LLM( - model=model, - tokenizer=tokenizer, - quantization=quantization, - tensor_parallel_size=tensor_parallel_size, - seed=seed, - trust_remote_code=trust_remote_code, - dtype=dtype, - max_model_len=max_model_len, - enforce_eager=enforce_eager, - kv_cache_dtype=kv_cache_dtype, - device=device, - enable_prefix_caching=enable_prefix_caching - ) + llm = LLM(model=model, + tokenizer=tokenizer, + quantization=quantization, + tensor_parallel_size=tensor_parallel_size, + seed=seed, + trust_remote_code=trust_remote_code, + dtype=dtype, + max_model_len=max_model_len, + enforce_eager=enforce_eager, + kv_cache_dtype=kv_cache_dtype, + device=device, + enable_prefix_caching=enable_prefix_caching) # Add the requests to the engine. 
for prompt, _, output_len in requests: @@ -213,7 +211,8 @@ def main(args: argparse.Namespace): args.seed, args.n, args.use_beam_search, args.trust_remote_code, args.dtype, args.max_model_len, args.enforce_eager, - args.kv_cache_dtype, args.device, args.enable_prefix_caching) + args.kv_cache_dtype, args.device, + args.enable_prefix_caching) elif args.backend == "hf": assert args.tensor_parallel_size == 1 elapsed_time = run_hf(requests, args.model, tokenizer, args.n, @@ -304,10 +303,7 @@ def main(args: argparse.Namespace): default="cuda", choices=["cuda"], help='device type for vLLM execution, supporting CUDA only currently.') - parser.add_argument( - "--enable_prefix_caching", - action='store_true' - ) + parser.add_argument("--enable_prefix_caching", action='store_true') args = parser.parse_args() if args.tokenizer is None: args.tokenizer = args.model From 5b4413b132bcd731cb7f826361a28fa069207941 Mon Sep 17 00:00:00 2001 From: ElizaWszola Date: Thu, 29 Feb 2024 12:12:55 -0500 Subject: [PATCH 74/79] More protection against sliding window --- vllm/worker/model_runner.py | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/vllm/worker/model_runner.py b/vllm/worker/model_runner.py index 979a2503595bb..01b5bbeec024c 100644 --- a/vllm/worker/model_runner.py +++ b/vllm/worker/model_runner.py @@ -143,10 +143,9 @@ def _prepare_prompt( # NOTE: This only works for oooooooxxx style attention. computed_block_nums = seq_group_metadata.computed_block_nums if computed_block_nums is not None and len( - computed_block_nums) > 0: + computed_block_nums) > 0 and self.sliding_window is None: # Prefix is not supported with sliding_window - if self.sliding_window is None: - computed_len = len(computed_block_nums) * self.block_size + computed_len = len(computed_block_nums) * self.block_size prompt_tokens = prompt_tokens[computed_len:] prefix_block_tables.append(computed_block_nums) else: From 7d17304e4b5450de7c65a11a7f95b33e733889ba Mon Sep 17 00:00:00 2001 From: ElizaWszola Date: Thu, 29 Feb 2024 12:22:49 -0500 Subject: [PATCH 75/79] Change automatic prefix caching arg to enable in arg utils --- docs/source/models/engine_args.rst | 4 ++-- vllm/engine/arg_utils.py | 4 ++-- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/docs/source/models/engine_args.rst b/docs/source/models/engine_args.rst index 945e315d663fd..9f5f672ae4f34 100644 --- a/docs/source/models/engine_args.rst +++ b/docs/source/models/engine_args.rst @@ -81,9 +81,9 @@ Below, you can find an explanation of every engine argument for vLLM: Token block size for contiguous chunks of tokens. -.. option:: --disable-prefix-caching +.. option:: --enable-prefix-caching - Disables automatic prefix caching + Enables automatic prefix caching .. 
option:: --seed diff --git a/vllm/engine/arg_utils.py b/vllm/engine/arg_utils.py index 55d5b1c0c6a80..59ecfaab77803 100644 --- a/vllm/engine/arg_utils.py +++ b/vllm/engine/arg_utils.py @@ -175,9 +175,9 @@ def add_cli_args( choices=[8, 16, 32], help='token block size') - parser.add_argument('--disable-prefix-caching', + parser.add_argument('--enable-prefix-caching', action='store_true', - help='Disables automatic prefix caching') + help='Enables automatic prefix caching') parser.add_argument('--seed', type=int, From 6358bf0ab9bb1f4525dd3f23f41ea6579038b7c8 Mon Sep 17 00:00:00 2001 From: Sage Moore Date: Thu, 29 Feb 2024 20:32:37 +0000 Subject: [PATCH 76/79] fix minor BlockAllocator update_hash bug --- vllm/core/block_manager.py | 9 ++++----- 1 file changed, 4 insertions(+), 5 deletions(-) diff --git a/vllm/core/block_manager.py b/vllm/core/block_manager.py index 1905dec232595..01b3a678c65ff 100644 --- a/vllm/core/block_manager.py +++ b/vllm/core/block_manager.py @@ -101,12 +101,11 @@ def contains_block(self, block_hash: int) -> bool: return block_hash in self.cached_blocks or block_hash in self.evictor def update_hash(self, block_hash: int, block: PhysicalTokenBlock): - assert not self.contains_block(block_hash) - old_hash = block.block_hash - block.block_hash = block_hash - - # If caching is enabled, update the cached_blocks + # If caching is enabled, update the hash of block and the cached_blocks dictionary. if self.enable_caching: + assert not self.contains_block(block_hash) + old_hash = block.block_hash + block.block_hash = block_hash del self.cached_blocks[old_hash] self.cached_blocks[block_hash] = block From b9fbb666b9e04ae660cfa96eeb806af2042fee1f Mon Sep 17 00:00:00 2001 From: Sage Moore Date: Thu, 29 Feb 2024 20:34:02 +0000 Subject: [PATCH 77/79] fix test_prefix_caching test --- tests/prefix_caching/test_prefix_caching.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tests/prefix_caching/test_prefix_caching.py b/tests/prefix_caching/test_prefix_caching.py index de4a19df89097..1a614619f5594 100644 --- a/tests/prefix_caching/test_prefix_caching.py +++ b/tests/prefix_caching/test_prefix_caching.py @@ -15,7 +15,7 @@ def test_block_allocator( num_blocks: int, ): block_hash = 1 - block_allocator = BlockAllocator(Device.CPU, block_size, num_blocks) + block_allocator = BlockAllocator(Device.CPU, block_size, num_blocks, enable_caching=True) # Allocate two PysicalTokenBlocks with the same hash and check that they are the same PhysicalTokenBlock first_block = block_allocator.allocate(block_hash, 0) @@ -39,7 +39,7 @@ def test_block_allocator( @pytest.mark.parametrize("num_blocks", [16]) def test_eviction(num_blocks: int, ): block_size = 16 - block_allocator = BlockAllocator(Device.CPU, block_size, num_blocks) + block_allocator = BlockAllocator(Device.CPU, block_size, num_blocks, enable_caching=True) blocks = [] for i in range(num_blocks): From 4ce8ceb863389998ad6fede573a51a2cab6202c9 Mon Sep 17 00:00:00 2001 From: Sage Moore Date: Fri, 1 Mar 2024 01:54:37 +0000 Subject: [PATCH 78/79] fix minor perf regression --- tests/prefix_caching/test_prefix_caching.py | 10 ++++++++-- vllm/engine/llm_engine.py | 7 +++++-- 2 files changed, 13 insertions(+), 4 deletions(-) diff --git a/tests/prefix_caching/test_prefix_caching.py b/tests/prefix_caching/test_prefix_caching.py index 1a614619f5594..7ef8dde7bb8f6 100644 --- a/tests/prefix_caching/test_prefix_caching.py +++ b/tests/prefix_caching/test_prefix_caching.py @@ -15,7 +15,10 @@ def test_block_allocator( num_blocks: int, ): 
block_hash = 1 - block_allocator = BlockAllocator(Device.CPU, block_size, num_blocks, enable_caching=True) + block_allocator = BlockAllocator(Device.CPU, + block_size, + num_blocks, + enable_caching=True) # Allocate two PysicalTokenBlocks with the same hash and check that they are the same PhysicalTokenBlock first_block = block_allocator.allocate(block_hash, 0) @@ -39,7 +42,10 @@ def test_block_allocator( @pytest.mark.parametrize("num_blocks", [16]) def test_eviction(num_blocks: int, ): block_size = 16 - block_allocator = BlockAllocator(Device.CPU, block_size, num_blocks, enable_caching=True) + block_allocator = BlockAllocator(Device.CPU, + block_size, + num_blocks, + enable_caching=True) blocks = [] for i in range(num_blocks): diff --git a/vllm/engine/llm_engine.py b/vllm/engine/llm_engine.py index 316478eafadf3..4d7731e3f8ca3 100644 --- a/vllm/engine/llm_engine.py +++ b/vllm/engine/llm_engine.py @@ -722,8 +722,11 @@ def _process_model_outputs( # Update the scheduled sequence groups with the model outputs. scheduled_seq_groups = scheduler_outputs.scheduled_seq_groups - for seq_group in scheduled_seq_groups: - self.scheduler.mark_blocks_as_computed(seq_group) + # If prefix caching is enabled, mark all blocks in the sequence groups + # as completed so that future requests don't attempt to recompute them + if self.cache_config.enable_prefix_caching: + for seq_group in scheduled_seq_groups: + self.scheduler.mark_blocks_as_computed(seq_group) for seq_group, outputs in zip(scheduled_seq_groups, output): self._process_sequence_group_outputs(seq_group, outputs) From 11126ab599e281e619d21a3be82d9e087ffcd201 Mon Sep 17 00:00:00 2001 From: ElizaWszola Date: Fri, 1 Mar 2024 02:33:28 -0500 Subject: [PATCH 79/79] Only mark last prefix block as computed, assume no computed blocks with caching disabled --- vllm/core/block_manager.py | 44 ++++++++++++++++++++------------------ 1 file changed, 23 insertions(+), 21 deletions(-) diff --git a/vllm/core/block_manager.py b/vllm/core/block_manager.py index 01b3a678c65ff..08d519ab767a9 100644 --- a/vllm/core/block_manager.py +++ b/vllm/core/block_manager.py @@ -1,6 +1,6 @@ """A block manager that manages token blocks.""" import enum -from itertools import takewhile, count +from itertools import count from os.path import commonprefix from typing import Dict, List, Optional, Set, Tuple @@ -149,6 +149,8 @@ def __init__( self.watermark = watermark assert watermark >= 0.0 + self.enable_caching = enable_caching + self.watermark_blocks = int(watermark * num_gpu_blocks) self.gpu_allocator = BlockAllocator(Device.GPU, block_size, @@ -431,39 +433,39 @@ def access_all_blocks_in_seq( for block in block_table: block.last_accessed = access_time - def compute_all_blocks_in_seq(self, seq: Sequence): + def compute_last_full_block_in_seq(self, seq: Sequence): if seq.seq_id not in self.block_tables: return + max_full_block = seq.get_len() // seq.block_size - 1 block_table = self.block_tables[seq.seq_id] - counter = 0 - max_computed_blocks = seq.get_len() // seq.block_size - for block in block_table: - if counter >= max_computed_blocks: - return - block.computed = True - counter += 1 + if max_full_block == -1: + return + block_table[max_full_block].computed = True - def get_all_computed_block_ids_seq(self, seq: Sequence) -> List[int]: + def get_all_block_ids_till_computed(self, seq: Sequence) -> List[int]: if seq.seq_id not in self.block_tables: return [] block_table = self.block_tables[seq.seq_id] - last_block = block_table[-1] - # We want to get the first n contiguous completed 
blocks - # We exclude the last block because it's most likely not cached yet - return [ - block.block_number for block in takewhile( - lambda block: block.computed and block != last_block, - block_table) - ] + for block_idx in reversed(range(len(block_table))): + if block_table[block_idx].computed: + return [b.block_number for b in block_table[:block_idx + 1]] + return [] + # Can return non-empty result only with prefix caching enabled. def get_common_computed_block_ids(self, seq_group: SequenceGroup) -> List[int]: + if not self.enable_caching: + return [] + ids_list = [ - self.get_all_computed_block_ids_seq(seq) + self.get_all_block_ids_till_computed(seq) for seq in iter(seq_group.seqs_dict.values()) ] return commonprefix([ids for ids in ids_list if ids != []]) + # We only mark the last full block because with prefix caching, + # all blocks until the marked one are guaranteed to be computed. def mark_blocks_as_computed(self, seq_group: SequenceGroup): - for seq in seq_group.seqs_dict.values(): - self.compute_all_blocks_in_seq(seq) + if self.enable_caching: + for seq in seq_group.seqs_dict.values(): + self.compute_last_full_block_in_seq(seq)
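
As a closing illustration of the allocator semantics these patches converge on, here is a minimal usage sketch modeled on tests/prefix_caching/test_prefix_caching.py. It is an editorial addendum, not part of the patch series, and the import paths are assumptions inferred from the diffs (BlockAllocator is defined in vllm/core/block_manager.py; Device is assumed to live in vllm.utils).

    from vllm.core.block_manager import BlockAllocator
    from vllm.utils import Device  # assumed location of the Device enum

    allocator = BlockAllocator(Device.CPU,
                               block_size=16,
                               num_blocks=16,
                               enable_caching=True)

    # Two allocations with the same content hash share one physical block.
    block_hash = 1
    first = allocator.allocate(block_hash, 0)
    second = allocator.allocate(block_hash, 0)
    assert first is second
    assert first.ref_count == 2

    # Freeing both references keeps the block around as an eviction candidate
    # rather than discarding it, so a later request with the same hash gets
    # the cached block back instead of a fresh one.
    allocator.free(first)
    allocator.free(second)
    assert allocator.contains_block(block_hash)
    assert allocator.allocate(block_hash, 0) is first

With enable_caching=False the allocator instead hands out fresh blocks keyed by an internal counter and uses the non-LRU evictor, matching the behavior before automatic prefix caching.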