Add Automatic Prefix Caching #2762

Merged Mar 2, 2024 (88 commits)

Changes from 62 commits

Commits
d1a91aa
init
SageMoore Feb 2, 2024
f27bfc8
Merge branch 'upstream-main' into prefix-caching
SageMoore Feb 5, 2024
ec21130
Move evictor and eviction policy to a separate class
ElizaWszola Feb 5, 2024
73ab52c
format, replace match with if-else
ElizaWszola Feb 5, 2024
76b5290
shore up some of the eviction logic
SageMoore Feb 5, 2024
fb9132b
autoformat
SageMoore Feb 5, 2024
c84bbda
Test block hashing
ElizaWszola Feb 6, 2024
be146c0
Format
ElizaWszola Feb 6, 2024
063d2fb
added block allocator tests
SageMoore Feb 6, 2024
15099d2
added timestamps to the PhysicalTokenBlock and updated the eviction l…
SageMoore Feb 7, 2024
9411e06
Delete the free hash table from the evictor class
SageMoore Feb 7, 2024
359b829
Remove the evictor class in favor of eviction free functions
SageMoore Feb 7, 2024
c9b0be6
debugging in progress
ElizaWszola Feb 8, 2024
cc80d1b
Merge branch 'prefix-caching' of https://github.com/neuralmagic/neura…
ElizaWszola Feb 8, 2024
6218d1a
partial block support
ElizaWszola Feb 8, 2024
b35819d
Move PhysicalTokenBlock.last_accessed updates to the block_manager/sc…
SageMoore Feb 8, 2024
38c1fc6
Remove overly aggressive assert
SageMoore Feb 8, 2024
b3e73f5
minor refactoring
SageMoore Feb 8, 2024
48624d9
Add prefix len to eviction strategy
ElizaWszola Feb 9, 2024
9780ccd
Merge branch 'prefix-caching' of https://github.com/neuralmagic/neura…
ElizaWszola Feb 9, 2024
bb471f2
fixed a few bugs in the partial block management code
SageMoore Feb 9, 2024
5d5db12
auto format
SageMoore Feb 9, 2024
ffbddd9
fix fork/cow mechanisms so that they work with partial blocks
SageMoore Feb 9, 2024
1f7fe42
replace the partial block table with a simpler promotion mechanism
SageMoore Feb 12, 2024
7ab75d7
clean up the BlockSpaceManager a bit
SageMoore Feb 12, 2024
ca3e288
fix minor typos
SageMoore Feb 12, 2024
ecf389d
minor name change
SageMoore Feb 12, 2024
427566a
update assert
SageMoore Feb 12, 2024
a3431bb
fix swap_in and swap_out
ElizaWszola Feb 12, 2024
dedc9c0
remove dead code in BlockSpaceManager
SageMoore Feb 12, 2024
86299a4
refactor swap_in/swap_out in BlockSpaceManager
SageMoore Feb 12, 2024
614a197
Update the partial block promotion logic to account for the full vers…
SageMoore Feb 12, 2024
0f85474
remove min from sequence hash
ElizaWszola Feb 13, 2024
9672b20
Remove prefix.py
SageMoore Feb 13, 2024
6044c2b
misc formatting
SageMoore Feb 13, 2024
9f7ae9f
bring back free table
ElizaWszola Feb 13, 2024
6655130
format
ElizaWszola Feb 13, 2024
1d6f0a0
update get_num_free_blocks to account for blocks in free table
SageMoore Feb 13, 2024
0ca5c43
add some more asserts to BlockAllocator
SageMoore Feb 13, 2024
7d6444d
contains_block() now looks at both table and free_table + a couple as…
ElizaWszola Feb 14, 2024
4775423
updated semantics of prefix length in block
SageMoore Feb 14, 2024
5cfee5f
bring back prefix block tables
ElizaWszola Feb 15, 2024
c64dd0d
Merge branch 'prefix-caching' of https://github.com/neuralmagic/neura…
ElizaWszola Feb 15, 2024
4925071
Nits (style)
ElizaWszola Feb 15, 2024
4fba5f9
delete comment
ElizaWszola Feb 15, 2024
46c62e4
Added computed_block_nums
SageMoore Feb 15, 2024
38b34d8
pythonize get_all_computed_block_ids
ElizaWszola Feb 15, 2024
ba97f80
account for prefix_len=0 in _prepare_prompt
SageMoore Feb 15, 2024
fe37722
attempt to fix build
SageMoore Feb 15, 2024
28f4ad2
attempt to fix build
SageMoore Feb 15, 2024
bff30a7
cap computed blocks to prefix length
SageMoore Feb 15, 2024
e829c34
misc fixes
SageMoore Feb 15, 2024
7f78ad4
typo
SageMoore Feb 15, 2024
18da5e6
account for none
SageMoore Feb 15, 2024
49357be
block manager refactoring
SageMoore Feb 15, 2024
ea4ec9d
clamp prefix length down to a multiple of block size
SageMoore Feb 15, 2024
f5fa2de
minor prefix length fix
SageMoore Feb 15, 2024
704aa47
replace 16 with block size
SageMoore Feb 16, 2024
8771b3f
First round of feedback changes
ElizaWszola Feb 21, 2024
b2c5992
Merge branch 'upstream-main' into prefix-caching
ElizaWszola Feb 21, 2024
2dba195
added a flag to disable automatic prefix caching
SageMoore Feb 21, 2024
ba01fa8
Update vllm/engine/llm_engine.py
mgoin Feb 21, 2024
16f9e80
Merge branch 'upstream-main' into prefix-caching
ElizaWszola Feb 23, 2024
2914b5a
remove explicit prefix pos
ElizaWszola Feb 23, 2024
bd235fd
remove assert for sliding window, check what will happen
ElizaWszola Feb 23, 2024
ba382d9
Try the other way around
ElizaWszola Feb 23, 2024
660007f
Delete redundant prefix caching test
ElizaWszola Feb 23, 2024
f74f67d
Don't add last block to
ElizaWszola Feb 23, 2024
093cb1c
Format
ElizaWszola Feb 23, 2024
d459d15
refactored the eviction logic into a separate class
SageMoore Feb 23, 2024
fea6789
minor fixes
SageMoore Feb 23, 2024
052c294
format evictor file
SageMoore Feb 23, 2024
e26cd8e
added documentation to the evictor class
SageMoore Feb 23, 2024
2335360
delete newline
SageMoore Feb 23, 2024
d66154c
format
SageMoore Feb 23, 2024
6a38439
Fix timestamp in eviction policy
ElizaWszola Feb 28, 2024
47e94ba
Merge branch 'upstream-main' into prefix-caching
ElizaWszola Feb 28, 2024
a449eb6
addressing review comments
SageMoore Feb 29, 2024
30708b8
minor evictor fix
SageMoore Feb 29, 2024
4e99660
format
SageMoore Feb 29, 2024
5b4413b
More protection against sliding window
ElizaWszola Feb 29, 2024
63a1985
Merge branch 'prefix-caching' of https://github.com/neuralmagic/neura…
ElizaWszola Feb 29, 2024
7d17304
Change automatic prefix caching arg to enable in arg utils
ElizaWszola Feb 29, 2024
6358bf0
fix minor BlockAllocator update_hash bug
SageMoore Feb 29, 2024
b9fbb66
fix test_prefix_caching test
SageMoore Feb 29, 2024
4ce8ceb
fix minor perf regression
SageMoore Mar 1, 2024
11126ab
Only mark last prefix block as computed, assume no computed blocks wi…
ElizaWszola Mar 1, 2024
e252bb6
Merge branch 'main' into prefix-caching
SageMoore Mar 1, 2024
4 changes: 4 additions & 0 deletions docs/source/models/engine_args.rst
@@ -81,6 +81,10 @@ Below, you can find an explanation of every engine argument for vLLM:

Token block size for contiguous chunks of tokens.

.. option:: --disable-prefix-caching

Disables automatic prefix caching.

.. option:: --seed <seed>

Random seed for operations.
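For context, the new option can also be toggled programmatically. A minimal sketch, assuming the `LLM` constructor forwards a `disable_prefix_caching` keyword through `EngineArgs` to `CacheConfig` (the forwarding path is not shown in this excerpt):

```python
from vllm import LLM

# Hypothetical usage of the flag documented above; assumes LLM/EngineArgs
# expose `disable_prefix_caching` and pass it through to CacheConfig.
llm = LLM(model="facebook/opt-125m", disable_prefix_caching=True)
```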
73 changes: 72 additions & 1 deletion tests/prefix_caching/test_prefix_caching.py
@@ -5,6 +5,8 @@
import pytest

from vllm import LLM, SamplingParams
from vllm.core.block_manager import BlockAllocator
from vllm.utils import Device

prefix = (
"You are an expert school principal, skilled in effectively managing "
@@ -18,6 +20,14 @@
"the following paragraph: ")


def allocate_all_blocks(block_allocator, num_blocks):
blocks = []
for i in range(num_blocks):
# use i as the block_hash
blocks.append(block_allocator.allocate(i, 0))
return blocks


@pytest.mark.parametrize("model", ["facebook/opt-125m"])
@pytest.mark.parametrize("max_tokens", [16])
def test_prefix_caching(
@@ -38,4 +48,65 @@ def test_prefix_caching(
outputs_without_prefix, outputs_with_prefix):
assert (output_without_prefix.outputs[0].token_ids ==
output_with_prefix.outputs[0].token_ids)
assert len(llm.llm_engine.scheduler.prefix_pool.prefixes) == 1


@pytest.mark.parametrize("block_size", [16])
@pytest.mark.parametrize("num_blocks", [16])
def test_block_allocator(
block_size: int,
num_blocks: int,
):
block_hash = 1
block_allocator = BlockAllocator(Device.CPU, block_size, num_blocks)

# Allocate two PhysicalTokenBlocks with the same hash and check that they are the same PhysicalTokenBlock
first_block = block_allocator.allocate(block_hash, 0)
second_block = block_allocator.allocate(block_hash, 0)
assert (first_block == second_block)
assert (second_block.ref_count == 2)

# Free the first_block and confirm that the ref_count is correctly decremented on the second block
block_allocator.free(first_block)
assert (second_block.ref_count == 1)

# Free the second block
block_allocator.free(second_block)

# Reallocate the first block and confirm that, even after the block had its ref_count go to 0, we still get the same block back
first_block = block_allocator.allocate(block_hash, 0)
assert (first_block == second_block)
assert (first_block.block_hash == block_hash)


@pytest.mark.parametrize("num_blocks", [16])
def test_eviction(num_blocks: int, ):
block_size = 16
block_allocator = BlockAllocator(Device.CPU, block_size, num_blocks)
blocks = []

for i in range(num_blocks):
# use i as the block_hash
blocks.append(block_allocator.allocate(i, 0))

# Free all blocks
for block in blocks:
block_allocator.free(block)

# Allocate a new block and confirm that it's the first block freed, i.e. the least recently used block
new_block_hash = block_size
new_block = block_allocator.allocate(new_block_hash, 0)
assert (new_block == blocks[0])
assert (new_block.block_hash == new_block_hash)

# Reallocate the second block in `blocks` to remove it from the free list
realloc_block_hash = 1
realloc_block = block_allocator.allocate(realloc_block_hash, 0)
assert (realloc_block == blocks[realloc_block_hash])
assert (realloc_block.block_hash == realloc_block_hash)

# Allocate a new block and confirm that it's not the realloc_block, since the realloc_block shouldn't be in the free list
new_block_hash = block_size + 1
new_block = block_allocator.allocate(new_block_hash, 0)
assert (realloc_block != new_block)
assert (new_block.block_hash == new_block_hash)
assert (new_block.block_number == 2)
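Taken together, these tests pin down the allocator contract: allocating an existing hash returns the cached block with an incremented ref_count, freeing a block keeps it reachable by hash, and a full allocator evicts the least recently used freed block. A toy sketch of that contract (not the PR's implementation — in particular, the real allocator reuses the evicted block's physical slot rather than constructing a new object):

```python
from dataclasses import dataclass, field
from time import monotonic
from typing import Dict


@dataclass
class ToyBlock:
    block_hash: int
    ref_count: int = 0
    last_accessed: float = field(default_factory=monotonic)


class ToyAllocator:
    """Toy content-addressed allocator mirroring the tested contract."""

    def __init__(self, num_blocks: int) -> None:
        self.num_blocks = num_blocks
        self.in_use: Dict[int, ToyBlock] = {}      # hash -> block with ref_count > 0
        self.free_table: Dict[int, ToyBlock] = {}  # hash -> freed but still cached

    def allocate(self, block_hash: int) -> ToyBlock:
        if block_hash in self.in_use:          # cache hit on a live block: share it
            block = self.in_use[block_hash]
        elif block_hash in self.free_table:    # cache hit on a freed block: revive it
            block = self.free_table.pop(block_hash)
            self.in_use[block_hash] = block
        else:
            if len(self.in_use) + len(self.free_table) >= self.num_blocks:
                # Evict the least recently used freed block to make room.
                # (A full allocator with no freed blocks is out of memory.)
                lru = min(self.free_table,
                          key=lambda h: self.free_table[h].last_accessed)
                del self.free_table[lru]
            block = ToyBlock(block_hash)
            self.in_use[block_hash] = block
        block.ref_count += 1
        block.last_accessed = monotonic()
        return block

    def free(self, block: ToyBlock) -> None:
        block.ref_count -= 1
        if block.ref_count == 0:
            self.free_table[block.block_hash] = self.in_use.pop(block.block_hash)
```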
76 changes: 76 additions & 0 deletions tests/test_cache_block_hashing.py
@@ -0,0 +1,76 @@
"""Test hashing of cache blocks.

Run `pytest tests/test_cache_block_hashing.py`.
"""
import pytest

from vllm.transformers_utils.tokenizer import TokenizerGroup
from vllm.sequence import Sequence

# Make two prefixes with different first blocks.
prefix_start = [("You are an expert"), ("You are a")]
prefix_common = (
" school principal, skilled in effectively managing "
"faculty and staff. Draft 10-15 questions for a potential first grade "
"Head Teacher for my K-12, all-girls', independent school that emphasizes "
"community, joyful discovery, and life-long learning. The candidate is "
"coming in for a first-round panel interview for a 8th grade Math "
"teaching role. They have 5 years of previous teaching experience "
"as an assistant teacher at a co-ed, public school with experience "
"in middle school math teaching. Based on this, fulfill "
"the following: ")
prefixes = [start + prefix_common for start in prefix_start]

# Sample prompts.
sample_prompts = [
"Hello, my name is", "The president of the United States is",
"The capital of France is", "The future of AI is"
]


# Helper function.
def flatten_2d(li):
return [lss for ls in li for lss in ls]


@pytest.mark.parametrize("model", ["facebook/opt-125m"])
@pytest.mark.parametrize("block_size", [16])
@pytest.mark.parametrize("max_num_seqs", [256])
def test_auto_prefix_caching(model: str, block_size: int, max_num_seqs: int):

tokenizer = TokenizerGroup(
tokenizer_id="facebook/opt-125m",
enable_lora=False,
max_num_seqs=max_num_seqs,
max_input_length=None,
)

hashes = []

for prefix in prefixes:
hashes.append([])
prompts = [prefix + prompt for prompt in sample_prompts]
seq_id = 0
for prompt in prompts:
hashes[-1].append([])
prompt_token_ids = tokenizer.encode(prompt)
seq = Sequence(seq_id, prompt, prompt_token_ids, block_size)

num_blocks = len(prompt_token_ids) // block_size
for idx in range(num_blocks):
hashes[-1][-1].append(seq.hash(idx))

seq_id += 1

# Check that hashes made with two prefixes with different first blocks are
# different everywhere.
for hash0, hash1 in zip(flatten_2d(hashes[0]), flatten_2d(hashes[1])):
assert (hash0 != hash1)

# Check that hashes of different prompts made with the same prefix are the
# same for all blocks except the final one, which includes prompt tokens.
for hash_pref in hashes:
same_hashes = [tuple(h[:-1]) for h in hash_pref]
different_hashes = [h[-1] for h in hash_pref]
assert (len(set(same_hashes)) == 1)
assert (len(set(different_hashes)) == len(different_hashes))
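The property these assertions rely on is that a block's hash covers every token from the start of the sequence through the end of that block, so shared prefixes yield identical hashes while the final, prompt-bearing block differs. A sketch of such a prefix-cumulative hash (illustrative only; `Sequence.hash` may differ in detail):

```python
from typing import List


def block_hash(token_ids: List[int], block_size: int, idx: int) -> int:
    """Hash of logical block `idx`, covering tokens 0 .. (idx + 1) * block_size.

    Hashing the whole prefix (not just the block's own tokens) makes the
    hash position-dependent, so identical text at different offsets does
    not collide. Sketch only; vLLM's Sequence.hash may differ in detail.
    """
    prefix_tokens = tuple(token_ids[:(idx + 1) * block_size])
    return hash(prefix_tokens)


# Two prompts sharing a 16-token prefix agree on block 0's hash only:
a = list(range(32))
b = list(range(16)) + list(range(100, 116))
assert block_hash(a, 16, 0) == block_hash(b, 16, 0)
assert block_hash(a, 16, 1) != block_hash(b, 16, 1)
```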
14 changes: 13 additions & 1 deletion vllm/block.py
@@ -2,6 +2,7 @@
from typing import List

from vllm.utils import Device
from time import monotonic

_BLANK_TOKEN_ID = -1

@@ -55,17 +56,28 @@ def __init__(
device: Device,
block_number: int,
block_size: int,
block_hash: int,
prefix_len: int,
) -> None:
self.device = device
self.block_number = block_number
self.block_size = block_size
self.block_hash = block_hash
self.prefix_len = prefix_len

self.ref_count = 0
self.last_accessed = monotonic()

self.computed = False

# TODO: update this
def __repr__(self) -> str:
return (f'PhysicalTokenBlock(device={self.device}, '
f'block_number={self.block_number}, '
f'ref_count={self.ref_count})')
f'prefix_len={self.prefix_len}, '
f'ref_count={self.ref_count}, '
f'last_accessed={self.last_accessed}, '
f'computed={self.computed})')


# Mapping: logical block number -> physical block.
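The fields added here are what the eviction policy keys on: `block_hash` makes blocks content-addressable, `last_accessed` orders evictable blocks by recency, and `computed` marks blocks whose KV values can be skipped on a cache hit. A minimal sketch of LRU victim selection over freed blocks, assuming the evictor only ever sees blocks with `ref_count == 0`:

```python
from typing import Dict, Optional

from vllm.block import PhysicalTokenBlock


def select_lru_victim(
        free_table: Dict[int, PhysicalTokenBlock]) -> Optional[int]:
    # Sketch only: return the hash of the least recently used freed block,
    # or None if nothing is evictable. The PR's Evictor class may break
    # ties differently (a later commit adds prefix_len to the strategy).
    if not free_table:
        return None
    return min(free_table, key=lambda h: free_table[h].last_accessed)
```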
2 changes: 2 additions & 0 deletions vllm/config.py
@@ -295,12 +295,14 @@ def __init__(
swap_space: int,
cache_dtype: str,
sliding_window: Optional[int] = None,
disable_prefix_caching: bool = False,
) -> None:
self.block_size = block_size
self.gpu_memory_utilization = gpu_memory_utilization
self.swap_space_bytes = swap_space * _GB
self.cache_dtype = cache_dtype
self.sliding_window = sliding_window
self.disable_prefix_caching = disable_prefix_caching
self._verify_args()
self._verify_cache_dtype()

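Finally, the new `disable_prefix_caching` argument defaults to False, so prefix caching is on unless explicitly disabled. A hypothetical direct construction with illustrative values (in practice `CacheConfig` is built from `EngineArgs`):

```python
from vllm.config import CacheConfig

# Illustrative values only; the signature follows the diff above.
cache_config = CacheConfig(
    block_size=16,
    gpu_memory_utilization=0.9,
    swap_space=4,                  # GiB; converted to bytes internally
    cache_dtype="auto",
    disable_prefix_caching=True,   # the argument added in this diff
)
```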