vllm/v1/core/block_pool.py (4 additions, 18 deletions)

@@ -107,34 +107,20 @@ def cache_full_blocks(
             assert prev_block.block_hash is not None
             prev_block_hash_value = prev_block.block_hash.hash_value

-        # Find the first uncached block.
-        # FIXME: num_cached_blocks should be corrected by the caller
-        # so this should never happen.
-        offset = 0
-        for blk in new_full_blocks:
-            if blk.block_hash is None:
-                break
-            else:
-                prev_block_hash_value = blk.block_hash.hash_value
-                offset += 1
-        else:
-            # All blocks are cached.
-            return
-
-        for i, blk in enumerate(new_full_blocks[offset:]):
-            blk_idx = num_cached_blocks + offset + i
+        for i, blk in enumerate(new_full_blocks):
             assert blk.block_hash is None

-            if i + offset < len(new_block_hashes):
+            if i < len(new_block_hashes):
                 # The block hash may already be computed in
                 # "get_computed_blocks" if the tokens are not generated by
                 # this request (either the prompt tokens or the previously
                 # generated tokens with preemption). In this case we simply
                 # reuse the block hash.
-                block_hash = new_block_hashes[i + offset]
+                block_hash = new_block_hashes[i]
             else:
                 # Otherwise compute the block hash and cache it in the request
                 # in case it will be preempted in the future.
+                blk_idx = num_cached_blocks + i
                 start_token_idx = blk_idx * block_size
                 end_token_idx = (blk_idx + 1) * block_size
                 block_tokens = request.all_token_ids[
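The removed offset scan worked around callers that passed a stale `num_cached_blocks`; with the caller now responsible for passing a corrected value, every block in `new_full_blocks` is guaranteed uncached and the loop can index from zero. Below is a minimal runnable sketch of that contract, with `Block` and the integer hashes as hypothetical stand-ins for the real vLLM types, not the actual implementation:

# Sketch of the simplified loop, assuming the caller passes a corrected
# `num_cached_blocks` so that every block in `new_full_blocks` is uncached.
from dataclasses import dataclass
from typing import List, Optional


@dataclass
class Block:
    block_hash: Optional[int] = None


def cache_full_blocks_sketch(new_full_blocks: List[Block],
                             new_block_hashes: List[int],
                             num_cached_blocks: int,
                             block_size: int,
                             all_token_ids: List[int]) -> None:
    for i, blk in enumerate(new_full_blocks):
        # The offset scan is gone: the caller guarantees no block here
        # is already cached.
        assert blk.block_hash is None
        if i < len(new_block_hashes):
            # Reuse a hash computed earlier (e.g. for prompt tokens).
            blk.block_hash = new_block_hashes[i]
        else:
            # Otherwise compute the hash from the block's tokens.
            blk_idx = num_cached_blocks + i
            tokens = all_token_ids[blk_idx * block_size:(blk_idx + 1) * block_size]
            blk.block_hash = hash(tuple(tokens))


# Two uncached blocks; the first hash is reused, the second is computed.
blocks = [Block(), Block()]
cache_full_blocks_sketch(blocks, [12345], num_cached_blocks=1,
                         block_size=4, all_token_ids=list(range(12)))
assert blocks[0].block_hash == 12345 and blocks[1].block_hash is not None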
vllm/v1/core/kv_cache_manager.py (5 additions, 4 deletions)

@@ -65,7 +65,7 @@ def __init__(
         # This is used to track the number of cached blocks for each request.
         # This is only used to track the RUNNING requests; we do not track
         # the data for preempted ones.
-        self.num_cached_block: Dict[str, int] = defaultdict(int)
+        self.num_cached_block: Dict[str, int] = {}
         self.prefix_cache_stats = PrefixCacheStats()

     @property
@@ -224,9 +224,10 @@ def allocate_slots(
         if not self.enable_caching:
             return new_blocks

-        # FIXME: `num_cached_blocks` is not correct when the prefix cache
-        # of a new request is hit.
-        num_cached_blocks = self.num_cached_block[request.request_id]
+        # Use `new_computed_blocks` for a new request, and `num_cached_block`
+        # for a running request.
+        num_cached_blocks = self.num_cached_block.get(request.request_id,
+                                                      len(new_computed_blocks))
         # Speculated tokens might be rejected in the future, so we do not
         # cache any speculated tokens. We only cache blocks with
         # generated (accepted) tokens.
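The two hunks work together: switching `num_cached_block` from `defaultdict(int)` to a plain dict makes a missing key meaningful, so `.get(request_id, len(new_computed_blocks))` can distinguish a new request (fall back to the prefix-cache hit length) from a running one (use the tracked count). A toy sketch of that contract follows, using hypothetical request ids rather than real KVCacheManager state:

# Toy illustration of the lookup contract after this change.
from typing import Dict, List

num_cached_block: Dict[str, int] = {}  # plain dict: a missing key is meaningful


def get_num_cached_blocks(request_id: str,
                          new_computed_blocks: List[object]) -> int:
    # Running request: use the tracked count. New request: fall back to the
    # prefix-cache hit length. The old defaultdict(int) silently returned 0
    # for a new request, which was wrong whenever its prefix cache was hit.
    return num_cached_block.get(request_id, len(new_computed_blocks))


# New request whose prompt hit 3 cached prefix blocks:
assert get_num_cached_blocks("req-1", [object()] * 3) == 3
# Once the request is running, the manager tracks its count directly:
num_cached_block["req-1"] = 5
assert get_num_cached_blocks("req-1", []) == 5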