From 6a286064c971d910cd8a965aca4321d6bf294a52 Mon Sep 17 00:00:00 2001 From: Sungjae Lee <33976427+llsj14@users.noreply.github.com> Date: Tue, 6 Aug 2024 14:56:12 +0000 Subject: [PATCH 01/14] feat: optimize evictor v2 performance using priority queue and lazy deletion Signed-off-by: Sungjae Lee <33976427+llsj14@users.noreply.github.com> --- vllm/core/evictor.py | 46 +++++++++++++++++++++++++------------------- 1 file changed, 26 insertions(+), 20 deletions(-) diff --git a/vllm/core/evictor.py b/vllm/core/evictor.py index ed7e06cab2996..d7372493675e7 100644 --- a/vllm/core/evictor.py +++ b/vllm/core/evictor.py @@ -1,6 +1,7 @@ +import heapq import enum from abc import ABC, abstractmethod -from typing import OrderedDict, Tuple +from typing import Dict, Tuple class EvictionPolicy(enum.Enum): @@ -76,7 +77,8 @@ class LRUEvictor(Evictor): """ def __init__(self): - self.free_table: OrderedDict[int, BlockMetaData] = OrderedDict() + self.free_table: Dict[int, BlockMetaData] = {} + self.priority_queue = [] def __contains__(self, block_id: int) -> bool: return block_id in self.free_table @@ -85,34 +87,38 @@ def evict(self) -> Tuple[int, int]: if len(self.free_table) == 0: raise ValueError("No usable cache memory left") - evicted_block, evicted_block_id = None, None - # The blocks with the lowest timestamps should be placed consecutively - # at the start of OrderedDict. Loop through all these blocks to - # find the one with maximum number of hashed tokens. 
- for _id, block in self.free_table.items(): - if evicted_block is None: - evicted_block, evicted_block_id = block, _id - continue - if evicted_block.last_accessed < block.last_accessed: - break - if evicted_block.num_hashed_tokens < block.num_hashed_tokens: - evicted_block, evicted_block_id = block, _id - - assert evicted_block is not None - assert evicted_block_id is not None - self.free_table.pop(evicted_block_id) - - return evicted_block_id, evicted_block.content_hash + while self.priority_queue: + # Lazy deletion algorithm is applied by checking blocks in the free table. + last_accessed, _, content_hash, block_id = heapq.heappop(self.priority_queue) + if block_id in self.free_table: + if self.free_table[block_id].last_accessed == last_accessed: + self.free_table.pop(block_id) + return block_id, content_hash + + raise ValueError("No usable cache memory left") def add(self, block_id: int, content_hash: int, num_hashed_tokens: int, last_accessed: float): self.free_table[block_id] = BlockMetaData(content_hash, num_hashed_tokens, last_accessed) + heapq.heappush(self.priority_queue, (last_accessed, -num_hashed_tokens, content_hash, block_id)) + self._cleanup_if_necessary() def update(self, block_id: int, last_accessed: float): self.free_table[block_id].last_accessed = last_accessed + def _cleanup_if_necessary(self): + if len(self.priority_queue) > 50 * len(self.free_table): + self._cleanup() + + def _cleanup(self): + new_priority_queue = [] + for last_accessed, neg_num_hashed_tokens, content_hash, block_id in self.priority_queue: + if block_id in self.free_table and self.free_table[block_id].last_accessed == last_accessed: + heapq.heappush(new_priority_queue, (last_accessed, neg_num_hashed_tokens, content_hash, block_id)) + self.priority_queue = new_priority_queue + def remove(self, block_id: int): if block_id not in self.free_table: raise ValueError( From 461c8fd88f86d2baf5a9b89911e4e792859e909b Mon Sep 17 00:00:00 2001 From: Sungjae Lee 
<33976427+llsj14@users.noreply.github.com> Date: Tue, 6 Aug 2024 15:26:28 +0000 Subject: [PATCH 02/14] refactor: make format Signed-off-by: Sungjae Lee <33976427+llsj14@users.noreply.github.com> --- vllm/core/evictor.py | 33 ++++++++++++++++++++------------- 1 file changed, 20 insertions(+), 13 deletions(-) diff --git a/vllm/core/evictor.py b/vllm/core/evictor.py index d7372493675e7..68bd011e541df 100644 --- a/vllm/core/evictor.py +++ b/vllm/core/evictor.py @@ -1,7 +1,7 @@ -import heapq import enum +import heapq from abc import ABC, abstractmethod -from typing import Dict, Tuple +from typing import Dict, List, Tuple class EvictionPolicy(enum.Enum): @@ -88,12 +88,13 @@ def evict(self) -> Tuple[int, int]: raise ValueError("No usable cache memory left") while self.priority_queue: - # Lazy deletion algorithm is applied by checking blocks in the free table. - last_accessed, _, content_hash, block_id = heapq.heappop(self.priority_queue) - if block_id in self.free_table: - if self.free_table[block_id].last_accessed == last_accessed: - self.free_table.pop(block_id) - return block_id, content_hash + # Lazy deletion algorithm is applied. 
+ last_accessed, _, content_hash, block_id = heapq.heappop( + self.priority_queue) + if (block_id in self.free_table and + self.free_table[block_id].last_accessed == last_accessed): + self.free_table.pop(block_id) + return block_id, content_hash raise ValueError("No usable cache memory left") @@ -102,7 +103,9 @@ def add(self, block_id: int, content_hash: int, num_hashed_tokens: int, self.free_table[block_id] = BlockMetaData(content_hash, num_hashed_tokens, last_accessed) - heapq.heappush(self.priority_queue, (last_accessed, -num_hashed_tokens, content_hash, block_id)) + heapq.heappush( + self.priority_queue, + (last_accessed, -num_hashed_tokens, content_hash, block_id)) self._cleanup_if_necessary() def update(self, block_id: int, last_accessed: float): @@ -113,10 +116,14 @@ def _cleanup_if_necessary(self): self._cleanup() def _cleanup(self): - new_priority_queue = [] - for last_accessed, neg_num_hashed_tokens, content_hash, block_id in self.priority_queue: - if block_id in self.free_table and self.free_table[block_id].last_accessed == last_accessed: - heapq.heappush(new_priority_queue, (last_accessed, neg_num_hashed_tokens, content_hash, block_id)) + new_priority_queue: List[Tuple[int, int, int, int]] = [] + for last_accessed, neg_num_hashed_tokens, content_hash, block_id in ( + self.priority_queue): + if (block_id in self.free_table and + self.free_table[block_id].last_accessed == last_accessed): + heapq.heappush(new_priority_queue, + (last_accessed, neg_num_hashed_tokens, + content_hash, block_id)) self.priority_queue = new_priority_queue def remove(self, block_id: int): From ad9bf4a4a0e714ee0d3ae89f6a8c6c024336cdb6 Mon Sep 17 00:00:00 2001 From: Sungjae Lee <33976427+llsj14@users.noreply.github.com> Date: Tue, 6 Aug 2024 23:54:16 +0000 Subject: [PATCH 03/14] refactor: use global defined variable for cleanup threshold Signed-off-by: Sungjae Lee <33976427+llsj14@users.noreply.github.com> --- vllm/core/evictor.py | 4 +++- 1 file changed, 3 insertions(+), 1 
deletion(-) diff --git a/vllm/core/evictor.py b/vllm/core/evictor.py index 68bd011e541df..750f72f89baef 100644 --- a/vllm/core/evictor.py +++ b/vllm/core/evictor.py @@ -3,6 +3,8 @@ from abc import ABC, abstractmethod from typing import Dict, List, Tuple +CLEANUP_THRESHOLD = 50 + class EvictionPolicy(enum.Enum): """Enum for eviction policy used by make_evictor to instantiate the correct @@ -112,7 +114,7 @@ def update(self, block_id: int, last_accessed: float): self.free_table[block_id].last_accessed = last_accessed def _cleanup_if_necessary(self): - if len(self.priority_queue) > 50 * len(self.free_table): + if len(self.priority_queue) > CLEANUP_THRESHOLD * len(self.free_table): self._cleanup() def _cleanup(self): From a1ef9eccbc12932c4971a6c40d75122d3b552b92 Mon Sep 17 00:00:00 2001 From: Sungjae Lee <33976427+llsj14@users.noreply.github.com> Date: Wed, 7 Aug 2024 02:27:18 +0000 Subject: [PATCH 04/14] refactor: make CLEANUP_THRESHOLD a static class member Signed-off-by: Sungjae Lee <33976427+llsj14@users.noreply.github.com> --- vllm/core/evictor.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/vllm/core/evictor.py b/vllm/core/evictor.py index 750f72f89baef..92cf50ea192f4 100644 --- a/vllm/core/evictor.py +++ b/vllm/core/evictor.py @@ -3,8 +3,6 @@ from abc import ABC, abstractmethod from typing import Dict, List, Tuple -CLEANUP_THRESHOLD = 50 - class EvictionPolicy(enum.Enum): """Enum for eviction policy used by make_evictor to instantiate the correct @@ -78,6 +76,8 @@ class LRUEvictor(Evictor): highest num_hashed_tokens value, then one will be chose arbitrarily """ + CLEANUP_THRESHOLD = 50 + def __init__(self): self.free_table: Dict[int, BlockMetaData] = {} self.priority_queue = [] @@ -114,7 +114,7 @@ def update(self, block_id: int, last_accessed: float): self.free_table[block_id].last_accessed = last_accessed def _cleanup_if_necessary(self): - if len(self.priority_queue) > CLEANUP_THRESHOLD * len(self.free_table): + if
len(self.priority_queue) > LRUEvictor.CLEANUP_THRESHOLD * len(self.free_table): self._cleanup() def _cleanup(self): From c505a93a7f6a7a2b070fb4e322e79f5f321c3770 Mon Sep 17 00:00:00 2001 From: Sungjae Lee <33976427+llsj14@users.noreply.github.com> Date: Wed, 7 Aug 2024 02:28:00 +0000 Subject: [PATCH 05/14] refactor: make format Signed-off-by: Sungjae Lee <33976427+llsj14@users.noreply.github.com> --- vllm/core/evictor.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/vllm/core/evictor.py b/vllm/core/evictor.py index 92cf50ea192f4..ccc0c1b3681eb 100644 --- a/vllm/core/evictor.py +++ b/vllm/core/evictor.py @@ -114,7 +114,8 @@ def update(self, block_id: int, last_accessed: float): self.free_table[block_id].last_accessed = last_accessed def _cleanup_if_necessary(self): - if len(self.priority_queue) > LRUEvictor.CLEANUP_THRESHOLD * len(self.free_table): + if len(self.priority_queue) > LRUEvictor.CLEANUP_THRESHOLD * len( + self.free_table): self._cleanup() def _cleanup(self): From 02e92f7bd5986ed88dbb3b9fc6ffcae1822392d0 Mon Sep 17 00:00:00 2001 From: Sungjae Lee <33976427+llsj14@users.noreply.github.com> Date: Wed, 7 Aug 2024 04:45:01 +0000 Subject: [PATCH 06/14] fix: optimize priority queue cleanup operation Signed-off-by: Sungjae Lee <33976427+llsj14@users.noreply.github.com> --- vllm/core/evictor.py | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/vllm/core/evictor.py b/vllm/core/evictor.py index ccc0c1b3681eb..1573a16ce7639 100644 --- a/vllm/core/evictor.py +++ b/vllm/core/evictor.py @@ -119,14 +119,14 @@ def _cleanup_if_necessary(self): self._cleanup() def _cleanup(self): - new_priority_queue: List[Tuple[int, int, int, int]] = [] - for last_accessed, neg_num_hashed_tokens, content_hash, block_id in ( - self.priority_queue): - if (block_id in self.free_table and - self.free_table[block_id].last_accessed == last_accessed): - heapq.heappush(new_priority_queue, - (last_accessed, neg_num_hashed_tokens, - 
content_hash, block_id)) + new_priority_queue: List[Tuple[float, int, int, int]] = [] + + for block_id, block in self.free_table.items(): + new_priority_queue.append( + (block.last_accessed, -block.num_hashed_tokens, + block.content_hash, block_id)) + heapq.heapify(new_priority_queue) + self.priority_queue = new_priority_queue def remove(self, block_id: int): From 76e4665c98c9235ccd6384d7da01043405b86643 Mon Sep 17 00:00:00 2001 From: Sungjae Lee <33976427+llsj14@users.noreply.github.com> Date: Wed, 11 Dec 2024 13:09:03 +0000 Subject: [PATCH 07/14] trigger test Signed-off-by: Sungjae Lee <33976427+llsj14@users.noreply.github.com> From 840612a5c4779f5b4125dc47c1ac1ddd5e1ad2e4 Mon Sep 17 00:00:00 2001 From: Sungjae Lee <33976427+llsj14@users.noreply.github.com> Date: Fri, 13 Dec 2024 09:08:28 +0000 Subject: [PATCH 08/14] prioritize block_id in priority queue Signed-off-by: Sungjae Lee <33976427+llsj14@users.noreply.github.com> --- vllm/core/evictor.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/vllm/core/evictor.py b/vllm/core/evictor.py index 1573a16ce7639..02584c12d5178 100644 --- a/vllm/core/evictor.py +++ b/vllm/core/evictor.py @@ -91,7 +91,7 @@ def evict(self) -> Tuple[int, int]: while self.priority_queue: # Lazy deletion algorithm is applied. 
- last_accessed, _, content_hash, block_id = heapq.heappop( + last_accessed, _, block_id, content_hash = heapq.heappop( self.priority_queue) if (block_id in self.free_table and self.free_table[block_id].last_accessed == last_accessed): @@ -107,7 +107,7 @@ def add(self, block_id: int, content_hash: int, num_hashed_tokens: int, last_accessed) heapq.heappush( self.priority_queue, - (last_accessed, -num_hashed_tokens, content_hash, block_id)) + (last_accessed, -num_hashed_tokens, block_id, content_hash)) self._cleanup_if_necessary() def update(self, block_id: int, last_accessed: float): @@ -124,7 +124,7 @@ def _cleanup(self): for block_id, block in self.free_table.items(): new_priority_queue.append( (block.last_accessed, -block.num_hashed_tokens, - block.content_hash, block_id)) + block_id, block.content_hash)) heapq.heapify(new_priority_queue) self.priority_queue = new_priority_queue From add810e5dcc85e160fdbaec4d6432aae4ddc660f Mon Sep 17 00:00:00 2001 From: Sungjae Lee <33976427+llsj14@users.noreply.github.com> Date: Fri, 13 Dec 2024 09:13:29 +0000 Subject: [PATCH 09/14] make format Signed-off-by: Sungjae Lee <33976427+llsj14@users.noreply.github.com> --- vllm/core/evictor.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/vllm/core/evictor.py b/vllm/core/evictor.py index 02584c12d5178..5e87e6deaf382 100644 --- a/vllm/core/evictor.py +++ b/vllm/core/evictor.py @@ -123,8 +123,8 @@ def _cleanup(self): for block_id, block in self.free_table.items(): new_priority_queue.append( - (block.last_accessed, -block.num_hashed_tokens, - block_id, block.content_hash)) + (block.last_accessed, -block.num_hashed_tokens, block_id, + block.content_hash)) heapq.heapify(new_priority_queue) self.priority_queue = new_priority_queue From 1c8c2b834b3bbb93a03a16a8842537048c71e861 Mon Sep 17 00:00:00 2001 From: Sungjae Lee <33976427+llsj14@users.noreply.github.com> Date: Fri, 13 Dec 2024 09:34:43 +0000 Subject: [PATCH 10/14] retrigger test Signed-off-by: Sungjae Lee 
<33976427+llsj14@users.noreply.github.com> From e1d7d7a326fd2fe4c2c70897763b8f7bffa54db4 Mon Sep 17 00:00:00 2001 From: Sungjae Lee <33976427+llsj14@users.noreply.github.com> Date: Sat, 14 Dec 2024 01:22:34 +0000 Subject: [PATCH 11/14] add comment Signed-off-by: Sungjae Lee <33976427+llsj14@users.noreply.github.com> --- vllm/core/evictor.py | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/vllm/core/evictor.py b/vllm/core/evictor.py index 5e87e6deaf382..f7886846301dc 100644 --- a/vllm/core/evictor.py +++ b/vllm/core/evictor.py @@ -76,6 +76,9 @@ class LRUEvictor(Evictor): highest num_hashed_tokens value, then one will be chose arbitrarily """ + # CLEANUP_THRESHOLD determines the maximum allowable size of the priority queue + # relative to the free table size. When this threshold is exceeded, a cleanup + # operation is triggered to reduce memory usage. CLEANUP_THRESHOLD = 50 def __init__(self): @@ -90,7 +93,9 @@ def evict(self) -> Tuple[int, int]: raise ValueError("No usable cache memory left") while self.priority_queue: - # Lazy deletion algorithm is applied. + # Lazy deletion algorithm is applied here. We do not remove outdated + # entries from the priority queue at the time of updating the last_accessed + # timestamp. Instead, outdated entries are filtered out during eviction. 
last_accessed, _, block_id, content_hash = heapq.heappop( self.priority_queue) if (block_id in self.free_table and From 0d554e48565d76940dab5a7372d344ab758a7cc1 Mon Sep 17 00:00:00 2001 From: Sungjae Lee <33976427+llsj14@users.noreply.github.com> Date: Sat, 14 Dec 2024 01:37:03 +0000 Subject: [PATCH 12/14] make format Signed-off-by: Sungjae Lee <33976427+llsj14@users.noreply.github.com> --- vllm/core/evictor.py | 11 ++++++----- 1 file changed, 6 insertions(+), 5 deletions(-) diff --git a/vllm/core/evictor.py b/vllm/core/evictor.py index f7886846301dc..be17845a9a607 100644 --- a/vllm/core/evictor.py +++ b/vllm/core/evictor.py @@ -76,9 +76,9 @@ class LRUEvictor(Evictor): highest num_hashed_tokens value, then one will be chose arbitrarily """ - # CLEANUP_THRESHOLD determines the maximum allowable size of the priority queue - # relative to the free table size. When this threshold is exceeded, a cleanup - # operation is triggered to reduce memory usage. + # CLEANUP_THRESHOLD determines the maximum allowable size of the priority + # queue relative to the free table size. When this threshold is exceeded, + # a cleanup operation is triggered to reduce memory usage. CLEANUP_THRESHOLD = 50 def __init__(self): @@ -94,8 +94,9 @@ def evict(self) -> Tuple[int, int]: while self.priority_queue: # Lazy deletion algorithm is applied here. We do not remove outdated - # entries from the priority queue at the time of updating the last_accessed - # timestamp. Instead, outdated entries are filtered out during eviction. + # entries from the priority queue at the time of updating the + # last_accessed timestamp. + # Instead, outdated entries are filtered out during eviction. 
last_accessed, _, block_id, content_hash = heapq.heappop( self.priority_queue) if (block_id in self.free_table and From b92306084c20c348b8663fc11d69e4cb13b30f3f Mon Sep 17 00:00:00 2001 From: Sungjae Lee <33976427+llsj14@users.noreply.github.com> Date: Sat, 14 Dec 2024 10:57:14 +0900 Subject: [PATCH 13/14] update comments Co-authored-by: Cody Yu Signed-off-by: Sungjae Lee <33976427+llsj14@users.noreply.github.com> --- vllm/core/evictor.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/vllm/core/evictor.py b/vllm/core/evictor.py index be17845a9a607..56f63df63be7a 100644 --- a/vllm/core/evictor.py +++ b/vllm/core/evictor.py @@ -93,10 +93,10 @@ def evict(self) -> Tuple[int, int]: raise ValueError("No usable cache memory left") while self.priority_queue: - # Lazy deletion algorithm is applied here. We do not remove outdated - # entries from the priority queue at the time of updating the - # last_accessed timestamp. - # Instead, outdated entries are filtered out during eviction. + # We do not remove outdated entries from the priority queue at the + # time of updating the last_accessed timestamp. Instead, outdated + # entries are filtered out here during eviction. Outdated entries would + # either not in the free table, or have older last accessed time. 
last_accessed, _, block_id, content_hash = heapq.heappop( self.priority_queue) if (block_id in self.free_table and From 46798adcb84f4618d1173b16c764dcbc2f28bc7a Mon Sep 17 00:00:00 2001 From: Sungjae Lee <33976427+llsj14@users.noreply.github.com> Date: Sat, 14 Dec 2024 01:59:05 +0000 Subject: [PATCH 14/14] make format Signed-off-by: Sungjae Lee <33976427+llsj14@users.noreply.github.com> --- vllm/core/evictor.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/vllm/core/evictor.py b/vllm/core/evictor.py index 56f63df63be7a..44adc4158abec 100644 --- a/vllm/core/evictor.py +++ b/vllm/core/evictor.py @@ -95,8 +95,9 @@ def evict(self) -> Tuple[int, int]: while self.priority_queue: # We do not remove outdated entries from the priority queue at the # time of updating the last_accessed timestamp. Instead, outdated - # entries are filtered out here during eviction. Outdated entries would - # either not in the free table, or have older last accessed time. + # entries are filtered out here during eviction. Outdated entries + # would either not be in the free table, or have an older last + # accessed time. last_accessed, _, block_id, content_hash = heapq.heappop( self.priority_queue) if (block_id in self.free_table and