-
-
Notifications
You must be signed in to change notification settings - Fork 2.1k
Make DictionaryCache have better expiry properties #13292
Changes from 12 commits
326a175
40a8fba
a22716c
f046366
602a81f
23c2f39
cad555f
7aceec3
057ae8b
7f7b36d
462db2a
129691f
45d0dce
ca8e1af
e4723df
a74baa6
a9ebcd2
6a76dba
d4133b2
88aa56c
f053edb
740fe2f
378aec5
5709037
b376618
12e14f2
fed7755
c9f13f3
67bb06d
2ec4cab
50bb901
aa203c6
2151474
43e0030
3c23161
e5ef14d
10fb0d0
d521be2
3c84a98
9c54881
daa2741
055a5dd
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1 @@ | ||
Make `DictionaryCache` expire full entries if they haven't been queried in a while, even if specific keys have been queried recently. |
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -14,11 +14,13 @@ | |
import enum | ||
import logging | ||
import threading | ||
from typing import Any, Dict, Generic, Iterable, Optional, Set, TypeVar | ||
from typing import Any, Dict, Generic, Iterable, Optional, Set, Tuple, TypeVar, Union | ||
|
||
import attr | ||
from typing_extensions import Literal | ||
|
||
from synapse.util.caches.lrucache import LruCache | ||
from synapse.util.caches.treecache import TreeCache, iterate_tree_cache_items | ||
|
||
logger = logging.getLogger(__name__) | ||
|
||
|
@@ -53,20 +55,67 @@ def __len__(self) -> int: | |
return len(self.value) | ||
|
||
|
||
class _FullCacheKey(enum.Enum): | ||
"""The key we use to cache the full dict.""" | ||
|
||
KEY = object() | ||
|
||
|
||
class _Sentinel(enum.Enum): | ||
# defining a sentinel in this way allows mypy to correctly handle the | ||
# type of a dictionary lookup. | ||
sentinel = object() | ||
|
||
|
||
class _PerKeyValue(Generic[DV]): | ||
erikjohnston marked this conversation as resolved.
Show resolved
Hide resolved
|
||
"""The cached value of a dictionary key. If `value` is the sentinel, | ||
indicates that the requested key is known to *not* be in the full dict. | ||
""" | ||
|
||
__slots__ = ["value"] | ||
|
||
def __init__(self, value: Union[DV, Literal[_Sentinel.sentinel]]) -> None: | ||
self.value = value | ||
|
||
def __len__(self) -> int: | ||
# We add a `__len__` implementation as we use this class in a cache | ||
# where the values are variable length. | ||
return 1 | ||
|
||
|
||
class DictionaryCache(Generic[KT, DKT, DV]): | ||
"""Caches key -> dictionary lookups, supporting caching partial dicts, i.e. | ||
fetching a subset of dictionary keys for a particular key. | ||
""" | ||
erikjohnston marked this conversation as resolved.
Show resolved
Hide resolved
|
||
|
||
def __init__(self, name: str, max_entries: int = 1000): | ||
self.cache: LruCache[KT, DictionaryEntry] = LruCache( | ||
max_size=max_entries, cache_name=name, size_callback=len | ||
# We use a single cache to cache two different types of entries: | ||
erikjohnston marked this conversation as resolved.
Show resolved
Hide resolved
|
||
# 1. Map from (key, dict_key) -> dict value (or sentinel, indicating | ||
# the key doesn't exist in the dict); and | ||
# 2. Map from (key, _FullCacheKey.KEY) -> full dict. | ||
# | ||
# The former is used when explicit keys of the dictionary are looked up, | ||
# and the latter when the full dictionary is requested. | ||
# | ||
# If when explicit keys are requested and not in the cache, we then look | ||
# to see if we have the full dict and use that if we do. If found in the | ||
# full dict each key is added into the cache. | ||
# | ||
# This set up allows the `LruCache` to prune the full dict entries if | ||
# they haven't been used in a while, even when there have been recent | ||
# queries for subsets of the dict. | ||
# | ||
# Typing: | ||
# * A key of `(KT, DKT)` has a value of `_PerKeyValue` | ||
# * A key of `(KT, _FullCacheKey.KEY)` has a value of `Dict[DKT, DV]` | ||
self.cache: LruCache[ | ||
Tuple[KT, Union[DKT, Literal[_FullCacheKey.KEY]]], | ||
Union[_PerKeyValue, Dict[DKT, DV]], | ||
] = LruCache( | ||
max_size=max_entries, | ||
cache_name=name, | ||
cache_type=TreeCache, | ||
size_callback=len, | ||
) | ||
|
||
self.name = name | ||
|
@@ -96,28 +145,107 @@ def get( | |
Returns: | ||
DictionaryEntry | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. this doesn't document what happens if There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. ... please? I think it returns some fixed shape of DictionaryEntry ? There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Ah, I'd put some docs under the |
||
""" | ||
entry = self.cache.get(key, _Sentinel.sentinel) | ||
if entry is not _Sentinel.sentinel: | ||
if dict_keys is None: | ||
return DictionaryEntry( | ||
entry.full, entry.known_absent, dict(entry.value) | ||
) | ||
|
||
erikjohnston marked this conversation as resolved.
Show resolved
Hide resolved
|
||
if dict_keys is None: | ||
# First we check if we have cached the full dict. | ||
entry = self.cache.get((key, _FullCacheKey.KEY), _Sentinel.sentinel) | ||
if entry is not _Sentinel.sentinel: | ||
assert isinstance(entry, dict) | ||
return DictionaryEntry(True, set(), entry) | ||
|
||
# If not, check if we have cached any of dict keys. | ||
erikjohnston marked this conversation as resolved.
Show resolved
Hide resolved
|
||
all_entries = self.cache.get_multi( | ||
(key,), | ||
_Sentinel.sentinel, | ||
) | ||
if all_entries is _Sentinel.sentinel: | ||
return DictionaryEntry(False, set(), {}) | ||
|
||
# If there are entries we need to unwrap the returned cache nodes | ||
# and `_PerKeyValue` into the `DictionaryEntry`. | ||
values = {} | ||
known_absent = set() | ||
for dict_key, dict_value in iterate_tree_cache_items((), all_entries): | ||
dict_key = dict_key[0] | ||
erikjohnston marked this conversation as resolved.
Show resolved
Hide resolved
|
||
dict_value = dict_value.value | ||
richvdh marked this conversation as resolved.
Show resolved
Hide resolved
|
||
|
||
# We have explicitly looked for a full cache key, so we | ||
# shouldn't see one. | ||
assert dict_key != _FullCacheKey.KEY | ||
|
||
# ... therefore the values must be `_PerKeyValue` | ||
assert isinstance(dict_value, _PerKeyValue) | ||
|
||
if dict_value.value is _Sentinel.sentinel: | ||
known_absent.add(dict_key) | ||
else: | ||
values[dict_key] = dict_value.value | ||
|
||
return DictionaryEntry(False, known_absent, values) | ||
erikjohnston marked this conversation as resolved.
Show resolved
Hide resolved
|
||
|
||
# We are being asked for a subset of keys. | ||
|
||
# First got and check for each requested dict key in the cache, tracking | ||
erikjohnston marked this conversation as resolved.
Show resolved
Hide resolved
|
||
# which we couldn't find. | ||
values = {} | ||
known_absent = set() | ||
missing = set() | ||
erikjohnston marked this conversation as resolved.
Show resolved
Hide resolved
|
||
for dict_key in dict_keys: | ||
entry = self.cache.get((key, dict_key), _Sentinel.sentinel) | ||
if entry is _Sentinel.sentinel: | ||
missing.add(dict_key) | ||
continue | ||
|
||
assert isinstance(entry, _PerKeyValue) | ||
|
||
if entry.value is _Sentinel.sentinel: | ||
known_absent.add(dict_key) | ||
else: | ||
return DictionaryEntry( | ||
entry.full, | ||
entry.known_absent, | ||
{k: entry.value[k] for k in dict_keys if k in entry.value}, | ||
) | ||
values[dict_key] = entry.value | ||
|
||
# If we found everything we can return immediately. | ||
if not missing: | ||
return DictionaryEntry(False, known_absent, values) | ||
|
||
# If we are missing any keys check if we happen to have the full dict in | ||
erikjohnston marked this conversation as resolved.
Show resolved
Hide resolved
|
||
# the cache. | ||
# | ||
# We don't update the last access time for this cache fetch, as we | ||
# aren't explicitly interested in the full dict and so we don't want | ||
# requests for explicit dict keys to keep the full dict in the cache. | ||
entry = self.cache.get( | ||
(key, _FullCacheKey.KEY), | ||
_Sentinel.sentinel, | ||
update_last_access=False, | ||
) | ||
if entry is _Sentinel.sentinel: | ||
# Not in the cache, return the subset of keys we found. | ||
return DictionaryEntry(False, known_absent, values) | ||
|
||
# We have the full dict! | ||
assert isinstance(entry, dict) | ||
|
||
return DictionaryEntry(False, set(), {}) | ||
values = {} | ||
for dict_key in dict_keys: | ||
erikjohnston marked this conversation as resolved.
Show resolved
Hide resolved
|
||
# We explicitly add each dict key to the cache, so that cache hit | ||
# rates for each key can be tracked separately. | ||
erikjohnston marked this conversation as resolved.
Show resolved
Hide resolved
|
||
value = entry.get(dict_key, _Sentinel.sentinel) # type: ignore[arg-type] | ||
self.cache[(key, dict_key)] = _PerKeyValue(value) | ||
|
||
if value is not _Sentinel.sentinel: | ||
values[dict_key] = value | ||
|
||
return DictionaryEntry(True, set(), values) | ||
erikjohnston marked this conversation as resolved.
Show resolved
Hide resolved
|
||
|
||
def invalidate(self, key: KT) -> None: | ||
self.check_thread() | ||
|
||
# Increment the sequence number so that any SELECT statements that | ||
# raced with the INSERT don't update the cache (SYN-369) | ||
self.sequence += 1 | ||
self.cache.pop(key, None) | ||
|
||
# Del-multi accepts truncated tuples. | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I don't know what this means. Is it telling me why it's better to use |
||
self.cache.del_multi((key,)) # type: ignore[arg-type] | ||
|
||
def invalidate_all(self) -> None: | ||
self.check_thread() | ||
|
@@ -149,20 +277,27 @@ def update( | |
# Only update the cache if the caches sequence number matches the | ||
# number that the cache had before the SELECT was started (SYN-369) | ||
if fetched_keys is None: | ||
self._insert(key, value, set()) | ||
self.cache[(key, _FullCacheKey.KEY)] = value | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. do we not need to update (or at least invalidate) entries for individual dict keys here as well? There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I don't think so, if the value in the DB has changed then we should have previously called There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. fair enough. That seems like an assumption that we should document in the docstring though. (Also: shouldn't this method be called There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more.
Yup, will try and put up a separate PR to do some of these fixes if they're small. |
||
else: | ||
self._update_or_insert(key, value, fetched_keys) | ||
self._update_subset(key, value, fetched_keys) | ||
|
||
def _update_or_insert( | ||
self, key: KT, value: Dict[DKT, DV], known_absent: Iterable[DKT] | ||
def _update_subset( | ||
self, key: KT, value: Dict[DKT, DV], fetched_keys: Iterable[DKT] | ||
) -> None: | ||
# We pop and reinsert as we need to tell the cache the size may have | ||
# changed | ||
"""Add the given dictionary values as explicit keys in the cache. | ||
|
||
Args: | ||
key | ||
erikjohnston marked this conversation as resolved.
Show resolved
Hide resolved
|
||
value: The dictionary with all the values that we should cache | ||
fetched_keys: The full set of keys that were looked up, any keys | ||
erikjohnston marked this conversation as resolved.
Show resolved
Hide resolved
|
||
here not in `value` should be marked as "known absent". | ||
""" | ||
|
||
for dict_key, dict_value in value.items(): | ||
self.cache[(key, dict_key)] = _PerKeyValue(dict_value) | ||
|
||
entry: DictionaryEntry = self.cache.pop(key, DictionaryEntry(False, set(), {})) | ||
entry.value.update(value) | ||
entry.known_absent.update(known_absent) | ||
self.cache[key] = entry | ||
for dict_key in fetched_keys: | ||
if (key, dict_key) in self.cache: | ||
continue | ||
erikjohnston marked this conversation as resolved.
Show resolved
Hide resolved
|
||
|
||
def _insert(self, key: KT, value: Dict[DKT, DV], known_absent: Set[DKT]) -> None: | ||
self.cache[key] = DictionaryEntry(True, known_absent, value) | ||
self.cache[(key, dict_key)] = _PerKeyValue(_Sentinel.sentinel) |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
ideally we'd separate this to a different PR. It's not directly related to the changes to DictionaryCache.