Skip to content
This repository has been archived by the owner on Apr 26, 2024. It is now read-only.

Make DictionaryCache have better expiry properties #13292

Merged
merged 42 commits into from
Jul 21, 2022
Merged
Show file tree
Hide file tree
Changes from 16 commits
Commits
Show all changes
42 commits
Select commit Hold shift + click to select a range
326a175
Make DictionaryCache have better expiry properties
erikjohnston Jul 15, 2022
40a8fba
Newsfile
erikjohnston Jul 15, 2022
a22716c
Fix literal
erikjohnston Jul 15, 2022
f046366
Fix test
erikjohnston Jul 15, 2022
602a81f
don't update access
erikjohnston Jul 15, 2022
23c2f39
Fix mypy
erikjohnston Jul 15, 2022
cad555f
Better stuff
erikjohnston Jul 15, 2022
7aceec3
Fix up
erikjohnston Jul 15, 2022
057ae8b
Comments
erikjohnston Jul 17, 2022
7f7b36d
Comment LruCache
erikjohnston Jul 18, 2022
462db2a
Comment LruCache
erikjohnston Jul 18, 2022
129691f
Comment TreeCache
erikjohnston Jul 18, 2022
45d0dce
Merge branch 'develop' into erikj/dict_cache
erikjohnston Jul 18, 2022
ca8e1af
Update synapse/util/caches/dictionary_cache.py
erikjohnston Jul 19, 2022
e4723df
Update synapse/util/caches/lrucache.py
erikjohnston Jul 19, 2022
a74baa6
Split out code to separate method
erikjohnston Jul 19, 2022
a9ebcd2
Mark DictionaryEntry as frozen
erikjohnston Jul 19, 2022
6a76dba
Don't reuse vars
erikjohnston Jul 19, 2022
d4133b2
Add example
erikjohnston Jul 19, 2022
88aa56c
Make `LruCache.get_multi` return something sane.
erikjohnston Jul 19, 2022
f053edb
Woo comments
erikjohnston Jul 20, 2022
740fe2f
Remove use of `cache_key`
erikjohnston Jul 20, 2022
378aec5
More comments
erikjohnston Jul 20, 2022
5709037
Support values being removed from dict
erikjohnston Jul 20, 2022
b376618
Fixup lint
erikjohnston Jul 20, 2022
12e14f2
Make `missing` a list
erikjohnston Jul 20, 2022
fed7755
Don't iterate twice over `dict_key`
erikjohnston Jul 20, 2022
c9f13f3
Clarify comment
erikjohnston Jul 20, 2022
67bb06d
Use iterable rather than generator
erikjohnston Jul 20, 2022
2ec4cab
Add example
erikjohnston Jul 20, 2022
50bb901
Add doc to TreeCache.get
erikjohnston Jul 20, 2022
aa203c6
Note that if full is True known_absent must be empty
erikjohnston Jul 20, 2022
2151474
When fetching full dict don't return partial
erikjohnston Jul 20, 2022
43e0030
Fix test, set takes an iterable
erikjohnston Jul 20, 2022
3c23161
Add simple test for invalidation
erikjohnston Jul 20, 2022
e5ef14d
Fix test now we don't return partial dicts
erikjohnston Jul 21, 2022
10fb0d0
Woo comments
erikjohnston Jul 21, 2022
d521be2
Comment if dict_keys is None
erikjohnston Jul 21, 2022
3c84a98
Document update doesn't invalidate
erikjohnston Jul 21, 2022
9c54881
Update synapse/util/caches/dictionary_cache.py
erikjohnston Jul 21, 2022
daa2741
Flesh out return value
erikjohnston Jul 21, 2022
055a5dd
Lint
erikjohnston Jul 21, 2022
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
85 changes: 42 additions & 43 deletions synapse/util/caches/dictionary_cache.py
Original file line number Diff line number Diff line change
Expand Up @@ -39,6 +39,8 @@
class DictionaryEntry: # should be: Generic[DKT, DV].
"""Returned when getting an entry from the cache

If `full` is true then `known_absent` will be the empty set.

Attributes:
full: Whether the cache has the full dict or just some keys.
If not full then not all requested keys will necessarily be present
Expand Down Expand Up @@ -86,10 +88,33 @@ def __len__(self) -> int:
class DictionaryCache(Generic[KT, DKT, DV]):
"""Caches key -> dictionary lookups, supporting caching partial dicts, i.e.
fetching a subset of dictionary keys for a particular key.

This cache has two levels of key. First there is the "cache key" (of type
`KT`), which maps to a dict. The keys to that dict are the "dict key" (of
type `DKT`). The overall structure is therefore `KT->DKT->DV`. For
example, it might look like:

{
1: { 1: "a", 2: "b" },
2: { 1: "c" },
}

It is possible to look up either individual dict keys, or the *complete*
dict for a given cache key.

Each dict item, and the complete dict itself, is treated as a separate
LRU entry for the purpose of cache expiry. For example, given:
dict_cache.get(1, None) -> DictionaryEntry({1: "a", 2: "b"})
dict_cache.get(1, [1]) -> DictionaryEntry({1: "a"})
dict_cache.get(1, [2]) -> DictionaryEntry({2: "b"})

... then the cache entry for the complete dict will expire first,
followed by the cache entry for the '1' dict key, and finally that
for the '2' dict key.
"""

def __init__(self, name: str, max_entries: int = 1000):
# We use a single cache to cache two different types of entries:
# We use a single LruCache to store two different types of entries:
# 1. Map from (key, dict_key) -> dict value (or sentinel, indicating
# the key doesn't exist in the dict); and
# 2. Map from (key, _FullCacheKey.KEY) -> full dict.
Expand Down Expand Up @@ -145,21 +170,21 @@ def get(
Returns:
DictionaryEntry
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

this doesn't document what happens if dict_keys is None, but we don't have the full dict cached. Could it, please?

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

... please?

I think it returns some fixed shape of DictionaryEntry ?

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Ah, I'd put some docs under the dict_keys arg (and there is some in DictionaryEntry), but have also added some in the return value too.

"""

if dict_keys is None:
# The caller wants the full set of dictionary keys for this cache key
return self._get_full_dict(key)
erikjohnston marked this conversation as resolved.
Show resolved Hide resolved

# We are being asked for a subset of keys.

# First got and check for each requested dict key in the cache, tracking
# First go and check for each requested dict key in the cache, tracking
# which we couldn't find.
values = {}
known_absent = set()
missing = set()
missing = []
for dict_key in dict_keys:
entry = self.cache.get((key, dict_key), _Sentinel.sentinel)
if entry is _Sentinel.sentinel:
missing.add(dict_key)
missing.append(dict_key)
continue

assert isinstance(entry, _PerKeyValue)
Expand All @@ -173,7 +198,7 @@ def get(
if not missing:
return DictionaryEntry(False, known_absent, values)

# If we are missing any keys check if we happen to have the full dict in
# We are missing some keys, so check if we happen to have the full dict in
# the cache.
#
# We don't update the last access time for this cache fetch, as we
Expand All @@ -191,10 +216,9 @@ def get(
# We have the full dict!
assert isinstance(entry, dict)

values = {}
for dict_key in dict_keys:
for dict_key in missing:
# We explicitly add each dict key to the cache, so that cache hit
# rates for each key can be tracked separately.
# rates and LRU times for each key can be tracked separately.
value = entry.get(dict_key, _Sentinel.sentinel) # type: ignore[arg-type]
self.cache[(key, dict_key)] = _PerKeyValue(value)

Expand All @@ -215,36 +239,7 @@ def _get_full_dict(
assert isinstance(entry, dict)
return DictionaryEntry(True, set(), entry)

# If not, check if we have cached any dict keys at all for this cache
# key.
all_entries = self.cache.get_multi(
(key,),
_Sentinel.sentinel,
)
if all_entries is _Sentinel.sentinel:
return DictionaryEntry(False, set(), {})

# If there are entries we need to unwrap the returned cache nodes
# and `_PerKeyValue` into the `DictionaryEntry`.
values = {}
known_absent = set()
for cache_key, dict_value in all_entries:
# The key used for the `TreeCache` is `(key, dict_key)`
dict_key = cache_key[1]

# We have explicitly looked for a full cache key, so we
# shouldn't see one.
assert dict_key != _FullCacheKey.KEY

# ... therefore the values must be `_PerKeyValue`
assert isinstance(dict_value, _PerKeyValue)

if dict_value.value is _Sentinel.sentinel:
known_absent.add(dict_key)
else:
values[dict_key] = dict_value.value

return DictionaryEntry(False, known_absent, values)
return DictionaryEntry(False, set(), {})

def invalidate(self, key: KT) -> None:
self.check_thread()
Expand All @@ -253,7 +248,11 @@ def invalidate(self, key: KT) -> None:
# raced with the INSERT don't update the cache (SYN-369)
self.sequence += 1

# Del-multi accepts truncated tuples.
# We want to drop all information about the dict for the given key, so
# we use `del_multi` to delete it all in one go.
#
# We ignore the type error here: `del_multi` accepts a truncated key
# (when the key type is a tuple).
erikjohnston marked this conversation as resolved.
Show resolved Hide resolved
self.cache.del_multi((key,)) # type: ignore[arg-type]

def invalidate_all(self) -> None:
Expand Down Expand Up @@ -296,17 +295,17 @@ def _update_subset(
"""Add the given dictionary values as explicit keys in the cache.

Args:
key
key: top-level cache key
value: The dictionary with all the values that we should cache
fetched_keys: The full set of keys that were looked up, any keys
fetched_keys: The full set of dict keys that were looked up. Any keys
here not in `value` should be marked as "known absent".
"""

for dict_key, dict_value in value.items():
self.cache[(key, dict_key)] = _PerKeyValue(dict_value)

for dict_key in fetched_keys:
if (key, dict_key) in self.cache:
if dict_key in value:
continue

self.cache[(key, dict_key)] = _PerKeyValue(_Sentinel.sentinel)
20 changes: 15 additions & 5 deletions synapse/util/caches/lrucache.py
Original file line number Diff line number Diff line change
Expand Up @@ -24,8 +24,8 @@
Callable,
Collection,
Dict,
Generator,
Generic,
Iterable,
List,
Optional,
Tuple,
Expand Down Expand Up @@ -598,27 +598,37 @@ def cache_get_multi(
key: tuple,
default: Literal[None] = None,
update_metrics: bool = True,
) -> Union[None, Generator[Tuple[KT, VT], None, None]]:
) -> Union[None, Iterable[Tuple[KT, VT]]]:
...

@overload
def cache_get_multi(
key: tuple,
default: T,
update_metrics: bool = True,
) -> Union[T, Generator[Tuple[KT, VT], None, None]]:
) -> Union[T, Iterable[Tuple[KT, VT]]]:
...

@synchronized
def cache_get_multi(
key: tuple,
default: Optional[T] = None,
update_metrics: bool = True,
) -> Union[None, T, Generator[Tuple[KT, VT], None, None]]:
) -> Union[None, T, Iterable[Tuple[KT, VT]]]:
"""Returns a generator yielding all entries under the given key.

Can only be used if backed by a tree cache.
erikjohnston marked this conversation as resolved.
Show resolved Hide resolved

Example:

cache = LruCache(10, cache_type=TreeCache)
cache[(1, 1)] = "a"
cache[(1, 2)] = "b"
cache[(2, 1)] = "c"

items = cache.get_multi((1,))
assert list(items) == [((1, 1), "a"), ((1, 2), "b")]

Returns:
Either default if the key doesn't exist, or a generator of the
key/value pairs.
Expand All @@ -631,7 +641,7 @@ def cache_get_multi(
if update_metrics and metrics:
metrics.inc_hits()

# Iterating over the node will return values of type `_Node`,
# We store entries in the `TreeCache` with values of type `_Node`,
# which we need to unwrap.
return (
(full_key, lru_node.value)
Expand Down
10 changes: 10 additions & 0 deletions synapse/util/caches/treecache.py
Original file line number Diff line number Diff line change
Expand Up @@ -64,6 +64,15 @@ def set(self, key, value) -> None:
self.size += 1

def get(self, key, default=None):
"""When `key` is a full key, fetches the value for the given key (if
any).

If `key` is only a partial key (i.e. a truncated tuple) then returns a
`TreeCacheNode`, which can be passed to the `iterate_tree_cache_*`
functions to iterate over all values in the cache with keys that start
erikjohnston marked this conversation as resolved.
Show resolved Hide resolved
with the given partial key.
"""

node = self.root
for k in key[:-1]:
node = node.get(k, None)
Expand Down Expand Up @@ -166,4 +175,5 @@ def iterate_tree_cache_items(key, value):
for sub_key, sub_value in value.items():
yield from iterate_tree_cache_items((*key, sub_key), sub_value)
else:
# we've reached a leaf of the tree.
yield key, value
erikjohnston marked this conversation as resolved.
Show resolved Hide resolved
8 changes: 4 additions & 4 deletions tests/storage/test_state.py
Original file line number Diff line number Diff line change
Expand Up @@ -370,7 +370,7 @@ def test_get_state_for_event(self):

self.assertEqual(cache_entry.full, False)
self.assertEqual(cache_entry.known_absent, set())
richvdh marked this conversation as resolved.
Show resolved Hide resolved
self.assertDictEqual(state_dict_ids, {(e1.type, e1.state_key): e1.event_id})
self.assertDictEqual(state_dict_ids, {})

############################################
# test that things work with a partial cache
Expand All @@ -387,7 +387,7 @@ def test_get_state_for_event(self):
)

self.assertEqual(is_all, False)
self.assertDictEqual({(e1.type, e1.state_key): e1.event_id}, state_dict)
self.assertDictEqual({}, state_dict)

room_id = self.room.to_string()
(state_dict, is_all,) = self.state_datastore._get_state_for_group_using_cache(
Expand All @@ -412,7 +412,7 @@ def test_get_state_for_event(self):
)

self.assertEqual(is_all, False)
self.assertDictEqual({(e1.type, e1.state_key): e1.event_id}, state_dict)
self.assertDictEqual({}, state_dict)

(state_dict, is_all,) = self.state_datastore._get_state_for_group_using_cache(
self.state_datastore._state_group_members_cache,
Expand Down Expand Up @@ -443,7 +443,7 @@ def test_get_state_for_event(self):
)

self.assertEqual(is_all, False)
self.assertDictEqual({(e1.type, e1.state_key): e1.event_id}, state_dict)
self.assertDictEqual({}, state_dict)

(state_dict, is_all,) = self.state_datastore._get_state_for_group_using_cache(
self.state_datastore._state_group_members_cache,
Expand Down
32 changes: 28 additions & 4 deletions tests/util/test_dict_cache.py
Original file line number Diff line number Diff line change
Expand Up @@ -20,7 +20,7 @@

class DictCacheTestCase(unittest.TestCase):
def setUp(self):
self.cache = DictionaryCache("foobar")
self.cache = DictionaryCache("foobar", max_entries=10)

def test_simple_cache_hit_full(self):
key = "test_simple_cache_hit_full"
Expand Down Expand Up @@ -76,17 +76,41 @@ def test_multi_insert(self):

seq = self.cache.sequence
test_value_1 = {"test": "test_simple_cache_hit_miss_partial"}
self.cache.update(seq, key, test_value_1, fetched_keys=set("test"))
self.cache.update(seq, key, test_value_1, fetched_keys={"test"})

seq = self.cache.sequence
test_value_2 = {"test2": "test_simple_cache_hit_miss_partial2"}
self.cache.update(seq, key, test_value_2, fetched_keys=set("test2"))
self.cache.update(seq, key, test_value_2, fetched_keys={"test2"})

c = self.cache.get(key)
c = self.cache.get(key, dict_keys=["test", "test2"])
self.assertEqual(
{
"test": "test_simple_cache_hit_miss_partial",
"test2": "test_simple_cache_hit_miss_partial2",
},
c.value,
)
self.assertEqual(c.full, False)

def test_invalidation(self):
    """Check that the full-dict entry and the per-key entries expire
    independently of one another.
    """
    key = "some_key"

    seq = self.cache.sequence
    self.cache.update(seq, key, {"a": "b", "c": "d"})

    # Keep touching the "a" entry while churning the cache with other
    # keys, so every other entry for `key` ages out of the LRU.
    for n in range(20):
        self.cache.get(key, ["a"])
        self.cache.update(seq, f"key{n}", {1: 2})

    # The full dict should have been evicted by now...
    full_result = self.cache.get(key)
    self.assertFalse(full_result.full)
    self.assertTrue("c" not in full_result.value)

    # ... whereas the frequently-queried "a" entry should survive.
    partial_result = self.cache.get(key, dict_keys=["a"])
    self.assertFalse(partial_result.full)
    self.assertEqual(partial_result.value, {"a": "b"})