Skip to content
This repository has been archived by the owner on Apr 26, 2024. It is now read-only.

Make DictionaryCache have better expiry properties #13292

Merged
merged 42 commits into from
Jul 21, 2022
Merged
Show file tree
Hide file tree
Changes from 16 commits
Commits
Show all changes
42 commits
Select commit Hold shift + click to select a range
326a175
Make DictionaryCache have better expiry properties
erikjohnston Jul 15, 2022
40a8fba
Newsfile
erikjohnston Jul 15, 2022
a22716c
Fix literal
erikjohnston Jul 15, 2022
f046366
Fix test
erikjohnston Jul 15, 2022
602a81f
don't update access
erikjohnston Jul 15, 2022
23c2f39
Fix mypy
erikjohnston Jul 15, 2022
cad555f
Better stuff
erikjohnston Jul 15, 2022
7aceec3
Fix up
erikjohnston Jul 15, 2022
057ae8b
Comments
erikjohnston Jul 17, 2022
7f7b36d
Comment LruCache
erikjohnston Jul 18, 2022
462db2a
Comment LruCache
erikjohnston Jul 18, 2022
129691f
Comment TreeCache
erikjohnston Jul 18, 2022
45d0dce
Merge branch 'develop' into erikj/dict_cache
erikjohnston Jul 18, 2022
ca8e1af
Update synapse/util/caches/dictionary_cache.py
erikjohnston Jul 19, 2022
e4723df
Update synapse/util/caches/lrucache.py
erikjohnston Jul 19, 2022
a74baa6
Split out code to separate method
erikjohnston Jul 19, 2022
a9ebcd2
Mark DictionaryEntry as frozen
erikjohnston Jul 19, 2022
6a76dba
Don't reuse vars
erikjohnston Jul 19, 2022
d4133b2
Add example
erikjohnston Jul 19, 2022
88aa56c
Make `LruCache.get_multi` return something sane.
erikjohnston Jul 19, 2022
f053edb
Woo comments
erikjohnston Jul 20, 2022
740fe2f
Remove use of `cache_key`
erikjohnston Jul 20, 2022
378aec5
More comments
erikjohnston Jul 20, 2022
5709037
Support values being removed from dict
erikjohnston Jul 20, 2022
b376618
Fixup lint
erikjohnston Jul 20, 2022
12e14f2
Make `missing` a list
erikjohnston Jul 20, 2022
fed7755
Don't iterate twice over `dict_key`
erikjohnston Jul 20, 2022
c9f13f3
Clarify comment
erikjohnston Jul 20, 2022
67bb06d
Use iterable rather than generator
erikjohnston Jul 20, 2022
2ec4cab
Add example
erikjohnston Jul 20, 2022
50bb901
Add doc to TreeCache.get
erikjohnston Jul 20, 2022
aa203c6
Note that if full is True known_absent must be empty
erikjohnston Jul 20, 2022
2151474
When fetching full dict don't return partial
erikjohnston Jul 20, 2022
43e0030
Fix test, set takes an iterable
erikjohnston Jul 20, 2022
3c23161
Add simple test for invalidation
erikjohnston Jul 20, 2022
e5ef14d
Fix test now we don't return partial dicts
erikjohnston Jul 21, 2022
10fb0d0
Woo comments
erikjohnston Jul 21, 2022
d521be2
Comment if dict_keys is None
erikjohnston Jul 21, 2022
3c84a98
Document update doesn't invalidate
erikjohnston Jul 21, 2022
9c54881
Update synapse/util/caches/dictionary_cache.py
erikjohnston Jul 21, 2022
daa2741
Flesh out return value
erikjohnston Jul 21, 2022
055a5dd
Lint
erikjohnston Jul 21, 2022
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
85 changes: 42 additions & 43 deletions synapse/util/caches/dictionary_cache.py
Original file line number Diff line number Diff line change
Expand Up @@ -39,6 +39,8 @@
class DictionaryEntry: # should be: Generic[DKT, DV].
"""Returned when getting an entry from the cache

If `full` is true then `known_absent` will be the empty set.

Attributes:
full: Whether the cache has the full dict or just some keys.
If not full then not all requested keys will necessarily be present
Expand Down Expand Up @@ -86,10 +88,33 @@ def __len__(self) -> int:
class DictionaryCache(Generic[KT, DKT, DV]):
"""Caches key -> dictionary lookups, supporting caching partial dicts, i.e.
fetching a subset of dictionary keys for a particular key.

This cache has two levels of key. First there is the "cache key" (of type
`KT`), which maps to a dict. The keys to that dict are the "dict key" (of
type `DKT`). The overall structure is therefore `KT->DKT->DV`. For
example, it might look like:

{
1: { 1: "a", 2: "b" },
2: { 1: "c" },
}

It is possible to look up either individual dict keys, or the *complete*
dict for a given cache key.

Each dict item, and the complete dict itself, is treated as a separate
LRU entry for the purpose of cache expiry. For example, given:
dict_cache.get(1, None) -> DictionaryEntry({1: "a", 2: "b"})
dict_cache.get(1, [1]) -> DictionaryEntry({1: "a"})
dict_cache.get(1, [2]) -> DictionaryEntry({2: "b"})

... then the cache entry for the complete dict will expire first,
followed by the cache entry for the '1' dict key, and finally that
for the '2' dict key.
"""

def __init__(self, name: str, max_entries: int = 1000):
# We use a single cache to cache two different types of entries:
# We use a single LruCache to store two different types of entries:
# 1. Map from (key, dict_key) -> dict value (or sentinel, indicating
# the key doesn't exist in the dict); and
# 2. Map from (key, _FullCacheKey.KEY) -> full dict.
Expand Down Expand Up @@ -145,21 +170,21 @@ def get(
Returns:
DictionaryEntry
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

this doesn't document what happens if dict_keys is None, but we don't have the full dict cached. Could it, please?

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

... please?

I think it returns some fixed shape of DictionaryEntry ?

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Ah, I'd put some docs under the dict_keys arg (and there is some in DictionaryEntry), but have also added some in the return value too.

"""

if dict_keys is None:
# The caller wants the full set of dictionary keys for this cache key
return self._get_full_dict(key)
erikjohnston marked this conversation as resolved.
Show resolved Hide resolved

# We are being asked for a subset of keys.

# First got and check for each requested dict key in the cache, tracking
# First go and check for each requested dict key in the cache, tracking
# which we couldn't find.
values = {}
known_absent = set()
missing = set()
missing = []
for dict_key in dict_keys:
entry = self.cache.get((key, dict_key), _Sentinel.sentinel)
if entry is _Sentinel.sentinel:
missing.add(dict_key)
missing.append(dict_key)
continue

assert isinstance(entry, _PerKeyValue)
Expand All @@ -173,7 +198,7 @@ def get(
if not missing:
return DictionaryEntry(False, known_absent, values)

# If we are missing any keys check if we happen to have the full dict in
# We are missing some keys, so check if we happen to have the full dict in
# the cache.
#
# We don't update the last access time for this cache fetch, as we
Expand All @@ -191,10 +216,9 @@ def get(
# We have the full dict!
assert isinstance(entry, dict)

values = {}
for dict_key in dict_keys:
for dict_key in missing:
# We explicitly add each dict key to the cache, so that cache hit
# rates for each key can be tracked separately.
# rates and LRU times for each key can be tracked separately.
value = entry.get(dict_key, _Sentinel.sentinel) # type: ignore[arg-type]
self.cache[(key, dict_key)] = _PerKeyValue(value)

Expand All @@ -215,36 +239,7 @@ def _get_full_dict(
assert isinstance(entry, dict)
return DictionaryEntry(True, set(), entry)

# If not, check if we have cached any dict keys at all for this cache
# key.
all_entries = self.cache.get_multi(
(key,),
_Sentinel.sentinel,
)
if all_entries is _Sentinel.sentinel:
return DictionaryEntry(False, set(), {})

# If there are entries we need to unwrap the returned cache nodes
# and `_PerKeyValue` into the `DictionaryEntry`.
values = {}
known_absent = set()
for cache_key, dict_value in all_entries:
# The key used for the `TreeCache` is `(key, dict_key)`
dict_key = cache_key[1]

# We have explicitly looked for a full cache key, so we
# shouldn't see one.
assert dict_key != _FullCacheKey.KEY

# ... therefore the values must be `_PerKeyValue`
assert isinstance(dict_value, _PerKeyValue)

if dict_value.value is _Sentinel.sentinel:
known_absent.add(dict_key)
else:
values[dict_key] = dict_value.value

return DictionaryEntry(False, known_absent, values)
return DictionaryEntry(False, set(), {})

def invalidate(self, key: KT) -> None:
self.check_thread()
Expand All @@ -253,7 +248,11 @@ def invalidate(self, key: KT) -> None:
# raced with the INSERT don't update the cache (SYN-369)
self.sequence += 1

# Del-multi accepts truncated tuples.
# We want to drop all information about the dict for the given key, so
# we use `del_multi` to delete it all in one go.
#
# We ignore the type error here: `del_multi` accepts a truncated key
# (when the key type is a tuple).
erikjohnston marked this conversation as resolved.
Show resolved Hide resolved
self.cache.del_multi((key,)) # type: ignore[arg-type]

def invalidate_all(self) -> None:
Expand Down Expand Up @@ -296,17 +295,17 @@ def _update_subset(
"""Add the given dictionary values as explicit keys in the cache.

Args:
key
key: top-level cache key
value: The dictionary with all the values that we should cache
fetched_keys: The full set of keys that were looked up, any keys
fetched_keys: The full set of dict keys that were looked up. Any keys
here not in `value` should be marked as "known absent".
"""

for dict_key, dict_value in value.items():
self.cache[(key, dict_key)] = _PerKeyValue(dict_value)

for dict_key in fetched_keys:
if (key, dict_key) in self.cache:
if dict_key in value:
continue

self.cache[(key, dict_key)] = _PerKeyValue(_Sentinel.sentinel)
20 changes: 15 additions & 5 deletions synapse/util/caches/lrucache.py
Original file line number Diff line number Diff line change
Expand Up @@ -24,8 +24,8 @@
Callable,
Collection,
Dict,
Generator,
Generic,
Iterable,
List,
Optional,
Tuple,
Expand Down Expand Up @@ -598,27 +598,37 @@ def cache_get_multi(
key: tuple,
default: Literal[None] = None,
update_metrics: bool = True,
) -> Union[None, Generator[Tuple[KT, VT], None, None]]:
) -> Union[None, Iterable[Tuple[KT, VT]]]:
...

@overload
def cache_get_multi(
key: tuple,
default: T,
update_metrics: bool = True,
) -> Union[T, Generator[Tuple[KT, VT], None, None]]:
) -> Union[T, Iterable[Tuple[KT, VT]]]:
...

@synchronized
def cache_get_multi(
key: tuple,
default: Optional[T] = None,
update_metrics: bool = True,
) -> Union[None, T, Generator[Tuple[KT, VT], None, None]]:
) -> Union[None, T, Iterable[Tuple[KT, VT]]]:
"""Returns a generator yielding all entries under the given key.

Can only be used if backed by a tree cache.
erikjohnston marked this conversation as resolved.
Show resolved Hide resolved

Example:

cache = LruCache(10, cache_type=TreeCache)
cache[(1, 1)] = "a"
cache[(1, 2)] = "b"
cache[(2, 1)] = "c"

items = cache.get_multi((1,))
assert list(items) == [((1, 1), "a"), ((1, 2), "b")]

Returns:
Either default if the key doesn't exist, or a generator of the
key/value pairs.
Expand All @@ -631,7 +641,7 @@ def cache_get_multi(
if update_metrics and metrics:
metrics.inc_hits()

# Iterating over the node will return values of type `_Node`,
# We store entries in the `TreeCache` with values of type `_Node`,
# which we need to unwrap.
return (
(full_key, lru_node.value)
Expand Down
10 changes: 10 additions & 0 deletions synapse/util/caches/treecache.py
Original file line number Diff line number Diff line change
Expand Up @@ -64,6 +64,15 @@ def set(self, key, value) -> None:
self.size += 1

def get(self, key, default=None):
"""When `key` is a full key, fetches the value for the given key (if
any).

If `key` is only a partial key (i.e. a truncated tuple) then returns a
`TreeCacheNode`, which can be passed to the `iterate_tree_cache_*`
functions to iterate over all values in the cache with keys that start
erikjohnston marked this conversation as resolved.
Show resolved Hide resolved
with the given partial key.
"""

node = self.root
for k in key[:-1]:
node = node.get(k, None)
Expand Down Expand Up @@ -166,4 +175,5 @@ def iterate_tree_cache_items(key, value):
for sub_key, sub_value in value.items():
yield from iterate_tree_cache_items((*key, sub_key), sub_value)
else:
# we've reached a leaf of the tree.
yield key, value
erikjohnston marked this conversation as resolved.
Show resolved Hide resolved
8 changes: 4 additions & 4 deletions tests/storage/test_state.py
Original file line number Diff line number Diff line change
Expand Up @@ -370,7 +370,7 @@ def test_get_state_for_event(self):

self.assertEqual(cache_entry.full, False)
self.assertEqual(cache_entry.known_absent, set())
richvdh marked this conversation as resolved.
Show resolved Hide resolved
self.assertDictEqual(state_dict_ids, {(e1.type, e1.state_key): e1.event_id})
self.assertDictEqual(state_dict_ids, {})

############################################
# test that things work with a partial cache
Expand All @@ -387,7 +387,7 @@ def test_get_state_for_event(self):
)

self.assertEqual(is_all, False)
self.assertDictEqual({(e1.type, e1.state_key): e1.event_id}, state_dict)
self.assertDictEqual({}, state_dict)

room_id = self.room.to_string()
(state_dict, is_all,) = self.state_datastore._get_state_for_group_using_cache(
Expand All @@ -412,7 +412,7 @@ def test_get_state_for_event(self):
)

self.assertEqual(is_all, False)
self.assertDictEqual({(e1.type, e1.state_key): e1.event_id}, state_dict)
self.assertDictEqual({}, state_dict)

(state_dict, is_all,) = self.state_datastore._get_state_for_group_using_cache(
self.state_datastore._state_group_members_cache,
Expand Down Expand Up @@ -443,7 +443,7 @@ def test_get_state_for_event(self):
)

self.assertEqual(is_all, False)
self.assertDictEqual({(e1.type, e1.state_key): e1.event_id}, state_dict)
self.assertDictEqual({}, state_dict)

(state_dict, is_all,) = self.state_datastore._get_state_for_group_using_cache(
self.state_datastore._state_group_members_cache,
Expand Down
32 changes: 28 additions & 4 deletions tests/util/test_dict_cache.py
Original file line number Diff line number Diff line change
Expand Up @@ -20,7 +20,7 @@

class DictCacheTestCase(unittest.TestCase):
def setUp(self):
self.cache = DictionaryCache("foobar")
self.cache = DictionaryCache("foobar", max_entries=10)

def test_simple_cache_hit_full(self):
key = "test_simple_cache_hit_full"
Expand Down Expand Up @@ -76,17 +76,41 @@ def test_multi_insert(self):

seq = self.cache.sequence
test_value_1 = {"test": "test_simple_cache_hit_miss_partial"}
self.cache.update(seq, key, test_value_1, fetched_keys=set("test"))
self.cache.update(seq, key, test_value_1, fetched_keys={"test"})

seq = self.cache.sequence
test_value_2 = {"test2": "test_simple_cache_hit_miss_partial2"}
self.cache.update(seq, key, test_value_2, fetched_keys=set("test2"))
self.cache.update(seq, key, test_value_2, fetched_keys={"test2"})

c = self.cache.get(key)
c = self.cache.get(key, dict_keys=["test", "test2"])
self.assertEqual(
{
"test": "test_simple_cache_hit_miss_partial",
"test2": "test_simple_cache_hit_miss_partial2",
},
c.value,
)
self.assertEqual(c.full, False)

def test_invalidation(self):
    """Check that the full-dict entry and the per-key entries expire
    independently of one another.
    """
    key = "some_key"

    seq = self.cache.sequence
    self.cache.update(seq, key, {"a": "b", "c": "d"})

    # Keep touching the "a" entry while churning the cache with other
    # keys, so every other entry for `key` ages out of the LRU.
    for n in range(20):
        self.cache.get(key, ["a"])
        self.cache.update(seq, f"key{n}", {1: 2})

    # The full dict should have been evicted by now...
    full_result = self.cache.get(key)
    self.assertFalse(full_result.full)
    self.assertTrue("c" not in full_result.value)

    # ... whereas the frequently-queried "a" entry should survive.
    partial_result = self.cache.get(key, dict_keys=["a"])
    self.assertFalse(partial_result.full)
    self.assertEqual(partial_result.value, {"a": "b"})