From 41e0ba38d862388f3e6f9a12a5f331ae09047908 Mon Sep 17 00:00:00 2001
From: bwoodsend <bwoodsend@gmail.com>
Date: Sat, 4 Sep 2021 11:03:59 +0100
Subject: [PATCH] Add "table almost full" handling.

Almost full hash tables become very slow. Add detection of and callbacks for
tables which are almost full. A table can be configured to:

* Issue a warning.
* Raise an error.
* Do nothing.
* Resize the table.

Issuing the warning is the default to try to make users aware of the rather
unusual requirement to *make your tables a bit bigger than they need to be*.
---
 README.rst               |  33 +++++++++-
 hirola/_hash_table.py    | 127 +++++++++++++++++++++++++++++++++++++--
 hirola/exceptions.py     |   5 ++
 hirola/hash_table.c      |  15 +++--
 hirola/hash_table.h      |   1 +
 tests/__init__.py        |  26 ++++++++
 tests/test_hash_table.py |  98 +++++++++++++++++++++++++++++-
 7 files changed, 294 insertions(+), 11 deletions(-)
diff --git a/README.rst b/README.rst
index 228eeb7..84958be 100644
--- a/README.rst
+++ b/README.rst
@@ -131,7 +131,8 @@ Choosing a ``max`` size
 .......................
 
 Unlike Python's ``set`` and ``dict``, ``Hirola`` does not manage its size
-automatically.
+automatically by default
+(although `it can be reconfigured to <automatic-resize_>`_).
 To prevent wasted resizing (which is what Python does under the hood),
 you have full control of and responsibility for how much space the table uses.
 Obviously the table has to be large enough to fit all the keys in it.
@@ -200,6 +201,36 @@ similarly to ``numpy.unique(..., return_args=True)``.
 Lookup the indices of points without adding them using ``table.get()``.
 
 
+.. _automatic-resize:
+
+Handling of nearly full hash tables
+...................................
+
+``HashTable``\ s become very slow when almost full.
+As of v0.3.0, an efficiency warning will notify you if a table exceeds 90% full.
+This warning can be reconfigured into an error, silenced or set to resize the
+table automatically to make room.
+These are demonstrated in the example constructors below:
+
+.. code-block:: python
+
+    # The default: Issue a warning when the table is 90% full.
+    HashTable(..., almost_full=(0.9, "warn"))
+
+    # Disable all "almost full" behaviours.
+    HashTable(..., almost_full=None)
+
+    # To consider a table exceeding 80% full as an error use:
+    HashTable(..., almost_full=(0.8, "raise"))
+
+    # To automatically triple in size whenever the table exceeds 80% full use:
+    HashTable(..., almost_full=(0.8, 3.0))
+
+Resizing tables is slow which is why it's not enabled by default.
+It should be avoided unless you really have no idea how big your table will need
+to be.
+
+
 Recipes
 *******
 
diff --git a/hirola/_hash_table.py b/hirola/_hash_table.py
index fabc195..cb49b3f 100644
--- a/hirola/_hash_table.py
+++ b/hirola/_hash_table.py
@@ -4,6 +4,7 @@
 
 import numbers
 import ctypes
+import math
 
 from numbers import Number
 from typing import Union, Tuple
@@ -34,7 +35,8 @@ class HashTable(object):
     _keys: np.ndarray
     _NO_DEFAULT = object()
 
-    def __init__(self, max: Number, dtype: dtype_types):
+    def __init__(self, max: Number, dtype: dtype_types,
+                 almost_full=(.9, "warn")):
         """
 
         Args:
@@ -44,6 +46,9 @@ def __init__(self, max: Number, dtype: dtype_types):
             dtype:
                 The data type for the table's keys. Sets the `dtype`
                 attribute.
+            almost_full:
+                The handling of almost full hash tables. Sets the `almost_full`
+                attribute.
 
         The **max** parameter is silently normalised to `int` and clipped
         to a minimum of 1 if it is less than 1.
@@ -74,6 +79,7 @@ def __init__(self, max: Number, dtype: dtype_types):
         self._raw = slug.dll.HashTable(max, key_size, ptr(self._hash_owners),
                                        ptr(self._keys_readonly),
                                        hash=ctypes.cast(hash, ctypes.c_void_p))
+        self.almost_full = almost_full
 
     @property
     def max(self) -> int:
@@ -151,21 +157,132 @@ def add(self, keys) -> np.ndarray:
                 If there is no space to place new keys.
             exceptions.HashTableDestroyed:
                 If the `destroy` method has been previously called.
+            exceptions.AlmostFull:
+                If the table becomes nearly full and is configured to raise an
+                error (set by the `almost_full` attribute).
+        Warns:
+            exceptions.AlmostFull:
+                If the table becomes nearly full and is configured to warn (set
+                by the `almost_full` attribute).
 
         """
         self._check_destroyed()
 
         keys, shape = self._norm_input_keys(keys)
         out = np.empty(shape, np.intp)
-        index = slug.dll.HT_adds(self._raw._ptr, ptr(keys), ptr(out), out.size)
-        if index != -1:
+
+        # This while loop will only iterate a second time if the "almost full"
+        # threshold is enabled and crossed. It will only iterate more than twice
+        # if `self.almost_full` is set to automatically upsize the table.
+        index = -1
+        while True:
+            index = slug.dll.HT_adds(self._raw._ptr, ptr(keys), ptr(out),
+                                     out.size, index + 1)
+
+            # If everything worked. Return the indices.
+            if index == out.size:
+                return out if shape else out.item()
+
+            # If the `almost_full` threshold has been crossed:
+            if index < 0:
+                # Convert to a real positive index.
+                index = -1 - index
+                # Issue a warning or raise an error or resize the table as per
+                # the user's configuration.
+                self._on_almost_full()
+                continue
+
+            # We're out of space. Raise an error.
             from hirola.exceptions import HashTableFullError
             source, value = self._blame_key(index, keys, shape)
             raise HashTableFullError(
                 f"Failed to add {source} = {value} to the "
                 f"hash table because the table is full and {value} "
                 f"isn't already in it.")
-        return out if shape else out.item()
+
+    @property
+    def almost_full(self):
+        """The response to an almost full hash table. Hash tables become
+        dramatically slower, the closer they get to being full. Hirola's default
+        behaviour is to warn if this happens but can be configured to ignore the
+        warning, raise an error or automatically make a new, larger table.
+
+        This is an overloaded parameter.
+
+        * :py:`almost_full = None`:
+            Disable the *almost full* warning entirely.
+        * :py:`almost_full = (0.8, "warn")`:
+            Issue a `hirola.exceptions.AlmostFull` warning if the table reaches
+            80% full.
+        * :py:`almost_full = (0.7, "raise")`:
+            Raise a `hirola.exceptions.AlmostFull` exception if the table
+            reaches 70% full.
+        * :py:`almost_full = (0.7, 2)`:
+            Whenever the table reaches 70% full, double the table size.
+
+        For reference, Python's `dict` grows 8-fold when two thirds full.
+        To mimic this behaviour, set :py:`table.almost_full = (2 / 3, 8)`.
+
+        """
+        return self._almost_full
+
+    @almost_full.setter
+    def almost_full(self, x):
+        # Aside from simply storing the user's value, this setter must also:
+        # * Calculate the "panic table length" (self._raw.panic_at) at which the
+        #   C code should notify Python that the table is almost full.
+        # * Ensure that the user input is valid.
+
+        if x is None:
+            self._raw.panic_at = -1
+            self._almost_full = None
+            return
+
+        try:
+            ratio, scale_up = x
+        except (TypeError, ValueError):
+            raise TypeError(f"`almost_full` must be a 2-tuple of floats or"
+                            f" None. Not `{repr(x)}`.") from None
+        if not (0 < ratio <= 1):
+            raise ValueError("The first parameter to almost_full must be "
+                             ">0 and <=1.")
+        if isinstance(scale_up, str):
+            if scale_up not in ("raise", "warn"):
+                raise ValueError("Valid near-full actions are 'raise' and "
+                                 f"'warn'. Not '{scale_up}'.")
+        elif isinstance(scale_up, numbers.Number):
+            if int(self.max * scale_up) <= self.max:
+                raise ValueError(
+                    f"A scale_up resize factor of {scale_up} would lead to an "
+                    f"infinite loop. Either increase scale_up or disable "
+                    f"automatic resizing by setting hash_table.almost_full to "
+                    f"None.")
+        else:
+            raise TypeError("The second parameter to almost_full must be "
+                            "either a string or a float.")
+
+        self._raw.panic_at = int(math.ceil(ratio * self.max))
+        self._almost_full = x
+
+    def _on_almost_full(self):
+        """The callback to be invoked whenever the table becomes almost full."""
+        assert self.almost_full is not None
+
+        if not isinstance(self.almost_full[1], str):
+            self.resize(self.max * self.almost_full[1], in_place=True)
+            return
+
+        from hirola.exceptions import AlmostFull
+
+        message = f"HashTable() is {round(100 * len(self) / self.max)}% full." \
+                  " A hash table becomes orders of magnitudes slower " \
+                  "when nearly full. See help(HashTable.almost_full) for how " \
+                  "to correct or silence this issue."
+
+        if self.almost_full[1] == "raise":
+            raise AlmostFull(message)
+        import warnings
+        warnings.warn(AlmostFull(message), stacklevel=3)
 
     def contains(self, keys) -> Union[bool, np.ndarray]:
         """Check if a key or keys are in the table.
@@ -367,7 +484,7 @@ def copy(self, usable=True) -> 'HashTable':
             Another `HashTable` with the same size, dtype and content.
 
         """
-        out = type(self)(self.max, self.dtype)
+        out = type(self)(self.max, self.dtype, self.almost_full)
         if self._destroyed and usable:
             out.add(self.keys)
         else:
diff --git a/hirola/exceptions.py b/hirola/exceptions.py
index eca90db..de2f376 100644
--- a/hirola/exceptions.py
+++ b/hirola/exceptions.py
@@ -27,3 +27,8 @@ class HashTableDestroyed(HirolaException):
     def __str__(self):
         return "This table has been destroyed by HashTable.destroy() and can " \
                "no longer be used."
+
+
+class AlmostFull(HirolaException, Warning):
+    """To be raised or warned if the near-fullness of a hash table can lead to
+    poor performance."""
diff --git a/hirola/hash_table.c b/hirola/hash_table.c
index 4ecc3e7..ff09347 100644
--- a/hirola/hash_table.c
+++ b/hirola/hash_table.c
@@ -135,14 +135,21 @@ ptrdiff_t HT_get(HashTable * self, void * key) {
    computation. */
 
 
-ptrdiff_t HT_adds(HashTable * self, void * keys, ptrdiff_t * out, size_t len) {
-  for (size_t i = 0; i < len; i++) {
+ptrdiff_t HT_adds(HashTable * self, void * keys, ptrdiff_t * out, ptrdiff_t len,
+                  size_t start) {
+  for (ptrdiff_t i = start; i < len; i++) {
     out[i] = HT_add(self, keys + (i * self->key_size));
-    if (out[i] == -1)
+    if (out[i] == -1) {
       // Out of space - abort.
       return i;
+    }
+    if (out[i] == self->panic_at - 1)
+      // Nearly out of space. Pass control back to Python so that it can decide
+      // what to do. A negative return value signals to Python that the table
+      // is nearly out of space rather than completely out of space.
+      return -i - 1;
   }
-  return -1;
+  return len;
 }
 
 
diff --git a/hirola/hash_table.h b/hirola/hash_table.h
index dbf54fd..0f1d6f6 100644
--- a/hirola/hash_table.h
+++ b/hirola/hash_table.h
@@ -19,6 +19,7 @@ typedef struct HashTable {
   void * const keys;
   size_t length;
   Hash hash;
+  ptrdiff_t panic_at;
 } HashTable;
 
 
diff --git a/tests/__init__.py b/tests/__init__.py
index f7dbcd9..1c1f3c9 100644
--- a/tests/__init__.py
+++ b/tests/__init__.py
@@ -1,8 +1,14 @@
 # -*- coding: utf-8 -*-
 """
 """
+import functools
+import warnings
+from contextlib import contextmanager
+
 import numpy as np
 
+from hirola import exceptions
+
 
 def random_ids(max, count, at_least_once=True, sort=False):
     if at_least_once:
@@ -16,3 +22,23 @@ def random_ids(max, count, at_least_once=True, sort=False):
     else:
         np.random.shuffle(out)
     return out
+
+
+def ignore_almost_full_warnings(test):
+    """Decorate a test to disable exceptions.AlmostFull warnings."""
+
+    @functools.wraps(test)
+    def wrapped(*args, **kwargs):
+        with warnings.catch_warnings():
+            warnings.filterwarnings("ignore", category=exceptions.AlmostFull)
+            return test(*args, **kwargs)
+
+    return wrapped
+
+
+@contextmanager
+def warnings_as_errors():
+    """A context manager which treats all warnings as errors."""
+    with warnings.catch_warnings():
+        warnings.filterwarnings("error")
+        yield
diff --git a/tests/test_hash_table.py b/tests/test_hash_table.py
index efeaac5..dbadf8c 100644
--- a/tests/test_hash_table.py
+++ b/tests/test_hash_table.py
@@ -13,7 +13,7 @@
 from hirola import HashTable, exceptions
 from hirola._hash_table import slug
 
-from tests import random_ids
+from tests import random_ids, ignore_almost_full_warnings, warnings_as_errors
 
 DATA = np.arange(120, dtype=np.int8).data
 DTYPES = [
@@ -41,6 +41,7 @@ def test_hash():
     assert slug.dll.hash(ptr(x), 12) == out
 
 
+@ignore_almost_full_warnings
 def test_walk_through():
     data = np.array([100, 101, 100, 103, 104, 105, 103, 107], dtype=np.float32)
     self = HashTable(5, dtype=data.dtype)
@@ -303,6 +304,7 @@ def test_destroy():
         self.get(.5)
 
 
+@ignore_almost_full_warnings
 def test_resize():
     self = HashTable(5, int)
     self.add([4, 3, 2, 9])
@@ -324,6 +326,100 @@ def test_resize():
     assert self.add(11) == 5
 
 
+def test_almost_full():
+    """Test table.almost_full set to warn, ignore or raise errors when the hash
+    table's fullness crosses a given threshold."""
+
+    # Test the default - to warn at 90% full.
+    self = HashTable(5, int)
+    assert self.almost_full == (.9, "warn")
+    # The internal integer threshold should always round up.
+    assert self._raw.panic_at == 5
+    # Nothing should happen until we hit the 5th key.
+    with warnings_as_errors():
+        self.add(range(4))
+    with pytest.warns(exceptions.AlmostFull, match="is 100% full"):
+        self.add(4)
+
+    # Test raising an error when nearly full.
+    self = HashTable(10, int, almost_full=(.45, "raise"))
+    assert self.almost_full == (.45, "raise")
+    with pytest.raises(exceptions.AlmostFull, match="is 50% full"):
+        self.add(range(8))
+    assert len(self) == 5
+
+    self.almost_full = .7, "warn"
+    with pytest.warns(exceptions.AlmostFull, match="is 70% full"):
+        self.add(range(10))
+    assert len(self) == 10
+
+
+def test_disabled_almost_full():
+    """Verify that no almost full warnings are produced if disabled."""
+    self = HashTable(10, int, almost_full=None)
+
+    with warnings_as_errors():
+        self.add(range(10))
+
+
+def test_almost_full_input_guards():
+    """Test the various exceptions raised by the input validators of
+    HashTable.almost_full's setter."""
+    with pytest.raises(ValueError, match=".* first .* be >0 and <=1"):
+        HashTable(10, int, almost_full=(2, "warn"))
+    with pytest.raises(ValueError, match=".* first .* >0 and <=1"):
+        HashTable(10, int, almost_full=(0, "warn"))
+    with pytest.raises(ValueError,
+                       match="Valid near-full actions are .* Not 'bean'."):
+        HashTable(10, int, almost_full=(.6, "bean"))
+    with pytest.raises(TypeError, match=".* second parameter to almost_full"):
+        HashTable(10, int, almost_full=(.6, range))
+    with pytest.raises(TypeError):
+        HashTable(10, int, almost_full="hello")
+
+
+def test_infinite_resizing_check():
+    """If automatic resizing is enabled but the resize factor is <= 1 or so
+    close to 1 that ``int(table.max * resize_factor)`` truncates to
+    ``table.max`` then we could end up in an infinite loop of no-op resizes.
+    Verify that the HashTable.almost_full setter blocks such resize factors.
+    """
+    self = HashTable(10, int, almost_full=(1, 1.1))
+    assert self.add(range(20)).tolist() == list(range(20))
+
+    with pytest.raises(ValueError, match="resize factor of 1.09 would"):
+        HashTable(10, int, almost_full=(1, 1.09))
+
+    HashTable(100, int, almost_full=(1, 1.01))
+    with pytest.raises(ValueError, match="resize factor of 1.01 would"):
+        HashTable(99, int, almost_full=(1, 1.01))
+
+    with pytest.raises(ValueError, match="resize factor"):
+        HashTable(10, int, almost_full=(1, .8))
+    with pytest.raises(ValueError, match="resize factor"):
+        HashTable(10, int, almost_full=(1, -1))
+
+
+def test_automatic_resize():
+    """Test setting self.almost_full to automatically resize the hash table."""
+    # Upsize by x1.5 when 60% full.
+    self = HashTable(10, int, almost_full=(.6, 1.5))
+
+    # 5 out of 10 is less than 60%. Nothing should have changed.
+    self.add(range(5))
+    assert self.max == 10
+
+    # Adding one extra value brings us up to 60% which should trigger a resize.
+    self.add(5)
+    assert self.max == 15
+    # No information should have been lost in the resize.
+    assert np.array_equal(self.keys, np.arange(6))
+
+    # Adding loads of new keys should call resize as many times as needed.
+    self.add(np.arange(30))
+    assert self.max == 73
+
+
 def test_copy():
     self = HashTable(10, int)
     self.add(range(3, 8))