From 5ba93162cd49d9b1ca7149c502349bfb06833255 Mon Sep 17 00:00:00 2001
From: Giovanni Pizzi
Date: Fri, 7 Jul 2023 23:24:33 +0200
Subject: [PATCH] Various improvements to docs and code (#151)

First, a number of sections have been added to the docs:
- `clean_storage` is now mentioned in the basic usage
- We also add basic examples of the new CompressMode.AUTO functionality
- Maintenance operations are documented and explained
- We discuss the long-term support of data even without this library (fixes #100)
- Links to the original performance tests
- Document the new compression modes
- Explain that nowadays `clean_storage` is safe to use concurrently on all platforms
- Explain how to reclaim space by `repacking` packs after deletion

In addition, some changes to the code have been performed:
- `utf8` is explicitly specified for the config.json file
- `get_hash` is renamed to `get_hash_cls` (and `get_hash` is deprecated)
- Some of these changes address the remaining review comments, and thus fix #101 and fix #3.

Extending the documentation
---
 disk_objectstore/container.py                 |  22 +--
 disk_objectstore/utils.py                     |  22 ++-
 docs/pages/design.md                          | 128 ++++++++++++++++++
 docs/pages/getting_started.md                 |  13 +-
 docs/pages/packing.md                         |  46 ++++++-
 .../validation-calls/performance.txt          |   2 +-
 tests/test_deprecations.py                    |  17 +++
 tests/test_utils.py                           |  12 +-
 8 files changed, 237 insertions(+), 25 deletions(-)
 create mode 100644 tests/test_deprecations.py

diff --git a/disk_objectstore/container.py b/disk_objectstore/container.py
index e069c88..e5e937f 100644
--- a/disk_objectstore/container.py
+++ b/disk_objectstore/container.py
@@ -50,7 +50,7 @@
     compute_hash_and_size,
     detect_where_sorted,
     get_compressobj_instance,
-    get_hash,
+    get_hash_cls,
     get_stream_decompresser,
     is_known_hash,
     merge_sorted,
@@ -273,7 +273,7 @@ def is_initialised(self) -> bool:
         """Return True if the container is already initialised."""
         # If the config file does not exist, the container is not initialised
         try:
-            with open(self._get_config_file()) as fhandle:
+            with open(self._get_config_file(), encoding="utf8") as fhandle:
                 json.load(fhandle)
         except (ValueError, OSError):
             return False
@@ -362,7 +362,7 @@ def init_container(
         get_stream_decompresser(compression_algorithm)
 
         # Create config file
-        with open(self._get_config_file(), "w") as fhandle:
+        with open(self._get_config_file(), "w", encoding="utf8") as fhandle:
             json.dump(
                 {
                     "container_version": 1,  # For future compatibility, this is the version of the format
@@ -392,7 +392,7 @@ def _get_repository_config(self) -> Dict[str, Union[int, str]]:
                 raise NotInitialised(
                     "The container is not initialised yet - use .init_container() first"
                 )
-            with open(self._get_config_file()) as fhandle:
+            with open(self._get_config_file(), encoding="utf8") as fhandle:
                 self._config = json.load(fhandle)
         return self._config
 
@@ -1306,7 +1306,7 @@ def _write_data_to_packfile(
         assert "a" in pack_handle.mode
 
         if hash_type:
-            hasher = get_hash(hash_type=hash_type)()
+            hasher = get_hash_cls(hash_type=hash_type)()
 
         if compress:
             compressobj = self._get_compressobj_instance()
@@ -1342,7 +1342,10 @@ def pack_all_loose(  # pylint: disable=too-many-locals,too-many-branches,too-man
     ) -> None:
         """Pack all loose objects.
 
-        This is a maintenance operation, needs to be done only by one process.
+        This is an operation that can be run at any time, even while the container is being read
+        and written to (only via loose files, though), **BUT it must be done by only one
+        process at any given time**.
+
         :param compress: either a boolean or a `CompressMode` objects. If True (or `CompressMode.YES`),
             compress objects before storing them.
             If False (or `CompressMode.NO`), do not compress objects (the default).
@@ -1972,9 +1975,9 @@ def _vacuum(self) -> None:
     def clean_storage(  # pylint: disable=too-many-branches,too-many-locals
         self, vacuum: bool = False
     ) -> None:
-        """Perform some maintenance clean-up of the container.
+        """Perform some clean-up of the container.
 
-        .. note:: this is a maintenance operation, must be performed when nobody is using the container!
+        .. note:: this is an operation that should be run by only one process at a given time! Do not run two calls concurrently.
 
         In particular:
         - if `vacuum` is True, it first VACUUMs the DB, reclaiming unused space and
@@ -2545,7 +2548,8 @@ def delete_objects(self, hashkeys: List[str]) -> List[Union[str, Any]]:
           - The delete might fail because the loose object is open or reading the object might fail with
             a PermissionError because the object is being deleted (on Windows)
          - On MacOS, there is an unexpected race condition for which when reading the object during concurrent delete,
-            one gets an empty handle instead of either FileNotFoundError or the actual content
+            one gets an empty handle instead of either FileNotFoundError or the actual content (even if this no longer
+            seems to be the case with recent, post-2021 Apple filesystems)
          - Routines might get the list of files before performing operations, and the objects might disappear in the
            meantime
          - Write access to packs is not possible as the DB will be locked (e.g. writing directly to packs, or
diff --git a/disk_objectstore/utils.py b/disk_objectstore/utils.py
index 242a89c..6a0d9c9 100644
--- a/disk_objectstore/utils.py
+++ b/disk_objectstore/utils.py
@@ -11,6 +11,7 @@
 import itertools
 import os
 import uuid
+import warnings
 import zlib
 from contextlib import contextmanager
 from enum import Enum
@@ -1171,13 +1172,13 @@ def read(self, size: int = -1) -> bytes:
 
 def is_known_hash(hash_type: str) -> bool:
     """Return True if the hash_type is known, False otherwise."""
     try:
-        get_hash(hash_type)
+        get_hash_cls(hash_type)
         return True
     except ValueError:
         return False
 
 
-def get_hash(hash_type: str) -> Callable:
+def get_hash_cls(hash_type: str) -> Callable:
     """Return a hash class with an update method and a hexdigest method."""
     known_hashes = {"sha1": hashlib.sha1, "sha256": hashlib.sha256}
 
@@ -1188,6 +1189,17 @@ def get_hash(hash_type: str) -> Callable:
         raise ValueError(f"Unknown or unsupported hash type '{hash_type}'")
 
 
+def get_hash(hash_type: str) -> Callable:
+    """Return a hash class with an update method and a hexdigest method.
+
+    .. deprecated:: Deprecated since 1.0. Use `get_hash_cls` instead.
+    """
+    warnings.warn(
+        "`get_hash` is deprecated, use `get_hash_cls` instead", DeprecationWarning
+    )
+    return get_hash_cls(hash_type)
+
+
 def _compute_hash_for_filename(filename: str, hash_type: str) -> str | None:
     """Return the hash for the given file.
@@ -1199,7 +1211,7 @@ def _compute_hash_for_filename(filename: str, hash_type: str) -> str | None:
     """
     _chunksize = 524288
 
-    hasher = get_hash(hash_type)()
+    hasher = get_hash_cls(hash_type)()
     try:
         with open(filename, "rb") as fhandle:
             while True:
@@ -1230,7 +1242,7 @@ def __init__(self, write_stream: BinaryIO, hash_type: str) -> None:
         assert "b" in self._write_stream.mode
 
         self._hash_type = hash_type
-        self._hash = get_hash(self._hash_type)()
+        self._hash = get_hash_cls(self._hash_type)()
         self._position = self._write_stream.tell()
 
     @property
@@ -1378,7 +1390,7 @@ def compute_hash_and_size(
     :return: a tuple with ``(hash, size)`` where ``hash`` is the hexdigest and ``size`` is the size in bytes
     """
     _hash_chunksize = 524288
-    hasher = get_hash(hash_type)()
+    hasher = get_hash_cls(hash_type)()
 
     # Read and hash all content
     size = 0
diff --git a/docs/pages/design.md b/docs/pages/design.md
index 82a070e..9b24e40 100644
--- a/docs/pages/design.md
+++ b/docs/pages/design.md
@@ -194,3 +194,131 @@ In addition, the following design choices have been made:
   Therefore, caches are per blocks/pages in linux, not per file.
   Concatenating files does not impact performance on cache efficiency.
+
+## Long-term support of the data
+One of the goals of the `disk-objectstore` library is to adopt a relatively simple
+packing mechanism, so that even if this library were to go unmaintained, it would always be
+possible to retrieve the data (the objects) with standard tools.
+
+Here we discuss a simple bash script that allows you to retrieve any object from a pack using
+relatively standard tools (note: if the object is loose, this is very simple: it is just a file in the
+`loose` subfolder, whose filename equals its hash, apart from sharding).
+
+```bash
+#!/bin/bash
+CONTAINER_PATH="$1"
+HASHKEY="$2"
+
+METADATA=`sqlite3 "$CONTAINER_PATH"/packs.idx 'SELECT offset, length, pack_id, compressed FROM db_object WHERE hashkey = "'"$HASHKEY"'"'`
+
+if [ -z "$METADATA" ]
+then
+  echo "No object '" $HASHKEY "' found in container."
+  exit 1
+fi
+
+IFS='|' read -ra METADATA_ARR <<< "$METADATA"
+OFFSET=${METADATA_ARR[0]}
+LENGTH=${METADATA_ARR[1]}
+PACK_ID=${METADATA_ARR[2]}
+COMPRESSED=${METADATA_ARR[3]}
+
+let OFFSET_PLUS_ONE=OFFSET+1
+
+if [ "$COMPRESSED" == "0" ]
+then
+  tail -c+$OFFSET_PLUS_ONE "${CONTAINER_PATH}/packs/${PACK_ID}" | head -c"${LENGTH}"
+elif [ "$COMPRESSED" == "1" ]
+then
+  tail -c+${OFFSET_PLUS_ONE} "${CONTAINER_PATH}/packs/${PACK_ID}" | head -c"${LENGTH}" | zlib-flate -uncompress
+else
+  echo "Unknown compression mode "$COMPRESSED" for object '" $HASHKEY "'"
+  exit 2
+fi
+```
+This script takes two parameters: the first is the path to the container, and the second is the hashkey of the
+object we want to retrieve.
+
+The requirements for this script to run are:
+
+- a [bash shell](https://www.gnu.org/software/bash/), typically available (often by default) on Mac and Unix, and installable
+  on Windows.
+- the [sqlite3 executable](https://www.sqlite.org/index.html): this is typically easy to install in most operating
+  systems and distributions. We also highlight that SQLite makes a strong commitment to long-term support of its
+  format for decades, as documented in the [SQLite long-term support page](https://www.sqlite.org/lts.html).
+- the `zlib-flate` executable: this comes from the `qpdf` package, and it is easily installable (e.g. on Ubuntu with
+  the command `apt install qpdf`, or on Mac with [HomeBrew](https://brew.sh) using `brew install qpdf`).
+  We note that this command is actually needed only if the data is zlib-compressed (as a note, one cannot simply use
+  the `gzip` command, as it also expects the gzip headers, which are redundant and not used by the
+  disk-objectstore implementation).
+
+In addition, we highlight that both `zlib` and `sqlite3` are part of the standard Python library,
+therefore one can very easily replace those calls with appropriately written short Python scripts (e.g. calling
+`zlib.decompressobj`).
+
+## Performance
+When this library was first implemented, many performance tests were run. These are collected in the folder
+`performance-benchmarks` of the main [GitHub repository](https://github.com/aiidateam/disk-objectstore) of the
+`disk-objectstore` package.
+
+They are organized in appropriately named folders, with text files (README.txt or similar) discussing the results
+of that specific performance test. Feel free to navigate that folder if you are curious about the tests that have been
+performed.
+
+In case you are curious, you can also read
+[issue #16 of the disk-objectstore repository](https://github.com/aiidateam/disk-objectstore/issues/16)
+to get more details on the (significant) improvement in backup time when transferring (e.g. via rsync) the whole
+container, compared to just storing each object as a single file, when you have millions of objects.
+The same issue also discusses the space saving (thanks to deduplication) for a real DB, as well as the cost of
+keeping the index (SQLite DB).
+
+
+## Concurrent usage of the disk-objectstore by multiple processes
+The main goal of `disk-objectstore` is to store objects efficiently, without a server running, with the
+additional requirement of allowing any number of concurrent **readers** (of any object, loose or packed: the reader
+should not need to know) and **writers** (as long as they only write loose objects).
+This essentially allows the library to be used with any number of "clients".
+
+In addition, a number of performance advantages are obtained only once objects are packed.
+Therefore, another requirement is that some basic packing functionality can be performed while multiple clients are
+reading and writing. Specific tests also stress-test that this is indeed the case, on the various platforms (Windows,
+Mac and Unix) supported by the library.
+
+However, packing **MUST ONLY BE PERFORMED BY ONE PROCESS AT A TIME** (i.e., it is invalid to call the packing methods
+from more than one process at the same time).
+
+Therefore, **the concurrent functionality that is supported includes**:
+
+- **any number of readers**, from any object
+- **any number of writers**, to loose objects (no direct write to packs)
+- **one single process to pack loose objects**, which can call the two methods `pack_all_loose()` and `clean_storage()`.
+
+What is **NOT** allowed is the following:
+- one **MUST NOT** run two or more packing operations at the same time.
+
+In addition, a number of operations are considered **maintenance operations**. You should only need to run them
+rarely, to optimize performance. **Only one maintenance operation can run at a given time**, and in addition
+**no other process can access (read or write) the container while a maintenance operation is running**.
+
+This means that, before running a maintenance operation, **you should really stop any process using the container**,
+run the maintenance operation, and then resume normal operation.
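+
+As a concrete illustration, here is a minimal sketch of the always-safe concurrent pattern described above (this
+snippet is only illustrative: it assumes an already-initialised container at the hypothetical path
+`/path/to/container`):
+
+```python
+from disk_objectstore import Container, CompressMode
+
+# In any number of "client" processes: writing and reading loose objects is always safe
+container = Container("/path/to/container")
+hashkey = container.add_object(b"some content")   # stored as a loose object
+content = container.get_object_content(hashkey)   # readers need not know if it is loose or packed
+
+# In ONE (and only one) process at a time: pack the loose objects and clean up
+packer = Container("/path/to/container")
+packer.pack_all_loose(compress=CompressMode.AUTO)
+packer.clean_storage()  # removes the now-redundant loose files
+```
+
+Anything beyond this (deleting objects, writing directly to packs, repacking, ...) falls in the category of
+maintenance operations, discussed below.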
+
+Maintenance operations include:
+- deleting objects with `delete_objects()`
+- adding objects directly to a pack with `add_streamed_objects_to_pack()` (or similar functions such as
+  `add_objects_to_pack()`)
+- repacking (to change compression, reclaim unused disk space in the pack files, ...) with `repack()` (and similarly
+  for `repack_pack` to repack a single pack file).
+
+A note: while one could implement guards (e.g. a decorator `@maintenance` for the relevant methods) to prevent
+concurrent access (see e.g. [issue #6](https://github.com/aiidateam/disk-objectstore/issues/6)), this is complex to
+implement: mostly because one needs to record every time a process starts accessing the repository (so that a
+maintenance operation can refuse to start), as well as check whether any maintenance operation is running at every
+first access to a container (and refuse to start using it).
+While not impossible, this is not easy (one would also need to clean up properly if the computer reboots unexpectedly
+while a maintenance operation is running, etc.) and it might have performance implications, as a check has to be
+performed for every new operation on a Container.
+
+We note, however, that such logic was [implemented in AiiDA](https://github.com/aiidateam/aiida-core/pull/5270)
+(which uses `disk-objectstore` as a backend). Guards are therefore in place there, and if one needs to do the same
+from different code, inspiration can be taken from the AiiDA implementation.
diff --git a/docs/pages/getting_started.md b/docs/pages/getting_started.md
index 0b6f498..d8d938a 100644
--- a/docs/pages/getting_started.md
+++ b/docs/pages/getting_started.md
@@ -7,7 +7,7 @@ Here we show a minimal example of how to use the API of the `disk-objectstore` t
 Let us run a quick demo of how to store and retrieve objects in a container:
 
 ```python
-from disk_objectstore import Container
+from disk_objectstore import Container, CompressMode
 
 # Let's create a new container in the local folder `temp_container`, and initialise it
 container = Container("temp_container")
@@ -36,8 +36,15 @@ assert hash1bis == hash1
 
 # Let's pack all objects: instead of having a lot of files, one per object, all objects
 # are written in a few big files (great for performance, e.g. when using rsync)
-# internally a SQLite database is used to know where each object is in the pack files
-container.pack_all_loose()
+# internally a SQLite database is used to know where each object is in the pack files.
+# In addition, we can also ask to compress files. `CompressMode.AUTO` will perform
+# fast heuristics to decide, object by object, if it's worth compressing the object or not.
+container.pack_all_loose(compress=CompressMode.AUTO)
+
+# The previous operation puts loose objects into packs, but by default does not delete
+# the loose objects. This function removes them (as they are not used anymore), and possibly
+# performs some additional cleanup.
+container.clean_storage()
 
 # After packing, everthing works as before
 container.get_object_content(hash2)
diff --git a/docs/pages/packing.md b/docs/pages/packing.md
index 31efcad..8f1b827 100644
--- a/docs/pages/packing.md
+++ b/docs/pages/packing.md
@@ -65,8 +65,34 @@ container.add_streamed_objects_to_pack([stream1, stream2])
 which has the same output as before. Note that this is just to demonstrate the interface: the `BytesIO` object will
 store the whole data in memory!
 
+## Compression
+While, for efficiency reasons, loose objects are always uncompressed, each object in a pack can (optionally) be
+compressed.
+
+We also define a `disk_objectstore.utils.CompressMode` enum that provides all supported modes:
+
+- `CompressMode.NO`: never compress objects
+- `CompressMode.YES`: always compress all objects
+- `CompressMode.KEEP`: keep the existing/previous compression mode (this is the most efficient mode, as
+  it does not require e.g. decompressing and then recompressing an object when repacking)
+- `CompressMode.AUTO`: (typically recommended) run relatively inexpensive heuristics to decide if compression
+  would provide a significant space benefit, and compress only in this case.
+
+Functions that manage packs, such as `pack_all_loose` or `repack`, provide options to decide whether or not to
+compress the objects.
+Some functions (e.g. `pack_all_loose`) also simply accept a boolean (in addition to a `CompressMode` object) for the
+`compress` option (where `True` is equivalent to `CompressMode.YES` and `False` to `CompressMode.NO`).
+
+Note that the functions that write directly to packs only support a boolean, and not e.g. the `CompressMode.AUTO` mode,
+as this would require reading the stream and possibly rewriting it twice with a different compression mode.
+In this case, the suggested approach is to write with `compress=False`, and then call `repack` at the end with
+`compress_mode=CompressMode.AUTO`: this is also good because writing directly to packs (which, by the way, should be
+done while no other maintenance operations are running) might "waste" space if the same object is written multiple
+times, and `repack` will allow you to reclaim that space.
+
 ## Reclaiming space
+### Loose objects
 To avoid race conditions, while packing the corresponding loose files are not deleted.
 In order to reclaim that space, after making sure that no process is still accessing the loose objects, one can do
@@ -81,7 +107,25 @@ Note: Technically processes can still continue using the container during this o
 - on Windows, the operation should be callable at any time; if a loose object is open, it will not be deleted. A future
   `clean_storage` call will delete it once it's not used anymore.
 - on Mac OS X, it is better not to call it while processes are still accessing the file, because there are race
-  conditions under which the file might be read as empty. If the file is already open, the same notes as Linux apply.
+  conditions under which the file might be read as empty. However, this was observed on older Apple filesystems,
+  and recent (post-2021) machines do not seem to show this bug anymore.
+  If the file is already open, the same notes as Linux apply.
   However, once objects are packed, new implementations will prefer the packed version and open that one.
   So, it is OK to call the `clean_storage`. However, one should be careful with concurrently continuing to write
   loose objects and accessing them for the aforementioned race condition.
+
+So, in summary, on recent machines it should be safe to call `clean_storage()` on an active repository, i.e. one where
+data is being concurrently written to (in loose form) or read from.
+
+An additional note is that, when accessing a compressed object in a pack in a non-linear fashion, a loose object
+will be recreated as a "cache version", to enable fast random seeks in the file. Therefore, `clean_storage` is not
+called by default when calling `pack_all_loose`, in case the user wants to avoid cleaning this cache.
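+
+For reference, the full sequence to reclaim the space taken by loose objects is sketched below (this is only an
+illustration: it assumes an already-initialised container at the hypothetical path `/path/to/container`, and the
+`vacuum=True` flag is optional):
+
+```python
+from disk_objectstore import Container, CompressMode
+
+container = Container("/path/to/container")
+# First move all loose objects into packs (compression decided object by object)...
+container.pack_all_loose(compress=CompressMode.AUTO)
+# ...then delete the now-redundant loose files; vacuum=True also VACUUMs the SQLite index
+container.clean_storage(vacuum=True)
+```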
+
+### Pack files
+In addition, if a packed object is deleted from the container, only the reference to the object is removed
+from the SQLite index file; the pack file itself is untouched, so no space is saved.
+This allows the `delete` operation to be fast and to be run while other operations are running (otherwise, one
+would need to fully rewrite at least the part of the pack that follows the deleted object).
+
+Therefore, to reclaim this space, you should run `container.repack()`, which will recreate each pack, removing any
+unused portions of the file.
diff --git a/performance-benchmarks/validation-calls/performance.txt b/performance-benchmarks/validation-calls/performance.txt
index bd32bc6..4dd5681 100644
--- a/performance-benchmarks/validation-calls/performance.txt
+++ b/performance-benchmarks/validation-calls/performance.txt
@@ -136,7 +136,7 @@ print(total, cnt, time.time() - t)
 
 def _validate_hashkeys_pack(self, pack_id, callback=None):
     _CHUNKSIZE = 524288
-    hash_class = get_hash(self.hash_type)
+    hash_class = get_hash_cls(self.hash_type)
 
     total = self._get_cached_session().query(Obj).filter(Obj.pack_id==pack_id).count()
     if callback:
diff --git a/tests/test_deprecations.py b/tests/test_deprecations.py
new file mode 100644
index 0000000..17b746d
--- /dev/null
+++ b/tests/test_deprecations.py
@@ -0,0 +1,17 @@
+"""Module to test deprecated functionality.
+
+Both to check that they still work, and to ensure full coverage."""
+import pytest
+
+from disk_objectstore import utils
+
+
+def test_get_hash():
+    """Test the deprecated `get_hash` function."""
+    hash_type = "sha256"
+    content = b"523453dfvsd"
+    with pytest.warns(DeprecationWarning, match="get_hash_cls"):
+        hasher = utils.get_hash(hash_type=hash_type)()
+    hasher.update(content)
+    hashkey = hasher.hexdigest()
+    assert hashkey == "11c4da82bc95154d2a3116e66c3d49568e4fd0f7184d44a9d611f2749539b7f6"
diff --git a/tests/test_utils.py b/tests/test_utils.py
index 71c0f2c..2d1b880 100644
--- a/tests/test_utils.py
+++ b/tests/test_utils.py
@@ -350,7 +350,7 @@ def test_object_writer_existing_locked(  # pylint: disable=invalid-name
     os.mkdir(duplicates_folder)
 
     content = b"523453dfvsd"
-    hasher = utils.get_hash(hash_type=hash_type)()
+    hasher = utils.get_hash_cls(hash_type=hash_type)()
     hasher.update(content)
     hashkey = hasher.hexdigest()
 
@@ -435,7 +435,7 @@ def test_object_writer_appears_concurrently(  # pylint: disable=invalid-name,too
     os.mkdir(duplicates_folder)
 
     content = b"523453dfvsd"
-    hasher = utils.get_hash(hash_type=hash_type)()
+    hasher = utils.get_hash_cls(hash_type=hash_type)()
     hasher.update(content)
     hashkey = hasher.hexdigest()
 
@@ -558,7 +558,7 @@ def test_object_writer_existing_corrupted_reappears(  # pylint: disable=invalid-
    os.mkdir(duplicates_folder)
 
     content = b"523453dfvsd"
-    hasher = utils.get_hash(hash_type=hash_type)()
+    hasher = utils.get_hash_cls(hash_type=hash_type)()
     hasher.update(content)
     hashkey = hasher.hexdigest()
 
@@ -700,7 +700,7 @@ def test_object_writer_deleted_while_checking_content(  # pylint: disable=invali
     os.mkdir(duplicates_folder)
 
     content = b"523453dfvsd"
-    hasher = utils.get_hash(hash_type=hash_type)()
+    hasher = utils.get_hash_cls(hash_type=hash_type)()
     hasher.update(content)
     hashkey = hasher.hexdigest()
 
@@ -784,7 +784,7 @@ def test_object_writer_existing_OK(
     os.mkdir(duplicates_folder)
 
     content = b"523453dfvsd"
-    hasher = utils.get_hash(hash_type=hash_type)()
+    hasher = utils.get_hash_cls(hash_type=hash_type)()
     hasher.update(content)
     hashkey = hasher.hexdigest()
 
@@ -842,7 +842,7 @@ def test_object_writer_existing_corrupted(
     os.mkdir(duplicates_folder)
 
     content = b"523453dfvsd"
-    hasher = utils.get_hash(hash_type=hash_type)()
+    hasher = utils.get_hash_cls(hash_type=hash_type)()
     hasher.update(content)
     hashkey = hasher.hexdigest()