👌 IMPROVE: Repository.hash() speed (#4923)
The `Repository.hash` method was recursively hashing the content of every
file it contains, which is slow. For the disk object store this is
redundant, since the content is already hashed when it is stored and that
hash is used as the `key`. In that case we can skip the rehash and simply
return the `key`. Note that this shortcut introduces a dependency on the
container actually using SHA-256, which is the hash that should be
consistent across all repository implementations in AiiDA, but the hash
type of the container is configurable. A check is therefore added so that
the `key` is only returned directly when the container is configured to
use SHA-256; otherwise the content is rehashed as before.
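A minimal, self-contained sketch of the content-addressed idea the shortcut relies on (plain Python, not the AiiDA API; all names here are illustrative): when objects are stored under their SHA-256 digest, returning the key is equivalent to rehashing the content.

import hashlib

def store(container: dict, content: bytes) -> str:
    """Store content under its SHA-256 hex digest (content-addressed storage)."""
    key = hashlib.sha256(content).hexdigest()
    container[key] = content
    return key

def get_object_hash(container: dict, key: str, hash_type: str = 'sha256') -> str:
    """Return the SHA-256 of a stored object, reusing the key when possible."""
    if hash_type == 'sha256':
        return key  # the key already is the SHA-256 digest, no need to re-read the content
    return hashlib.sha256(container[key]).hexdigest()  # fall back to rehashing

objects: dict = {}
key = store(objects, b'content')
assert get_object_hash(objects, key) == hashlib.sha256(b'content').hexdigest()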
chrisjsewell authored May 7, 2021
1 parent 19d7d8e commit 597a4d0
Showing 8 changed files with 96 additions and 4 deletions.
26 changes: 26 additions & 0 deletions aiida/common/hashing.py
@@ -13,6 +13,7 @@
import numbers
import random
import time
import typing
import uuid
from collections import abc, OrderedDict
from functools import singledispatch
@@ -82,6 +83,31 @@ def get_random_string(length=12, allowed_chars='abcdefghijklmnopqrstuvwxyzABCDEF
}


def chunked_file_hash(
handle: typing.BinaryIO, hash_cls: typing.Any, chunksize: int = 524288, **kwargs: typing.Any
) -> str:
"""Return the hash for the given file handle
Will read the file in chunks, which should be opened in 'rb' mode.
:param handle: a file handle, opened in 'rb' mode.
:param hash_cls: a class implementing hashlib._Hash
:param chunksize: number of bytes to chunk the file read in
:param kwargs: arguments to pass to the hasher initialisation
:return: the hash hexdigest (the hash key)
"""
hasher = hash_cls(**kwargs)
while True:
chunk = handle.read(chunksize)
hasher.update(chunk)

if not chunk:
# Empty returned value: EOF
break

return hasher.hexdigest()


def make_hash(object_to_hash, **kwargs):
"""
Makes a hash from a dictionary, list, tuple or set to any level, that contains
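For illustration (not part of the diff), the new helper can be exercised as in the test added further below; this snippet assumes only the standard library plus the import path introduced in this commit.

import hashlib
import tempfile

from aiida.common.hashing import chunked_file_hash

# Hash a file's content in 512 KiB chunks, without loading it all into memory.
with tempfile.TemporaryFile() as handle:  # opened in binary mode by default
    handle.write(b'content')
    handle.seek(0)
    digest = chunked_file_hash(handle, hashlib.sha256)

assert digest == 'ed7002b439e9ac845f22357d822bac1444730fbdb6016d3ec9432297b9ec9f73'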
17 changes: 17 additions & 0 deletions aiida/repository/backend/abstract.py
@@ -7,10 +7,13 @@
"""
import abc
import contextlib
import hashlib
import io
import pathlib
import typing

from aiida.common.hashing import chunked_file_hash

__all__ = ('AbstractRepositoryBackend',)


@@ -105,6 +108,20 @@ def get_object_content(self, key: str) -> bytes:
with self.open(key) as handle:
return handle.read()

def get_object_hash(self, key: str) -> str:
"""Return the SHA-256 hash of an object stored under the given key.
.. important::
A SHA-256 hash should always be returned,
to ensure consistency across different repository implementations.
:param key: fully qualified identifier for the object within the repository.
:raise FileNotFoundError: if the file does not exist.
:raise OSError: if the file could not be opened.
"""
with self.open(key) as handle:
return chunked_file_hash(handle, hashlib.sha256)

def delete_object(self, key: str):
"""Delete the object from the repository.
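Any backend that does not override `get_object_hash` inherits this streaming default. As a rough, hypothetical illustration of the pattern (a toy in-memory backend, not part of AiiDA), the default simply opens the object and feeds it through `chunked_file_hash`.

import hashlib
import io

from aiida.common.hashing import chunked_file_hash

class ToyBackend:
    """Hypothetical minimal backend relying on the streaming default."""

    def __init__(self):
        self._objects = {}

    def put_object(self, key: str, content: bytes) -> None:
        self._objects[key] = content

    def open(self, key: str) -> io.BytesIO:
        return io.BytesIO(self._objects[key])

    def get_object_hash(self, key: str) -> str:
        # Mirrors the base-class default: stream the object's bytes through SHA-256.
        with self.open(key) as handle:
            return chunked_file_hash(handle, hashlib.sha256)

backend = ToyBackend()
backend.put_object('some-key', b'content')
assert backend.get_object_hash('some-key') == hashlib.sha256(b'content').hexdigest()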
17 changes: 17 additions & 0 deletions aiida/repository/backend/disk_object_store.py
@@ -99,3 +99,20 @@ def delete_object(self, key: str):
"""
super().delete_object(key)
self.container.delete_objects([key])

def get_object_hash(self, key: str) -> str:
"""Return the SHA-256 hash of an object stored under the given key.
.. important::
A SHA-256 hash should always be returned,
to ensure consistency across different repository implementations.
:param key: fully qualified identifier for the object within the repository.
:raise FileNotFoundError: if the file does not exist.
"""
if not self.has_object(key):
raise FileNotFoundError(key)
if self.container.hash_type != 'sha256':
return super().get_object_hash(key)
return key
4 changes: 2 additions & 2 deletions aiida/repository/repository.py
@@ -104,8 +104,8 @@ def hash(self) -> str:
for root, dirnames, filenames in self.walk():
objects['__dirnames__'] = dirnames
for filename in filenames:
with self.open(root / filename) as handle:
objects[str(root / filename)] = handle.read()
key = self.get_file(root / filename).key
objects[str(root / filename)] = self.backend.get_object_hash(key)

return make_hash(objects)

1 change: 1 addition & 0 deletions docs/source/nitpick-exceptions
@@ -25,6 +25,7 @@ py:class function
py:class IO
py:class traceback
py:class _io.BufferedReader
py:class BinaryIO

### AiiDA

13 changes: 11 additions & 2 deletions tests/common/test_hashing.py
@@ -11,9 +11,10 @@
Unittests for aiida.common.hashing:make_hash with hardcoded hash values
"""

import itertools
import collections
import itertools
from datetime import datetime
import hashlib
import uuid

import numpy as np
@@ -25,7 +26,7 @@
import unittest

from aiida.common.exceptions import HashingError
from aiida.common.hashing import make_hash, float_to_text
from aiida.common.hashing import make_hash, float_to_text, chunked_file_hash
from aiida.common.folders import SandboxFolder
from aiida.backends.testbase import AiidaTestCase
from aiida.orm import Dict
@@ -257,3 +258,11 @@ def test_attribute_storing(self):
recomputed_hash = node.get_hash()

self.assertEqual(first_hash, recomputed_hash)


def test_chunked_file_hash(tmp_path):
"""Test the ``chunked_file_hash`` function."""
(tmp_path / 'afile').write_bytes(b'content')
with (tmp_path / 'afile').open('rb') as handle:
key = chunked_file_hash(handle, hashlib.sha256)
assert key == 'ed7002b439e9ac845f22357d822bac1444730fbdb6016d3ec9432297b9ec9f73'
11 changes: 11 additions & 0 deletions tests/repository/backend/test_disk_object_store.py
@@ -134,3 +134,14 @@ def test_erase(repository, generate_directory):

assert not dirpath.exists()
assert not repository.is_initialised


def test_get_object_hash(repository, generate_directory):
"""Test the ``Repository.get_object_hash`` returns the expected value."""
repository.initialise()
directory = generate_directory({'file_a': b'content'})

with open(directory / 'file_a', 'rb') as handle:
key = repository.put_object_from_filelike(handle)

assert repository.get_object_hash(key) == 'ed7002b439e9ac845f22357d822bac1444730fbdb6016d3ec9432297b9ec9f73'
11 changes: 11 additions & 0 deletions tests/repository/backend/test_sandbox.py
@@ -134,3 +134,14 @@ def test_cleanup():
assert dirpath.exists()
del repository
assert not dirpath.exists()


def test_get_object_hash(repository, generate_directory):
"""Test the ``Repository.get_object_hash`` returns the expected value."""
repository.initialise()
directory = generate_directory({'file_a': b'content'})

with open(directory / 'file_a', 'rb') as handle:
key = repository.put_object_from_filelike(handle)

assert repository.get_object_hash(key) == 'ed7002b439e9ac845f22357d822bac1444730fbdb6016d3ec9432297b9ec9f73'
